Code example #1
    def merge_fastqs(self, out_dir, save_background=True):
        """
            Merge the background fastq files with the fastq files holding
            the reads supporting the variants.

            Args:
                out_dir (path): Path to directory where synthetic fastqs are stored
                save_background (bool): If True, the background fastqs with
                                        excluded reads are kept on disk
        """
        synthetic_fastqs = []

        out_dir = parse_path(out_dir, file_type='dir')

        # Number of background fastq files (two if paired end)
        num_fastqs = len(self.excluded_backgrounds)

        # For each fastq file given as background
        for i in range(num_fastqs):

            fastq_list = [self.excluded_backgrounds[i]]

            for sample in self.samples:

                # Skip samples whose fastq count does not match the background
                if len(sample['variant_fastq_files']) != num_fastqs:
                    continue

                fastq_list.append(sample['variant_fastq_files'][i])

            file_name = parse_path(self.excluded_backgrounds[i]).name
            out_path = out_dir.joinpath("synthetic_" + file_name)

            LOG.info("Merging fastq files")

            try:
                merge_fastqs_sub(fastq_list, out_path)

            except Exception:

                log_msg = f"Files were not merged, background files in {out_dir} not removed"
                LOG.critical(log_msg)
                raise

            synthetic_fastqs.append(out_path)

        # Remove background fastqs
        if not save_background:
            for background in self.excluded_backgrounds:
                log_msg = f"Removing file from disk: {background}"
                LOG.info(log_msg)
                os.remove(background)

        for fastq in synthetic_fastqs:
            log_msg = f"Created {fastq}"
            LOG.info(log_msg)

        return synthetic_fastqs
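
The helper merge_fastqs_sub is not shown here. A minimal sketch of what it is assumed to do, given how it is called above: concatenate the gzipped fastq files in fastq_list into a single file at out_path. The name and signature come from the call above; the body is an assumption.

import shutil

def merge_fastqs_sub(fastq_list, out_path):
    """Concatenate gzipped fastq files into one output file (assumed behavior)."""
    # A concatenation of gzip streams is itself a valid gzip stream, so the
    # files can be copied byte for byte without decompressing
    with open(out_path, 'wb') as out_handle:
        for fastq in fastq_list:
            with open(fastq, 'rb') as in_handle:
                shutil.copyfileobj(in_handle, out_handle)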
Code example #2
    def exclude_from_background(self, seqkit_exe=None):
        """
            For each background fastq file, exclude the reads overlapping with
            any region in self.variants by finding the names of the reads and
            writing new fastq files without these reads.

            Args:
                seqkit_exe (str): Path to the seqkit executable
        """

        bam_file = parse_path(self.background["bam_file"])
        fastq_files = [
            parse_path(fastq) for fastq in self.background["fastq_files"]
        ]

        with BAMContext(bam_file=bam_file) as bam_handle:
            # For each region, find the overlapping reads
            for variant in self.variants:
                bam_handle.find_names_from_region(chrom=variant["chrom"],
                                                  start=variant["start"],
                                                  end=variant["end"],
                                                  padding=variant["padding"])

            log_msg = f"{bam_handle.record_number} reads to be excluded from {fastq_files}"
            LOG.info(log_msg)

            name_file = bam_handle.make_names_temp(self.tmp_dir)
            excluded_backgrounds = []
            for fastq_file in fastq_files:

                fastq_path = str(fastq_file)
                out_name = str(self.member) + "_" + str(fastq_file.name)
                out_path = str(self.tmp_dir.joinpath(out_name))

                # The command line tool seqkit grep is used here
                exclude_from_fastq(name_file,
                                   out_path,
                                   fastq_path,
                                   seqkit_exe=seqkit_exe)

                excluded_backgrounds.append(out_path)

        return excluded_backgrounds
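
The comment above notes that exclude_from_fastq wraps the command line tool seqkit grep. A minimal sketch of such a wrapper, assuming seqkit's documented flags -f/--pattern-file, -v/--invert-match, and -o/--out-file; the actual implementation in mutacc may differ.

import subprocess

def exclude_from_fastq(name_file, out_path, fastq_path, seqkit_exe=None):
    """Write a copy of fastq_path without the reads named in name_file (sketch)."""
    seqkit_exe = seqkit_exe or "seqkit"
    command = [
        seqkit_exe, "grep",
        "-f", name_file,  # file with one read name per line
        "-v",             # invert the match, i.e. exclude the named reads
        "-o", out_path,   # output file, gzipped if the suffix is .gz
        fastq_path,
    ]
    subprocess.run(command, check=True)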
Code example #3
    def __init__(self, input_sample, variants, padding, picard_exe, case_dir):

        super(Sample, self).__init__(**input_sample)

        self.input_sample = input_sample
        self.variants = variants
        self.padding = padding
        self.picard_exe = picard_exe
        self.case_dir = case_dir
        self.bam_file = parse_path(self.input_sample['bam_file'])

        # Build sample
        self._build_sample()
Code example #4
def version_command(context, dataset_dir, md5, comment):
    """
        Version dataset and truth set
    """

    log_msg = f"Versioning dataset"
    LOG.info(log_msg)

    adapter = context.obj['adapter']

    dataset_dir = parse_path(dataset_dir, file_type='dir')
    dataset = VersionedDataset(dataset_dir=dataset_dir)
    dataset.build_dataset(md5=md5, comment=comment)

    insert_version(adapter, dataset)
Code example #5
    def _extract_bam(self, sample_dir):
        """
            Extract the reads supporting the variants from the bam file and
            convert them to fastq files in sample_dir.
        """

        with BAMContext(self.bam_file, out_dir=sample_dir) as bam_handle:

            for variant in self.variants:

                bam_handle.find_reads_from_region(
                    chrom=variant["chrom"],
                    start=variant["start"],
                    end=variant["end"],
                    padding=variant["padding"]
                )

            log_msg = "{} reads found for sample {}".format(
                bam_handle.record_number,
                self.input_sample['sample_id']
            )
            LOG.info(log_msg)
            variant_bam_file = bam_handle.out_file
            self.input_sample["variant_bam_file"] = variant_bam_file
            file_name = parse_path(variant_bam_file).name
            paired = bam_handle.paired

        # Convert bam to fastq
        fastq1 = str(sample_dir.joinpath(file_name.split('.')[0] + '_R1.fastq.gz'))
        fastq2 = None
        if paired:
            fastq2 = str(sample_dir.joinpath(file_name.split('.')[0] + '_R2.fastq.gz'))

        # Use picard SamToFastq to convert from bam to paired end fastqs

        bam_to_fastq(
            variant_bam_file,
            fastq1,
            fastq2,
            picard_exe=self.picard_exe
        )

        self["variant_fastq_files"] = [fastq1, fastq2]
        self["paired_reads"] = paired
Code example #6
def get_variants(vcf_file, padding, sv_padding, vcf_parse=None):
    """

        Given a vcf file, this function parses through the file and yields the variant with all
        relevant information

        Args:
            vcf_file (string): Path to vcf file

        Yields:
            variant (mutacc.builds.build_variant.Variant): Variant object
    """

    vcf_file = parse_path(vcf_file)
    vcf = VCF(str(vcf_file), "r")
    samples = vcf.samples
    parser = None
    if vcf_parse:
        parser = INFOParser(vcf_parse, "read")
    for entry in vcf:
        yield Variant(entry, samples, padding, sv_padding, parser=parser)
    vcf.close()
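
Since get_variants is a generator, the variants can be consumed lazily. A hypothetical usage, relying on the subscript access seen in the other examples (variant["chrom"] etc.); the file name and padding values are made up for illustration:

for variant in get_variants("case.vcf.gz", padding=300, sv_padding=1000):
    print(variant["chrom"], variant["start"], variant["end"])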
Code example #7
def yaml_parse(yaml_file):
    """
        Parse and validate a mutacc case yaml file.

        Args:
            yaml_file (string): Path to yaml file

        Returns:
            yaml_dict (dict): Dictionary with keys 'case', 'samples', and 'variants'
    """

    yaml_file = parse_path(yaml_file)

    with open(yaml_file, "r") as yaml_handle:

        try:
            yaml_dict = yaml.safe_load(yaml_handle)

        except yaml.YAMLError as exc:
            LOG.critical(f"Error loading yaml object: {exc}")

            raise

    if set(yaml_dict.keys()) != set(["case", "variants", "samples"]):

        raise YAMLFieldsError(
            "Yaml object must contain 'case', 'samples', and 'variants'")

    for sample in yaml_dict["samples"]:

        if not set(SAMPLE).issubset(set(sample.keys())):

            raise YAMLFieldsError(
                "sample object must contain 'sample_id', 'mother', "
                "'father', 'bam', and 'fastq'"
            )

    # Check that the pedigree is valid with the ped_parser classes Family and Individual
    family = ped_parser.Family(family_id=yaml_dict["case"]["case_id"])
    for sample in yaml_dict["samples"]:

        if sample["sex"] == "male":
            sex = "1"
        elif sample["sex"] == "female":
            sex = "2"
        else:
            sex = "0"

        if sample["phenotype"] == "unaffected":
            phenotype = "1"
        elif sample["phenotype"] == "affected":
            phenotype = "2"
        else:
            phenotype = "0"

        individual = ped_parser.Individual(
            ind=sample["sample_id"],
            family=yaml_dict["case"]["case_id"],
            mother=str(sample["mother"]),
            father=str(sample["father"]),
            sex=sex,
            phenotype=phenotype,
        )

        family.add_individual(individual)

    try:
        family.family_check()
    except Exception:
        LOG.info("Not a valid pedigree")
        raise

    return yaml_dict
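
The checks above imply a minimal shape for the input yaml. A hypothetical example that would pass the field and pedigree validation, assuming SAMPLE covers the fields named in the error message together with the sex and phenotype fields used for the pedigree; all ids and paths are made up, and 0 marks a missing parent as in the ped format:

case:
  case_id: case_1

variants: []

samples:
  - sample_id: sample_1
    mother: 0
    father: 0
    sex: male
    phenotype: affected
    bam: /path/to/sample_1.bam
    fastq: /path/to/sample_1.fastq.gz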
Code example #8
    def __init__(self, dataset_dir):

        super(VersionedDataset, self).__init__()
        self.dataset_dir = parse_path(dataset_dir, file_type='dir')
Code example #9
def cli(context, loglevel, config_file, root_dir, demo, vcf_parser):

    coloredlogs.install(level=loglevel)
    LOG.info("Running mutacc")

    cli_config = {}
    if demo:
        host = "localhost"
        port = 27017
        # uri must be set here as well, since it is read into mutacc_config below
        uri = None
        db_name = "mutacc-demo"
        username = None
        password = None
        padding = PADDING
        sv_padding = SV_PADDING
        root_dir = make_dir(root_dir or "./mutacc_demo_root")

    else:

        if config_file:
            with open(config_file, "r") as in_handle:
                cli_config = yaml.safe_load(in_handle)

        host = cli_config.get("host") or "localhost"
        port = cli_config.get("port") or 27017
        uri = cli_config.get("uri")
        db_name = cli_config.get("database") or "mutacc"
        username = cli_config.get("username")
        password = cli_config.get("password")
        root_dir = cli_config.get("root_dir") or root_dir
        padding = cli_config.get("padding")
        sv_padding = cli_config.get("sv_padding")

        if not root_dir:
            LOG.warning(
                "Please provide a root directory, through option --root-dir or in config_file"
            )
            context.abort()

    vcf_parser = get_vcf_parser(parser_file=vcf_parser, config_dict=cli_config)

    mutacc_config = {}
    mutacc_config["host"] = host
    mutacc_config["port"] = port
    mutacc_config["uri"] = uri
    mutacc_config["username"] = username
    mutacc_config["password"] = password
    mutacc_config["db_name"] = db_name
    mutacc_config["vcf_parser_import"] = vcf_parser.get("import")
    mutacc_config["vcf_parser_export"] = vcf_parser.get("export")
    mutacc_config["root_dir"] = parse_path(root_dir, file_type="dir")
    mutacc_config["demo"] = demo
    mutacc_config["padding"] = padding
    mutacc_config["sv_padding"] = sv_padding

    # Create subdirectories in root, if not already created
    for dir_type, sub_dir in SUB_DIRS.items():
        subdir = mutacc_config["root_dir"].joinpath(sub_dir)
        mutacc_config[dir_type] = make_dir(subdir)

    # Get binaries for picard and seqkit if specified in config
    mutacc_config["binaries"] = {}

    binaries = {}
    if cli_config.get("binaries"):
        binaries = cli_config["binaries"]

    mutacc_config["binaries"]["picard"] = binaries.get("picard")
    mutacc_config["binaries"]["seqkit"] = binaries.get("seqkit")

    context.obj = mutacc_config
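
The keys read from cli_config above imply the following layout for the config file; a uri key is also read and may be given in addition. A hypothetical example with made up values:

host: localhost
port: 27017
database: mutacc
username: null
password: null
root_dir: /path/to/mutacc_root
padding: 300
sv_padding: 1000
binaries:
  picard: /path/to/picard
  seqkit: /path/to/seqkit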
Code example #10
def fastq_extract(fastq_files: list, record_ids: set, dir_path=''):
    """

        Given a list of read identifiers, and one or two (for paired end) fastq files,
        creates new fastq files only containing the reads specified.

        Args:

            fastq_files (list): List of fastq files
            record_ids (set): Set of read names
            dir_path (string): path to directory where new fastq files are written to

        Returns:

            out_paths (list): List of paths to newly created fastq files

    """

    fastq_files = [parse_path(fastq_file) for fastq_file in fastq_files]

    # Save the file names of the fastq files to be used later
    file_names = [
        Path(file_name).name.split(".")[0] for file_name in fastq_files
    ]

    dir_path = parse_path(dir_path, file_type='dir')

    # Use the ExitStack context manager to manage a variable number of files
    with ExitStack() as stack:
        # Open the fastq files and place the handles in fastq_handles; each
        # handle's __exit__ method is pushed onto the ExitStack callback stack
        fastq_handles = [stack.enter_context(get_file_handle(fastq_file))
                         for fastq_file in fastq_files]

        # Open gzipped output fastq files to write the found records to
        out_handles = [
            stack.enter_context(
                gzip.open(dir_path.joinpath(file_name + "_mutacc.fastq.gz"), 'wt'))
            for file_name in file_names
        ]

        # Parse the fastq files; FastqGeneralIterator yields each record as a
        # (name, sequence, quality) tuple
        fastqs = [FastqGeneralIterator(handle) for handle in fastq_handles]

        records_found = 0
        # Iterate over the parsed fastq files simultaneously
        for count, records in enumerate(zip(*fastqs)):

            # Check if the record name exists in record_ids. The check is only
            # done for one of the fastq files (records[0]); it is thus assumed
            # that paired end reads sit at the same position in the two files.
            # Example: if records[0][0] is
            # 'ST-E00266:38:H2TF5CCXX:8:1101:2563:2170 1:N:0:CGCGCATT', then
            # records[0][0].split()[0] is 'ST-E00266:38:H2TF5CCXX:8:1101:2563:2170'
            record_name = records[0][0].split()[0].split("/")[0]
            if record_name in record_ids:

                records_found += 1

                # Write the current record from each fastq file to the
                # corresponding output file
                for record, out_handle in zip(records, out_handles):

                    out_handle.write("@{}\n{}\n+\n{}\n".format(
                        record[0], record[1], record[2]))

                # Remove the found record name from the record_ids set
                record_ids.remove(record_name)

                # If record_ids is empty, all records have been found and there
                # is no need to iterate further over the fastq files
                if not record_ids:
                    break

            if count % 1_000_000 == 0:
                log_msg = f"### {count/1e6}M READS PROCESSED: {records_found} READS FOUND ###"
                LOG.info(log_msg)


        # Collect the paths of the output fastq files, which should now contain
        # only the records whose names were given in record_ids
        out_paths = [out_handle.name for out_handle in out_handles]

    return out_paths
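
A hypothetical call, with made up read names and paths. Note that record_ids is consumed: found names are removed from the set in place, so pass a copy if the set is needed afterwards.

record_ids = {"ST-E00266:38:H2TF5CCXX:8:1101:2563:2170"}
out_paths = fastq_extract(
    ["/path/to/sample_R1.fastq.gz", "/path/to/sample_R2.fastq.gz"],
    record_ids,
    dir_path="/path/to/out_dir",
)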