def merge_fastqs(self, out_dir, save_background=True):
    """
    Merge the background fastq files with the fastq files holding the
    reads supporting the variants

    Args:
        out_dir (path): Path to directory where synthetic fastqs are stored
        save_background (bool): If True, the background files with excluded
            reads are kept on disk after merging
    """
    synthetic_fastqs = []
    out_dir = parse_path(out_dir, file_type='dir')
    num_fastqs = len(self.excluded_backgrounds)
    # For each fastq file given as background (two if paired end)
    for i in range(num_fastqs):
        fastq_list = [self.excluded_backgrounds[i]]
        for sample in self.samples:
            # Skip samples whose number of fastq files does not match
            # the number of background files
            if len(sample['variant_fastq_files']) != num_fastqs:
                continue
            fastq_list.append(sample['variant_fastq_files'][i])
        file_name = parse_path(self.excluded_backgrounds[i]).name
        out_path = out_dir.joinpath("synthetic_" + file_name)
        LOG.info("Merging fastq files")
        try:
            merge_fastqs_sub(fastq_list, out_path)
        except Exception:
            log_msg = f"Files were not merged, background files in {out_dir} not removed"
            LOG.critical(log_msg)
            raise
        synthetic_fastqs.append(out_path)
    # Remove background fastqs
    if not save_background:
        for background in self.excluded_backgrounds:
            log_msg = f"Removing file from disk: {background}"
            LOG.info(log_msg)
            os.remove(background)
    for fastq in synthetic_fastqs:
        log_msg = f"Created {fastq}"
        LOG.info(log_msg)
    return synthetic_fastqs
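# Usage sketch (hypothetical object name): assuming `case` is an instance of
# the class defining merge_fastqs, with self.excluded_backgrounds and
# self.samples already populated by the earlier build steps:
#
#     synthetic_fastqs = case.merge_fastqs(
#         out_dir="synthetic_fastqs",
#         save_background=False,  # also remove the intermediate background files
#     )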
def exclude_from_background(self, seqkit_exe=None):
    """
    For each background fastq file, exclude the reads overlapping with any
    region in self.variants by finding the names of the reads and writing
    new fastq files without these reads.

    Args:
        seqkit_exe (str): Path to the seqkit executable

    Returns:
        excluded_backgrounds (list): Paths to the new background fastq
            files with the overlapping reads excluded
    """
    bam_file = parse_path(self.background["bam_file"])
    fastq_files = [
        parse_path(fastq) for fastq in self.background["fastq_files"]
    ]
    with BAMContext(bam_file=bam_file) as bam_handle:
        # For each region, find the overlapping reads
        for variant in self.variants:
            bam_handle.find_names_from_region(
                chrom=variant["chrom"],
                start=variant["start"],
                end=variant["end"],
                padding=variant["padding"]
            )
        log_msg = f"{bam_handle.record_number} reads to be excluded from {fastq_files}"
        LOG.info(log_msg)
        name_file = bam_handle.make_names_temp(self.tmp_dir)
        excluded_backgrounds = []
        for fastq_file in fastq_files:
            fastq_path = str(fastq_file)
            out_name = str(self.member) + "_" + str(fastq_file.name)
            out_path = str(self.tmp_dir.joinpath(out_name))
            # The command line tool seqkit grep is used here
            exclude_from_fastq(name_file, out_path, fastq_path, seqkit_exe=seqkit_exe)
            excluded_backgrounds.append(out_path)
    return excluded_backgrounds
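# Illustration (hypothetical paths): the method reads self.background, which
# is expected to hold one bam file and one or two fastq files, roughly:
#
#     background = {
#         "bam_file": "/path/to/background.bam",
#         "fastq_files": [
#             "/path/to/background_R1.fastq.gz",
#             "/path/to/background_R2.fastq.gz",
#         ],
#     }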
def __init__(self, input_sample, variants, padding, picard_exe, case_dir):
    super(Sample, self).__init__(**input_sample)
    self.input_sample = input_sample
    self.variants = variants
    self.padding = padding
    self.picard_exe = picard_exe
    self.case_dir = case_dir
    self.bam_file = parse_path(self.input_sample['bam_file'])
    # Build sample
    self._build_sample()
def version_command(context, dataset_dir, md5, comment):
    """
    Version dataset and truth set
    """
    log_msg = "Versioning dataset"
    LOG.info(log_msg)
    adapter = context.obj['adapter']
    dataset_dir = parse_path(dataset_dir, file_type='dir')
    dataset = VersionedDataset(dataset_dir=dataset_dir)
    dataset.build_dataset(md5=md5, comment=comment)
    insert_version(adapter, dataset)
def _extract_bam(self, sample_dir):
    with BAMContext(self.bam_file, out_dir=sample_dir) as bam_handle:
        for variant in self.variants:
            bam_handle.find_reads_from_region(
                chrom=variant["chrom"],
                start=variant["start"],
                end=variant["end"],
                padding=variant["padding"]
            )
        log_msg = "{} reads found for sample {}".format(
            bam_handle.record_number, self.input_sample['sample_id']
        )
        LOG.info(log_msg)
        variant_bam_file = bam_handle.out_file
        paired = bam_handle.paired
    self.input_sample["variant_bam_file"] = variant_bam_file
    file_name = parse_path(variant_bam_file).name
    # Convert bam to fastq
    fastq1 = str(sample_dir.joinpath(file_name.split('.')[0] + '_R1.fastq.gz'))
    fastq2 = None
    if paired:
        fastq2 = str(sample_dir.joinpath(file_name.split('.')[0] + '_R2.fastq.gz'))
    # Use picard SamToFastq to convert from bam to paired end fastqs
    bam_to_fastq(variant_bam_file, fastq1, fastq2, picard_exe=self.picard_exe)
    # Store only the fastq paths that were actually created
    self["variant_fastq_files"] = [fastq for fastq in (fastq1, fastq2) if fastq]
    self["paired_reads"] = paired
def get_variants(vcf_file, padding, sv_padding, vcf_parse=None):
    """
    Given a vcf file, parse through the file and yield each variant
    with all relevant information

    Args:
        vcf_file (str): Path to vcf file
        padding (int): Padding (in bp) around each variant
        sv_padding (int): Padding (in bp) used for structural variants
        vcf_parse: Parser specification for custom INFO fields,
            passed to INFOParser

    Yields:
        variant (mutacc.builds.build_variant.Variant): Variant object
    """
    vcf_file = parse_path(vcf_file)
    vcf = VCF(str(vcf_file), "r")
    samples = vcf.samples
    parser = None
    if vcf_parse:
        parser = INFOParser(vcf_parse, "read")
    for entry in vcf:
        yield Variant(entry, samples, padding, sv_padding, parser=parser)
    vcf.close()
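# Usage sketch (hypothetical path and padding values). Note that
# get_variants is a generator: the VCF handle is closed only after the
# iterator has been fully consumed.
#
#     variants = list(get_variants("case.vcf.gz", padding=1000, sv_padding=5000))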
def yaml_parse(yaml_file):
    yaml_file = parse_path(yaml_file)
    with open(yaml_file, "r") as yaml_handle:
        try:
            yaml_dict = yaml.safe_load(yaml_handle)
        except yaml.YAMLError as exc:
            LOG.critical(f"Error loading yaml object: {exc}")
            raise
    if set(yaml_dict.keys()) != set(["case", "variants", "samples"]):
        raise YAMLFieldsError(
            "Yaml object must contain 'case', 'samples', and 'variants'")
    for sample in yaml_dict["samples"]:
        if not set(SAMPLE).issubset(set(sample.keys())):
            raise YAMLFieldsError(
                "sample object must contain 'sample_id', 'mother', 'father', "
                "'bam', and 'fastq'")
    # Check if valid pedigree with ped_parser classes Family and Individual
    family = ped_parser.Family(family_id=yaml_dict["case"]["case_id"])
    for sample in yaml_dict["samples"]:
        if sample["sex"] == "male":
            sex = "1"
        elif sample["sex"] == "female":
            sex = "2"
        else:
            sex = "0"
        if sample["phenotype"] == "unaffected":
            phenotype = "1"
        elif sample["phenotype"] == "affected":
            phenotype = "2"
        else:
            phenotype = "0"
        individual = ped_parser.Individual(
            ind=sample["sample_id"],
            family=yaml_dict["case"]["case_id"],
            mother=str(sample["mother"]),
            father=str(sample["father"]),
            sex=sex,
            phenotype=phenotype,
        )
        family.add_individual(individual)
    try:
        family.family_check()
    except Exception:
        LOG.info("Not a valid pedigree")
        raise
    return yaml_dict
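# Illustration (hypothetical values; exact sample schema is defined by the
# SAMPLE constant): a yaml file accepted by yaml_parse must have exactly the
# keys 'case', 'samples', and 'variants', and each sample must carry the
# pedigree fields used in the checks above:
#
#     case:
#       case_id: "case_1"
#     samples:
#       - sample_id: "child"
#         sex: "male"
#         phenotype: "affected"
#         mother: "mother_id"
#         father: "father_id"
#         # plus the bam/fastq fields required by SAMPLE
#     variants:
#       # variant entries; exact schema not shown in this module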
def __init__(self, dataset_dir):
    super(VersionedDataset, self).__init__()
    self.dataset_dir = parse_path(dataset_dir, file_type='dir')
def cli(context, loglevel, config_file, root_dir, demo, vcf_parser):
    coloredlogs.install(level=loglevel)
    LOG.info("Running mutacc")
    cli_config = {}
    if demo:
        host = "localhost"
        port = 27017
        uri = None
        db_name = "mutacc-demo"
        username = None
        password = None
        padding = PADDING
        sv_padding = SV_PADDING
        root_dir = make_dir(root_dir or "./mutacc_demo_root")
    else:
        if config_file:
            with open(config_file, "r") as in_handle:
                cli_config = yaml.safe_load(in_handle)
        host = cli_config.get("host") or "localhost"
        port = cli_config.get("port") or 27017
        uri = cli_config.get("uri")
        db_name = cli_config.get("database") or "mutacc"
        username = cli_config.get("username")
        password = cli_config.get("password")
        root_dir = cli_config.get("root_dir") or root_dir
        padding = cli_config.get("padding")
        sv_padding = cli_config.get("sv_padding")
        if not root_dir:
            LOG.warning(
                "Please provide a root directory, through option --root-dir or in config_file"
            )
            context.abort()
    vcf_parser = get_vcf_parser(parser_file=vcf_parser, config_dict=cli_config)
    mutacc_config = {}
    mutacc_config["host"] = host
    mutacc_config["port"] = port
    mutacc_config["uri"] = uri
    mutacc_config["username"] = username
    mutacc_config["password"] = password
    mutacc_config["db_name"] = db_name
    mutacc_config["vcf_parser_import"] = vcf_parser.get("import")
    mutacc_config["vcf_parser_export"] = vcf_parser.get("export")
    mutacc_config["root_dir"] = parse_path(root_dir, file_type="dir")
    mutacc_config["demo"] = demo
    mutacc_config["padding"] = padding
    mutacc_config["sv_padding"] = sv_padding
    # Create subdirectories in root, if not already created
    for dir_type in SUB_DIRS:
        subdir = mutacc_config["root_dir"].joinpath(SUB_DIRS[dir_type])
        mutacc_config[dir_type] = make_dir(subdir)
    # Get binaries for picard and seqkit if specified in config
    mutacc_config["binaries"] = {}
    binaries = cli_config.get("binaries") or {}
    mutacc_config["binaries"]["picard"] = binaries.get("picard")
    mutacc_config["binaries"]["seqkit"] = binaries.get("seqkit")
    context.obj = mutacc_config
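# Illustration (hypothetical values): a config_file read by cli() may set any
# of the keys looked up above:
#
#     host: "localhost"
#     port: 27017
#     uri: "mongodb://..."
#     database: "mutacc"
#     username: "user"
#     password: "secret"
#     root_dir: "/path/to/mutacc_root"
#     padding: 1000
#     sv_padding: 5000
#     binaries:
#       picard: "/path/to/picard.jar"
#       seqkit: "/path/to/seqkit"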
def fastq_extract(fastq_files: list, record_ids: set, dir_path=''):
    """
    Given a set of read identifiers, and one or two (for paired end) fastq
    files, create new fastq files only containing the reads specified.

    Args:
        fastq_files (list): List of fastq files
        record_ids (set): Set of read names
        dir_path (str): Path to directory where new fastq files are written

    Returns:
        out_paths (list): List of paths to newly created fastq files
    """
    fastq_files = [parse_path(fastq_file) for fastq_file in fastq_files]
    # Save the file names of the fastq files to be used later
    file_names = [
        Path(file_name).name.split(".")[0] for file_name in fastq_files
    ]
    dir_path = parse_path(dir_path, file_type='dir')
    # Use the ExitStack context manager to manage a variable number of files
    with ExitStack() as stack:
        # Open the fastq files, place the handles in fastq_handles, and
        # register their __exit__ methods on the ExitStack callback stack
        fastq_handles = [
            stack.enter_context(get_file_handle(fastq_file))
            for fastq_file in fastq_files
        ]
        # Open output fastq files to write the found records to
        out_handles = [
            stack.enter_context(
                gzip.open(dir_path.joinpath(file_name + "_mutacc.fastq.gz"), 'wt')
            )
            for file_name in file_names
        ]
        # Parse each fastq; FastqGeneralIterator yields each record as a
        # tuple with name, sequence, and quality on index 0, 1, 2 respectively
        fastqs = [FastqGeneralIterator(handle) for handle in fastq_handles]
        records_found = 0
        # Iterate over the parsed fastq files simultaneously
        for count, records in enumerate(zip(*fastqs)):
            # Check if the record name exists in record_ids. The check is only
            # done for the first fastq file (records[0]); paired end mates are
            # assumed to sit at the same position in the two files.
            # Example: if records[0][0] is
            # 'ST-E00266:38:H2TF5CCXX:8:1101:2563:2170 1:N:0:CGCGCATT', then
            # records[0][0].split()[0] is 'ST-E00266:38:H2TF5CCXX:8:1101:2563:2170'
            record_name = records[0][0].split()[0].split("/")[0]
            if record_name in record_ids:
                records_found += 1
                # Write the current record from each fastq file to the
                # corresponding output file
                for record, out_handle in zip(records, out_handles):
                    out_handle.write("@{}\n{}\n+\n{}\n".format(
                        record[0], record[1], record[2]))
                # Remove the found record name from the record_ids set
                record_ids.remove(record_name)
                # If record_ids is empty, all records have been found and
                # there is no need to iterate further over the fastq files
                if not record_ids:
                    break
            if count % 1_000_000 == 0:
                log_msg = f"### {count/1e6}M READS PROCESSED: {records_found} READS FOUND ###"
                LOG.info(log_msg)
    # Return the file paths for the output fastq files, which should only
    # contain the records named in record_ids
    out_paths = [out_handle.name for out_handle in out_handles]
    return out_paths
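# Usage sketch (hypothetical paths and read name): extract one read pair from
# a pair of gzipped fastq files into ./extracted:
#
#     names = {"ST-E00266:38:H2TF5CCXX:8:1101:2563:2170"}
#     out_paths = fastq_extract(
#         ["sample_R1.fastq.gz", "sample_R2.fastq.gz"],
#         record_ids=names,
#         dir_path="extracted",
#     )
#
# Note that record_ids is consumed in place: names found in the fastq files
# are removed from the set, so pass a copy if the set is needed afterwards.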