class CoverageParser(object):
    """Run McCortex over a set of probe panels and parse the coverage output.

    ``run`` builds a ``McCortexGenoRunner`` for the configured panels, runs
    it, and then reads the tab-separated file the runner exposes as
    ``covg_tmp_file_path``.  Parsed results are stored in ``self.covgs``:

    * ``covgs["variant"]`` (aliased as ``variant_covgs``) maps a variant name
      to a ``VariantProbeCoverage``;
    * ``covgs["presence"]`` (aliased as ``gene_presence_covgs``) maps a
      sequence name to ``{version: SequenceProbeCoverage}``;
    * any other panel type maps a name to a dict of running totals with the
      keys ``total_bases``, ``percent_coverage``, ``length`` and ``median``.
    """

    def __init__(
        self,
        sample,
        panel_file_paths,
        kmer,
        force,
        seq=None,
        ctx=None,
        threads=2,
        memory="1GB",
        panels=None,
        verbose=True,
        tmp_dir="tmp/",
        skeleton_dir="atlas/data/skeletons/",
    ):
        """Store the run configuration and load one ``Panel`` per path.

        Args:
            sample: Sample identifier, forwarded to the McCortex runner.
            panel_file_paths: Iterable of panel file paths; each is wrapped
                in a ``Panel``.
            kmer: K-mer size, forwarded to the McCortex runner.
            force: Forwarded to the McCortex runner.
            seq: Sequence input; mutually exclusive with ``ctx``.
            ctx: Graph input; mutually exclusive with ``seq``.
            threads: Forwarded to the McCortex runner.
            memory: Forwarded to the McCortex runner.
            panels: Accepted for interface compatibility; not used here
                (panels are always built from ``panel_file_paths``).
            verbose: Stored on the instance; not read by this class.
            tmp_dir: Forwarded to the McCortex runner.
            skeleton_dir: Forwarded to the McCortex runner.

        Raises:
            ValueError: If both ``seq`` and ``ctx`` are given.
        """
        self.sample = sample
        self.seq = seq
        self.ctx = ctx
        self.kmer = kmer
        self.force = force
        self.covgs = {"variant": {}, "presence": {}}
        # Aliases into ``covgs``: they share the same dict objects.
        self.variant_covgs = self.covgs["variant"]
        self.gene_presence_covgs = self.covgs["presence"]
        self.mc_cortex_runner = None
        self.verbose = verbose
        self.skeleton_dir = skeleton_dir
        self.tmp_dir = tmp_dir
        self.panel_file_paths = panel_file_paths
        self.panels = []
        self.threads = threads
        self.memory = memory
        for panel_file_path in self.panel_file_paths:
            self.panels.append(Panel(panel_file_path))
        if self.seq and self.ctx:
            raise ValueError("Can't have both -1 and -c")

    def run(self):
        """Run McCortex and then parse the coverage file it produced."""
        self._run_cortex()
        self._parse_covgs()

    def _run_cortex(self):
        """Create the ``McCortexGenoRunner`` from the stored config and run it."""
        self.mc_cortex_runner = McCortexGenoRunner(
            sample=self.sample,
            panels=self.panels,
            seq=self.seq,
            ctx=self.ctx,
            kmer=self.kmer,
            force=self.force,
            threads=self.threads,
            memory=self.memory,
            panel_name=self.panel_name,
            tmp_dir=self.tmp_dir,
            skeleton_dir=self.skeleton_dir,
        )
        self.mc_cortex_runner.run()

    def estimate_depth(self):
        """Estimate sequencing depth from the parsed coverages.

        Collects every positive reference median depth from the variant
        coverages plus, for each presence entry, the median over its
        versions (when positive), and takes the median of that collection.

        Returns:
            The overall median, or ``1`` when that median is below 1.
        """
        depths = []
        for variant_covg in self.variant_covgs.values():
            ref_depth = variant_covg.reference_coverage.median_depth
            if ref_depth > 0:
                depths.append(ref_depth)
        for versions in self.gene_presence_covgs.values():
            presence_median = median(
                [spc.median_depth for spc in versions.values()])
            if presence_median > 0:
                depths.append(presence_median)
        overall_median = median(depths)
        if overall_median < 1:
            return 1
        return overall_median

    def remove_temporary_files(self):
        """Ask the McCortex runner to delete its temporary files.

        Requires ``_run_cortex`` (or ``run``) to have been called first,
        because ``mc_cortex_runner`` is ``None`` until then.
        """
        self.mc_cortex_runner.remove_temporary_files()

    @property
    def panel_name(self):
        """Names of all loaded panels joined with ``-``."""
        return "-".join([panel.name for panel in self.panels])

    def _parse_summary_covgs_row(self, row):
        """Convert one coverage row into typed values.

        Args:
            row: Sequence of strings; column 0 is the probe name and columns
                2-6 are median depth, min depth, coverage fraction, k-mer
                count and klen.  Column 1 is ignored.

        Returns:
            ``(name, median_depth, min_depth, percent_coverage, k_count,
            klen)`` where ``percent_coverage`` is column 4 times 100.  If a
            numeric column cannot be converted, a warning is logged and all
            numeric fields are returned as zero.
        """
        try:
            return (
                row[0],
                int(row[2]),
                int(row[3]),
                100 * float(row[4]),
                int(row[5]),
                int(row[6]),
            )
        except ValueError:
            logger.warning("Failed to parse %s", row)
            return row[0], 0, 0, 0.0, 0, 0

    def _parse_covgs(self):
        """Read the runner's coverage file and dispatch each row by type."""
        with open(self.mc_cortex_runner.covg_tmp_file_path, "r") as infile:
            self.reader = csv.reader(infile, delimiter="\t")
            for row in self.reader:
                # Only the probe name is needed to choose a handler; the
                # handler parses the row in full, so parsing here as well
                # would just duplicate work (and any parse warning).
                allele_name = row[0].split("?")[0]
                if self._is_variant_panel(allele_name):
                    self._parse_variant_panel(row)
                else:
                    self._parse_seq_panel(row)

    def _is_variant_panel(self, allele_name):
        """Return True when the name's first ``-`` separated part is ref/alt."""
        return allele_name.split("-")[0] in ("ref", "alt")

    def _parse_seq_panel(self, row):
        """Record the coverage of a non-variant (sequence/species) probe row.

        ``variant`` and ``presence`` panel types store a
        ``SequenceProbeCoverage`` under ``covgs[panel_type][name][version]``.
        Every other panel type accumulates per-name totals instead.
        """
        (allele, median_depth, min_depth, percent_coverage, k_count,
         klen) = self._parse_summary_covgs_row(row)
        probe_coverage = ProbeCoverage(
            percent_coverage=percent_coverage,
            median_depth=median_depth,
            min_depth=min_depth,
            k_count=k_count,
            klen=klen,
        )
        params = get_params(allele)
        panel_type = params.get("panel_type", "presence")
        name = params.get("name")
        version = params.get("version", "1")
        if panel_type in ("variant", "presence"):
            sequence_probe_coverage = SequenceProbeCoverage(
                name=name,
                probe_coverage=probe_coverage,
                version=version,
                length=params.get("length"),
            )
            self.covgs[panel_type].setdefault(
                name, {})[version] = sequence_probe_coverage
        else:
            # Species panels are treated differently: keep running totals
            # per name rather than one coverage object per version.
            length = int(params.get("length", -1))
            totals = self.covgs.setdefault(panel_type, {}).setdefault(
                name,
                {
                    "total_bases": 0,
                    "percent_coverage": [],
                    "length": [],
                    "median": [],
                },
            )
            totals["total_bases"] += length
            # Only well-covered probes contribute to the per-probe lists.
            if percent_coverage > 75 and median_depth > 0:
                totals["percent_coverage"].append(percent_coverage)
                totals["length"].append(length)
                totals["median"].append(median_depth)

    def _parse_variant_panel(self, row):
        """Add one ref/alt probe row to its variant's coverage record.

        The variant name is built from the ``gene``, ``mut`` and
        ``var_name`` parameters when ``var_name`` is present; otherwise it
        is taken from the probe name itself (the part after the first ``-``
        and before any ``?``).

        Raises:
            ValueError: If the probe name does not start with ``ref`` or
                ``alt``.
        """
        (probe, median_depth, min_depth, percent_coverage, k_count,
         klen) = self._parse_summary_covgs_row(row)
        params = get_params(probe)
        probe_type = probe.split("-")[0]
        if "var_name" in params:
            var_name = (params.get("gene", "") + "_" +
                        params.get("mut", "") + "-" +
                        params.get("var_name", ""))
        else:
            # Previously referenced an undefined name ``allele`` here, which
            # raised NameError for every probe lacking ``var_name``.
            var_name = probe.split("?")[0].split("-")[1]
        if var_name not in self.variant_covgs:
            self.variant_covgs[var_name] = VariantProbeCoverage(
                reference_coverages=[],
                alternate_coverages=[],
                var_name=probe,
                params=params,
            )
        variant_covg = self.variant_covgs[var_name]
        probe_coverage = ProbeCoverage(
            min_depth=min_depth,
            k_count=k_count,
            percent_coverage=percent_coverage,
            median_depth=median_depth,
            klen=klen,
        )
        if probe_type == "ref":
            variant_covg.reference_coverages.append(probe_coverage)
            variant_covg.best_reference_coverage = (
                variant_covg._choose_best_reference_coverage())
        elif probe_type == "alt":
            variant_covg.alternate_coverages.append(probe_coverage)
            variant_covg.best_alternate_coverage = (
                variant_covg._choose_best_alternate_coverage())
        else:
            raise ValueError("probe_type must be ref or alt")
class CoverageParser(object):
    """Run McCortex over a set of probe panels and parse the coverage output.

    ``run`` builds a ``McCortexGenoRunner`` for the configured panels, runs
    it, and then reads the tab-separated file the runner exposes as
    ``covg_tmp_file_path``.  Parsed results are stored in ``self.covgs``:

    * ``covgs["variant"]`` (aliased as ``variant_covgs``) maps a probe name
      to a list of ``VariantProbeCoverage`` objects;
    * ``covgs["presence"]`` (aliased as ``gene_presence_covgs``) maps a
      sequence name to ``{version: SequenceProbeCoverage}``;
    * any other panel type maps a name to a dict of running totals with the
      keys ``total_bases``, ``percent_coverage``, ``length`` and ``median``.
    """

    def __init__(self,
                 sample,
                 panel_file_paths,
                 kmer,
                 force,
                 seq=None,
                 ctx=None,
                 threads=2,
                 memory="1GB",
                 panels=None,
                 verbose=True,
                 tmp_dir='tmp/',
                 skeleton_dir='atlas/data/skeletons/',
                 mccortex31_path="mccortex31"):
        """Store the run configuration and load one ``Panel`` per path.

        Args:
            sample: Sample identifier, forwarded to the McCortex runner.
            panel_file_paths: Iterable of panel file paths; each is wrapped
                in a ``Panel``.
            kmer: K-mer size, forwarded to the McCortex runner.
            force: Forwarded to the McCortex runner.
            seq: Sequence input; mutually exclusive with ``ctx``.
            ctx: Graph input; mutually exclusive with ``seq``.
            threads: Forwarded to the McCortex runner.
            memory: Forwarded to the McCortex runner.
            panels: Accepted for interface compatibility; not used here
                (panels are always built from ``panel_file_paths``).
            verbose: Stored on the instance; not read by this class.
            tmp_dir: Forwarded to the McCortex runner.
            skeleton_dir: Forwarded to the McCortex runner.
            mccortex31_path: Forwarded to the McCortex runner.

        Raises:
            ValueError: If both ``seq`` and ``ctx`` are given.
        """
        self.sample = sample
        self.seq = seq
        self.ctx = ctx
        self.kmer = kmer
        self.force = force
        self.covgs = {"variant": {}, "presence": {}}
        # Aliases into ``covgs``: they share the same dict objects.
        self.variant_covgs = self.covgs["variant"]
        self.gene_presence_covgs = self.covgs["presence"]
        self.mc_cortex_runner = None
        self.verbose = verbose
        self.skeleton_dir = skeleton_dir
        self.tmp_dir = tmp_dir
        self.panel_file_paths = panel_file_paths
        self.panels = []
        self.mccortex31_path = mccortex31_path
        self.threads = threads
        self.memory = memory
        for panel_file_path in self.panel_file_paths:
            self.panels.append(Panel(panel_file_path))
        if self.seq and self.ctx:
            raise ValueError("Can't have both -1 and -c")

    def run(self):
        """Run McCortex and then parse the coverage file it produced."""
        self._run_cortex()
        self._parse_covgs()

    def _run_cortex(self):
        """Create the ``McCortexGenoRunner`` from the stored config and run it."""
        self.mc_cortex_runner = McCortexGenoRunner(
            sample=self.sample,
            panels=self.panels,
            seq=self.seq,
            ctx=self.ctx,
            kmer=self.kmer,
            force=self.force,
            threads=self.threads,
            memory=self.memory,
            panel_name=self.panel_name,
            tmp_dir=self.tmp_dir,
            skeleton_dir=self.skeleton_dir,
            mccortex31_path=self.mccortex31_path)
        self.mc_cortex_runner.run()

    def estimate_depth(self):
        """Estimate sequencing depth from the parsed coverages.

        Collects every positive reference median depth from the variant
        coverages plus, for each presence entry, the median over its
        versions (when positive), and takes the median of that collection.

        Returns:
            The overall median, or ``1`` when that median is below 1.
        """
        depths = []
        for variant_coverages in self.variant_covgs.values():
            for variant_covg in variant_coverages:
                ref_depth = variant_covg.reference_coverage.median_depth
                if ref_depth > 0:
                    depths.append(ref_depth)
        for versions in self.gene_presence_covgs.values():
            presence_median = median(
                [spc.median_depth for spc in versions.values()])
            if presence_median > 0:
                depths.append(presence_median)
        overall_median = median(depths)
        if overall_median < 1:
            return 1
        return overall_median

    def remove_temporary_files(self):
        """Ask the McCortex runner to delete its temporary files.

        Requires ``_run_cortex`` (or ``run``) to have been called first,
        because ``mc_cortex_runner`` is ``None`` until then.
        """
        self.mc_cortex_runner.remove_temporary_files()

    @property
    def panel_name(self):
        """Names of all loaded panels joined with ``-``."""
        return "-".join([panel.name for panel in self.panels])

    def _parse_summary_covgs_row(self, row):
        """Convert one coverage row into typed values.

        Args:
            row: Sequence of strings; column 0 is the probe name and columns
                2-5 are median depth, min depth, coverage fraction and k-mer
                count.  Column 1 is ignored.

        Returns:
            ``(name, median_depth, min_depth, percent_coverage, k_count)``
            where ``percent_coverage`` is column 4 times 100.  If a numeric
            column cannot be converted, a warning is logged and all numeric
            fields are returned as zero.
        """
        try:
            return (row[0], int(row[2]), int(row[3]), 100 * float(row[4]),
                    int(row[5]))
        except ValueError:
            logger.warning("Failed to parse %s", row)
            return row[0], 0, 0, 0.0, 0

    @staticmethod
    def _build_probe_coverage(parsed_row):
        """Build a ``ProbeCoverage`` from a ``_parse_summary_covgs_row`` tuple."""
        _, median_depth, min_depth, percent_coverage, k_count = parsed_row
        return ProbeCoverage(percent_coverage=percent_coverage,
                             median_depth=median_depth,
                             min_depth=min_depth,
                             k_count=k_count)

    def _parse_covgs(self):
        """Read the runner's coverage file and dispatch each row by type.

        The reader is kept on ``self.reader`` because
        ``_parse_variant_panel`` pulls the follow-on rows of a variant
        directly from it.
        """
        with open(self.mc_cortex_runner.covg_tmp_file_path, 'r') as infile:
            self.reader = csv.reader(infile, delimiter="\t")
            for row in self.reader:
                # Only the probe name is needed to choose a handler; the
                # handler parses the row in full, so parsing here as well
                # would just duplicate work (and any parse warning).
                allele_name = row[0].split('?')[0]
                if self._is_variant_panel(allele_name):
                    self._parse_variant_panel(row)
                else:
                    self._parse_seq_panel(row)

    def _is_variant_panel(self, allele_name):
        """Return True when the name's first ``-`` separated part is ref/alt."""
        return allele_name.split('-')[0] in ("ref", "alt")

    def _parse_seq_panel(self, row):
        """Record the coverage of a non-variant (sequence/species) probe row.

        ``variant`` and ``presence`` panel types store a
        ``SequenceProbeCoverage`` under ``covgs[panel_type][name][version]``.
        Every other panel type accumulates per-name totals instead.
        """
        parsed = self._parse_summary_covgs_row(row)
        allele, median_depth, _, percent_coverage, _ = parsed
        probe_coverage = self._build_probe_coverage(parsed)
        params = get_params(allele)
        panel_type = params.get("panel_type", "presence")
        name = params.get('name')
        version = params.get('version', '1')
        if panel_type in ("variant", "presence"):
            sequence_probe_coverage = SequenceProbeCoverage(
                name=name,
                probe_coverage=probe_coverage,
                version=version,
                length=params.get("length"))
            self.covgs[panel_type].setdefault(
                name, {})[version] = sequence_probe_coverage
        else:
            # Species panels are treated differently: keep running totals
            # per name rather than one coverage object per version.
            length = int(params.get("length", -1))
            totals = self.covgs.setdefault(panel_type, {}).setdefault(
                name,
                {
                    "total_bases": 0,
                    "percent_coverage": [],
                    "length": [],
                    "median": [],
                },
            )
            totals["total_bases"] += length
            # Only well-covered probes contribute to the per-probe lists.
            if percent_coverage > 75 and median_depth > 0:
                totals["percent_coverage"].append(percent_coverage)
                totals["length"].append(length)
                totals["median"].append(median_depth)

    def _parse_variant_panel(self, row):
        """Parse one variant: its first ref row plus the rows that follow it.

        ``row`` is the first reference probe of the variant.  Its
        ``num_alts`` parameter (default 0) tells how many further rows to
        pull from ``self.reader``: up to ``num_alts - 1`` more reference
        rows, then ``num_alts`` alternate rows.  If a non-ref row turns up
        while reference rows are still expected, a warning is logged, that
        row is taken as the first alternate, and one fewer alternate row is
        read afterwards.

        The resulting ``VariantProbeCoverage`` is appended to
        ``variant_covgs`` under the full name of the first row.

        Raises:
            AssertionError: If a row expected to be an alternate probe does
                not start with ``alt``.
        """
        parsed = self._parse_summary_covgs_row(row)
        allele = parsed[0]
        params = get_params(allele)
        if 'var_name' in params:
            var_name = params.get('var_name')
        else:
            var_name = allele.split('?')[0].split('-')[1]
        num_alts = int(params.get("num_alts", 0))
        reference_coverages = [self._build_probe_coverage(parsed)]
        alternate_coverages = []
        for _ in range(num_alts - 1):
            parsed = self._parse_summary_covgs_row(next(self.reader))
            ref_allele = parsed[0]
            if ref_allele.split('-')[0] != 'ref':
                logger.warning(
                    "Fewer ref alleles than alt alleles for %s", ref_allele)
                # This row has already been consumed, so keep it as the
                # first alternate and read one fewer alternate below.
                alternate_coverages.append(
                    self._build_probe_coverage(parsed))
                num_alts -= 1
                break
            reference_coverages.append(self._build_probe_coverage(parsed))
        for _ in range(num_alts):
            parsed = self._parse_summary_covgs_row(next(self.reader))
            alt_allele = parsed[0]
            # Explicit check instead of ``assert`` so it is still enforced
            # under ``python -O``; AssertionError is kept so callers see the
            # same exception type as before.
            if alt_allele.split('-')[0] != 'alt':
                raise AssertionError(
                    "Expected an alt probe row but got %s" % alt_allele)
            alternate_coverages.append(self._build_probe_coverage(parsed))
        variant_probe_coverage = VariantProbeCoverage(
            reference_coverages=reference_coverages,
            alternate_coverages=alternate_coverages,
            var_name=var_name,
            params=params)
        self.variant_covgs.setdefault(allele, []).append(
            variant_probe_coverage)