def _get_paired_samples(sample, data):
    """Attach the batch-matched "input" (background) BAM to a ChIP sample.

    Scans all pipeline samples for one sharing this sample's batch with
    phenotype "input" and records its work BAM under ``work_bam_input``.

    :param sample: data dict for the ChIP sample being prepared
    :param data: list of [sample] entries for every sample in the run
    :returns: ``[sample]`` on the first match (bcbio parallel convention),
              or ``None`` when no input sample exists — callers test for
              truthiness to decide whether to skip peak calling.
    """
    # Fix: removed the original bare `dd.get_phenotype(sample)` statement;
    # its return value was discarded and the getter only reads metadata.
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
def check_paired_problems(items):
    """Check for incorrectly paired tumor/normal samples in a batch.

    Raises ValueError for multiple tumors, normal-without-tumor, or
    paired-required callers (MuTect/MuTect2/Strelka2) lacking a normal.
    """
    # Only meaningful for paired batches; plain germline runs pass through.
    if not get_paired(items):
        return
    tumor_count = sum(1 for x in items if dd.get_phenotype(x).lower() == "tumor")
    if tumor_count > 1:
        raise ValueError("Unsupported configuration: found multiple tumor samples in batch %s: %s" %
                         (tz.get_in(["metadata", "batch"], items[0]),
                          [dd.get_sample_name(data) for data in items]))
    elif tumor_count == 0 and any(dd.get_phenotype(data).lower() == "normal" for data in items):
        raise ValueError("Found normal sample without tumor in batch %s: %s" %
                         (tz.get_in(["metadata", "batch"], items[0]),
                          [dd.get_sample_name(data) for data in items]))
    else:
        # These callers cannot run tumor-only without a normal or a panel.
        vcs = get_somatic_variantcallers(items)
        if any(caller in vcs for caller in ("mutect", "mutect2", "strelka2")):
            paired = get_paired(items)
            if not (paired.normal_data or paired.normal_panel):
                raise ValueError("MuTect, MuTect2 and Strelka2 somatic calling requires normal sample or panel: %s" %
                                 [dd.get_sample_name(data) for data in items])
def parallel_calling(data, run_parallel):
    """Run CpG calling per chromosome, pairing methylated (mC) with hydroxymethylated (hmC) samples.

    Needed only when comparing methylated versus hydroxymethylated data.
    Splits each sample by chromosome, runs "cpg_calling" in parallel, links
    each mC chunk to its batch-matched hmC chunk as a control, then runs
    "cpg_processing" and collects per-chromosome outputs back onto the
    original samples (mutated in place; note: no explicit return value).
    """
    out = []
    for sample in data:
        work_bam = dd.get_work_bam(sample[0])
        with closing(pysam.Samfile(work_bam, "rb")) as pysam_work_bam:
            chroms = pysam_work_bam.references
            for chrom in chroms:
                new_sample = copy.deepcopy(sample)
                # Skip alt/unplaced contigs (names containing "_").
                if chrom.find("_") > -1:
                    continue
                new_sample[0]['chr_to_run'] = chrom
                out.append(new_sample)
    out = run_parallel("cpg_calling", out)
    for sample in out:
        phenotype = dd.get_phenotype(sample[0])
        batch = dd.get_batch(sample[0])
        if phenotype == "mC":
            # Find the hmC chunk from the same batch and chromosome to use as control.
            for sample2 in out:
                if batch in dd.get_batch(sample2[0]) and dd.get_phenotype(
                        sample2[0]) == "hmC":
                    if sample[0]["chr_to_run"] == sample2[0]["chr_to_run"]:
                        sample[0]["control"] = sample2[0]["cpg_file"]
                        break
    out = run_parallel("cpg_processing", out)
    # Gather per-chromosome results back onto the original (unsplit) samples.
    for sample in data:
        sample[0]["cpg_split"] = []
        sample[0]["hmc_split"] = []
        name = dd.get_sample_name(sample[0])
        for chunck in out:
            if name == dd.get_sample_name(chunck[0]):
                sample[0]["cpg_split"].append(chunck[0]["cpg_file"])
                if "hmc_file" in chunck[0]:
                    sample[0]["hmc_split"].append(chunck[0]["hmc_file"])
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file.

    Returns [sample] with ``work_bam_input`` set on the first batch-matched
    "input" sample, or None when no input exists (callers test truthiness).
    """
    # NOTE(review): return value discarded — this call appears to be dead code.
    dd.get_phenotype(sample)
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(
                origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
def _check(sample, data):
    """Locate the matching "input" sample for a ChIP BAM, if one exists.

    ATAC samples and the sample itself always proceed; phenotype "input"
    samples are dropped (they are backgrounds, not callable samples).
    """
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    sample_batch = dd.get_batch(sample)
    for item in data:
        # First batch-matched input wins; record its work BAM as background.
        if sample_batch in dd.get_batch(item[0]) and dd.get_phenotype(item[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(item[0])
            break
    return [sample]
def _check(sample, data):
    """Locate the matching "input" sample for a ChIP BAM, if one exists.

    ATAC samples always proceed; phenotype "input" samples are dropped.
    Uses dd.get_batches (may be None, hence the ``or []`` guard).
    """
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    sample_batch = dd.get_batch(sample)
    for item in data:
        # First batch-matched input wins; record its work BAM as background.
        if sample_batch in (dd.get_batches(item[0]) or []) and dd.get_phenotype(item[0]) == "input":
            sample["work_bam_input"] = item[0].get("work_bam")
            break
    return [sample]
def _get_replicate_samples(sample, data):
    """Attach replicate BAMs (same batch and phenotype, different file) to a sample.

    Builds a comma-separated list of replicate work BAMs and stores it under
    ``work_bam_rep``. Control samples and the sample's own BAM are excluded.

    :returns: ``[sample]`` (always; ``work_bam_rep`` only set when a replicate
              was found).
    """
    # Fix: the accumulated comma list `rep_bam` is now what gets stored;
    # previously only the *last* matching replicate's BAM was assigned,
    # leaving the accumulator dead (compare `work_bam_input` handling in
    # the companion `_get_paired_samples`).
    rep_bam = ""
    for origin in data:
        if (dd.get_batch(sample) in dd.get_batch(origin[0])
                and dd.get_phenotype(sample) in dd.get_phenotype(origin[0])
                and dd.get_work_bam(sample) != dd.get_work_bam(origin[0])
                and dd.get_phenotype(origin[0]) != "control"):
            if rep_bam != "":
                rep_bam = rep_bam + "," + dd.get_work_bam(origin[0])
            else:
                rep_bam = dd.get_work_bam(origin[0])
            sample["work_bam_rep"] = rep_bam
    return [sample]
def _get_paired_samples(sample, data):
    """Attach all "control" sample BAMs to this sample as a comma-separated list.

    Stores the joined list (possibly empty) under ``work_bam_input`` and
    returns ``[sample]``.
    """
    # Preserved from original: result unused, call kept for exact behavior parity.
    dd.get_phenotype(sample)
    control_bams = [dd.get_work_bam(origin[0]) for origin in data
                    if dd.get_phenotype(origin[0]) == "control"]
    sample["work_bam_input"] = ",".join(control_bams)
    return [sample]
def check_paired_problems(items):
    """Check for incorrectly paired tumor/normal samples in a batch.

    Raises ValueError on multiple tumors or a normal without any tumor.
    """
    # Only meaningful for paired batches.
    if not get_paired(items):
        return
    tumor_count = sum(1 for x in items if dd.get_phenotype(x).lower() == "tumor")
    if tumor_count > 1:
        raise ValueError("Unsupported configuration: found multiple tumor samples in batch %s: %s" %
                         (tz.get_in(["metadata", "batch"], items[0]),
                          [dd.get_sample_name(data) for data in items]))
    elif tumor_count == 0 and any(dd.get_phenotype(data).lower() == "normal" for data in items):
        raise ValueError("Found normal sample without tumor in batch %s: %s" %
                         (tz.get_in(["metadata", "batch"], items[0]),
                          [dd.get_sample_name(data) for data in items]))
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Groups per-caller sample copies by (alignment BAM, batch), merges their
    'sv' outputs in the originally configured caller order, then regroups by
    batch and emits one entry per sample, deduplicating multi-batch samples
    via a lead batch.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        batch = dd.get_batch(x) or [dd.get_sample_name(x)]
        try:
            by_bam[x["align_bam"], tuple(batch)].append(x)
        except KeyError:
            by_bam[x["align_bam"], tuple(batch)] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            # Sort key: position of this copy's caller in the original caller list.
            orig_callers = tz.get_in(["config", "algorithm", "svcaller_orig"], x)
            cur_caller = tz.get_in(["config", "algorithm", "svcaller"], x)
            return orig_callers.index(cur_caller)
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            # Concatenate per-caller 'sv' lists in configured order and
            # restore the original (full) caller list on the merged sample.
            final["sv"] = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            final["config"]["algorithm"]["svcaller"] = final["config"][
                "algorithm"].pop("svcaller_orig")
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        if len(batches) > 1:
            # Remember the first batch so multi-batch samples emit only once.
            lead_batches[(dd.get_sample_name(final),
                          dd.get_phenotype(final) == "germline")] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            items = plot.by_regions(items)
        for data in items:
            # Emit if this is the sample's lead batch, or it had no lead batch.
            if lead_batches.get(
                    (dd.get_sample_name(data),
                     dd.get_phenotype(data) == "germline")) in [batch, None]:
                out.append([data])
    return out
def _get_vcf_samples(calls, items):
    """Determine which sample names a set of VCF call files applies to.

    Matches call file basenames against sample names and batch names,
    preferring single-sample matches over joint batch matches.
    """
    # NOTE(review): have_full_file is never set True in this version, so the
    # vcfutils.get_samples branch below is unreachable — confirm intent.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                # i == 0 is the sample's own name; later entries are batch names.
                for i, test_name in enumerate([dd.get_sample_name(data)] +
                                              dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(
                            ("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def _maybe_add_alignment(algorithm, sample, out):
    """Append alignment outputs (BAM/CRAM/bigWig plus indexes) to the upload list.

    Skips germline-phenotype samples and samples without alignment output.
    Returns the (mutated) ``out`` list.
    """
    if _has_alignment_file(algorithm, sample) and dd.get_phenotype(sample) != "germline":
        # (file, upload extension label, whether it is a "plus" secondary file)
        for (fname, ext, isplus) in [(sample.get("work_bam"), "ready", False),
                                     (sample.get("umi_bam"), "umi", False),
                                     (sample.get("bigwig"), "ready", False),
                                     (dd.get_disc_bam(sample), "disc", True),
                                     (dd.get_sr_bam(sample), "sr", True)]:
            if fname and os.path.exists(fname):
                # Map file type to its index extension.
                if fname.endswith("bam"):
                    ftype, fext = "bam", ".bai"
                elif fname.endswith("cram"):
                    ftype, fext = "cram", ".crai"
                elif fname.endswith("bw"):
                    ftype, fext = "bw", ".bw"
                else:
                    raise ValueError("Unexpected alignment file type %s" % fname)
                out.append({
                    "path": fname,
                    "type": ftype,
                    "plus": isplus,
                    "ext": ext
                })
                # Also upload the index when present alongside the data file.
                if utils.file_exists(fname + fext):
                    out.append({
                        "path": fname + fext,
                        "type": ftype + fext,
                        "plus": isplus,
                        "index": True,
                        "ext": ext
                    })
    return out
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    Writes multiqc_config.yaml into out_dir and returns its path.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d))
           for d in samples):
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}
    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order
    # Per-sample preseq settings only when preseq was configured.
    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def _maybe_add_alignment(algorithm, sample, out):
    """Append alignment outputs (BAM/CRAM/bigWig plus indexes) to the upload list.

    Germline-phenotype samples and samples without alignment files are skipped.
    Returns the (mutated) ``out`` list.
    """
    if not (_has_alignment_file(algorithm, sample) and dd.get_phenotype(sample) != "germline"):
        return out
    # (file, upload extension label, whether it is a "plus" secondary file)
    candidates = [(sample.get("work_bam"), "ready", False),
                  (sample.get("umi_bam"), "umi", False),
                  (sample.get("bigwig"), "ready", False),
                  (dd.get_disc_bam(sample), "disc", True),
                  (dd.get_sr_bam(sample), "sr", True)]
    index_exts = {"bam": ".bai", "cram": ".crai", "bw": ".bw"}
    for fname, ext, isplus in candidates:
        if not (fname and os.path.exists(fname)):
            continue
        # Determine file type from suffix; order matters ("bam" before "bw").
        ftype = next((t for t in ("bam", "cram", "bw") if fname.endswith(t)), None)
        if ftype is None:
            raise ValueError("Unexpected alignment file type %s" % fname)
        fext = index_exts[ftype]
        out.append({"path": fname, "type": ftype, "plus": isplus, "ext": ext})
        # Include the index alongside the data file when it exists.
        if utils.file_exists(fname + fext):
            out.append({"path": fname + fext, "type": ftype + fext, "plus": isplus,
                        "index": True, "ext": ext})
    return out
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling,
    as long as we have more than one caller per group.
    """
    samples = [utils.to_single_data(x) for x in samples]
    # Remember input order so output can be sorted to match it.
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)
    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        # Ensemble only makes sense with more than one caller per group.
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                batches |= set(dd.get_batches(d))
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(list(batches))[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)
    def by_original_order(d):
        # Earliest position of any of the group's samples in the input.
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
def _batch_split_by_sv(samples, stage):
    """Split samples into (svcaller, batch) groups for structural variant calling.

    :param samples: wrapped sample dicts
    :param stage: calling stage ("precall", "ensemble", "standard", ...)
    :returns: (to_process OrderedDict of (svcaller, batch) -> samples,
               extras: samples with no sv calling configured,
               background: all samples with sv calling)
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["precall", "ensemble"]:  # no batching for precall or ensemble methods
                    # Fix: use six.string_types instead of Python-2-only
                    # `basestring` (NameError on Python 3); consistent with
                    # the six usage elsewhere in this module.
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
def _batch_split_by_sv(samples, stage):
    """Split samples into (svcaller, batch) groups for structural variant calling.

    :returns: (to_process OrderedDict of (svcaller, batch) -> samples,
               extras: samples with no sv calling configured,
               background: all samples with sv calling)
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    # Give each sample its own batch key, suffixed for germline.
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
def _get_vcf_samples(calls, items):
    """Determine which sample names a set of VCF call files applies to.

    Matches call file basenames against sample names and batch names,
    preferring single-sample matches over joint batch matches.
    """
    # NOTE(review): have_full_file is never set True in this version, so the
    # vcfutils.get_samples branch below is unreachable — confirm intent.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                # i == 0 is the sample's own name; later entries are batch names.
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken,
                          qsignature, qualimap, samtools, picard, srna, umi, variant,
                          viral, preseq)
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        # Germline samples only get variant-level QC.
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        # Fix: `basestring` is Python-2-only (NameError on Python 3); use
        # six.string_types, consistent with six usage elsewhere in the module.
        elif out and isinstance(out, six.string_types) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    Writes multiqc_config.yaml into out_dir and returns its path.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d))
           for d in samples):
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}
    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order
    # Per-sample preseq settings only when preseq was configured.
    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken,
                          qsignature, qualimap, samtools, picard, srna, umi, variant,
                          viral, preseq)
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        # Germline samples only get variant-level QC.
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        # NOTE(review): `basestring` is Python-2-only and raises NameError on
        # Python 3 — confirm target interpreter (other versions in this code
        # use six.string_types here).
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
def _get_multiplier(samples):
    """Get multiplier to get jobs only for samples that have input.

    Returns (1 + number of "chip" phenotype samples) / total samples.
    """
    chip_count = sum(1.0 for s in samples if dd.get_phenotype(s[0]) == "chip")
    return (1.0 + chip_count) / len(samples)
def run(_, data, out_dir):
    """Prepare variants QC analysis: bcftools stats and snpEff output.

    Returns a dict of {"base": primary file, "secondary": other files},
    or None when no variant calling information is available.
    """
    vcinfo = get_active_vcinfo(data)
    stats = []
    if vcinfo:
        phenotype = dd.get_phenotype(data)
        has_germline = "germline" in vcinfo
        if phenotype == "normal" and has_germline:
            # Paired normal: only the germline stats are meaningful.
            stats.append(_bcftools_stats(data, out_dir, "germline", germline=True))
        elif phenotype != "germline":
            # Somatic stats, plus extracted germline stats when present.
            stats.append(_bcftools_stats(data, out_dir))
            if has_germline:
                stats.append(_bcftools_stats(data, out_dir, "germline", germline=True))
        else:
            # Pure germline sample.
            stats.append(_bcftools_stats(data, out_dir, germline=True))
        stats.append(_snpeff_stats(data, out_dir))
    stats = [s for s in stats if s]
    if stats:
        return {"base": stats[0], "secondary": stats[1:]}
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Groups per-caller sample copies by (alignment BAM, batch), merges their
    'sv' outputs in the originally configured caller order, then regroups by
    batch and emits one entry per sample, deduplicating multi-batch samples
    via a lead batch.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        batch = dd.get_batch(x) or [dd.get_sample_name(x)]
        try:
            by_bam[x["align_bam"], tuple(batch)].append(x)
        except KeyError:
            by_bam[x["align_bam"], tuple(batch)] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            # Sort key: position of this copy's caller in the original caller list.
            orig_callers = tz.get_in(["config", "algorithm", "svcaller_orig"], x)
            cur_caller = tz.get_in(["config", "algorithm", "svcaller"], x)
            return orig_callers.index(cur_caller)
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x], key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            # Concatenate per-caller 'sv' lists in configured order and
            # restore the original (full) caller list on the merged sample.
            final["sv"] = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            final["config"]["algorithm"]["svcaller"] = final["config"]["algorithm"].pop("svcaller_orig")
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        if len(batches) > 1:
            # Remember the first batch so multi-batch samples emit only once.
            lead_batches[(dd.get_sample_name(final), dd.get_phenotype(final) == "germline")] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            items = plot.by_regions(items)
        for data in items:
            # Emit if this is the sample's lead batch, or it had no lead batch.
            if lead_batches.get((dd.get_sample_name(data), dd.get_phenotype(data) == "germline")) in [batch, None]:
                out.append([data])
    return out
def _get_multiplier(samples):
    """Get multiplier to get jobs only for samples that have input.

    Counts first-replicate, non-control samples; returns the ratio over
    total samples, floored at 1.0.
    """
    eligible = 1.0 + sum(1.0 for s in samples
                         if dd.get_phenotype(s[0]) != "control" and dd.get_replicate(s[0]) == 1)
    ratio = eligible / len(samples)
    # Never scale jobs below the full sample count.
    return ratio if ratio >= 1.0 else 1.0
def check_paired_problems(items): """Check for incorrectly paired tumor/normal samples in a batch. """ # ensure we're in a paired batch if not get_paired(items): return num_tumor = len([x for x in items if dd.get_phenotype(x).lower() == "tumor"]) if num_tumor > 1: raise ValueError("Unsupported configuration: found multiple tumor samples in batch %s: %s" % (tz.get_in(["metadata", "batch"], items[0]), [dd.get_sample_name(data) for data in items])) elif num_tumor == 0 and any(dd.get_phenotype(data).lower() == "normal" for data in items): raise ValueError("Found normal sample without tumor in batch %s: %s" % (tz.get_in(["metadata", "batch"], items[0]), [dd.get_sample_name(data) for data in items])) else: vcs = get_somatic_variantcallers(items) if "mutect" in vcs or "mutect2" in vcs or "strelka2" in vcs: paired = get_paired(items) if not (paired.normal_data or paired.normal_panel): raise ValueError("MuTect, MuTect2 and Strelka2 somatic calling requires normal sample or panel: %s" % [dd.get_sample_name(data) for data in items])
def _select_sample(data, variant_file, work_dir):
    """Subset the original call file down to the current sample with bcftools.

    Germline-phenotype inputs are first renamed to the "-germline" sample
    convention. Returns the bgzipped, indexed single-sample VCF.
    """
    sample_name = dd.get_sample_name(data)
    if dd.get_phenotype(data) == "germline":
        variant_file = germline.fix_germline_samplename(variant_file, sample_name, data)
    base = utils.splitext_plus(os.path.basename(variant_file))[0]
    out_file = os.path.join(work_dir, "%s-%s.vcf.gz" % (base, sample_name))
    # Skip the extraction when an up-to-date output already exists.
    if not utils.file_uptodate(out_file, variant_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("bcftools view -s {sample_name} -O z -o {tx_out_file} {variant_file}"
                   .format(sample_name=sample_name, tx_out_file=tx_out_file,
                           variant_file=variant_file))
            do.run(cmd, "Run manta SV analysis")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _get_multiplier(samples):
    """Get multiplier to get jobs only for samples that have input.

    ChIP and ATAC samples count toward processing; everything else is skipped.
    """
    to_process = 1.0
    to_skip = 0
    for sample in samples:
        d = sample[0]
        # `or` short-circuits: get_chip_method only consulted for non-chip samples,
        # matching the original if/elif evaluation order.
        if dd.get_phenotype(d) == "chip" or dd.get_chip_method(d).lower() == "atac":
            to_process += 1.0
        else:
            to_skip += 1.0
    return (to_process - to_skip) / len(samples)
def extract_germline_vcinfo(data, out_dir):
    """Extract germline VCFs from existing tumor inputs.

    Returns the first variant entry from a germline-capable caller,
    extracting and caching its germline VCF if not already present;
    None for non-tumor samples or when no supported caller is found.
    """
    germline_capable = {"vardict", "octopus", "freebayes"}
    if dd.get_phenotype(data) not in ["tumor"]:
        return None
    for v in _get_variants(data):
        if v.get("variantcaller") not in germline_capable:
            continue
        if not v.get("germline"):
            # Extract germline calls on a copy so the original data is untouched.
            d = utils.deepish_copy(data)
            d["vrn_file"] = v["vrn_file"]
            gd = germline.extract(d, [d], out_dir)
            v["germline"] = gd["vrn_file_plus"]["germline"]
        return v
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    Returns the (possibly mutated) data dict.
    """
    if vcfutils.get_paired_phenotype(data):
        # Tumor-only in a batch of one: strip somatic prioritization to get germline calls.
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # Rename the in-VCF sample to the "-germline" suffixed bcbio sample name.
        if (sample_name.endswith("-germline") and len(vcf_samples) == 1
              and sample_name.replace("-germline", "") == vcf_samples[0]):
            data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    Returns a dict of boolean flags keyed by analysis step name.
    """
    return {
        "vc": any(dd.get_variantcaller(d) or d.get("vrn_file") for d in samples),
        "sv": any(dd.get_svcaller(d) for d in samples),
        "jointvc": any(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)
                       for d in samples),
        "hla": any(dd.get_hlacaller(d) for d in samples),
        "align": any(dd.get_aligner(d) or dd.get_bam_clean(d) for d in samples),
        # Split alignment unless every sample either disables splitting or has no aligner.
        "align_split": not all(dd.get_align_split_size(d) is False or not dd.get_aligner(d)
                               for d in samples),
        "umi": any(dd.get_umi_consensus(d) for d in samples),
        "ensemble": any(dd.get_ensemble(d) for d in samples),
        "cancer": any(dd.get_phenotype(d) in ["tumor"] for d in samples),
    }
def _get_multiplier(samples):
    """Get multiplier to get jobs only for samples that have input.

    ChIP and ATAC samples count toward processing; others are skipped.
    """
    to_process = 1.0
    to_skip = 0
    for sample in samples:
        if dd.get_phenotype(sample[0]) == "chip":
            to_process += 1.0
        elif dd.get_chip_method(sample[0]).lower() == "atac":
            to_process += 1.0
        else:
            to_skip += 1.0
    mult = (to_process - to_skip) / len(samples)
    if mult <= 0:
        # NOTE(review): on Python 2, 1 / len(samples) is integer division and
        # yields 0 — confirm target interpreter.
        mult = 1 / len(samples)
    # NOTE(review): max(mult, 1) forces a multiplier >= 1, which makes the
    # fractional computation above moot — confirm whether min(mult, 1) or a
    # plain return was intended.
    return max(mult, 1)
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    Returns the (possibly mutated) data dict.
    """
    if vcfutils.get_paired_phenotype(data):
        # Tumor-only in a batch of one: strip somatic prioritization to get germline calls.
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # Rename the in-VCF sample to the "-germline" suffixed bcbio sample name.
        if (sample_name.endswith("-germline") and len(vcf_samples) == 1
              and sample_name.replace("-germline", "") == vcf_samples[0]):
            data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    Returns a dict of boolean flags keyed by analysis step name.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    # Split alignment unless every sample either disables splitting or has no aligner.
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
def peakcall_prepare(data, run_parallel):
    """Entry point for doing peak calling.

    Builds one work item per (chip sample, configured peak caller), pairing
    each with its input/background BAM, runs them in parallel and syncs
    results back onto the sample set.
    """
    caller_fns = get_callers()
    to_process = []
    for sample in data:
        mimic = copy.copy(sample[0])
        for caller in dd.get_peakcaller(sample[0]):
            if caller in caller_fns and dd.get_phenotype(mimic) == "chip":
                mimic["peak_fn"] = caller
                name = dd.get_sample_name(mimic)
                # NOTE(review): mimic is rebound to the [sample] list returned
                # here; with multiple configured callers the next iteration's
                # dd.get_phenotype(mimic) would see a list — confirm only one
                # matching caller is expected per sample.
                mimic = _get_paired_samples(mimic, data)
                if mimic:
                    to_process.append(mimic)
                else:
                    # None means no batch-matched input sample was found.
                    logger.info("Skipping peak calling. No input sample for %s" % name)
    if to_process:
        after_process = run_parallel("peakcalling", to_process)
        data = _sync(data, after_process)
    return data
def _batch_split_by_sv(samples, stage):
    """Split samples into (svcaller, batch) groups for structural variant calling.

    Returns
    - to_process = svcaller-batch => [svcaller-sample1, svcaller-sample2...] odict
    - extras = samples without sv calling (should there be any?)
    - background = all samples with sv calling configured
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        # data = sample
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            # why appending every sample to background?
            background.append(data)
            # x is sample - sv caller pair
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    # Give each sample its own batch key, suffixed for germline.
                    if isinstance(batch, six.string_types
                                  ) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                # just creating PON - no calling
                if stage in ["standard"] and batch in ["pon_build"]:
                    extras.append(x)
                else:
                    batches = batch if isinstance(batch, (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
def peakcall_prepare(data, run_parallel):
    """Entry point for doing peak calling"""
    available_callers = get_callers()
    queued = []
    for item in data:
        candidate = copy.copy(item[0])
        for peak_caller in dd.get_peakcaller(item[0]):
            if peak_caller in available_callers and dd.get_phenotype(candidate) == "chip":
                candidate["peak_fn"] = peak_caller
                sample_name = dd.get_sample_name(candidate)
                # Pair with the matching input sample; returns a falsy value
                # when no input exists, in which case we skip this caller.
                candidate = _get_paired_samples(candidate, data)
                if candidate:
                    queued.append(candidate)
                else:
                    logger.info(
                        "Skipping peak calling. No input sample for %s" % sample_name)
    if queued:
        processed = run_parallel("peakcalling", queued)
        data = _sync(data, processed)
    return data
def splicecall_prepare(data, run_parallel):
    """Entry point for doing alternative splice callers"""
    # NOTE(review): gtf_file is assigned but never used below — confirm whether
    # it can be removed or is needed by a planned extension.
    gtf_file = dd.get_gtf_file(data)
    caller_fns = get_callers()
    to_process = []
    caller = "rmats"
    for sample in data:
        # Only process the first replicate of each sample group.
        if dd.get_replicate(sample[0]) == 1:
            mimic = copy.copy(sample[0])
            if caller in dd.get_splicecaller(sample[0]):
                # Skip control-phenotype samples; they serve as pairing input.
                if caller in caller_fns and dd.get_phenotype(mimic) != "control":
                    mimic["rmats_fn"] = caller
                    name = dd.get_sample_name(mimic)
                    # NOTE(review): rep_mimic is never read afterwards — presumably
                    # _get_replicate_samples has a needed side effect; verify.
                    rep_mimic = _get_replicate_samples(mimic, data)
                    mimic = _get_paired_samples(mimic, data)
                    if mimic:
                        to_process.append(mimic)
                    else:
                        logger.info("Skipping alternative splice calling. No input sample for %s" % name)
    if to_process:
        after_process = run_parallel("splicecalling", to_process)
        data = _sync(data, after_process)
    return data
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d))
           for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}
        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [tz.get_in(["summary", "metrics", "Avg_coverage"], s) for s in samples]
        avg_depths = [x for x in avg_depths if x]
        # Picking all thresholds up to the highest sample average depth
        thresholds = [t for t in coverage.DEPTH_THRESHOLDS if not avg_depths or t <= max(avg_depths)]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])
        # Showing only thresholds surrounding any of average depths
        thresholds_hidden = []
        for i, t in enumerate(thresholds):
            if t > 20:  # Not hiding anything below 20x
                # Keep a threshold only if a sample average depth falls in the
                # interval just below or just above it; the `if` clauses inside
                # the generators guard the index bounds before evaluation.
                if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                   any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                    pass
                else:
                    thresholds_hidden.append(t)
        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden]}
    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {"family_id": False, "sex_het_ratio": False,
                                             "error_sex_check": False}
    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {}) or  # tumor-only somatic with germline extraction
           dd.get_phenotype(s) == "germline" or  # or paired somatic with germline calling for normal
           _has_bcftools_germline_stats(s)  # CWL organized statistics
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "salmon",
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order
    # Only emit preseq configuration for samples that requested it.
    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    # Serialize the assembled MultiQC configuration and return its path.
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def run_peddy(samples, out_dir=None):
    """Run peddy QC on a batch of samples, returning samples with QC attached.

    Writes a `<prefix>-failed.log` marker and returns early when peddy cannot
    or should not run, so reruns skip the work.
    """
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    vcf_file = None
    # Find a usable VCF: germline/non-tumor samples use active calls,
    # tumor samples fall back to extracted germline calls.
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            # Prefer a dedicated germline file over the general variant file;
            # accept only files containing this sample with unfiltered variants.
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(
                        vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(
                            d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[
                                key] and vcfutils.vcf_has_nonfiltered_variants(
                                    vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed(
        "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        # Record the reason for skipping so reruns short-circuit.
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit;
            # consider narrowing to Exception.
            except:
                # Keep the last 100 stderr lines for diagnosis.
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                # True for stderr lines matching known-benign peddy/sklearn failures.
                def allowed_errors(l):
                    return (
                        (l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and min") >= 0) or
                        (l.find(
                            "Input contains NaN, infinity or a value too large for dtype"
                        ) >= 0))

                # True for the noisy "no intervals found" cyvcf2 messages.
                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                # Treat as non-fatal when any benign error appears or every
                # line is a "no intervals" message; otherwise re-raise.
                if any([allowed_errors(l) for l in to_show]) or all(
                        [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            # Move outputs from the transactional directory into place.
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def _create_config_file(out_dir, samples):
    """Provide configuration file for multiqc report."""
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}
    # Trim mosdepth suffixes from sample names in the report.
    extra_fn_clean_trim = []
    extra_fn_clean_trim.extend(
        ["coverage.mosdepth.region.dist", "coverage.mosdepth.global.dist"])
    out["extra_fn_clean_trim"] = extra_fn_clean_trim
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d)
            or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}
        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [
            tz.get_in(["summary", "metrics", "Avg_coverage"], s)
            for s in samples
        ]
        avg_depths = [x for x in avg_depths if x]
        # Picking all thresholds up to the highest sample average depth
        thresholds = [
            t for t in coverage.DEPTH_THRESHOLDS
            if not avg_depths or t <= max(avg_depths)
        ]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])
        # Showing only thresholds surrounding any of average depths
        thresholds_hidden = []
        for i, t in enumerate(thresholds):
            if t > 20:  # Not hiding anything below 20x
                # Keep thresholds that bracket some sample average depth; the
                # `if` clauses in the generators guard index bounds.
                if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                   any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                    pass
                else:
                    thresholds_hidden.append(t)
        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden':
            [str(t) for t in thresholds_hidden]
        }
    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {
        "family_id": False,
        "sex_het_ratio": False,
        "error_sex_check": False
    }
    # Setting the module order
    module_order = []
    module_order.extend(["bcbio", "samtools", "goleft_indexcov", "peddy"])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {}) or  # tumor-only somatic with germline extraction
           dd.get_phenotype(s) == "germline" or  # or paired somatic with germline calling for normal
           _has_bcftools_germline_stats(s)  # CWL organized statistics
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([
            {
                'bcftools': {
                    'name': 'Bcftools (somatic)',
                    'info': 'Bcftools stats for somatic variant calls only.',
                    'path_filters': ['*_bcftools_stats.txt'],
                    'custom_config': {
                        'write_general_stats': True
                    },
                }
            },
            {
                'bcftools': {
                    'name': 'Bcftools (germline)',
                    'info': 'Bcftools stats for germline variant calls only.',
                    'path_filters': ['*_bcftools_stats_germline.txt'],
                    'custom_config': {
                        'write_general_stats': False
                    },
                }
            },
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "salmon", "star", "picard", "qualimap", "snpeff", "bismark", "fastqc",
        "preseq"
    ])
    out["module_order"] = module_order
    # Only emit preseq configuration for samples that requested it.
    preseq_samples = [
        s for s in samples
        if tz.get_in(["config", "algorithm", "preseq"], s)
    ]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    # Serialize the assembled MultiQC configuration and return its path.
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out,
                       out_handle,
                       default_flow_style=False,
                       allow_unicode=False)
    return out_file