def run_freebayes(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run FreeBayes variant calling, dispatching on the kind of analysis.

    Three cases are handled:

    - not a paired analysis: ``vcfutils.check_paired_problems`` is run on
      ``items`` and the standard caller is used;
    - paired analysis whose pair has no normal BAM: the standard caller is
      used, with the pair passed along as ``somatic``;
    - paired analysis with a normal BAM: the paired caller is used.

    Returns whatever the selected caller returns (the call file).
    """
    if not is_paired_analysis(align_bams, items):
        vcfutils.check_paired_problems(items)
        return _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                                     region, out_file)

    pair = get_paired_bams(align_bams, items)
    if pair.normal_bam:
        return _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                                     region, out_file)
    # Paired analysis without a normal BAM: use the standard caller and hand
    # it the pair information.
    return _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                                 region, out_file, somatic=pair)
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    The caller name is read from the first item's
    ``config -> algorithm -> svcaller`` and resolved to a function through
    ``_get_callers(items, stage)``. When either is missing, every item is
    passed through unchanged. Otherwise the caller function is invoked on
    ``items``; samples from ``all_items`` that are not in ``items`` are added
    as a second ``background`` argument when ``all_items`` is given, the
    caller is listed in ``_NEEDS_BACKGROUND`` and the inputs are not a paired
    analysis.

    Returns a list of single-element lists, one per output sample. When the
    first item carries a ``"cwl_keys"`` entry, each sample's ``"sv"`` list
    (which must then hold exactly one element) is replaced by that element.
    """
    items = [utils.to_single_data(item) for item in items]
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage).get(svcaller)

    if not (svcaller and caller_fn):
        out = [[data] for data in items]
    elif (all_items and svcaller in _NEEDS_BACKGROUND
          and not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
        batch_names = set(dd.get_sample_name(x) for x in items)
        background = [x for x in all_items if dd.get_sample_name(x) not in batch_names]
        out = [[svdata] for svdata in caller_fn(items, background)]
    else:
        out = [[svdata] for svdata in caller_fn(items)]

    if "cwl_keys" not in items[0]:
        return out

    # Avoid nesting of callers for CWL runs for easier extraction
    flattened = []
    for data in [utils.to_single_data(x) for x in out]:
        sv_calls = data.get("sv")
        if sv_calls:
            assert len(sv_calls) == 1, sv_calls
            data["sv"] = sv_calls[0]
        flattened.append([data])
    return flattened
def detect_sv(items, all_items, config):
    """Top level parallel target for examining structural variation.

    The active caller is read from ``config["algorithm"]["svcaller_active"]``:

    - no active caller: ``items`` is returned untouched, wrapped in a list;
    - caller in ``_CALLERS``: exactly one item is expected (asserted); the
      caller's result is stored under that item's ``"sv"`` key;
    - caller in ``_BATCH_CALLERS``: the caller runs on all ``items``. Samples
      from ``all_items`` that are not in ``items`` are passed as a second
      ``background`` argument when the caller is in ``_NEEDS_BACKGROUND`` and
      the inputs are not a paired analysis.

    Returns a list of lists of sample data.

    Raises ValueError when the active caller is in neither registry.
    """
    svcaller = config["algorithm"].get("svcaller_active")
    if not svcaller:
        return [items]

    if svcaller in _CALLERS:
        assert len(items) == 1
        sample = items[0]
        sample["sv"] = _CALLERS[svcaller](sample)
        return [[sample]]

    if svcaller not in _BATCH_CALLERS:
        raise ValueError("Unexpected structural variant caller: %s" % svcaller)

    if (svcaller in _NEEDS_BACKGROUND
            and not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
        batch_names = set(tz.get_in(["rgnames", "sample"], x) for x in items)
        background = [x for x in all_items
                      if tz.get_in(["rgnames", "sample"], x) not in batch_names]
        call_args = (items, background)
    else:
        call_args = (items,)
    return [[svdata] for svdata in _BATCH_CALLERS[svcaller](*call_args)]
def detect_sv(items, all_items, config):
    """Top level parallel target for examining structural variation.

    Dispatches on ``config["algorithm"]["svcaller_active"]``. With no active
    caller the input ``items`` list is returned as the single element of the
    output. A caller in ``_CALLERS`` runs on exactly one sample (asserted)
    and stores its result under that sample's ``"sv"`` key. A caller in
    ``_BATCH_CALLERS`` runs on the whole batch, with the samples from
    ``all_items`` not present in ``items`` supplied as background when the
    caller is in ``_NEEDS_BACKGROUND`` and the batch is not a paired
    analysis.

    Returns a list of lists of sample data.

    Raises ValueError for a caller name found in neither registry.
    """
    def _sample(data):
        # Sample name as stored under rgnames -> sample.
        return tz.get_in(["rgnames", "sample"], data)

    svcaller = config["algorithm"].get("svcaller_active")
    results = []
    if not svcaller:
        results.append(items)
    elif svcaller in _CALLERS:
        assert len(items) == 1
        single = items[0]
        single["sv"] = _CALLERS[svcaller](single)
        results.append([single])
    elif svcaller in _BATCH_CALLERS:
        use_background = (svcaller in _NEEDS_BACKGROUND
                          and not vcfutils.is_paired_analysis(
                              [x.get("align_bam") for x in items], items))
        if use_background:
            in_batch = set([_sample(x) for x in items])
            background = [x for x in all_items if _sample(x) not in in_batch]
            called = _BATCH_CALLERS[svcaller](items, background)
        else:
            called = _BATCH_CALLERS[svcaller](items)
        results.extend([svdata] for svdata in called)
    else:
        raise ValueError("Unexpected structural variant caller: %s" % svcaller)
    return results
def run_freebayes(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run FreeBayes variant calling, either paired tumor/normal or germline calling.

    ``items`` is first passed through
    ``shared.add_highdepth_genome_exclusion``. After that:

    - not a paired analysis: ``vcfutils.check_paired_problems`` is run and
      the standard caller is used on all inputs;
    - paired analysis whose pair has no normal BAM: the standard caller is
      used with the pair passed as ``somatic``;
    - paired analysis with a normal BAM: the paired caller receives only the
      tumor and normal BAMs and data, in that order.

    Returns the call file produced by the selected caller.
    """
    items = shared.add_highdepth_genome_exclusion(items)

    if not is_paired_analysis(align_bams, items):
        vcfutils.check_paired_problems(items)
        return _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                                     region, out_file)

    pair = get_paired_bams(align_bams, items)
    if not pair.normal_bam:
        return _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                                     region, out_file, somatic=pair)

    # Restrict the paired run to the tumor/normal pair, tumor first.
    pair_bams = [pair.tumor_bam, pair.normal_bam]
    pair_items = [pair.tumor_data, pair.normal_data]
    return _run_freebayes_paired(pair_bams, pair_items, ref_file, assoc_files,
                                 region, out_file)
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.

    call_fn -- worker invoked as
        ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)``
    name -- caller name, used only in the log message
    align_bams -- input BAM files; the first one names the default output
    items -- sample data; the configuration is taken from ``items[0]["config"]``
    assoc_files -- mapping queried for an optional ``"dbsnp"`` entry
    region -- region passed to ``subset_variant_regions``
    out_file -- output VCF; derived from the inputs when None

    When the output does not exist yet, either an empty VCF is written (the
    variant regions are set, the subset target is a string and no file exists
    at that path) or ``call_fn`` is run inside a file transaction. A ``.gz``
    output is then passed through ``vcfutils.bgzip_and_index``.

    Returns the result of ``annotation.annotate_nongatk_vcf`` on the output.
    """
    cfg = items[0]["config"]

    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): the key is spelled "metdata" (not "metadata");
            # kept as-is to preserve behavior -- confirm which key exists.
            out_file = "%s-paired-variants.vcf.gz" % cfg["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]

    if not file_exists(out_file):
        log_line = "Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0]))
        logger.debug(log_line)

        population_regions = bedutils.population_variant_regions(items)
        variant_regions = bedutils.merge_overlaps(population_regions, items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)

        # A string target that is not an existing file means there is nothing
        # to call in this region.
        targets_missing = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
        if targets_missing:
            vcfutils.write_empty_vcf(out_file, cfg)
        else:
            with file_transaction(cfg, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)

    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, cfg)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, cfg)
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.

    call_fn -- worker invoked as
        ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)``
    name -- caller name, used only in the log message
    align_bams -- input BAM files; each is indexed via ``bam.index`` before
        calling, and the first one names the default output
    items -- sample data; the configuration is taken from ``items[0]["config"]``
    assoc_files -- mapping that must provide a ``"dbsnp"`` entry
    region -- region passed to ``subset_variant_regions`` and
        ``realign.has_aligned_reads``
    out_file -- output VCF; derived from the inputs when None

    When the output does not exist yet, an empty VCF is written if the
    subset target is a string that is not an existing file (with variant
    regions configured) or if any BAM reports no aligned reads in the
    region; otherwise ``call_fn`` runs inside a file transaction.

    Returns the result of ``annotation.annotate_nongatk_vcf`` on the output.
    """
    cfg = items[0]["config"]

    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): the key is spelled "metdata" (not "metadata");
            # kept as-is to preserve behavior -- confirm which key exists.
            out_file = "%s-paired-variants.vcf" % cfg["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]

    if not file_exists(out_file):
        log_line = "Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0]))
        logger.info(log_line)

        for bam_file in align_bams:
            bam.index(bam_file, cfg)

        variant_regions = cfg["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)

        targets_missing = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
        # The read check only runs when the targets are usable (short-circuit).
        if targets_missing or not all(realign.has_aligned_reads(x, region)
                                      for x in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)

    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"],
                                           ref_file, cfg)
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.

    call_fn -- worker invoked as
        ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)``
    name -- caller name, used only in the log message
    align_bams -- input BAM files; the first one names the default output
    items -- sample data; the configuration is taken from ``items[0]["config"]``
        and the list is also forwarded to ``subset_variant_regions``
    assoc_files -- mapping queried for an optional ``"dbsnp"`` entry
    region -- region passed to ``subset_variant_regions``
    out_file -- output VCF; derived from the inputs when None

    When the output does not exist yet, either an empty VCF is written (the
    variant regions are set, the subset target is a string and no file exists
    at that path) or ``call_fn`` is run inside a file transaction. A ``.gz``
    output is then passed through ``vcfutils.bgzip_and_index``.

    Returns the result of ``annotation.annotate_nongatk_vcf`` on the output.
    """
    cfg = items[0]["config"]

    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): the key is spelled "metdata" (not "metadata");
            # kept as-is to preserve behavior -- confirm which key exists.
            out_file = "%s-paired-variants.vcf.gz" % cfg["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]

    if not file_exists(out_file):
        first_bam_name = os.path.basename(align_bams[0])
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam_name))

        variant_regions = bedutils.merge_overlaps(
            bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region,
                                                out_file, items=items)

        have_targets = not (variant_regions is not None
                            and isinstance(target_regions, basestring)
                            and not os.path.isfile(target_regions))
        if have_targets:
            with file_transaction(cfg, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
        else:
            # String target with no file behind it: nothing to call here.
            vcfutils.write_empty_vcf(out_file, cfg)

    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, cfg)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, cfg)
def run_freebayes(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run FreeBayes through the paired or the standard caller.

    The paired caller is used when ``is_paired_analysis`` reports a paired
    analysis for the inputs; otherwise the standard caller is used. Both
    receive the same arguments, and the selected caller's result (the call
    file) is returned.
    """
    if is_paired_analysis(align_bams, items):
        runner = _run_freebayes_paired
    else:
        runner = _run_freebayes_caller
    return runner(align_bams, items, ref_file, assoc_files, region, out_file)
def run_vardict(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run VarDict variant calling.

    A paired analysis goes to the paired VarDict runner. Anything else is
    first checked with ``vcfutils.check_paired_problems`` and then sent to
    the standard runner. Returns the call file from the selected runner.
    """
    if vcfutils.is_paired_analysis(align_bams, items):
        return _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                                   region, out_file)

    vcfutils.check_paired_problems(items)
    return _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                               region, out_file)
def run_freebayes(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run FreeBayes variant calling, either paired tumor/normal or germline calling.

    The choice between the two runners is made by ``is_paired_analysis``;
    both receive identical arguments. Returns the resulting call file.
    """
    if is_paired_analysis(align_bams, items):
        return _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                                     region, out_file)
    return _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                                 region, out_file)
def run_scalpel(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run Scalpel indel calling, either paired tumor/normal or germline calling.

    ``region`` defaults to None in the signature but is mandatory here.
    Returns the call file from the paired runner for a paired analysis and
    from the standard runner otherwise.

    Raises ValueError when ``region`` is None.
    """
    if region is None:
        raise ValueError("A region must be provided for Scalpel")

    if is_paired_analysis(align_bams, items):
        runner = _run_scalpel_paired
    else:
        runner = _run_scalpel_caller
    return runner(align_bams, items, ref_file, assoc_files, region, out_file)
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run strelka2 variant calling, either paired tumor/normal or germline calling.

    Inputs that are not a paired analysis go to the germline runner. For a
    paired analysis the tumor/normal pair is resolved with
    ``vcfutils.get_paired_bams`` and handed to the somatic runner; a pair
    without a normal BAM fails the assertion below.

    Returns the call file from the selected runner.
    """
    if not vcfutils.is_paired_analysis(align_bams, items):
        return _run_germline(align_bams, items, ref_file, assoc_files,
                             region, out_file)

    pair = vcfutils.get_paired_bams(align_bams, items)
    assert pair.normal_bam, "Strelka2 requires a normal sample"
    return _run_somatic(pair, ref_file, assoc_files, region, out_file)
def _pick_lead_item(items):
    """Pick single representative sample for batch calling to attach calls to.

    For cancer samples, attach to tumor: in a paired analysis the first item
    whose paired phenotype is ``"tumor"`` is returned. Otherwise the first
    item is returned.

    Raises ValueError when a paired analysis contains no tumor sample.
    """
    bam_files = [item["align_bam"] for item in items]
    if not vcfutils.is_paired_analysis(bam_files, items):
        return items[0]

    for candidate in items:
        if vcfutils.get_paired_phenotype(candidate) == "tumor":
            return candidate
    raise ValueError("Did not find tumor sample in paired tumor/normal calling")
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run DeepVariant germline calling for a single sample.

    region can be a single region or list of multiple regions for multicore
    calling.

    Two preconditions are enforced with ``assert``: the inputs must not be a
    paired analysis, and ``items`` must hold exactly one sample. The first
    BAM and first item are then handed to ``_run_germline``, whose result is
    returned. ``assoc_files`` is accepted but not used here.
    """
    def _sample_names():
        # Only evaluated when an assertion fails, to build its message.
        return ", ".join([dd.get_sample_name(d) for d in items])

    assert not vcfutils.is_paired_analysis(align_bams, items), \
        "DeepVariant currently only supports germline calling: %s" % _sample_names()
    assert len(items) == 1, \
        "DeepVariant currently only supports single sample calling: %s" % _sample_names()
    return _run_germline(align_bams[0], items[0], ref_file, region, out_file)
def run_varscan(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run VarScan through ``samtools.shared_variantcall``.

    The worker passed to the shared driver is ``_varscan_paired`` for a
    paired analysis and ``_varscan_work`` otherwise; everything else is
    forwarded unchanged under the caller name ``"varscan"``. Returns the
    shared driver's result (the call file).
    """
    if is_paired_analysis(align_bams, items):
        work_fn = _varscan_paired
    else:
        work_fn = _varscan_work
    return samtools.shared_variantcall(work_fn, "varscan", align_bams, ref_file,
                                       items, assoc_files, region, out_file)
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run strelka2 variant calling, either paired tumor/normal or germline calling.

    region can be a single region or list of multiple regions for multicore
    calling.

    A paired analysis is resolved to its tumor/normal pair and sent to the
    somatic runner; the pair must include a normal BAM (asserted). All other
    inputs go to the germline runner. Returns the resulting call file.
    """
    paired_run = vcfutils.is_paired_analysis(align_bams, items)
    if paired_run:
        pair = vcfutils.get_paired_bams(align_bams, items)
        assert pair.normal_bam, "Strelka2 requires a normal sample"
        return _run_somatic(pair, ref_file, assoc_files, region, out_file)
    return _run_germline(align_bams, items, ref_file, assoc_files,
                         region, out_file)
def run_scalpel(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run Scalpel indel calling, either paired tumor/normal or germline calling.

    Despite its None default, ``region`` must be supplied. The paired runner
    handles a paired analysis and the standard runner handles everything
    else; the selected runner's call file is returned.

    Raises ValueError when ``region`` is None.
    """
    if region is None:
        raise ValueError("A region must be provided for Scalpel")

    if is_paired_analysis(align_bams, items):
        return _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                   region, out_file)
    return _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                               region, out_file)
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run DeepVariant germline calling for a single sample and index the output.

    region can be a single region or list of multiple regions for multicore
    calling.

    Two preconditions are enforced with ``assert``: the inputs must not be a
    paired analysis, and ``items`` must hold exactly one sample. The germline
    runner's output is passed through ``vcfutils.bgzip_and_index`` using the
    sample's config, and that result is returned. ``assoc_files`` is accepted
    but not used here.
    """
    def _sample_names():
        # Only evaluated when an assertion fails, to build its message.
        return ", ".join([dd.get_sample_name(d) for d in items])

    assert not vcfutils.is_paired_analysis(align_bams, items), \
        "DeepVariant currently only supports germline calling: %s" % _sample_names()
    assert len(items) == 1, \
        "DeepVariant currently only supports single sample calling: %s" % _sample_names()

    called_vcf = _run_germline(align_bams[0], items[0], ref_file, region, out_file)
    return vcfutils.bgzip_and_index(called_vcf, items[0]["config"])
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    items = sample-sv_caller list, from one batch

    Each item is normalized with ``utils.to_single_data`` and the batch is
    passed through ``cwlutils.unpack_tarballs``. The caller name comes from
    the first item's ``config -> algorithm -> svcaller`` and is resolved via
    ``_get_callers(items, stage, special_cases=True)``. When either is
    missing, the items pass through unchanged. Otherwise the caller runs on
    ``items``, with samples from ``all_items`` that are not in the batch
    added as background when ``all_items`` is given, the caller is in
    ``_NEEDS_BACKGROUND`` and the batch is not a paired analysis.

    Returns a list of single-element lists, one per output sample; an empty
    list for a ``"pon_build"`` batch whose svcaller setting contains
    ``"purecn"``. For CWL runs each sample is additionally validated,
    given an ``"svvalidate"`` summary, has its ``"sv"`` entry un-nested
    (``{}`` when absent) and is passed through ``_add_supplemental``.
    """
    items = [utils.to_single_data(item) for item in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)

    # no SV calling when just creating a PON for PureCN
    if dd.get_batch(items[0]) == "pon_build" and "purecn" in dd.get_svcaller(items[0]):
        return []

    if not (svcaller and caller_fn):
        out = [[data] for data in items]
    else:
        with_background = (all_items and svcaller in _NEEDS_BACKGROUND
                           and not vcfutils.is_paired_analysis(
                               [x.get("align_bam") for x in items], items))
        if with_background:
            batch_names = set(dd.get_sample_name(x) for x in items)
            background = [x for x in all_items
                          if dd.get_sample_name(x) not in batch_names]
            out = [[svdata] for svdata in caller_fn(items, background)]
        else:
            out = [[svdata] for svdata in caller_fn(items)]

    if not cwlutils.is_cwl_run(items[0]):
        return out

    # Avoid nesting of callers for CWL runs for easier extraction
    flattened = []
    for data in [utils.to_single_data(x) for x in out]:
        # Run validation directly from CWL runs since we're single stage
        data = validate.evaluate(data)
        data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
        sv_calls = data.get("sv")
        if not sv_calls:
            data["sv"] = {}
        else:
            assert len(sv_calls) == 1, sv_calls
            data["sv"] = sv_calls[0]
        flattened.append([_add_supplemental(data)])
    return flattened
def run_freebayes(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run FreeBayes variant calling, either paired tumor/normal or germline calling.

    All runners receive the same positional arguments. The runner is chosen
    as follows:

    - not a paired analysis: ``vcfutils.check_paired_problems`` is run on
      ``items``, then the standard caller is used;
    - paired analysis whose pair has no normal BAM: the standard caller is
      used with the pair as the ``somatic`` keyword;
    - paired analysis with a normal BAM: the paired caller is used.

    Returns the call file from the selected runner.
    """
    call_args = (align_bams, items, ref_file, assoc_files, region, out_file)

    if is_paired_analysis(align_bams, items):
        pair = get_paired_bams(align_bams, items)
        if pair.normal_bam:
            return _run_freebayes_paired(*call_args)
        return _run_freebayes_caller(*call_args, somatic=pair)

    vcfutils.check_paired_problems(items)
    return _run_freebayes_caller(*call_args)
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    The caller name is read from the first item's
    ``config -> algorithm -> svcaller`` and resolved to a function through
    ``_get_callers(items, stage)``. When either is missing, every item is
    passed through unchanged. Otherwise the caller runs on ``items``;
    samples from ``all_items`` that are not in ``items`` are supplied as a
    second ``background`` argument when ``all_items`` is given, the caller
    is in ``_NEEDS_BACKGROUND`` and the inputs are not a paired analysis.

    Returns a list of single-element lists, one per output sample.
    """
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage).get(svcaller)

    if not (svcaller and caller_fn):
        return [[data] for data in items]

    if (all_items and svcaller in _NEEDS_BACKGROUND
            and not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
        batch_names = set(dd.get_sample_name(x) for x in items)
        background = [x for x in all_items
                      if dd.get_sample_name(x) not in batch_names]
        called = caller_fn(items, background)
    else:
        called = caller_fn(items)
    return [[svdata] for svdata in called]
def run_freebayes(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run FreeBayes variant calling, either paired tumor/normal or germline calling.

    ``items`` is replaced by the result of
    ``shared.add_highdepth_genome_exclusion`` before dispatch. A paired
    analysis with a normal BAM runs the paired caller on just the tumor and
    normal BAMs and data (tumor first). A paired analysis without a normal
    BAM runs the standard caller with the pair passed as ``somatic``.
    Anything else is checked with ``vcfutils.check_paired_problems`` and run
    through the standard caller.

    Returns the call file from the selected runner.
    """
    items = shared.add_highdepth_genome_exclusion(items)
    trailing_args = (ref_file, assoc_files, region, out_file)

    if is_paired_analysis(align_bams, items):
        pair = get_paired_bams(align_bams, items)
        if pair.normal_bam:
            tumor_normal_bams = [pair.tumor_bam, pair.normal_bam]
            tumor_normal_data = [pair.tumor_data, pair.normal_data]
            return _run_freebayes_paired(tumor_normal_bams, tumor_normal_data,
                                         *trailing_args)
        return _run_freebayes_caller(align_bams, items, *trailing_args,
                                     somatic=pair)

    vcfutils.check_paired_problems(items)
    return _run_freebayes_caller(align_bams, items, *trailing_args)
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    The caller name is read from the first item's
    ``config -> algorithm -> svcaller`` and looked up in ``_CALLERS[stage]``.
    When either is missing, every item is passed through unchanged.
    Otherwise the caller runs on ``items``; samples from ``all_items`` that
    are not in ``items`` are supplied as a second ``background`` argument
    when ``all_items`` is given, the caller is in ``_NEEDS_BACKGROUND`` and
    the inputs are not a paired analysis.

    Returns a list of single-element lists, one per output sample.
    """
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _CALLERS[stage].get(svcaller)

    if not (svcaller and caller_fn):
        return [[data] for data in items]

    wants_background = (all_items and svcaller in _NEEDS_BACKGROUND
                        and not vcfutils.is_paired_analysis(
                            [x.get("align_bam") for x in items], items))
    if not wants_background:
        return [[svdata] for svdata in caller_fn(items)]

    batch_names = set(dd.get_sample_name(x) for x in items)
    background = [x for x in all_items
                  if dd.get_sample_name(x) not in batch_names]
    return [[svdata] for svdata in caller_fn(items, background)]
def detect_sv(items, all_items, config, stage):
    """Top level parallel target for examining structural variation.

    The active caller is read from ``config["algorithm"]["svcaller_active"]``
    and looked up in ``_CALLERS[stage]``. When either is missing, every item
    is passed through unchanged. Otherwise the caller runs on ``items``;
    samples from ``all_items`` whose ``rgnames -> sample`` name is not in
    the batch are supplied as a second ``background`` argument when the
    caller is in ``_NEEDS_BACKGROUND`` and the inputs are not a paired
    analysis.

    Returns a list of single-element lists, one per output sample.
    """
    def _sample(data):
        # Sample name as stored under rgnames -> sample.
        return tz.get_in(["rgnames", "sample"], data)

    svcaller = config["algorithm"].get("svcaller_active")
    caller_fn = _CALLERS[stage].get(svcaller)

    if not (svcaller and caller_fn):
        return [[data] for data in items]

    if (svcaller in _NEEDS_BACKGROUND
            and not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
        batch_names = set([_sample(x) for x in items])
        background = [x for x in all_items if _sample(x) not in batch_names]
        called = caller_fn(items, background)
    else:
        called = caller_fn(items)
    return [[svdata] for svdata in called]
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    Each item is normalized with ``utils.to_single_data`` and the batch is
    passed through ``cwlutils.unpack_tarballs``. The caller name comes from
    the first item's ``config -> algorithm -> svcaller`` and is resolved via
    ``_get_callers(items, stage, special_cases=True)``. When either is
    missing, the items pass through unchanged. Otherwise the caller runs on
    ``items``, with samples from ``all_items`` that are not in the batch
    added as background when ``all_items`` is given, the caller is in
    ``_NEEDS_BACKGROUND`` and the batch is not a paired analysis.

    Returns a list of single-element lists, one per output sample. For CWL
    runs each sample is additionally validated, given an ``"svvalidate"``
    summary, has its ``"sv"`` entry un-nested (``{}`` when absent) and is
    passed through ``_add_supplemental``.
    """
    items = [utils.to_single_data(item) for item in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)

    if not (svcaller and caller_fn):
        out = [[data] for data in items]
    elif (all_items and svcaller in _NEEDS_BACKGROUND
          and not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
        batch_names = set(dd.get_sample_name(x) for x in items)
        background = [x for x in all_items
                      if dd.get_sample_name(x) not in batch_names]
        out = [[svdata] for svdata in caller_fn(items, background)]
    else:
        out = [[svdata] for svdata in caller_fn(items)]

    if not cwlutils.is_cwl_run(items[0]):
        return out

    # Avoid nesting of callers for CWL runs for easier extraction
    flattened = []
    for data in [utils.to_single_data(x) for x in out]:
        # Run validation directly from CWL runs since we're single stage
        data = validate.evaluate(data)
        data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
        sv_calls = data.get("sv")
        if sv_calls:
            assert len(sv_calls) == 1, sv_calls
            data["sv"] = sv_calls[0]
        else:
            data["sv"] = {}
        data = _add_supplemental(data)
        flattened.append([data])
    return flattened