def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), \
        "Manta finished without output file %s" % variant_file
    out = []
    for data in items:
        sample_file = _select_sample(data, variant_file, work_dir)
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "manta",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False, nonrefonly=False, work_dir=None):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    if remove_oldeffects:
        out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
    else:
        out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic,
                                       remove_oldeffects=remove_oldeffects,
                                       nonrefonly=nonrefonly,
                                       work_dir=work_dir)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
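# A minimal standalone sketch of the kind of normalization _normalize is
# expected to perform (splitting multiallelic records and left-aligning
# indels against the reference). This calls bcftools directly and is an
# assumption about the approach, not the pipeline's actual implementation.
import subprocess

def normalize_with_bcftools(in_vcf, ref_fasta, out_vcf):
    """Split multiallelics and left-align/trim indels with bcftools norm."""
    subprocess.check_call(["bcftools", "norm", "-m-any", "-f", ref_fasta,
                           "-O", "z", "-o", out_vcf, in_vcf])
    subprocess.check_call(["tabix", "-p", "vcf", out_vcf])
    return out_vcf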
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.cutoff_w_expression(out_file, filters, data,
                                                  name="ReassemblyStats", limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
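# For reference, with two supported callers the invocation built above takes
# roughly this shape, assuming _get_cmd() resolves to MetaSV's run_metasv.py
# entry point (an assumption here; sample names, paths and insert-size values
# are placeholders):
#   run_metasv.py --sample S1 --reference genome.fa --bam S1.bam \
#       --outdir sv/metasv --lumpy_vcf lumpy.vcf --manta_vcf manta.vcf \
#       --workdir sv/metasv/raw --num_threads 16 \
#       --spades spades.py --age age_align \
#       --assembly_max_tools=1 --assembly_pad=500 \
#       --boost_sc --isize_mean 350 --isize_sd 50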
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    data = _get_batch_representative(items, "vrn_file")
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get("vrn_file")
    data = _symlink_to_workdir(data, ["vrn_file"])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get("align_bam") and data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data["vrn_file"] = annotation.finalize_vcf(data["vrn_file"], get_variantcaller(data),
                                                   orig_items)
        logger.info("Filtering for %s" % cur_name)
        data["vrn_file"] = variant_filtration(data["vrn_file"], dd.get_ref_file(data),
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data, orig_items)
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)
    if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file):
        data["vrn_file"] = orig_vrn_file
    return [[data]]
def _add_variantcalls_to_output(out, data):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_call_file, out["cns"]]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(dd.get_ploidy(data)),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    data = _get_batch_representative(items, "vrn_file")
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get("vrn_file")
    data = _symlink_to_workdir(data, ["vrn_file"])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get("align_bam") and data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        logger.info("Filtering for %s" % cur_name)
        orig_items = _get_orig_items(items)
        data["vrn_file"] = variant_filtration(data["vrn_file"], dd.get_ref_file(data),
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data, orig_items)
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)
    if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file):
        data["vrn_file"] = orig_vrn_file
    return [[data]]
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file, filters, data,
                                                name="ReassemblyStats", limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
def to_vcf(in_file, caller, header_fn, vcf_fn, data, sep="\t"):
    """Convert output TitanCNA segs file into bgzipped VCF."""
    out_file = "%s.vcf" % utils.splitext_plus(in_file)[0]
    out_file_gz = out_file + ".gz"
    if not utils.file_exists(out_file + ".gz") and not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(_vcf_header.format(caller=caller))
                    out_handle.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL",
                                                "FILTER", "INFO", "FORMAT",
                                                dd.get_sample_name(data)]) + "\n")
                    header, in_handle = header_fn(in_handle)
                    for line in in_handle:
                        out = vcf_fn(dict(zip(header, line.strip().split(sep))))
                        if out:
                            out_handle.write("\t".join(out) + "\n")
        # also does bgzip and index
        out_file_prep_vcf_gz = vcfutils.sort_by_ref(out_file, data)
        shutil.move(out_file_prep_vcf_gz, out_file_gz)
        shutil.move(out_file_prep_vcf_gz + ".tbi", out_file_gz + ".tbi")
    effects_vcf, _ = effects.add_to_vcf(out_file_gz, data, "snpeff")
    return effects_vcf or out_file_gz
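# A sketch of the header_fn/vcf_fn callback contract to_vcf above relies on:
# header_fn consumes the header row and returns (column_names, handle), and
# vcf_fn maps one parsed row dict to a list of VCF columns, or a falsy value
# to skip the row. The column names ("Chromosome", "Median_logR", ...) and
# the converter below are hypothetical, not the real TitanCNA segs layout.
def _example_header(in_handle):
    header = in_handle.readline().rstrip("\n").split("\t")
    return header, in_handle

def _example_seg_to_vcf(row):
    if not row.get("Chromosome"):
        return None
    info = "SVTYPE=CNV;END=%s;FOLD_CHANGE_LOG=%s" % (row["End"], row["Median_logR"])
    return [row["Chromosome"], row["Start"], ".", "N", "<CNV>", ".", ".", info, "GT", "0/1"]

# out_vcf = to_vcf("sample.segs.txt", "titancna", _example_header, _example_seg_to_vcf, data)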
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.
    """
    if not background:
        background = []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf,
                                            data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": effects_vcf or sample_vcf})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", "minimap2", False, None] for data in items):
        raise ValueError("Require bwa or minimap2 alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]
    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            else:
                effects_vcf = None
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
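# A toy version of the _split_breakends idea used above: route BND (breakend)
# records to one file and other SV types to another, since svtyper genotyping
# of breakends is handled separately. This works on plain-text VCF for
# illustration; the function name and simplified parsing are assumptions, not
# the pipeline's implementation.
def split_breakends_simple(in_vcf, std_out, bnd_out):
    with open(in_vcf) as in_handle, \
         open(std_out, "w") as std_handle, open(bnd_out, "w") as bnd_handle:
        for line in in_handle:
            if line.startswith("#"):
                std_handle.write(line)
                bnd_handle.write(line)
            elif "SVTYPE=BND" in line:
                bnd_handle.write(line)
            else:
                std_handle.write(line)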
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        gt_vcf = vcfutils.combine_variant_files(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            ref_file=dd.get_ref_file(data), config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items)
    lumpy_vcf = sshared.annotate_with_depth(lumpy_vcf, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]
    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               # only upload a single file per batch
                               "do_upload": upload_counts[vcf_file] == 0,
                               "exclude_file": exclude_file})
            upload_counts[vcf_file] += 1
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(items[0])
    # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    sv_types = ["DEL", "DUP"]
    exclude_file = _get_full_exclude_file(items, work_dir)
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                 for (chrom, sv_type)
                                 in itertools.product(sshared.get_sv_chroms(items, exclude_file),
                                                      sv_types)],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext), data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        effects_vcf, _ = effects.add_to_vcf(delly_vcf, data, "snpeff")
        data["sv"].append({"variantcaller": "delly", "vrn_file": effects_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
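# Hedged sketch of the DV/DR evidence comparison the docstring above
# describes: keep calls where high quality variant pairs (DV) are
# sufficiently supported relative to high quality reference pairs (DR).
# The field parsing and the 0.2 cutoff are illustrative assumptions; the
# real logic lives in _delly_count_evidence_filter.
def passes_dv_dr(format_keys, sample_field, min_fraction=0.2):
    vals = dict(zip(format_keys.split(":"), sample_field.split(":")))
    dv = int(vals.get("DV", 0))
    dr = int(vals.get("DR", 0))
    return (dv + dr) > 0 and float(dv) / (dv + dr) >= min_fraction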
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False,
                                         remove_oldeffects=True, nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
        callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
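# A toy illustration of the n-of-m intersection behind
# _run_ensemble_intersection: count how many callers report each normalized
# variant and keep those above a support threshold. Real ensemble calling
# operates on bgzipped, normalized VCFs with proper record parsing; this
# simplified text-based version is only a sketch.
import collections
import gzip

def ensemble_intersect(vcf_files, min_callers=2):
    support = collections.Counter()
    for vcf_file in vcf_files:
        opener = gzip.open if vcf_file.endswith(".gz") else open
        seen = set()
        with opener(vcf_file, "rt") as handle:
            for line in handle:
                if not line.startswith("#") and line.strip():
                    chrom, pos, _, ref, alt = line.split("\t")[:5]
                    seen.add((chrom, pos, ref, alt))
        for key in seen:
            support[key] += 1
    return sorted(k for k, count in support.items() if count >= min_callers)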
def to_single(in_file, data, passonly=False):
    """Convert multi-allelic inputs in the original VCF file into single alleles.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _decompose(in_file, data, passonly=passonly)
            ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
            if ann_ma_file:
                ready_ma_file = ann_ma_file
            out_file = ready_ma_file
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data, require_bam=False))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data, require_bam=False),
                                                orig_items)
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(data[vrn_key], data)
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def finalize_sv(orig_vcf, data, items):
    """Finalize structural variants, adding effects and splitting if needed.
    """
    paired = vcfutils.get_paired(items)
    # For paired/somatic, attach combined calls to tumor sample
    if paired:
        sample_vcf = orig_vcf if paired.tumor_name == dd.get_sample_name(data) else None
    else:
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf,
                                            data["config"])
    if sample_vcf:
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
    else:
        effects_vcf = None
    return effects_vcf or sample_vcf
def to_single(in_file, data):
    """Convert multi-allelic inputs in the original VCF file into single alleles.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        ba_file, ma_file = _split_mulitallelic(in_file, data)
        if vcfutils.vcf_has_variants(ma_file):
            ready_ma_file = _decompose(ma_file, data)
            ann_ma_file = effects.add_to_vcf(ready_ma_file, data)
            if ann_ma_file:
                ready_ma_file = ann_ma_file
            out_file = vcfutils.merge_sorted([ready_ma_file, ba_file], out_file, data)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
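# One way the _decompose step used above could be implemented, shown as an
# assumption (bcbio has used vt for multiallelic decomposition); not the
# verified implementation behind these functions.
import subprocess

def decompose_with_vt(in_vcf, out_vcf):
    """Split multiallelic records into biallelic ones with vt decompose -s."""
    subprocess.check_call(["vt", "decompose", "-s", "-o", out_vcf, in_vcf])
    return out_vcf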
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    # Look up gender before the skip-if-exists block: the export loop below
    # needs it even when the call file is already present.
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            filters = ["--filter", "cn"]
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \
                  filters + \
                  ["--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                if small_vrn_files[0].normal:
                    cmd += ["--normal-id", small_vrn_files[0].normal]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            if gender and gender.lower() != "unknown":
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if eff_file:
            data = dd.set_vrn_file(data, eff_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    variantcaller = dd.get_variantcaller(data)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filter_file)
    return [[data]]
def postprocess_variants(data):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    cur_name = "%s, %s" % (data["name"][-1], get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    if data.get("align_bam") and data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        logger.info("Filtering for %s" % cur_name)
        data["vrn_file"] = variant_filtration(data["vrn_file"], data["sam_ref"],
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data)
    return [[data]]
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired else items[0])
    workflow_file = _prep_config(items, paired, work_dir)
    variant_file = _run_workflow(items, paired, workflow_file, work_dir)
    out = []
    for data in items:
        sample_file = _select_sample(data, variant_file, work_dir)
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "manta",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out
def postprocess_variants(data):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    cur_name = "%s, %s" % (data["name"][-1], get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    if data.get("align_bam") and data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        logger.info("Filtering for %s" % cur_name)
        data["vrn_file"] = variant_filtration(data["vrn_file"], data["sam_ref"],
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data)
    return [[data]]
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if eff_file:
            data = dd.set_vrn_file(data, eff_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    variantcaller = dd.get_variantcaller(data)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filter_file)
    # remove variants close to splice junctions
    vrn_file = dd.get_vrn_file(data)
    vrn_file = variation.filter_junction_variants(vrn_file, data)
    data = dd.set_vrn_file(data, vrn_file)
    return [[data]]
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if eff_file:
            data = dd.set_vrn_file(data, eff_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    jointcaller = dd.get_jointcaller(data)
    if jointcaller and 'gatk-haplotype-joint' in jointcaller:
        filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filter_file)
    # remove variants close to splice junctions
    vrn_file = dd.get_vrn_file(data)
    vrn_file = variation.filter_junction_variants(vrn_file, data)
    data = dd.set_vrn_file(data, vrn_file)
    return [[data]]
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        gt_vcf = vcfutils.concat_variant_files_bcftools(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False,
                                         remove_oldeffects=True) for f in vrn_files]
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
        callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    return [[batch_id, callinfo]]
def _add_variantcalls_to_output(out, data, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            filters = ["--filter", "cn"]
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \
                  filters + \
                  ["--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["-v", small_vrn_files[0]]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            if gender and gender.lower() != "unknown":
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            filters = ["--filter", "cn"]
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \
                  filters + \
                  ["--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                if small_vrn_files[0].normal:
                    cmd += ["--normal-id", small_vrn_files[0].normal]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            gender = _get_batch_gender(items)
            if gender:
                cmd += ["--sample-sex", gender]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    out["vrn_file"] = shared.annotate_with_depth(out["vrn_file"], items)
    return out
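# For reference, the export loop above builds commands of this shape
# (sample name and file paths are placeholders):
#   cnvkit.py export vcf --sample-id NA12878 --ploidy 2 \
#       -o sample-call.vcf sample-call.cns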
def run(items, background=None):
    """Perform detection of structural variations with GRIDSS.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        inputs = [paired.tumor_data]
        background = [paired.normal_data] if paired.normal_bam else []
    else:
        assert not background
        inputs, background = sshared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    variant_file = _run_gridss(inputs, background, work_dir)
    out = []
    for data in items:
        sample_file = variant_file
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "gridss",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])