def split_variants_by_sample(data): """Split a multi-sample call file into inputs for individual samples. For tumor/normal paired analyses, do not split the final file and attach it to the tumor input. """ # not split, do nothing if "group_orig" not in data: return [[data]] # cancer tumor/normal elif vcfutils.get_paired_phenotype(data): out = [] for i, sub_data in enumerate(get_orig_items(data)): if vcfutils.get_paired_phenotype(sub_data) == "tumor": sub_data["vrn_file"] = data["vrn_file"] else: sub_data.pop("vrn_file", None) out.append([sub_data]) return out # joint calling, do not split back up due to potentially large sample sizes elif tz.get_in(("config", "algorithm", "jointcaller"), data): return [[data]] # population or single sample else: out = [] for sub_data in get_orig_items(data): sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-", str(sub_data["name"][-1]) + "-") if len(vcfutils.get_samples(data["vrn_file"])) > 1: vcfutils.select_sample(data["vrn_file"], str(sub_data["name"][-1]), sub_vrn_file, data["config"]) elif not os.path.exists(sub_vrn_file): utils.symlink_plus(data["vrn_file"], sub_vrn_file) sub_data["vrn_file_batch"] = data["vrn_file"] sub_data["vrn_file"] = sub_vrn_file out.append([sub_data]) return out
def split_variants_by_sample(data): """Split a multi-sample call file into inputs for individual samples. For tumor/normal paired analyses, do not split the final file and attach it to the tumor input. """ # not split, do nothing if "group_orig" not in data: return [[data]] # cancer tumor/normal elif vcfutils.get_paired_phenotype(data): out = [] for i, sub_data in enumerate(data["group_orig"]): if vcfutils.get_paired_phenotype(sub_data) == "tumor": sub_data["vrn_file"] = data["vrn_file"] out.append([sub_data]) return out # population or single sample else: out = [] for sub_data in data["group_orig"]: sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-", str(sub_data["name"][-1]) + "-") if len(vcfutils.get_samples(data["vrn_file"])) > 1: vcfutils.select_sample(data["vrn_file"], str(sub_data["name"][-1]), sub_vrn_file, data["config"]) elif not os.path.exists(sub_vrn_file): utils.symlink_plus(data["vrn_file"], sub_vrn_file) sub_data["vrn_file"] = sub_vrn_file out.append([sub_data]) return out
def split_variants_by_sample(data): """Split a multi-sample call file into inputs for individual samples. For tumor/normal paired analyses, do not split the final file and attach it to the tumor input. """ # not split, do nothing if "group_orig" not in data: return [[data]] # cancer tumor/normal elif vcfutils.get_paired_phenotype(data): out = [] for i, sub_data in enumerate(data["group_orig"]): if vcfutils.get_paired_phenotype(sub_data) == "tumor": if "combine" in data: sub_data["combine"] = data["combine"] sub_data["vrn_file"] = data["vrn_file"] out.append([sub_data]) return out # population or single sample else: out = [] for sub_data in data["group_orig"]: sub_vrn_file = data["vrn_file"].replace(data["group"][0] + "-", sub_data["name"][-1] + "-") if len(vcfutils.get_samples(data["vrn_file"])) > 1: vcfutils.select_sample(data["vrn_file"], sub_data["name"][-1], sub_vrn_file, data["config"]) elif not os.path.exists(sub_vrn_file): utils.symlink_plus(data["vrn_file"], sub_vrn_file) if "combine" in data: sub_data["combine"] = data["combine"] sub_data["vrn_file"] = sub_vrn_file out.append([sub_data]) return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Runs lumpy across the batch, then splits the combined output into
    per-sample VCFs, genotypes them with svtyper and applies support filtering.
    """
    # lumpy relies on bwa-mem split/discordant read information
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # anchor work directory on the tumor sample for paired runs
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    for i, data in enumerate(items):
        if "sv" not in data:
            data["sv"] = []
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        # subset the batch calls to this sample before genotyping
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, utils.append_stem(lumpy_vcf, "-%s" % sample), data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        filter_vcf = _filter_by_support(gt_vcf, data)
        data["sv"].append({"variantcaller": "lumpy", "vrn_file": filter_vcf, "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    Background (normal) samples, either from a tumor/normal pairing or a
    case/control configuration, are used to filter the per-sample calls.
    """
    if not background: background = []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        # tumor/normal pairing: the normal becomes the background
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    else:
        # an explicit background argument is only expected for paired runs
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        data["sv"].append({"variantcaller": "wham", "vrn_file": sample_vcf})
        out.append(data)
    return out
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dictionary with paths to true positive, false positive and
    false negative VCFs produced in ``base_dir/rtg``.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # vcfeval needs bgzipped, tabix-indexed inputs
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Fix: split only the file's basename. Using the full path here
            # made os.path.join(base_dir, ...) resolve outside base_dir
            # (an absolute second argument discards base_dir entirely).
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # rtg SDF index lives alongside the genome reference directory
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    Like the base WHAM runner, but additionally annotates the final
    per-sample calls with snpEff effects.
    """
    if not background: background = []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        # tumor/normal pairing: the normal becomes the background
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    else:
        # an explicit background argument is only expected for paired runs
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        # effect prediction may return None; fall back to the unannotated VCF
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
        data["sv"].append({"variantcaller": "wham", "vrn_file": effects_vcf or sample_vcf})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    BEDPE-based variant: converts lumpy's paired-end BED output into VCF and
    produces per-sample filtered BEDPE plus VCF outputs.
    """
    # lumpy relies on bwa-mem split/discordant read information
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "lumpy"))
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    pebed_file, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    # map lumpy internal sample ids back to configured sample names
    sample_config_file = _write_samples_to_ids(pebed_file, items)
    lumpy_vcf = _bedpe_to_vcf(pebed_file, sample_config_file, items)
    for i, data in enumerate(items):
        if "sv" not in data:
            data["sv"] = []
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_bedpe = _filter_by_support(_subset_to_sample(pebed_file, i, data), i, data)
        if lumpy_vcf:
            sample_vcf = utils.append_stem(lumpy_vcf, "-%s" % sample)
            sample_vcf = _filter_by_bedpe(vcfutils.select_sample(lumpy_vcf, sample, sample_vcf, data["config"]),
                                          sample_bedpe, data)
        else:
            # VCF conversion can fail/skip; BEDPE output is still provided
            sample_vcf = None
        data["sv"].append({"variantcaller": "lumpy", "vrn_file": sample_vcf,
                           "exclude_file": exclude_file, "bedpe_file": sample_bedpe,
                           "sample_bed": sample_config_file})
        out.append(data)
    return out
def _organize_variants(samples, batch_id): """Retrieve variant calls for all samples, merging batched samples into single VCF. """ bam_files = set([]) caller_names = [x["variantcaller"] for x in samples[0]["variants"]] calls = collections.defaultdict(list) for data in samples: if "work_bam" in data: bam_files.add(data["work_bam"]) for vrn in data["variants"]: # for somatic ensemble, discard normal samples and filtered # variants from vcfs vrn_file = vrn["vrn_file"] if data.get("metadata", False) and data["metadata"].get( "phenotype", "normal").lower().startswith("tumor"): vrn_file_temp = vrn_file.replace( ".vcf", "_tumorOnly_noFilteredCalls.vcf" ) if ".vcf" in vrn_file else vrn_file_temp + "_tumorOnly_noFilteredCalls.vcf.gz" # Select tumor sample and keep only PASS and . calls vrn_file = vcfutils.select_sample(in_file=vrn_file, sample=data["name"][1], out_file=vrn_file_temp, config=data["config"], filters="PASS,.") calls[vrn["variantcaller"]].append(vrn_file) data = samples[0] vrn_files = [] for caller in caller_names: fnames = calls[caller] if len(fnames) == 1: vrn_files.append(fnames[0]) else: vrn_files.append( population.get_multisample_vcf(fnames, batch_id, caller, data)) return caller_names, vrn_files, list(bam_files)
def test_4_vcf_sample_select(self):
    """Select a sample from a VCF file.
    """
    fname = os.path.join(self.var_dir, "S1-variants.vcf")
    base, ext = os.path.splitext(fname)
    selected = vcfutils.select_sample(fname, "S2", "%s-sampleselect%s.gz" % (base, ext), {})
    self._remove_vcf(selected)
def test_4_vcf_sample_select(self):
    """Select a sample from a VCF file.
    """
    fname = os.path.join(self.var_dir, "S1_S2-combined.vcf.gz")
    stem, ext = utils.splitext_plus(fname)
    selected = vcfutils.select_sample(fname, "S2", "%s-sampleselect%s" % (stem, ext), {})
    self._remove_vcf(selected)
def _organize_variants(samples, batch_id): """Retrieve variant calls for all samples, merging batched samples into single VCF. """ bam_files = set([]) caller_names = [x["variantcaller"] for x in samples[0]["variants"]] calls = collections.defaultdict(list) for data in samples: if "work_bam" in data: bam_files.add(data["work_bam"]) for vrn in data["variants"]: # for somatic ensemble, discard normal samples and filtered # variants from vcfs vrn_file = vrn["vrn_file"] if data.get("metadata", False) and data["metadata"].get("phenotype", "normal").lower().startswith("tumor"): vrn_file_temp = vrn_file.replace(".vcf", "_tumorOnly_noFilteredCalls.vcf") if ".vcf" in vrn_file else vrn_file_temp + "_tumorOnly_noFilteredCalls.vcf.gz" # Select tumor sample and keep only PASS and . calls vrn_file = vcfutils.select_sample(in_file=vrn_file, sample=data["name"][1], out_file=vrn_file_temp, config=data["config"], filters="PASS,.") calls[vrn["variantcaller"]].append(vrn_file) data = samples[0] vrn_files = [] for caller in caller_names: fnames = calls[caller] if len(fnames) == 1: vrn_files.append(fnames[0]) else: vrn_files.append(population.get_multisample_vcf(fnames, batch_id, caller, data)) return caller_names, vrn_files, list(bam_files)
def test_4_vcf_sample_select(self, install_test_files, data_dir):
    """Select a sample from a VCF file.
    """
    from bcbio.variation import vcfutils
    combined = os.path.join(self.var_dir, "S1_S2-combined.vcf.gz")
    stem, ext = utils.splitext_plus(combined)
    selected = vcfutils.select_sample(combined, "S2", "%s-sampleselect%s" % (stem, ext), {})
    self._remove_vcf(selected)
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dictionary of tp/fp/fn VCF outputs; tp-baseline is preferred
    for true positives when vcfeval produces it.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # vcfeval needs bgzipped, tabix-indexed inputs
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # subset multi-sample calls to the current sample in base_dir
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        threads = min(dd.get_num_cores(data), 6)
        # scale RTG memory with thread count (1Gb per thread)
        mem = "%sg" % threads
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % mem
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # newer vcfeval splits true positives into call and baseline representations
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Incorporates CNV caller evidence, genotypes calls with svtyper and
    applies support/background filtering plus optional snpEff annotation.
    """
    # lumpy relies on bwa-mem split/discordant read information
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", "sentieon-bwa", False, None]
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        # pull deletion/duplication evidence from any earlier CNV caller
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            # genotype everything, including breakends
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            # genotyping explicitly disabled
            gt_vcf = sample_vcf
        else:
            # default: genotype standard events only, pass breakends through
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy", "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with high quality
    reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [x["align_bam"] for x in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    sv_types = ["DEL", "DUP", "INV"]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(_run_delly,
                                    [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                     for (chrom, sv_type)
                                     in itertools.product(pysam_work_bam.references, sv_types)],
                                    config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"])
    # Fix: `data` was referenced here before the loop below defined it (a
    # NameError on Python 3; previously it only worked via Python 2
    # comprehension variable leakage). Use the first item explicitly.
    delly_vcf = vfilter.genotype_filter(combo_vcf, 'DV / (DV + DR) > 0.35 && DV > 4',
                                        items[0], "DVSupport")
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(delly_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = "%s-%s%s" % (base, sample, ext)
        data["sv"].append({"variantcaller": "delly",
                           "vrn_file": vcfutils.select_sample(delly_vcf, sample, delly_sample_vcf,
                                                              data["config"])})
        out.append(data)
    return out
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, assign the combined file to the tumor
    sample instead of splitting, and remove variant files from the normal.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []
    # cancer tumor/normal
    if vcfutils.get_paired_phenotype(data):
        # handle trailing normals, which we don't need to process
        if len(data["group_orig"]) == 1 and vcfutils.get_paired_phenotype(data["group_orig"][0][0]) == "normal":
            sub_data, sub_vrn_file = data["group_orig"][0]
            sub_data.pop("vrn_file", None)
            # keep a reference for provenance without marking it for processing
            sub_data["vrn_file-shared"] = sub_vrn_file
            out.append(sub_data)
        else:
            has_tumor = False
            for sub_data, sub_vrn_file in data["group_orig"]:
                paired_phenotype = vcfutils.get_paired_phenotype(sub_data)
                if paired_phenotype == "tumor":
                    has_tumor = True
                    if not os.path.exists(sub_vrn_file):
                        utils.symlink_plus(vrn_file, sub_vrn_file)
                    sub_data["vrn_file"] = sub_vrn_file
                    out.append(sub_data)
                else:
                    # normals carry no active variant file forward
                    sub_data.pop("vrn_file", None)
                    sub_data["vrn_file-shared"] = sub_vrn_file
                    out.append(sub_data)
            if not has_tumor:
                raise ValueError("Did not find tumor sample in paired analysis")
    # population or single sample
    else:
        for sub_data, sub_vrn_file in data["group_orig"]:
            if len(vcfutils.get_samples(vrn_file)) > 1:
                vcfutils.select_sample(vrn_file, sub_data["name"][-1], sub_vrn_file, config)
            elif not os.path.exists(sub_vrn_file):
                # single-sample batch: just link the batch file into place
                utils.symlink_plus(vrn_file, sub_vrn_file)
            if sub_vrn_file:
                sub_data["vrn_file"] = sub_vrn_file
            out.append(sub_data)
    return out
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dictionary with paths to true positive, false positive and
    false negative VCFs produced in ``base_dir/rtg``.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # vcfeval needs bgzipped, tabix-indexed inputs
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Fix: split only the file's basename. Using the full path here
            # made os.path.join(base_dir, ...) resolve outside base_dir
            # (an absolute second argument discards base_dir entirely).
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                              data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # rtg SDF index lives alongside the genome reference directory
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # Fix: the two exports were previously joined without "&&"
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with high quality
    reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    # subsample large BAMs up front to keep delly runtimes manageable
    work_bams = run_multicore(_prep_subsampled_bams, [(data, work_dir) for data in items],
                              config, parallel)
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    sv_types = ["DEL", "DUP"]  # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    exclude_file = _get_full_exclude_file(items, work_dir)
    # parallelize delly by chromosome and structural variant type
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                 for (chrom, sv_type)
                                 in itertools.product(sshared.get_sv_chroms(items, exclude_file),
                                                      sv_types)],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, "%s-%s%s" % (base, sample, ext),
                                                  data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Splits breakends from standard events before svtyper genotyping, then
    recombines, filters and annotates the per-sample output.
    """
    # lumpy relies on bwa-mem split/discordant read information
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None]
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        # genotype standard events only; pass breakends through untouched
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        gt_vcf = vcfutils.combine_variant_files(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            ref_file=dd.get_ref_file(data), config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        # effect prediction may return None; fall back to the unannotated VCF
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy", "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Incorporates CNV caller evidence, genotypes calls with svtyper and
    applies support/background filtering plus optional snpEff annotation.
    """
    # lumpy relies on bwa-mem split/discordant read information
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", "sentieon-bwa", False, None]
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        # pull deletion/duplication evidence from any earlier CNV caller
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            # genotype everything, including breakends
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        else:
            # default: genotype standard events only, pass breakends through
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy", "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dictionary of tp/fp/fn VCF outputs; tp-baseline is preferred
    for true positives when vcfeval produces it.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # vcfeval needs bgzipped, tabix-indexed inputs
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Fix: split only the file's basename. Using the full path here
            # made os.path.join(base_dir, ...) resolve outside base_dir
            # (an absolute second argument discards base_dir entirely).
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                              data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # rtg SDF index lives alongside the genome reference directory
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # newer vcfeval splits true positives into call and baseline representations
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def _handle_somatic_ensemble(vrn_file, data):
    """Discard normal samples and filtered variants from somatic ensemble VCFs.

    Only needed for bcbio.variation based ensemble calling.
    """
    phenotype = tz.get_in(["metadata", "phenotype"], data, "").lower()
    if not phenotype.startswith("tumor"):
        return vrn_file
    tumor_only_file = vrn_file.replace(".vcf", "_tumorOnly_noFilteredCalls.vcf")
    # Restrict to the tumor sample, keeping only PASS and "." calls
    return vcfutils.select_sample(in_file=vrn_file, sample=data["name"][1],
                                  out_file=tumor_only_file, config=data["config"],
                                  filters="PASS,.")
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.
    """
    def _needs_bgzip(fname):
        # downstream tools require bgzipped VCFs with tabix indexes
        return not fname.endswith(".vcf.gz") or not os.path.exists(fname + ".tbi")

    if _needs_bgzip(rm_file):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        # multi-sample input: subset to the current sample inside base_dir
        stem, ext = utils.splitext_plus(os.path.basename(vrn_file))
        per_sample = os.path.join(base_dir, "%s-%s%s" % (stem, dd.get_sample_name(data), ext))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), per_sample,
                                          data["config"])
    if _needs_bgzip(vrn_file):
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dict of classified VCF outputs: "fp", "fn" and "tp" (plus
    "tp-calls" when rtg emits a separate baseline representation of the
    true positives).
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # rtg vcfeval refuses to write into an existing output directory
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # truth set must be bgzipped and tabix indexed
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # subset multi-sample calls down to the current sample
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                              data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                                           "direction": "increase"}}})
        # split adjusted JVM options into stack (-Xms) and heap (-Xmx) pieces,
        # falling back to defaults when the resource configuration supplies neither
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        # RTG_MEM expects a bare size without the -Xmx prefix
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # newer rtg versions emit tp-baseline.vcf.gz (truth-set representation);
    # prefer it as "tp" and keep the call representation under "tp-calls"
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def finalize_sv(orig_vcf, data, items):
    """Finalize structural variants, adding effects and splitting if needed.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        # For paired/somatic, attach combined calls to tumor sample only
        is_tumor = paired.tumor_name == dd.get_sample_name(data)
        sample_vcf = orig_vcf if is_tumor else None
    else:
        stem = utils.splitext_plus(orig_vcf)[0]
        sample_vcf = "%s-%s.vcf.gz" % (stem, dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data),
                                            sample_vcf, data["config"])
    effects_vcf = None
    if sample_vcf:
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
    return effects_vcf or sample_vcf
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.
    """
    if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    sample = dd.get_sample_name(data)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        # subset multi-sample calls down to the current sample
        stem = utils.splitext_plus(os.path.basename(vrn_file))[0]
        sample_file = os.path.join(base_dir, "%s-%s.vcf.gz" % (stem, sample))
        vrn_file = vcfutils.select_sample(vrn_file, sample, sample_file, data["config"])
    else:
        # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
def _compatible_small_variants(data): """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit. """ supported = set(["vardict", "freebayes", "gatk-haplotype", "mutect2", "vardict"]) out = [] for v in data.get("variants", []): vrn_file = v.get("vrn_file") if vrn_file and v.get("variantcaller") in supported: base, ext = utils.splitext_plus(os.path.basename(vrn_file)) sample_vrn_file = os.path.join(dd.get_work_dir(data), v["variantcaller"], "%s-%s%s" % (base, dd.get_sample_name(data), ext)) sample_vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_vrn_file, data["config"]) out.append(sample_vrn_file) return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    # subsample input BAMs before calling (one prep job per sample)
    work_bams = run_multicore(_prep_subsampled_bams,
                              [(data, work_dir) for data in items],
                              config, parallel)
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    sv_types = ["DEL", "DUP"]  # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    exclude_file = _get_full_exclude_file(items, work_dir)
    # one delly job per (chromosome, SV type) combination
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                 for (chrom, sv_type)
                                 in itertools.product(sshared.get_sv_chroms(items, exclude_file),
                                                      sv_types)],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        # split combined calls back into per-sample VCFs, then filter on evidence
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext),
                                                  data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
def _compatible_small_variants(data): """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit. """ supported = set(["vardict", "freebayes", "gatk-haplotype", "mutect2", "vardict"]) out = [] for v in data.get("variants", []): vrn_file = v.get("vrn_file") if vrn_file and v.get("variantcaller") in supported: base, ext = utils.splitext_plus(os.path.basename(vrn_file)) if vcfutils.get_paired_phenotype(data): out.append(vrn_file) else: sample_vrn_file = os.path.join(dd.get_work_dir(data), v["variantcaller"], "%s-%s%s" % (base, dd.get_sample_name(data), ext)) sample_vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_vrn_file, data["config"]) out.append(sample_vrn_file) return out
def finalize_sv(orig_vcf, data, items):
    """Finalize structural variants, adding effects and splitting if needed.
    """
    sample = dd.get_sample_name(data)
    paired = vcfutils.get_paired(items)
    if paired:
        # For paired/somatic, attach combined calls to tumor sample
        sample_vcf = orig_vcf if paired.tumor_name == sample else None
    else:
        per_sample = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], sample)
        sample_vcf = vcfutils.select_sample(orig_vcf, sample, per_sample, data["config"])
    if not sample_vcf:
        return sample_vcf
    effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
    return effects_vcf or sample_vcf
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    sv_types = ["DEL", "DUP", "INV"]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    # one delly job per (reference sequence, SV type) combination
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(_run_delly,
                                    [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                     for (chrom, sv_type)
                                     in itertools.product(pysam_work_bam.references, sv_types)],
                                    config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        # split combined calls back into per-sample VCFs, then hard filter
        # on split/discordant read support
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext),
                                                  data["config"])
        delly_vcf = vfilter.hard_w_expression(delly_sample_vcf,
                                              "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2",
                                              data, name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf})
        out.append(data)
    return out
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dict of classified VCF outputs: "fp", "fn" and "tp" (plus
    "tp-calls" when rtg emits a separate baseline representation of the
    true positives).
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # rtg vcfeval refuses to write into an existing output directory
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # truth set must be bgzipped and tabix indexed
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # subset multi-sample calls down to the current sample
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                              data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # scale memory with thread count (1g per thread)
        threads = min(dd.get_num_cores(data), 6)
        mem = "%sg" % threads
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % mem
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # newer rtg versions emit tp-baseline.vcf.gz (truth-set representation);
    # prefer it as "tp" and keep the call representation under "tp-calls"
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Calls jointly across all samples, genotypes with svtyper, then splits
    back into per-sample filtered VCFs with snpEff effects attached.
    """
    # lumpy relies on bwa-mem soft clipping conventions for split reads
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    # collect full, split-read and discordant-read BAMs for every sample
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        # genotype standard events with svtyper; breakends are passed through
        # ungenotyped and re-combined afterwards
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        gt_vcf = vcfutils.concat_variant_files_bcftools(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    # for tumor/normal runs, remove events also present in the normal
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dict of classified VCF outputs: "tp", "fp" and "fn".
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # rtg vcfeval refuses to write into an existing output directory
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # truth set must be bgzipped and tabix indexed
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name (not the full path) so the single-sample
            # subset lands inside base_dir; joining base_dir with a full path
            # previously resolved outside the validation directory.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                              data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # rtg SDF reference lives alongside the fasta in a sibling "rtg" directory
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # '&&' between the two exports was missing, producing a malformed
        # shell command ("export A='x' export B=y")
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dict of classified VCF outputs: "fp", "fn" and "tp" (plus
    "tp-calls" when rtg emits a separate baseline representation of the
    true positives).
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # rtg vcfeval refuses to write into an existing output directory
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # truth set must be bgzipped and tabix indexed
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name (not the full path) so the single-sample
            # subset lands inside base_dir; joining base_dir with a full path
            # previously resolved outside the validation directory.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                              data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # rtg SDF reference lives alongside the fasta in a sibling "rtg" directory
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # newer rtg versions emit tp-baseline.vcf.gz (truth-set representation);
    # prefer it as "tp" and keep the call representation under "tp-calls"
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out