def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            with file_transaction(gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                else:
                    gemini_ver = None
                # Recent versions of gemini allow loading only passing variants
                load_opts = ""
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts += " --passonly"
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        load_opts += " --skip-gene-tables"
                    if "/test_automated_output/" in gemini_vcf:
                        load_opts += " --test-mode"
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(**locals())
                do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf if is_batch else None}]]
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data,
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data), data.get("genome_build"),
                                         base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                   data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, is_batch = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        passonly = all("gemini_allvariants" not in dd.get_tools_on(d) for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=passonly)
    ann_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if vcfutils.vcf_has_variants(gemini_vcf) and caller not in NO_DB_CALLERS:
        if not utils.file_exists(gemini_db) and use_gemini:
            ped_file = create_ped_file(samples + extras, gemini_vcf)
            # Use original approach for hg19/GRCh37 pending additional testing
            if support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples):
                gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db, ped_file)
            elif ann_vcf:
                gemini_db = create_gemini_db(ann_vcf, data, gemini_db, ped_file)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": ann_vcf or gemini_vcf,
                              "decomposed": use_gemini}]]
def prep_gemini_db(fnames, call_id, samples, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    is_population = len(fnames) > 1
    if is_population:
        name, caller = call_id
        gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    else:
        gemini_vcf = fnames[0]
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            with file_transaction(gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                else:
                    gemini_ver = None
                # Recent versions of gemini allow loading only passing variants
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts = "--passonly"
                else:
                    load_opts = ""
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(**locals())
                do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    return [[call_id, {"db": gemini_db if utils.file_exists(gemini_db) else None,
                       "vcf": gemini_vcf if is_population else None}]]
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            with file_transaction(data, gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                else:
                    gemini_ver = None
                # Recent versions of gemini allow loading only passing variants
                load_opts = ""
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts += " --passonly"
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        load_opts += " --skip-gene-tables"
                    if "/test_automated_output/" in gemini_vcf:
                        load_opts += " --test-mode"
                # Skip CADD or gerp-bp if neither are loaded
                if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                    gemini_dir = install.get_gemini_dir()
                    for skip_cmd, check_file in [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]:
                        if not os.path.exists(os.path.join(gemini_dir, check_file)):
                            load_opts += " %s" % skip_cmd
                    # skip gerp-bp which slows down loading
                    load_opts += " --skip-gerp-bp "
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                eanns = ("snpEff" if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff"
                         else "VEP")
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(**locals())
                do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf if is_batch else None}]]
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            with file_transaction(data, gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                else:
                    gemini_ver = None
                # Recent versions of gemini allow loading only passing variants
                load_opts = ""
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts += " --passonly"
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        load_opts += " --skip-gene-tables"
                    if "/test_automated_output/" in gemini_vcf:
                        load_opts += " --test-mode"
                # Skip CADD or gerp-bp if neither are loaded
                if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                    gemini_dir = install.get_gemini_dir()
                    for skip_cmd, check_file in [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),
                                                 ("--skip-gerp-bp", "hg19.gerp.bw")]:
                        if not os.path.exists(os.path.join(gemini_dir, check_file)):
                            load_opts += " %s" % skip_cmd
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                eanns = ("snpEff" if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff"
                         else "VEP")
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(**locals())
                do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf if is_batch else None}]]
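# Illustrative sketch (not part of bcbio): how the version-gated gemini load options
# in the prep_gemini_db variants above get assembled. The function name and the boolean
# inputs (is_small_vcf, is_test_output) are hypothetical stand-ins for _is_small_vcf()
# and the "/test_automated_output/" path check.
from distutils.version import LooseVersion


def gemini_load_opts(gemini_ver, is_small_vcf, is_test_output):
    """Assemble gemini load options following the version checks used above."""
    load_opts = ""
    if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
        load_opts += " --passonly"
    if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
        if is_small_vcf:
            load_opts += " --skip-gene-tables"
        if is_test_output:
            load_opts += " --test-mode"
    return load_opts


if __name__ == "__main__":
    # A recent gemini on a small non-test VCF: " --passonly --skip-gene-tables"
    print(gemini_load_opts("0.10.0", True, False))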
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Filter and annotate somatic VCFs with damage/bias artifacts on low frequency variants.

    Moves damage estimation to INFO field, instead of leaving in FILTER.
    """
    if not should_filter(items) or not vcfutils.vcf_has_variants(vrn_file):
        return data
    else:
        raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
        out_plot_files = ["%s%s" % (utils.splitext_plus(raw_file)[0], ext)
                          for ext in ["_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf"]]
        if not utils.file_uptodate(raw_file, vrn_file) and not utils.file_uptodate(raw_file + ".gz", vrn_file):
            with file_transaction(items[0], raw_file) as tx_out_file:
                # Does not apply --qcSummary plotting due to slow runtimes
                dkfzbiasfilter = utils.which(config_utils.get_program("dkfzbiasfilter.py", data))
                cmd = [dkfzbiasfilter, "--filterCycles", "1", "--passOnly",
                       "--tempFolder", os.path.dirname(tx_out_file),
                       vrn_file, align_bam, ref_file, tx_out_file]
                do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
                for out_plot in out_plot_files:
                    tx_plot_file = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0],
                                                "plots", os.path.basename(out_plot))
                    if utils.file_exists(tx_plot_file):
                        shutil.move(tx_plot_file, out_plot)
        raw_file = vcfutils.bgzip_and_index(raw_file, items[0]["config"])
        data["vrn_file"] = _filter_to_info(raw_file, items[0])
        out_plot_files = [x for x in out_plot_files if utils.file_exists(x)]
        data["damage_plots"] = out_plot_files
        return data
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            gemini_db = create_gemini_db(gemini_vcf, data, gemini_db)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf if is_batch else None}]]
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
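# Illustrative sketch (not part of bcbio): rebuilding the header file that _run_svtyper
# above passes to `bcftools annotate -h`, from hypothetical FILTER lines and contig
# name/length pairs. The function name and inputs here are assumptions for illustration.
def write_svtyper_header(filter_lines, contigs, header_file):
    """Write FILTER header lines plus ##contig lines, mirroring the header file built above."""
    with open(header_file, "w") as out_handle:
        for line in filter_lines:
            out_handle.write(line)
        for name, size in contigs:
            out_handle.write("##contig=<ID=%s,length=%s>\n" % (name, size))


if __name__ == "__main__":
    write_svtyper_header(['##FILTER=<ID=LowQual,Description="Low quality">\n'],
                         [("chr1", 248956422)], "svtyper-header.txt")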
def snpeff_effects(vcf_in, data):
    """Annotate input VCF file with effects calculated by snpEff.
    """
    if vcfutils.vcf_has_variants(vcf_in):
        return _run_snpeff(vcf_in, "vcf", data)
    else:
        return None, None
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": out_vcf_file,
                    "bed_file": None}
    return [[batch_id, callinfo]]
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False, nonrefonly=False, work_dir=None):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    if remove_oldeffects:
        out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
    else:
        out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic,
                                       remove_oldeffects=remove_oldeffects,
                                       nonrefonly=nonrefonly,
                                       work_dir=work_dir)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def snpeff_effects(data):
    """Annotate input VCF file with effects calculated by snpEff.
    """
    vcf_in = data["vrn_file"]
    if vcfutils.vcf_has_variants(vcf_in):
        vcf_file = _run_snpeff(vcf_in, "vcf", data)
        return vcf_file
def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="", extra_cmd="",
                        limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _has_variant_calls(data):
    if data.get("align_bam"):
        for vrn in data["variants"]:
            if vrn.get("vrn_file") and vcfutils.vcf_has_variants(vrn["vrn_file"]):
                return True
    return False
def _freebayes_cutoff(in_file, data):
    """Perform filtering of FreeBayes results, flagging low confidence calls.

    Filters using cutoffs on low depth based on Meynert et al's work modeling sensitivity
    of homozygote and heterozygote calling on depth:

    http://www.ncbi.nlm.nih.gov/pubmed/23773188

    and high depth heterozygote SNP filtering based on Heng Li's work evaluating variant
    calling artifacts:

    http://arxiv.org/abs/1404.0929

    Tuned based on NA12878 call comparisons to Genome in a Bottle reference genome.
    """
    if not vcfutils.vcf_has_variants(in_file):
        base, ext = utils.splitext_plus(in_file)
        out_file = "{base}-filter{ext}".format(**locals())
        if not utils.file_exists(out_file):
            shutil.copy(in_file, out_file)
        if out_file.endswith(".vcf.gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file

    depth_thresh, qual_thresh = None, None
    if _do_high_depth_filter(data):
        stats = _calc_vcf_stats(in_file)
        if stats["avg_depth"] > 0:
            depth_thresh = int(math.ceil(stats["avg_depth"] + 3 * math.pow(stats["avg_depth"], 0.5)))
            qual_thresh = depth_thresh * 2.0  # Multiplier from default GATK QD cutoff filter
    filters = ('(AF[0] <= 0.5 && (max(FORMAT/DP) < 4 || (max(FORMAT/DP) < 13 && %QUAL < 10))) || '
               '(AF[0] > 0.5 && (max(FORMAT/DP) < 4 && %QUAL < 50))')
    if depth_thresh:
        filters += ' || (%QUAL < {qual_thresh} && max(FORMAT/DP) > {depth_thresh} && AF[0] <= 0.5)'.format(**locals())
    return cutoff_w_expression(in_file, filters, data, name="FBQualDepth")
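# Illustrative sketch (not part of bcbio): the high-depth thresholds described in the
# _freebayes_cutoff docstring above, computed for a hypothetical average depth.
# depth_thresh = ceil(avg_depth + 3 * sqrt(avg_depth)); qual_thresh = 2 * depth_thresh.
import math


def freebayes_depth_qual_thresholds(avg_depth):
    """Return (depth_thresh, qual_thresh) following the formula used above."""
    depth_thresh = int(math.ceil(avg_depth + 3 * math.pow(avg_depth, 0.5)))
    qual_thresh = depth_thresh * 2.0  # multiplier from the default GATK QD cutoff
    return depth_thresh, qual_thresh


if __name__ == "__main__":
    # For an average depth of 30x this gives (47, 94.0)
    print(freebayes_depth_qual_thresholds(30))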
def subset_by_supported(input_file, get_coords, calls_by_name, work_dir, data,
                        headers=("#",)):
    """Limit CNVkit input to calls with support from another caller.

    get_coords is a function that returns chrom, start, end from a line of
    the input_file, allowing handling of multiple input file types.
    """
    support_files = [(c, tz.get_in([c, "vrn_file"], calls_by_name))
                     for c in ensemble.SUBSET_BY_SUPPORT["cnvkit"]]
    support_files = [(c, f) for (c, f) in support_files if f and vcfutils.vcf_has_variants(f)]
    if len(support_files) == 0:
        return input_file
    else:
        out_file = os.path.join(work_dir, "%s-havesupport%s" %
                                utils.splitext_plus(os.path.basename(input_file)))
        if not utils.file_uptodate(out_file, input_file):
            input_bed = _input_to_bed(input_file, work_dir, get_coords, headers)
            pass_coords = set([])
            with file_transaction(data, out_file) as tx_out_file:
                support_beds = " ".join([_sv_vcf_to_bed(f, c, out_file) for c, f in support_files])
                tmp_cmp_bed = "%s-intersectwith.bed" % utils.splitext_plus(tx_out_file)[0]
                cmd = "bedtools intersect -wa -f 0.5 -r -a {input_bed} -b {support_beds} > {tmp_cmp_bed}"
                do.run(cmd.format(**locals()), "Intersect CNVs with support files")
                for r in pybedtools.BedTool(tmp_cmp_bed):
                    pass_coords.add((str(r.chrom), str(r.start), str(r.stop)))
                with open(input_file) as in_handle:
                    with open(tx_out_file, "w") as out_handle:
                        for line in in_handle:
                            passes = True
                            if not line.startswith(headers):
                                passes = get_coords(line) in pass_coords
                            if passes:
                                out_handle.write(line)
        return out_file
def _pick_best_quality_score(vrn_file):
    """Flexible quality score selection, picking the best available.

    Implementation based on discussion:

    https://github.com/chapmanb/bcbio-nextgen/commit/a538cecd86c0000d17d3f9d4f8ac9d2da04f9884#commitcomment-14539249

    (RTG=AVR/GATK=VQSLOD/MuTect=t_lod_fstar, otherwise GQ, otherwise QUAL, otherwise DP.)

    For MuTect, it's not clear how to get t_lod_fstar, the right quality score, into VCF cleanly.
    """
    # pysam fails on checking reference contigs if input is empty
    if not vcfutils.vcf_has_variants(vrn_file):
        return "DP"
    to_check = 25
    scores = collections.defaultdict(int)
    try:
        in_handle = VariantFile(vrn_file)
    except ValueError:
        raise ValueError("Failed to parse input file in preparation for validation: %s" % vrn_file)
    with contextlib.closing(in_handle) as val_in:
        for i, rec in enumerate(val_in):
            if i > to_check:
                break
            if rec.info.get("VQSLOD") is not None:
                scores["INFO=VQSLOD"] += 1
            for skey in ["AVR", "GQ", "DP"]:
                if rec.samples[0].get(skey) is not None:
                    scores[skey] += 1
            if rec.qual:
                scores["QUAL"] += 1
    for key in ["AVR", "INFO=VQSLOD", "GQ", "QUAL", "DP"]:
        if scores[key] > 0:
            return key
    raise ValueError("Did not find quality score for validation from %s" % vrn_file)
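# Illustrative sketch (not part of bcbio): the priority order used by _pick_best_quality_score
# above, applied to hypothetical per-field counts instead of a parsed VCF. The function name
# and the mock counts are assumptions for illustration only.
import collections


def pick_best_quality_score(scores):
    """Pick the first quality field with support, in the same priority order as above."""
    for key in ["AVR", "INFO=VQSLOD", "GQ", "QUAL", "DP"]:
        if scores[key] > 0:
            return key
    raise ValueError("Did not find quality score")


if __name__ == "__main__":
    # A non-GATK, non-RTG callset with per-sample GQ present picks "GQ"
    scores = collections.defaultdict(int, {"GQ": 20, "QUAL": 25, "DP": 25})
    print(pick_best_quality_score(scores))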
def hard_w_expression(vcf_file, expression, data, name="+", filterext=""):
    """Perform hard filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                output_type = "z" if out_file.endswith(".gz") else "v"
                variant_regions = utils.get_in(data, ("config", "algorithm", "variant_regions"))
                intervals = ("-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                             if variant_regions else "")
                cmd = ("{bcftools} filter -O {output_type} {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Hard filtering %s with %s" % (vcf_file, expression), data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _get_build_type(fnames, samples, caller):
    """Confirm we should build a gemini database: need gemini in tools_on.

    Checks for valid conditions for running a database and gemini or gemini_orig
    configured in tools on.
    """
    build_type = set()
    if any(vcfutils.vcf_has_variants(f) for f in fnames) and caller not in NO_DB_CALLERS:
        for data in samples:
            if any([x in dd.get_tools_on(data)
                    for x in ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]]):
                if vcfanno.annotate_gemini(data):
                    build_type.add("gemini_orig" if "gemini_orig" in dd.get_tools_on(data) else "gemini")
                else:
                    logger.info("Not running gemini, input data not found: %s" % dd.get_sample_name(data))
            else:
                logger.info("Not running gemini, not configured in tools_on: %s" % dd.get_sample_name(data))
    else:
        logger.info("Not running gemini, no samples with variants found: %s" %
                    (", ".join([dd.get_sample_name(d) for d in samples])))
    return build_type
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    toval_data = _get_validate(data)
    if toval_data:
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data), data["genome_build"],
                                         base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                   data["genome_build"], base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    return [[batch_id, callinfo]]
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Filter and annotate somatic VCFs with damage/bias artifacts on low frequency variants.

    Moves damage estimation to INFO field, instead of leaving in FILTER.
    """
    if not should_filter(items) or not vcfutils.vcf_has_variants(vrn_file):
        return data
    else:
        raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
        out_plot_files = ["%s%s" % (utils.splitext_plus(raw_file)[0], ext)
                          for ext in ["_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf"]]
        if not utils.file_uptodate(raw_file, vrn_file) and not utils.file_uptodate(raw_file + ".gz", vrn_file):
            with file_transaction(items[0], raw_file) as tx_out_file:
                # Does not apply --qcSummary plotting due to slow runtimes
                cmd = ["dkfzbiasfilter.py", "--filterCycles", "1", "--passOnly",
                       "--tempFolder", os.path.dirname(tx_out_file),
                       vrn_file, align_bam, ref_file, tx_out_file]
                do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
                for out_plot in out_plot_files:
                    tx_plot_file = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0],
                                                "plots", os.path.basename(out_plot))
                    if utils.file_exists(tx_plot_file):
                        shutil.move(tx_plot_file, out_plot)
        raw_file = vcfutils.bgzip_and_index(raw_file, items[0]["config"])
        data["vrn_file"] = _filter_to_info(raw_file, items[0])
        out_plot_files = [x for x in out_plot_files if utils.file_exists(x)]
        data["damage_plots"] = out_plot_files
        return data
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if not utils.file_exists(gemini_db):
        data_basepath = install.get_gemini_dir(data) if support_gemini_orig(data) else None
        conf_files = dd.get_vcfanno(data)
        if not conf_files:
            conf_files = ["gemini"]
        ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
        with file_transaction(data, gemini_db) as tx_gemini_db:
            vcf2db = config_utils.get_program("vcf2db.py", data)
            if "vcf2db_expand" in dd.get_tools_on(data):
                vcf2db_args = ["--expand", "gt_types", "--expand", "gt_ref_depths",
                               "--expand", "gt_alt_depths"]
            else:
                vcf2db_args = []
            cmd = [vcf2db, ann_file, ped_file, tx_gemini_db] + vcf2db_args
            do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--gene_phenotype", "--ccds", "--uniprot", "--domains",
                       "--regulatory", "--protein", "--tsl", "--appris", "--af", "--max_af",
                       "--af_1kg", "--af_esp", "--af_gnomad", "--pubmed", "--variant_class",
                       "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)) and cwlutils.is_cwl_run(utils.to_single_data(data[0])):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data, prefix="validateregions-",
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(toval_data),
                                                   data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        # RTG can fail on totally empty files. Call everything in truth set as false negatives
        if not vcfutils.vcf_has_variants(vrn_file):
            eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir, toval_data, "fn")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data, "fp")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod in ["rtg", "rtg-squash-ploidy"]:
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data, vmethod)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--gene_phenotype", "--ccds", "--uniprot", "--domains",
                       "--regulatory", "--protein", "--tsl", "--appris", "--af", "--max_af",
                       "--af_1kg", "--af_esp", "--af_gnomad", "--pubmed", "--variant_class",
                       "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    multisample_vcf = get_multisample_vcf(fnames, name, caller, data)
    gemini_vcf = multiallelic.to_single(multisample_vcf, data)
    use_gemini_quick = (do_db_build(samples) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            ped_file = create_ped_file(samples + extras, gemini_vcf)
            gemini_db = create_gemini_db(gemini_vcf, data, gemini_db, ped_file)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": multisample_vcf if is_batch else None}]]
def _has_variant_calls(data):
    for vrn in data["variants"]:
        if (vrn.get("vrn_file") and vcfutils.vcf_has_variants(vrn["vrn_file"])
              and (_has_precalled(data) or data.get("align_bam"))):
            return True
    return False
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            with file_transaction(gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                else:
                    gemini_ver = None
                # Recent versions of gemini allow loading only passing variants
                load_opts = ""
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts += " --passonly"
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        load_opts += " --skip-gene-tables"
                    if "/test_automated_output/" in gemini_vcf:
                        load_opts += " --test-mode"
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(**locals())
                do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf if is_batch else None}]]
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False,
                                         remove_oldeffects=True, nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE",
                              "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--sift", "b", "--polyphen", "b", "--symbol",
                       "--numbers", "--biotype", "--total_length", "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + \
                      dbnsfp_args + loftee_args
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one
                    # variant per gene
                    # From the VEP docs:
                    # "Pick one line of consequence data per variant,
                    # including transcript-specific columns. Consequences are
                    # chosen by the canonical, biotype status and length of the
                    # transcript, along with the ranking of the consequence
                    # type according to this table. This is the best method to
                    # use if you are interested only in one consequence per
                    # variant.
                    cmd += ["--pick"]
                # TODO investigate hgvs reporting but requires indexing the reference file
                # cmd += ["--hgvs", "--shift-hgvs", "--fasta", dd.get_ref_file(data)]
                perllib = "export PERL5LIB=%s:$PERL5LIB" % _get_perllib()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perllib, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE",
                              "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--sift", "b", "--polyphen", "b", "--symbol",
                       "--numbers", "--biotype", "--total_length", "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + \
                      dbnsfp_args + loftee_args
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one
                    # variant per gene
                    # From the VEP docs:
                    # "Pick one line of consequence data per variant,
                    # including transcript-specific columns. Consequences are
                    # chosen by the canonical, biotype status and length of the
                    # transcript, along with the ranking of the consequence
                    # type according to this table. This is the best method to
                    # use if you are interested only in one consequence per
                    # variant.
                    cmd += ["--pick"]
                # TODO investigate hgvs reporting but requires indexing the reference file
                # cmd += ["--hgvs", "--shift-hgvs", "--fasta", dd.get_ref_file(data)]
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (" ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                if is_human:
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                else:
                    dbnsfp_args, dbnsfp_fields = [], []
                    loftee_args, loftee_fields = [], []
                    prediction_args, prediction_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one variant per gene
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # Also use hgvs reporting but requires indexing the reference file
                    clinical_args = ["--pick", "--hgvs", "--shift_hgvs", "1",
                                     "--fasta", dd.get_ref_file(data)]
                    clinical_fields = ["HGVSc", "HGVSp"]
                else:
                    clinical_args, clinical_fields = [], []
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON"] + prediction_fields + ["Protein_position", "BIOTYPE",
                                                             "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields + clinical_fields)] + \
                      prediction_args + dbnsfp_args + loftee_args + clinical_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not utils.file_exists(gemini_db):
        if not vcfutils.vcf_has_variants(gemini_vcf):
            return None
        with file_transaction(data, gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            if "program_versions" in data["config"].get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=data["config"])
            else:
                gemini_ver = None
            # Recent versions of gemini allow loading only passing variants
            load_opts = ""
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                load_opts += " --passonly"
            # For small test files, skip gene table loading which takes a long time
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    load_opts += " --skip-gene-tables"
                if "/test_automated_output/" in gemini_vcf:
                    load_opts += " --test-mode"
            # Skip CADD or gerp-bp if neither are loaded
            if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir(data)
                for skip_cmd, check_file in [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]:
                    if not os.path.exists(os.path.join(gemini_dir, check_file)):
                        load_opts += " %s" % skip_cmd
                # skip gerp-bp which slows down loading
                load_opts += " --skip-gerp-bp "
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            tmpdir = os.path.dirname(tx_gemini_db)
            eanns = _get_effects_flag(data)
            # Apply custom resource specifications, allowing use of alternative annotation_dir
            resources = config_utils.get_resources("gemini", data["config"])
            gemini_opts = " ".join([str(x) for x in resources["options"]]) if resources.get("options") else ""
            cmd = ("{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
                   "--tempdir {tmpdir} {tx_gemini_db}")
            cmd = cmd.format(**locals())
            do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
            if ped_file:
                cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
                do.run(cmd, "Add PED file to gemini database", data)
    return gemini_db
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee,
                                  "dbscsnv": _get_dbscsnv, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer}
                    plugins = tz.get_in(("config", "resources", "vep", "plugins"), data, ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += ["--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data)]
                    config_fields += ["HGVSc", "HGVSp"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick"]
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON"] + prediction_fields + ["Protein_position", "BIOTYPE",
                                                             "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def run(bam_file, data, out_dir):
    out = {}
    vcinfo = variant.get_active_vcinfo(data)
    if vcinfo and vcfutils.vcf_has_variants(vcinfo["vrn_file"]):
        out_file = os.path.join(utils.safe_makedir(out_dir),
                                "%s-damage.yaml" % (dd.get_sample_name(data)))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = ["dkfzbiasfilter_summarize.py", "--sample=%s" % dd.get_sample_name(data),
                       "--outfile=%s" % tx_out_file, vcinfo["vrn_file"]]
                do.run(cmd, "Summarize damage filtering")
        if utils.file_exists(out_file):
            out["base"] = out_file
    return out
def to_single(in_file, data, passonly=False):
    """Convert multi-allelic inputs in the original VCF file into single alleles.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _decompose(in_file, data, passonly=passonly)
            ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
            if ann_ma_file:
                ready_ma_file = ann_ma_file
            out_file = ready_ma_file
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
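# Minimal sketch of what decomposition means here (the real `_decompose` relies on external
# tooling and also rewrites genotype fields, which this toy splitter does not attempt):
def _split_alts_sketch(vcf_line):
    """Split a multi-allelic VCF data line into one biallelic line per ALT allele."""
    fields = vcf_line.rstrip("\n").split("\t")
    chrom, pos, vid, ref, alts = fields[:5]
    return ["\t".join([chrom, pos, vid, ref, alt] + fields[5:]) for alt in alts.split(",")]

# Example: "chr1\t100\t.\tA\tG,T\t50\tPASS" -> two lines, one with ALT=G and one with ALT=T.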
def snpeff_effects(data):
    """Annotate input VCF file with effects calculated by snpEff.
    """
    vcf_in = data["vrn_file"]
    interval_file = data["config"]["algorithm"].get("variant_regions", None)
    if vcfutils.vcf_has_variants(vcf_in):
        se_interval = _convert_to_snpeff_interval(interval_file, vcf_in) if interval_file else None
        try:
            vcf_file = _run_snpeff(vcf_in, se_interval, "vcf", data)
        finally:
            for fname in [se_interval]:
                if fname and os.path.exists(fname):
                    os.remove(fname)
        return vcf_file
def _get_build_type(fnames, samples, caller):
    """Confirm we should build a gemini database: need gemini in tools_on.

    Checks for valid conditions for running a database and gemini or gemini_orig
    configured in tools_on.
    """
    build_type = set()
    if any(vcfutils.vcf_has_variants(f) for f in fnames) and caller not in NO_DB_CALLERS:
        for data in samples:
            if any([x in dd.get_tools_on(data)
                    for x in ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]]):
                if vcfanno.annotate_gemini(data):
                    build_type.add("gemini_orig" if "gemini_orig" in dd.get_tools_on(data) else "gemini")
    return build_type
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, is_batch = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        passonly = all("gemini_allvariants" not in dd.get_tools_on(d) for d in samples)
        gemini_vcf = multiallelic.to_single(gemini_vcf, data, passonly=passonly)
        gemini_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if vcfutils.vcf_has_variants(gemini_vcf):
        if not utils.file_exists(gemini_db) and use_gemini:
            ped_file = create_ped_file(samples + extras, gemini_vcf)
            # Use original approach for hg19/GRCh37 pending additional testing
            if support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples):
                gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db, ped_file)
            else:
                gemini_db = create_gemini_db(gemini_vcf, data, gemini_db, ped_file)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf, "decomposed": use_gemini}]]
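# Hedged sketch of the structure the function above returns (paths and the batch/caller
# names are made up for illustration):
example_prep_gemini_return = [[("batch1", "gatk-haplotype"),
                               {"db": "/work/gemini/batch1-gatk-haplotype.db",   # None when the load is skipped
                                "vcf": "/work/gemini/batch1-gatk-haplotype.vcf.gz",
                                "decomposed": True}]]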
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if not utils.file_exists(gemini_db):
        data_basepath = install.get_gemini_dir(data) if support_gemini_orig(data) else None
        ann_file = vcfanno.run_vcfanno(gemini_vcf, "gemini", data, data_basepath)
        with file_transaction(data, gemini_db) as tx_gemini_db:
            vcf2db = config_utils.get_program("vcf2db.py", data)
            cmd = [vcf2db, ann_file, ped_file, tx_gemini_db]
            do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
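# With hypothetical file names, the vcf2db invocation assembled above amounts to the following
# argument list (ann_file is the vcfanno-annotated VCF; the PED file carries sample metadata):
example_vcf2db_cmd = ["vcf2db.py", "batch1-gatk-annotated.vcf.gz",
                      "batch1-gatk.ped", "batch1-gatk.db"]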
def run(bam_file, data, out_dir):
    """Summarize DKFZ bias (damage) filtering results for the sample's active variant calls."""
    out = {}
    vcinfo = variant.get_active_vcinfo(data, use_ensemble=False)
    dkfzbiasfilter = config_utils.get_program("dkfzbiasfilter_summarize.py", data)
    if vcinfo and vcfutils.vcf_has_variants(vcinfo["vrn_file"]):
        out_file = os.path.join(utils.safe_makedir(out_dir),
                                "%s-damage.yaml" % (dd.get_sample_name(data)))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [dkfzbiasfilter, "--sample=%s" % dd.get_sample_name(data),
                       "--outfile=%s" % tx_out_file, vcinfo["vrn_file"]]
                do.run(cmd, "Summarize damage filtering")
        if utils.file_exists(out_file):
            out["base"] = out_file
    return out
def to_single(in_file, data):
    """Convert multi-allelic inputs in the original VCF file into single alleles.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        ba_file, ma_file = _split_mulitallelic(in_file, data)
        if vcfutils.vcf_has_variants(ma_file):
            ready_ma_file = _decompose(ma_file, data)
            ann_ma_file = effects.add_to_vcf(ready_ma_file, data)
            if ann_ma_file:
                ready_ma_file = ann_ma_file
            out_file = vcfutils.merge_sorted([ready_ma_file, ba_file], out_file, data)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _add_vcf_header_sample_cl(in_file, items, base_file):
    """Add phenotype information to a VCF header.

    Encode tumor/normal relationships in VCF header. Could also eventually
    handle more complicated pedigree information if useful.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        toadd = ["##SAMPLE=<ID=%s,Genomes=Tumor>" % paired.tumor_name]
        if paired.normal_name:
            toadd.append("##SAMPLE=<ID=%s,Genomes=Germline>" % paired.normal_name)
            toadd.append("##PEDIGREE=<Derived=%s,Original=%s>" % (paired.tumor_name, paired.normal_name))
        new_header = _update_header(in_file, base_file, toadd, _fix_generic_tn_names(paired))
        if vcfutils.vcf_has_variants(in_file):
            cmd = "bcftools reheader -h {new_header} | bcftools view "
            return cmd.format(**locals())
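# For a hypothetical tumor/normal pair ("T1" / "N1"), the header lines added above render as:
example_header_lines = [
    "##SAMPLE=<ID=T1,Genomes=Tumor>",
    "##SAMPLE=<ID=N1,Genomes=Germline>",
    "##PEDIGREE=<Derived=T1,Original=N1>",
]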
def snpeff_effects(data):
    """Annotate input VCF file with effects calculated by snpEff.
    """
    vcf_in = data["vrn_file"]
    interval_file = data["config"]["algorithm"].get("hybrid_target", None)
    if vcfutils.vcf_has_variants(vcf_in):
        se_interval = (_convert_to_snpeff_interval(interval_file, vcf_in)
                       if interval_file else None)
        try:
            vcf_file = _run_snpeff(vcf_in, _get_snpeff_genome(data), se_interval, "vcf", data)
        finally:
            for fname in [se_interval]:
                if fname and os.path.exists(fname):
                    os.remove(fname)
        return vcf_file
def _run_svtyper(in_file, full_bam, sr_bam, data):
    """Genotype structural variant calls with SVtyper.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                cmd = ("gunzip -c {in_file} | "
                       "{python} {svtyper} -B {full_bam} -S {sr_bam} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
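# With hypothetical input paths, the command template above expands to a shell pipeline like:
example_svtyper_cmd = ("gunzip -c sample-sv.vcf.gz | "
                       "/usr/bin/python /usr/bin/svtyper -B sample-full.bam -S sample-sr.bam | "
                       "bgzip -c > sample-sv-wgts.vcf.gz")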
def create_gemini_db_orig(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Original GEMINI specific data loader, only works with hg19/GRCh37.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not utils.file_exists(gemini_db):
        if not vcfutils.vcf_has_variants(gemini_vcf):
            return None
        with file_transaction(data, gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            load_opts = ""
            if "gemini_allvariants" not in dd.get_tools_on(data):
                load_opts += " --passonly"
            # For small test files, skip gene table loading which takes a long time
            if _is_small_vcf(gemini_vcf):
                load_opts += " --skip-gene-tables"
            if "/test_automated_output/" in gemini_vcf:
                load_opts += " --test-mode"
            # Skip CADD or gerp-bp if neither are loaded
            gemini_dir = install.get_gemini_dir(data)
            for skip_cmd, check_file in [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]:
                if not os.path.exists(os.path.join(gemini_dir, check_file)):
                    load_opts += " %s" % skip_cmd
            # skip gerp-bp which slows down loading
            load_opts += " --skip-gerp-bp "
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            tmpdir = os.path.dirname(tx_gemini_db)
            eanns = _get_effects_flag(data)
            # Apply custom resource specifications, allowing use of alternative annotation_dir
            resources = config_utils.get_resources("gemini", data["config"])
            gemini_opts = " ".join([str(x) for x in resources["options"]]) if resources.get("options") else ""
            exports = utils.local_path_export()
            cmd = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                   "-v {gemini_vcf} {eanns} --cores {num_cores} "
                   "--tempdir {tmpdir} {tx_gemini_db}")
            cmd = cmd.format(**locals())
            do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
            if ped_file:
                cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
                do.run(cmd, "Add PED file to gemini database", data)
    return gemini_db
def subset_by_supported(input_file, get_coords, calls_by_name, work_dir, data,
                        headers=("#",)):
    """Limit CNVkit input to calls with support from another caller.

    get_coords is a function that returns chrom, start, end from a line of the
    input_file, allowing handling of multiple input file types.
    """
    support_files = [(c, tz.get_in([c, "vrn_file"], calls_by_name))
                     for c in convert.SUBSET_BY_SUPPORT["cnvkit"]]
    support_files = [(c, f) for (c, f) in support_files if f and vcfutils.vcf_has_variants(f)]
    if len(support_files) == 0:
        return input_file
    else:
        out_file = os.path.join(work_dir, "%s-havesupport%s" %
                                utils.splitext_plus(os.path.basename(input_file)))
        if not utils.file_uptodate(out_file, input_file):
            input_bed = _input_to_bed(input_file, work_dir, get_coords, headers)
            pass_coords = set([])
            with file_transaction(data, out_file) as tx_out_file:
                support_beds = " ".join([_sv_vcf_to_bed(f, c, out_file) for c, f in support_files])
                tmp_cmp_bed = "%s-intersectwith.bed" % utils.splitext_plus(tx_out_file)[0]
                cmd = "bedtools intersect -wa -f 0.5 -r -a {input_bed} -b {support_beds} > {tmp_cmp_bed}"
                do.run(cmd.format(**locals()), "Intersect CNVs with support files")
                for r in pybedtools.BedTool(tmp_cmp_bed):
                    pass_coords.add((str(r.chrom), str(r.start), str(r.stop)))
                with open(input_file) as in_handle:
                    with open(tx_out_file, "w") as out_handle:
                        for line in in_handle:
                            passes = True
                            if not line.startswith(headers):
                                passes = get_coords(line) in pass_coords
                            if passes:
                                out_handle.write(line)
        return out_file
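# Illustrative sketch of a `get_coords` callback for a BED-like input whose first three
# columns are chrom, start, end; real callers may parse other layouts. It returns string
# coordinates so membership tests against `pass_coords` (also strings) behave as expected.
def _bed_line_coords_sketch(line):
    """Return (chrom, start, end) as strings from a tab-delimited BED-style line."""
    chrom, start, end = line.rstrip("\n").split("\t")[:3]
    return chrom, start, end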