def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a GEMINI database and annotated VCF from snpEff-prepared VCF inputs.

    ``call_info`` unpacks to ``(name, caller, is_batch)``; only ``name`` and
    ``caller`` are used here. The first sample supplies the shared
    configuration passed to every helper.

    Returns a single-item list ``[[(name, caller), info]]`` where ``info`` has:
      - "db": the database path, or None when no file exists at that path
      - "vcf": the ``run_vcfanno`` result, falling back to the multisample VCF
      - "decomposed": True when the inputs were normalized for a database build
    """
    shared = samples[0]
    name, caller, _is_batch = call_info
    build_type = _get_build_type(fnames, samples, caller)
    out_dir = utils.safe_makedir(os.path.join(shared["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, shared)

    # Inputs are only normalized when a gemini database build was requested.
    decomposed = bool(build_type)
    if decomposed:
        wants_all_variants = any("gemini_allvariants" in dd.get_tools_on(d)
                                 for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, shared,
                                         passonly=not wants_all_variants)

    ann_vcf = run_vcfanno(gemini_vcf, shared, decomposed)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    needs_db = ann_vcf and build_type and not utils.file_exists(gemini_db)
    if needs_db:
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # The original database approach is kept for hg19/GRCh37 when requested.
        use_orig = (vcfanno.is_human(shared, builds=["37"])
                    and "gemini_orig" in build_type)
        if use_orig:
            gemini_db = create_gemini_db_orig(gemini_vcf, shared, gemini_db, ped_file)
        else:
            gemini_db = create_gemini_db(ann_vcf, shared, gemini_db, ped_file)

    # Only pass the multisample VCF downstream if it was uniquely created here
    # (a symlink means it was not).
    if os.path.islink(gemini_vcf):
        gemini_vcf = None

    info = {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": ann_vcf or gemini_vcf,
        "decomposed": decomposed,
    }
    return [[(name, caller), info]]
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl Variant Effect Predictor (VEP).

    Output goes to a sibling file carrying the "-vepeffects" stem suffix. The
    VEP run is skipped when that file already exists or when
    ``prep_vep_cache`` yields no cache directory.

    Returns None when the input has no variants or when no output file exists
    afterwards; otherwise the result of ``vcfutils.bgzip_and_index`` on the
    output file.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            genome_build = data["genome_build"]
            fasta_base = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(genome_build, fasta_base)
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("vep", data["config"])

                # HGVS requires a bgzip compressed, faidx indexed reference or
                # it is unusably slow, so it is only enabled in that case.
                compressed_ref = dd.get_ref_file_compressed(data)
                hgvs_compatible = bool(compressed_ref)
                fasta_ref = compressed_ref if hgvs_compatible else dd.get_ref_file(data)
                config_args = ["--fasta", fasta_ref]

                if vcfanno.is_human(data):
                    plugin_fns = {
                        "loftee": _get_loftee,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer,
                        "spliceregion": _get_spliceregion,
                    }
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin_name in plugins:
                        config_args += plugin_fns[plugin_name](data)
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]

                canonical_only = dd.get_effects_transcripts(data).startswith("canonical")
                if canonical_only or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")

                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]

                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                cmd += fork_args
                cmd += extra_args
                cmd += ["--species", ensembl_name, "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir]
                cmd += ["--symbol", "--numbers", "--biotype", "--total_length",
                        "--canonical", "--gene_phenotype", "--ccds", "--uniprot",
                        "--domains", "--regulatory", "--protein", "--tsl",
                        "--appris", "--af", "--max_af", "--af_1kg", "--af_esp",
                        "--af_gnomad", "--pubmed", "--variant_class",
                        "--allele_number"]
                cmd += config_args

                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl Variant Effect Predictor (VEP).

    This variant always enables the G2P plugin alongside loftee for human
    samples and adds genomic HGVS (``--hgvsg``) when HGVS output is enabled.

    Output goes to a sibling file carrying the "-vepeffects" stem suffix. The
    VEP run is skipped when that file already exists or when
    ``prep_vep_cache`` yields no cache directory.

    Returns None when the input has no variants or when no output file exists
    afterwards; otherwise the result of ``vcfutils.bgzip_and_index`` on the
    output file.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            genome_build = data["genome_build"]
            fasta_base = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(genome_build, fasta_base)
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("vep", data["config"])

                # HGVS requires a bgzip compressed, faidx indexed reference or
                # it is unusably slow, so it is only enabled in that case.
                compressed_ref = dd.get_ref_file_compressed(data)
                hgvs_compatible = bool(compressed_ref)
                fasta_ref = compressed_ref if hgvs_compatible else dd.get_ref_file(data)
                config_args = ["--fasta", fasta_ref]

                if vcfanno.is_human(data):
                    plugin_fns = {
                        "loftee": _get_loftee,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer,
                        "spliceregion": _get_spliceregion,
                        "G2P": _get_G2P,
                    }
                    plugins = ["loftee", "G2P"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin_name in plugins:
                        config_args += plugin_fns[plugin_name](data)
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvsg", "--hgvs", "--shift_hgvs", "1"]

                canonical_only = dd.get_effects_transcripts(data).startswith("canonical")
                if canonical_only or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")

                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]

                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                cmd += fork_args
                cmd += extra_args
                cmd += ["--species", ensembl_name, "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir]
                cmd += ["--symbol", "--numbers", "--biotype", "--total_length",
                        "--canonical", "--gene_phenotype", "--ccds", "--uniprot",
                        "--domains", "--regulatory", "--protein", "--tsl",
                        "--appris", "--af", "--max_af", "--af_1kg", "--af_esp",
                        "--af_gnomad", "--pubmed", "--variant_class",
                        "--allele_number"]
                cmd += config_args

                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])
def get_qc_tools(data):
    """Retrieve the sorted list of QC tools to run for a sample.

    A QC list already set on the sample (``dd.get_algorithm_qc``) is returned
    unchanged. Otherwise tools are chosen from the analysis type and the
    tools_on/tools_off configuration; anything named in tools_off is dropped
    before the list is sorted.
    """
    preset = dd.get_algorithm_qc(data)
    if preset:
        return preset

    analysis = data["analysis"].lower()
    tools_off = dd.get_tools_off(data)
    tools_on = dd.get_tools_on(data)
    coverage_enabled = "coverage_qc" not in tools_off
    to_run = []

    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in tools_off:
        to_run.append("fastqc")
    if "qualimap" in tools_on or "qualimap_full" in tools_on:
        to_run.append("qualimap")

    is_rnaseq_like = analysis.startswith("rna-seq") or analysis == "smallrna-seq"
    if is_rnaseq_like and "qualimap" not in tools_off:
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")

    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
        if dd.get_chip_method(data) == "atac":
            to_run.append("ataqv")
    if analysis.startswith("smallrna-seq"):
        to_run.extend(["small-rna", "atropos"])
    if coverage_enabled:
        to_run.append("samtools")

    if dd.has_variantcalls(data):
        if coverage_enabled:
            to_run.extend(["coverage", "picard"])
        to_run.extend(["qsignature", "variants"])
        if vcfanno.is_human(data):
            to_run.append("peddy")
            if "contamination" not in tools_off:
                to_run.append("contamination")
        if vcfutils.get_paired_phenotype(data):
            if "viral" not in tools_off:
                to_run.append("viral")
        if damage.should_filter([data]):
            to_run.append("damage")

    if dd.get_umi_consensus(data):
        to_run.append("umi")
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")

    # Final pass drops anything explicitly disabled, whatever added it above.
    return sorted(tool for tool in to_run if tool not in tools_off)
def _back_compatible_gemini(conf_files, data):
    """Provide the old install directory for configurations using GEMINI-supplied tidy VCFs.

    Handles new style (bcbio installed) and old style (GEMINI installed)
    configuration and data locations. Only applies to human build 37 samples.

    Scans each existing ``gemini.conf`` in ``conf_files`` for a line starting
    with "file" whose value (text after the last "=", quotes removed) contains
    ".tidy." past its first character. Returns ``install.get_gemini_dir(data)``
    on the first such line, otherwise None.
    """
    if not vcfanno.is_human(data, builds=["37"]):
        return None
    for conf_file in conf_files:
        if not conf_file or os.path.basename(conf_file) != "gemini.conf":
            continue
        if not os.path.exists(conf_file):
            continue
        with open(conf_file) as in_handle:
            for line in in_handle:
                if not line.startswith("file"):
                    continue
                _, _, value = line.strip().rpartition("=")
                fname = value.replace('"', '').strip()
                # Strictly positive index: a first match at position 0 does not count.
                if fname.find(".tidy.") > 0:
                    return install.get_gemini_dir(data)
    return None
def get_qc_tools(data):
    """Retrieve the sorted list of QC tools to run for a sample.

    A QC list already set on the sample (``dd.get_algorithm_qc``) is returned
    unchanged. Otherwise tools are chosen from the analysis type and the
    tools_on/tools_off configuration; anything named in tools_off is dropped
    before the list is sorted.
    """
    preset = dd.get_algorithm_qc(data)
    if preset:
        return preset

    analysis = data["analysis"].lower()
    tools_off = dd.get_tools_off(data)
    tools_on = dd.get_tools_on(data)
    coverage_enabled = "coverage_qc" not in tools_off
    to_run = []

    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in tools_off:
        to_run.append("fastqc")
    if "qualimap" in tools_on or "qualimap_full" in tools_on:
        to_run.append("qualimap")

    is_rnaseq_like = analysis.startswith("rna-seq") or analysis == "smallrna-seq"
    if is_rnaseq_like and "qualimap" not in tools_off:
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")

    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
    if analysis.startswith("smallrna-seq"):
        to_run.extend(["small-rna", "atropos"])
    if coverage_enabled:
        to_run.append("samtools")

    # Variant-calling style analyses get the variant-oriented QC steps.
    if analysis.startswith(("standard", "variant", "variant2")):
        if coverage_enabled:
            to_run.extend(["coverage", "picard"])
        to_run.extend(["qsignature", "variants"])
        if vcfanno.is_human(data):
            to_run.extend(["contamination", "peddy"])
        if vcfutils.get_paired_phenotype(data):
            to_run.append("viral")
        if damage.should_filter([data]):
            to_run.append("damage")

    if dd.get_umi_consensus(data):
        to_run.append("umi")
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")

    # Final pass drops anything explicitly disabled, whatever added it above.
    return sorted(tool for tool in to_run if tool not in tools_off)
def run_peddy(samples, out_dir=None):
    """Run peddy QC on a batch of samples and attach its outputs to each sample.

    Args:
        samples: sample dictionaries for one batch; the first one supplies
            shared configuration (work directory, genome build, cores).
        out_dir: optional output directory. Defaults to
            ``<work_dir>/qc/<batch>/peddy``.

    Returns:
        ``samples`` unchanged when peddy is skipped (no executable, disabled
        via tools_off, non-human sample, no usable VCF, an existing
        "-failed.log", or a recognized benign peddy failure). Otherwise the
        result of ``dd.set_in_samples`` with the expected peddy files.

    Raises:
        Re-raises the ``do.run`` failure when peddy fails for a reason that is
        not recognized as benign. This includes a failure that left no stderr
        output at all, which is no longer recorded as a "no overlap" skip.
    """
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    failed_log = peddy_prefix + "-failed.log"
    vcf_file = _find_peddy_vcf(samples, peddy_dir)

    peddy = (config_utils.get_program("peddy", data)
             if config_utils.program_installed("peddy", data) else None)
    config_skips = any("peddy" in dd.get_tools_off(d) for d in samples)

    # First matching reason wins; the order fixes which message is reported.
    skip_reason = None
    if not peddy:
        skip_reason = "peddy executable not found"
    elif config_skips:
        skip_reason = "peddy in tools_off configuration"
    elif not vcfanno.is_human(data):
        skip_reason = "sample is not human"
    elif not vcf_file:
        skip_reason = "no suitable VCF files found with the sample and non-filtered variants"
    if skip_reason:
        msg = "Skipping peddy QC, %s: %s" % (
            skip_reason, [dd.get_sample_name(d) for d in samples])
        with open(failed_log, "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples

    # A failure marker left by an earlier run disables peddy for this batch.
    if file_exists(failed_log):
        return samples

    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found
            # messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}"
            ).format(locale=locale, peddy=peddy, num_cores=num_cores,
                     sites_str=sites_str, peddy_prefix_tx=peddy_prefix_tx,
                     vcf_file=vcf_file, ped_file=ped_file, stderr_log=stderr_log)
            message = "Running peddy on {vcf_file} against {ped_file}.".format(
                vcf_file=vcf_file, ped_file=ped_file)
            try:
                do.run(cmd, message)
            except Exception:
                # `Exception` rather than a bare except, so KeyboardInterrupt
                # and SystemExit propagate instead of being recorded as a
                # permanent peddy skip.
                stderr_tail = collections.deque(maxlen=100)
                # The log can be absent if the command never started; do not
                # let that mask the original failure.
                if os.path.exists(stderr_log):
                    with open(stderr_log) as in_handle:
                        for line in in_handle:
                            stderr_tail.append(line)
                if not _is_expected_peddy_failure(stderr_tail):
                    logger.warning("".join(stderr_tail))
                    raise
                logger.info(
                    "Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(failed_log, "w") as out_handle:
                    out_handle.write(
                        "peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)


def _find_peddy_vcf(samples, peddy_dir):
    """Return a VCF usable by peddy for these samples, or None if none qualifies.

    A candidate must exist on disk, contain the sample's name and have
    non-filtered variants. "germline" is preferred over "vrn_file" within a
    sample; when several samples qualify, the last one in ``samples`` wins.
    """
    vcf_file = None
    for d in samples:
        # Tumor samples use extracted germline calls; everything else uses the
        # active variant call information.
        if dd.get_phenotype(d) == "tumor":
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        else:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo:
            continue
        for key in ["germline", "vrn_file"]:
            candidate = vcinfo.get(key)
            if (candidate and utils.file_exists(candidate)
                    and dd.get_sample_name(d) in vcfutils.get_samples(candidate)
                    and vcfutils.vcf_has_nonfiltered_variants(candidate)):
                vcf_file = candidate
                break
    return vcf_file


def _is_expected_peddy_failure(stderr_lines):
    """Report whether peddy's stderr shows a failure that is safe to skip.

    True when any line matches a known benign error, or when there is at least
    one line and every line is a "no intervals found for" message.
    """
    def allowed_error(line):
        return (
            ("IndexError" in line and "is out of bounds for axis" in line)
            or ("n_components=" in line
                and ("must be between 1 and n_features=" in line
                     or "must be between 1 and min" in line))
            or "Input contains NaN, infinity or a value too large for dtype" in line
        )

    if any(allowed_error(line) for line in stderr_lines):
        return True
    # all() is vacuously True for no lines; an empty log is not evidence of an
    # expected failure, so it must not be treated as one.
    return bool(stderr_lines) and all(
        "no intervals found for" in line for line in stderr_lines)