def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Args:
        fnames: Input VCF files. A database is only considered when at least
            one of them contains variants.
        call_info: ``(name, caller, is_batch)`` tuple; only ``name`` and
            ``caller`` are used here.
        samples: Sample data dictionaries. The first one supplies the work
            directory and is handed to the helper calls.
        extras: Additional samples appended to ``samples`` when creating the
            PED file.

    Returns:
        A nested list ``[[(name, caller), info]]`` where ``info`` has the keys
        ``"db"`` (database path, or None when that file does not exist),
        ``"vcf"`` (annotated VCF if available, otherwise the combined VCF) and
        ``"decomposed"`` (whether the inputs were normalized).
    """
    # The first sample is the representative configuration for the batch.
    data = samples[0]
    use_gemini = do_db_build(samples) and any(
        vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, _is_batch = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        # Keep only passing variants unless a sample enables gemini_allvariants
        passonly = all("gemini_allvariants" not in dd.get_tools_on(d)
                       for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=passonly)
    ann_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if (vcfutils.vcf_has_variants(gemini_vcf) and caller not in NO_DB_CALLERS
            and not utils.file_exists(gemini_db) and use_gemini):
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        if support_gemini_orig(data) and not any(
                dd.get_vcfanno(d) for d in samples):
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db,
                                              ped_file)
        elif ann_vcf:
            gemini_db = create_gemini_db(ann_vcf, data, gemini_db, ped_file)
    return [[(name, caller),
             {"db": gemini_db if utils.file_exists(gemini_db) else None,
              "vcf": ann_vcf or gemini_vcf,
              "decomposed": use_gemini}]]
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Args:
        fnames: Input VCF files, combined into one multisample VCF.
        call_info: ``(name, caller, is_batch)`` tuple; only ``name`` and
            ``caller`` are used here.
        samples: Sample data dictionaries. The first one supplies the work
            directory and is handed to the helper calls.
        extras: Additional samples appended to ``samples`` when creating the
            PED file.

    Returns:
        A nested list ``[[(name, caller), info]]`` where ``info`` has the keys
        ``"db"`` (database path, or None when that file does not exist),
        ``"vcf"`` (annotated VCF, falling back to the combined VCF unless it
        is a symlink) and ``"decomposed"`` (whether inputs were normalized).
    """
    data = samples[0]
    name, caller, _is_batch = call_info
    build_type = _get_build_type(fnames, samples, caller)
    gemini_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)

    # Inputs are normalized only when a gemini database will be built.
    decomposed = bool(build_type)
    if decomposed:
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d)
                           for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=passonly)

    ann_vcf = run_vcfanno(gemini_vcf, data, decomposed)
    gemini_db = os.path.join(gemini_dir, "%s-%s.db" % (name, caller))

    needs_build = ann_vcf and build_type and not utils.file_exists(gemini_db)
    if needs_build:
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Original approach for hg19/GRCh37
        use_orig = (vcfanno.is_human(data, builds=["37"])
                    and "gemini_orig" in build_type)
        if use_orig:
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db,
                                              ped_file)
        else:
            gemini_db = create_gemini_db(ann_vcf, data, gemini_db, ped_file)

    # A symlinked VCF was not uniquely created here, so do not pass it along.
    downstream_vcf = None if os.path.islink(gemini_vcf) else gemini_vcf
    db_out = gemini_db if utils.file_exists(gemini_db) else None
    info = {
        "db": db_out,
        "vcf": ann_vcf or downstream_vcf,
        "decomposed": decomposed,
    }
    return [[(name, caller), info]]
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.

    Two calling conventions are supported:

    * Exactly three arguments ``(batch_id, samples, data)``: callers and
      variant files are looked up with ``_organize_variants``.
    * Any other number of arguments: each argument is converted to a single
      sample record; the batch identifier, caller names and variant files are
      read from the first record (the ``is_cwl`` path).

    Returns:
        ``[[batch_id, callinfo]]`` for the three-argument form, otherwise
        ``[{"ensemble": callinfo}]`` with ``batch_samples`` and ``batch_id``
        added to ``callinfo``.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    # Work on a copy so the ensemble-specific settings do not leak into data.
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(
        os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    # Generator lets any() stop at the first callset that has variants
    # instead of checking every file.
    if any(vcfutils.vcf_has_variants(f) for f in vrn_files):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(
            ["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [
            normalize.normalize(
                f, data, passonly=passonly, rerun_effects=False,
                remove_oldeffects=True, nonrefonly=True,
                work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
            for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(
                batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(
                batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(
                batch_id, vrn_files, config_file, base_dir,
                dd.get_ref_file(edata), edata)
        callinfo["vrn_file"] = vcfutils.bgzip_and_index(
            callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing,
        # re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(
            edata)[0][0].get("validate")
    else:
        # No callset has variants: emit an empty ensemble VCF instead.
        out_vcf_file = os.path.join(
            base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(
            out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {
            "variantcaller": "ensemble",
            "vrn_file": vcfutils.bgzip_and_index(out_vcf_file,
                                                 data["config"]),
            "bed_file": None,
        }
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    return [[batch_id, callinfo]]
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.

    Args:
        batch_id: Batch identifier, used for logging, the working directory
            name and output file names.
        samples: Sample data dictionaries; the first one lists the variant
            callers under ``"variants"``.
        data: Data dictionary supplying the configuration; it is deep-copied
            before ensemble-specific settings are written.

    Returns:
        ``[[batch_id, callinfo]]`` where ``callinfo`` describes the ensemble
        callset (``"vrn_file"``, ``"bed_file"`` and, when variants exist,
        ``"validate"``).
    """
    caller_label = ",".join(
        x["variantcaller"] for x in samples[0]["variants"])
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, caller_label))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(
        os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, _bam_files = _organize_variants(
        samples, batch_id)

    # Stops at the first callset found to contain variants.
    has_variants = any(vcfutils.vcf_has_variants(v) for v in vrn_files)

    if not has_variants:
        # Nothing to merge: write an empty ensemble VCF.
        empty_vcf = os.path.join(
            base_dir, "{0}-ensemble.vcf".format(batch_id))
        sample_names = [dd.get_sample_name(d) for d in samples]
        vcfutils.write_empty_vcf(empty_vcf, samples=sample_names)
        callinfo = {
            "variantcaller": "ensemble",
            "vrn_file": vcfutils.bgzip_and_index(empty_vcf, data["config"]),
            "bed_file": None,
        }
        return [[batch_id, callinfo]]

    # Decompose multiallelic variants and normalize
    use_filtered = tz.get_in(
        ["config", "algorithm", "ensemble", "use_filtered"], edata, False)
    passonly = not use_filtered
    vrn_files = [
        normalize.normalize(v, data, passonly=passonly, rerun_effects=False,
                            remove_oldeffects=True)
        for v in vrn_files]

    if "classifiers" in edata["config"]["algorithm"]["ensemble"]:
        config_file = _write_config_file(
            batch_id, caller_names, base_dir, edata)
        callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                 edata["sam_ref"], edata)
    else:
        callinfo = _run_ensemble_intersection(
            batch_id, vrn_files, caller_names, base_dir, edata)

    callinfo["vrn_file"] = vcfutils.bgzip_and_index(
        callinfo["vrn_file"], data["config"])
    # After decomposing multiallelic variants and normalizing,
    # re-evaluate effects
    ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
    if ann_ma_file:
        callinfo["vrn_file"] = ann_ma_file

    edata["config"]["algorithm"]["variantcaller"] = "ensemble"
    edata["vrn_file"] = callinfo["vrn_file"]
    edata["ensemble_bed"] = callinfo["bed_file"]
    validation = validate.compare_to_rm(edata)[0][0]
    callinfo["validate"] = validation.get("validate")
    return [[batch_id, callinfo]]