Beispiel #1
0
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """#
    data = samples[0]
    use_gemini = do_db_build(samples) and any(
        vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, is_batch = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        passonly = all("gemini_allvariants" not in dd.get_tools_on(d)
                       for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=passonly)
    ann_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if vcfutils.vcf_has_variants(gemini_vcf) and caller not in NO_DB_CALLERS:
        if not utils.file_exists(gemini_db) and use_gemini:
            ped_file = create_ped_file(samples + extras, gemini_vcf)
            # Use original approach for hg19/GRCh37 pending additional testing
            if support_gemini_orig(data) and not any(
                    dd.get_vcfanno(d) for d in samples):
                gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db,
                                                  ped_file)
            elif ann_vcf:
                gemini_db = create_gemini_db(ann_vcf, data, gemini_db,
                                             ped_file)
    return [[(name, caller), {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": ann_vcf or gemini_vcf,
        "decomposed": use_gemini
    }]]
Beispiel #2
0
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    name, caller, is_batch = call_info
    build_type = _get_build_type(fnames, samples, caller)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # If we're building a gemini database, normalize the inputs
    if build_type:
        passonly = all("gemini_allvariants" not in dd.get_tools_on(d)
                       for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=passonly)
        decomposed = True
    else:
        decomposed = False
    ann_vcf = run_vcfanno(gemini_vcf, data, decomposed)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if ann_vcf and build_type and not utils.file_exists(gemini_db):
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Original approach for hg19/GRCh37
        if vcfanno.is_human(data, builds=["37"
                                          ]) and "gemini_orig" in build_type:
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db,
                                              ped_file)
        else:
            gemini_db = create_gemini_db(ann_vcf, data, gemini_db, ped_file)
    # only pass along gemini_vcf_downstream if uniquely created here
    if os.path.islink(gemini_vcf):
        gemini_vcf = None
    return [[(name, caller), {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": ann_vcf or gemini_vcf,
        "decomposed": decomposed
    }]]
Beispiel #3
0
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True,
                                         nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
Beispiel #4
0
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True)
                     for f in vrn_files]
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    return [[batch_id, callinfo]]