Example #1
# Imports as used in the bcbio codebase (this helper sits alongside the other
# ensemble machinery in bcbio.variation.ensemble):
import os

import toolz as tz

from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import config_utils
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do


def _run_ensemble_intersection(batch_id, vrn_files, callers, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir,
                                "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble",
               "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass),
               "--names", ",".join(callers)]
        # Drop filtered calls rather than trying to rescue them, unless
        # configured to use filtered calls
        if not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"],
                         edata):
            cmd += ["--nofiltered"]

        with file_transaction(edata, out_vcf_file) as tx_out_file:
            cmd += [tx_out_file, dd.get_ref_file(edata)] + vrn_files
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(
                str(x) for x in cmd))
            do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}
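A minimal usage sketch, assuming a configured bcbio installation. The batch name, file paths, caller names, and the contents of edata below are illustrative assumptions; a real edata is a full bcbio data dictionary whose reference entries feed dd.get_ref_file:

# Hypothetical inputs -- every path and name here is an assumption:
vrn_files = ["b1-gatk-haplotype.vcf.gz", "b1-freebayes.vcf.gz", "b1-vardict.vcf.gz"]
callers = ["gatk-haplotype", "freebayes", "vardict"]
edata = {"config": {"algorithm": {"num_cores": 4, "ensemble": {"numpass": 2}}},
         "reference": {"fasta": {"base": "/refs/hg38/hg38.fa"}}}
result = _run_ensemble_intersection("b1", vrn_files, callers, "/work/ensemble", edata)
# result -> {"variantcaller": "ensemble",
#            "vrn_file": "/work/ensemble/b1-ensemble.vcf.gz",
#            "bed_file": None}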
Example #2
# Imports as used in the bcbio codebase (bcbio.variation.vcfutils;
# bgzip_and_index is defined in the same module):
import os

from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do


def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    These files do haploid variant calling which lack the PID phasing key/value
    pair in FORMAT, so initial chrM samples cause errors during concatenation
    due to the lack of header merging. This fixes this by updating the first header.
    """
    from bcbio.variation import ploidy
    c, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    if ploidy.get_ploidy(items, region=(c, 1, 2)) == 1:
        # First file is haploid (e.g. chrM): borrow the header from the first
        # non-haploid file so FORMAT keys like PID survive the merge.
        for contig, fname in exist_files[1:]:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = fname
                break
    base_fix_file = os.path.join(
        os.path.dirname(out_file),
        "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s" % (replace_file, header_file),
               "Prepare header file for merging")
        do.run("%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s" %
               (utils.get_java_clprep(), header_file, base_file, tx_out_file),
               "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [x for (c, x) in exist_files[1:]]
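A hedged sketch of the expected input shape: exist_files is a list of (contig, VCF path) tuples in concatenation order, with the haploid chrM file first. The contig names, paths, and config here are made up for illustration:

exist_files = [("chrM", "sample-chrM.vcf.gz"),   # haploid; header lacks PID
               ("chr1", "sample-chr1.vcf.gz"),   # first non-haploid header donor
               ("chr2", "sample-chr2.vcf.gz")]
fixed = _fix_gatk_header(exist_files, "sample-merged.vcf.gz", config)
# fixed[0] is the reheadered chrM VCF; the remaining paths pass through unchanged.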
Example #3
# Module context as in Example #2 (bcbio.variation.vcfutils), with the extra
# imports this function needs; p_bgzip_and_index and bgzip_and_index are
# module-local helpers:
from bcbio import broad
from bcbio.distributed.multi import run_multicore
from bcbio.pipeline import datadict as dd


def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            dict_file = "%s.dict" % utils.splitext_plus(ref_file)[0]
            cores = dd.get_num_cores({"config": config})
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            cmd = ["picard"] + broad.get_picard_opts(config, memscale) + \
                  ["MergeVcfs", "D=%s" % dict_file, "O=%s" % tx_out_file] + \
                  ["I=%s" % f for f in ready_files]
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(cmd))
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
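A sketch of the two calling conventions this function supports, with hypothetical file names. A plain list returns the combined output path; the dict form used inside the pipeline returns a list holding an updated pipeline dictionary:

# Plain form: per-type VCFs in, combined path back.
out = combine_variant_files(["sample-snps.vcf.gz", "sample-indels.vcf.gz"],
                            "sample-combined.vcf.gz", "/refs/hg38.fa", config)

# Pipeline form: dict keyed by config["file_key"].
config["file_key"] = "vrn_file"
out_items = combine_variant_files(
    {"vrn_file": ["sample-snps.vcf.gz", "sample-indels.vcf.gz"]},
    "sample-combined.vcf.gz", "/refs/hg38.fa", config, region="chr1")
# out_items[0]["vrn_file"] == "sample-combined.vcf.gz"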
Example #4
# A variant of the reheadering helper in Example #2 that also threads
# user-configured Picard resource options through to FixVcfHeader; it
# additionally needs:
from bcbio.pipeline import config_utils


def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    These files do haploid variant calling which lack the PID phasing key/value
    pair in FORMAT, so initial chrM samples cause errors during concatenation
    due to the lack of header merging. This fixes this by updating the first header.
    """
    from bcbio.variation import ploidy
    c, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    if ploidy.get_ploidy(items, region=(c, 1, 2)) == 1:
        # First file is haploid (e.g. chrM): borrow the header from the first
        # non-haploid file so FORMAT keys like PID survive the merge.
        for contig, fname in exist_files[1:]:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = fname
                break
    base_fix_file = os.path.join(os.path.dirname(out_file),
                                 "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s"
                % (replace_file, header_file), "Prepare header file for merging")
        resources = config_utils.get_resources("picard", config)
        ropts = []
        if "options" in resources:
            ropts += [str(x) for x in resources.get("options", [])]
        do.run("%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s %s" %
               (utils.get_java_clprep(), header_file, base_file, base_fix_file, " ".join(ropts)),
               "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [x for (c, x) in exist_files[1:]]
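What distinguishes this version is the pass-through of user-configured Picard options. A sketch of the configuration shape config_utils.get_resources would pick up, shown as the Python dict bcbio builds from the sample YAML; the option strings are assumptions for illustration, not values from the snippet:

config = {"resources": {"picard": {"options": ["USE_JDK_DEFLATER=true",
                                               "USE_JDK_INFLATER=true"]}}}
# get_resources("picard", config) returns {"options": [...]}; the strings are
# appended verbatim to the FixVcfHeader command line.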