Esempio n. 1
0
def _sample_variant_file_in_population(x):
    """Check if a sample file is the same as the population file.

    This is true for batches where we don't extract into samples and do not
    run decomposition for gemini.
    '"""
    if "population" in x:
        a = _get_project_vcf(x)
        b = _get_variant_file(x, ("vrn_file",))
        decomposed = tz.get_in(("population", "decomposed"), x)
        if (a and b and not decomposed and len(a) > 0 and len(b) > 0 and
              vcfutils.get_samples(a[0]["path"]) == vcfutils.get_samples(b[0]["path"])):
            return True
    return False
Esempio n. 2
0
def _sample_variant_file_in_population(x):
    """Check if a sample file is the same as the population file.

    This is true for batches where we don't extract into samples and do not
    run decomposition for gemini.
    '"""
    if "population" in x:
        a = _get_variant_file(x, ("population", "vcf"))
        b = _get_variant_file(x, ("vrn_file",))
        decomposed = tz.get_in(("population", "decomposed"), x)
        if (a and b and not decomposed and len(a) > 0 and len(b) > 0 and
              vcfutils.get_samples(a[0]["path"]) == vcfutils.get_samples(b[0]["path"])):
            return True
    return False
Esempio n. 3
0
def split_variants_by_sample(data):
    """Turn a grouped call file back into per-sample inputs.

    Returns a list of single-item lists, one per original sample. Three
    cases are handled:

    - no "group_orig" key: the input is returned wrapped as ``[[data]]``;
    - tumor/normal pair: the unsplit call file (and any "combine" entry) is
      attached to the tumor item only;
    - otherwise: each original item gets its own variant file, named by
      replacing the batch prefix in the batch file path with the sample name.
    """
    # Nothing was grouped, so there is nothing to split.
    if "group_orig" not in data:
        return [[data]]

    per_sample = []
    if vcfutils.get_paired_phenotype(data):
        # Cancer tumor/normal: keep the file whole and hand it to the tumor.
        for orig_item in data["group_orig"]:
            if vcfutils.get_paired_phenotype(orig_item) == "tumor":
                if "combine" in data:
                    orig_item["combine"] = data["combine"]
                orig_item["vrn_file"] = data["vrn_file"]
            per_sample.append([orig_item])
        return per_sample

    # Population or single sample.
    for orig_item in data["group_orig"]:
        batch_file = data["vrn_file"]
        batch_prefix = data["group"][0] + "-"
        sample_name = orig_item["name"][-1]
        sample_file = batch_file.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_file)) > 1:
            # Multi-sample file: hand off to select_sample to produce the per-sample file.
            vcfutils.select_sample(batch_file, sample_name, sample_file, data["config"])
        elif not os.path.exists(sample_file):
            # Single-sample file: link it under the per-sample name.
            utils.symlink_plus(batch_file, sample_file)
        if "combine" in data:
            orig_item["combine"] = data["combine"]
        orig_item["vrn_file"] = sample_file
        per_sample.append([orig_item])
    return per_sample
Esempio n. 4
0
def filter_vcf_by_sex(vcf_file, data):
    """Rewrite a single sample VCF with ploidy fixes applied per line.

    Per the original notes this handles sex chromosomes and mitochondrial
    calls, and does not try to resolve called hets into potential
    homozygotes when converting diploid to haploid.

    Files with more than one sample (pooled) are returned untouched; that
    case is not implemented yet. Otherwise returns the path of the
    "-ploidyfix" output, reusing it when it already exists.
    """
    if len(vcfutils.get_samples(vcf_file)) > 1:
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    final_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if utils.file_exists(final_file):
        return final_file

    # Write plain text first; compress afterwards if the target is gzipped.
    plain_file = final_file.replace(".vcf.gz", ".vcf")
    with file_transaction(plain_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle, utils.open_gzipsafe(vcf_file) as in_handle:
            for record in in_handle:
                # Header lines pass through; records may be dropped by the fixer.
                fixed = record if record.startswith("#") else _fix_line_ploidy(record, sex)
                if fixed:
                    out_handle.write(fixed)
    if final_file.endswith(".gz"):
        return vcfutils.bgzip_and_index(plain_file, data["config"])
    return plain_file
Esempio n. 5
0
def _get_vcf_samples(calls, items):
    """Find which samples in ``items`` the call files in ``calls`` belong to.

    A file belongs to a sample when its base name starts with the sample
    name, or one of the sample's batch names, followed by "-" or ".". For
    samples with a "normal" phenotype the tested name gets a "-germline"
    suffix, so germline VCFs attach to normals and standard somatic calls
    go to tumors.

    Once any file has matched a sample name directly, later matches that
    are only by batch name are ignored. Returns the matched sample names
    as a list.
    """
    # Never switched on in this function, so the intersection branch below
    # is currently not exercised.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for call_file in utils.flatten(calls):
        if have_full_file:
            file_samples = set(vcfutils.get_samples(call_file))
            if file_samples:
                if all_samples:
                    all_samples &= set(file_samples)
                else:
                    all_samples = file_samples
            continue
        for data in items:
            sample_name = dd.get_sample_name(data)
            candidates = [sample_name] + dd.get_batches(data)
            is_normal = dd.get_phenotype(data) == "normal"
            file_name = os.path.basename(call_file)
            for idx, test_name in enumerate(candidates):
                if is_normal:
                    test_name = test_name + "-germline"
                if not file_name.startswith(("%s-" % test_name, "%s." % test_name)):
                    continue
                # Prefer matches to single samples (gVCF) over joint batches
                if idx == 0:
                    sample_matches = True
                if not (sample_matches and idx > 0):
                    all_samples.add(sample_name)
    return list(all_samples)
Esempio n. 6
0
def split_variants_by_sample(data):
    """Turn a grouped call file back into per-sample inputs.

    Returns a list of single-item lists, one per original sample. Three
    cases are handled:

    - no "group_orig" key: the input is returned wrapped as ``[[data]]``;
    - tumor/normal pair: the unsplit call file is attached to the tumor
      item only;
    - otherwise: each original item gets its own variant file, named by
      replacing the batch prefix in the batch file path with the sample name.
    """
    # Nothing was grouped, so there is nothing to split.
    if "group_orig" not in data:
        return [[data]]

    per_sample = []
    if vcfutils.get_paired_phenotype(data):
        # Cancer tumor/normal: keep the file whole and hand it to the tumor.
        for orig_item in data["group_orig"]:
            if vcfutils.get_paired_phenotype(orig_item) == "tumor":
                orig_item["vrn_file"] = data["vrn_file"]
            per_sample.append([orig_item])
        return per_sample

    # Population or single sample.
    for orig_item in data["group_orig"]:
        batch_file = data["vrn_file"]
        batch_prefix = str(data["group"][0]) + "-"
        sample_name = str(orig_item["name"][-1])
        sample_file = batch_file.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_file)) > 1:
            # Multi-sample file: hand off to select_sample to produce the per-sample file.
            vcfutils.select_sample(batch_file, sample_name, sample_file, data["config"])
        elif not os.path.exists(sample_file):
            # Single-sample file: link it under the per-sample name.
            utils.symlink_plus(batch_file, sample_file)
        orig_item["vrn_file"] = sample_file
        per_sample.append([orig_item])
    return per_sample
Esempio n. 7
0
def _comparison_stats_from_merge(in_file, stats, svcaller, data):
    """Classify calls in a merged SURVIVOR VCF as true/false positives/negatives.

    Each sample column is labelled "truth" when its name ends in "-truth"
    and "eval" otherwise. The per-sample digits of the SUPP_VEC entry in
    the INFO column decide the bucket for every record: supported by both
    is "tp", by truth only is "fn", anything else is "fp". The summarized
    calls are handed to ``_to_csv`` and its result is returned.
    """
    truth_stats = {"tp": [], "fn": [], "fp": []}

    sample_roles = []
    for sample_name in vcfutils.get_samples(in_file):
        sample_roles.append("truth" if sample_name.endswith("-truth") else "eval")

    with open(in_file) as in_handle:
        for raw_line in in_handle:
            if raw_line.startswith("#"):
                continue
            call = raw_line.rstrip().split("\t")
            supp_entries = [entry for entry in call[7].split(";") if entry.startswith("SUPP_VEC=")]
            _, supp_vec = supp_entries[0].split("=")
            # One digit per sample, in the same order as the sample columns.
            support = dict(zip(sample_roles, [int(flag) for flag in supp_vec]))
            if support["truth"]:
                metric = "tp" if support["eval"] else "fn"
            else:
                metric = "fp"
            truth_stats[metric].append(_summarize_call(call))
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
Esempio n. 8
0
def _get_vcf_samples(calls, items):
    """Find which samples in ``items`` the call files in ``calls`` belong to.

    A file belongs to a sample when its base name starts with the sample
    name, or one of the sample's batch names, followed by "-" or ".".

    Once any file has matched a sample name directly, later matches that
    are only by batch name are ignored. Returns the matched sample names
    as a list.
    """
    # Never switched on in this function, so the intersection branch below
    # is currently not exercised.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for call_file in utils.flatten(calls):
        if have_full_file:
            file_samples = set(vcfutils.get_samples(call_file))
            if file_samples:
                if all_samples:
                    all_samples &= set(file_samples)
                else:
                    all_samples = file_samples
            continue
        for data in items:
            sample_name = dd.get_sample_name(data)
            candidates = [sample_name] + dd.get_batches(data)
            file_name = os.path.basename(call_file)
            for idx, test_name in enumerate(candidates):
                if not file_name.startswith(("%s-" % test_name, "%s." % test_name)):
                    continue
                # Prefer matches to single samples (gVCF) over joint batches
                if idx == 0:
                    sample_matches = True
                if not (sample_matches and idx > 0):
                    all_samples.add(sample_name)
    return list(all_samples)
Esempio n. 9
0
def _get_vcf_samples(calls, items):
    """Find which samples in ``items`` the call files in ``calls`` belong to.

    A file belongs to a sample when its base name starts with the sample
    name, or one of the sample's batch names, followed by "-" or ".". For
    samples with a "normal" phenotype the tested name gets a "-germline"
    suffix, so germline VCFs attach to normals and standard somatic calls
    go to tumors.

    Once any file has matched a sample name directly, later matches that
    are only by batch name are ignored. Returns the matched sample names
    as a list.
    """
    # Never switched on in this function, so the intersection branch below
    # is currently not exercised.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for call_file in utils.flatten(calls):
        if have_full_file:
            file_samples = set(vcfutils.get_samples(call_file))
            if file_samples:
                if all_samples:
                    all_samples &= set(file_samples)
                else:
                    all_samples = file_samples
            continue
        for data in items:
            sample_name = dd.get_sample_name(data)
            candidates = [sample_name] + dd.get_batches(data)
            is_normal = dd.get_phenotype(data) == "normal"
            file_name = os.path.basename(call_file)
            for idx, test_name in enumerate(candidates):
                if is_normal:
                    test_name = test_name + "-germline"
                if not file_name.startswith(("%s-" % test_name, "%s." % test_name)):
                    continue
                # Prefer matches to single samples (gVCF) over joint batches
                if idx == 0:
                    sample_matches = True
                if not (sample_matches and idx > 0):
                    all_samples.add(sample_name)
    return list(all_samples)
Esempio n. 10
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Args:
        vrn_file: VCF of calls to evaluate; reduced to the current sample
            when it lists more than one.
        rm_file: truth (reference material) VCF.
        rm_interval_file: intervals passed to ``_get_merged_intervals`` to
            build the ``--bed-regions`` input.
        base_dir: directory for prepared inputs; results go in ``rtg`` below it.
        data: sample dictionary providing configuration and reference paths.

    Returns:
        Dictionary with "tp", "fp" and "fn" keys pointing at the expected
        vcfeval outputs in the ``rtg`` directory.

    The run is skipped when a ``done`` file already exists in the output
    directory; otherwise any existing output directory is removed first.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # Inputs must be bgzipped and tabix indexed.
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name: joining an absolute path onto base_dir
            # would discard base_dir and write next to the original file.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg index is expected in an "rtg" directory beside the
        # directory holding the reference file, named <reference base>.sdf
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
Esempio n. 11
0
def split_variants_by_sample(data):
    """Turn a grouped call file back into per-sample inputs.

    Returns a list of single-item lists. Four cases are handled:

    - no "group_orig" key: the input is returned wrapped as ``[[data]]``;
    - tumor/normal pair: the unsplit call file is attached to the tumor
      item and any "vrn_file" is removed from the other items;
    - a configured jointcaller: the input is returned unsplit, since sample
      counts may be large;
    - otherwise: each original item gets its own variant file, named by
      replacing the batch prefix in the batch file path with the sample
      name, plus a "vrn_file_batch" entry pointing at the batch file.
    """
    # Nothing was grouped, so there is nothing to split.
    if "group_orig" not in data:
        return [[data]]

    per_sample = []
    if vcfutils.get_paired_phenotype(data):
        # Cancer tumor/normal: only the tumor keeps a variant file.
        for orig_item in get_orig_items(data):
            if vcfutils.get_paired_phenotype(orig_item) == "tumor":
                orig_item["vrn_file"] = data["vrn_file"]
            else:
                orig_item.pop("vrn_file", None)
            per_sample.append([orig_item])
        return per_sample

    # Joint calling, do not split back up due to potentially large sample sizes.
    if tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]

    # Population or single sample.
    for orig_item in get_orig_items(data):
        batch_file = data["vrn_file"]
        batch_prefix = str(data["group"][0]) + "-"
        sample_name = str(orig_item["name"][-1])
        sample_file = batch_file.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_file)) > 1:
            # Multi-sample file: hand off to select_sample to produce the per-sample file.
            vcfutils.select_sample(batch_file, sample_name, sample_file, data["config"])
        elif not os.path.exists(sample_file):
            # Single-sample file: link it under the per-sample name.
            utils.symlink_plus(batch_file, sample_file)
        orig_item["vrn_file_batch"] = data["vrn_file"]
        orig_item["vrn_file"] = sample_file
        per_sample.append([orig_item])
    return per_sample
Esempio n. 12
0
def split_variants_by_sample(data):
    """Turn a grouped call file back into per-sample inputs.

    Returns a list of single-item lists. Four cases are handled:

    - no "group_orig" key: the input is returned wrapped as ``[[data]]``;
    - tumor/normal pair: the unsplit call file is attached to the tumor
      item and any "vrn_file" is removed from the other items;
    - a configured jointcaller: the input is returned unsplit, since sample
      counts may be large;
    - otherwise: each original item gets its own variant file, named by
      replacing the batch prefix in the batch file path with the sample
      name, plus a "vrn_file_batch" entry pointing at the batch file.
    """
    # Nothing was grouped, so there is nothing to split.
    if "group_orig" not in data:
        return [[data]]

    per_sample = []
    if vcfutils.get_paired_phenotype(data):
        # Cancer tumor/normal: only the tumor keeps a variant file.
        for orig_item in get_orig_items(data):
            if vcfutils.get_paired_phenotype(orig_item) == "tumor":
                orig_item["vrn_file"] = data["vrn_file"]
            else:
                orig_item.pop("vrn_file", None)
            per_sample.append([orig_item])
        return per_sample

    # Joint calling, do not split back up due to potentially large sample sizes.
    if tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]

    # Population or single sample.
    for orig_item in get_orig_items(data):
        batch_file = data["vrn_file"]
        batch_prefix = str(data["group"][0]) + "-"
        sample_name = str(orig_item["name"][-1])
        sample_file = batch_file.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_file)) > 1:
            # Multi-sample file: hand off to select_sample to produce the per-sample file.
            vcfutils.select_sample(batch_file, sample_name, sample_file, data["config"])
        elif not os.path.exists(sample_file):
            # Single-sample file: link it under the per-sample name.
            utils.symlink_plus(batch_file, sample_file)
        orig_item["vrn_file_batch"] = data["vrn_file"]
        orig_item["vrn_file"] = sample_file
        per_sample.append([orig_item])
    return per_sample
Esempio n. 13
0
def filter_vcf_by_sex(vcf_file, data):
    """Rewrite a single sample VCF with ploidy fixes applied per line.

    Per the original notes this handles sex chromosomes and mitochondrial
    calls, and does not try to resolve called hets into potential
    homozygotes when converting diploid to haploid.

    Files with more than one sample (pooled) are returned untouched; that
    case is not implemented yet. Otherwise returns the path of the
    "-ploidyfix" output, reusing it when it already exists.
    """
    if len(vcfutils.get_samples(vcf_file)) > 1:
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    final_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if utils.file_exists(final_file):
        return final_file

    # Write plain text first; compress afterwards if the target is gzipped.
    plain_file = final_file.replace(".vcf.gz", ".vcf")
    with file_transaction(data, plain_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle, utils.open_gzipsafe(vcf_file) as in_handle:
            for record in in_handle:
                # Header lines pass through; records may be dropped by the fixer.
                fixed = record if record.startswith("#") else _fix_line_ploidy(record, sex)
                if fixed:
                    out_handle.write(fixed)
    if final_file.endswith(".gz"):
        return vcfutils.bgzip_and_index(plain_file, data["config"])
    return plain_file
Esempio n. 14
0
def _create_samplemap_file(vrn_files):
    tf = tempfile.NamedTemporaryFile(suffix=".tsv", delete=False)
    samplemap = tf.name
    samplenames = [vcfutils.get_samples(vrn_file)[0] for vrn_file in vrn_files]
    with open(samplemap, "w") as out_handle:
        for samplename, vrn_file in zip(samplenames, vrn_files):
            print(f"{samplename}\t{vrn_file}", file=out_handle)
    return samplemap
Esempio n. 15
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Compare calls against a truth set with rtg vcfeval.

    Prepares bgzipped, indexed inputs in ``base_dir`` (reducing a
    multi-sample call file to the current sample), then runs vcfeval into
    the ``rtg`` directory below it with at most 6 threads. The run is
    skipped when a ``done`` file already exists there.

    Returns a dictionary with "fp", "fn" and "tp" paths. When a
    ``tp-baseline.vcf.gz`` output exists it is used as "tp" and the
    ``tp.vcf.gz`` file is reported under "tp-calls".
    """
    out_dir = os.path.join(base_dir, "rtg")
    already_done = utils.file_exists(os.path.join(out_dir, "done"))
    if not already_done:
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)

        truth_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
        if not truth_ready:
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)

        if len(vcfutils.get_samples(vrn_file)) > 1:
            stem, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_name = dd.get_sample_name(data)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (stem, sample_name, ext))
            vrn_file = vcfutils.select_sample(vrn_file, sample_name, sample_file, data["config"])

        calls_ready = vrn_file.endswith(".vcf.gz") and os.path.exists(vrn_file + ".tbi")
        if not calls_ready:
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        threads = min(dd.get_num_cores(data), 6)
        # Memory in gigabytes is set to the thread count.
        mem = "%sg" % threads
        rtg_args = ["rtg", "vcfeval", "--threads", str(threads),
                    "-b", rm_file, "--bed-regions", interval_bed,
                    "-c", vrn_file, "-t", rtg_ref, "-o", out_dir,
                    "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % mem
        shell_cmd = " && ".join([mem_export, " ".join(rtg_args)])
        do.run(shell_cmd, "Validate calls using rtg vcfeval", data)

    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    if os.path.exists(tp_baseline):
        out.update({"tp": tp_baseline, "tp-calls": tp_calls})
    else:
        out["tp"] = tp_calls
    return out
Esempio n. 16
0
def get_bams(vcf_file, bam_dir):
    """Find one alignment file for every sample listed in a VCF.

    Looks for ``<bam_dir>/*/final/<sample>/<sample>-*am`` and, when several
    files match, prefers those ending in ".bam".

    Args:
        vcf_file: VCF whose sample names drive the search.
        bam_dir: directory searched with the pattern above.

    Returns:
        List with one path per sample, in sample order.

    Raises:
        AssertionError: when no file matches for a sample.
    """
    out = []
    for sample in vcfutils.get_samples(vcf_file):
        bam_files = glob.glob(os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample))
        assert len(bam_files) > 0, "Did not find BAM files for %s: %s" % (sample, bam_files)
        if len(bam_files) > 1:
            bam_only = [x for x in bam_files if x.endswith(".bam")]
            # Keep every match when none ends in .bam; filtering down to an
            # empty list would make the lookup below fail with IndexError.
            if bam_only:
                bam_files = bam_only
        out.append(bam_files[0])
    return out
Esempio n. 17
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Args:
        vrn_file: VCF of calls to evaluate; reduced to the current sample
            when it lists more than one.
        rm_file: truth (reference material) VCF.
        rm_interval_file: intervals passed to ``_get_merged_intervals`` to
            build the ``--bed-regions`` input.
        base_dir: directory for prepared inputs; results go in ``rtg`` below it.
        data: sample dictionary providing configuration and reference paths.

    Returns:
        Dictionary with "tp", "fp" and "fn" keys pointing at the expected
        vcfeval outputs in the ``rtg`` directory.

    The run is skipped when a ``done`` file already exists in the output
    directory; otherwise any existing output directory is removed first.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # Inputs must be bgzipped and tabix indexed.
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name: joining an absolute path onto base_dir
            # would discard base_dir and write next to the original file.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(
                base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg index is expected in an "rtg" directory beside the
        # directory holding the reference file, named <reference base>.sdf
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = [
            "rtg", "vcfeval", "--threads", "6", "-b", rm_file, "--bed-regions",
            interval_bed, "-c", vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # Separate the two exports with && so each is its own statement;
        # without it the second "export" word is treated as a variable name.
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {
        "tp": os.path.join(out_dir, "tp.vcf.gz"),
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
Esempio n. 18
0
def _get_vcf_samples(calls):
    """Return the samples present in every call file that lists samples.

    Args:
        calls: possibly nested collection of call files, flattened with
            ``utils.flatten``.

    Returns:
        List of sample names shared by all files that report at least one
        sample; files reporting none are ignored. Empty when there is no
        overlap or no file lists samples.
    """
    # None marks "no file seen yet". An empty set cannot be used for that:
    # it is also a valid intersection result and must not be re-seeded by
    # the next file.
    shared = None
    for f in utils.flatten(calls):
        cur = set(vcfutils.get_samples(f))
        if not cur:
            continue
        shared = cur if shared is None else shared & cur
    return list(shared) if shared is not None else []
Esempio n. 19
0
def run_peddy(samples, out_dir=None):
    """Run peddy correspondence checks for a batch of samples.

    Uses the first sample whose active variant file exists and lists that
    sample's name. The check is skipped, returning ``samples`` unchanged,
    when peddy is not installed, no such variant file is found, the data is
    not human, or a ``-failed.log`` marker from an earlier run exists.

    Args:
        samples: sample dictionaries belonging to one batch.
        out_dir: optional output directory; defaults to
            ``<work_dir>/qc/<batch>/peddy``.

    Returns:
        ``samples`` with the expected peddy files attached through
        ``dd.set_summary_qc``, or ``samples`` unchanged when skipped.

    Raises:
        Exception: whatever the peddy run raised, unless its stderr shows
            the known "no overlapping variants" failure.
    """
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking "
                    "for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    # Reuse results from an earlier run.
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    # An earlier run already found no overlapping sites; do not retry.
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            # Explicit keywords instead of locals() so the templates do not
            # silently depend on local variable names.
            do.run(cmd.format(peddy=peddy, num_cores=num_cores, peddy_prefix_tx=peddy_prefix_tx,
                              vcf_file=vcf_file, ped_file=ped_file, stderr_log=stderr_log),
                   message.format(vcf_file=vcf_file, ped_file=ped_file))
        except Exception:
            # Only the tail of the log is needed to recognise the failure.
            to_show = collections.deque(maxlen=100)
            # The log may be missing if the command never started; do not
            # let that hide the original error.
            if os.path.exists(stderr_log):
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)
            if any(l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0
                   for l in to_show):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        # Move whichever outputs were produced out of the temporary directory.
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Esempio n. 20
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Args:
        vrn_file: VCF of calls to evaluate; reduced to the current sample
            when it lists more than one.
        rm_file: truth (reference material) VCF.
        rm_interval_file: intervals passed to ``_get_merged_intervals`` to
            build the ``--bed-regions`` input.
        base_dir: directory for prepared inputs; results go in ``rtg`` below it.
        data: sample dictionary providing configuration and reference paths.

    Returns:
        Dictionary with "fp", "fn" and "tp" paths in the ``rtg`` directory.
        When a ``tp-baseline.vcf.gz`` output exists it is used as "tp" and
        ``tp.vcf.gz`` is reported under "tp-calls".

    The run is skipped when a ``done`` file already exists in the output
    directory; otherwise any existing output directory is removed first.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # Inputs must be bgzipped and tabix indexed.
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name: joining an absolute path onto base_dir
            # would discard base_dir and write next to the original file.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(
                base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg index is expected in an "rtg" directory beside the
        # directory holding the reference file, named <reference base>.sdf
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = [
            "rtg", "vcfeval", "--threads", "6", "-b", rm_file, "--bed-regions",
            interval_bed, "-c", vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(
            cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
Esempio n. 21
0
def get_bams(vcf_file, bam_dir):
    """Find one alignment file for every sample listed in a VCF.

    Looks for ``<bam_dir>/*/final/<sample>/<sample>-*am`` and, when several
    files match, prefers those ending in ".bam".

    Args:
        vcf_file: VCF whose sample names drive the search.
        bam_dir: directory searched with the pattern above.

    Returns:
        List with one path per sample, in sample order.

    Raises:
        AssertionError: when no file matches for a sample.
    """
    out = []
    for sample in vcfutils.get_samples(vcf_file):
        bam_files = glob.glob(
            os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample))
        assert len(bam_files) > 0, "Did not find BAM files for %s: %s" % (
            sample, bam_files)
        if len(bam_files) > 1:
            bam_only = [x for x in bam_files if x.endswith(".bam")]
            # Keep every match when none ends in .bam; filtering down to an
            # empty list would make the lookup below fail with IndexError.
            if bam_only:
                bam_files = bam_only
        out.append(bam_files[0])
    return out
Esempio n. 22
0
def _validate_caller_vcf(call_vcf, truth_vcf, callable_bed, svcaller, work_dir, data):
    """Compare a caller VCF with a truth VCF inside callable regions.

    Both files go through ``_prep_vcf``: the calls keep the current sample
    name, while the first sample of the truth file is relabelled
    "<sample>-truth". The prepared files are combined with SURVIVOR merge
    (https://github.com/fritzsedlazeck/SURVIVOR/) and the merged result is
    passed to ``_comparison_stats_from_merge``, whose output is returned.
    """
    stats = _calculate_comparison_stats(truth_vcf)

    eval_name = dd.get_sample_name(data)
    prepped_calls = _prep_vcf(call_vcf, callable_bed, eval_name, eval_name,
                              stats, work_dir, data)

    truth_sample = vcfutils.get_samples(truth_vcf)[0]
    truth_label = "%s-truth" % dd.get_sample_name(data)
    prepped_truth = _prep_vcf(truth_vcf, callable_bed, truth_sample, truth_label,
                              stats, work_dir, data)

    merged_vcf = _survivor_merge(prepped_calls, prepped_truth, stats, work_dir, data)
    return _comparison_stats_from_merge(merged_vcf, stats, svcaller, data)
Esempio n. 23
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data, validate_method):
    """Evaluate calls against a truth set with rtg vcfeval.

    The evaluation is skipped when `<base_dir>/rtg/done` already exists.
    Returns a dictionary with the expected "fp", "fn" and "tp" output paths,
    plus "tp-calls" when a `tp-baseline.vcf.gz` file is present.
    """
    out_dir = os.path.join(base_dir, "rtg")
    done_marker = os.path.join(out_dir, "done")
    if not utils.file_exists(done_marker):
        # Clear out leftovers from an incomplete earlier run
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        if isinstance(rtg_ref, dict) and "base" in rtg_ref:
            rtg_ref = os.path.dirname(rtg_ref["base"])
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # CWL runs can hand us a single file inside the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # Cores and memory come from the standard resource configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        base_opts = resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"])
        scaling = {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                   "direction": "increase"}}}
        jvm_opts = config_utils.adjust_opts(base_opts, scaling)
        xms_opts = [opt for opt in jvm_opts if opt.startswith("-Xms")]
        xmx_opts = [opt for opt in jvm_opts if opt.startswith("-Xmx")]
        java_opts = xms_opts[0] if len(xms_opts) > 0 else "-Xms500m"
        rtg_mem = xmx_opts[0].replace("-Xmx", "") if len(xmx_opts) > 0 else "3g"

        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        if validate_method == "rtg-squash-ploidy":
            cmd.append("--squash-ploidy")
        rm_samples = vcfutils.get_samples(rm_file)
        # Only name the sample when the truth set holds several and includes ours
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd.append("--sample=%s" % dd.get_sample_name(data))
        cmd.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (utils.local_path_export(),
                                                                            java_opts, rtg_mem)
        do.run(" && ".join([mem_export, " ".join(cmd)]), "Validate calls using rtg vcfeval", data)

    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    if os.path.exists(tp_baseline):
        out.update({"tp": tp_baseline, "tp-calls": tp_calls})
    else:
        out["tp"] = tp_calls
    return out
Esempio n. 24
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Get the truth VCF, call VCF and interval BED ready for validation.

    Inputs are bgzipped and indexed unless they already end in `.vcf.gz` and
    have a `.tbi` file next to them; multi-sample call files are first reduced
    to the current sample. Returns `(vrn_file, rm_file, interval_bed)`.
    """
    def _is_indexed_bgzip(path):
        return path.endswith(".vcf.gz") and os.path.exists(path + ".tbi")

    if not _is_indexed_bgzip(rm_file):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        sample_name = dd.get_sample_name(data)
        sample_file = os.path.join(base_dir, "%s-%s%s" % (base, sample_name, ext))
        vrn_file = vcfutils.select_sample(vrn_file, sample_name, sample_file, data["config"])
    if not _is_indexed_bgzip(vrn_file):
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

    return vrn_file, rm_file, _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
Esempio n. 25
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Ready the VCF and BED inputs used for validation.

    The truth and call VCFs are bgzipped and indexed when they are not already
    `.vcf.gz` files with a `.tbi` index. A call file with more than one sample
    is subset to the current sample beforehand.
    Returns `(vrn_file, rm_file, interval_bed)`.
    """
    rm_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
    if not rm_ready:
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)

    if len(vcfutils.get_samples(vrn_file)) > 1:
        # Write the single-sample subset into base_dir, named after the input
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        cur_sample = dd.get_sample_name(data)
        sample_file = os.path.join(base_dir, "%s-%s%s" % (base, cur_sample, ext))
        vrn_file = vcfutils.select_sample(vrn_file, cur_sample, sample_file, data["config"])

    vrn_ready = vrn_file.endswith(".vcf.gz") and os.path.exists(vrn_file + ".tbi")
    if not vrn_ready:
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
Esempio n. 26
0
def create_ped_file(samples, base_vcf, out_dir=None):
    """Create a GEMINI-compatible PED file, including gender, family and phenotype information.

    Checks for a specified `ped` file in metadata, and will use sample information from this file
    before reconstituting from metadata information.

    The output is named after `base_vcf` with a `.ped` extension and is placed
    in `out_dir` when given. An existing output file is left untouched.
    Only samples present in `base_vcf` are written. Returns the output path.
    """
    out_file = "%s.ped" % utils.splitext_plus(base_vcf)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    sample_ped_lines = {}
    header = [
        "#Family_ID", "Individual_ID", "Paternal_ID", "Maternal_ID", "Sex",
        "Phenotype", "Ethnicity"
    ]
    md_peds = set(tz.get_in(["metadata", "ped"], data) for data in samples)
    md_peds.discard(None)
    for md_ped in md_peds:
        with open(md_ped) as in_handle:
            reader = csv.reader(in_handle, dialect="excel-tab")
            for parts in reader:
                # Blank lines come back as empty (or whitespace-only) rows
                if not any(p.strip() for p in parts):
                    continue
                if parts[0].startswith("#"):
                    # Header/comment line: pick up extra columns, but never
                    # treat it as a sample record
                    if len(parts) > len(header):
                        header = header + parts[len(header):]
                else:
                    sample_ped_lines[parts[1]] = parts
    if not utils.file_exists(out_file):
        with file_transaction(samples[0], out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                want_samples = set(vcfutils.get_samples(base_vcf))
                writer = csv.writer(out_handle, dialect="excel-tab")
                writer.writerow(header)
                for data in samples:
                    ped_info = get_ped_info(data, samples)
                    sname = ped_info["individual_id"]
                    if sname in want_samples:
                        # Removing the name ensures each sample is written once
                        want_samples.remove(sname)
                        if sname in sample_ped_lines:
                            writer.writerow(sample_ped_lines[sname])
                        else:
                            writer.writerow([
                                ped_info["family_id"],
                                ped_info["individual_id"],
                                ped_info["paternal_id"],
                                ped_info["maternal_id"], ped_info["gender"],
                                ped_info["affected"], ped_info["ethnicity"]
                            ])
    return out_file
Esempio n. 27
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Get the truth VCF, call VCF and interval BED ready for validation.

    The truth file is bgzipped and indexed unless it is already an indexed
    `.vcf.gz`. The call file is either subset to the current sample (when it
    holds several samples) or re-prepared with bgzip and index.
    Returns `(vrn_file, rm_file, interval_bed)`.
    """
    rm_is_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
    if not rm_is_ready:
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)

    if len(vcfutils.get_samples(vrn_file)) <= 1:
        # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    else:
        base = utils.splitext_plus(os.path.basename(vrn_file))[0]
        cur_sample = dd.get_sample_name(data)
        sample_file = os.path.join(base_dir, "%s-%s.vcf.gz" % (base, cur_sample))
        vrn_file = vcfutils.select_sample(vrn_file, cur_sample, sample_file, data["config"])

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
Esempio n. 28
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Ready the VCF and BED inputs used for validation.

    Returns `(vrn_file, rm_file, interval_bed)` where the truth VCF is an
    indexed `.vcf.gz`, and the call VCF has either been reduced to the current
    sample or been bgzipped and indexed again.
    """
    def _indexed_bgzip(path):
        return path.endswith(".vcf.gz") and os.path.exists(path + ".tbi")

    if not _indexed_bgzip(rm_file):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)

    multi_sample = len(vcfutils.get_samples(vrn_file)) > 1
    if multi_sample:
        name_base = utils.splitext_plus(os.path.basename(vrn_file))[0]
        sample_name = dd.get_sample_name(data)
        sample_file = os.path.join(base_dir, "%s-%s.vcf.gz" % (name_base, sample_name))
        vrn_file = vcfutils.select_sample(vrn_file, sample_name, sample_file, data["config"])
    else:
        # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

    return vrn_file, rm_file, _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
Esempio n. 29
0
def _get_vcf_samples(calls, data):
    """Return the sample names associated with a collection of call files.

    With the current setting a file counts for the sample in `data` when its
    base name starts with `<sample name>-` or `<batch>-` for any of the
    sample's batches.
    """
    # This switch is never turned on in this function, so only the file name
    # matching path below is exercised.
    have_full_file = False
    all_samples = set([])
    for call_file in utils.flatten(calls):
        if have_full_file:
            found = set(vcfutils.get_samples(call_file))
            if not found:
                continue
            # Keep only samples shared by every file that reports samples
            all_samples = (all_samples & found) if all_samples else found
        else:
            candidates = [dd.get_sample_name(data)] + dd.get_batches(data)
            prefixes = tuple("%s-" % name for name in candidates)
            if os.path.basename(call_file).startswith(prefixes):
                all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
Esempio n. 30
0
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.

    For a paired-phenotype sample with batches and a single item, stores a
    bgzipped, de-prioritized copy of the calls under
    `data["vrn_file_plus"]["germline"]`. For a "germline" phenotype sample
    named `<name>-germline` whose VCF holds the single sample `<name>`, the
    VCF sample name is corrected. Returns the (possibly updated) `data`.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        suffix = "-germline"
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # Strip only the trailing suffix: str.replace would also remove any
        # "-germline" found earlier in the sample name.
        if (sample_name.endswith(suffix) and len(vcf_samples) == 1
              and sample_name[:-len(suffix)] == vcf_samples[0]):
            data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
Esempio n. 31
0
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.

    For a paired-phenotype sample with batches and a single item, stores a
    bgzipped, de-prioritized copy of the calls under
    `data["vrn_file_plus"]["germline"]`. For a "germline" phenotype sample
    named `<name>-germline` whose VCF holds the single sample `<name>`, the
    VCF sample name is corrected. Returns the (possibly updated) `data`.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        suffix = "-germline"
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # Strip only the trailing suffix: str.replace would also remove any
        # "-germline" found earlier in the sample name.
        if (sample_name.endswith(suffix) and len(vcf_samples) == 1
              and sample_name[:-len(suffix)] == vcf_samples[0]):
            data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
Esempio n. 32
0
def split_variants_by_sample(data):
    """Turn a multi-sample call file into per-sample inputs.

    In tumor/normal paired analyses the combined file is attached to the tumor
    sample rather than split, and normals only keep a reference to it under
    "vrn_file-shared". Otherwise each sample gets its own file, selected from a
    multi-sample VCF or symlinked to the shared one.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []

    # population or single sample
    if not vcfutils.get_paired_phenotype(data):
        for sub_data, sub_vrn_file in data["group_orig"]:
            if len(vcfutils.get_samples(vrn_file)) > 1:
                vcfutils.select_sample(vrn_file, sub_data["name"][-1], sub_vrn_file, config)
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(vrn_file, sub_vrn_file)
            if sub_vrn_file:
                sub_data["vrn_file"] = sub_vrn_file
                out.append(sub_data)
        return out

    # cancer tumor/normal
    group = data["group_orig"]
    # A lone trailing normal needs no processing and no tumor check
    if len(group) == 1 and vcfutils.get_paired_phenotype(group[0][0]) == "normal":
        sub_data, sub_vrn_file = group[0]
        sub_data.pop("vrn_file", None)
        sub_data["vrn_file-shared"] = sub_vrn_file
        out.append(sub_data)
        return out

    found_tumor = False
    for sub_data, sub_vrn_file in group:
        if vcfutils.get_paired_phenotype(sub_data) == "tumor":
            found_tumor = True
            if not os.path.exists(sub_vrn_file):
                utils.symlink_plus(vrn_file, sub_vrn_file)
            sub_data["vrn_file"] = sub_vrn_file
        else:
            sub_data.pop("vrn_file", None)
            sub_data["vrn_file-shared"] = sub_vrn_file
        out.append(sub_data)
    if not found_tumor:
        raise ValueError("Did not find tumor sample in paired analysis")
    return out
Esempio n. 33
0
def _comparison_stats_from_merge(in_file, stats, svcaller, data):
    """Classify records of a merged SURVIVOR VCF as true/false positives or false negatives.

    Each record's SUPP_VEC flags are paired with the VCF samples, where a
    sample ending in "-truth" is the truth set and anything else the evaluated
    calls. The summarized calls are passed on to `_to_csv`.
    """
    truth_stats = {"tp": [], "fn": [], "fp": []}

    samples = []
    for sample in vcfutils.get_samples(in_file):
        samples.append("truth" if sample.endswith("-truth") else "eval")

    with open(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("#"):
                continue
            call = line.rstrip().split("\t")
            # Column 8 is INFO; pull the SUPP_VEC entry out of it
            supp_vec_str = [x for x in call[7].split(";") if x.startswith("SUPP_VEC=")][0]
            _, supp_vec = supp_vec_str.split("=")
            calls = dict(zip(samples, [int(flag) for flag in supp_vec]))
            if not calls["truth"]:
                metric = "fp"
            elif calls["eval"]:
                metric = "tp"
            else:
                metric = "fn"
            truth_stats[metric].append(_summarize_call(call))
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
Esempio n. 34
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Evaluate calls against a truth set with rtg vcfeval.

    The evaluation is skipped when `<base_dir>/rtg/done` already exists.
    Returns a dictionary with the expected "fp", "fn" and "tp" output paths,
    plus "tp-calls" when a `tp-baseline.vcf.gz` file is present.
    """
    out_dir = os.path.join(base_dir, "rtg")
    done_marker = os.path.join(out_dir, "done")
    if not utils.file_exists(done_marker):
        # Clear out leftovers from an incomplete earlier run
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)

        rm_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
        if not rm_ready:
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            cur_sample = dd.get_sample_name(data)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, cur_sample, ext))
            vrn_file = vcfutils.select_sample(vrn_file, cur_sample, sample_file, data["config"])
        vrn_ready = vrn_file.endswith(".vcf.gz") and os.path.exists(vrn_file + ".tbi")
        if not vrn_ready:
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # CWL runs can hand us a single file inside the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # At most six threads, with RTG_MEM set to "<threads>g"
        threads = min(dd.get_num_cores(data), 6)
        mem = "%sg" % threads
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % mem
        do.run(" && ".join([mem_export, " ".join(cmd)]), "Validate calls using rtg vcfeval", data)

    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    if os.path.exists(tp_baseline):
        out.update({"tp": tp_baseline, "tp-calls": tp_calls})
    else:
        out["tp"] = tp_calls
    return out
Esempio n. 35
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    The evaluation is skipped when `<base_dir>/rtg/done` already exists.
    Inputs are bgzipped and indexed when needed, and a multi-sample call file
    is first reduced to the current sample inside `base_dir`.

    Returns a dictionary with the expected "tp", "fp" and "fn" output paths.
    Raises AssertionError when the rtg reference next to the genome reference
    is missing.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Clear out leftovers from an incomplete earlier run
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use the base name only: joining base_dir with a full (absolute)
            # input path would discard base_dir and write next to the input.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg index is looked up as ../rtg/<reference base name>.sdf
        # relative to the directory of the genome reference file
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # Separate the two exports with && so each is its own shell statement
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
Esempio n. 36
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    The evaluation is skipped when `<base_dir>/rtg/done` already exists.
    Inputs are bgzipped and indexed when needed, and a multi-sample call file
    is first reduced to the current sample inside `base_dir`.

    Returns a dictionary with the expected "fp", "fn" and "tp" output paths,
    plus "tp-calls" when a `tp-baseline.vcf.gz` file is present.
    Raises AssertionError when the rtg reference next to the genome reference
    is missing.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Clear out leftovers from an incomplete earlier run
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use the base name only: joining base_dir with a full (absolute)
            # input path would discard base_dir and write next to the input.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg index is looked up as ../rtg/<reference base name>.sdf
        # relative to the directory of the genome reference file
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # When a baseline true-positive file exists it is reported as "tp" and the
    # call-side file is exposed separately
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
Esempio n. 37
0
def run_peddy(samples, out_dir=None):
    """Run peddy QC on a batch of samples and attach the outputs to them.

    A VCF is picked from the samples' variant call information; it must exist,
    contain the sample and have non-filtered variants. When peddy is missing,
    turned off in `tools_off`, the genome is not human, or no VCF qualifies, a
    `<prefix>-failed.log` file is written and `samples` is returned unchanged.
    A failed run whose stderr matches a set of known messages is handled the
    same way; any other failure is re-raised.

    samples -- sample data dictionaries for the batch; the first one supplies
               batch name, work directory and configuration.
    out_dir -- optional output directory; defaults to
               `<work_dir>/qc/<batch>/peddy`.

    Returns `samples`, updated through `dd.set_in_samples` with the expected
    peddy files when the report is available.
    """
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        # Tumor samples use extracted germline calls; everything else uses the
        # active variant calls.
        if dd.get_phenotype(d) != "tumor":
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        else:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                candidate = vcinfo.get(key)
                if (candidate and utils.file_exists(candidate)
                        and dd.get_sample_name(d) in vcfutils.get_samples(candidate)
                        and vcfutils.vcf_has_nonfiltered_variants(candidate)):
                    vcf_file = candidate
                    # NOTE(review): this only leaves the key loop, so a later
                    # sample can still replace vcf_file -- confirm intended.
                    break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    # A failure log from an earlier attempt means we do not retry
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            # The command and message templates are filled from local variable
            # names via locals(); do not rename the locals they reference.
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except Exception:
                # Catch Exception rather than everything so that interrupts and
                # interpreter exits are never mistaken for a peddy failure.
                # Only the last 100 stderr lines are inspected.
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return (
                        (l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0)
                        or (l.find("n_components=") >= 0
                            and l.find("must be between 1 and min") >= 0)
                        or (l.find(
                            "Input contains NaN, infinity or a value too large for dtype"
                        ) >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                # NOTE(review): an empty stderr log also satisfies all(...) and
                # is therefore treated as a skip -- confirm this is wanted.
                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            # Move whichever expected outputs were produced out of the
            # temporary directory
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)