Example #1
0
def split_variants_by_sample(data):
    """Break a batched variant call file back out into per-sample inputs.

    Returns a list of single-item lists, one per output sample:

    - Inputs without a "group_orig" key were never batched and pass through.
    - Tumor/normal pairs are not split: the tumor item receives the batch
      ``vrn_file`` and any ``vrn_file`` on the other items is removed.
    - Joint-called inputs are returned unsplit.
    - Otherwise each original item receives its own ``vrn_file`` and keeps a
      pointer to the batch file in ``vrn_file_batch``.
    """
    # Never batched: hand the input straight back.
    if "group_orig" not in data:
        return [[data]]

    # Cancer tumor/normal: the combined calls travel with the tumor only.
    if vcfutils.get_paired_phenotype(data):
        per_sample = []
        for item in get_orig_items(data):
            if vcfutils.get_paired_phenotype(item) == "tumor":
                item["vrn_file"] = data["vrn_file"]
            else:
                item.pop("vrn_file", None)
            per_sample.append([item])
        return per_sample

    # Joint calling: do not split back up due to potentially large sample sizes.
    if tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]

    # Population or single sample: one output file per original item.
    per_sample = []
    for item in get_orig_items(data):
        batch_vcf = data["vrn_file"]
        batch_prefix = str(data["group"][0]) + "-"
        sample_name = str(item["name"][-1])
        # Per-sample file name: swap the batch prefix for the sample name.
        sample_vcf = batch_vcf.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_vcf)) > 1:
            vcfutils.select_sample(batch_vcf, sample_name, sample_vcf, data["config"])
        elif not os.path.exists(sample_vcf):
            # Single-sample batch: link to the batch file instead of extracting.
            utils.symlink_plus(batch_vcf, sample_vcf)
        item["vrn_file_batch"] = batch_vcf
        item["vrn_file"] = sample_vcf
        per_sample.append([item])
    return per_sample
Example #2
0
def split_variants_by_sample(data):
    """Break a batched variant call file back out into per-sample inputs.

    Returns a list of single-item lists, one per output sample:

    - Inputs without a "group_orig" key were never batched and pass through.
    - Tumor/normal pairs are not split: the tumor item receives the batch
      ``vrn_file`` and any ``vrn_file`` on the other items is removed.
    - Joint-called inputs are returned unsplit.
    - Otherwise each original item receives its own ``vrn_file`` and keeps a
      pointer to the batch file in ``vrn_file_batch``.
    """
    # Never batched: hand the input straight back.
    if "group_orig" not in data:
        return [[data]]

    # Cancer tumor/normal: the combined calls travel with the tumor only.
    if vcfutils.get_paired_phenotype(data):
        per_sample = []
        for item in get_orig_items(data):
            if vcfutils.get_paired_phenotype(item) == "tumor":
                item["vrn_file"] = data["vrn_file"]
            else:
                item.pop("vrn_file", None)
            per_sample.append([item])
        return per_sample

    # Joint calling: do not split back up due to potentially large sample sizes.
    if tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]

    # Population or single sample: one output file per original item.
    per_sample = []
    for item in get_orig_items(data):
        batch_vcf = data["vrn_file"]
        batch_prefix = str(data["group"][0]) + "-"
        sample_name = str(item["name"][-1])
        # Per-sample file name: swap the batch prefix for the sample name.
        sample_vcf = batch_vcf.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_vcf)) > 1:
            vcfutils.select_sample(batch_vcf, sample_name, sample_vcf, data["config"])
        elif not os.path.exists(sample_vcf):
            # Single-sample batch: link to the batch file instead of extracting.
            utils.symlink_plus(batch_vcf, sample_vcf)
        item["vrn_file_batch"] = batch_vcf
        item["vrn_file"] = sample_vcf
        per_sample.append([item])
    return per_sample
Example #3
0
def split_variants_by_sample(data):
    """Break a batched variant call file back out into per-sample inputs.

    Returns a list of single-item lists, one per output sample:

    - Inputs without a "group_orig" key were never batched and pass through.
    - Tumor/normal pairs are not split: the tumor item receives the batch
      ``vrn_file``; other items are returned as they are.
    - Otherwise each item in "group_orig" receives its own ``vrn_file``.
    """
    # Never batched: hand the input straight back.
    if "group_orig" not in data:
        return [[data]]

    # Cancer tumor/normal: attach the combined calls to the tumor only.
    if vcfutils.get_paired_phenotype(data):
        per_sample = []
        for item in data["group_orig"]:
            if vcfutils.get_paired_phenotype(item) == "tumor":
                item["vrn_file"] = data["vrn_file"]
            per_sample.append([item])
        return per_sample

    # Population or single sample: one output file per original item.
    per_sample = []
    for item in data["group_orig"]:
        batch_vcf = data["vrn_file"]
        batch_prefix = str(data["group"][0]) + "-"
        sample_name = str(item["name"][-1])
        # Per-sample file name: swap the batch prefix for the sample name.
        sample_vcf = batch_vcf.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_vcf)) > 1:
            vcfutils.select_sample(batch_vcf, sample_name, sample_vcf, data["config"])
        elif not os.path.exists(sample_vcf):
            # Single-sample batch: link to the batch file instead of extracting.
            utils.symlink_plus(batch_vcf, sample_vcf)
        item["vrn_file"] = sample_vcf
        per_sample.append([item])
    return per_sample
Example #4
0
def split_variants_by_sample(data):
    """Break a batched variant call file back out into per-sample inputs.

    Returns a list of single-item lists, one per output sample:

    - Inputs without a "group_orig" key were never batched and pass through.
    - Tumor/normal pairs are not split: the tumor item receives the batch
      ``vrn_file`` (and "combine" settings, when present); other items are
      returned as they are.
    - Otherwise each item in "group_orig" receives its own ``vrn_file`` and
      the batch "combine" settings, when present.
    """
    # Never batched: hand the input straight back.
    if "group_orig" not in data:
        return [[data]]

    # Cancer tumor/normal: attach the combined calls to the tumor only.
    if vcfutils.get_paired_phenotype(data):
        per_sample = []
        for item in data["group_orig"]:
            if vcfutils.get_paired_phenotype(item) == "tumor":
                if "combine" in data:
                    item["combine"] = data["combine"]
                item["vrn_file"] = data["vrn_file"]
            per_sample.append([item])
        return per_sample

    # Population or single sample: one output file per original item.
    per_sample = []
    for item in data["group_orig"]:
        batch_vcf = data["vrn_file"]
        batch_prefix = data["group"][0] + "-"
        sample_name = item["name"][-1]
        # Per-sample file name: swap the batch prefix for the sample name.
        sample_vcf = batch_vcf.replace(batch_prefix, sample_name + "-")
        if len(vcfutils.get_samples(batch_vcf)) > 1:
            vcfutils.select_sample(batch_vcf, sample_name, sample_vcf, data["config"])
        elif not os.path.exists(sample_vcf):
            # Single-sample batch: link to the batch file instead of extracting.
            utils.symlink_plus(batch_vcf, sample_vcf)
        if "combine" in data:
            item["combine"] = data["combine"]
        item["vrn_file"] = sample_vcf
        per_sample.append([item])
    return per_sample
Example #5
0
def run(items):
    """Call structural variants with lumpy on a batch, then genotype per sample.

    Every item must be configured with the "bwa" aligner (or leave it unset);
    otherwise a ValueError is raised. lumpy runs once over all samples, then
    each sample is extracted from the joint VCF, genotyped with svtyper and
    filtered by support. A "lumpy" entry holding the filtered VCF and the
    exclusion file is appended to each item's "sv" list.

    Returns the list of updated items.
    """
    allowed_aligners = ["bwa", False, None]
    for item in items:
        if utils.get_in(item, ("config", "algorithm", "aligner")) not in allowed_aligners:
            raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # Work directory is keyed on the tumor sample when one exists, else the first item.
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for item in items:
        full_bam, split_bam, discordant_bam = sshared.get_split_discordants(item, work_dir)
        full_bams.append(full_bam)
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    results = []
    for item in items:
        item.setdefault("sv", [])
        sample = dd.get_sample_name(item)
        full_bam, split_bam, _ = sshared.get_split_discordants(item, work_dir)
        per_sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                                utils.append_stem(lumpy_vcf, "-%s" % sample),
                                                item["config"])
        genotyped_vcf = _run_svtyper(per_sample_vcf, full_bam, split_bam, item)
        supported_vcf = _filter_by_support(genotyped_vcf, item)
        item["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": supported_vcf,
                           "exclude_file": exclude_file})
        results.append(item)
    return results
Example #6
0
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    For a tumor/normal pair the tumor is the only input and the normal, when
    it has a BAM, becomes the background. Otherwise inputs and background come
    from ``shared.find_case_control`` and no explicit background may be passed.

    Each input is extracted from the joint WHAM VCF, filtered against the
    background when there is one, and recorded as a "wham" entry in its "sv"
    list. Returns the updated input items only.
    """
    background = background or []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)
    results = []
    for item in inputs:
        item.setdefault("sv", [])
        vcf_stem = utils.splitext_plus(orig_vcf)[0]
        target_vcf = "%s-%s.vcf.gz" % (vcf_stem, dd.get_sample_name(item))
        per_sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(item), target_vcf,
                                                item["config"])
        if background:
            per_sample_vcf = filter_by_background(per_sample_vcf, orig_vcf, background, item)
        item["sv"].append({"variantcaller": "wham",
                           "vrn_file": per_sample_vcf})
        results.append(item)
    return results
Example #7
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    The evaluation is skipped when ``utils.file_exists`` reports a "done"
    file inside the ``rtg`` output directory; otherwise any existing output
    directory is removed and vcfeval is re-run.

    Args:
        vrn_file: VCF of calls to evaluate; reduced to this sample first when
            it contains more than one sample.
        rm_file: truth set (reference materials) VCF.
        rm_interval_file: regions passed to ``_get_merged_intervals`` to
            build the BED file restricting the comparison.
        base_dir: directory that receives prepared inputs and the ``rtg``
            output directory.
        data: sample dictionary; supplies config, sample name and reference.

    Returns:
        dict with "tp", "fp" and "fn" keys pointing at the vcfeval outputs.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name: joining base_dir with a full (absolute)
            # input path would discard base_dir and write next to the input.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg SDF index is expected in a sibling "rtg" directory of the
        # directory holding the reference FASTA.
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
Example #8
0
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    For a tumor/normal pair the tumor is the only input and the normal, when
    it has a BAM, becomes the background. Otherwise inputs and background come
    from ``shared.find_case_control`` and no explicit background may be passed.

    Each input is extracted from the joint WHAM VCF, filtered against the
    background when there is one, and passed to ``effects.add_to_vcf`` with
    "snpeff". The resulting file, or the unannotated one when that call
    returns nothing, is recorded as a "wham" entry in the item's "sv" list.
    Returns the updated input items only.
    """
    background = background or []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)
    results = []
    for item in inputs:
        item.setdefault("sv", [])
        vcf_stem = utils.splitext_plus(orig_vcf)[0]
        target_vcf = "%s-%s.vcf.gz" % (vcf_stem, dd.get_sample_name(item))
        per_sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(item), target_vcf,
                                                item["config"])
        if background:
            per_sample_vcf = filter_by_background(per_sample_vcf, orig_vcf, background, item)
        annotated_vcf, _ = effects.add_to_vcf(per_sample_vcf, item, "snpeff")
        item["sv"].append({"variantcaller": "wham",
                           "vrn_file": annotated_vcf or per_sample_vcf})
        results.append(item)
    return results
Example #9
0
def run(items):
    """Call structural variants with lumpy on a batch, reporting BEDPE and VCF.

    Every item must be configured with the "bwa" aligner (or leave it unset);
    otherwise a ValueError is raised. lumpy runs once over all samples and
    produces a BEDPE file, which is converted to VCF. Each sample then gets a
    support-filtered BEDPE subset and, when a VCF was produced, a per-sample
    VCF filtered by that BEDPE; ``vrn_file`` is None otherwise.

    Returns the list of items, each with a "lumpy" entry appended to "sv".
    """
    allowed_aligners = ["bwa", False, None]
    for item in items:
        if utils.get_in(item, ("config", "algorithm", "aligner")) not in allowed_aligners:
            raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural", first["name"][-1],
                                               "lumpy"))
    full_bams, sr_bams, disc_bams = [], [], []
    for item in items:
        full_bam, split_bam, discordant_bam = sshared.get_split_discordants(item, work_dir)
        full_bams.append(full_bam)
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
    pebed_file, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    sample_config_file = _write_samples_to_ids(pebed_file, items)
    lumpy_vcf = _bedpe_to_vcf(pebed_file, sample_config_file, items)
    results = []
    for index, item in enumerate(items):
        item.setdefault("sv", [])
        sample = tz.get_in(["rgnames", "sample"], item)
        # The sample's position in the batch is passed to the subset and filter helpers.
        sample_bedpe = _filter_by_support(_subset_to_sample(pebed_file, index, item), index, item)
        sample_vcf = None
        if lumpy_vcf:
            selected_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                                  utils.append_stem(lumpy_vcf, "-%s" % sample),
                                                  item["config"])
            sample_vcf = _filter_by_bedpe(selected_vcf, sample_bedpe, item)
        item["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": sample_vcf,
                           "exclude_file": exclude_file,
                           "bedpe_file": sample_bedpe,
                           "sample_bed": sample_config_file})
        results.append(item)
    return results
Example #10
0
def _organize_variants(samples, batch_id):
    """Retrieve variant calls for all samples, merging batched samples into single VCF.
    """
    bam_files = set([])
    caller_names = [x["variantcaller"] for x in samples[0]["variants"]]
    calls = collections.defaultdict(list)
    for data in samples:
        if "work_bam" in data:
            bam_files.add(data["work_bam"])
        for vrn in data["variants"]:
            # for somatic ensemble, discard normal samples and filtered
            # variants from vcfs
            vrn_file = vrn["vrn_file"]
            if data.get("metadata", False) and data["metadata"].get(
                    "phenotype", "normal").lower().startswith("tumor"):
                vrn_file_temp = vrn_file.replace(
                    ".vcf", "_tumorOnly_noFilteredCalls.vcf"
                ) if ".vcf" in vrn_file else vrn_file_temp + "_tumorOnly_noFilteredCalls.vcf.gz"
                # Select tumor sample and keep only PASS and . calls
                vrn_file = vcfutils.select_sample(in_file=vrn_file,
                                                  sample=data["name"][1],
                                                  out_file=vrn_file_temp,
                                                  config=data["config"],
                                                  filters="PASS,.")
            calls[vrn["variantcaller"]].append(vrn_file)
    data = samples[0]
    vrn_files = []
    for caller in caller_names:
        fnames = calls[caller]
        if len(fnames) == 1:
            vrn_files.append(fnames[0])
        else:
            vrn_files.append(
                population.get_multisample_vcf(fnames, batch_id, caller, data))
    return caller_names, vrn_files, list(bam_files)
Example #11
0
 def test_4_vcf_sample_select(self):
     """Extract sample "S2" from a VCF file and clean up the output.
     """
     in_vcf = os.path.join(self.var_dir, "S1-variants.vcf")
     stem, ext = os.path.splitext(in_vcf)
     # Output sits next to the input, with a "-sampleselect" stem and ".gz" added.
     target_vcf = "%s-sampleselect%s.gz" % (stem, ext)
     selected_vcf = vcfutils.select_sample(in_vcf, "S2", target_vcf, {})
     self._remove_vcf(selected_vcf)
 def test_4_vcf_sample_select(self):
     """Extract sample "S2" from a VCF file and clean up the output.
     """
     in_vcf = os.path.join(self.var_dir, "S1-variants.vcf")
     stem, ext = os.path.splitext(in_vcf)
     # Output sits next to the input, with a "-sampleselect" stem and ".gz" added.
     target_vcf = "%s-sampleselect%s.gz" % (stem, ext)
     selected_vcf = vcfutils.select_sample(in_vcf, "S2", target_vcf, {})
     self._remove_vcf(selected_vcf)
 def test_4_vcf_sample_select(self):
     """Extract sample "S2" from the combined S1/S2 VCF and clean up the output.
     """
     in_vcf = os.path.join(self.var_dir, "S1_S2-combined.vcf.gz")
     stem, ext = utils.splitext_plus(in_vcf)
     # Output sits next to the input, with "-sampleselect" added to the stem.
     target_vcf = "%s-sampleselect%s" % (stem, ext)
     selected_vcf = vcfutils.select_sample(in_vcf, "S2", target_vcf, {})
     self._remove_vcf(selected_vcf)
Example #14
0
 def test_4_vcf_sample_select(self):
     """Extract sample "S2" from the combined S1/S2 VCF and clean up the output.
     """
     in_vcf = os.path.join(self.var_dir, "S1_S2-combined.vcf.gz")
     stem, ext = utils.splitext_plus(in_vcf)
     # Output sits next to the input, with "-sampleselect" added to the stem.
     target_vcf = "%s-sampleselect%s" % (stem, ext)
     selected_vcf = vcfutils.select_sample(in_vcf, "S2", target_vcf, {})
     self._remove_vcf(selected_vcf)
Example #15
0
def _organize_variants(samples, batch_id):
    """Retrieve variant calls for all samples, merging batched samples into single VCF.
    """
    bam_files = set([])
    caller_names = [x["variantcaller"] for x in samples[0]["variants"]]
    calls = collections.defaultdict(list)
    for data in samples:
        if "work_bam" in data:
            bam_files.add(data["work_bam"])
        for vrn in data["variants"]:
            # for somatic ensemble, discard normal samples and filtered 
            # variants from vcfs
            vrn_file = vrn["vrn_file"]
            if data.get("metadata", False) and data["metadata"].get("phenotype", "normal").lower().startswith("tumor"):
                vrn_file_temp = vrn_file.replace(".vcf", "_tumorOnly_noFilteredCalls.vcf") if ".vcf" in vrn_file else vrn_file_temp + "_tumorOnly_noFilteredCalls.vcf.gz"
                # Select tumor sample and keep only PASS and . calls
                vrn_file = vcfutils.select_sample(in_file=vrn_file, sample=data["name"][1], 
                                                  out_file=vrn_file_temp, 
                                                  config=data["config"], filters="PASS,.")
            calls[vrn["variantcaller"]].append(vrn_file)
    data = samples[0]
    vrn_files = []
    for caller in caller_names:
        fnames = calls[caller]
        if len(fnames) == 1:
            vrn_files.append(fnames[0])
        else:
            vrn_files.append(population.get_multisample_vcf(fnames, batch_id, caller, data))
    return caller_names, vrn_files, list(bam_files)
Example #16
0
 def test_4_vcf_sample_select(self, install_test_files, data_dir):
     """Extract sample "S2" from the combined S1/S2 VCF and clean up the output.
     """
     from bcbio.variation import vcfutils
     in_vcf = os.path.join(self.var_dir, "S1_S2-combined.vcf.gz")
     stem, ext = utils.splitext_plus(in_vcf)
     # Output sits next to the input, with "-sampleselect" added to the stem.
     target_vcf = "%s-sampleselect%s" % (stem, ext)
     selected_vcf = vcfutils.select_sample(in_vcf, "S2", target_vcf, {})
     self._remove_vcf(selected_vcf)
Example #17
0
 def test_4_vcf_sample_select(self, install_test_files, data_dir):
     """Extract sample "S2" from the combined S1/S2 VCF and clean up the output.
     """
     from bcbio.variation import vcfutils
     in_vcf = os.path.join(self.var_dir, "S1_S2-combined.vcf.gz")
     stem, ext = utils.splitext_plus(in_vcf)
     # Output sits next to the input, with "-sampleselect" added to the stem.
     target_vcf = "%s-sampleselect%s" % (stem, ext)
     selected_vcf = vcfutils.select_sample(in_vcf, "S2", target_vcf, {})
     self._remove_vcf(selected_vcf)
Example #18
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Compare a caller's variants to the truth set with rtg vcfeval.

    The evaluation is skipped when ``utils.file_exists`` reports a "done"
    file inside the ``rtg`` output directory; otherwise any existing output
    directory is removed and vcfeval is re-run. Both VCFs are bgzipped and
    indexed into ``base_dir`` when needed, and a multi-sample call file is
    first reduced to the current sample.

    Returns a dict with "fp", "fn" and "tp" paths. When vcfeval wrote a
    ``tp-baseline.vcf.gz`` file, "tp" points at it and "tp-calls" points at
    ``tp.vcf.gz``.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        truth_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
        if not truth_ready:
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            stem, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_name = "%s-%s%s" % (stem, dd.get_sample_name(data), ext)
            sample_file = os.path.join(base_dir, sample_name)
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              sample_file, data["config"])
        calls_ready = vrn_file.endswith(".vcf.gz") and os.path.exists(vrn_file + ".tbi")
        if not calls_ready:
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        threads = min(dd.get_num_cores(data), 6)
        # RTG_MEM uses the thread count as its number of gigabytes.
        mem = "%sg" % threads
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir,
               "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % mem
        # The environment exports are chained in front of the rtg command line.
        shell_cmd = mem_export + " && " + " ".join(cmd)
        do.run(shell_cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
    else:
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    return out
Example #19
0
def run(items):
    """Call structural variants with lumpy on a batch, then genotype per sample.

    Every item must use the "bwa" or "sentieon-bwa" aligner (or leave it
    unset); otherwise a ValueError is raised. Per-sample deletion and
    duplication BEDPE files from ``_bedpes_from_cnv_caller`` are passed to
    lumpy as previous evidence when they exist.

    Genotyping per sample depends on the configured tools:
    - "bnd-genotype" in tools_on: svtyper runs on the full per-sample VCF.
    - "lumpy-genotype" in tools_off: genotyping is skipped.
    - otherwise: breakends are split off, the remaining calls are genotyped
      and the two files are concatenated again.

    Results are filtered by support and, for a tumor with a named normal, by
    background. Returns the items, each with a "lumpy" entry appended to "sv".
    """
    supported_aligners = ["bwa", "sentieon-bwa", False, None]
    for item in items:
        if utils.get_in(item, ("config", "algorithm", "aligner")) not in supported_aligners:
            raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # Work directory is keyed on the tumor sample when one exists, else the first item.
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for item in items:
        split_bam, discordant_bam = sshared.get_split_discordants(item, work_dir)
        full_bams.append(dd.get_align_bam(item))
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(item, work_dir)
        evidence = {}
        previous_evidence[dd.get_sample_name(item)] = evidence
        # Only record evidence files that were actually produced.
        for kind, bedpe in (("dels", cur_dels), ("dups", cur_dups)):
            if bedpe and utils.file_exists(bedpe):
                evidence[kind] = bedpe
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for item in items:
        sample = dd.get_sample_name(item)
        per_sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                                utils.append_stem(lumpy_vcf, "-%s" % sample),
                                                item["config"])
        if "bnd-genotype" in dd.get_tools_on(item):
            genotyped_vcf = _run_svtyper(per_sample_vcf, dd.get_align_bam(item), exclude_file, item)
        elif "lumpy-genotype" in dd.get_tools_off(item):
            genotyped_vcf = per_sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(per_sample_vcf, item)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(item), exclude_file, item)
            combined_name = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
            genotyped_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file=combined_name,
                config=item["config"])
        gt_vcfs[dd.get_sample_name(item)] = _filter_by_support(genotyped_vcf, item)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    results = []
    for item in items:
        item.setdefault("sv", [])
        final_vcf = gt_vcfs[dd.get_sample_name(item)]
        annotated_vcf = None
        if dd.get_svprioritize(item):
            annotated_vcf, _ = effects.add_to_vcf(final_vcf, item, "snpeff")
        item["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": annotated_vcf or final_vcf,
                           "exclude_file": exclude_file})
        results.append(item)
    return results
Example #20
0
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    delly is run per chromosome and SV type over all input BAMs, the results
    are combined into one VCF and filtered, and each sample is then extracted
    into its own file.

    Args:
        items: list of sample dictionaries in the batch; the first item
            supplies the work directory, reference and configuration.

    Returns:
        The input items, each with a "delly" entry appended to its "sv" list.
    """
    work_dir = utils.safe_makedir(
        os.path.join(items[0]["dirs"]["work"], "structural",
                     items[0]["name"][-1], "delly"))
    work_bams = [item["align_bam"] for item in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly; work on a copy so the samples' own
    # configuration is left untouched.
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"]
    }
    sv_types = [
        "DEL", "DUP", "INV"
    ]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    # Chromosome names come from the header of the first BAM.
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(
            _run_delly,
            [(work_bams, chrom, sv_type, ref_file, work_dir, items)
             for (chrom, sv_type
                  ) in itertools.product(pysam_work_bam.references, sv_types)],
            config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    # The batch-level filter needs a sample dictionary. Use the first item,
    # as the rest of this function does; `data` is not defined until the
    # loop below, so referencing it here failed with a NameError on Python 3.
    delly_vcf = vfilter.genotype_filter(combo_vcf,
                                        'DV / (DV + DR) > 0.35 && DV > 4',
                                        items[0], "DVSupport")
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(delly_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = "%s-%s%s" % (base, sample, ext)
        data["sv"].append({
            "variantcaller":
            "delly",
            "vrn_file":
            vcfutils.select_sample(delly_vcf, sample, delly_sample_vcf,
                                   data["config"])
        })
        out.append(data)
    return out
Example #21
0
def split_variants_by_sample(data):
    """Distribute a batched variant call file over the original samples.

    ``data["group_orig"]`` holds (sample data, per-sample VCF path) pairs.

    Tumor/normal analyses are not split: the tumor's per-sample path is
    linked to the combined file when it does not exist yet and set as its
    ``vrn_file``. Every other sample loses any ``vrn_file`` and records its
    path under ``vrn_file-shared``. A group of one lone normal is handled the
    same way without requiring a tumor; otherwise a missing tumor raises
    ValueError.

    For population or single-sample calling each sample is extracted from a
    multi-sample file, or linked to a single-sample one, and gets the result
    as ``vrn_file``.

    Returns a flat list of the updated sample dictionaries.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []

    def _attach_shared(item, item_vcf):
        # Non-tumor samples only carry a pointer to the shared file location.
        item.pop("vrn_file", None)
        item["vrn_file-shared"] = item_vcf
        out.append(item)

    # cancer tumor/normal
    if vcfutils.get_paired_phenotype(data):
        group = data["group_orig"]
        # handle trailing normals, which we don't need to process
        if len(group) == 1 and vcfutils.get_paired_phenotype(group[0][0]) == "normal":
            lone_item, lone_vcf = group[0]
            _attach_shared(lone_item, lone_vcf)
        else:
            tumor_seen = False
            for item, item_vcf in group:
                if vcfutils.get_paired_phenotype(item) != "tumor":
                    _attach_shared(item, item_vcf)
                    continue
                tumor_seen = True
                if not os.path.exists(item_vcf):
                    utils.symlink_plus(vrn_file, item_vcf)
                item["vrn_file"] = item_vcf
                out.append(item)
            if not tumor_seen:
                raise ValueError("Did not find tumor sample in paired analysis")
    # population or single sample
    else:
        for item, item_vcf in data["group_orig"]:
            if len(vcfutils.get_samples(vrn_file)) > 1:
                vcfutils.select_sample(vrn_file, item["name"][-1], item_vcf, config)
            elif not os.path.exists(item_vcf):
                utils.symlink_plus(vrn_file, item_vcf)
            # Samples without a per-sample path are left out of the output.
            if item_vcf:
                item["vrn_file"] = item_vcf
                out.append(item)
    return out
Example #22
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    The evaluation is skipped when ``utils.file_exists`` reports a "done"
    file inside the ``rtg`` output directory; otherwise any existing output
    directory is removed and vcfeval is re-run.

    Args:
        vrn_file: VCF of calls to evaluate; reduced to this sample first when
            it contains more than one sample.
        rm_file: truth set (reference materials) VCF.
        rm_interval_file: regions passed to ``_get_merged_intervals`` to
            build the BED file restricting the comparison.
        base_dir: directory that receives prepared inputs and the ``rtg``
            output directory.
        data: sample dictionary; supplies config, sample name, reference and
            the variant caller.

    Returns:
        dict with "tp", "fp" and "fn" keys pointing at the vcfeval outputs.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file +
                                                                 ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file,
                                               data["config"],
                                               out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name: joining base_dir with a full (absolute)
            # input path would discard base_dir and write next to the input.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(
                base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file,
                                              dd.get_sample_name(data),
                                              sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file +
                                                                  ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file,
                                                data["config"],
                                                out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg SDF index is expected in a sibling "rtg" directory of the
        # directory holding the reference FASTA.
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = [
            "rtg", "vcfeval", "--threads", "6", "-b", rm_file, "--bed-regions",
            interval_bed, "-c", vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # Environment settings are chained in front of the rtg command line.
        cmd = "export RTG_JAVA_OPTS='-Xms1g' export RTG_MEM=5g && " + " ".join(
            cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {
        "tp": os.path.join(out_dir, "tp.vcf.gz"),
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
Example #23
0
def run(items):
    """Detect structural variants with delly and attach calls to each sample.

    Runs delly over every combination of chromosome and SV type, combines
    the per-run VCFs into one file and then extracts each sample's calls.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Returns the input items, each with a delly entry appended to its
    "sv" list.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural", first["name"][-1], "delly"))
    # Add core request for delly, working on a private copy of the configuration
    config = copy.deepcopy(first["config"])
    delly_resources = utils.get_in(config, ("resources", "delly"), {})
    delly_resources["cores"] = 1
    config["resources"]["delly"] = delly_resources
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    prep_args = [(data, work_dir) for data in items]
    work_bams = run_multicore(_prep_subsampled_bams, prep_args, config, parallel)
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    sv_types = ["DEL", "DUP"]
    exclude_file = _get_full_exclude_file(items, work_dir)
    chroms = sshared.get_sv_chroms(items, exclude_file)
    delly_args = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                  for chrom, sv_type in itertools.product(chroms, sv_types)]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    combined_name = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, combined_name, ref_file, config)
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_out = "%s-%s%s" % (base, sample, ext)
        sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_out, data["config"])
        filtered_vcf = _delly_count_evidence_filter(sample_vcf, data)
        data["sv"].append({"variantcaller": "delly",
                           "vrn_file": filtered_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
Example #24
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Calls lumpy jointly on all inputs, then for each sample extracts its
    calls, genotypes the non-breakend events, recombines them with the
    breakends and filters by support. When a normal sample is paired, the
    calls are additionally filtered against that background.

    Raises ValueError if any input has a configured aligner other than bwa.
    Returns the input items, each with a lumpy entry appended to its "sv" list.
    """
    supported_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in supported_aligners
           for data in items):
        raise ValueError(
            "Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    workdir_data = paired.tumor_data if paired and paired.tumor_data else items[0]
    work_dir = _sv_workdir(workdir_data)

    # Collect full, split-read and discordant BAMs for the joint lumpy run
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        cur_full, cur_sr, cur_disc = sshared.get_split_discordants(data, work_dir)
        full_bams.append(cur_full)
        sr_bams.append(cur_sr)
        disc_bams.append(cur_disc)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)

    # Per-sample genotyping and support filtering
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_out = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_out, data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        combined_vcf = vcfutils.combine_variant_files(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            ref_file=dd.get_ref_file(data),
            config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(combined_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    # Annotate effects and attach the final calls to each sample
    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #25
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Besides the split-read and discordant BAMs, any deletion/duplication
    BEDPE files derived from a CNV caller are handed to lumpy as previous
    evidence, keyed by sample name. Samples with "bnd-genotype" in their
    enabled tools have all calls genotyped together; otherwise breakends are
    split off before genotyping and concatenated back afterwards.

    Raises ValueError if any input has a configured aligner other than bwa
    or sentieon-bwa.
    Returns the input items, each with a lumpy entry appended to its "sv" list.
    """
    supported_aligners = ["bwa", "sentieon-bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in supported_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    workdir_data = paired.tumor_data if paired and paired.tumor_data else items[0]
    work_dir = _sv_workdir(workdir_data)

    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        cur_sr, cur_disc = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(cur_sr)
        disc_bams.append(cur_disc)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        # Only record evidence files that are actually present
        sample_evidence = {}
        previous_evidence[dd.get_sample_name(data)] = sample_evidence
        if cur_dels and utils.file_exists(cur_dels):
            sample_evidence["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            sample_evidence["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)

    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_out = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_out, data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        # Effects annotation is only run when SV prioritization is configured
        effects_vcf = None
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #26
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Args:
        vrn_file: VCF of calls to evaluate; a multi-sample file is first
            reduced to the current sample.
        rm_file: truth set VCF, passed to vcfeval as the baseline (-b).
        rm_interval_file: intervals used to build the --bed-regions input.
        base_dir: directory for prepared inputs; results go to base_dir/rtg.
        data: sample dictionary providing configuration, sample name and
            reference file.

    Returns:
        Dictionary of expected vcfeval output paths with keys "fp", "fn" and
        "tp". When a tp-baseline.vcf.gz file exists it is reported as "tp"
        and the called true positives are reported under "tp-calls".

    Raises:
        AssertionError: if the rtg formatted reference (.sdf) is not found.
    """
    out_dir = os.path.join(base_dir, "rtg")
    # Skip the run when a previous one left its "done" marker in out_dir
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Remove partial output from an earlier unfinished run
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name: joining base_dir with an absolute path
            # would discard base_dir and write the subset next to the input.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(
                base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg reference is looked up in an "rtg" directory that sits
        # beside the directory holding the reference file
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = [
            "rtg", "vcfeval", "--threads", "6", "-b", rm_file, "--bed-regions",
            interval_bed, "-c", vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(
            cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
Example #27
0
def _handle_somatic_ensemble(vrn_file, data):
    """Restrict a somatic ensemble VCF to passing calls from the tumor sample.

    Only needed for bcbio.variation based ensemble calling. Inputs whose
    metadata phenotype does not start with "tumor" are returned unchanged.
    """
    phenotype = tz.get_in(["metadata", "phenotype"], data, "")
    if not phenotype.lower().startswith("tumor"):
        return vrn_file
    tumor_only_file = vrn_file.replace(".vcf", "_tumorOnly_noFilteredCalls.vcf")
    # Select tumor sample and keep only PASS and . calls
    return vcfutils.select_sample(in_file=vrn_file, sample=data["name"][1],
                                  out_file=tumor_only_file,
                                  config=data["config"], filters="PASS,.")
Example #28
0
def _handle_somatic_ensemble(vrn_file, data):
    """For somatic ensemble calling, drop normal samples and filtered variants.

    Only needed for bcbio.variation based ensemble calling. When the sample's
    metadata phenotype does not begin with "tumor" the input file is handed
    back untouched.
    """
    is_tumor = tz.get_in(["metadata", "phenotype"], data, "").lower().startswith("tumor")
    if not is_tumor:
        return vrn_file
    # Select tumor sample and keep only PASS and . calls
    filtered_out = vrn_file.replace(".vcf", "_tumorOnly_noFilteredCalls.vcf")
    return vcfutils.select_sample(in_file=vrn_file, sample=data["name"][1],
                                  out_file=filtered_out,
                                  config=data["config"], filters="PASS,.")
Example #29
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    Ensures the truth set and the calls are bgzipped and tabix indexed,
    reduces multi-sample calls to the current sample and builds the merged
    interval BED file.

    Returns a tuple of (calls VCF, truth set VCF, interval BED).
    """
    def _needs_prep(fname):
        # A usable input is bgzipped and has a tabix index alongside it
        return not (fname.endswith(".vcf.gz") and os.path.exists(fname + ".tbi"))

    if _needs_prep(rm_file):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        sample_name = "%s-%s%s" % (base, dd.get_sample_name(data), ext)
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          os.path.join(base_dir, sample_name), data["config"])
    if _needs_prep(vrn_file):
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
Example #30
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Thread count and JVM memory settings are derived from the standard
    configuration, with the "rtg" resources overriding the defaults.

    Returns a dictionary of expected vcfeval output paths: "fp", "fn" and
    "tp", plus "tp-calls" when a tp-baseline.vcf.gz file is present.
    """
    out_dir = os.path.join(base_dir, "rtg")
    done_marker = os.path.join(out_dir, "done")
    if not utils.file_exists(done_marker):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not (rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_name = "%s-%s%s" % (base, dd.get_sample_name(data), ext)
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              os.path.join(base_dir, sample_name), data["config"])
        if not (vrn_file.endswith(".vcf.gz") and os.path.exists(vrn_file + ".tbi")):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        default_jvm_opts = ["-Xms500m", "-Xmx1500m"]
        scale_by_threads = {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                            "direction": "increase"}}}
        jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", default_jvm_opts),
                                            scale_by_threads)
        xms_opts = [opt for opt in jvm_opts if opt.startswith("-Xms")]
        xmx_opts = [opt for opt in jvm_opts if opt.startswith("-Xmx")]
        # -Xms is passed through as a java option; -Xmx becomes RTG_MEM without its flag
        java_opts = xms_opts[0] if xms_opts else "-Xms500m"
        rtg_mem = xmx_opts[0].replace("-Xmx", "") if xmx_opts else "3g"
        score_field = "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir,
               score_field]
        mem_export = "export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (java_opts, rtg_mem)
        do.run("%s && %s" % (mem_export, " ".join(cmd)),
               "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
    else:
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    return out
Example #31
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    Both VCFs are bgzipped and indexed when they are not already, and
    multi-sample calls are subset to the sample under evaluation.

    Returns a tuple of (calls VCF, truth set VCF, interval BED).
    """
    rm_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
    if not rm_ready:
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          sample_file, data["config"])
    vrn_ready = vrn_file.endswith(".vcf.gz") and os.path.exists(vrn_file + ".tbi")
    if not vrn_ready:
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
Example #32
0
def finalize_sv(orig_vcf, data, items):
    """Finalize structural variants, adding effects and splitting if needed.

    For paired/somatic inputs the combined calls are kept whole and attached
    only to the tumor sample; other samples in the pair get no file. Unpaired
    inputs get their own sample extracted from the combined calls.

    Returns the effects annotated VCF when one is produced, otherwise the
    sample VCF (which is None for non-tumor samples in a pair).
    """
    paired = vcfutils.get_paired(items)
    sample_name = dd.get_sample_name(data)
    if paired:
        # For paired/somatic, attach combined calls to tumor sample
        sample_vcf = orig_vcf if paired.tumor_name == sample_name else None
    else:
        out_file = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], sample_name)
        sample_vcf = vcfutils.select_sample(orig_vcf, sample_name, out_file, data["config"])
    if not sample_vcf:
        return sample_vcf
    effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
    return effects_vcf or sample_vcf
Example #33
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    The truth set is bgzipped and indexed only when needed. Calls are either
    subset to the current sample (multi-sample input) or always re-prepared
    (single-sample input).

    Returns a tuple of (calls VCF, truth set VCF, interval BED).
    """
    rm_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
    if not rm_ready:
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    is_multisample = len(vcfutils.get_samples(vrn_file)) > 1
    if is_multisample:
        stem = utils.splitext_plus(os.path.basename(vrn_file))[0]
        sample_file = os.path.join(base_dir, "%s-%s.vcf.gz" % (stem, dd.get_sample_name(data)))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          sample_file, data["config"])
    else:
        # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
Example #34
0
def _compatible_small_variants(data):
    """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit.
    """
    supported = set(["vardict", "freebayes", "gatk-haplotype", "mutect2", "vardict"])
    out = []
    for v in data.get("variants", []):
        vrn_file = v.get("vrn_file")
        if vrn_file and v.get("variantcaller") in supported:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_vrn_file = os.path.join(dd.get_work_dir(data), v["variantcaller"],
                                           "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            sample_vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_vrn_file,
                                                     data["config"])
            out.append(sample_vrn_file)
    return out
Example #35
0
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    Returns a tuple of (calls VCF, truth set VCF, interval BED), where the
    calls are limited to the current sample and the truth set is bgzipped
    and indexed.
    """
    def _is_indexed_bgzip(fname):
        return fname.endswith(".vcf.gz") and os.path.exists(fname + ".tbi")

    if not _is_indexed_bgzip(rm_file):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) <= 1:
        # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    else:
        stem = utils.splitext_plus(os.path.basename(vrn_file))[0]
        sample_file = os.path.join(base_dir, "%s-%s.vcf.gz" % (stem, dd.get_sample_name(data)))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          sample_file, data["config"])

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
Example #36
0
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Returns the input items, each with a delly entry appended to its
    "sv" list.
    """
    lead = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(lead["dirs"]["work"], "structural", lead["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(lead["config"])
    delly_resources = utils.get_in(config, ("resources", "delly"), {})
    delly_resources["cores"] = 1
    config["resources"]["delly"] = delly_resources
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    work_bams = run_multicore(_prep_subsampled_bams,
                              [(data, work_dir) for data in items],
                              config, parallel)
    ref_file = utils.get_in(lead, ("reference", "fasta", "base"))
    # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    sv_types = ["DEL", "DUP"]
    exclude_file = _get_full_exclude_file(items, work_dir)
    chrom_by_type = itertools.product(sshared.get_sv_chroms(items, exclude_file), sv_types)
    delly_args = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                  for chrom, sv_type in chrom_by_type]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_out = "%s-%s%s" % (base, sample, ext)
        sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_out, data["config"])
        call = {"variantcaller": "delly",
                "vrn_file": _delly_count_evidence_filter(sample_vcf, data),
                "exclude": exclude_file}
        data["sv"].append(call)
        out.append(data)
    return out
Example #37
0
def _compatible_small_variants(data):
    """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit.
    """
    supported = set(["vardict", "freebayes", "gatk-haplotype", "mutect2", "vardict"])
    out = []
    for v in data.get("variants", []):
        vrn_file = v.get("vrn_file")
        if vrn_file and v.get("variantcaller") in supported:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            if vcfutils.get_paired_phenotype(data):
                out.append(vrn_file)
            else:
                sample_vrn_file = os.path.join(dd.get_work_dir(data), v["variantcaller"],
                                               "%s-%s%s" % (base, dd.get_sample_name(data), ext))
                sample_vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_vrn_file,
                                                         data["config"])
                out.append(sample_vrn_file)
    return out
Example #38
0
def finalize_sv(orig_vcf, data, items):
    """Finalize structural variants, adding effects and splitting if needed.

    Paired/somatic batches keep the combined calls on the tumor sample only;
    unpaired batches are split so each sample receives its own VCF. Effects
    are added whenever a sample VCF is available.

    Returns the effects annotated VCF if produced, else the sample VCF
    (None for the non-tumor samples of a pair).
    """
    paired = vcfutils.get_paired(items)
    cur_sample = dd.get_sample_name(data)
    if not paired:
        split_name = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], cur_sample)
        sample_vcf = vcfutils.select_sample(orig_vcf, cur_sample, split_name, data["config"])
    elif paired.tumor_name == cur_sample:
        # For paired/somatic, attach combined calls to tumor sample
        sample_vcf = orig_vcf
    else:
        sample_vcf = None
    effects_vcf = None
    if sample_vcf:
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
    return effects_vcf or sample_vcf
Example #39
0
def run(items):
    """Perform detection of structural variations with delly.

    Runs delly for every reference sequence listed in the first BAM and each
    SV type, combines the results and splits them per sample.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Returns the input items, each with a delly entry appended to its
    "sv" list.
    """
    lead = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(lead["dirs"]["work"], "structural", lead["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(lead, ("reference", "fasta", "base"))
    # Add core request for delly: one core per input sample
    config = copy.deepcopy(lead["config"])
    delly_resources = utils.get_in(config, ("resources", "delly"), {})
    delly_resources["cores"] = len(items)
    config["resources"]["delly"] = delly_resources
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        delly_args = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                      for chrom, sv_type
                      in itertools.product(pysam_work_bam.references, sv_types)]
        bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, lead["config"])
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_out = "%s-%s%s" % (base, sample, ext)
        sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_out, data["config"])
        # Expression is handed to the hard filter under the name DVSupport
        support_expr = "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2"
        filtered_vcf = vfilter.hard_w_expression(sample_vcf, support_expr, data,
                                                 name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": filtered_vcf})
        out.append(data)
    return out
Example #40
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Uses up to 6 threads and requests one gigabyte of RTG memory per thread.

    Returns a dictionary of expected vcfeval output paths: "fp", "fn" and
    "tp", plus "tp-calls" when a tp-baseline.vcf.gz file is present.
    """
    out_dir = os.path.join(base_dir, "rtg")
    done_marker = os.path.join(out_dir, "done")
    if not utils.file_exists(done_marker):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not (rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_name = "%s-%s%s" % (base, dd.get_sample_name(data), ext)
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              os.path.join(base_dir, sample_name), data["config"])
        if not (vrn_file.endswith(".vcf.gz") and os.path.exists(vrn_file + ".tbi")):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        threads = min(dd.get_num_cores(data), 6)
        rtg_mem = "%sg" % threads
        score_field = "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir,
               score_field]
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % rtg_mem
        do.run("%s && %s" % (mem_export, " ".join(cmd)),
               "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
    else:
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    return out
Example #41
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Calls lumpy jointly on all inputs, then per sample: extracts its calls,
    genotypes the non-breakend events, concatenates them back with the
    breakends and filters by support. With a paired normal, calls are also
    filtered against that background.

    Raises ValueError if any input has a configured aligner other than bwa.
    Returns the input items, each with a lumpy entry appended to its "sv" list.
    """
    supported_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in supported_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    workdir_data = paired.tumor_data if paired and paired.tumor_data else items[0]
    work_dir = _sv_workdir(workdir_data)

    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        cur_full, cur_sr, cur_disc = sshared.get_split_discordants(data, work_dir)
        full_bams.append(cur_full)
        sr_bams.append(cur_sr)
        disc_bams.append(cur_disc)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)

    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_out = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_out, data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        combined_vcf = vcfutils.concat_variant_files_bcftools(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(combined_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #42
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    The evaluation is skipped when a ``done`` file is already present in
    ``<base_dir>/rtg``; otherwise any leftover output directory is removed
    and ``rtg vcfeval`` is run from scratch.

    Args:
        vrn_file: VCF of calls to evaluate. If it holds more than one sample,
            the sample named in ``data`` is extracted first.
        rm_file: Reference material (truth set) VCF.
        rm_interval_file: Intervals restricting the evaluation; passed through
            ``_get_merged_intervals`` before being handed to rtg.
        base_dir: Working directory; prepared inputs and the ``rtg`` output
            directory are placed here.
        data: Sample dictionary providing ``config``, the sample name and the
            reference genome file.

    Returns:
        dict with ``tp``, ``fp`` and ``fn`` keys pointing at the expected
        vcfeval output files inside the ``rtg`` output directory.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Remove the output of a previous run that never reached "done".
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # Both inputs must be bgzipped and tabix indexed before evaluation.
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            sample_name = dd.get_sample_name(data)
            # Use only the file name: joining base_dir with a full (absolute)
            # path would silently drop base_dir and write next to the input.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, sample_name, ext))
            vrn_file = vcfutils.select_sample(vrn_file, sample_name, sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg SDF reference is looked up in a sibling "rtg" directory of
        # the one holding the reference file, named after the reference.
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # Each export is its own command, chained with && so the shell does
        # not treat the second "export" as a variable name.
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
Example #43
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    The evaluation is skipped when a ``done`` file is already present in
    ``<base_dir>/rtg``; otherwise any leftover output directory is removed
    and ``rtg vcfeval`` is run from scratch.

    Args:
        vrn_file: VCF of calls to evaluate. If it holds more than one sample,
            the sample named in ``data`` is extracted first.
        rm_file: Reference material (truth set) VCF.
        rm_interval_file: Intervals restricting the evaluation; passed through
            ``_get_merged_intervals`` before being handed to rtg.
        base_dir: Working directory; prepared inputs and the ``rtg`` output
            directory are placed here.
        data: Sample dictionary providing ``config``, the sample name and the
            reference genome file.

    Returns:
        dict with ``fp``, ``fn`` and ``tp`` keys pointing at vcfeval output
        files. When ``tp-baseline.vcf.gz`` exists in the output directory,
        ``tp`` refers to it and the call-side ``tp.vcf.gz`` is returned under
        the additional ``tp-calls`` key; otherwise ``tp`` is ``tp.vcf.gz``.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Remove the output of a previous run that never reached "done".
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # Both inputs must be bgzipped and tabix indexed before evaluation.
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            sample_name = dd.get_sample_name(data)
            # Use only the file name: joining base_dir with a full (absolute)
            # path would silently drop base_dir and write next to the input.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, sample_name, ext))
            vrn_file = vcfutils.select_sample(vrn_file, sample_name, sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg SDF reference is looked up in a sibling "rtg" directory of
        # the one holding the reference file, named after the reference.
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        # Score field is chosen from the prepared call file by the helper.
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # Prefer the baseline representation of true positives when it was produced.
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out