Example 1
def _do_run(paired):
    """Perform Battenberg caling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    out = _get_battenberg_out(paired, work_dir)
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file = _make_ignore_file(work_dir, ref_file, os.path.join(bat_datadir, "impute", "impute_info.txt"),
                                        ignore_file)
        local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                     "lib", "R", "site-library")
        perllib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                               "lib", "perl5")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        cmd = ("export R_LIBS_USER={local_sitelib} && "
               "export PERL5LIB={perllib}:$PERL5LIB "
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} "
               "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
    assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["ignore"] = ignore_file
    return out
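
Note: the docstring's rationale (no transactional temporary directory, so Battenberg can resume) comes down to a simple idempotent pattern. A minimal sketch of that pattern, using hypothetical placeholder names rather than bcbio helpers:

import os

def run_if_incomplete(expected_files, run_battenberg):
    """Re-run the (placeholder) runner only when expected outputs are absent,
    letting an interrupted Battenberg run resume from its own checkpoints."""
    missing = [f for f in expected_files if not os.path.exists(f)]
    if missing:
        run_battenberg()
    still_missing = [f for f in expected_files if not os.path.exists(f)]
    assert not still_missing, "Missing Battenberg output: %s" % still_missing
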
Example 2
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(
        install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
        "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file],
                   "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
Example 3
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
Example 4
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls. Need a
    better long term solution with flexible parameters.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
Example 5
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" % (os.path.splitext(os.path.basename(work_bams[0]))[0],
                                                       ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
Example 7
def _run_cnvkit_shared(data,
                       test_bams,
                       background_bams,
                       work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(
        test_bams[0]))[0].split(".")[0]

    background_cnn = "%s_background.cnn" % (background_name
                                            if background_name else "flat")
    files = {
        "cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
        "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
        "back_cnn": os.path.join(raw_work_dir, background_cnn)
    }
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(
                cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)

            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            # cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
            #             len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_bed] + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(
                    access_bed, target_bed)
                if at_avg:
                    cmd += [
                        "--antitarget-avg-size",
                        str(at_avg), "--antitarget-min-size",
                        str(at_min), "--target-avg-size",
                        str(t_avg)
                    ]
            local_sitelib = os.path.join(
                install.get_defaults().get("tooldir", "/usr/local"), "lib",
                "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
Example 8
def _get_brew_versions():
    """Retrieve versions of tools installed via brew.
    """
    from bcbio import install
    tooldir = install.get_defaults().get("tooldir")
    brew_cmd = os.path.join(tooldir, "bin", "brew") if tooldir else "brew"
    try:
        vout = subprocess.check_output([brew_cmd, "which"])
        uses_which = True
    except subprocess.CalledProcessError:
        vout = subprocess.check_output([brew_cmd, "list", "--versions"])
        uses_which = False
    except OSError:  # brew not installed/used
        vout = ""
    out = {}
    for vstr in vout.split("\n"):
        if vstr.strip():
            if uses_which:
                name, v = vstr.rstrip().split(": ")
            else:
                parts = vstr.rstrip().split()
                name = parts[0]
                v = parts[-1]
            out[name] = v
    return out
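
Note: a hedged illustration of the two line formats this parser handles; the sample strings below are invented. When `brew which` succeeds the lines look like "name: version", while the `brew list --versions` fallback yields "name v1 v2 ..." and the last listed version is kept.

def _parse_brew_line(vstr, uses_which):
    # Mirrors the parsing branch above for a single output line.
    if uses_which:
        name, v = vstr.rstrip().split(": ")
    else:
        parts = vstr.rstrip().split()
        name, v = parts[0], parts[-1]
    return name, v

assert _parse_brew_line("samtools: 1.2", True) == ("samtools", "1.2")
assert _parse_brew_line("samtools 1.1 1.2", False) == ("samtools", "1.2")
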
Example 9
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = os.path.join(
        install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
        "site-library")
    out_file = os.path.join(
        work_dir, "%s-%s-cnv.bed" % (os.path.splitext(
            os.path.basename(work_bams[0]))[0], chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(
                    _script.format(prep_str=_prep_load_script(
                        work_bams, names, chrom, items),
                                   out_file=tx_out_file,
                                   local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode],
                       "cn.mops CNV detection",
                       items[0],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write(
                            'track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
Example 11
def _run_bubbletree(vcf_csv, cnv_csv, data, has_normal=True):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # BubbleTree has some internal hardcoded parameters that assume a smaller
    # distribution of log2 scores. This is not true for tumor-only calls and
    # normal contamination, so we scale the calculations to actually get calls.
    # Need a better long term solution with flexible parameters.
    lrr_scale = 1.0 if has_normal else 10.0
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
Example 12
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls. Need a
    better long term solution with flexible parameters.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
    return {"caller": "bubbletree",
            "report": freqs_out,
            "plots": {"bubble": bubbleplot_out, "track": trackplot_out}}
Example 13
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data,
                                                        work_dir, pad=1e4)

            raw_target_bed = bedutils.merge_overlaps(base_regions, data,
                                                     out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)

            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}

            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]

            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
Example 14
def get_perl_exports(tooldir=None):
    """Environmental exports to use conda install perl and site library.
    """
    from bcbio import install
    if tooldir is None:
        tooldir = install.get_defaults().get("tooldir", "/usr/local")
    perllib = "%s/lib/perl5" % tooldir
    perl_path = os.path.dirname(perl_cmd())
    return "export PATH=%s:$PATH && export PERL5LIB=%s:$PERL5LIB" % (perl_path, perllib)
Example 16
def has_gemini_data(data):
    """Use gemini if we installed required data for hg19, hg38.

    Other organisms don't have special data targets.
    """
    if support_gemini_orig(data, all_human=True):
        from bcbio import install
        return "gemini" in install.get_defaults().get("datatarget", [])
    else:
        return True
Example 17
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            perllib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                   "lib", "perl5")
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --ref {}".format(ref_file)
            opts += " --dir %s" % tmp_path
            # somatic indel calling
            cl = ("export PERL5LIB={perllib}:$PERL5LIB && "
                  "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
            do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # More lenient filters for low-frequency variations
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
            with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                cmd = ("export PERL5LIB={perllib}:$PERL5LIB && "
                       "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                       "--min-alt-count-tumor 2 --min-phred-fisher 5 | bgzip -c > {tx_indel_file}")
                do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Example 18
def _run_scalpel_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region,
                                             tmp_path))
            opts += " --dir %s" % tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perllib = os.path.join(
                install.get_defaults().get("tooldir", "/usr/local"), "lib",
                "perl5")
            cmd = (
                "export PERL5LIB={perllib}:$PERL5LIB && "
                "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} "
            )
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(
                os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort "
                "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Example 19
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                            "lib", "R", "site-library")
            cmd = [_get_cmd(), "segment", "-o", tx_out_file, "--rlibpath", local_sitelib, cnr_file]
            if cov_interval == "genome":
                cmd += ["--threshold", "0.005"]
            do.run(cmd, "CNVkit segment")
    return out_file
Example 20
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbles_out = "%s-bubbles.pdf" % base
    prev_model_out = "%s-bubbletree_prev_model.pdf" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        do.run(["Rscript", r_file], "Assess heterogeneity with BubbleTree")
Example 21
def _do_run(paired):
    """Perform Battenberg caling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file, gl_file = _make_ignore_file(
            work_dir, ref_file, ignore_file,
            os.path.join(bat_datadir, "impute", "impute_info.txt"))
        local_sitelib = os.path.join(
            install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
            "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender = {
            "male": "XY",
            "female": "XX",
            "unknown": "L"
        }.get(population.get_gender(paired.tumor_data))
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        else:
            gender_str = "-ge %s" % (gender)
        r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
            utils.Rscript_cmd())
        cmd = (
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd}"
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} {gender_str} "
            "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
    assert len(_missing_files(
        out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
Example 22
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                            "lib", "R", "site-library")
            cmd = [_get_cmd(), "segment", "-o", tx_out_file, "--rlibpath", local_sitelib, cnr_file]
            if cov_interval == "genome":
                cmd += ["--threshold", "0.00001"]
            # preferentially use conda installed Rscript
            export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
            do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Example 23
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            local_sitelib = os.path.join(
                install.get_defaults().get("tooldir", "/usr/local"), "lib",
                "R", "site-library")
            cmd = [
                _get_cmd(), "segment", "-o", tx_out_file, "--rlibpath",
                local_sitelib, cnr_file
            ]
            if cov_interval == "genome":
                cmd += ["--threshold", "0.005"]
            do.run(cmd, "CNVkit segment")
    return out_file
Example 24
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]

    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)

            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            # cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
            #             len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_bed] + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                            "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
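
Note: for a single tumor/normal pair with placeholder file names, the argument list assembled above corresponds to a command of roughly this shape (flags taken directly from the code, paths invented):

example_cmd = ["cnvkit.py", "batch", "tumor.bam", "-n", "normal.bam",
               "-f", "genome.fa", "--targets", "targets-genes.bed",
               "--access", "access.bed", "-d", "tx_work_dir", "--split",
               "-p", "1", "--output-reference", "tx_work_dir/flat_background.cnn",
               "--rlibpath", "/usr/local/lib/R/site-library"]
print(" ".join(example_cmd))
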
Example 25
def snpeff_version(args=None):
    from bcbio.install import get_defaults
    tooldir = (args and args.tooldir) or get_defaults()["tooldir"]
    raw_version = programs.get_version_manifest("snpeff")
    if not raw_version:
        config = {
            "resources": {
                "snpeff": {
                    "jvm_opts": ["-Xms500m", "-Xmx1g"],
                    "dir": os.path.join(tooldir, "share", "java", "snpeff")
                }
            }
        }
        raw_version = programs.java_versioner(
            "snpeff", "snpEff", stdout_flag="snpEff version SnpEff")(config)
    snpeff_version = "".join(
        [x for x in str(raw_version) if x in set(string.digits + ".")])
    assert snpeff_version, "Did not find snpEff version information"
    return snpeff_version
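
Note: a small illustration of the final cleanup step, which keeps only digits and dots from whatever the manifest or `java_versioner` returns; the raw string below is a made-up sample.

import string

raw_version = "snpEff 4.3"  # invented sample value
cleaned = "".join([x for x in raw_version if x in set(string.digits + ".")])
assert cleaned == "4.3"
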
Example 26
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            local_sitelib = os.path.join(
                install.get_defaults().get("tooldir", "/usr/local"), "lib",
                "R", "site-library")
            cmd = [
                _get_cmd(), "segment", "-o", tx_out_file, "--rlibpath",
                local_sitelib, cnr_file
            ]
            if cov_interval == "genome":
                cmd += ["--threshold", "0.00001"]
            # preferentially use conda installed Rscript
            export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                utils.Rscript_cmd())
            do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Example 27
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --dir %s" % tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perllib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                   "lib", "perl5")
            cmd = ("export PERL5LIB={perllib}:$PERL5LIB && "
                   "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' "
                   "| {fix_ambig} | vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Example 28
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"), "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbles_out = "%s-bubbles.pdf" % base
    prev_model_out = "%s-bubbletree_prev_model.pdf" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run(["Rscript", r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
Example 29
def _do_run(paired):
    """Perform Battenberg caling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file,
                                                 os.path.join(bat_datadir, "impute", "impute_info.txt"))
        local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                     "lib", "R", "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender = {"male": "XY", "female": "XX", "unknown": "L"}.get(population.get_gender(paired.tumor_data))
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        else:
            gender_str = "-ge %s" % (gender)
        r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
        cmd = ("export R_LIBS_USER={local_sitelib} && {r_export_cmd}"
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} {gender_str} "
               "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
    assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
Example 30
def _has_gemini(data):
    """Use gemini if we installed required data.
    """
    from bcbio import install
    return "gemini" in install.get_defaults().get("datatarget", [])
Example 31
def _get_perllib(tooldir=None):
    from bcbio import install
    if tooldir is None:
        tooldir = install.get_defaults().get("tooldir", "/usr/local")
    return "%s/lib/perl5" % tooldir
Example 32
def R_sitelib():
    """Retrieve the R site-library installed with the bcbio installer.
    """
    from bcbio import install
    return os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                        "lib", "R", "site-library")
Example 34
def _run_scalpel_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perllib = os.path.join(
                install.get_defaults().get("tooldir", "/usr/local"), "lib",
                "perl5")
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(
                    _scalpel_options_from_config(items, config, out_file,
                                                 region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # somatic indel calling
                cl = (
                    "export PERL5LIB={perllib}:$PERL5LIB && "
                    "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}"
                )
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(
                _scalpel_bed_file_opts(items, config, out_file, region,
                                       tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path,
                                                "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(
                    tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config,
                                      scalpel_tmp_file) as tx_indel_file:
                    cmd = (
                        "export PERL5LIB={perllib}:$PERL5LIB && "
                        "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                        "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                        "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()),
                           "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression(
                "reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = (
                "vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file