Example #1
def get_coverage(data):
    """Calculate coverage for a sample.bam, account for GC content
       data is single sample
    """
    data = utils.to_single_data(data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    sample_name = dd.get_sample_name(data)
    work_dir = _sv_workdir(data)
    rscript = utils.Rscript_cmd("r36")
    coverage_r = utils.R_package_script("r36", "PureCN", "extdata/Coverage.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    # PureCN resolves symlinks, so the actual PureCN coverage output file name
    # is derived from the resolved BAM, not from bam_file
    bam_file = os.path.realpath(dd.get_align_bam(data))
    bam_name = os.path.basename(bam_file)
    (bname, ext) = os.path.splitext(bam_name)
    result_file = os.path.join(work_dir, bname + "_coverage_loess.txt.gz")
    if not os.path.exists(result_file):
        cmd = [rscript, coverage_r,
               "--outdir", work_dir,
               "--bam", bam_file,
               "--intervals", intervals]
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                              utils.get_R_exports(env = "r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN coverage")
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed to calculate coverage")
        logger.debug("Saved PureCN coverage files to " + result_file)
    return result_file
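
A minimal standalone sketch of the naming logic above: PureCN names its coverage output after the resolved (symlink-free) BAM, so the expected result path has to be built from os.path.realpath rather than the original alignment path. The paths in the call are illustrative.

import os

def expected_purecn_coverage(work_dir, bam_path):
    # mirror the <bam basename>_coverage_loess.txt.gz naming used above
    real_bam = os.path.realpath(bam_path)
    bname = os.path.splitext(os.path.basename(real_bam))[0]
    return os.path.join(work_dir, bname + "_coverage_loess.txt.gz")

print(expected_purecn_coverage("/work/structural/sample1/purecn", "/data/links/sample1.bam"))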
Example #2
def _run_bubbletree(vcf_csv, cnv_csv, data, has_normal=True):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # BubbleTree has some internal hardcoded parameters that assume a smaller
    # distribution of log2 scores. This is not true for tumor-only calls and
    # normal contamination, so we scale the calculations to actually get calls.
    # Need a better long term solution with flexible parameters.
    lrr_scale = 1.0 if has_normal else 10.0
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % str(msg))
            else:
                logger.exception("BubbleTree failed")
                raise
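
The write above renders a module-level R script template with _script.format(**locals()). A hedged sketch of that pattern with an invented template (the real _script holds the full BubbleTree R code and is not shown in this example):

# illustrative template; placeholders are filled from the caller's locals
_example_script = """sample <- "{sample}"
lrr.scale <- {lrr_scale}
vcf.csv <- "{vcf_csv}"
calls.out <- "{calls_out}"
"""

def render_bubbletree_script(vcf_csv, sample, lrr_scale, calls_out):
    # every placeholder in the template is filled from this function's local variables
    return _example_script.format(**locals())

print(render_bubbletree_script("batch1-prep.csv", "tumor1", 10.0, "batch1-calls.rds"))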
Example #3
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n"
                    )
            else:
                cmd = [
                    _get_cmd(), "segment", "-p",
                    str(dd.get_cores(data)), "-o", tx_out_file, cnr_file
                ]
                small_vrn_files = _compatible_small_variants(data)
                if len(small_vrn_files) > 0 and _cna_has_values(
                        cnr_file) and cov_interval != "genome":
                    cmd += ["-v", small_vrn_files[0]]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # preferentially use conda installed Rscript
                export_cmd = (
                    "unset R_HOME && export PATH=%s:$PATH && export TMPDIR=%s && "
                    % (os.path.dirname(
                        utils.Rscript_cmd()), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Example #4
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from PureCN rds file."""
    # no solution - no signatures
    if not "rds" in out:
        return out
    rscript = utils.Rscript_cmd()
    purecndx_r = utils.R_package_script("PureCN", "extdata/Dx.R", env="base")
    simple_repeat_bed = dd.get_variation_resources(
        paired.tumor_data)["simple_repeat"]
    callable_bed = dd.get_sample_callable(paired.tumor_data)
    out_base = utils.splitext_plus(out["rds"])[0]
    mutation_burden_csv = out_base + "_mutation_burden.csv"
    if not utils.file_uptodate(mutation_burden_csv, out["rds"]):
        # no signatures - so we generate them
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            cmd = [
                rscript, purecndx_r, "--rds", out["rds"], "--callable",
                callable_bed, "--signatures", "--exclude", simple_repeat_bed,
                "--out", tx_out_base
            ]
            do.run(cmd, "PureCN Dx mutational burden and signatures")
            out_base, out, all_files = _get_purecn_dx_files(paired,
                                                            out,
                                                            require_exist=True)
            # if a file was not generated it would not go to the upload
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base),
                                               f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    return out
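
The move loop at the end is a recurring pattern in these examples: outputs written next to a transactional prefix are relocated to the final directory only if the tool actually produced them. A minimal sketch using only os and shutil; the helper name and file list are illustrative.

import os
import shutil

def move_produced_outputs(tx_out_base, out_base, filenames):
    # check each expected file in the transaction directory and move the
    # ones that exist next to the final output base
    tx_dir = os.path.dirname(tx_out_base)
    final_dir = os.path.dirname(out_base)
    for fname in filenames:
        src = os.path.join(tx_dir, fname)
        if os.path.exists(src):
            shutil.move(src, os.path.join(final_dir, fname))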
Example #5
def create_normal_db(coverage_files_txt, snv_pon, out_dir, genome_build):
    """create normal db
       input: coverage files calculated by purecn for each sample
              snv_pon - mutect2 SNV PON
       output:
              mapping_bias_hg38.rds
              normalDB_hg38.rds
    """
    rscript = utils.Rscript_cmd("r36")
    normaldb_r = utils.R_package_script("r36", "PureCN", "extdata/NormalDB.R")
    cmd = [rscript, normaldb_r,
           "--outdir", out_dir,
           "--coveragefiles", coverage_files_txt,
           "--normal_panel" , snv_pon,
           "--genome", genome_build,
           "--force"]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                          utils.get_R_exports(env = "r36"),
                                                          " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN normalDB")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to create a normal db")

    return out_dir
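
The command line assembled in the try block has the same shape in every r36 PureCN example here: the R library path and environment exports are prepended to the joined, stringified argument list. A hedged sketch with stand-in export strings (the real values come from utils.R_sitelib and utils.get_R_exports):

def build_r36_cmd_line(sitelib, r_exports, cmd):
    # prepend R_LIBS_USER and the R environment exports to the joined command
    return "export R_LIBS_USER=%s && %s && %s" % (
        sitelib, r_exports, " ".join(str(x) for x in cmd))

print(build_r36_cmd_line("/opt/conda/envs/r36/lib/R/library",
                         "export R_HOME=/opt/conda/envs/r36/lib/R",
                         ["Rscript", "NormalDB.R", "--outdir", "/work/purecn_normaldb"]))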
Example #6
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls. Need a
    better long term solution with flexible parameters.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
        try:
            do.run(cmd, "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % str(msg))
            else:
                logger.exception("BubbleTree failed")
                raise
    return {"caller": "bubbletree",
            "report": freqs_out,
            "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}
Example #7
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = utils.R_sitelib()
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(
        work_dir,
        "%s%s-%s.bed" % (os.path.splitext(os.path.basename(
            work_bams[0]))[0], ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(
                    _script.format(prep_str=_prep_load_script(
                        work_bams, names, chrom, items),
                                   out_file=tx_out_file,
                                   local_sitelib=local_sitelib))
            rscript = utils.Rscript_cmd()
            try:
                do.run([rscript, "--vanilla", rcode],
                       "cn.mops CNV detection",
                       items[0],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write(
                            'track name=empty description="No CNVs found"\n')
                else:
                    logger.exception("cn.mops failed")
                    raise
    return [out_file]
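
The except branch above tolerates a known failure mode by writing an empty placeholder track instead of raising. A self-contained sketch of that pattern with subprocess; the allowed message substrings are illustrative, not cn.mops' actual error text.

import subprocess

def run_or_write_empty(cmd, out_file, allowed_substrings):
    # run the command; if it fails with an accepted message, write an empty
    # BED-style placeholder instead of propagating the error
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as exc:
        msg = exc.output.decode(errors="replace") if exc.output else str(exc)
        if any(s in msg for s in allowed_substrings):
            with open(out_file, "w") as out_handle:
                out_handle.write('track name=empty description="No CNVs found"\n')
        else:
            raise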
Example #8
def _amber_het_file(vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs"
    from bcbio.heterogeneity import bubbletree

    prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"],
                                         vrn_files[0]["variantcaller"],
                                         work_dir, paired, AmberWriter)
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(
        amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    utils.symlink_plus(prep_file, out_file)
    pcf_file = out_file + ".pcf"
    if not utils.file_exists(pcf_file):
        with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
            r_file = os.path.join(os.path.dirname(tx_out_file),
                                  "bafSegmentation.R")
            with open(r_file, "w") as out_handle:
                out_handle.write(_amber_seg_script)
            cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(
            ), utils.Rscript_cmd(), r_file, out_file, pcf_file)
            do.run(cmd, "PURPLE: AMBER baf segmentation")
    return out_file
Example #9
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(
        install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
        "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file],
                   "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % str(msg))
            else:
                logger.exception("BubbleTree failed")
                raise
Example #10
def process_intervals(data):
    """Prepare intervals file"""
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r, "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                     utils.get_R_exports(env = "r36"),
                                                     " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
Example #11
def run(self, subcmd, opts, memscale=None):
    jvm_opts = get_picard_opts(self._config, memscale=memscale)
    Rpath = os.path.dirname(utils.Rscript_cmd())
    cmd = ["unset", "JAVA_HOME", "&&", "export", "PATH=%s:$PATH" % Rpath, "&&"] + \
          [self._cmd] + jvm_opts + [subcmd] + ["%s=%s" % (x, y) for x, y in opts] + \
          ["VALIDATION_STRINGENCY=SILENT"]
    do.run(" ".join(cmd), "Picard: %s" % subcmd)
Example #12
def _run_purecn_normaldb(paired, out):
    """Run PureCN with normaldb and native segmentation
       paired is one t/n pair or only """
    sample = utils.to_single_data(paired.tumor_data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # germline and somatic - just annotated and filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [ rscript, purecn_r,
            "--out", work_dir,
            "--tumor", sample_coverage,
            "--sampleid", sample_name,
            "--vcf", variants_vcf,
            "--normaldb", normaldb,
            "--mappingbiasfile", mappingbiasfile,
            "--intervals", intervals,
            "--snpblacklist", simple_repeat_bed,
            "--genome", genome,
            "--force",
            "--postoptimize",
            "--seed", "123",
            "--bootstrapn", "500",
            "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use a matched normal sample in PureCN analysis,
    # because it then skips the PON coverage normalization and denoising steps!
    # but still, if one is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                              utils.get_R_exports(env = "r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed")
    out_base, out, all_files = _get_purecn_files(paired, work_dir, require_exist=True)
    return out
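
The VCF selection near the top reflects how variants are stored here: tumor-only runs carry a separate germline VCF, while tumor/normal batches only expose the combined vrn_file. A small sketch of that fallback, with illustrative dicts standing in for the sample's variants entry:

def pick_purecn_vcf(variant_entry):
    # prefer the annotated germline VCF, fall back to the full variant file
    return variant_entry.get("germline") or variant_entry.get("vrn_file")

print(pick_purecn_vcf({"vrn_file": "batch1-vardict-annotated.vcf.gz"}))
print(pick_purecn_vcf({"germline": "sample1-germline.vcf.gz",
                       "vrn_file": "sample1-vardict.vcf.gz"}))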
Example #13
def chipqc(bam_file, sample, out_dir):
    """Attempt code to run ChIPQC bioconductor packate in one sample"""
    work_dir = dd.get_work_dir(sample)
    sample_name = dd.get_sample_name(sample)
    logger.warning("ChIPQC is unstable right now, if it breaks, turn off the tool.")
    if utils.file_exists(out_dir):
        return _get_output(out_dir)
    with tx_tmpdir() as tmp_dir:
        rcode = _sample_template(sample, tmp_dir)
        # local_sitelib = utils.R_sitelib()
        rscript = utils.Rscript_cmd()
        do.run([rscript, rcode], "ChIPQC in %s" % sample_name, log_error=False)
        shutil.move(tmp_dir, out_dir)
    return _get_output(out_dir)
Example #14
def cpat(assembled_fasta, hexamer, logit, out_file=None):
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".cpat").name

    cpat_cmd = _find_executable("cpat.py")
    r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
    cmd = ("{r_setup}{cpat_cmd} --gene={assembled_fasta} --hex={hexamer} "
           "--logitModel={logit} --outfile={tx_out_file}")
    message = "Predicing coding potential of %s." % (assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
Example #15
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [_get_cmd(), "segment", "-o", tx_out_file, cnr_file]
            if cov_interval == "genome":
                cmd += ["--threshold", "0.00001"]
            # preferentially use conda installed Rscript
            export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                utils.Rscript_cmd())
            do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Example #16
def _do_run(paired):
    """Perform Battenberg caling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file, gl_file = _make_ignore_file(
            work_dir, ref_file, ignore_file,
            os.path.join(bat_datadir, "impute", "impute_info.txt"))
        local_sitelib = os.path.join(
            install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
            "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender = {
            "male": "XY",
            "female": "XX",
            "unknown": "L"
        }.get(population.get_gender(paired.tumor_data))
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        else:
            gender_str = "-ge %s" % (gender)
        r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
            utils.Rscript_cmd())
        cmd = (
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd}"
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} {gender_str} "
            "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
    assert len(_missing_files(
        out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
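
The gender handling above maps the pipeline's gender call onto Battenberg's flags, adding a gender-loci file when the gender is unknown. A standalone sketch of just that mapping; the file path in the call is made up.

def battenberg_gender_args(gender_call, gl_file):
    # XX/XY for known genders, "L" plus a gender-loci file when unknown,
    # mirroring the mapping in the code above
    gender = {"male": "XY", "female": "XX", "unknown": "L"}.get(gender_call)
    if gender == "L":
        return "-ge %s -gl %s" % (gender, gl_file)
    return "-ge %s" % gender

print(battenberg_gender_args("unknown", "/ref/battenberg/gender_loci.txt"))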
Example #17
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))

    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = ("{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                       "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} "
                       "--libdir None")
                chroms = ["'%s'" % c.name.replace("chr", "") for c in ref.file_contigs(dd.get_ref_file(data))
                          if chromhacks.is_autosomal_or_x(c.name)]
                if "'X'" not in chroms:
                    chroms += ["'X'"]
                # Use UCSC style naming for human builds to support BSgenome
                genome_build = ("hg19" if dd.get_genome_build(data) in ["GRCh37", "hg19"]
                                else dd.get_genome_build(data))
                cmd += """ --chrs "c(%s)" """ % ",".join(chroms)
                cmd += " --genomeBuild {genome_build}"
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                if data["genome_build"] in ["hg38"]:
                    data_dir = os.path.normpath(os.path.join(
                        os.path.dirname(os.path.realpath(os.path.join(
                            os.path.dirname(utils.Rscript_cmd()), "titanCNA.R"))),
                        os.pardir, os.pardir, "data"))
                    cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                    assert os.path.exists(cytoband_file), cytoband_file
                    cmd += " --cytobandFile %s" % cytoband_file
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(cmd.format(**locals()), "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(os.path.join(tmp_dir, "Rplots.pdf"),
                            os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
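
Building the --chrs argument is the fiddliest part of the TitanCNA call above: contig names are stripped of any chr prefix, quoted, wrapped in an R c(...) vector, and X is always included. A simplified sketch; the digit/X filter below is a stand-in for the chromhacks.is_autosomal_or_x check and the contig list is invented.

def titan_chrs_arg(contig_names):
    chroms = []
    for name in contig_names:
        short = name.replace("chr", "")
        # keep autosomes plus X (simplified relative to the real check)
        if short.isdigit() or short == "X":
            chroms.append("'%s'" % short)
    if "'X'" not in chroms:
        chroms.append("'X'")
    return ' --chrs "c(%s)" ' % ",".join(chroms)

print(titan_chrs_arg(["chr1", "chr2", "chrX", "chrY", "chrM"]))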
Example #18
def make_logit_model(coding_fasta, noncoding_fasta, hexamers, out_dir=None):
    safe_makedir(out_dir)
    out_prefix = os.path.join(out_dir, "logit")
    out_file = out_prefix + ".logit.RData"
    if file_exists(out_file):
        return out_file

    tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
    tx_out_file = tx_prefix + ".logit.RData"

    logit_cmd = _find_executable("make_logitModel.py")
    r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
    cmd = ("{r_setup}{logit_cmd} --cgene={coding_fasta} --ngene={noncoding_fasta} "
           "--hex={hexamers} --outfile={tx_prefix}")
    message = "Building coding/noncoding logistical model."
    do.run(cmd.format(**locals()), message)

    shutil.move(tx_out_file, out_file)

    return out_file
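
As used above, --outfile is treated as a prefix: the tool's output lands at <prefix>.logit.RData, so the expected path has to be reconstructed from the temporary prefix before it can be moved into place. A minimal sketch of that naming, with an illustrative output directory.

import os
import tempfile

def logit_paths(out_dir):
    # derive the final path and the temporary prefix/output pair
    out_file = os.path.join(out_dir, "logit") + ".logit.RData"
    tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
    tx_out_file = tx_prefix + ".logit.RData"
    return tx_prefix, tx_out_file, out_file

print(logit_paths("/tmp/cpat"))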
Example #19
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from PureCN rds file."""
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    rscript = utils.Rscript_cmd("r36")
    purecndx_r = utils.R_package_script("r36", "PureCN", "extdata/Dx.R")
    simple_repeat_bed = dd.get_variation_resources(paired.tumor_data)["simple_repeat"]
    callable_bed = dd.get_sample_callable(paired.tumor_data)
    if not utils.file_uptodate(out["mutation_burden"], out["rds"]):
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            cmd = [rscript, purecndx_r,
                   "--rds", out["rds"],
                   "--callable", callable_bed,
                   "--signatures",
                   "--exclude", simple_repeat_bed,
                   "--out", tx_out_base]
            do.run(cmd, "PureCN Dx mutational burden and signatures")
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    return out
Example #20
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(
                items[0]),
                                                   region,
                                                   out_file,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 reads worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                        os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| {strandbias} "
                    "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "{freq_filter} "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
Example #21
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 reads worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
Example #22
def _get_env():
    anaconda_bin = os.path.dirname(utils.Rscript_cmd())
    return "unset JAVA_HOME  && export PATH=%s:$PATH && " % (anaconda_bin)
Example #23
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.
    """
    segfns = {
        "cnvkit": _segment_normalized_cnvkit,
        "gatk-cnv": _segment_normalized_gatk
    }
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(
            out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](
            cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(
            paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file,
                                                      paired,
                                                      out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in [
                "GRCh37", "hg19"
            ] else dd.get_genome_build(paired.tumor_data))
            rscript = utils.Rscript_cmd()
            purecn_r = utils.R_package_script("PureCN",
                                              "extdata/PureCN.R",
                                              env="base")
            cmd = [
                rscript, purecn_r, "--seed", "42", "--out", tx_out_base,
                "--rds",
                "%s.rds" % tx_out_base, "--sampleid",
                dd.get_sample_name(paired.tumor_data), "--genome", genome,
                "--vcf", vcf_file, "--tumor", cnr_file, "--segfile", seg_file,
                "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"
            ]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(
                    env="base"), utils.get_R_exports(env="base"), " ".join(
                        [str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info(
                        "PureCN failed to find solution for %s: skipping" %
                        dd.get_sample_name(paired.tumor_data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.exception()
                    raise
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base),
                                               f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
Example #24
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        raw_file = "%s-raw%s" % utils.splitext_plus(out_file)
        with file_transaction(items[0], raw_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith(
                    "gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 reads worth of coverage
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = (
                    "unset R_HOME && unset JAVA_HOME && export PATH=%s:$PATH && "
                    % os.path.dirname(utils.Rscript_cmd()))
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                    "| bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = raw_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if raw_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
        if assoc_files.get("dbsnp"):
            annotation.add_dbsnp(raw_file, assoc_files["dbsnp"], items[0],
                                 out_file)
        else:
            utils.symlink_plus(raw_file, out_file)
    return out_file
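
Two derived VarDict parameters show up in all the caller variants above: the configured minimum allele fraction is a percentage converted to a fraction, and very deep panels (median or average coverage above 5000, depending on the example) add a minimum supporting-read cutoff via -v 50. A tiny sketch with the same thresholds; the coverage lookup is replaced by a plain argument.

def vardict_derived_params(min_allele_fraction_pct, avg_coverage):
    freq = float(min_allele_fraction_pct) / 100.0
    var2vcf_opts = " -v 50 " if avg_coverage > 5000 else ""
    return freq, var2vcf_opts

print(vardict_derived_params(10, 8000))   # (0.1, ' -v 50 ')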
Example #25
def _report_summary(samples, out_dir):
    """
    Run coverage report with bcbiocov package
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature",
                                     "qsignature.ma")
        if qsignature_fn:  # this needs to be inside the summary/qc dict
            if utils.file_exists(
                    qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

        logger.info("summarize target information")
        samples = _merge_target_information(samples)

        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(
                        fn, os.path.join(out_dir, os.path.basename(fn)))

        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(
                        fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" %
                                 rmd_file)
            cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError as msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples
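
The report step above writes a one-line R driver next to the report (the actual do.run is currently commented out). A hedged sketch of just that helper, with illustrative paths; rendering still requires rmarkdown to be installed in the R that Rscript resolves to.

import os

def write_render_script(rmd_file, out_report):
    # write a small R driver that renders the prepared Rmd with rmarkdown
    run_file = "%s-run.R" % os.path.splitext(out_report)[0]
    with open(run_file, "w") as out_handle:
        out_handle.write('library(rmarkdown)\nrender("%s")\n' % rmd_file)
    return run_file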