Esempio n. 1
0
def get_coverage(data):
    """Calculate PureCN coverage for a single sample BAM, accounting for GC content.

    Runs the ``extdata/Coverage.R`` script of the PureCN R package (looked up
    in the "r36" environment) on the sample's aligned BAM, using the interval
    file stored under ``config -> algorithm -> purecn_bed_ready``. The command
    is skipped when the expected output file already exists.

    Args:
        data: a single sample, or a structure that ``utils.to_single_data``
            can unwrap into one.

    Returns:
        Path of the expected ``<bam basename>_coverage_loess.txt.gz`` file in
        the sample's SV work directory. The path is returned even when the
        PureCN command failed (the failure is only logged), so callers should
        not assume the file exists.
    """
    data = utils.to_single_data(data)
    work_dir = _sv_workdir(data)
    rscript = utils.Rscript_cmd("r36")
    coverage_r = utils.R_package_script("r36", "PureCN", "extdata/Coverage.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    # PureCN resolves symlinks and the actual output PureCN coverage file name
    # is derived from the end bam not from bam_file
    bam_file = os.path.realpath(dd.get_align_bam(data))
    bname = os.path.splitext(os.path.basename(bam_file))[0]
    result_file = os.path.join(work_dir, bname + "_coverage_loess.txt.gz")
    if not os.path.exists(result_file):
        cmd = [rscript, coverage_r,
               "--outdir", work_dir,
               "--bam", bam_file,
               "--intervals", intervals]
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                              utils.get_R_exports(env="r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN coverage")
        except subprocess.CalledProcessError as msg:
            # Best effort: a coverage failure is reported but does not abort the run.
            logger.info("PureCN failed to calculate coverage: %s" % msg)
        else:
            # Only claim the output was saved when the command actually succeeded.
            logger.debug("Saved PureCN coverage files to " + result_file)
    return result_file
Esempio n. 2
0
def create_normal_db(coverage_files_txt, snv_pon, out_dir, genome_build):
    """Build a PureCN normal database by running ``extdata/NormalDB.R``.

    Args:
        coverage_files_txt: passed to ``--coveragefiles``; per the original
            notes, the coverage files calculated by PureCN for each sample.
        snv_pon: passed to ``--normal_panel``; per the original notes, a
            mutect2 SNV panel of normals.
        out_dir: directory handed to ``--outdir``.
        genome_build: genome name handed to ``--genome``.

    Returns:
        ``out_dir``, whether or not the command succeeded (a failure is only
        logged). Per the original notes the script is expected to write
        files such as ``mapping_bias_hg38.rds`` and ``normalDB_hg38.rds``.
    """
    r_env = "r36"
    script_args = [
        utils.Rscript_cmd(r_env),
        utils.R_package_script(r_env, "PureCN", "extdata/NormalDB.R"),
        "--outdir", out_dir,
        "--coveragefiles", coverage_files_txt,
        "--normal_panel", snv_pon,
        "--genome", genome_build,
        "--force",
    ]
    try:
        sitelib = utils.R_sitelib(env=r_env)
        r_exports = utils.get_R_exports(env=r_env)
        joined_args = " ".join(str(arg) for arg in script_args)
        shell_line = "export R_LIBS_USER=%s && %s && %s" % (sitelib, r_exports, joined_args)
        do.run(shell_line, "PureCN normalDB")
    except subprocess.CalledProcessError:
        logger.info("PureCN failed to create a normal db")

    return out_dir
Esempio n. 3
0
def process_intervals(data):
    """Prepare the PureCN interval file for a sample.

    The input BED comes from ``regions.get_sv_bed(data)``, falling back to a
    cleaned copy of the sample's variant regions. PureCN's
    ``extdata/IntervalFile.R`` is then run with off-target intervals enabled,
    exporting an optimized BED next to the interval file.

    Args:
        data: sample data used to look up regions, the reference file, the
            genome build and the ``purecn_mappability`` variation resource.

    Returns:
        ``None`` when no BED file is available; otherwise the path of the
        interval file (the BED path with its extension replaced by ``.txt``).
        An existing interval file is reused without re-running. The path is
        returned even when the PureCN command failed (the failure is only
        logged), so callers should not assume the file exists.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r, "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                          utils.get_R_exports(env="r36"),
                                                          " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        # Best effort: the failure is reported but does not abort the run.
        logger.info("PureCN failed to prepare intervals: %s" % msg)
    else:
        # Only claim the file was saved when the command actually succeeded.
        logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
Esempio n. 4
0
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on the work BAMs, optionally restricted to one chromosome.

    The output BED is named after the first work BAM, the current batch (if
    any) and the chromosome ("all" when ``chrom`` is empty). An R script is
    generated from the module-level ``_script`` template next to the output
    and executed with ``Rscript --vanilla``.

    Returns:
        A one-element list holding the output BED path.

    Raises:
        subprocess.CalledProcessError: when cn.mops fails with an error that
            ``_allowed_cnmops_errorstates`` does not accept.
    """
    local_sitelib = utils.R_sitelib()
    batch = sshared.get_cur_batch(items)
    suffix = "-%s-cnv" % batch if batch else "-cnv"
    first_bam_base = os.path.splitext(os.path.basename(work_bams[0]))[0]
    chrom_label = chrom if chrom else "all"
    out_file = os.path.join(work_dir,
                            "%s%s-%s.bed" % (first_bam_base, suffix, chrom_label))
    if utils.file_exists(out_file):
        return [out_file]

    with file_transaction(items[0], out_file) as tx_out_file:
        rcode = "%s-run.R" % os.path.splitext(out_file)[0]
        with open(rcode, "w") as script_handle:
            prep_str = _prep_load_script(work_bams, names, chrom, items)
            script_text = _script.format(prep_str=prep_str,
                                         out_file=tx_out_file,
                                         local_sitelib=local_sitelib)
            script_handle.write(script_text)
        rscript = utils.Rscript_cmd()
        try:
            do.run([rscript, "--vanilla", rcode],
                   "cn.mops CNV detection",
                   items[0],
                   log_error=False)
        except subprocess.CalledProcessError as msg:
            if not _allowed_cnmops_errorstates(str(msg)):
                logger.exception()
                raise
            # cn.mops errors out if no CNVs found. Just write an empty file.
            with open(tx_out_file, "w") as empty_handle:
                empty_handle.write(
                    'track name=empty description="No CNVs found"\n')
    return [out_file]
Esempio n. 5
0
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create the BubbleTree R script and run it on the input data.

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls. Need a
    better long term solution with flexible parameters.

    The R script is rendered from the module-level ``_script`` template using
    every local variable of this function (``format(**locals())``), so local
    names here are part of the template contract -- do not rename them
    without checking the template.

    Args:
        vcf_csv: input file whose base name (via ``utils.splitext_plus``)
            determines all output file names.
        cnv_csv: not used directly in this function; presumably referenced
            by the ``_script`` template -- confirm against the template.
        data: sample data, used here to look up the sample name.
        wide_lrr: when true ``lrr_scale`` is 10.0, otherwise 1.0.
        do_plots: converted to the strings "yes"/"no" before templating.
        handle_failures: when true, failures accepted by
            ``_allowed_bubbletree_errorstates`` are written to the report
            file instead of being raised.

    Returns:
        Dict with the caller name, the prevalence report path and the
        bubble/track plot paths.

    Raises:
        subprocess.CalledProcessError: when the R run fails and the failure
            is not handled as described above.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    # The script is (re)written on every call, even when the report already exists.
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    # Only run BubbleTree when the prevalence report is not already present.
    if not utils.file_exists(freqs_out):
        cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
        try:
            do.run(cmd, "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                # Record the tolerated failure in the report file so later
                # calls find the file and skip the run.
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
    return {"caller": "bubbletree",
            "report": freqs_out,
            "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}
Esempio n. 6
0
def _run_purecn_normaldb(paired, out):
    """Run PureCN with a normal database and its native segmentation.

    Builds a ``extdata/PureCN.R`` command line from the tumor sample's
    configuration (coverage, variants, normal DB, mapping bias, intervals,
    simple-repeat blacklist) and runs it unless ``<sample name>.rds`` already
    exists in the SV work directory.

    Args:
        paired: a tumor/normal pairing; ``paired.normal_data`` may be empty
            for a tumor-only analysis.
        out: ignored by this implementation; kept so the call signature
            stays unchanged for callers.

    Returns:
        The output mapping from ``_get_purecn_files`` (called with
        ``require_exist=True``) for the work directory. A failed PureCN run
        is only logged, not raised.
    """
    sample = utils.to_single_data(paired.tumor_data)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    # germline and somatic - just annotated and filters assigned
    first_variants = tz.get_in(["variants"], sample)[0]
    variants_vcf = first_variants.get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = first_variants.get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mappingbiasfile", mappingbiasfile,
           "--intervals", intervals,
           "--snpblacklist", simple_repeat_bed,
           "--genome", genome,
           "--force",
           "--postoptimize",
           "--seed", "123",
           "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    # User-supplied extra PureCN options from the "purecn" resources section.
    resources = config_utils.get_resources("purecn", sample)
    cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            # Skip the flag when no coverage was computed for the normal,
            # instead of passing the literal string "None" to PureCN.
            if normal_coverage:
                cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                              utils.get_R_exports(env="r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
        except subprocess.CalledProcessError as msg:
            # Best effort: the failure is reported but does not abort the run.
            logger.info("PureCN failed: %s" % msg)
        else:
            logger.debug("Saved PureCN output to " + work_dir)
    return _get_purecn_files(paired, work_dir, require_exist=True)[1]
Esempio n. 7
0
def _run_purecn(paired, work_dir):
    """Run the PureCN.R wrapper on pre-segmented CNVkit or GATK4 inputs.

    The run is skipped when the ``rds`` output is up to date relative to the
    normalized bins file, or when a previous tolerated failure left a
    ``-failed.log`` marker next to the outputs.

    Returns:
        The output mapping from ``_get_purecn_files`` when its ``rds`` entry
        exists on disk, otherwise ``None``.

    Raises:
        subprocess.CalledProcessError: when PureCN fails with an error that
            ``_allowed_errors`` does not accept.
    """
    segmenters = {
        "cnvkit": _segment_normalized_cnvkit,
        "gatk-cnv": _segment_normalized_gatk,
    }
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    needs_run = (not utils.file_uptodate(out["rds"], cnr_file)
                 and not utils.file_exists(failed_file))
    if needs_run:
        segment_fn = segmenters[cnvkit.bin_approach(paired.tumor_data)]
        cnr_file, seg_file = segment_fn(cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        tumor_calls = heterogeneity.get_variants(paired.tumor_data,
                                                 include_germline=False)
        vcf_file = germline.filter_to_pass_and_reject(tumor_calls[0]["vrn_file"],
                                                      paired,
                                                      out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            build = dd.get_genome_build(paired.tumor_data)
            genome = "hg19" if build in ["GRCh37", "hg19"] else build
            sample_name = dd.get_sample_name(paired.tumor_data)
            purecn_args = [
                "PureCN.R",
                "--seed", "42",
                "--out", tx_out_base,
                "--rds", "%s.rds" % tx_out_base,
                "--sampleid", sample_name,
                "--genome", genome,
                "--vcf", vcf_file,
                "--tumor", cnr_file,
                "--segfile", seg_file,
                "--funsegmentation", "Hclust",
                "--maxnonclonal", "0.3",
            ]
            num_cores = dd.get_num_cores(paired.tumor_data)
            if num_cores > 1:
                purecn_args += ["--cores", str(num_cores)]
            try:
                shell_line = "export R_LIBS_USER=%s && %s && %s" % (
                    utils.R_sitelib(),
                    utils.get_R_exports(),
                    " ".join(str(arg) for arg in purecn_args))
                do.run(shell_line, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if not _allowed_errors(str(msg)):
                    logger.exception()
                    raise
                logger.info(
                    "PureCN failed to find solution for %s: skipping" %
                    dd.get_sample_name(paired.tumor_data))
                # Leave a marker so later calls skip the run.
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            # Move whatever outputs were produced out of the transaction directory.
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            for fname in all_files:
                tx_path = os.path.join(tx_dir, fname)
                if os.path.exists(tx_path):
                    shutil.move(tx_path, os.path.join(final_dir, fname))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    if out.get("rds") and os.path.exists(out["rds"]):
        return out
    return None
def _do_run(paired):
    """Perform Battenberg calling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.

    Returns:
        The Battenberg output mapping from ``_get_battenberg_out``, extended
        with ``plot`` and ``ignore`` entries.

    Raises:
        AssertionError: when expected outputs are still missing afterwards.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        # Battenberg resources live in a "battenberg" directory that is a
        # sibling of the directory holding the reference file.
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file, gl_file = _make_ignore_file(
            work_dir, ref_file, ignore_file, impute_info)
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender_codes = {"male": "XY", "female": "XX", "unknown": "L"}
        gender = gender_codes.get(population.get_gender(paired.tumor_data))
        gender_str = ("-ge %s -gl %s" % (gender, gl_file)
                      if gender == "L" else "-ge %s" % (gender))
        r_export_cmd = utils.get_R_exports()
        local_sitelib = utils.R_sitelib()
        cmd_template = (
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd} && "
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} {gender_str} "
            "-assembly {genome_build} -species Human -platform {platform}")
        cmd_line = cmd_template.format(
            local_sitelib=local_sitelib,
            r_export_cmd=r_export_cmd,
            cores=cores,
            work_dir=work_dir,
            ref_file=ref_file,
            tumor_bam=tumor_bam,
            normal_bam=normal_bam,
            bat_datadir=bat_datadir,
            ignore_file=ignore_file,
            gender_str=gender_str,
            genome_build=genome_build,
            platform=platform)
        do.run(cmd_line, "Battenberg CNV calling")
    assert len(_missing_files(out)) == 0, (
        "Missing Battenberg output: %s" % _missing_files(out))
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out