Example #1
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                             vrs or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
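A note on the _count_offtarget helper used above, which is not shown on this page: a minimal sketch of the idea, reusing the readstats helpers from bcbio.bam that appear in Example #34 below (illustrative, not the verified bcbio implementation):

from bcbio.bam import readstats

def _count_offtarget(data, bam_file, bed_file, target_name):
    """Fraction of uniquely mapped reads falling outside the target BED (sketch)."""
    # Assumes readstats.number_of_mapped_reads accepts an optional bed_file to
    # restrict counting; returns 0.0 when nothing mapped to avoid dividing by zero.
    mapped_unique = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False)
    on_target = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False,
                                                 bed_file=bed_file, target_name=target_name)
    if mapped_unique:
        return float(mapped_unique - on_target) / mapped_unique
    return 0.0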
Example #2
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file) or _out_of_date(out_file):
        config_script = os.path.realpath(utils.which("configManta.py"))
        cmd = [utils.get_program_python("configManta.py"), config_script]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        for region in _maybe_limit_chromosomes(data):
            cmd += ["--region", region]
        resources = config_utils.get_resources("manta", data["config"])
        if resources.get("options"):
            cmd += [str(x) for x in resources["options"]]
        # If we are removing polyX, avoid calling on small indels which require
        # excessively long runtimes on noisy WGS runs
        if "polyx" in dd.get_exclude_regions(data):
            cmd += ["--config", _prep_streamlined_config(config_script, work_dir)]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
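The _out_of_date check above is also not defined here. One hedged sketch, assuming staleness means the generated runWorkflow.py references install paths that no longer exist, so an upgraded or relocated Manta install triggers a reconfigure:

import os

def _out_of_date(rw_file):
    """Flag a generated Manta workflow script as stale (sketch, assumed behavior)."""
    # The configure step writes absolute sys.path.append(...) lines into
    # runWorkflow.py; if any referenced path has disappeared, reconfigure.
    with open(rw_file) as in_handle:
        for line in in_handle:
            if line.strip().startswith("sys.path.append"):
                path = line.split("(")[-1].strip().rstrip(")").strip("'\"")
                if path and not os.path.exists(path):
                    return True
    return False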
Example #3
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Returns None if we're not doing downsampling.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    params = {"min_coverage_for_downsampling": 10,
              "maxcov_downsample_multiplier": dd.get_maxcov_downsample(data)}
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if num_reads and params["maxcov_downsample_multiplier"] and params["maxcov_downsample_multiplier"] > 0:
        vrs = dd.get_variant_regions_merged(data)
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
            genome_cov_pct = callable_size / float(total_size)
        else:
            callable_size = total_size
            genome_cov_pct = 1.0
        if (genome_cov_pct > coverage.GENOME_COV_THRESH
              and dd.get_coverage_interval(data) in ["genome", None, False]):
            total_counts, total_sizes = 0, 0
            for count, size in bwa.fastq_size_output(fastq_file, 5000):
                total_counts += int(count)
                total_sizes += (int(size) * int(count))
            read_size = float(total_sizes) / float(total_counts)
            avg_cov = float(num_reads * read_size) / callable_size
            if avg_cov >= params["min_coverage_for_downsampling"]:
                return int(avg_cov * params["maxcov_downsample_multiplier"])
    return None
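To make the downsampling arithmetic concrete, here are illustrative numbers for a 30x whole genome with a multiplier of 10:

num_reads = 600000000         # total reads from the grabix index
read_size = 150.0             # average read length from the sampled fastq
callable_size = 3000000000.0  # callable bases (whole genome here)
multiplier = 10               # dd.get_maxcov_downsample(data)

avg_cov = num_reads * read_size / callable_size  # 30.0x average coverage
max_cov = int(avg_cov * multiplier)              # reads above 300x depth get downsampled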
Example #4
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(shared.subset_bed_by_chrom(want_bedtool.saveas().fn,
                                                                             chrom, items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True).saveas()
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(shared.remove_highdepth_regions(want_bedtool.saveas().fn, items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool, nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
Example #5
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough data
    to be useful for index-only summaries.
    """
    if not dd.get_coverage_interval(data) == "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = [os.path.join(out_dir, "%s-indexcov.%s" % (dd.get_sample_name(data), ext))
                 for ext in ["roc", "ped", "bed.gz"]]
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [x.name for x in ref.file_contigs(dd.get_ref_file(data)) if chromhacks.is_sex(x.name)]
            gender_args = "--sex %s" % (",".join(gender_chroms)) if gender_chroms else ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(**locals()), "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                if not ("indexcov: no usable" in str(msg) or
                        ("indexcov: expected" in str(msg) and "sex chromosomes, found:" in str(msg))):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [x for x in out_files if utils.file_exists(x)]
Example #6
def segment_from_cnr(cnr_file, data, out_base):
    """Provide segmentation on a cnr file, used in external PureCN integration.
    """
    cns_file = _cnvkit_segment(cnr_file, dd.get_coverage_interval(data),
                               data, [data], out_file="%s.cns" % out_base, detailed=True)
    out = _add_seg_to_output({"cns": cns_file}, data, enumerate_chroms=False)
    return out["seg"]
Example #7
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data,
                                                        work_dir, pad=1e4)

            raw_target_bed = bedutils.merge_overlaps(base_regions, data,
                                                     out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)

            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}

            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]

            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
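The _get_antitarget_size helper above tunes CNVkit bin sizes for subset genomes and tests. A hypothetical sketch of that idea, shrinking bins when the accessible gaps between targets are smaller than a typical antitarget bin (the constants are illustrative and the real logic may differ):

import pybedtools

def _get_antitarget_size(access_file, target_bed):
    """Derive smaller antitarget/target bin sizes for small genomes (sketch)."""
    gaps = pybedtools.BedTool(access_file).subtract(pybedtools.BedTool(target_bed))
    max_gap = max((g.stop - g.start for g in gaps), default=0)
    default_antitarget_avg = 150000  # illustrative stand-in for CNVkit's default
    if 0 < max_gap < default_antitarget_avg:
        at_avg = max_gap // 2
        at_min = max(at_avg // 4, 500)
        t_avg = min(at_avg, 1000)
        return at_avg, at_min, t_avg
    return None, None, None  # None keeps CNVkit's built-in defaults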
Example #8
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))

    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cns" % out_base, "cns": "%s.cns" % out_base, "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval, raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(
            _cnvkit_coverage,
            [
                (bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                for bam in test_bams + background_bams
                for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)
            ],
            data["config"],
            parallel,
        )
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn,
            target_bed,
            antitarget_bed,
            data,
        )
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [
                (cnns, background_cnn, data)
                for cnns in tz.groupby("bam", [x for x in coverage_cnns if x["itype"] == "evaluate"]).values()
            ],
            data["config"],
            parallel,
        )
        called_segs = run_multicore(
            _cnvkit_segment, [(cnr, cov_interval, data) for cnr in fixed_cnrs], data["config"], parallel
        )
    return ckouts
Example #9
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    utils.safe_makedir(tx_work_dir)
    cmd = [sys.executable, os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % _get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file),
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
    if dd.get_coverage_interval(paired.tumor_data) not in ["genome"]:
        cmd += ["--targeted"]
    do.run(cmd, "Configure Strelka2 germline calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #10
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    utils.safe_makedir(tx_work_dir)
    cmd = [sys.executable, os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py"))]
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % _get_region_bed(region, items, out_file),
            "--ploidy=%s" % _get_ploidy(shared.to_multiregion(region), items, out_file),
            "--runDir=%s" % tx_work_dir]
    cmd += ["--bam=%s" % b for b in align_bams]
    if any(dd.get_coverage_interval(d) not in ["genome"] for d in items):
        cmd += ["--targeted"]
    do.run(cmd, "Configure Strelka2 germline calling: %s" % (", ".join([dd.get_sample_name(d) for d in items])))
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #11
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                                background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                                background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts
Example #12
def _is_targeted_region(cur_bed, data):
    """Calculate if we should process region as a targeted or WGS.

    Currently always based on total coverage interval, as that validates best and
    is consistent between CWL (larger blocks) and non-CWL runs (smaller blocks).
    We can check core usage and provide a consistent report when moving to CWL
    exclusively.
    """
    cores = dd.get_num_cores(data)
    if cores > 0:  # Apply to all core setups now for consistency
        return dd.get_coverage_interval(data) not in ["genome"]
    else:
        return coverage_interval_from_bed(cur_bed, per_chrom=False) == "targeted"
Example #13
def _add_segmetrics_to_output(out, data):
    """Add metrics for measuring reliability of CNV estimates.
    """
    out_file = "%s-segmetrics.txt" % os.path.splitext(out["cns"])[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "segmetrics",
                   "--ci", "--pi",
                   "-s", out["cns"], "-o", tx_out_file, out["cnr"]]
            if dd.get_coverage_interval(data) != "genome":
                cmd += ["--alpha", "0.001", "--bootstrap", "2000"]
            do.run(cmd, "CNVkit segmetrics")
    out["segmetrics"] = out_file
    return out
Example #14
def prepare_intervals(data, region_file, work_dir):
    """Prepare interval regions for targeted and gene based regions.
    """
    target_file = os.path.join(work_dir, "%s-target.interval_list" % dd.get_sample_name(data))
    if not utils.file_uptodate(target_file, region_file):
        with file_transaction(data, target_file) as tx_out_file:
            params = ["-T", "PreprocessIntervals", "-R", dd.get_ref_file(data),
                      "--interval-merging-rule", "OVERLAPPING_ONLY",
                      "-O", tx_out_file]
            if dd.get_coverage_interval(data) == "genome":
                params += ["--bin-length", "1000", "--padding", "0"]
            else:
                params += ["-L", region_file, "--bin-length", "0", "--padding", "250"]
            _run_with_memory_scaling(params, tx_out_file, data)
    return target_file
Example #15
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                      inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
Example #16
def identify(data):
    """Identify high depth regions in the alignment file for potential filtering.
    """
    high_multiplier = 20
    sample_size = int(1e6)
    high_percentage = 25.0
    min_coverage = 10
    window_size = 250
    work_bam, out_file, stats_file = _get_files(data)
    if not os.path.exists(out_file) and dd.get_coverage_interval(data) == "genome":
        cores = dd.get_num_cores(data)
        with file_transaction(data, out_file) as tx_out_file:
            tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = ("sambamba depth window -t {cores} -c {min_coverage} "
                   "--window-size {window_size} {work_bam} "
                   "| head -n {sample_size} "
                   """| cut -f 5 | {py_cl} -l 'numpy.median([float(x) for x in l if not x.startswith("mean")])'""")
            median_depth_out = subprocess.check_output(cmd.format(**locals()), shell=True)
            try:
                median_cov = float(median_depth_out)
            except ValueError:
                logger.info("Skipping high coverage region detection; problem calculating median depth: %s" %
                            median_depth_out)
                median_cov = None
            if median_cov and not numpy.isnan(median_cov):
                high_thresh = int(high_multiplier * median_cov)
                cmd = ("sambamba depth window -t {cores} -c {median_cov} "
                       "--window-size {window_size} -T {high_thresh} {work_bam} "
                       "| {py_cl} -fx 'float(x.split()[5]) >= {high_percentage} "
                       """if not x.startswith("#") else None' """
                       "| cut -f 1-3,7 > {tx_raw_file} ")
                do.run(cmd.format(**locals()), "Identify high coverage regions")
                with open(stats_file, "w") as out_handle:
                    yaml.safe_dump({"median_cov": median_cov}, out_handle,
                                   allow_unicode=False, default_flow_style=False)
            else:
                with open(tx_raw_file, "w") as out_handle:
                    out_handle.write("")
            if utils.file_exists(tx_raw_file):
                cmd = "bedtools merge -i {tx_raw_file} -c 4 -o distinct > {tx_out_file}"
                do.run(cmd.format(**locals()), "Clean up raw coverage file")
            else:
                shutil.move(tx_raw_file, tx_out_file)
    return out_file if os.path.exists(out_file) else None
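The constants at the top translate into concrete cutoffs. With a sampled median window depth of 30x, for example:

median_cov = 30.0      # median window depth sampled via sambamba above
high_multiplier = 20
high_percentage = 25.0

high_thresh = int(high_multiplier * median_cov)  # 600x counts as "high depth"
# A 250bp window is reported when at least 25% of its positions reach 600x;
# adjacent flagged windows are then merged with bedtools merge.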
Example #17
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]

    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)

            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            # cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
            #             len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_bed] + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                            "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
Example #18
def _get_vqsr_annotations(filter_type, data):
    """Retrieve appropriate annotations to use for VQSR based on filter type.

    Issues reported with MQ and the bwa-mem quality distribution result in intermittent
    failures when using VQSR:
    http://gatkforums.broadinstitute.org/discussion/4425/variant-recalibration-failing
    http://gatkforums.broadinstitute.org/discussion/4248/variantrecalibrator-removing-all-snps-from-the-training-set
    """
    if filter_type == "SNP":
        # MQ, MQRankSum
        anns = ["QD", "FS", "ReadPosRankSum", "SOR"]
    else:
        assert filter_type == "INDEL"
        # MQRankSum
        anns = ["QD", "FS", "ReadPosRankSum", "SOR"]
    if dd.get_coverage_interval(data) == "genome":
        anns += ["DP"]
    return anns
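Usage sketch (data here is a hypothetical sample dictionary): the SNP and INDEL annotation sets end up identical because MQ and MQRankSum are dropped for both, and DP is appended only for whole genome runs.

anns = _get_vqsr_annotations("SNP", data)
# -> ["QD", "FS", "ReadPosRankSum", "SOR", "DP"] when coverage_interval is "genome"
# -> ["QD", "FS", "ReadPosRankSum", "SOR"] for targeted runs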
Example #19
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Handles new style cases where we have pre-normalized inputs and
    old cases where we run CNVkit individually.
    """
    if tz.get_in(["depth", "bins", "normalized"], inputs[0]):
        ckouts = []
        for data in inputs:
            cnr_file = tz.get_in(["depth", "bins", "normalized"], data)
            cns_file = os.path.join(_sv_workdir(data), "%s.cns" % dd.get_sample_name(data))
            cns_file = _cnvkit_segment(cnr_file, dd.get_coverage_interval(data), data,
                                       inputs + backgrounds, cns_file)
            ckouts.append({"cnr": cnr_file, "cns": cns_file,
                           "background": tz.get_in(["depth", "bins", "background"], data)})
        return ckouts
    else:
        return _run_cnvkit_shared_orig(inputs, backgrounds)
Example #20
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4", include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data, genome_default, work_dir, include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return bedutils.clean_file(base_regions, data)
Example #21
def get_base_cnv_regions(data, work_dir):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return base_regions
Example #22
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file):
        cmd = [sys.executable, utils.which("configManta.py")]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
Example #23
def prepare_intervals(data, region_file, work_dir):
    """Prepare interval regions for targeted and gene based regions.
    """
    target_file = os.path.join(
        work_dir, "%s-target.interval_list" % dd.get_sample_name(data))
    if not utils.file_uptodate(target_file, region_file):
        with file_transaction(data, target_file) as tx_out_file:
            params = [
                "-T", "PreprocessIntervals", "-R",
                dd.get_ref_file(data), "--interval-merging-rule",
                "OVERLAPPING_ONLY", "-O", tx_out_file
            ]
            if dd.get_coverage_interval(data) == "genome":
                params += ["--bin-length", "1000", "--padding", "0"]
            else:
                params += [
                    "-L", region_file, "--bin-length", "0", "--padding", "250"
                ]
            _run_with_memory_scaling(params, tx_out_file, data)
    return target_file
Example #24
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # fraction of genome covered to consider a run whole genome
    offtarget_thresh = 0.10  # fraction of off-target reads above which a run is capture (not amplicon) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                if float(stats["mapped"]) > 0:
                    offtarget_pct = stats["offtarget"] / float(stats["mapped"])
                else:
                    offtarget_pct = 0.0
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #25
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(
                data,
                dd.get_align_bam(data) or dd.get_work_bam(data), vrs
                or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info(
            "%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
            % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
               offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #26
def _configure_germline(align_bams, items, ref_file, region, out_file,
                        tx_work_dir):
    utils.safe_makedir(tx_work_dir)
    cmd = [
        sys.executable,
        os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py"))
    ]
    cmd += [
        "--referenceFasta=%s" % ref_file,
        "--callRegions=%s" % get_region_bed(region, items, out_file),
        "--ploidy=%s" %
        _get_ploidy(shared.to_multiregion(region), items, out_file),
        "--runDir=%s" % tx_work_dir
    ]
    cmd += ["--bam=%s" % b for b in align_bams]
    if any(dd.get_coverage_interval(d) not in ["genome"] for d in items):
        cmd += ["--targeted"]
    do.run(
        cmd, "Configure Strelka2 germline calling: %s" %
        (", ".join([dd.get_sample_name(d) for d in items])))
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #27
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, six.string_types) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = ("Region must be a tuple - something odd just happened")
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        if any(dd.get_coverage_interval(x) == "genome" for x in items):
            target_bed = shared.remove_lcr_regions(target_bed, items)
        return ["--bed", target_bed]
    else:
        return []
Example #28
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        raw_coverage_cnns = [_cnvkit_coverage(cdata, bed, itype) for itype, cdata in samples_to_run
                             for bed in [target_bed, antitarget_bed]]
        coverage_cnns = reduce(operator.add,
                               [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                      inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data) for cnr, data in fixed_cnrs]
    return ckouts
Example #29
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which(
        "configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file) or _out_of_date(out_file):
        config_script = os.path.realpath(utils.which("configManta.py"))
        cmd = [sys.executable, config_script]
        if paired:
            if paired.normal_bam:
                cmd += [
                    "--normalBam=%s" % paired.normal_bam,
                    "--tumorBam=%s" % paired.tumor_bam
                ]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += [
            "--referenceFasta=%s" % dd.get_ref_file(data),
            "--runDir=%s" % work_dir
        ]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        for region in _maybe_limit_chromosomes(data):
            cmd += ["--region", region]
        resources = config_utils.get_resources("manta", data["config"])
        if resources.get("options"):
            cmd += [str(x) for x in resources["options"]]
        # If we are removing polyX, avoid calling on small indels which require
        # excessively long runtimes on noisy WGS runs
        if "polyx" in dd.get_exclude_regions(data):
            cmd += [
                "--config",
                _prep_streamlined_config(config_script, work_dir)
            ]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
Example #30
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file) or _out_of_date(out_file):
        cmd = [sys.executable, os.path.realpath(utils.which("configManta.py"))]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        for region in _maybe_limit_chromosomes(data):
            cmd += ["--region", region]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
Example #31
def precall(items):
    """Perform initial pre-calling steps -- coverage calcuation by sample.

    Use sambamba to call average region coverage in regions, and convert into a correct format.
    """
    items = [utils.to_single_data(x) for x in items]
    assert len(items) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(items)
    assert dd.get_coverage_interval(data) != "genome", "Seq2C only for amplicon and exome sequencing"

    work_dir = _sv_workdir(data)
    bed_file = _prep_bed(data, work_dir)
    bam_file = dd.get_align_bam(data)
    sample_name = dd.get_sample_name(data)

    cov_file = _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name)

    if "sv" not in data:
        data["sv"] = []
    data["sv"].append({"variantcaller": "seq2c",
                       "coverage": cov_file})
    return [data]
Example #32
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Returns None if we're not doing downsampling.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    params = {
        "min_coverage_for_downsampling": 10,
        "maxcov_downsample_multiplier": dd.get_maxcov_downsample(data)
    }
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if num_reads and params["maxcov_downsample_multiplier"] and params[
            "maxcov_downsample_multiplier"] > 0:
        vrs = dd.get_variant_regions_merged(data)
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
            genome_cov_pct = callable_size / float(total_size)
        else:
            callable_size = total_size
            genome_cov_pct = 1.0
        if (genome_cov_pct > coverage.GENOME_COV_THRESH
                and dd.get_coverage_interval(data) in ["genome", None, False]):
            total_counts, total_sizes = 0, 0
            for count, size in bwa.fastq_size_output(fastq_file, 5000):
                total_counts += int(count)
                total_sizes += (int(size) * int(count))
            read_size = float(total_sizes) / float(total_counts)
            avg_cov = float(num_reads * read_size) / callable_size
            if avg_cov >= params["min_coverage_for_downsampling"]:
                return int(avg_cov * params["maxcov_downsample_multiplier"])
    return None
Example #33
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # fraction of genome covered to consider a run whole genome
    offtarget_thresh = 0.10  # fraction of off-target reads above which a run is capture (not amplicon) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                offtarget_pct = stats["offtarget"] / float(stats["mapped"])
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #34
def _prep_real_counts(bam_file, data, samtools_stats):
    out = {}

    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        bed = dd.get_coverage_merged(data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed = None
        target_name = "genome"

    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)

    if bed:
        out["Preseq_genome_size"] = pybedtools.BedTool(bed).total_coverage()
        out["Preseq_read_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
        ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file)
        if dedupped:
            out["Preseq_unique_count"] = readstats.number_of_mapped_reads(
                data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name)

        # Counting average on-target alignment length, based on the equation:
        #    avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size
        total_alignments = out.get("Preseq_unique_count") or out["Preseq_read_count"]
        out["Preseq_read_length"] = ontrg_unique_depth * out["Preseq_genome_size"] // total_alignments

    else:  # WGS
        out["Preseq_genome_size"] = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        out["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        out["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
        if dedupped:
            out["Preseq_unique_count"] = out["Preseq_read_count"] - int(samtools_stats["Duplicates"])

    return out
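Rearranging the commented depth relation gives read_len ~ avg_depth * target_size / alignments. With illustrative numbers:

ontrg_unique_depth = 100.0  # average on-target depth
genome_size = 1500000       # Preseq_genome_size: bases covered by the capture BED
unique_count = 1000000      # Preseq_unique_count: deduplicated on-target alignments

read_length = ontrg_unique_depth * genome_size // unique_count  # 150.0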
Example #35
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0],
                                     "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file +
                                                                 ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], items[0]),
                items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(
                    shared.subset_bed_by_chrom(want_bedtool.saveas().fn, chrom,
                                               items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(
                    sv_exclude_bed, nonamecheck=True).saveas()
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(
                    shared.remove_highdepth_regions(want_bedtool.saveas().fn,
                                                    items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(
                    tz.get_in(["reference", "fasta", "base"], items[0]),
                    items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool,
                                          nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
Example #36
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(
        os.path.join(work_dir, "run_ploidy%s" % ploidy))

    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = (
                    "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                    "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir}"
                )
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(
                    cmd.format(**locals()),
                    "TitanCNA CNV detection: ploidy %s, cluster %s" %
                    (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(
                    os.path.join(tmp_dir, "Rplots.pdf"),
                    os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
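
Callers typically sweep this wrapper over a grid of candidate ploidies and cluster counts, then pick the best solution downstream. A hypothetical driver loop; the grid values and the `cn_file`/`het_file` inputs from earlier steps are assumptions:

# Hypothetical sweep; each combination writes its own
# run_ploidy<N>/<sample>_cluster<NN> output directory.
ploidy_dirs = []
for ploidy in [2, 3, 4]:            # assumed candidate ploidies
    for num_clusters in [1, 2, 3]:  # assumed candidate cluster counts
        ploidy_dirs.append(_run_titancna(cn_file, het_file, ploidy,
                                         num_clusters, work_dir, data))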
Example #37
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data,
                                                        bam_file,
                                                        keep_dups=False)
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped

        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data),
                                      data,
                                      prefix="cov-",
                                      simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        elif dd.get_coverage_interval(data) != "genome":
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"
        else:
            merged_bed_file = None
            target_name = "genome"

        if merged_bed_file:
            ontarget = sambamba.number_mapped_reads_on_target(
                data,
                merged_bed_file,
                bam_file,
                keep_dups=False,
                target_name=target_name)
            if mapped_unique:
                out["Ontarget_unique_reads"] = ontarget
                out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
                out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                                ontarget) / mapped_unique
                padded_bed_file = bedutils.get_padded_bed_file(
                    merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_mapped_reads_on_target(
                    data,
                    padded_bed_file,
                    bam_file,
                    keep_dups=False,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
            if total_reads:
                out['Usable_pct'] = 100.0 * ontarget / total_reads

        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file,
                                            target_name)
        out['Avg_coverage'] = avg_coverage

    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #    annotated = cov.decorate_problem_regions(priority, problem_regions)

    return out
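
The percentage metrics above reduce to simple ratios over unique mapped and total reads; a toy calculation with invented counts:

# Invented counts showing how the on/off-target percentages are derived.
mapped_unique = 80_000_000
ontarget = 60_000_000
total_reads = 100_000_000
ontarget_pct = 100.0 * ontarget / mapped_unique                      # 75.0
offtarget_pct = 100.0 * (mapped_unique - ontarget) / mapped_unique   # 25.0
usable_pct = 100.0 * ontarget / total_reads                          # 60.0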
Example #38
0
def run(bam_file, data, out_dir):
    """Run coverage QC analysis
    """
    out = dict()

    out_dir = utils.safe_makedir(out_dir)
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(dd.get_coverage_merged(data),
                                              data,
                                              prefix="cov-",
                                              simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(target_name,
                                                       merged_bed_file, data,
                                                       out_dir)
    else:
        out_files = []

    out['Avg_coverage'] = avg_depth

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data,
                                  samtools_stats_dir)["metrics"]

    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    out['Duplicates'] = dups = int(samtools_stats["Duplicates"])

    if total_reads:
        out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        out['Duplicates_pct'] = 100.0 * dups / mapped

    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data,
                                                         bam_file,
                                                         keep_dups=False)
    out['Mapped_unique_reads'] = mapped_unique

    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(data,
                                                    bam_file,
                                                    keep_dups=False,
                                                    bed_file=merged_bed_file,
                                                    target_name=target_name)
        out["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                            ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip the padded calculation for WGS even if a "coverage" file is
                # specified: the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(
                    out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data,
                    bam_file,
                    keep_dups=False,
                    bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [x for x in indexcov_files if x and utils.file_exists(x)]
    out = {"metrics": out}
    if len(out_files) > 0:
        out["base"] = out_files[0]
        out["secondary"] = out_files[1:]
    return out
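
`get_padded_bed_file` widens each target by a fixed flank (200bp here) so near-target reads still count. A rough pybedtools equivalent, assuming a chromosome-sizes file so `slop` can clip padding at contig ends; the bcbio helper itself is not shown here:

import pybedtools

# Toy target padded by 200bp on each side; "genome.chrom.sizes" is an assumed
# two-column file of contig lengths used to clip padding at chromosome ends.
targets = pybedtools.BedTool("chr1\t1000\t2000\n", from_string=True)
padded = targets.slop(b=200, g="genome.chrom.sizes")
print(padded)  # chr1 800-2200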
Example #39
0
def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon"
            or (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))
Example #40
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return

    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out

    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0

    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)

    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file, target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth

    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir,
                                                              extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))

    return out
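
The extra cutoff handed to the detailed region stats is 80% of average depth, floored at 1; for example:

# Invented average depth: 42.7x yields an extra coverage cutoff of 34x.
avg_depth = 42.7
extra_cutoffs = set([max(1, int(avg_depth * 0.8))])  # {34}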
Example #41
0
def _get_max_depth(average_coverage, params, data):
    """Calculate maximum depth based on a rough multiplier of average coverage.
    """
    if dd.get_coverage_interval(data) == "genome":
        avg_cov = max(30.0, average_coverage)
        return avg_cov * params["high_multiplier"]
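
A worked toy example of this multiplier logic for the genome branch; the `high_multiplier` value is an assumption:

# WGS: average coverage is floored at 30x before applying the multiplier.
params = {"high_multiplier": 10}                  # assumed multiplier
avg_cov = max(30.0, 22.0)                         # low-coverage run -> 30.0
max_depth = avg_cov * params["high_multiplier"]   # -> 300.0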
Example #42
0
def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon"
            or not dd.get_mark_duplicates(data))
Example #43
0
def _skip_duplicates(data):
    return dd.get_coverage_interval(data) == "amplicon" or not dd.get_mark_duplicates(data)
Example #44
0
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input),
                                                 cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base, "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = (list(zip(["background"] * len(backgrounds), backgrounds)) +
                          list(zip(["evaluate"] * len(inputs), inputs)))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"],
                                       inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_general_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_original_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add, [
                _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                inputs + backgrounds)
                for cnns in tz.groupby("bam", raw_coverage_cnns).values()
            ])
            background_cnn = cnvkit_background(
                _select_background_cnns(coverage_cnns), background_cnn, inputs,
                target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([
                x["file"] for x in coverage_cnns if x["itype"] == "background"
            ], background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs, ckouts) for cnns in tz.groupby(
                "bam", [x for x in coverage_cnns
                        if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        [
            _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds)
            for cnr, data in fixed_cnrs
        ]
    return ckouts
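
The `tz.groupby("bam", ...)` calls bundle the per-sample coverage records by BAM path; toolz treats a non-callable key as a member lookup. A toy illustration with invented records:

import toolz as tz

cnns = [{"bam": "a.bam", "itype": "evaluate", "file": "a.targetcoverage.cnn"},
        {"bam": "a.bam", "itype": "evaluate", "file": "a.antitargetcoverage.cnn"},
        {"bam": "b.bam", "itype": "background", "file": "b.targetcoverage.cnn"}]
# Non-callable key "bam" groups the dicts by that member.
print(sorted(tz.groupby("bam", cnns)))  # ['a.bam', 'b.bam']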
Example #45
0
def _run_cnvkit_shared(items,
                       test_bams,
                       background_bams,
                       work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))

    background_cnn = os.path.join(
        raw_work_dir,
        "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = os.path.splitext(
            os.path.basename(test_bam))[0].split(".")[0]
        ckouts.append({
            "cnr": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(
            cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {
            "type": "local",
            "cores": dd.get_cores(data),
            "progs": ["cnvkit"]
        }
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed,
                                                     access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        coverage_cnns = run_multicore(
            _cnvkit_coverage,
            [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
             for bam in test_bams + background_bams
             for bed in [target_bed, antitarget_bed]], data["config"],
            parallel)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(_cnvkit_fix, [
            (cnns, background_cnn, data)
            for cnns in tz.groupby(lambda x: x[
                "bam"], [x for x in coverage_cnns
                         if x["itype"] == "evaluate"]).values()
        ], data["config"], parallel)
        called_segs = run_multicore(_cnvkit_segment, [(cnr, cov_interval, data)
                                                      for cnr in fixed_cnrs],
                                    data["config"], parallel)
    return ckouts
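
The coverage step fans out over every BAM x BED combination through the nested comprehension above; a small sketch of the argument pairs it produces, with invented paths:

# Each BAM is paired with both the target and antitarget BED.
test_bams, background_bams = ["t1.bam"], ["n1.bam"]
target_bed, antitarget_bed = "targets.bed", "antitargets.bed"
pairs = [(bam, bed)
         for bam in test_bams + background_bams
         for bed in [target_bed, antitarget_bed]]
# [('t1.bam', 'targets.bed'), ('t1.bam', 'antitargets.bed'),
#  ('n1.bam', 'targets.bed'), ('n1.bam', 'antitargets.bed')]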
Example #46
0
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(
        os.path.join(work_dir, "run_ploidy%s" % ploidy))

    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = (
                    "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                    "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} "
                    "--libdir None")
                chroms = [
                    "'%s'" % c.name.replace("chr", "")
                    for c in ref.file_contigs(dd.get_ref_file(data))
                    if chromhacks.is_autosomal_or_x(c.name)
                ]
                if "'X'" not in chroms:
                    chroms += ["'X'"]
                # Use UCSC style naming for human builds to support BSgenome
                genome_build = ("hg19" if dd.get_genome_build(data) in ["GRCh37", "hg19"]
                                else dd.get_genome_build(data))
                cmd += """ --chrs "c(%s)" """ % ",".join(chroms)
                cmd += " --genomeBuild {genome_build}"
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                if data["genome_build"] in ["hg38"]:
                    data_dir = os.path.normpath(
                        os.path.join(
                            os.path.dirname(
                                os.path.realpath(
                                    os.path.join(
                                        os.path.dirname(utils.Rscript_cmd()),
                                        "titanCNA.R"))), os.pardir, os.pardir,
                            "data"))
                    cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                    assert os.path.exists(cytoband_file), cytoband_file
                    cmd += " --cytobandFile %s" % cytoband_file
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(
                    cmd.format(**locals()),
                    "TitanCNA CNV detection: ploidy %s, cluster %s" %
                    (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(
                    os.path.join(tmp_dir, "Rplots.pdf"),
                    os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
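
The `--chrs` argument is assembled as an R vector literal; a quick sketch of the string that join produces:

# Building the R c(...) vector handed to titanCNA.R, as in the command above.
chroms = ["'%s'" % c for c in ["1", "2", "X"]]
print(' --chrs "c(%s)" ' % ",".join(chroms))  # --chrs "c('1','2','X')"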
Example #47
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return

    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out

    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(
            samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0

    if dd.get_coverage(data):
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data),
                                           data,
                                           prefix="cov-",
                                           simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False)

    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(data,
                                                   bam_file,
                                                   keep_dups=False,
                                                   bed_file=merged_bed_file,
                                                   target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                            ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip the padded calculation for WGS even if a "coverage" file is
                # specified: the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(
                    merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data,
                    bam_file,
                    keep_dups=False,
                    bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file,
                                         target_name)
    out['Avg_coverage'] = avg_depth

    region_coverage_file = cov.coverage_region_detailed_stats(
        data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))

    return out
Example #48
0
def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon" or
            (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))
Example #49
0
def _get_max_depth(average_coverage, params, data):
    """Calculate maximum depth based on a rough multiplier of average coverage.
    """
    if dd.get_coverage_interval(data) == "genome":
        avg_cov = min(30.0, average_coverage)
        return avg_cov * params["high_multiplier"]