Example #1
def population_variant_regions(items):
    """Retrieve the variant region BED file from a population of items.

    If tumor/normal, return the tumor BED file. If a population, return
    the BED file covering the most bases.
    """
    import pybedtools
    if len(items) == 1:
        return dd.get_variant_regions(items[0]) or dd.get_sample_callable(
            items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            return dd.get_variant_regions(
                paired.tumor_data) or dd.get_sample_callable(paired.tumor_data)
        else:
            vrs = []
            for data in items:
                vr_bed = dd.get_variant_regions(
                    data) or dd.get_sample_callable(data)
                if vr_bed:
                    vrs.append(
                        (pybedtools.BedTool(vr_bed).total_coverage(), vr_bed))
            vrs.sort(reverse=True)
            if vrs:
                return vrs[0][1]
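
For a population without a tumor/normal pairing, the function ranks each BED file by the number of bases it covers and keeps the widest one. A minimal sketch of that selection pattern, with hypothetical file names and sizes standing in for pybedtools' total_coverage():

# hypothetical (covered_bases, path) pairs; in the real code the size comes
# from pybedtools.BedTool(path).total_coverage()
candidates = [(1200, "sample_a.bed"), (4800, "sample_b.bed"), (300, "sample_c.bed")]
candidates.sort(reverse=True)  # largest total base coverage first
best_bed = candidates[0][1] if candidates else None
assert best_bed == "sample_b.bed"
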
Example #2
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    if vrn_file and "gvcf" in dd.get_tools_on(data):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        return callable.sample_callable_bed(data["work_bam_callable"],
                                            dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #3
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
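
Compared with Example #2, the work_bam_callable branch here copies the sample dictionary before renaming the key, so the caller's input is never mutated. A toy sketch of that copy-then-pop pattern (plain copy.deepcopy standing in for utils.deepish_copy):

import copy

data = {"work_bam_callable": "sample-callable.bam", "align_bam": None}
local = copy.deepcopy(data)  # stand-in for utils.deepish_copy: don't mutate the caller's dict
local["work_bam"] = local.pop("work_bam_callable")
assert "work_bam_callable" in data  # the caller's view is unchanged
assert local["work_bam"] == "sample-callable.bam"
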
Example #4
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #5
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                             vrs or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
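
Examples #5 and #11 reference module-level GENOME_COV_THRESH and OFFTARGET_THRESH constants that are not part of the excerpt; Examples #27 and #30 inline them. A standalone sketch of the two-threshold decision, with illustrative values taken from Example #30:

GENOME_COV_THRESH = 0.40  # illustrative; see the inline comments in Example #30
OFFTARGET_THRESH = 0.10

def classify(genome_cov_pct, offtarget_pct, has_variant_regions=True):
    # Mirrors the branch order above: genome first, then regional vs. amplicon.
    if genome_cov_pct > GENOME_COV_THRESH:
        return "genome"
    if not has_variant_regions:
        return "regional"
    return "regional" if offtarget_pct > OFFTARGET_THRESH else "amplicon"

assert classify(0.95, 0.0) == "genome"     # whole genome run
assert classify(0.02, 0.25) == "regional"  # exome capture with off-target reads
assert classify(0.02, 0.01) == "amplicon"  # amplicon panel, almost no off-target
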
Example #6
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file,
                                   bam.is_paired(bam_fname),
                                   target_file, target_file, None, data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
Example #7
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has an equivalent to the -m 1 flag in bowtie;
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
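
The sambamba filter expression is assembled from a template string, splicing in a pairedness clause only for paired-end input. A minimal sketch of just that string construction:

base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '

def build_filter(is_paired):
    # paired-end input additionally requires reads to be properly paired
    paired_filter = "and paired and proper_pair" if is_paired else ""
    return base_filter.format(paired_filter=paired_filter)

print(build_filter(True))
# -F "[XS] == null and not unmapped and paired and proper_pair and not duplicate"
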
Example #8
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(
            hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None,
                                   data["config"])
        if utils.file_exists(hsmetric_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
Example #9
def prep_recal(data):
    """Do pre-BQSR recalibration, calculation of recalibration tables.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " %
                    str(dd.get_sample_name(data)))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"),
                               data)
        if not dbsnp_file:
            logger.info(
                "Skipping GATK BaseRecalibrator because no VCF file of known variants was found."
            )
            return data
        broad_runner = broad.runner_from_config(data["config"])
        data["prep_recal"] = _gatk_base_recalibrator(
            broad_runner, dd.get_align_bam(data), dd.get_ref_file(data),
            dd.get_platform(data), dbsnp_file,
            dd.get_variant_regions(data) or dd.get_sample_callable(data), data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " %
                    str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" %
                                  (dd.get_recalibrate(data)))
    return data
Example #10
def filter_multimappers(align_file, data):
    """
    Filter a BWA alignment file for uniquely mapped reads, following:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
Example #11
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                             vrs or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #12
def filter_multimappers(align_file, data):
    """
    Filter a BWA alignment file for uniquely mapped reads, following:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
Example #13
def get_sv_bed(data, method=None, out_dir=None, include_gene_names=True):
    """Retrieve a BED file of regions for SV and heterogeneity calling using the provided method.

    method choices:
      - exons: Raw BED file of exon regions
      - transcripts: Full collapsed regions with the min and max of each transcript.
      - transcriptsXXXX: Collapsed regions around transcripts with a window size of
        XXXX.
      - A custom BED file of regions
    """
    if method is None:
        method = (tz.get_in(["config", "algorithm", "sv_regions"], data) or dd.get_variant_regions(data)
                  or dd.get_sample_callable(data))
    gene_file = dd.get_gene_bed(data)
    if method and os.path.isfile(method):
        return method
    elif not gene_file or not method:
        return None
    elif method == "exons":
        return gene_file
    elif method.startswith("transcripts"):
        window = method.split("transcripts")[-1]
        window = int(float(window)) if window else 0
        return _collapse_transcripts(gene_file, window, data, out_dir, include_gene_names=include_gene_names)
    else:
        raise ValueError("Unexpected transcript retrieval method: %s" % method)
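
The transcriptsXXXX method encodes the window size in the method string itself, so "transcripts1e4" requests a 10,000 bp window around each transcript. How that suffix parsing behaves:

def transcript_window(method):
    # "transcripts" -> 0 bp window; "transcripts1e4" -> 10000 bp window
    window = method.split("transcripts")[-1]
    return int(float(window)) if window else 0

assert transcript_window("transcripts") == 0
assert transcript_window("transcripts1e4") == 10000
assert transcript_window("transcripts250") == 250
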
Example #14
def get_base_cnv_regions(data,
                         work_dir,
                         genome_default="transcripts1e4",
                         include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data,
                                      genome_default,
                                      work_dir,
                                      include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions,
                                                      base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(
                data) or dd.get_sample_callable(data)
    return bedutils.clean_file(base_regions, data)
Example #15
def get_sv_bed(data, method=None, out_dir=None, include_gene_names=True):
    """Retrieve a BED file of regions for SV and heterogeneity calling using the provided method.

    method choices:
      - exons: Raw BED file of exon regions
      - transcripts: Full collapsed regions with the min and max of each transcript.
      - transcriptsXXXX: Collapsed regions around transcripts with a window size of
        XXXX.
      - A custom BED file of regions
    """
    if method is None:
        method = (tz.get_in(["config", "algorithm", "sv_regions"], data)
                  or dd.get_variant_regions(data)
                  or dd.get_sample_callable(data))
    gene_file = dd.get_gene_bed(data)
    if method and os.path.isfile(method):
        return method
    elif not gene_file or not method:
        return None
    elif method == "exons":
        return gene_file
    elif method.startswith("transcripts"):
        window = method.split("transcripts")[-1]
        window = int(float(window)) if window else 0
        return _collapse_transcripts(gene_file,
                                     window,
                                     data,
                                     out_dir,
                                     include_gene_names=include_gene_names)
    else:
        raise ValueError("Unexpected transcript retrieval method: %s" % method)
Example #16
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has an equivalent to the -m 1 flag in bowtie;
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
Example #17
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from PureCN rds file."""
    # no solution - no signatures
    if not "rds" in out:
        return out
    rscript = utils.Rscript_cmd()
    purecndx_r = utils.R_package_script("PureCN", "extdata/Dx.R", env="base")
    simple_repeat_bed = dd.get_variation_resources(
        paired.tumor_data)["simple_repeat"]
    callable_bed = dd.get_sample_callable(paired.tumor_data)
    out_base = utils.splitext_plus(out["rds"])[0]
    mutation_burden_csv = out_base + "_mutation_burden.csv"
    if not utils.file_uptodate(mutation_burden_csv, out["rds"]):
        # no signatures - so we generate them
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            cmd = [
                rscript, purecndx_r, "--rds", out["rds"], "--callable",
                callable_bed, "--signatures", "--exclude", simple_repeat_bed,
                "--out", tx_out_base
            ]
            do.run(cmd, "PureCN Dx mutational burden and signatures")
            out_base, out, all_files = _get_purecn_dx_files(paired,
                                                            out,
                                                            require_exist=True)
            # only move files that were actually generated; missing ones don't reach the upload
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base),
                                               f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    return out
Example #18
def _get_variant_regions(data):
    # nested helper: ``merged`` is captured from the enclosing function's scope
    out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    if merged:
        merged_out = dd.get_variant_regions_merged(data)
        if merged_out:
            out = merged_out
        else:
            out = merge_overlaps(out, data)
    return out
Example #19
def _get_variant_regions(data):
    # nested helper: ``merged`` is captured from the enclosing function's scope
    out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    # Only need to merge for variant region inputs, not callable BED regions which don't overlap
    if merged and dd.get_variant_regions(data):
        merged_out = dd.get_variant_regions_merged(data)
        if merged_out:
            out = merged_out
        else:
            out = merge_overlaps(out, data)
    return out
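
Example #19 adds a guard so merging only happens for user-supplied variant regions, which may overlap; callable BEDs produced by the pipeline are already disjoint. A pure-Python toy merge (not the bcbio merge_overlaps implementation) showing what the merge step is for:

def merge_intervals(ivals):
    # collapse overlapping (start, end) intervals on a single chromosome
    merged = []
    for start, end in sorted(ivals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# Overlapping variant regions need merging...
assert merge_intervals([(0, 100), (50, 150)]) == [(0, 150)]
# ...while disjoint callable regions pass through unchanged.
assert merge_intervals([(0, 100), (200, 300)]) == [(0, 100), (200, 300)]
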
Example #20
def _maybe_add_callable(data, out):
    """Add callable and depth regions to output folder.
    """
    callable_bed = dd.get_sample_callable(data)
    if callable_bed:
        out.append({"path": callable_bed, "type": "bed", "ext": "callable"})
    perbase_bed = tz.get_in(["depth", "variant_regions", "per_base"], data)
    if perbase_bed:
        out.append({"path": perbase_bed, "type": "bed.gz", "ext": "depth-per-base"})
    return out
Example #21
def _get_variant_regions(data):
    # nested helper: ``merged`` is captured from the enclosing function's scope
    out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    # Only need to merge for variant region inputs, not callable BED regions which don't overlap
    if merged and dd.get_variant_regions(data):
        merged_out = dd.get_variant_regions_merged(data)
        if merged_out:
            out = merged_out
        else:
            out = merge_overlaps(out, data)
    return out
Example #22
def _maybe_add_callable(data, out):
    """Add callable and depth regions to output folder.
    """
    callable_bed = dd.get_sample_callable(data)
    if callable_bed:
        out.append({"path": callable_bed, "type": "bed", "ext": "callable"})
    perbase_bed = tz.get_in(["depth", "variant_regions", "per_base"], data)
    if perbase_bed:
        out.append({"path": perbase_bed, "type": "bed.gz", "ext": "depth-per-base"})
    return out
Example #23
def _prep_real_counts(bam_file, data, samtools_stats):
    out = {}

    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        bed = dd.get_coverage_merged(data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed = dd.get_variant_regions_merged(data) or dd.get_sample_callable(
            data)
        target_name = "variant_regions"
    else:
        bed = None
        target_name = "genome"

    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"),
                            True)

    if bed:
        out["Preseq_genome_size"] = pybedtools.BedTool(bed).total_coverage()
        out["Preseq_read_count"] = readstats.number_of_mapped_reads(
            data,
            bam_file,
            keep_dups=True,
            bed_file=bed,
            target_name=target_name)
        ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data,
                                                      bam_file)
        if dedupped:
            out["Preseq_unique_count"] = readstats.number_of_mapped_reads(
                data,
                bam_file,
                keep_dups=False,
                bed_file=bed,
                target_name=target_name)

        # Counting average on-target alignment length, based on the equation:
        #    avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size
        total_alignments = out.get(
            "Preseq_unique_count") or out["Preseq_read_count"]
        out["Preseq_read_length"] = ontrg_unique_depth * out[
            "Preseq_genome_size"] // total_alignments

    else:  # WGS
        out["Preseq_genome_size"] = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        out["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        out["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
        if dedupped:
            out["Preseq_unique_count"] = out["Preseq_read_count"] - int(
                samtools_stats["Duplicates"])

    return out
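
The Preseq_read_length value inverts the depth relation in the comment above: if avg depth ~ alignments * read length / target size, then read length ~ depth * target size / alignments. Worked through with hypothetical numbers:

target_size = 2_000_000        # bases covered by the BED file
unique_alignments = 1_000_000  # hypothetical on-target unique alignment count
avg_depth = 125.0              # hypothetical on-target unique depth

read_length = avg_depth * target_size // unique_alignments
assert read_length == 250.0
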
Example #24
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    if vrn_file and "gvcf" in dd.get_tools_on(data):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        return callable.sample_callable_bed(data["work_bam_callable"], dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #25
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from PureCN rds file.
    """
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    if not utils.file_uptodate(out["mutation_burden"], out["rds"]):
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            cmd = ["PureCN_Dx.R", "--rds", out["rds"], "--callable", dd.get_sample_callable(paired.tumor_data),
                   "--signatures", "--out", tx_out_base]
            do.run(cmd, "PureCN Dx mutational burden and signatures")
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    return out
Example #26
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from PureCN rds file.
    """
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    if not utils.file_uptodate(out["mutation_burden"], out["rds"]):
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            cmd = ["PureCN_Dx.R", "--rds", out["rds"], "--callable", dd.get_sample_callable(paired.tumor_data),
                   "--signatures", "--out", tx_out_base]
            do.run(cmd, "PureCN Dx mutational burden and signatures")
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    return out
Example #27
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # fraction of the genome covered to call a whole genome run
    offtarget_thresh = 0.05  # fraction of off-target reads needed to call capture (not amplicon) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stats_file = dd.get_offtarget_stats(data)
            if not offtarget_stats_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stats_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                if stats.get("offtarget") and stats["mapped_unique"]:
                    offtarget_pct = float(
                        stats["offtarget"]) / stats["mapped_unique"]
                else:
                    offtarget_pct = 0.0
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info(
            "%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
            % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
               offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #28
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4", include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data, genome_default, work_dir, include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    return bedutils.clean_file(base_regions, data)
Example #29
def _evaluate_vcf(calls, truth_vcf, work_dir, data):
    out_file = os.path.join(
        work_dir, "%s-sv-validate.csv" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(
                    ["sample", "caller", "vtype", "metric", "value"])
                for call in calls:
                    detail_dir = utils.safe_makedir(
                        os.path.join(work_dir, call["variantcaller"]))
                    for stats in _validate_caller_vcf(
                            call["vrn_file"], truth_vcf,
                            dd.get_sample_callable(data),
                            call["variantcaller"], detail_dir, data):

                        writer.writerow(stats)
    return out_file
Example #30
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # fraction of the genome covered to call a whole genome run
    offtarget_thresh = 0.10  # fraction of off-target reads needed to call capture (not amplicon) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                if float(stats["mapped"]) > 0:
                    offtarget_pct = stats["offtarget"] / float(stats["mapped"])
                else:
                    offtarget_pct = 0.0
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #31
def prep_recal(data):
    """Do pre-BQSR recalibration, calculation of recalibration tables.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data)))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return data
        broad_runner = broad.runner_from_config(data["config"])
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dd.get_align_bam(data),
                                                     dd.get_ref_file(data), dd.get_platform(data),
                                                     dbsnp_file,
                                                     dd.get_variant_regions(data) or dd.get_sample_callable(data),
                                                     data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data)))
    return data
Example #32
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from PureCN rds file."""
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    rscript = utils.Rscript_cmd("r36")
    purecndx_r = utils.R_package_script("r36", "PureCN", "extdata/Dx.R")
    simple_repeat_bed = dd.get_variation_resources(paired.tumor_data)["simple_repeat"]
    callable_bed = dd.get_sample_callable(paired.tumor_data)
    if not utils.file_uptodate(out["mutation_burden"], out["rds"]):
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            cmd = [rscript, purecndx_r, 
                   "--rds", out["rds"], 
                   "--callable", callable_bed,
                   "--signatures",
                   "--exclude", simple_repeat_bed,
                   "--out", tx_out_base]
            do.run(cmd, "PureCN Dx mutational burden and signatures")
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    return out
Example #33
def run(bam_file, data, out_dir):
    """Run coverage QC analysis
    """
    out = dict()

    out_dir = utils.safe_makedir(out_dir)
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(dd.get_coverage_merged(data),
                                              data,
                                              prefix="cov-",
                                              simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(
            data) or dd.get_sample_callable(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(target_name,
                                                       merged_bed_file, data,
                                                       out_dir)
    else:
        out_files = []

    out['Avg_coverage'] = avg_depth

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data,
                                  samtools_stats_dir)["metrics"]

    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    out['Duplicates'] = dups = int(samtools_stats["Duplicates"])

    if total_reads:
        out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        out['Duplicates_pct'] = 100.0 * dups / mapped

    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data,
                                                         bam_file,
                                                         keep_dups=False)
    out['Mapped_unique_reads'] = mapped_unique

    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(data,
                                                    bam_file,
                                                    keep_dups=False,
                                                    bed_file=merged_bed_file,
                                                    target_name=target_name)
        out["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                            ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(
                    out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data,
                    bam_file,
                    keep_dups=False,
                    bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [x for x in indexcov_files if x and utils.file_exists(x)]
    out = {"metrics": out}
    if len(out_files) > 0:
        out["base"] = out_files[0]
        out["secondary"] = out_files[1:]
    return out
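
The on-target and off-target percentages above always sum to 100% of uniquely mapped reads, while Usable_pct is taken against all reads. Illustrative numbers:

total_reads = 1_000_000
mapped_unique = 800_000
ontarget = 600_000

ontarget_pct = 100.0 * ontarget / mapped_unique                     # 75.0
offtarget_pct = 100.0 * (mapped_unique - ontarget) / mapped_unique  # 25.0
usable_pct = 100.0 * ontarget / total_reads                         # 60.0
assert ontarget_pct + offtarget_pct == 100.0
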
Example #34
def _evaluate_vcf(calls, truth_vcf, work_dir, data):
    out_file = os.path.join(work_dir, "%s-sv-validate.csv" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["sample", "caller", "vtype", "metric", "value"])
                for call in calls:
                    detail_dir = utils.safe_makedir(os.path.join(work_dir, call["variantcaller"]))
                    if call.get("vrn_file"):
                        for stats in _validate_caller_vcf(call["vrn_file"], truth_vcf, dd.get_sample_callable(data),
                                                          call["variantcaller"], detail_dir, data):

                            writer.writerow(stats)
    return out_file