Example #1
def _segment_normalized_gatk(cnr_file, work_dir, paired):
    """Segmentation of normalized inputs using GATK4, converting into standard input formats.
    """
    work_dir = utils.safe_makedir(os.path.join(work_dir, "gatk-cnv"))
    seg_file = gatkcnv.model_segments(cnr_file, work_dir, paired)["seg"]
    std_seg_file = seg_file.replace(".cr.seg", ".seg")
    if not utils.file_uptodate(std_seg_file, seg_file):
        with file_transaction(std_seg_file) as tx_out_file:
            df = pd.read_csv(seg_file, sep="\t", comment="@", header=0,
                             names=["chrom", "loc.start", "loc.end", "num.mark", "seg.mean"])
            df.insert(0, "ID", [dd.get_sample_name(paired.tumor_data)] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    std_cnr_file = os.path.join(work_dir, "%s.cnr" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_uptodate(std_cnr_file, cnr_file):
        with file_transaction(std_cnr_file) as tx_out_file:
            logdf = pd.read_csv(cnr_file, sep="\t", comment="@", header=0,
                                names=["chrom", "start", "end", "log2"])
            covdf = pd.read_csv(tz.get_in(["depth", "bins", "antitarget"], paired.tumor_data),
                                sep="\t", header=None,
                                names=["chrom", "start", "end", "orig.name", "depth", "gene"])
            df = pd.merge(logdf, covdf, on=["chrom", "start", "end"])
            del df["orig.name"]
            df = df[["chrom", "start", "end", "gene", "log2", "depth"]]
            df.insert(6, "weight", [1.0] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    return std_cnr_file, std_seg_file
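These examples all gate re-computation on utils.file_uptodate(out_file, in_file). As a rough sketch (an assumption about the helper's behaviour, not the verbatim bcbio implementation), the check reads as "the output exists, is non-empty, and is at least as new as the input it was derived from":

import os

def file_exists(fname):
    """Hypothetical stand-in for utils.file_exists: a non-empty file present on disk."""
    return bool(fname) and os.path.exists(fname) and os.path.getsize(fname) > 0

def file_uptodate(fname, cmp_fname):
    """Skip work when fname exists and is not older than the cmp_fname it was built from."""
    try:
        return (file_exists(fname) and file_exists(cmp_fname)
                and os.path.getmtime(fname) >= os.path.getmtime(cmp_fname))
    except OSError:
        return False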
Example #2
def _evaluate_multi(calls, truth_svtypes, work_dir, data):
    base = os.path.join(work_dir, "%s-sv-validate" % (dd.get_sample_name(data)))
    out_file = base + ".csv"
    df_file = base + "-df.csv"
    if any((not utils.file_uptodate(out_file, x["vrn_file"])
            or not utils.file_uptodate(df_file, x["vrn_file"])) for x in calls):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with open(df_file, "w") as df_out_handle:
                    writer = csv.writer(out_handle)
                    dfwriter = csv.writer(df_out_handle)
                    writer.writerow(["svtype", "size", "caller", "sensitivity", "precision"])
                    dfwriter.writerow(["svtype", "size", "caller", "metric", "value", "label"])
                    for svtype, truth in truth_svtypes.items():
                        for size in EVENT_SIZES:
                            str_size = "%s-%s" % size
                            for call in calls:
                                call_bed = convert.to_bed(call, dd.get_sample_name(data), work_dir, calls, data)
                                if utils.file_exists(call_bed):
                                    evalout = _evaluate_one(call["variantcaller"], svtype, size, call_bed,
                                                            truth, data)
                                    writer.writerow([svtype, str_size, call["variantcaller"],
                                                     evalout["sensitivity"]["label"], evalout["precision"]["label"]])
                                    for metric in ["sensitivity", "precision"]:
                                        dfwriter.writerow([svtype, str_size, call["variantcaller"], metric,
                                                           evalout[metric]["val"], evalout[metric]["label"]])
    return out_file, df_file
Example #3
def _evaluate_multi(callers, truth_svtypes, ensemble, call_beds, data):
    out_file = "%s-validate.csv" % utils.splitext_plus(ensemble)[0]
    df_file = "%s-validate-df.csv" % utils.splitext_plus(ensemble)[0]
    if not utils.file_uptodate(out_file, ensemble) or not utils.file_uptodate(df_file, ensemble):
        with open(out_file, "w") as out_handle:
            with open(df_file, "w") as df_out_handle:
                writer = csv.writer(out_handle)
                dfwriter = csv.writer(df_out_handle)
                total_callers = callers_by_event(ensemble, data)
                writer.writerow(["svtype", "size", "caller", "sensitivity", "precision"])
                dfwriter.writerow(["svtype", "size", "caller", "metric", "value", "label"])
                for svtype, truth in truth_svtypes.items():
                    for size in EVENT_SIZES:
                        str_size = "%s-%s" % size
                        for caller in (x for x in callers if x in total_callers[svtype] or x == "sv-ensemble"):
                            try:
                                call_bed = call_beds[caller]
                            except KeyError:
                                assert caller == "sv-ensemble", caller
                                call_bed = ensemble
                            evalout = _evaluate_one(caller, svtype, size, call_bed, truth, data)
                            writer.writerow([svtype, str_size, caller,
                                             evalout["sensitivity"]["label"], evalout["precision"]["label"]])
                            for metric in ["sensitivity", "precision"]:
                                dfwriter.writerow([svtype, str_size, caller, metric,
                                                   evalout[metric]["val"], evalout[metric]["label"]])
    return out_file, df_file
Example #4
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                depth_thresholds = sorted(cutoffs | (extra_cutoffs or set()))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed, depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
Example #5
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
Example #6
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing, providing the ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
          not utils.file_uptodate(alt_index_file, in_bam)):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(index_file) as tx_index_file:
            samtools_cmd = "{samtools} index {in_bam} {tx_index_file}"
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {in_bam} {tx_index_file}"
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam),
                       log_error=False)
            except subprocess.CalledProcessError:
                do.run(samtools_cmd.format(**locals()),
                       "Index BAM file (single core): %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file
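The examples also wrap their writes in file_transaction, which hands back a temporary path and only moves it to the final location when the block finishes cleanly, so an interrupted run never leaves a partial file that would wrongly pass the freshness check. A minimal sketch of that idiom, with tx_output chosen here as an illustrative name rather than taken from bcbio:

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def tx_output(out_file):
    """Yield a temporary path; promote it to out_file only if the with-block succeeds."""
    tmp_dir = tempfile.mkdtemp(dir=os.path.dirname(out_file) or ".")
    tx_file = os.path.join(tmp_dir, os.path.basename(out_file))
    try:
        yield tx_file
        if os.path.exists(tx_file):
            shutil.move(tx_file, out_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)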
Example #7
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    if config is None:
        config = {}
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if (not utils.file_exists(out_file) or not os.path.lexists(out_file)
          or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}"
                try:
                    do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: ignore errors where file has been deleted by another
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:  # Handle cases where run in parallel and file has been deleted
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file
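do.run in these snippets executes an external command with a short log description and raises when it fails; a minimal stand-in under that assumption (the real helper takes further optional arguments) could look like:

import subprocess

def run(cmd, descr):
    """Run a shell string or argv list, logging descr; raise CalledProcessError on failure."""
    print("Running: %s" % descr)
    subprocess.check_call(cmd, shell=isinstance(cmd, str))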
Example #8
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
Example #9
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing, providing the ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(cmd.format(**locals()), "Index BAM file with sambamba: %s" % os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()), "Backup single thread index of BAM file with samtools: %s"
                       % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
Example #10
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing, providing the ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
          not utils.file_uptodate(alt_index_file, in_bam)):
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file
Example #11
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Filter and annotate somatic VCFs with damage/bias artifacts on low frequency variants.

    Moves damage estimation to the INFO field instead of leaving it in FILTER.
    """
    if not should_filter(items) or not vcfutils.vcf_has_variants(vrn_file):
        return data
    else:
        raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
        out_plot_files = ["%s%s" % (utils.splitext_plus(raw_file)[0], ext)
                          for ext in ["_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf"]]
        if not utils.file_uptodate(raw_file, vrn_file) and not utils.file_uptodate(raw_file + ".gz", vrn_file):
            with file_transaction(items[0], raw_file) as tx_out_file:
                # Does not apply --qcSummary plotting due to slow runtimes
                cmd = ["dkfzbiasfilter.py", "--filterCycles", "1", "--passOnly",
                       "--tempFolder", os.path.dirname(tx_out_file),
                       vrn_file, align_bam, ref_file, tx_out_file]
                do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
                for out_plot in out_plot_files:
                    tx_plot_file = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0], "plots",
                                                os.path.basename(out_plot))
                    if utils.file_exists(tx_plot_file):
                        shutil.move(tx_plot_file, out_plot)
        raw_file = vcfutils.bgzip_and_index(raw_file, items[0]["config"])
        data["vrn_file"] = _filter_to_info(raw_file, items[0])
        out_plot_files = [x for x in out_plot_files if utils.file_exists(x)]
        data["damage_plots"] = out_plot_files
        return data
Example #12
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Example #13
def regions_coverage(data, bed_file, bam_file, target_name):
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, target_name + "_regions_depth.bed")
    if utils.file_uptodate(out_file, bam_file) and utils.file_uptodate(out_file, bed_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file) + " -o " + tx_out_file
        message = "Calculating regions coverage of {target_name} in {bam_file}"
        do.run(cmdl, message.format(**locals()))
    return out_file
Example #14
def _extract_germline(in_file, data):
    """Extract germline calls non-somatic, non-filtered calls.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            reader = cyvcf2.VCF(in_file)
            reader.add_filter_to_header({'ID': 'Somatic', 'Description': 'Variant called as Somatic'})
            with contextlib.closing(cyvcf2.Writer(tx_out_file, reader)) as writer:
                for rec in reader:
                    writer.write_record(_update_germline_filters(rec))
    return out_file
Example #15
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Create target and antitarget regions from target and access files.
    """
    target_bed = os.path.join(work_dir, "%s.target.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0])
    if not utils.file_uptodate(target_bed, raw_target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            do.run(cmd, "CNVkit target")
    antitarget_bed = os.path.join(work_dir, "%s.antitarget.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0])
    if not utils.file_uptodate(antitarget_bed, target_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            do.run(cmd, "CNVkit antitarget")
    return target_bed, antitarget_bed
Example #16
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    config = data["config"]
    if in_file:
        bedtools = config_utils.get_program("bedtools", config,
                                            default="bedtools")
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
Example #17
def slim_vcf(in_file, data):
    """Remove larger annotations which slow down VCF processing
    """
    to_remove = ["ANN", "LOF"]
    to_remove_str = tuple(["##INFO=<ID=%s" % x for x in to_remove])
    in_file = vcfutils.bgzip_and_index(in_file, data, remove_orig=False)
    out_file = "%s-slim.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        cur_remove = []
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith("#"):
                    break
                elif line.startswith(to_remove_str):
                    cur_id = line.split("ID=")[-1].split(",")[0]
                    cur_remove.append("INFO/%s" % cur_id)
        with file_transaction(data, out_file) as tx_out_file:
            if cur_remove:
                cur_remove = ",".join(cur_remove)
                cmd = ("bcftools view -f 'PASS,.' {in_file} | "
                       "bcftools annotate -x {cur_remove} -O z -o {tx_out_file}")
            else:
                cmd = ("bcftools view -f 'PASS,.' {in_file} -O z -o {tx_out_file}")
            do.run(cmd.format(**locals()), "Create slim VCF")
    return out_file
Example #18
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = [b for b in set(tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items)
                      if b is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Example #19
def _prep_bed(data, work_dir):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)

    return ready_file
Example #20
def _run_gemini_stats(bam_file, data, out_dir):
    """Retrieve high level variant statistics from Gemini.
    """
    out = {}
    gemini_db = (data.get("variants", [{}])[0].get("population", {}).get("db") 
                 if data.get("variants") else None)
    if gemini_db:
        gemini_stat_file = "%s-stats.yaml" % os.path.splitext(gemini_db)[0]
        if not utils.file_uptodate(gemini_stat_file, gemini_db):
            gemini = config_utils.get_program("gemini", data["config"])
            tstv = subprocess.check_output([gemini, "stats", "--tstv", gemini_db])
            gt_counts = subprocess.check_output([gemini, "stats", "--gts-by-sample", gemini_db])
            dbsnp_count = subprocess.check_output([gemini, "query", gemini_db, "-q",
                                                   "SELECT count(*) FROM variants WHERE in_dbsnp==1"])
            out["Transition/Transversion"] = tstv.split("\n")[1].split()[-1]
            for line in gt_counts.split("\n"):
                parts = line.rstrip().split()
                if len(parts) > 0 and parts[0] == data["name"][-1]:
                    _, hom_ref, het, hom_var, _, total = parts
                    out["Variations (total)"] = int(total)
                    out["Variations (heterozygous)"] = int(het)
                    out["Variations (homozygous)"] = int(hom_var)
                    break
            out["Variations (in dbSNP)"] = int(dbsnp_count.strip())
            if out.get("Variations (total)") > 0:
                out["Variations (in dbSNP) pct"] = "%.1f%%" % (out["Variations (in dbSNP)"] /
                                                               float(out["Variations (total)"]) * 100.0)
            with open(gemini_stat_file, "w") as out_handle:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
        else:
            with open(gemini_stat_file) as in_handle:
                out = yaml.safe_load(in_handle)
    return out
Example #21
def add_genome_context(orig_file, data):
    """Annotate a file with annotations of genome context using vcfanno.
    """
    out_file = "%s-context.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            config_file = "%s.toml" % (utils.splitext_plus(tx_out_file)[0])
            with open(config_file, "w") as out_handle:
                all_names = []
                for fname in dd.get_genome_context_files(data):
                    bt = pybedtools.BedTool(fname)
                    if bt.field_count() >= 4:
                        d, base = os.path.split(fname)
                        _, prefix = os.path.split(d)
                        name = "%s_%s" % (prefix, utils.splitext_plus(base)[0])
                        out_handle.write("[[annotation]]\n")
                        out_handle.write('file = "%s"\n' % fname)
                        out_handle.write("columns = [4]\n")
                        out_handle.write('names = ["%s"]\n' % name)
                        out_handle.write('ops = ["uniq"]\n')
                        all_names.append(name)
                out_handle.write("[[postannotation]]\n")
                out_handle.write("fields = [%s]\n" % (", ".join(['"%s"' % n for n in all_names])))
                out_handle.write('name = "genome_context"\n')
                out_handle.write('op = "concat"\n')
                out_handle.write('type = "String"\n')
            cmd = "vcfanno {config_file} {orig_file} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Annotate with problem annotations", data)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #22
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
Example #23
def _uniquify_bed_names(bed_file, out_dir, data):
    """Chanjo required unique names in the BED file to map to intervals.
    """
    out_file = os.path.join(out_dir, "%s-unames%s" % utils.splitext_plus(os.path.basename(bed_file)))
    if not utils.file_exists(out_file) or not utils.file_uptodate(out_file, bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(bed_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    namecounts = collections.defaultdict(int)
                    for i, line in enumerate(in_handle):
                        parts = line.rstrip("\r\n").split("\t")
                        if len(parts) >= 4:
                            name = parts[3]
                        else:
                            name = str(i)
                        namecount = namecounts.get(name, 0)
                        namecounts[name] += 1
                        if namecount > 0:
                            name = "%s-%s" % (name, namecount)
                        if len(parts) >= 4:
                            parts[3] = name
                        else:
                            assert len(parts) == 3
                            parts.append(name)
                        out_handle.write("\t".join(parts) + "\n")
    return out_file
Example #24
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts, filter_opts = _get_fgbio_options(data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
Example #25
def _remove_prioritization(in_file, data):
    """Remove tumor-only prioritization and return non-filtered calls.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            reader = cyvcf2.VCF(in_file)
            reader.add_filter_to_header({'ID': 'Somatic', 'Description': 'Variant called as Somatic'})
            # with contextlib.closing(cyvcf2.Writer(tx_out_file, reader)) as writer:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(reader.raw_header)
                for rec in reader:
                    rec = _update_prioritization_filters(rec)
                    out_handle.write(str(rec))
                    # writer.write_record(rec)
    return out_file
Example #26
def _filter_ensemble(in_bed, data):
    """Filter ensemble set of calls, requiring calls supported by 2 callers.

    We filter only smaller size events, which seem to benefit the most since
    they have lower precision. We also check to be sure that the required
    number of callers actually called in each event, since some callers don't handle
    all event types.
    """
    support_events = set(["BND", "UKN"])
    max_size = max([xs[1] for xs in validate.EVENT_SIZES[:2]])
    out_file = "%s-filter%s" % utils.splitext_plus(in_bed)

    if not utils.file_uptodate(out_file, in_bed):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with open(in_bed) as in_handle:
                    total_callers = validate.callers_by_event(in_bed, data)
                    for line in in_handle:
                        chrom, start, end, caller_strs = line.strip().split()[:4]
                        size = int(end) - int(start)
                        events = collections.defaultdict(set)
                        for event, caller in [x.split("_", 1) for x in caller_strs.split(",")]:
                            events[validate.cnv_to_event(event, data)].add(caller)
                        all_callers = set([])
                        for event, callers in events.items():
                            all_callers = all_callers.union(callers)
                            if event not in support_events:
                                if (len(all_callers) > 1 or size > max_size
                                      or len(total_callers[event]) <= N_FILTER_CALLERS):
                                    out_handle.write(line)
                                    break
    return out_file
Example #27
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
Example #28
def _prep_vrn_file(in_file, vcaller, seg_file, work_dir, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4,
              "max_freq": 0.6,
              "tumor_only": {"min_freq": 0.10, "max_freq": 0.90},
              "min_depth": 20,
              "hetblock": {"min_alleles": 25,
                           "allowed_misses": 2}}
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        #ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                bcf_in = pysam.VariantFile(sub_file)
                for rec in bcf_in:
                    tumor_freq = _is_possible_loh(rec, bcf_in, params, somatic_info)
                    if chromhacks.is_autosomal(rec.chrom) and tumor_freq is not None:
                        writer.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
    return out_file
Example #29
def prep_vrn_file(in_file, vcaller, work_dir, somatic_info, writer_class, seg_file=None, params=None):
    """Select heterozygous variants in the normal sample with sufficient depth.

    writer_class implements write_header and write_row to write VCF outputs
    from a record and extracted tumor/normal statistics.
    """
    data = somatic_info.tumor_data
    if not params:
        params = PARAMS
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        max_depth = max_normal_germline_depth(sub_file, params, somatic_info)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = writer_class(out_handle)
                writer.write_header()
                bcf_in = pysam.VariantFile(sub_file)
                for rec in bcf_in:
                    stats = _is_possible_loh(rec, bcf_in, params, somatic_info, max_normal_depth=max_depth)
                    if chromhacks.is_autosomal(rec.chrom) and stats is not None:
                        writer.write_row(rec, stats)
    return out_file
Example #30
def _count_files_to_amber(tumor_counts, normal_counts, work_dir, data):
    """Converts tumor and normal counts from GATK CollectAllelicCounts into Amber format.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(data))

    if not utils.file_uptodate(out_file, tumor_counts):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tumor_counts) as tumor_handle:
                with open(normal_counts) as normal_handle:
                    with open(tx_out_file, "w") as out_handle:
                        writer = csv.writer(out_handle, delimiter="\t")
                        writer.writerow(["Chromosome", "Position", "TumorBAF", "TumorModifiedBAF", "TumorDepth",
                                         "NormalBAF", "NormalModifiedBAF", "NormalDepth"])
                        header = None
                        for t, n in zip(tumor_handle, normal_handle):
                            if header is None and t.startswith("CONTIG"):
                                header = t.strip().split()
                            elif header is not None:
                                t_vals = dict(zip(header, t.strip().split()))
                                n_vals = dict(zip(header, n.strip().split()))
                                amber_line = _counts_to_amber(t_vals, n_vals)
                                if amber_line:
                                    writer.writerow(amber_line)
    return out_file
Example #31
def block_regions(callable_bed, in_bam, ref_file, data):
    """Find blocks of regions for analysis from mapped input BAM file.

    Identifies islands of callable regions, surrounded by regions
    with no read support, that can be analyzed independently.
    """
    min_n_size = int(data["config"]["algorithm"].get("nomap_split_size", 250))
    with shared.bedtools_tmpdir(data):
        nblock_bed = "%s-nblocks.bed" % utils.splitext_plus(callable_bed)[0]
        callblock_bed = "%s-callableblocks.bed" % utils.splitext_plus(callable_bed)[0]
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, data["config"])
            nblock_regions = _get_nblock_regions(callable_bed, min_n_size, ref_regions)
            nblock_regions = _add_config_regions(nblock_regions, ref_regions, data)
            with file_transaction(data, nblock_bed, callblock_bed) as (tx_nblock_bed, tx_callblock_bed):
                nblock_regions.filter(lambda r: len(r) > min_n_size).saveas(tx_nblock_bed)
                if len(ref_regions.subtract(nblock_regions, nonamecheck=True)) > 0:
                    ref_regions.subtract(tx_nblock_bed, nonamecheck=True).merge(d=min_n_size).saveas(tx_callblock_bed)
                else:
                    raise ValueError("No callable regions found from BAM file. Alignment regions might "
                                     "not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
    return callblock_bed, nblock_bed
Example #32
def _cnn_tranch_filtering(in_file, vrn_files, tensor_type, data):
    """Filter CNN scored VCFs in tranches using standard SNP and Indel truth sets.
    """
    out_file = "%s-filter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        if "train_hapmap" not in vrn_files:
            raise ValueError("CNN filtering requires HapMap training inputs: %s" % vrn_files)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "FilterVariantTranches", "--variant", in_file,
                      "--output", tx_out_file,
                      "--snp-truth-vcf", vrn_files["train_hapmap"],
                      "--indel-truth-vcf", vrn_files["train_indels"]]
            if tensor_type == "reference":
                params += ["--info-key", "CNN_1D", "--tranche", "99"]
            else:
                assert tensor_type == "read_tensor"
                params += ["--info-key", "CNN_2D", "--tranche", "99"]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #33
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, pct_coverage, work_dir, data):
    """Create target and antitarget regions from target and access files.
    """
    target_bed = os.path.join(work_dir, "%s.target.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0])
    if not utils.file_uptodate(target_bed, raw_target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            if cov_interval == "genome":
                cmd += ["--avg-size", "500"]
            # small target regions, use smaller, more defined segments
            elif pct_coverage < 1.0:
                cmd += ["--avg-size", "50"]
            do.run(cmd, "CNVkit target")
    antitarget_bed = os.path.join(work_dir, "%s.antitarget.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0])
    if not os.path.exists(antitarget_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            # small target regions, use smaller antitargets
            if pct_coverage < 1.0:
                cmd += ["--avg-size", "100000"]
            do.run(cmd, "CNVkit antitarget")
    return target_bed, antitarget_bed
Example #34
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    if dd.get_background_cnv_reference(data, "seq2c"):
        bed_file = _background_to_bed(
            dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError(
                "BED file for Seq2C must be annotated with gene names, "
                "however the input BED is 3-columns and we have no transcript "
                "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " +
                     ready_file)

    return ready_file
Example #35
def _run_svtyper(in_file, full_bam, sr_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable),
                                       "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(
                    tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data),
                                                   data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" %
                                         (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} -M -B {full_bam} -S {sr_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
Example #36
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n"
                    )
            else:
                cmd = [
                    _get_cmd(), "segment", "-p",
                    str(dd.get_cores(data)), "-o", tx_out_file, cnr_file
                ]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(
                        cnr_file) and cov_interval != "genome":
                    cmd += [
                        "--vcf", small_vrn_files[0].name, "--sample-id",
                        small_vrn_files[0].sample
                    ]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/chapmanb/bcbio-nextgen/issues/2171#issuecomment-348333650
                paired = vcfutils.get_paired(items)
                if paired:
                    cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = (
                    "%s && export TMPDIR=%s && " %
                    (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Beispiel #37
0
def _callable_from_gvcf(data, vrn_file, out_dir):
    """Retrieve callable regions based on ref call regions in gVCF.

    Uses https://github.com/lijiayong/gvcf_regions
    """
    methods = {
        "freebayes": "freebayes",
        "platypus": "platypus",
        "gatk-haplotype": "gatk"
    }
    gvcf_type = methods.get(dd.get_variantcaller(data))
    if gvcf_type:
        out_file = os.path.join(
            out_dir, "%s-gcvf-coverage.bed" %
            utils.splitext_plus(os.path.basename(vrn_file))[0])
        if not utils.file_uptodate(out_file, vrn_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = ("gvcf_regions.py --gvcf_type {gvcf_type} {vrn_file} "
                       "| bedtools merge > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Convert gVCF to BED file of callable regions")
        return out_file
Beispiel #38
0
def apply_bqsr(data):
    """Apply recalibration, producing a updated BAM file.
    """
    in_file = dd.get_align_bam(data)
    out_table_file = "%s-recal-table-post.txt" % utils.splitext_plus(
        in_file)[0]
    out_file = "%s-recal.bam" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file,
                              out_table_file) as (tx_out_file, tx_table_file):
            assoc_files = dd.get_variation_resources(data)
            known = "-k %s" % (
                assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            license = license_export(data)
            cores = dd.get_num_cores(data)
            ref_file = dd.get_ref_file(data)
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {in_file} --algo QualCal {known} {tx_table_file} "
                   "--algo ReadWriter {tx_out_file}")
            do.run(cmd.format(**locals()),
                   "Sentieon QualCal apply recalibration")
    return out_file
Beispiel #39
0
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = ("-Dsamjdk.use_async_io_read_samtools=true -Dsamjdk.use_async_io_write_samtools=true "
                       "-Dsamjdk.compression_level=0")
            group_opts, cons_opts = _get_fgbio_options(data)
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -s adjacency -i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} CallMolecularConsensusReads {cons_opts} "
                   "--output-per-base-tags=false --sort-order=unsorted "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
Beispiel #40
0
def _rtg_add_summary_file(eval_files, base_dir, data):
    """Parse output TP FP and FN files to generate metrics for plotting.
    """
    out_file = os.path.join(base_dir, "validate-summary.csv")
    if not utils.file_uptodate(out_file, eval_files.get("tp", eval_files["fp"])):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["sample", "caller", "vtype", "metric", "value"])
                base = _get_sample_and_caller(data)
                for metric in ["tp", "fp", "fn"]:
                    for vtype, bcftools_types in [("SNPs", "--types snps"),
                                                  ("Indels", "--exclude-types snps")]:
                        in_file = eval_files.get(metric)
                        if in_file and os.path.exists(in_file):
                            cmd = ("bcftools view {bcftools_types} {in_file} | grep -v ^# | wc -l")
                            count = int(subprocess.check_output(cmd.format(**locals()), shell=True))
                        else:
                            count = 0
                        writer.writerow(base + [vtype, metric, count])
    eval_files["summary"] = out_file
    return eval_files
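
# The function above counts records by piping `bcftools view` through `wc -l`.
# As an illustration of the same counting step, here is a minimal pure-Python
# sketch (an assumption for clarity, not part of the pipeline) that counts
# non-header records in a plain or gzipped VCF without splitting SNPs and indels.
import gzip

def count_vcf_records(vcf_path):
    """Count non-header lines in a (possibly gzipped) VCF file."""
    opener = gzip.open if vcf_path.endswith(".gz") else open
    with opener(vcf_path, "rt") as handle:
        return sum(1 for line in handle if not line.startswith("#"))
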
Beispiel #41
0
def block_regions(in_bam, ref_file, config):
    """Find blocks of regions for analysis from mapped input BAM file.

    Identifies islands of callable regions, surrounded by regions
    with no read support, that can be analyzed independently.
    """
    min_n_size = int(config["algorithm"].get("nomap_split_size", 100))
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(in_bam, ref_file, config)
        nblock_bed = "%s-nblocks%s" % os.path.splitext(callable_bed)
        callblock_bed = "%s-callableblocks%s" % os.path.splitext(callable_bed)
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, config)
            nblock_regions = _get_nblock_regions(callable_bed, min_n_size)
            nblock_regions = _add_config_regions(nblock_regions, ref_regions, config)
            nblock_regions.saveas(nblock_bed)
            if len(ref_regions.subtract(nblock_regions)) > 0:
                ref_regions.subtract(nblock_bed).merge(d=min_n_size).saveas(callblock_bed)
            else:
                raise ValueError("No callable regions found from BAM file. Alignment regions might "
                                 "not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
    return callblock_bed, nblock_bed, callable_bed
Beispiel #42
0
def _prioritize_plot_regions(region_bt, data, out_dir=None):
    """Avoid plotting large numbers of regions due to speed issues. Prioritize most interesting.

    XXX For now, just removes larger regions and avoids plotting thousands of regions.
    Longer term we'll insert biology-based prioritization.
    """
    max_plots = 1000
    max_size = 100 * 1000  # 100kb
    out_file = "%s-priority%s" % utils.splitext_plus(region_bt.fn)
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    num_plots = 0
    if not utils.file_uptodate(out_file, region_bt.fn):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for r in region_bt:
                    if r.stop - r.start < max_size:
                        if num_plots < max_plots:
                            num_plots += 1
                            out_handle.write("%s\t%s\t%s\n" %
                                             (r.chrom, r.start, r.stop))
    return out_file
Beispiel #43
0
def calculate(bam_file, data, sv_bed):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Backwards compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        to_calculate = [("variant_regions", variant_regions,
                         vr_quantize, None, "coverage_perbase" in dd.get_tools_on(data)),
                        ("sv_regions", bedutils.clean_file(sv_bed, data, prefix="svregions-"),
                         None, None, False),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-"),
                         None, DEPTH_THRESHOLDS, False)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds, per_base in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds,
                                          per_base=per_base)
                for attr in ("dist", "regions", "thresholds", "per_base"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
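
# A minimal sketch of how the quantize tuple built above could map onto a
# mosdepth invocation. This is an assumption for illustration only: run_mosdepth
# is not shown in this excerpt, the file names here are hypothetical, and the
# documented mosdepth `--quantize` option plus MOSDEPTH_Q* label environment
# variables are used to name the NO_COVERAGE/LOW_COVERAGE/CALLABLE bins.
import os
import subprocess

def run_quantized_mosdepth(prefix, bam_file, min_depth=4):
    """Run mosdepth with quantized no/low/callable coverage bins."""
    labels = {"MOSDEPTH_Q0": "NO_COVERAGE",
              "MOSDEPTH_Q1": "LOW_COVERAGE",
              "MOSDEPTH_Q2": "CALLABLE"}
    quantize = "0:1:%s:" % min_depth
    cmd = ["mosdepth", "--no-per-base", "--quantize", quantize, prefix, bam_file]
    subprocess.check_call(cmd, env=dict(os.environ, **labels))
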
Beispiel #44
0
def pon_to_bed(pon_file, out_dir, data):
    """Extract BED intervals from a GATK4 hdf5 panel of normal file.
    """
    out_file = os.path.join(
        out_dir, "%s-intervals.bed" %
        (utils.splitext_plus(os.path.basename(pon_file))[0]))
    if not utils.file_uptodate(out_file, pon_file):
        import h5py
        with file_transaction(data, out_file) as tx_out_file:
            with h5py.File(pon_file, "r") as f:
                with open(tx_out_file, "w") as out_handle:
                    intervals = f["original_data"]["intervals"]
                    for i in range(
                            len(intervals["transposed_index_start_end"][0])):
                        chrom = intervals["indexed_contig_names"][
                            intervals["transposed_index_start_end"][0][i]]
                        start = int(
                            intervals["transposed_index_start_end"][1][i]) - 1
                        end = int(
                            intervals["transposed_index_start_end"][2][i])
                        out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    return out_file
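
# pon_to_bed assumes a specific HDF5 layout ("original_data" -> "intervals").
# When debugging a panel-of-normals file whose layout differs, h5py's visit()
# prints every group and dataset path; this small helper is an illustration
# only and is not used by the pipeline.
import h5py

def print_hdf5_layout(path):
    """Print every group and dataset path inside an HDF5 file."""
    with h5py.File(path, "r") as f:
        f.visit(print)
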
Beispiel #45
0
def to_bed(call, sample, work_dir, calls, data):
    """Create a simplified BED file from caller specific input.
    """
    out_file = os.path.join(work_dir,
                            "%s-%s-flat.bed" % (sample, call["variantcaller"]))
    if call.get("vrn_file") and not utils.file_uptodate(
            out_file, call["vrn_file"]):
        with file_transaction(data, out_file) as tx_out_file:
            convert_fn = CALLER_TO_BED.get(call["variantcaller"])
            if convert_fn:
                vrn_file = call["vrn_file"]
                if call["variantcaller"] in SUBSET_BY_SUPPORT:
                    ecalls = [
                        x for x in calls if x["variantcaller"] in
                        SUBSET_BY_SUPPORT[call["variantcaller"]]
                    ]
                    if len(ecalls) > 0:
                        vrn_file = _subset_by_support(call["vrn_file"], ecalls,
                                                      data)
                convert_fn(vrn_file, call["variantcaller"], tx_out_file)
    if utils.file_exists(out_file):
        return out_file
Beispiel #46
0
def finalize_vcf(in_file, variantcaller, items):
    """Perform cleanup and dbSNP annotation of the final VCF.

    - Adds contigs to header for bcftools compatibility
    - Adds sample information for tumor/normal
    """
    out_file = "%s-annotated%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        header_cl = _add_vcf_header_sample_cl(in_file, items, out_file)
        contig_cl = _add_contig_cl(in_file, items, out_file)
        cls = [x for x in (contig_cl, header_cl) if x]
        if cls:
            post_cl = " | ".join(cls) + " | "
        else:
            post_cl = None
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), items[0])
        if dbsnp_file:
            out_file = _add_dbsnp(in_file, dbsnp_file, items[0], out_file, post_cl)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, items[0]["config"])
    else:
        return in_file
Beispiel #47
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean",
                     dd.get_sample_name(data)))
    out_file = os.path.join(
        work_dir,
        "%s-noextras.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file),
                                     os.path.basename(in_bam))
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = (
                "samtools view -h {local_bam} {str_chroms} | "
                """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                """cleanbam.fix_header("{ref_file}")' | """
                "samtools view -u - | "
                "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - "
            )
            do.run(
                cmd.format(**locals()),
                "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Beispiel #48
0
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.

    Normalizes qualities to 4 bin outputs based on pipeline standard
    recommendations, which will help with output file sizes:
    https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md#base-quality-score-binning-scheme

    Spark host and timeout settings help deal with runs on restricted systems
    where we encounter network and timeout errors.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark", "--spark-master", "local[%s]" % cores,
                          "--input", in_file, "--output", tx_out_file, "--bqsr-recal-file", data["prep_recal"],
                          "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                          "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800",
                          "--quantize-quals", "4"]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            # Avoid problems with intel deflater for GATK 3.8 and GATK4
            # https://github.com/chapmanb/bcbio-nextgen/issues/2145#issuecomment-343095357
            if gatk_type == "gatk4":
                params += ["--jdk-deflater", "--jdk-inflater"]
            elif LooseVersion(broad_runner.gatk_major_version()) > LooseVersion("3.7"):
                params += ["-jdk_deflater", "-jdk_inflater"]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=True)
    bam.index(out_file, data["config"])
    return out_file
Beispiel #49
0
def _filter_by_normal(tumor_counts, normal_counts, data):
    """Filter count files based on normal frequency and median depth, avoiding high depth regions.

    For frequency, restricts normal positions to those between 0.4 and 0.65.

    For depth, matches the approach used in AMBER to try to avoid problematic genomic
    regions with high counts in the normal:
    https://github.com/hartwigmedical/hmftools/tree/master/amber#usage
    """
    from bcbio.heterogeneity import bubbletree
    fparams = bubbletree.NORMAL_FILTER_PARAMS
    tumor_out = "%s-normfilter%s" % utils.splitext_plus(tumor_counts)
    normal_out = "%s-normfilter%s" % utils.splitext_plus(normal_counts)
    if not utils.file_uptodate(tumor_out, tumor_counts):
        with file_transaction(data, tumor_out,
                              normal_out) as (tx_tumor_out, tx_normal_out):
            median_depth = _get_normal_median_depth(normal_counts)
            min_normal_depth = median_depth * fparams["min_depth_percent"]
            max_normal_depth = median_depth * fparams["max_depth_percent"]
            with open(tumor_counts) as tumor_handle:
                with open(normal_counts) as normal_handle:
                    with open(tx_tumor_out, "w") as tumor_out_handle:
                        with open(tx_normal_out, "w") as normal_out_handle:
                            header = None
                            for t, n in zip(tumor_handle, normal_handle):
                                if header is None:
                                    if not n.startswith("@"):
                                        header = n.strip().split()
                                    tumor_out_handle.write(t)
                                    normal_out_handle.write(n)
                                elif (_normal_passes_depth(
                                        header, n, min_normal_depth,
                                        max_normal_depth)
                                      and _normal_passes_freq(
                                          header, n, fparams)):
                                    tumor_out_handle.write(t)
                                    normal_out_handle.write(n)
    return tumor_out, normal_out
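
# A minimal sketch of the frequency gate described in the docstring above,
# assuming a whitespace-delimited counts file. The "refCount"/"altCount" column
# names are hypothetical stand-ins; the real _normal_passes_freq helper and its
# exact column names are not shown in this excerpt.
def normal_passes_freq(header, line, min_freq=0.4, max_freq=0.65):
    """Check whether a normal counts row falls inside the allowed frequency window."""
    vals = dict(zip(header, line.strip().split()))
    ref = float(vals["refCount"])
    alt = float(vals["altCount"])
    total = ref + alt
    return total > 0 and min_freq <= alt / total <= max_freq
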
Beispiel #50
0
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = [x for x in set(tz.get_in(["config", "algorithm", "highdepth_regions"], d)
                                     for d in items)
                      if x is not None]
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() +
                                             "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Beispiel #51
0
def _convert_to_csv(vcf_file, good_bed, bad_bed):
    """Convert WHAM output file into BED format for graphical exploration.
    """
    attrs = ["PU", "LRT", "SI", "MQ"]
    buffer_size = 25  # bp around break ends
    out_file = "%s-metrics.csv" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        lrts = []
        good = _read_bed(good_bed)
        bad = _read_bed(bad_bed)
        with open(out_file, "w") as out_handle:
            reader = vcf.Reader(filename=vcf_file)
            writer = csv.writer(out_handle)
            header = ["chrom", "start", "end", "class", "attr", "val"]
            writer.writerow(header)
            for rec in reader:
                start = max(rec.start - buffer_size, 0)
                if rec.INFO["BE"][0] not in [".", None]:
                    other_chrom, end, count = rec.INFO["BE"]
                    if int(end) > start and other_chrom == rec.CHROM:
                        end = int(end) + buffer_size
                        if (rec.CHROM, start, end) in good:
                            cur_class = "good"
                        elif (rec.CHROM, start, end) in bad:
                            cur_class = "bad"
                        else:
                            cur_class = None
                        if cur_class:
                            lrts.append(rec.INFO["LRT"])
                            for attr in attrs:
                                writer.writerow([
                                    rec.CHROM, start, end, cur_class, attr,
                                    rec.INFO[attr]
                                ])
        import numpy as np
        print(np.mean(lrts), np.median(lrts), np.percentile(lrts, 25),
              max(lrts), min(lrts))
    return out_file
Beispiel #52
0
def sort_merge(in_file, data, out_dir=None):
    """Sort and merge a BED file, collapsing gene names.
       Output is a 3 or 4 column file (values in the 4th column are comma-separated).
    """
    out_file = "%s-sortmerge.bed" % os.path.splitext(in_file)[0]
    bedtools = config_utils.get_program("bedtools", data, default="bedtools")
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        column_opt = ""
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith(("#", "track", "browser", "@")):
                    parts = line.split()
                    if len(parts) >= 4:
                        column_opt = "-c 4 -o distinct"
        with file_transaction(data, out_file) as tx_out_file:
            cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
            sort_cmd = get_sort_cmd(os.path.dirname(tx_out_file))
            cmd = ("{cat_cmd} {in_file} | {sort_cmd} -k1,1 -k2,2n | "
                   "{bedtools} merge -i - {column_opt} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sort and merge BED file", data)
    return out_file
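
# sort_merge shells out to `sort` and `bedtools merge`. For small files the
# same collapse-gene-names behaviour can be sketched with pybedtools, which is
# already used elsewhere in these examples; the input path and helper name here
# are hypothetical.
import pybedtools

def sort_merge_with_pybedtools(bed_path, has_name_column=True):
    """Sort a BED file and merge overlapping intervals, collapsing name values."""
    bt = pybedtools.BedTool(bed_path).sort()
    if has_name_column:
        # -c 4 -o distinct: collapse the name column into comma-separated values
        return bt.merge(c=4, o="distinct")
    return bt.merge()
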
Beispiel #53
0
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark", "--sparkMaster", "local[%s]" % cores,
                          "--input", in_file, "--output", tx_out_file, "--bqsr_recal_file", data["prep_recal"],
                          "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(cores > 1 and gatk_type == "gatk4"))
    bam.index(out_file, data["config"])
    return out_file
Beispiel #54
0
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts = _get_fgbio_options(data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=unsorted "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
Beispiel #55
0
def _remove_overlaps(in_file, out_dir, data):
    """Remove regions that overlap with next region, these result in issues with PureCN.
    """
    out_file = os.path.join(
        out_dir,
        "%s-nooverlaps%s" % utils.splitext_plus(os.path.basename(in_file)))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    prev_line = None
                    for line in in_handle:
                        if prev_line:
                            pchrom, pstart, pend = prev_line.split("\t", 4)[:3]
                            cchrom, cstart, cend = line.split("\t", 4)[:3]
                            # Skip if chromosomes match and end overlaps start
                            if pchrom == cchrom and int(pend) > int(cstart):
                                pass
                            else:
                                out_handle.write(prev_line)
                        prev_line = line
                    out_handle.write(prev_line)
    return out_file
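
# The same "drop a region if it runs into the next one" logic as above,
# illustrated on in-memory (chrom, start, end) tuples rather than a BED file.
# This is a standalone sketch for clarity, not a pipeline function.
def drop_regions_overlapping_next(regions):
    kept = []
    for i, (chrom, start, end) in enumerate(regions):
        nxt = regions[i + 1] if i + 1 < len(regions) else None
        if nxt and nxt[0] == chrom and end > nxt[1]:
            continue  # current region overlaps the start of the next one
        kept.append((chrom, start, end))
    return kept

# drop_regions_overlapping_next([("chr1", 1, 10), ("chr1", 5, 20), ("chr1", 30, 40)])
# -> [("chr1", 5, 20), ("chr1", 30, 40)]
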
Beispiel #56
0
def coverage_region_detailed_stats(bed_file,
                                   data,
                                   out_dir,
                                   extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoffs
    for regions specified in the coverage option.
    """
    if not bed_file or not utils.file_exists(bed_file):
        return []
    else:
        cov_file, dist_file = _run_mosdepth(bed_file, data)
        out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
        out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
        if not utils.file_uptodate(out_cov_file, cov_file):
            utils.copy_plus(cov_file, out_cov_file)
            utils.copy_plus(dist_file, out_dist_file)
        cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
        if extra_cutoffs:
            cutoffs = sorted(list(cutoffs | extra_cutoffs))
        out_files = _calculate_percentiles(out_cov_file, out_dist_file,
                                           cutoffs, out_dir, data)
        return [os.path.abspath(x) for x in out_files]
Beispiel #57
0
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.
    """
    in_file = cns_file
    out_file = os.path.join(
        work_dir, "%s-%s-prep.csv" %
        (utils.splitext_plus(os.path.basename(in_file))[0], svcaller))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    reader = csv.reader(in_handle, dialect="excel-tab")
                    writer = csv.writer(out_handle)
                    writer.writerow(
                        ["chrom", "start", "end", "num.mark", "seg.mean"])
                    next(reader)  # skip the header row
                    for chrom, start, end, _, log2, probes in (
                            xs[:6] for xs in reader):
                        if chromhacks.is_autosomal(chrom):
                            writer.writerow([
                                _to_ucsc_style(chrom), start, end, probes, log2
                            ])
    return out_file
Beispiel #58
0
def _filter_ensemble(in_bed, data):
    """Filter ensemble set of calls, requiring calls supported by 2 callers.

    We filter only smaller events, which seem to benefit the most since
    they have lower precision. We also check that the required number of
    callers actually make calls for each event type, since some callers don't
    handle all event types.
    """
    support_events = set(["BND", "UKN"])
    max_size = max([xs[1] for xs in validate.EVENT_SIZES[:2]])
    out_file = "%s-filter%s" % utils.splitext_plus(in_bed)

    if not utils.file_uptodate(out_file, in_bed):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with open(in_bed) as in_handle:
                    total_callers = validate.callers_by_event(in_bed, data)
                    for line in in_handle:
                        chrom, start, end, caller_strs = line.strip().split(
                        )[:4]
                        size = int(end) - int(start)
                        events = collections.defaultdict(set)
                        for event, caller in [
                                x.split("_", 1) for x in caller_strs.split(",")
                        ]:
                            events[validate.cnv_to_event(event,
                                                         data)].add(caller)
                        all_callers = set([])
                        for event, callers in events.items():
                            all_callers = all_callers.union(callers)
                            if event not in support_events:
                                if (len(all_callers) > 1 or size > max_size
                                        or len(total_callers[event]) <=
                                        N_FILTER_CALLERS):
                                    out_handle.write(line)
                                    break
    return out_file
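
# The fourth BED column parsed above holds entries like "DEL_lumpy,DEL_manta,BND_delly".
# This small standalone illustration shows how such a string expands into the
# per-event caller sets used for filtering (before the CNV labels are mapped
# through validate.cnv_to_event); the example string is made up.
import collections

caller_strs = "DEL_lumpy,DEL_manta,BND_delly"
events = collections.defaultdict(set)
for event, caller in (x.split("_", 1) for x in caller_strs.split(",")):
    events[event].add(caller)
# events -> {"DEL": {"lumpy", "manta"}, "BND": {"delly"}}
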
Beispiel #59
0
def _collapse_transcripts(in_file, window, data, out_dir):
    """Collapse transcripts into min/max coordinates and optionally add windows.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    out_file = os.path.join(
        out_dir, "%s-transcripts_w%s.bed" %
        (os.path.splitext(os.path.basename(in_file))[0], window))
    chrom_sizes = {}
    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
        chrom_sizes[contig.name] = contig.size
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_cmd = bedutils.get_sort_cmd()
            cmd = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(cmd.format(**locals()), "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                x = pybedtools.BedTool(prep_file)

                def gen():
                    for r in x:
                        yield r

                for name, rs in itertools.groupby(gen(), lambda r:
                                                  (r.name, r.chrom)):
                    rs = list(rs)
                    r = rs[0]
                    for gcoords in _group_coords(rs):
                        min_pos = max(min(gcoords) - window, 0)
                        max_pos = min(
                            max(gcoords) + window, chrom_sizes[r.chrom])
                        out_handle.write("%s\t%s\t%s\t%s\n" %
                                         (r.chrom, min_pos, max_pos, r.name))
    return bedutils.sort_merge(out_file, data)
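
# _collapse_transcripts relies on itertools.groupby, which only groups adjacent
# records, hence the sort by transcript name first. A tiny standalone
# illustration of the collapse-to-min/max step on made-up tuples:
import itertools

rows = [("tx1", "chr1", 100, 200), ("tx1", "chr1", 150, 300),
        ("tx2", "chr1", 500, 600)]
for (name, chrom), grp in itertools.groupby(rows, key=lambda r: (r[0], r[1])):
    grp = list(grp)
    print(chrom, min(r[2] for r in grp), max(r[3] for r in grp), name)
# chr1 100 300 tx1
# chr1 500 600 tx2
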
Beispiel #60
0
def _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            caller = _get_caller(data)
            sample = dd.get_sample_name(data)
            combo_intervals = os.path.join(base_dir, "%s-%s-%s-wrm.bed" %
                                           (utils.splitext_plus(os.path.basename(final_intervals))[0],
                                            sample, caller))
            if not utils.file_uptodate(combo_intervals, final_intervals):
                with file_transaction(data, combo_intervals) as tx_out_file:
                    with utils.chdir(os.path.dirname(tx_out_file)):
                        # Copy files locally to avoid issues on shared filesystems
                        # where BEDtools has trouble accessing the same base
                        # files from multiple locations
                        a = os.path.basename(final_intervals)
                        b = os.path.basename(rm_interval_file)
                        try:
                            shutil.copyfile(final_intervals, a)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(final_intervals, a)
                        try:
                            shutil.copyfile(rm_interval_file, b)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(rm_interval_file, b)
                        cmd = ("bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}")
                        do.run(cmd.format(**locals()), "Intersect callable intervals for rtg vcfeval")
            final_intervals = combo_intervals
    else:
        assert rm_interval_file, "No intervals to subset analysis with for %s" % vrn_file
        final_intervals = shared.remove_lcr_regions(rm_interval_file, [data])
    return final_intervals