Beispiel #1
0
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    Each entry in `calls` is converted to a BED file with `_create_bed`. The
    resulting files are split by the size ranges in `validate.EVENT_SIZES`,
    sorted and merged within each range, and combined into one ensemble BED.

    Args:
        calls: list of per-caller dictionaries. An entry with
            `variantcaller` set to "ensemble" is appended in place when the
            combined file exists.
        data: sample dictionary; read for `rgnames -> sample`,
            `dirs -> work` and `config`.

    Returns:
        The `calls` list, extended with the ensemble entry when available.
    """
    import pybedtools
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    with shared.bedtools_tmpdir(data):
        # Build a real list rather than using `filter`: on Python 3 `filter`
        # returns a lazy iterator, which has no `len()` and can only be
        # consumed once (it is re-read for every size range below).
        candidate_beds = [_create_bed(c, out_file, data) for c in calls]
        input_beds = [bed for bed in candidate_beds if bed is not None]
    if len(input_beds) > 0:
        size_beds = []
        for e_start, e_end in validate.EVENT_SIZES:
            base, ext = os.path.splitext(out_file)
            size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
            if not utils.file_exists(size_out_file):
                with file_transaction(data, size_out_file) as tx_out_file:
                    with shared.bedtools_tmpdir(data):
                        all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                        # Collect every input line whose event size falls in
                        # the half-open range [e_start, e_end).
                        with open(all_file, "w") as out_handle:
                            for line in fileinput.input(input_beds):
                                chrom, start, end = line.split()[:3]
                                size = int(end) - int(start)
                                if size >= e_start and size < e_end:
                                    out_handle.write(line)
                        pybedtools.BedTool(all_file).sort(stream=True)\
                          .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
            size_beds.append(size_out_file)
        out_file = bedutils.combine(size_beds, out_file, data["config"])
    if utils.file_exists(out_file):
        bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)})
    return calls
Beispiel #2
0
def get_split_regions(bed_file, data):
    """Return split regions computed from a BED file of callable regions.

    Writes (or reuses, when up to date) a `-analysis_blocks.bed` file next to
    `bed_file` and returns its regions as `(chrom, start, stop)` tuples,
    restricted to chromosomes that appear in `bed_file`.
    """
    out_file = "%s-analysis_blocks.bed" % utils.splitext_plus(bed_file)[0]
    with shared.bedtools_tmpdir(data):
        if not utils.file_uptodate(out_file, bed_file):
            ref_regions = get_ref_bedtool(dd.get_ref_file(data), data["config"])
            # Reference regions not covered by the input BED
            uncovered = ref_regions.subtract(pybedtools.BedTool(bed_file)).saveas()
            min_n_size = int(tz.get_in(["config", "algorithm", "nomap_split_size"], data, 250))
            picker = NBlockRegionPicker(ref_regions, data["config"], min_n_size)
            kept_blocks = uncovered.filter(picker.include_block).saveas()
            final_nblock_regions = kept_blocks.each(picker.expand_block).saveas()
            with file_transaction(data, out_file) as tx_out_file:
                analysis_regions = ref_regions.subtract(final_nblock_regions, nonamecheck=True).saveas()
                analysis_regions.merge(d=min_n_size).saveas(tx_out_file)
        with shared.bedtools_tmpdir(data):
            chroms = set(feat.chrom for feat in pybedtools.BedTool(bed_file))
        return [(feat.chrom, feat.start, feat.stop)
                for feat in pybedtools.BedTool(out_file)
                if feat.chrom in chroms]
Beispiel #3
0
def get_split_regions(bed_file, data):
    """Build analysis blocks from a callable-regions BED and list them.

    The blocks are stored in a `-analysis_blocks.bed` file alongside
    `bed_file`; it is only regenerated when not up to date. The return value
    is a list of `(chrom, start, stop)` tuples for blocks on chromosomes
    present in `bed_file`.
    """
    blocks_file = "%s-analysis_blocks.bed" % utils.splitext_plus(bed_file)[0]
    with shared.bedtools_tmpdir(data):
        if not utils.file_uptodate(blocks_file, bed_file):
            config = data["config"]
            ref_regions = get_ref_bedtool(dd.get_ref_file(data), config)
            input_regions = pybedtools.BedTool(bed_file)
            nblock_regions = ref_regions.subtract(input_regions).saveas()
            min_n_size = int(tz.get_in(["config", "algorithm", "nomap_split_size"], data, 250))
            block_filter = NBlockRegionPicker(ref_regions, config, min_n_size)
            included = nblock_regions.filter(block_filter.include_block).saveas()
            expanded = included.each(block_filter.expand_block).saveas()
            with file_transaction(data, blocks_file) as tx_out_file:
                remaining = ref_regions.subtract(expanded, nonamecheck=True).saveas()
                remaining.merge(d=min_n_size).saveas(tx_out_file)
        with shared.bedtools_tmpdir(data):
            wanted_chroms = set()
            for region in pybedtools.BedTool(bed_file):
                wanted_chroms.add(region.chrom)
        split_regions = []
        for region in pybedtools.BedTool(blocks_file):
            if region.chrom not in wanted_chroms:
                continue
            split_regions.append((region.chrom, region.start, region.stop))
        return split_regions
def block_regions(in_bam, ref_file, data):
    """Compute callable and non-callable block BED files for an input BAM.

    Returns a tuple `(callblock_bed, nblock_bed, callable_bed)`. The block
    files are only regenerated when the nblock file is not up to date with
    the callable BED.

    Raises:
        ValueError: if subtracting the non-callable blocks from the reference
            regions leaves nothing.
    """
    config = data["config"]
    min_n_size = int(config["algorithm"].get("nomap_split_size", 250))

    def _exceeds_min_size(region):
        return len(region) > min_n_size

    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(in_bam, ref_file, data)
        bed_base, bed_ext = os.path.splitext(callable_bed)
        nblock_bed = "%s-nblocks%s" % (bed_base, bed_ext)
        callblock_bed = "%s-callableblocks%s" % (bed_base, bed_ext)
        if not utils.file_uptodate(nblock_bed, callable_bed):
            ref_regions = get_ref_bedtool(ref_file, config)
            nblock_regions = _get_nblock_regions(callable_bed, min_n_size, ref_regions)
            nblock_regions = _add_config_regions(nblock_regions, ref_regions, config)
            nblock_regions.filter(_exceeds_min_size).saveas(nblock_bed)
            leftover = ref_regions.subtract(nblock_regions, nonamecheck=True)
            if len(leftover) == 0:
                raise ValueError(
                    "No callable regions found from BAM file. Alignment regions might "
                    "not overlap with regions found in your `variant_regions` BED: %s" % in_bam
                )
            callable_blocks = ref_regions.subtract(nblock_bed, nonamecheck=True)
            callable_blocks.merge(d=min_n_size).saveas(callblock_bed)
    return callblock_bed, nblock_bed, callable_bed
Beispiel #5
0
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
      - Combine all calls found in all files
      - Filter files retaining those present with multiple levels of support.
      - Remove calls in high depth regions.
      - Remove calls with ends overlapping exclusion regions like low complexity regions.

    Args:
        calls: list of per-caller dictionaries; each must provide
            `variantcaller` and may provide `exclude_file`. An "sv-ensemble"
            entry is appended in place when a final file exists.
        data: sample dictionary; read for `rgnames -> sample` and
            `dirs -> work`.
        items: passed through to `shared.remove_highdepth_regions`.

    Returns:
        The `calls` list, extended with the ensemble entry when available.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        # Materialize as a list: a Python 3 `filter` object has no `len()`
        # and would be exhausted before being stored in the output below.
        caller_beds = [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data)) for c in calls]
        input_beds = [xs for xs in caller_beds
                      if xs[1] is not None and utils.file_exists(xs[1])]
    if len(input_beds) > 0:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            # Only apply the ensemble filter when enough callers contributed
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            # Use the first exclusion file supplied by any caller, if present
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data, bedprep_dir=bedprep_dir)})
    return calls
Beispiel #6
0
def combine_sample_regions(*samples):
    """Attach batch-level callable region files to every sample.

    For each batch, an analysis file and a no-analysis file are determined
    (a pre-existing global pair when usable, otherwise the result of
    `_combine_sample_regions_batch`) and recorded in each sample's
    `config -> algorithm` settings together with a region count.

    Returns:
        A list with one single-item list per sample.
    """
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    use_global = (utils.file_exists(global_analysis_file)
                  and not _needs_region_update(global_analysis_file, samples))
    if use_global:
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples).items():
            if global_analysis_file:
                analysis_file = global_analysis_file
                no_analysis_file = global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = tz.get_in(["config", "algorithm", "variant_regions"], data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = analysis_file
                    algorithm["non_callable_regions"] = no_analysis_file
                    algorithm["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    # No analysis blocks: count the variant regions instead
                    region_count = pybedtools.BedTool(vr_file).count()
                    data["config"]["algorithm"]["callable_count"] = region_count
                out.append([data])
        assert len(out) == len(samples)
        if analysis_files:
            _analysis_block_stats(pybedtools.BedTool(analysis_files[0]))
    return out
Beispiel #7
0
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    For each batch, the analysis and no-analysis files (a pre-existing global
    pair when usable, otherwise the result of `_combine_sample_regions_batch`)
    are recorded in each sample's `config -> algorithm` settings.

    Returns:
        A list with one single-item list per sample.
    """
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples).items():
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                out.append([data])
        assert len(out) == len(samples)
        # Batches may produce no analysis file at all; skip the statistics in
        # that case rather than failing with an IndexError.
        if len(analysis_files) > 0:
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions)
    return out
Beispiel #8
0
def combine_bed_by_size(input_beds, sample, work_dir, data, delim=","):
    """Merge several BED files into one, processing each event size range separately.

    For every range in `validate.EVENT_SIZES`, lines whose size falls in the
    range (or whose event type is "BND") are gathered, sorted and merged, then
    annotated with `annotate.add_genes`. The per-range files are combined into
    `<sample>-ensemble.bed` inside `work_dir`.

    Returns:
        Path of the ensemble BED file (it may not exist if nothing was written).
    """
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if len(input_beds) == 0:
        return out_file
    out_base, out_ext = os.path.splitext(out_file)
    size_beds = []
    for e_start, e_end in validate.EVENT_SIZES:
        size_out_file = "%s-%s_%s%s" % (out_base, e_start, e_end, out_ext)
        if not utils.file_exists(size_out_file):
            with file_transaction(data, size_out_file) as tx_out_file:
                with shared.bedtools_tmpdir(data):
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    has_regions = False
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            _chrom, start, end, event_str = line.split()[:4]
                            event = event_str.split("_", 1)[0]
                            size = int(end) - int(start)
                            in_size_range = size >= e_start and size < e_end
                            if in_size_range or event == "BND":
                                out_handle.write(line)
                                has_regions = True
                    if has_regions:
                        sorted_regions = pybedtools.BedTool(all_file).sort(stream=True)
                        sorted_regions.merge(c=4, o="distinct", delim=delim).saveas(tx_out_file)
        if utils.file_exists(size_out_file):
            size_beds.append(annotate.add_genes(size_out_file, data))
    if size_beds:
        out_file = bedutils.combine(size_beds, out_file, data)
    return out_file
Beispiel #9
0
def sample_callable_bed(bam_file, ref_file, data):
    """Build the per-sample callable BED, limited to configured variant regions.

    Regions named "CALLABLE" are kept from the coverage-derived callable BED
    (optionally restricted to non-alt chromosomes) and intersected with the
    variant regions when those are defined.

    Returns:
        A `CovInfo` namedtuple of `(callable, raw_callable, depth_files)`.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo",
                                     "callable, raw_callable, depth_files")
    noalt_calling = ("noalt_calling" in dd.get_tools_on(data)
                     or "altcontigs" in dd.get_exclude_regions(data))

    def callable_chrom_filter(r):
        """Keep callable regions, optionally only those on non-alt chromosomes.
        """
        if r.name != "CALLABLE":
            return False
        return not noalt_calling or chromhacks.is_nonalt(r.chrom)

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                filter_regions = pybedtools.BedTool(callable_bed).filter(callable_chrom_filter)
                if not input_regions_bed:
                    filter_regions.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    input_regions = pybedtools.BedTool(input_regions_bed)
                    restricted = filter_regions.intersect(input_regions, nonamecheck=True)
                    restricted.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Beispiel #10
0
def block_regions(in_bam, ref_file, data):
    """Derive callable-block and non-callable-block BED files from a BAM.

    The output names are built from the callable BED produced by
    `parallel_callable_loci`; they are regenerated only when the nblock file
    is not up to date with that BED.

    Returns:
        Tuple of `(callblock_bed, nblock_bed, callable_bed)` paths.

    Raises:
        ValueError: when no reference region remains after removing the
            non-callable blocks.
    """
    config = data["config"]
    min_n_size = int(config["algorithm"].get("nomap_split_size", 250))
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(in_bam, ref_file, data)
        name_parts = os.path.splitext(callable_bed)
        nblock_bed = "%s-nblocks%s" % name_parts
        callblock_bed = "%s-callableblocks%s" % name_parts
        if utils.file_uptodate(nblock_bed, callable_bed):
            return callblock_bed, nblock_bed, callable_bed
        ref_regions = get_ref_bedtool(ref_file, config)
        nblock_regions = _add_config_regions(
            _get_nblock_regions(callable_bed, min_n_size, ref_regions),
            ref_regions, config)
        big_nblocks = nblock_regions.filter(lambda r: len(r) > min_n_size)
        big_nblocks.saveas(nblock_bed)
        has_callable = len(ref_regions.subtract(nblock_regions, nonamecheck=True)) > 0
        if not has_callable:
            raise ValueError(
                "No callable regions found from BAM file. Alignment regions might "
                "not overlap with regions found in your `variant_regions` BED: %s"
                % in_bam)
        without_nblocks = ref_regions.subtract(nblock_bed, nonamecheck=True)
        without_nblocks.merge(d=min_n_size).saveas(callblock_bed)
    return callblock_bed, nblock_bed, callable_bed
Beispiel #11
0
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
      - Combine all calls found in all files
      - Filter files retaining those present with multiple levels of support.
      - Remove calls in high depth regions.
      - Remove calls with ends overlapping exclusion regions like low complexity regions.

    Args:
        calls: list of per-caller dictionaries; each must provide
            `variantcaller` and may provide `exclude_file`. An "sv-ensemble"
            entry is appended in place when a final file exists.
        data: sample dictionary; read for `rgnames -> sample` and
            `dirs -> work`.
        items: passed through to `shared.remove_highdepth_regions`.

    Returns:
        The `calls` list, extended with the ensemble entry when available.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        # A list comprehension keeps this working on Python 3, where `filter`
        # is lazy: `len()` would fail and the stored value would be an
        # already-consumed iterator.
        named_beds = [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data)) for c in calls]
        input_beds = [xs for xs in named_beds
                      if xs[1] is not None and utils.file_exists(xs[1])]
    if len(input_beds) > 0:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            # Only apply the ensemble filter when enough callers contributed
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            # Use the first exclusion file supplied by any caller, if present
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data, bedprep_dir=bedprep_dir)})
    return calls
Beispiel #12
0
def variantcall_batch_region(items):
    """CWL entry point: run variant calling for a batch over one block of regions.

    All items must share a single `region_block`. Returns a dictionary with
    `vrn_file_region` (the called file, or None when every item already has a
    `vrn_file` and no caller is configured) and `region_block`.
    """
    items = [utils.to_single_data(x) for x in items]
    align_bams = [dd.get_align_bam(x) for x in items]
    variantcaller = _get_batch_variantcaller(items)
    unique_blocks = set(tuple(x.get("region_block")) for x in items if "region_block" in x)
    region_blocks = list(unique_blocks)
    assert len(region_blocks) == 1, region_blocks
    region_block = region_blocks[0]
    # Pre-called input variant files
    if not variantcaller and all(d.get("vrn_file") for d in items):
        return {"vrn_file_region": None, "region_block": region_block}
    caller_fn = get_variantcallers()[variantcaller]
    first_item = items[0]
    assoc_files = tz.get_in(("genome_resources", "variation"), first_item, {})
    region = _region_to_coords(region_block[0])
    chrom, start, end = region
    region_str = "_".join(str(x) for x in region)
    batch_name = _get_batch_name(items)
    block_name = "%s-%s-block.vcf.gz" % (batch_name, region_str)
    out_file = os.path.join(dd.get_work_dir(first_item), variantcaller, chrom, block_name)
    utils.safe_makedir(os.path.dirname(out_file))
    with pshared.bedtools_tmpdir(first_item):
        if variantcaller not in SUPPORT_MULTICORE:
            call_file = _run_variantcall_batch_multicore(items, region_block, out_file)
        else:
            ref_file = dd.get_ref_file(first_item)
            block_coords = [_region_to_coords(r) for r in region_block]
            call_file = caller_fn(align_bams, items, ref_file, assoc_files,
                                  block_coords, out_file)
    return {"vrn_file_region": call_file, "region_block": region_block}
Beispiel #13
0
def prepare_exclude_file(items, base_file, chrom=None):
    """Write a BED file of regions to exclude, named `<base_file>-exclude.bed`.

    The regions of interest are the subset variant regions when any are
    defined, otherwise the reference regions for `chrom` minus any LCR BED
    from `shared.get_lcr_bed`. The SV exclusion file, when present, is
    subtracted as well. The output is the full reference minus those
    regions of interest.

    Returns:
        Path of the exclusion BED; an existing file (or `.gz` version) is reused.
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    if len(all_vrs) > 0:
        ready_region = shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
    else:
        ready_region = chrom
    with shared.bedtools_tmpdir(items[0]):
        if ready_region != chrom:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        else:
            # Get a bedtool for the full region if no variant regions
            ref_base = tz.get_in(["reference", "fasta", "base"], items[0])
            want_bedtool = callable.get_ref_bedtool(ref_base, items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        have_output = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
        if not have_output:
            with file_transaction(out_file) as tx_out_file:
                full_ref_base = tz.get_in(["reference", "fasta", "base"], items[0])
                full_bedtool = callable.get_ref_bedtool(full_ref_base, items[0]["config"])
                if len(want_bedtool) > 0:
                    to_save = full_bedtool.subtract(want_bedtool)
                else:
                    to_save = full_bedtool
                to_save.saveas(tx_out_file)
    return out_file
Beispiel #14
0
def _prep_sample_cnvs(cnv_file, data):
    """Convert a multiple sample CNV file into a single BED file for a sample.

    Handles matching and fixing names where R converts numerical IDs (1234) into
    strings by adding an X (X1234), and converts other characters into '.'s.
    http://stat.ethz.ch/R-manual/R-devel/library/base/html/make.names.html

    Args:
        cnv_file: BED file of CNVs whose name field holds the sample name.
        data: sample dictionary; read for `rgnames -> sample`.

    Returns:
        Path to `<sample>-cnv.bed` in the directory of `cnv_file`; an existing
        file is reused.
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)

    def make_names(name):
        # Raw string: "\w" in a normal literal is an invalid escape sequence,
        # which newer Python versions warn about at compile time.
        return re.sub(r"[^\w.]", '.', name)

    def matches_sample_name(feat):
        # Accept the exact name, the X-prefixed form, or the '.'-substituted form
        return (feat.name == sample_name or feat.name == "X%s" % sample_name
                or feat.name == make_names(sample_name))

    def update_sample_name(feat):
        # Normalize any matched variant back to the real sample name
        feat.name = sample_name
        return feat

    sample_file = os.path.join(os.path.dirname(cnv_file),
                               "%s-cnv.bed" % sample_name)
    if not utils.file_exists(sample_file):
        with file_transaction(data, sample_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                pybedtools.BedTool(cnv_file).filter(matches_sample_name).each(
                    update_sample_name).saveas(tx_out_file)
    return sample_file
def block_regions(callable_bed, in_bam, ref_file, data):
    """Split a callable BED into callable-block and non-callable-block BED files.

    Both outputs are written in one file transaction and are regenerated only
    when the nblock file is not up to date with `callable_bed`.

    Returns:
        Tuple of `(callblock_bed, nblock_bed)` paths.

    Raises:
        ValueError: when no reference region remains after removing the
            non-callable blocks.
    """
    min_n_size = int(data["config"]["algorithm"].get("nomap_split_size", 250))

    def _longer_than_min(region):
        return len(region) > min_n_size

    with shared.bedtools_tmpdir(data):
        nblock_bed = "%s-nblocks.bed" % utils.splitext_plus(callable_bed)[0]
        callblock_bed = "%s-callableblocks.bed" % utils.splitext_plus(callable_bed)[0]
        if utils.file_uptodate(nblock_bed, callable_bed):
            return callblock_bed, nblock_bed
        ref_regions = get_ref_bedtool(ref_file, data["config"])
        nblock_regions = _get_nblock_regions(callable_bed, min_n_size, ref_regions)
        nblock_regions = _add_config_regions(nblock_regions, ref_regions, data)
        with file_transaction(data, nblock_bed, callblock_bed) as (tx_nblock_bed, tx_callblock_bed):
            nblock_regions.filter(_longer_than_min).saveas(tx_nblock_bed)
            callable_left = ref_regions.subtract(nblock_regions, nonamecheck=True)
            if len(callable_left) == 0:
                raise ValueError("No callable regions found in %s from BAM file %s. Some causes:\n "
                                 " - Alignment regions do not overlap with regions found "
                                 "in your `variant_regions` BED: %s\n"
                                 " - There are no aligned reads in your BAM file that pass sanity checks "
                                 " (mapping score > 1, non-duplicates, both ends of paired reads mapped)"
                                 % (dd.get_sample_name(data), in_bam, dd.get_variant_regions(data)))
            callable_blocks = ref_regions.subtract(tx_nblock_bed, nonamecheck=True)
            callable_blocks.merge(d=min_n_size).saveas(tx_callblock_bed)
    return callblock_bed, nblock_bed
Beispiel #16
0
def prepare_exclude_file(items, base_file, chrom=None):
    """Write a BED file of regions to exclude, optionally specific to `chrom`.

    The wanted regions start from the reference regions (subset to `chrom`
    when given); the SV exclusion file is subtracted and, when any item has a
    "genome" coverage interval, high depth regions are removed. The output is
    the full reference minus the wanted regions.

    Returns:
        Path of the exclusion BED; an existing file (or `.gz` version) is reused.
    """
    chrom_suffix = "-%s" % chrom if chrom else ""
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], chrom_suffix)
    if utils.file_exists(out_file) or utils.file_exists(out_file + ".gz"):
        return out_file
    with shared.bedtools_tmpdir(items[0]):
        # Get a bedtool for the full region if no variant regions
        ref_base = tz.get_in(["reference", "fasta", "base"], items[0])
        want_bedtool = callable.get_ref_bedtool(ref_base, items[0]["config"], chrom)
        if chrom:
            chrom_bed = shared.subset_bed_by_chrom(want_bedtool.saveas().fn, chrom, items[0])
            want_bedtool = pybedtools.BedTool(chrom_bed)
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True).saveas()
        if any(dd.get_coverage_interval(d) == "genome" for d in items):
            no_highdepth = shared.remove_highdepth_regions(want_bedtool.saveas().fn, items)
            want_bedtool = pybedtools.BedTool(no_highdepth)
        with file_transaction(items[0], out_file) as tx_out_file:
            full_ref_base = tz.get_in(["reference", "fasta", "base"], items[0])
            full_bedtool = callable.get_ref_bedtool(full_ref_base, items[0]["config"])
            if len(want_bedtool) > 0:
                to_save = full_bedtool.subtract(want_bedtool, nonamecheck=True)
            else:
                to_save = full_bedtool
            to_save.saveas(tx_out_file)
    return out_file
Beispiel #17
0
def combine_sample_regions(*samples):
    """Attach batch-level callable region information to every sample.

    For each batch, an analysis file and a no-analysis file are determined
    (a pre-existing global pair when usable, otherwise the result of
    `_combine_sample_regions_batch`). Each sample's `config -> algorithm`
    settings receive the region files, a region count and any high depth
    regions; samples without a `work_bam` get `work_bam_callable` from a
    batch member that has one.

    Returns:
        A list with one single-item list per sample.
    """
    # Unpack nested lists of samples grouped together
    first = samples[0]
    if isinstance(first, (list, tuple)) and len(first) == 1:
        samples = [x[0] for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    use_global = (utils.file_exists(global_analysis_file)
                  and not _needs_region_update(global_analysis_file, samples))
    if use_global:
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file),
                                               "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        grouped = vmulti.group_by_batch(samples, require_bam=False)
        for batch, items in grouped.items():
            batches.append(items)
            if global_analysis_file:
                analysis_file = global_analysis_file
                no_analysis_file = global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = analysis_file
                    algorithm["non_callable_regions"] = no_analysis_file
                    algorithm["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    region_count = pybedtools.BedTool(vr_file).count()
                    data["config"]["algorithm"]["callable_count"] = region_count
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    batch_bams = [x["work_bam"] for x in items if x.get("work_bam")]
                    if batch_bams:
                        # the last batch member with a BAM wins
                        data["work_bam_callable"] = batch_bams[-1]
                out.append([data])
        assert len(out) == len(samples)
        if analysis_files:
            _analysis_block_stats(pybedtools.BedTool(analysis_files[0]), batches[0])
    return out
Beispiel #18
0
def sample_callable_bed(bam_file, ref_file, data):
    """Build the per-sample callable BED, limited to configured variant regions.

    Regions named "CALLABLE" are kept from the callable BED returned by
    `coverage.calculate` and intersected with `variant_regions` from the
    algorithm configuration when defined.

    Returns:
        A `CovInfo` namedtuple of `(callable, highdepth, avg_coverage, coverage)`.
    """
    CovInfo = collections.namedtuple(
        "CovInfo", "callable, highdepth, avg_coverage, coverage")
    config = data["config"]

    def _is_callable(feat):
        return feat.name == "CALLABLE"

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        cov_results = coverage.calculate(bam_file, data)
        coverage_file, callable_bed, highdepth_bed, variant_regions_avg_cov = cov_results
        input_regions_bed = config["algorithm"].get("variant_regions", None)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(config, out_file) as tx_out_file:
                filter_regions = pybedtools.BedTool(callable_bed).filter(_is_callable)
                if not input_regions_bed:
                    filter_regions.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    input_regions = pybedtools.BedTool(input_regions_bed)
                    restricted = filter_regions.intersect(input_regions, nonamecheck=True)
                    restricted.saveas(tx_out_file)
    return CovInfo(out_file, highdepth_bed, variant_regions_avg_cov, coverage_file)
Beispiel #19
0
def sample_callable_bed(bam_file, ref_file, data):
    """Build the per-sample BED of callable regions, limited to analysis regions.

    Keeps features named CALLABLE; when `noalt_calling` is among the enabled
    tools or `altcontigs` among the excluded regions, additionally requires
    `chromhacks.is_nonalt` to accept the feature's chromosome. If variant
    regions are configured, the result is intersected with them.

    Returns a CovInfo namedtuple (callable, raw_callable, depth_files).
    `ref_file` is accepted but not used in this function.
    """
    from bcbio.heterogeneity import chromhacks
    skip_alt = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)

    def _keep_region(feat):
        """Accept callable features, optionally restricted by chromosome."""
        if feat.name != "CALLABLE":
            return False
        return not skip_alt or chromhacks.is_nonalt(feat.chrom)

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                kept = pybedtools.BedTool(callable_bed).filter(_keep_region)
                if not input_regions_bed:
                    kept.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    # NOTE(review): nothing is written to the transaction file when
                    # out_file is already newer than the input regions -- confirm intended.
                    limit_regions = pybedtools.BedTool(input_regions_bed)
                    kept.intersect(limit_regions, nonamecheck=True).saveas(tx_out_file)
    cov_info = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    return cov_info(out_file, callable_bed, depth_files)
Beispiel #20
0
def combine_bed_by_size(input_beds, sample, work_dir, data, delim=","):
    """Merge BED files into one ensemble BED, processing one size range at a time.

    For every (start, end) pair in `validate.EVENT_SIZES`, lines whose span
    falls in [start, end) -- plus every line whose event name begins with
    "BND" -- are collected, sorted and merged (distinct names in column 4
    joined by `delim`). Each per-size file that exists is passed through
    `annotate.add_genes`, and the results are combined into the output.

    Returns the path `<work_dir>/<sample>-ensemble.bed`, or whatever
    `bedutils.combine` returns when at least one per-size file was produced.
    """
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if len(input_beds) == 0:
        return out_file
    base, ext = os.path.splitext(out_file)
    size_beds = []
    for e_start, e_end in validate.EVENT_SIZES:
        size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
        if not utils.file_exists(size_out_file):
            with file_transaction(data, size_out_file) as tx_out_file:
                with shared.bedtools_tmpdir(data):
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    wrote_any = False
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            _, start, end, event_str = line.split()[:4]
                            event = event_str.split("_", 1)[0]
                            size = int(end) - int(start)
                            if e_start <= size < e_end or event == "BND":
                                out_handle.write(line)
                                wrote_any = True
                    # only run the merge when this size range has any lines
                    if wrote_any:
                        sorted_bed = pybedtools.BedTool(all_file).sort(stream=True)
                        sorted_bed.merge(c=4, o="distinct", delim=delim).saveas(tx_out_file)
        if utils.file_exists(size_out_file):
            size_beds.append(annotate.add_genes(size_out_file, data))
    if len(size_beds) > 0:
        out_file = bedutils.combine(size_beds, out_file, data)
    return out_file
Beispiel #21
0
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    Creates a BED file per call via `_create_bed`, concatenates the non-None
    results, then sorts and merges them (distinct names in column 4 joined
    by ","). When the ensemble file exists afterwards, an "ensemble" entry
    pointing at it is appended to `calls`.

    Returns `calls` (mutated in place).
    """
    import pybedtools
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "structural", sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                created = [_create_bed(c, out_file, data) for c in calls]
                # Build a real list: on Python 3 `filter` returns an iterator
                # and `len()` on it raises TypeError.
                input_beds = [bed for bed in created if bed is not None]
                if len(input_beds) > 0:
                    all_file = "%s-all.bed" % utils.splitext_plus(
                        tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True)\
                      .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble", "vrn_file": out_file})
    return calls
def _get_chroms(data):
    """List reference contigs that appear in the variant regions.

    Returns (name, 0, size) tuples, in reference contig order, for every
    contig whose name occurs as a chromosome in the variant regions BED.
    """
    with shared.bedtools_tmpdir(data):
        region_chroms = {feat.chrom for feat in pybedtools.BedTool(dd.get_variant_regions(data))}
    return [(contig.name, 0, contig.size)
            for contig in ref.file_contigs(dd.get_ref_file(data))
            if contig.name in region_chroms]
def combine_sample_regions(*samples):
    """Attach batch-level callable region files to every sample.

    For each batch, uses the work directory's `analysis_blocks.bed` /
    `noanalysis_blocks.bed` pair when that file exists and does not need an
    update; otherwise uses the pair returned by
    `_combine_sample_regions_batch`. The files and a region count are stored
    in each sample's `config -> algorithm` section.

    Returns a list of single-item lists, ordered like the input samples.
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- one analysis file shared by the entire sample set
    global_bed = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_bed) and not _needs_region_update(global_bed, samples):
        global_pair = (global_bed,
                       os.path.join(os.path.dirname(global_bed), "noanalysis_blocks.bed"))
    else:
        global_pair = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_pair:
                analysis_file, no_analysis_file = global_pair
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = analysis_file
                    algorithm["non_callable_regions"] = no_analysis_file
                    algorithm["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # samples without a BAM borrow one from the batch for callable calculations;
                # when several batch members have a BAM, the last one is used
                if not data.get("work_bam"):
                    batch_bams = [x["work_bam"] for x in items if x.get("work_bam")]
                    if batch_bams:
                        data["work_bam_callable"] = batch_bams[-1]
                out.append([data])
        # Keep output in input order, for consistency in CWL-based runs
        assert len(out) == len(samples)
        input_position = {dd.get_sample_name(d): i for i, d in enumerate(samples)}
        out.sort(key=lambda xs: input_position[dd.get_sample_name(xs[0])])
        if analysis_files:
            _analysis_block_stats(pybedtools.BedTool(analysis_files[0]), batches[0])
    return out
Beispiel #24
0
def combine_sample_regions(*samples):
    """Attach batch-level callable region files to every sample.

    For each batch, uses the work directory's `analysis_blocks.bed` /
    `noanalysis_blocks.bed` pair when that file exists and does not need an
    update; otherwise uses the pair returned by
    `_combine_sample_regions_batch`. The files and a region count are stored
    in each sample's `config -> algorithm` section.

    Returns a list of single-item lists, ordered like the input samples.
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- one analysis file shared by the entire sample set
    global_bed = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_bed) and not _needs_region_update(global_bed, samples):
        global_pair = (global_bed,
                       os.path.join(os.path.dirname(global_bed), "noanalysis_blocks.bed"))
    else:
        global_pair = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_pair:
                analysis_file, no_analysis_file = global_pair
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = analysis_file
                    algorithm["non_callable_regions"] = no_analysis_file
                    algorithm["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # samples without a BAM borrow one from the batch for callable calculations;
                # when several batch members have a BAM, the last one is used
                if not data.get("work_bam"):
                    batch_bams = [x["work_bam"] for x in items if x.get("work_bam")]
                    if batch_bams:
                        data["work_bam_callable"] = batch_bams[-1]
                out.append([data])
        # Keep output in input order, for consistency in CWL-based runs
        assert len(out) == len(samples)
        input_position = {dd.get_sample_name(d): i for i, d in enumerate(samples)}
        out.sort(key=lambda xs: input_position[dd.get_sample_name(xs[0])])
        if analysis_files:
            _analysis_block_stats(pybedtools.BedTool(analysis_files[0]), batches[0])
    return out
Beispiel #25
0
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    Creates a BED file per call via `_create_bed` and, if any were produced,
    combines them with `_combine_bed_by_size`. When the combined file exists,
    an "ensemble" entry with the cleaned file is appended to `calls`.

    Returns `calls` (mutated in place).
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        created = [_create_bed(c, sample, work_dir, data) for c in calls]
        # Build a real list: on Python 3 `filter` returns an iterator and
        # `len()` on it raises TypeError.
        input_beds = [bed for bed in created if bed is not None]
    if len(input_beds) > 0:
        out_file = _combine_bed_by_size(input_beds, sample, work_dir, data)
        if utils.file_exists(out_file):
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
            calls.append({"variantcaller": "ensemble",
                          "vrn_file": bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)})
    return calls
Beispiel #26
0
def combine_sample_regions(*samples):
    """Attach batch-level callable region files to every sample.

    For each batch, uses the work directory's `analysis_blocks.bed` /
    `noanalysis_blocks.bed` pair when that file exists and does not need an
    update; otherwise uses the pair returned by
    `_combine_sample_regions_batch`. The files, a region count and any
    high depth regions are stored in each sample's `config -> algorithm`
    section.

    Returns a list of single-item lists, one per sample.
    """
    import pybedtools
    samples = [x[0] for x in samples]
    # back compatibility -- one analysis file shared by the entire sample set
    global_bed = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_bed) and not _needs_region_update(global_bed, samples):
        global_pair = (global_bed,
                       os.path.join(os.path.dirname(global_bed), "noanalysis_blocks.bed"))
    else:
        global_pair = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_pair:
                analysis_file, no_analysis_file = global_pair
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    algorithm = data["config"]["algorithm"]
                    algorithm["callable_regions"] = analysis_file
                    algorithm["non_callable_regions"] = no_analysis_file
                    algorithm["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth_bed
                # samples without a BAM borrow one from the batch for callable calculations;
                # when several batch members have a BAM, the last one is used
                if not data.get("work_bam"):
                    batch_bams = [x["work_bam"] for x in items if x.get("work_bam")]
                    if batch_bams:
                        data["work_bam_callable"] = batch_bams[-1]
                out.append([data])
        assert len(out) == len(samples)
        if analysis_files:
            _analysis_block_stats(pybedtools.BedTool(analysis_files[0]), batches[0])
    return out
Beispiel #27
0
def _limit_calls(in_file, highdepth_beds, data):
    """Drop calls that overlap high depth regions.

    - highdepth_beds -- BED files of high depth regions; they are
      concatenated, sorted and merged before being used as a filter.

    Returns the path of the filtered file (`<in_file base>-glimit<ext>`),
    which is reused when it already exists.
    """
    import pybedtools
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with shared.bedtools_tmpdir(data):
            tx_base = utils.splitext_plus(tx_out_file)[0]
            all_file = "%s-all.bed" % tx_base
            with open(all_file, "w") as out_handle:
                for line in fileinput.input(highdepth_beds):
                    out_handle.write(line)
            sorted_depth = pybedtools.BedTool(all_file).sort(stream=True)
            to_remove = sorted_depth.merge(c=4, o="distinct", delim=",").saveas()
            # v=True keeps only the calls with no overlap in to_remove
            calls_bed = pybedtools.BedTool(in_file)
            calls_bed.intersect(to_remove, v=True).saveas(tx_out_file)
    return out_file
Beispiel #28
0
def sample_callable_bed(bam_file, ref_file, config):
    """Build the per-sample BED of callable regions, limited to analysis regions.

    Keeps the features named CALLABLE from `parallel_callable_loci` and, when
    `variant_regions` is configured, intersects them with those regions.

    Returns the path `<bam base>-callable_sample.bed`.
    """
    def _is_callable(feat):
        return feat.name == "CALLABLE"

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(bam_file, ref_file, config)
        input_regions_bed = config["algorithm"].get("variant_regions")
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(config, out_file) as tx_out_file:
                callable_only = pybedtools.BedTool(callable_bed).filter(_is_callable)
                if not input_regions_bed:
                    callable_only.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    # NOTE(review): nothing is written to the transaction file when
                    # out_file is already newer than the input regions -- confirm intended.
                    limit_regions = pybedtools.BedTool(input_regions_bed)
                    callable_only.intersect(limit_regions).saveas(tx_out_file)
    return out_file
Beispiel #29
0
def _limit_calls(in_file, highdepth_beds, data):
    """Drop calls that overlap high depth regions.

    - highdepth_beds -- BED files of high depth regions; they are
      concatenated, sorted and merged before being used as a filter.

    Returns the path of the filtered file (`<in_file base>-glimit<ext>`),
    which is regenerated whenever it is not up to date relative to `in_file`.
    """
    import pybedtools
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if utils.file_uptodate(out_file, in_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with shared.bedtools_tmpdir(data):
            tx_base = utils.splitext_plus(tx_out_file)[0]
            all_file = "%s-all.bed" % tx_base
            with open(all_file, "w") as out_handle:
                for line in fileinput.input(highdepth_beds):
                    out_handle.write(line)
            sorted_depth = pybedtools.BedTool(all_file).sort(stream=True)
            to_remove = sorted_depth.merge(c=4, o="distinct", delim=",").saveas()
            # v=True keeps only the calls with no overlap in to_remove
            calls_bed = pybedtools.BedTool(in_file)
            calls_bed.intersect(to_remove, v=True, nonamecheck=True).saveas(tx_out_file)
    return out_file
Beispiel #30
0
def sample_callable_bed(bam_file, ref_file, config):
    """Build the per-sample BED of callable regions, limited to analysis regions.

    Keeps the features named CALLABLE from `parallel_callable_loci` and, when
    `variant_regions` is configured, intersects them with those regions.

    Returns the path `<bam base>-callable_sample.bed`.
    """
    def _is_callable(feat):
        return feat.name == "CALLABLE"

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        callable_bed = parallel_callable_loci(bam_file, ref_file, config)
        input_regions_bed = config["algorithm"].get("variant_regions")
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(out_file) as tx_out_file:
                callable_only = pybedtools.BedTool(callable_bed).filter(_is_callable)
                if not input_regions_bed:
                    callable_only.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    # NOTE(review): nothing is written to the transaction file when
                    # out_file is already newer than the input regions -- confirm intended.
                    limit_regions = pybedtools.BedTool(input_regions_bed)
                    callable_only.intersect(limit_regions).saveas(tx_out_file)
    return out_file
Beispiel #31
0
def exclude_by_ends(in_file, exclude_file, data, in_params=None):
    """Filter calls whose ends overlap exclusion regions.

    An end file is built for each end of the calls (`_create_end_file`) and
    checked against `exclude_file` (`_find_to_filter`). A call is kept when
    the summed size recorded for it is at most
    `total_rpt_pct * end_buffer`; otherwise it is dropped and counted.

    `in_params` optionally overrides the default parameters (end_buffer,
    rpt_pct, total_rpt_pct, sv_pct).

    Returns (out_file, removed). `removed` is 0 when an up-to-date output
    is reused, since no filtering runs in that case.
    """
    params = dict(end_buffer=50, rpt_pct=0.9, total_rpt_pct=0.2, sv_pct=0.5)
    if in_params:
        params.update(in_params)
    assert in_file.endswith(".bed")
    out_file = "%s-norepeats%s" % utils.splitext_plus(in_file)
    to_filter = collections.defaultdict(list)
    removed = 0
    if utils.file_uptodate(out_file, in_file):
        return out_file, removed
    with file_transaction(data, out_file) as tx_out_file:
        with shared.bedtools_tmpdir(data):
            base, ext = utils.splitext_plus(tx_out_file)
            for coord, end_name in ((1, "end1"), (2, "end2")):
                end_file = _create_end_file(in_file, coord, params,
                                            "%s-%s%s" % (base, end_name, ext))
                to_filter = _find_to_filter(end_file, exclude_file, params, to_filter)
        max_rpt_size = params["total_rpt_pct"] * params["end_buffer"]
        with open(tx_out_file, "w") as out_handle, open(in_file) as in_handle:
            for line in in_handle:
                # key is chrom:start-end from the first three tab-separated columns
                key = "%s:%s-%s" % tuple(line.strip().split("\t")[:3])
                if sum(to_filter.get(key, [0])) > max_rpt_size:
                    removed += 1
                else:
                    out_handle.write(line)
    return out_file, removed
Beispiel #32
0
def sample_callable_bed(bam_file, ref_file, data):
    """Build the per-sample BED of callable regions, limited to analysis regions.

    Keeps the features named CALLABLE from the coverage calculation and, when
    `variant_regions` is configured, intersects them with those regions.

    Returns a CovInfo namedtuple (callable, highdepth, avg_coverage, coverage).
    `ref_file` is accepted but not used in this function.
    """
    def _is_callable(feat):
        return feat.name == "CALLABLE"

    config = data["config"]
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir({"config": config}):
        cov_results = coverage.calculate(bam_file, data)
        coverage_file, callable_bed, highdepth_bed, variant_regions_avg_cov = cov_results
        input_regions_bed = config["algorithm"].get("variant_regions")
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(config, out_file) as tx_out_file:
                callable_only = pybedtools.BedTool(callable_bed).filter(_is_callable)
                if not input_regions_bed:
                    callable_only.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    # NOTE(review): nothing is written to the transaction file when
                    # out_file is already newer than the input regions -- confirm intended.
                    limit_regions = pybedtools.BedTool(input_regions_bed)
                    callable_only.intersect(limit_regions, nonamecheck=True).saveas(tx_out_file)
    cov_info = collections.namedtuple(
        "CovInfo", "callable, highdepth, avg_coverage, coverage")
    return cov_info(out_file, highdepth_bed, variant_regions_avg_cov, coverage_file)
Beispiel #33
0
def _setup_variant_regions(data, out_dir):
    """Prepare a cleaned variant regions BED and store it on `data`.

    Uses the configured variant regions, falling back to the "transcripts"
    SV BED written under `out_dir`. The cleaned file keeps only features on
    contigs that are present in the reference and accepted by
    `chromhacks.is_nonalt`.

    Returns `data` updated through `dd.set_variant_regions`.
    """
    vr_file = dd.get_variant_regions(data) or regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    ref_contigs = {contig.name for contig in ref.file_contigs(dd.get_ref_file(data))}
    bedprep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    clean_name = "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0]
    out_file = os.path.join(bedprep_dir, clean_name)
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for feat in pybedtools.BedTool(vr_file):
                        if feat.chrom in ref_contigs and chromhacks.is_nonalt(feat.chrom):
                            out_handle.write(str(feat))
    return dd.set_variant_regions(data, out_file)
Beispiel #34
0
def _setup_variant_regions(data, out_dir):
    """Prepare a cleaned variant regions BED and store it on `data`.

    Uses the configured variant regions, falling back to the "transcripts"
    SV BED written under `out_dir`. The cleaned file keeps only features on
    contigs that are present in the reference and accepted by
    `chromhacks.is_nonalt`.

    Returns `data` updated through `dd.set_variant_regions`.
    """
    vr_file = dd.get_variant_regions(data) or regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    ref_contigs = {contig.name for contig in ref.file_contigs(dd.get_ref_file(data))}
    bedprep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    clean_name = "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0]
    out_file = os.path.join(bedprep_dir, clean_name)
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for feat in pybedtools.BedTool(vr_file):
                        if feat.chrom in ref_contigs and chromhacks.is_nonalt(feat.chrom):
                            out_handle.write(str(feat))
    return dd.set_variant_regions(data, out_file)
Beispiel #35
0
def prepare_exclude_file(items, base_file, chrom=None):
    """Write a BED file of regions to exclude, optionally for one chromosome.

    The regions to keep start from the reference bedtool (subset to `chrom`
    when given), minus the LCR BED when one is available, minus the SV
    exclusion file when present, with high depth regions removed. The
    output is the full reference bedtool with those kept regions subtracted.

    Returns the output path; an existing file (plain or `.gz`) is reused.
    """
    chrom_suffix = "-%s" % chrom if chrom else ""
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], chrom_suffix)
    if utils.file_exists(out_file) or utils.file_exists(out_file + ".gz"):
        return out_file
    first = items[0]
    with shared.bedtools_tmpdir(first):
        # Start from the full reference region; later steps remove what to exclude
        want_bedtool = callable.get_ref_bedtool(
            tz.get_in(["reference", "fasta", "base"], first), first["config"], chrom)
        if chrom:
            chrom_bed = shared.subset_bed_by_chrom(want_bedtool.saveas().fn, chrom, first)
            want_bedtool = pybedtools.BedTool(chrom_bed)
        lcr_bed = shared.get_lcr_bed(items)
        if lcr_bed:
            want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        no_highdepth = shared.remove_highdepth_regions(want_bedtool.saveas().fn, items)
        want_bedtool = pybedtools.BedTool(no_highdepth)
        with file_transaction(first, out_file) as tx_out_file:
            full_bedtool = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], first), first["config"])
            # the exclusion file is everything in the reference that is not wanted
            if len(want_bedtool) > 0:
                full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
            else:
                full_bedtool.saveas(tx_out_file)
    return out_file
Beispiel #36
0
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    Creates a BED file per call via `_create_bed`, concatenates the non-None
    results, then sorts and merges them into the ensemble file. When that
    file exists afterwards, an "ensemble" entry pointing at it is appended
    to `calls`.

    Returns `calls` (mutated in place).
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                created = [_create_bed(c, out_file) for c in calls]
                # Build a real list: on Python 3 `filter` returns an iterator
                # and `len()` on it raises TypeError.
                input_beds = [bed for bed in created if bed is not None]
                if len(input_beds) > 0:
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True).merge(nms=True).saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": out_file})
    return calls
Beispiel #37
0
def _prep_sample_cnvs(cnv_file, data):
    """Convert a multiple sample CNV file into a single BED file for a sample.

    Handles matching and fixing names where R converts numerical IDs (1234) into
    strings by adding an X (X1234), and converts other characters into '.'s.
    Both conversions can apply to the same name (1234-a becomes X1234.a), so
    the X-prefixed, dot-converted form is matched as well.
    http://stat.ethz.ch/R-manual/R-devel/library/base/html/make.names.html

    Features matching the sample are renamed to the sample name and written
    to `<sample>-cnv.bed` next to `cnv_file`. Returns that path; an existing
    file is reused.
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)

    def make_names(name):
        # raw string: "\w" in a plain literal is an invalid escape sequence
        return re.sub(r"[^\w.]", '.', name)

    def update_sample_name(feat):
        feat.name = sample_name
        return feat

    sample_file = os.path.join(os.path.dirname(cnv_file), "%s-cnv.bed" % sample_name)
    if not utils.file_exists(sample_file):
        r_name = make_names(sample_name)
        # every spelling R may have produced for this sample's name
        accepted_names = frozenset([sample_name, "X%s" % sample_name,
                                    r_name, "X%s" % r_name])

        def matches_sample_name(feat):
            return feat.name in accepted_names

        with file_transaction(data, sample_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                pybedtools.BedTool(cnv_file).filter(matches_sample_name).each(update_sample_name).saveas(tx_out_file)
    return sample_file
Beispiel #38
0
def _prep_sample_cnvs(cnv_file, data):
    """Extract one sample's CNVs from a multi-sample file into its own BED.

    A feature belongs to the sample when its name equals the sample name or
    the sample name prefixed with X (the form R gives numerical IDs, e.g.
    1234 -> X1234). Matching features are renamed to the sample name.

    Returns the path `<sample>-cnv.bed` next to `cnv_file`; an existing file
    is reused.
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)

    def _is_for_sample(feat):
        return feat.name in (sample_name, "X%s" % sample_name)

    def _rename_to_sample(feat):
        feat.name = sample_name
        return feat

    cnv_dir = os.path.dirname(cnv_file)
    sample_file = os.path.join(cnv_dir, "%s-cnv.bed" % sample_name)
    if utils.file_exists(sample_file):
        return sample_file
    with file_transaction(sample_file) as tx_out_file:
        with shared.bedtools_tmpdir(data):
            sample_feats = pybedtools.BedTool(cnv_file).filter(_is_for_sample)
            sample_feats.each(_rename_to_sample).saveas(tx_out_file)
    return sample_file
Beispiel #39
0
def exclude_by_ends(in_file, exclude_file, data, in_params=None):
    """Filter calls whose ends overlap exclusion regions.

    An end file is built for each end of the calls (`_create_end_file`) and
    checked against `exclude_file` (`_find_to_filter`). A call is kept when
    the summed size recorded for it is at most
    `total_rpt_pct * end_buffer`; otherwise it is dropped and counted.

    `in_params` optionally overrides the default parameters (end_buffer,
    rpt_pct, total_rpt_pct, sv_pct).

    Returns (out_file, removed). `removed` is 0 when an up-to-date output
    is reused, since no filtering runs in that case.
    """
    params = dict(end_buffer=50, rpt_pct=0.9, total_rpt_pct=0.2, sv_pct=0.5)
    if in_params:
        params.update(in_params)
    assert in_file.endswith(".bed")
    out_file = "%s-norepeats%s" % utils.splitext_plus(in_file)
    to_filter = collections.defaultdict(list)
    removed = 0
    if utils.file_uptodate(out_file, in_file):
        return out_file, removed
    with file_transaction(data, out_file) as tx_out_file:
        with shared.bedtools_tmpdir(data):
            base, ext = utils.splitext_plus(tx_out_file)
            for coord, end_name in ((1, "end1"), (2, "end2")):
                end_file = _create_end_file(in_file, coord, params,
                                            "%s-%s%s" % (base, end_name, ext))
                to_filter = _find_to_filter(end_file, exclude_file, params, to_filter)
        max_rpt_size = params["total_rpt_pct"] * params["end_buffer"]
        with open(tx_out_file, "w") as out_handle, open(in_file) as in_handle:
            for line in in_handle:
                # key is chrom:start-end from the first three tab-separated columns
                key = "%s:%s-%s" % tuple(line.strip().split("\t")[:3])
                if sum(to_filter.get(key, [0])) > max_rpt_size:
                    removed += 1
                else:
                    out_handle.write(line)
    return out_file, removed
Beispiel #40
0
def summarize(calls, data, highdepth_beds):
    """Summarize results from multiple callers into a single flattened BED file.

    Creates a BED file per call via `_create_bed`, keeps those that exist,
    and combines them with `combine_bed_by_size`. The combined file is
    passed through `_filter_ensemble` when more than `N_FILTER_CALLERS`
    inputs contributed, and through `_limit_calls` when `highdepth_beds` is
    non-empty. The cleaned result is appended to `calls` as an
    "sv-ensemble" entry.

    Returns `calls` (mutated in place).
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        created = [_create_bed(c, sample, work_dir, data) for c in calls]
        # Build a real list: on Python 3 `filter` returns an iterator and the
        # `len()` calls below raise TypeError.
        input_beds = [bed for bed in created
                      if bed is not None and utils.file_exists(bed)]
    if len(input_beds) > 0:
        out_file = combine_bed_by_size(input_beds, sample, work_dir, data)
        if utils.file_exists(out_file):
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            if len(highdepth_beds) > 0:
                limit_file = _limit_calls(filter_file, highdepth_beds, data)
            else:
                limit_file = filter_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(limit_file), "bedprep"))
            calls.append({"variantcaller": "sv-ensemble",
                          "vrn_file": bedutils.clean_file(limit_file, data, bedprep_dir=bedprep_dir)})
    return calls