Ejemplo n.º 1
0
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = filter(lambda x: x is not None,
                            list(set([tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items])))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Ejemplo n.º 2
0
def _remove_regions(in_file, remove_beds, ext, data):
    """Subtract a list of BED files from an input BED.

    General approach handling none, one and more remove_beds.
    """
    from bcbio.variation import bedutils
    out_file = "%s-%s.bed" % (utils.splitext_plus(in_file)[0], ext)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with bedtools_tmpdir(data):
                if len(remove_beds) == 0:
                    to_remove = None
                elif len(remove_beds) == 1:
                    to_remove = remove_beds[0]
                else:
                    to_remove = "%s-all.bed" % utils.splitext_plus(
                        tx_out_file)[0]
                    with open(to_remove, "w") as out_handle:
                        for b in remove_beds:
                            with utils.open_gzipsafe(b) as in_handle:
                                for line in in_handle:
                                    parts = line.split("\t")
                                    out_handle.write(
                                        "\t".join(parts[:4]).rstrip() + "\n")
                    if utils.file_exists(to_remove):
                        to_remove = bedutils.sort_merge(to_remove, data)
                if to_remove and utils.file_exists(to_remove):
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()),
                           "Remove problematic regions: %s" % ext)
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Ejemplo n.º 3
0
def _collapse_transcripts(in_file, window, data, out_dir):
    """Collapse transcripts into min/max coordinates and optionally add windows.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    out_file = os.path.join(out_dir,
                            "%s-transcripts_w%s.bed" % (os.path.splitext(os.path.basename(in_file))[0],
                                                        window))
    chrom_sizes = {}
    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
        chrom_sizes[contig.name] = contig.size
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_cmd = bedutils.get_sort_cmd()
            cmd = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(cmd.format(**locals()), "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                x = pybedtools.BedTool(prep_file)
                def gen():
                    for r in x:
                        yield r
                for name, rs in itertools.groupby(gen(), lambda r: (r.name, r.chrom)):
                    rs = list(rs)
                    r = rs[0]
                    for gcoords in _group_coords(rs):
                        min_pos = max(min(gcoords) - window, 0)
                        max_pos = min(max(gcoords) + window, chrom_sizes[r.chrom])
                        out_handle.write("%s\t%s\t%s\t%s\n" % (r.chrom, min_pos, max_pos, r.name))
    return bedutils.sort_merge(out_file, data)
Ejemplo n.º 4
0
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = filter(lambda x: x is not None,
                            list(set([tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items])))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Ejemplo n.º 5
0
def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
    """
    base_regions = shared.get_base_cnv_regions(data, work_dir)
    target_bed = bedutils.sort_merge(base_regions, data, out_dir=work_dir)
    if cov_interval == "amplicon":
        return target_bed, target_bed
    elif cov_interval == "genome":
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
        return target_bed, access_file