Example #1
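These snippets are excerpted from a larger module, so their imports are not shown. A plausible set of module-level imports, with aliases inferred from usage (the clodius module paths and the sm alias are assumptions), would be:

import collections as col
import gzip
import math
import os
import os.path as op
import random
import sqlite3
import sys
import tempfile

import h5py
import numpy as np
import slugid
import scipy.special as sm  # assumption: sm.logsumexp refers to scipy.special.logsumexp

import bbi                  # pybbi, used in Example #8
from tqdm import tqdm       # used in Example #8

import clodius.chromosomes as cch  # assumption: provides load_chromsizes
import clodius.multivec as cmv     # assumption: provides create_multivec_multires
                                   # and bedfile_to_multivec

# Helpers such as store_meta_data, transaction, abs2genomic, ChromosomeInterval
# and the *_bedline_to_vector functions come from the surrounding package and
# are not reproduced here.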
def _multivec(filepath,
              output_file,
              assembly,
              tile_size,
              chromsizes_filename,
              starting_resolution,
              method=None,
              row_infos_filename=None):
    '''
    Aggregate a multivec file.

    This is a file containing nxn data that is aggregated along only one axis.
    This data should be in an HDF5 file where each dataset is named for a
    chromosome and contains a 'resolutions' group containing values for the
    base level resolution.

    Example: f['chr1']['resolutions']['1000'] = [[1,2,3],[4,5,6]]

    The resulting data will be organized by resolution and chromosome.

    Example: f_out['chr1']['resolutions']['5000']=[[1000,2000,3000],[4000,5000,6000]]

    Aggregation is currently done by summing adjacent values.
    '''
    f_in = h5py.File(filepath, 'r')

    if output_file is None:
        output_file = op.splitext(filepath)[0] + ".multires.mv5"

    (chrom_info, chrom_names,
     chrom_sizes) = cch.load_chromsizes(chromsizes_filename, assembly)

    if method == 'logsumexp':

        def agg(x):
            a = x.T.reshape((x.shape[1], -1, 2))
            return sm.logsumexp(a, axis=2).T
    else:
        # the default (also used for 'maxtotal', which is not implemented)
        # sums adjacent value pairs, ignoring NaNs
        def agg(x):
            return np.nansum(x.T.reshape((x.shape[1], -1, 2)), axis=2).T

    print("agg:", agg)
    if row_infos_filename is not None:
        with open(row_infos_filename, 'r') as fr:
            row_infos = [l.strip().encode('utf8') for l in fr]
    else:
        row_infos = None
    print("row_infos:", row_infos)

    cmv.create_multivec_multires(
        f_in,
        chromsizes=zip(chrom_names, chrom_sizes),
        agg=agg,
        starting_resolution=starting_resolution,
        tile_size=tile_size,
        output_file=output_file,
        row_infos=row_infos)
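To make the "sum adjacent values" aggregation concrete, here is a minimal sketch with toy data (shapes chosen purely for illustration): bins run along axis 0, rows along axis 1, and each aggregation step halves the number of bins.

import numpy as np

x = np.array([[1, 5],
              [2, 6],
              [3, 7],
              [4, 8]])  # 4 bins x 2 rows

# pair up adjacent bins within each row, then sum each pair (NaN-tolerant)
def agg(v):
    return np.nansum(v.T.reshape((v.shape[1], -1, 2)), axis=2).T

print(agg(x))
# [[ 3 11]
#  [ 7 15]]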
Example #2
def abs2genome_fn(chromsizes_filename, start, end):
    """Convert an absolute genomic range to sections of genomic ranges.

    E.g. (1000,2000) => [('chr1', 1000, 1500), ('chr2', 1500, 2000)]
    """
    (chrom_info, chrom_names,
     chrom_sizes) = load_chromsizes(chromsizes_filename)

    for cid, start, end in abs2genomic(chrom_sizes, start, end):
        try:
            yield ChromosomeInterval(cid=cid,
                                     name=chrom_names[cid],
                                     start=start,
                                     end=end)
        except IndexError:
            # we've gone beyond the last chromosome so stop iterating
            return
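abs2genomic itself is not shown in this excerpt; a minimal sketch of the splitting it is assumed to perform (yielding a chromosome index plus chromosome-relative start and end coordinates) could look like this:

def abs2genomic_sketch(chrom_sizes, start, end):
    # walk the chromosomes in order, clipping [start, end) against each one
    abs_start = 0
    for cid, size in enumerate(chrom_sizes):
        abs_end = abs_start + size
        if end > abs_start and start < abs_end:
            yield (cid,
                   max(start, abs_start) - abs_start,
                   min(end, abs_end) - abs_start)
        abs_start = abs_end

# list(abs2genomic_sketch([1500, 1500], 1000, 2000))
# => [(0, 1000, 1500), (1, 0, 500)]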
Example #3
def _bedgraph_to_multivec(filepaths, output_file, assembly, chrom_col,
                          from_pos_col, to_pos_col, value_col, has_header,
                          chunk_size, nan_value, chromsizes_filename,
                          starting_resolution, num_rows, format,
                          row_infos_filename, tile_size, method):
    print('chrom_col:', chrom_col)

    with tempfile.TemporaryDirectory() as td:
        print('temporary dir:', td)

        temp_file = op.join(td, 'temp.mv5')
        f_out = h5py.File(temp_file, 'w')

        (chrom_info, chrom_names,
         chrom_sizes) = cch.load_chromsizes(chromsizes_filename, assembly)

        if row_infos_filename is not None:
            with open(row_infos_filename, 'r') as fr:
                row_infos = [l.strip().encode('utf8') for l in fr]

        else:
            row_infos = None

        for chrom in chrom_info.chrom_order:
            f_out.create_dataset(
                chrom,
                (math.ceil(chrom_info.chrom_lengths[chrom] / starting_resolution),
                 num_rows * len(filepaths)),
                fillvalue=np.nan,
                compression='gzip')

        def bedline_to_chrom_start_end_vector(bedlines, row_infos=None):
            chrom_set = set()
            start_set = set()
            end_set = set()
            all_vector = []

            for bedline in bedlines:
                parts = bedline.strip().split()
                chrom = parts[chrom_col - 1]
                start = int(parts[from_pos_col - 1])
                end = int(parts[to_pos_col - 1])
                vector = [
                    float(f) if f != 'NA' else np.nan
                    for f in parts[value_col - 1:value_col - 1 + num_rows]
                ]
                chrom_set.add(chrom)
                start_set.add(start)
                end_set.add(end)

                if len(chrom_set) > 1:
                    raise ValueError("Chromosomes don't match in these lines:",
                                     bedlines)
                if len(start_set) > 1:
                    raise ValueError(
                        "Start positions don't match in these lines:",
                        bedlines)
                if len(end_set) > 1:
                    raise ValueError(
                        "End positions don't match in these lines:", bedlines)
                all_vector += vector

            return (list(chrom_set)[0], list(start_set)[0], list(end_set)[0],
                    all_vector)

        if format == 'epilogos':
            cmv.bedfile_to_multivec(filepaths, f_out,
                                    epilogos_bedline_to_vector,
                                    starting_resolution, has_header,
                                    chunk_size)
        elif format == 'states':
            assert row_infos is not None, \
                "A row_infos file must be provided for --format = 'states'"
            states_dic = {row_infos[x]: x for x in range(len(row_infos))}

            cmv.bedfile_to_multivec(filepaths, f_out, states_bedline_to_vector,
                                    starting_resolution, has_header,
                                    chunk_size, states_dic)
        else:
            cmv.bedfile_to_multivec(filepaths, f_out,
                                    bedline_to_chrom_start_end_vector,
                                    starting_resolution, has_header,
                                    chunk_size)

        f_out.close()
        f_in = h5py.File(temp_file, 'r')

        if output_file is None:
            output_file = op.splitext(filepaths[0])[0] + '.multires.mv5'
        print('output_file:', output_file)

        # Overwrite the output file if it exists
        if op.exists(output_file):
            os.remove(output_file)

        if method == 'logsumexp':

            def agg(x):
                a = x.T.reshape((x.shape[1], -1, 2))

                # Replace NaNs with a very negative number so that they
                # contribute ~0 to the logsumexp. `na` is a flat view of
                # `a`, so assigning through it modifies `a` in place.
                na = a.reshape((-1, ))

                SMALL_NUM = -1e8
                NAN_THRESHOLD_NUM = SMALL_NUM / 100

                if np.nanmin(na) < NAN_THRESHOLD_NUM:
                    raise ValueError(
                        "Error removing nan's when running logsumexp aggregation"
                    )

                na[np.isnan(na)] = SMALL_NUM
                res = sm.logsumexp(a, axis=2).T

                # Pairs that were entirely NaN aggregate to roughly
                # SMALL_NUM; convert them back to NaN.
                nres = res.reshape((-1, ))
                nres[nres < NAN_THRESHOLD_NUM] = np.nan
                res = nres.reshape(res.shape)

                return res
        else:
            def agg(x):
                return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        cmv.create_multivec_multires(f_in,
                                     chromsizes=zip(chrom_names, chrom_sizes),
                                     agg=agg,
                                     starting_resolution=starting_resolution,
                                     tile_size=tile_size,
                                     output_file=output_file,
                                     row_infos=row_infos)
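A small standalone sketch of the NaN masking done by the logsumexp branch above (toy data; scipy.special.logsumexp is assumed to be what sm.logsumexp refers to):

import numpy as np
from scipy.special import logsumexp

x = np.array([[np.nan], [np.nan], [0.0], [0.0]])  # 4 bins x 1 row

SMALL_NUM = -1e8
a = x.T.reshape((x.shape[1], -1, 2)).copy()
a[np.isnan(a)] = SMALL_NUM            # NaNs contribute exp(-1e8) ~ 0
res = logsumexp(a, axis=2).T
res[res < SMALL_NUM / 100] = np.nan   # all-NaN pairs come back as NaN
print(res)
# [[       nan]
#  [0.69314718]]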
Example #4
def _bedfile(
    filepath,
    output_file,
    assembly,
    importance_column,
    has_header,
    chromosome,
    max_per_tile,
    tile_size,
    delimiter,
    chromsizes_filename,
    offset,
):
    BEDDB_VERSION = 3

    if output_file is None:
        output_file = filepath + ".beddb"

    if op.exists(output_file):
        os.remove(output_file)

    if filepath.endswith(".gz"):
        import gzip

        bed_file = gzip.open(filepath, "rt")
    else:
        bed_file = open(filepath, "r")

    try:
        (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )
    except FileNotFoundError:
        if chromsizes_filename is None:
            print("Assembly not found:", assembly, file=sys.stderr)
        else:
            print(
                "Chromsizes filename not found:", chromsizes_filename, file=sys.stderr
            )
        return None

    rand = random.Random(3)

    def line_to_np_array(line):
        """
        Convert a bed file line to a numpy array which can later
        be used as an entry in an h5py file.
        """
        try:
            start = int(line[1])
            stop = int(line[2])
        except ValueError:
            raise ValueError("Error parsing the position, line: {}".format(line))

        chrom = line[0]

        if importance_column is None:
            # assume a random importance when no aggregation strategy is given
            importance = rand.random()
        elif importance_column == "size":
            importance = stop - start
        elif importance_column == "random":
            importance = rand.random()
        else:
            importance = float(line[int(importance_column) - 1])

        if stop < start:
            print("WARNING: stop < start:", line, file=sys.stderr)

            start, stop = stop, start

        if len(line) > 3:
            bedline_name = line[3]
        else:
            bedline_name = ""
        # convert chromosome coordinates to genome coordinates
        genome_start = chrom_info.cum_chrom_lengths[chrom] + start + offset
        genome_end = chrom_info.cum_chrom_lengths[chrom] + stop + offset

        pos_offset = genome_start - start
        parts = {
            "startPos": genome_start,
            "endPos": genome_end,
            "uid": slugid.nice(),
            "name": bedline_name,
            "chrOffset": pos_offset,
            "fields": "\t".join(line),
            "importance": importance,
            "chromosome": str(chrom),
        }

        return parts

    dset = []

    print("delimiter:", delimiter)
    if has_header:
        line = bed_file.readline()
        header = line.strip().split(delimiter)
    else:
        line = bed_file.readline().strip()
        line_parts = line.strip().split(delimiter)
        try:
            dset += [line_to_np_array(line_parts)]
        except KeyError:
            print(
                f"Unable to find {line_parts[0]} in the list of chromosome sizes. "
                "Please make sure the correct assembly or chromsizes filename "
                "is passed in as a parameter",
                file=sys.stderr,
            )
            return None
        except IndexError:
            print("Invalid line:", line)
        header = map(str, list(range(1, len(line.strip().split(delimiter)) + 1)))

    for line in bed_file:
        line_parts = line.strip().split(delimiter)
        try:
            dset += [line_to_np_array(line_parts)]
        except IndexError:
            print("Invalid line:", line)

    if chromosome is not None:
        dset = [d for d in dset if d["chromosome"] == chromosome]

    # We need chromosome information as well as the assembly size to properly
    # tile this data

    assembly_size = chrom_info.total_length + 1
    """
    else:
        try:
            assembly_size = chrom_info.chrom_lengths[chromosome]
        except KeyError:
            print(
                "ERROR: Chromosome {} not found in assembly {}.".format(
                    chromosome, assembly
                ),
                file=sys.stderr
            )
            return 1
    """

    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))
    """
    if max_zoom is not None and max_zoom < max_zoom:
        max_zoom = max_zoom
    """

    # this script stores data in a sqlite database
    import sqlite3

    sqlite3.register_adapter(np.int64, lambda val: int(val))
    print("output_file:", output_file, "header:", header)
    conn = sqlite3.connect(output_file)

    # store some meta data
    store_meta_data(
        conn,
        1,
        max_length=assembly_size,
        assembly=assembly,
        chrom_names=chrom_names,
        chrom_sizes=chrom_sizes,
        tile_size=tile_size,
        max_zoom=max_zoom,
        max_width=tile_size * 2 ** max_zoom,
        header=header,
        version=BEDDB_VERSION,
    )

    # max_width = tile_size * 2 ** max_zoom
    uid_to_entry = {}

    intervals = []

    # store each bed file entry as an interval
    for d in dset:
        uid = d["uid"]
        uid_to_entry[uid] = d
        intervals += [(d["startPos"], d["endPos"], uid)]

    tile_width = tile_size

    c = conn.cursor()
    c.execute(
        """
        CREATE TABLE intervals
        (
            id int PRIMARY KEY,
            zoomLevel int,
            importance real,
            startPos int,
            endPos int,
            chrOffset int,
            uid text,
            name text,
            fields text
        )
        """
    )

    c.execute(
        """
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rStartZoomLevel, rEndZoomLevel, rStartPos, rEndPos
        )
        """
    )

    curr_zoom = 0
    counter = 0

    max_viewable_zoom = max_zoom

    sorted_intervals = sorted(
        intervals, key=lambda x: -uid_to_entry[x[-1]]["importance"]
    )
    # print('si:', sorted_intervals[:10])
    print("max_per_tile:", max_per_tile)

    tile_counts = col.defaultdict(int)

    for interval in sorted_intervals:
        # go through each interval from most important to least
        while curr_zoom <= max_viewable_zoom:
            # try to place it at the most zoomed-out level first and descend
            # to deeper zoom levels if the tiles there are full
            tile_width = tile_size * 2 ** (max_zoom - curr_zoom)

            curr_pos = interval[0]
            space_available = True

            # check if there's space at this zoom level
            while curr_pos < interval[1]:
                curr_tile = math.floor(curr_pos / tile_width)
                tile_id = "{}.{}".format(curr_zoom, curr_tile)

                """
                if interval[0] < 1000000:
                    print('tile_id:', tile_id, tile_counts[tile_id], curr_zoom, 'interval:', interval)
                """

                # print(tile_id, "tile_counts[tile_id]", tile_counts[tile_id])
                if tile_counts[tile_id] >= max_per_tile:
                    space_available = False
                    break

                curr_pos += tile_width

            # if there is, then fill it up
            if space_available:
                curr_pos = interval[0]
                while curr_pos < interval[1]:
                    curr_tile = math.floor(curr_pos / tile_width)
                    tile_id = "{}.{}".format(curr_zoom, curr_tile)

                    tile_counts[tile_id] += 1

                    """
                    # increment tile counts for lower level tiles
                    higher_zoom = curr_zoom + 1
                    higher_tile = math.floor(higher_zoom / 2)

                    while higher_zoom <= max_viewable_zoom:
                        new_tile_id = '{}.{}'.format(higher_zoom, higher_tile)
                        higher_zoom += 1
                        higher_tile = math.floor(higher_tile / 2)
                        tile_counts[new_tile_id] += 1
                    """

                    curr_pos += tile_width

            if space_available:
                # there's available space
                value = uid_to_entry[interval[-1]]

                # one extra question mark for the primary key
                exec_statement = "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?)"

                c.execute(
                    exec_statement,
                    # primary key, zoomLevel, startPos, endPos, chrOffset, line
                    (
                        counter,
                        curr_zoom,
                        value["importance"],
                        value["startPos"],
                        value["endPos"],
                        value["chrOffset"],
                        value["uid"],
                        value["name"],
                        value["fields"],
                    ),
                )

                if counter % 1000 == 0:
                    print("counter:", counter, value["endPos"] - value["startPos"])

                exec_statement = "INSERT INTO position_index VALUES (?,?,?,?,?)"
                c.execute(
                    exec_statement,
                    # add counter as a primary key
                    (counter, curr_zoom, curr_zoom, value["startPos"], value["endPos"]),
                )

                counter += 1
                break

            curr_zoom += 1

        curr_zoom = 0
    conn.commit()
    return True
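To make the zoom-level arithmetic in the placement loop concrete, here is a small sketch with toy numbers (the assembly size and the position are made up):

import math

tile_size = 1024
assembly_size = 1_000_000
max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

# at zoom 0 a single tile spans the whole assembly; each deeper zoom level
# halves the tile width, so a fixed position maps to ever finer tile ids
for zoom in (0, 1, max_zoom):
    tile_width = tile_size * 2 ** (max_zoom - zoom)
    print(zoom, tile_width, math.floor(500_000 / tile_width))
# 0 1048576 0
# 1 524288 0
# 10 1024 488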
Example #5
def _bedpe(
    filepath,
    output_file=None,
    assembly=None,
    importance_column="random",
    has_header=False,
    max_per_tile=100,
    tile_size=1024,
    chromosome=None,
    chromsizes_filename=None,
    chr1_col=1,
    from1_col=2,
    to1_col=3,
    chr2_col=4,
    from2_col=5,
    to2_col=6,
    max_zoom=None,
    sqlite_cache_size=500,  # 500 MB
    sqlite_batch_size=100000,
    verbose=0,
):
    BED2DDB_VERSION = 1

    if verbose > 0:
        print(f"BEDPEDB Version {BED2DDB_VERSION}")

    if filepath == "-":
        f = sys.stdin
    elif filepath.endswith(".gz"):
        f = gzip.open(filepath, "rt")
    else:
        f = open(filepath, "r")

    if output_file is None:
        output_file = filepath
        if filepath.endswith(".gz"):
            output_file = os.path.splitext(output_file)[0]
        output_file = os.path.splitext(output_file)[0] + ".bedpedb"

    if op.exists(output_file):
        os.remove(output_file)

    chrom_info, chrom_names, chrom_sizes = cch.load_chromsizes(
        chromsizes_filename, assembly
    )

    def line_to_dict(line):
        parts = line.split()
        d = {}
        try:
            chrom1 = parts[chr1_col - 1]
            chrom2 = parts[chr2_col - 1]
            chrom1_offset = chrom_info.cum_chrom_lengths[chrom1]
            chrom2_offset = chrom_info.cum_chrom_lengths[chrom2]

            d["xs"] = [
                chrom1_offset + int(parts[from1_col - 1]),
                chrom1_offset + int(parts[to1_col - 1]),
            ]
            d["ys"] = [
                chrom2_offset + int(parts[from2_col - 1]),
                chrom2_offset + int(parts[to2_col - 1]),
            ]
        except KeyError:
            error_str = (
                "ERROR converting chromosome position to genome position. "
                "Please make sure you've specified the correct assembly "
                "using the --assembly option or a chromsizes file using the . "
                "--chromsizes-filename option."
                "Current assembly: {}, chromosomes: {},{}".format(
                    assembly, parts[chr1_col - 1], parts[chr2_col - 1]
                )
            )
            raise KeyError(error_str)

        d["uid"] = slugid.nice()

        d["chrOffset"] = d["xs"][0] - int(parts[from1_col - 1])
        d["chrom1"] = str(chrom1)
        d["chrom2"] = str(chrom2)

        if importance_column is None:
            d["importance"] = max(d["xs"][1] - d["xs"][0], d["ys"][1] - d["ys"][0])
        elif importance_column == "random":
            d["importance"] = random.random()
        else:
            # We seem to use one-based numbering for columns...
            d["importance"] = float(parts[int(importance_column) - 1])

        d["fields"] = line

        return d

    entries = []

    if has_header:
        f.readline()
    else:
        first_line = f.readline().strip()
        try:
            parts = first_line.split()

            int(parts[from1_col - 1])
            int(parts[to1_col - 1])
            int(parts[from2_col - 1])
            int(parts[to2_col - 1])
        except ValueError:
            error_str = (
                "Couldn't convert one of the bedpe coordinates to an "
                "integer. If the input file contains a header, make sure to "
                "indicate that with the --has-header option. Line: {}".format(
                    first_line
                )
            )
            raise ValueError(error_str)
        entries = [line_to_dict(first_line)]

    entries += [line_to_dict(line) for line in [line.strip() for line in f] if line]

    if chromosome is not None:
        entries = [
            d for d in entries if d["chrom1"] == chromosome or d["chrom2"] == chromosome
        ]

    if verbose > 0:
        print(f"Found {len(entries)} entries")

    # We need chromosome information as well as the assembly size to properly
    # tile this data
    assembly_size = chrom_info.total_length + 1
    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

    # this script stores data in a sqlite database
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    conn = sqlite3.connect(output_file, isolation_level=None)

    # store some meta data
    store_meta_data(
        conn,
        1,
        max_length=assembly_size,
        assembly=assembly,
        chrom_names=chrom_names,
        chrom_sizes=chrom_sizes,
        tile_size=tile_size,
        max_zoom=max_zoom,
        max_width=tile_size * 2 ** max_zoom,
        version=BED2DDB_VERSION,
    )

    # max_width = tile_size * 2 ** max_zoom
    # uid_to_entry = {}

    c = conn.cursor()
    c.execute("PRAGMA synchronous = OFF;")
    c.execute("PRAGMA journal_mode = OFF;")
    c.execute(f"PRAGMA cache_size = {int(sqlite_cache_size * 1000)};")

    c.execute(
        """
        CREATE TABLE intervals
        (
            id int PRIMARY KEY,
            zoomLevel int,
            importance real,
            fromX int,
            toX int,
            fromY int,
            toY int,
            chrOffset int,
            uid text,
            fields text
        )
        """
    )

    c.execute(
        """
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rFromX, rToX,
            rFromY, rToY
        )
        """
    )

    curr_zoom = 0
    counter = 0

    tile_counts = col.defaultdict(lambda: col.defaultdict(lambda: col.defaultdict(int)))
    # Sort from high to low importance
    entries = sorted(entries, key=lambda x: -x["importance"])

    interval_inserts = []
    position_index_inserts = []

    def batch_insert(conn, c, interval_inserts, position_index_inserts):
        if verbose > 0:
            print(f"Insert batch ({counter})")

        with transaction(conn):
            c.executemany(
                "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?,?)", interval_inserts
            )
            c.executemany(
                "INSERT INTO position_index VALUES (?,?,?,?,?)", position_index_inserts
            )

        interval_inserts.clear()
        position_index_inserts.clear()

    for entry_num, d in enumerate(entries):
        curr_zoom = 0

        while curr_zoom <= max_zoom:
            tile_width = tile_size * 2 ** (max_zoom - curr_zoom)
            tile_from = list(
                map(lambda x: int(x / tile_width), [d["xs"][0], d["ys"][0]])
            )
            tile_to = list(map(lambda x: int(x / tile_width), [d["xs"][1], d["ys"][1]]))

            empty_tiles = True  # i.e. every tile the interval covers still has room

            # go through and check if any of the tiles at this zoom level are
            # full

            for i in range(tile_from[0], tile_to[0] + 1):
                if not empty_tiles:
                    break

                for j in range(tile_from[1], tile_to[1] + 1):
                    if tile_counts[curr_zoom][i][j] > max_per_tile:

                        empty_tiles = False
                        break

            if empty_tiles:
                # they're all empty so add this interval to this zoom level
                for i in range(tile_from[0], tile_to[0] + 1):
                    for j in range(tile_from[1], tile_to[1] + 1):
                        tile_counts[curr_zoom][i][j] += 1

                interval_inserts.append(
                    (
                        counter,
                        curr_zoom,
                        d["importance"],
                        d["xs"][0],
                        d["xs"][1],
                        d["ys"][0],
                        d["ys"][1],
                        d["chrOffset"],
                        d["uid"],
                        d["fields"],
                    )
                )

                position_index_inserts.append(
                    (counter, d["xs"][0], d["xs"][1], d["ys"][0], d["ys"][1])
                )

                counter += 1
                break

            curr_zoom += 1

        if len(interval_inserts) >= sqlite_batch_size:
            batch_insert(conn, c, interval_inserts, position_index_inserts)

    batch_insert(conn, c, interval_inserts, position_index_inserts)

    c.close()

    return
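The triply nested defaultdict above acts as a sparse 2-D occupancy map per zoom level; a minimal sketch of how it behaves:

import collections as col

tile_counts = col.defaultdict(
    lambda: col.defaultdict(lambda: col.defaultdict(int)))

tile_counts[0][3][7] += 1    # mark tile (x=3, y=7) at zoom level 0
print(tile_counts[0][3][7])  # 1
print(tile_counts[5][0][0])  # 0 -- unseen tiles default to empty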
Example #6
def _bedfile(
    filepath,
    output_file,
    assembly,
    importance_column,
    has_header,
    chromosome,
    max_per_tile,
    tile_size,
    delimiter,
    chromsizes_filename,
    offset
):
    if output_file is None:
        output_file = filepath + ".beddb"

    if op.exists(output_file):
        os.remove(output_file)

    if filepath.endswith('.gz'):
        import gzip
        bed_file = gzip.open(filepath, 'rt')
    else:
        bed_file = open(filepath, 'r')

    (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(chromsizes_filename, assembly)
    rand = random.Random(3)

    def line_to_np_array(line):
        '''
        Convert a parsed BED file line into a dict that can later
        be stored as an entry in the output database.
        '''
        try:
            start = int(line[1])
            stop = int(line[2])
        except ValueError:
            raise ValueError(
                "Error parsing the position, line: {}".format(line)
            )

        chrom = line[0]

        if importance_column is None:
            # assume a random importance when no aggregation strategy is given
            importance = rand.random()
        elif importance_column == 'size':
            importance = stop - start
        elif importance_column == 'random':
            importance = rand.random()
        else:
            importance = int(line[int(importance_column)-1])

        if stop < start:
            print("WARNING: stop < start:", line, file=sys.stderr)
            start, stop = stop, start

        # convert chromosome coordinates to genome coordinates
        genome_start = chrom_info.cum_chrom_lengths[chrom] + start + offset
        genome_end = chrom_info.cum_chrom_lengths[chrom] + stop + offset

        pos_offset = genome_start - start
        parts = {
            'startPos': genome_start,
            'endPos': genome_end,
            'uid': slugid.nice().decode('utf-8'),
            'chrOffset': pos_offset,
            'fields': '\t'.join(line),
            'importance': importance,
            'chromosome': str(chrom)
        }

        return parts

    dset = []

    print("delimiter:", delimiter)
    if has_header:
        line = bed_file.readline()
        header = line.strip().split(delimiter)
    else:
        line = bed_file.readline().strip()
        line_parts = line.strip().split(delimiter)
        try:
            dset += [line_to_np_array(line_parts)]
        except IndexError:
            print("Invalid line:", line)
        header = map(str, list(range(1, len(line.strip().split(delimiter)) + 1)))

    for line in bed_file:
        line_parts = line.strip().split(delimiter)
        try:
            dset += [line_to_np_array(line_parts)]
        except IndexError:
            print("Invalid line:", line)

    if chromosome is not None:
        dset = [d for d in dset if d['chromosome'] == chromosome]

    # We need chromosome information as well as the assembly size to properly
    # tile this data

    assembly_size = chrom_info.total_length + 1

    max_zoom = int(
        math.ceil(math.log(assembly_size / tile_size) / math.log(2))
    )

    # this script stores data in a sqlite database
    import sqlite3
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    print("output_file:", output_file)
    conn = sqlite3.connect(output_file)

    # store some meta data
    store_meta_data(
        conn,
        1,
        max_length=assembly_size,
        assembly=assembly,
        chrom_names=chrom_names,
        chrom_sizes=chrom_sizes,
        tile_size=tile_size,
        max_zoom=max_zoom,
        max_width=tile_size * 2 ** max_zoom,
        header=header,
    )


    max_width = tile_size * 2 ** max_zoom
    uid_to_entry = {}

    intervals = []

    # store each bed file entry as an interval
    for d in dset:
        uid = d['uid']
        uid_to_entry[uid] = d
        intervals += [(d['startPos'], d['endPos'], uid)]

    tile_width = tile_size

    c = conn.cursor()
    c.execute(
        '''
        CREATE TABLE intervals
        (
            id int PRIMARY KEY,
            zoomLevel int,
            importance real,
            startPos int,
            endPos int,
            chrOffset int,
            uid text,
            fields text
        )
        '''
    )

    c.execute(
        '''
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rStartPos, rEndPos
        )
        '''
    )

    curr_zoom = 0
    counter = 0

    max_viewable_zoom = max_zoom

    sorted_intervals = sorted(intervals,
                              key=lambda x: -uid_to_entry[x[-1]]['importance'])
    # print('si:', sorted_intervals[:10])
    print("max_per_tile:", max_per_tile)

    tile_counts = col.defaultdict(int)

    for interval in sorted_intervals:
        # go through each interval from most important to least
        while curr_zoom <= max_viewable_zoom:
            # try to place it at the most zoomed-out level first and descend
            # to deeper zoom levels if the tiles there are full
            tile_width = tile_size * 2 ** (max_zoom - curr_zoom)

            curr_pos = interval[0]
            space_available = True

            # check if there's space at this zoom level
            while curr_pos < interval[1]:
                curr_tile = math.floor(curr_pos / tile_width)
                tile_id = '{}.{}'.format(curr_zoom, curr_tile)

                if tile_counts[tile_id] >= max_per_tile:
                    space_available = False
                    break

                curr_pos += tile_width

            # if there is, then fill it up
            if space_available:
                curr_pos = interval[0]
                while curr_pos < interval[1]:
                    curr_tile = math.floor(curr_pos / tile_width)
                    tile_id = '{}.{}'.format(curr_zoom, curr_tile)
                    
                    tile_counts[tile_id] += 1

                    curr_pos += tile_width

            if space_available:
                # there's available space
                value = uid_to_entry[interval[-1]]

                # one extra question mark for the primary key
                exec_statement = 'INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?)'

                c.execute(
                    exec_statement,
                    # primary key, zoomLevel, startPos, endPos, chrOffset, line
                    (counter, curr_zoom,
                     value['importance'],
                     value['startPos'], value['endPos'],
                     value['chrOffset'],
                     value['uid'],
                     value['fields'])
                )

                if counter % 1000 == 0:
                    print('counter:', counter, value['endPos'] - value['startPos'])

                exec_statement = 'INSERT INTO position_index VALUES (?,?,?)'
                c.execute(
                    exec_statement,
                    # counter doubles as the primary key
                    (counter, value['startPos'], value['endPos'])
                )

                counter += 1
                break

            curr_zoom += 1

        curr_zoom = 0
    conn.commit()
Example #7
def _bedpe(filepath, output_file, assembly, importance_column, has_header,
           max_per_tile, tile_size, max_zoom=None, chromosome=None,
           chromsizes_filename=None,
           chr1_col=0, from1_col=1, to1_col=2,
           chr2_col=3, from2_col=4, to2_col=5):
    print('output_file:', output_file)

    if filepath == '-':
        f = sys.stdin
    elif filepath.endswith('.gz'):
        f = gzip.open(filepath, 'rt')
    else:
        print("plain")
        f = open(filepath, 'r')

    if output_file is None:
        output_file = filepath + ".multires.db"
    else:
        output_file = output_file

    if op.exists(output_file):
        os.remove(output_file)

    (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(chromsizes_filename, assembly)

    def line_to_dict(line):
        parts = line.split()
        d = {}
        try:
            d['xs'] = [
                chrom_info.cum_chrom_lengths[
                    parts[chr1_col]] + int(parts[from1_col]),
                chrom_info.cum_chrom_lengths[
                    parts[chr1_col]] + int(parts[to1_col])
            ]
            d['ys'] = [
                chrom_info.cum_chrom_lengths[
                    parts[chr2_col]] + int(parts[from2_col]),
                chrom_info.cum_chrom_lengths[
                    parts[chr2_col]] + int(parts[to2_col])
            ]
        except KeyError:
            error_str = (
                "ERROR converting chromosome position to genome position. "
                "Please make sure you've specified the correct assembly "
                "using the --assembly option. "
                "Current assembly: {}, chromosomes: {},{}".format(
                    assembly,
                    parts[chr1_col], parts[chr2_col]
                )
            )
            raise KeyError(error_str)

        d['uid'] = slugid.nice().decode('utf-8')

        d['chrOffset'] = d['xs'][0] - int(parts[from1_col])

        if importance_column is None:
            d['importance'] = max(
                d['xs'][1] - d['xs'][0], d['ys'][1] - d['ys'][0]
            )
        elif importance_column == 'random':
            d['importance'] = random.random()
        else:
            # We seem to use one-based numbering for columns...
            d['importance'] = float(parts[int(importance_column) - 1])

        d['fields'] = line

        return d

    entries = []

    if has_header:
        f.readline()
    else:
        first_line = f.readline().strip()
        try:
            parts = first_line.split()

            int(parts[from1_col])
            int(parts[to1_col])
            int(parts[from2_col])
            int(parts[to2_col])
        except ValueError:
            error_str = (
                "Couldn't convert one of the bedpe coordinates to an "
                "integer. If the input file contains a header, make sure to "
                "indicate that with the --has-header option. Line: {}"
                .format(first_line)
            )
            raise ValueError(error_str)
        entries = [line_to_dict(first_line)]

    entries += [line_to_dict(line.strip()) for line in f if line.strip()]

    # We need chromosome information as well as the assembly size to properly
    # tile this data
    assembly_size = chrom_info.total_length + 1
    max_zoom = int(
        math.ceil(math.log(assembly_size / tile_size) / math.log(2))
    )

    # this script stores data in a sqlite database
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    conn = sqlite3.connect(output_file)

    # store some meta data
    store_meta_data(
        conn, 1,
        max_length=assembly_size,
        assembly=assembly,
        chrom_names=chrom_names,
        chrom_sizes=chrom_sizes,
        tile_size=tile_size,
        max_zoom=max_zoom,
        max_width=tile_size * 2 ** max_zoom
    )

    # max_width = tile_size * 2 ** max_zoom
    # uid_to_entry = {}

    c = conn.cursor()
    c.execute(
        '''
        CREATE TABLE intervals
        (
            id int PRIMARY KEY,
            zoomLevel int,
            importance real,
            fromX int,
            toX int,
            fromY int,
            toY int,
            chrOffset int,
            uid text,
            fields text
        )
        '''
    )

    print("creating rtree")
    c.execute('''
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rFromX, rToX,
            rFromY, rToY
        )
        ''')

    curr_zoom = 0
    counter = 0

    tile_counts = col.defaultdict(
        lambda: col.defaultdict(lambda: col.defaultdict(int))
    )
    entries = sorted(entries, key=lambda x: -x['importance'])

    for d in entries:
        curr_zoom = 0

        while curr_zoom <= max_zoom:
            tile_width = tile_size * 2 ** (max_zoom - curr_zoom)
            tile_from = list(
                map(lambda x: x / tile_width, [d['xs'][0], d['ys'][0]])
            )
            tile_to = list(
                map(lambda x: x / tile_width, [d['xs'][1], d['ys'][1]])
            )

            empty_tiles = True  # i.e. every tile the interval covers still has room

            # go through and check if any of the tiles at this zoom level are
            # full

            for i in range(int(tile_from[0]), int(tile_to[0])+1):
                if not empty_tiles:
                    break

                for j in range(int(tile_from[1]), int(tile_to[1])+1):
                    if tile_counts[curr_zoom][i][j] > max_per_tile:

                        empty_tiles = False
                        break

            if empty_tiles:
                # they're all empty so add this interval to this zoom level
                for i in range(int(tile_from[0]), int(tile_to[0])+1):
                    for j in range(int(tile_from[1]), int(tile_to[1])+1):
                        tile_counts[curr_zoom][i][j] += 1

                c.execute(
                    'INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?,?)',
                    (
                        counter,
                        curr_zoom,
                        d['importance'],
                        d['xs'][0], d['xs'][1],
                        d['ys'][0], d['ys'][1],
                        d['chrOffset'],
                        d['uid'],
                        d['fields']
                    )
                )
                conn.commit()

                c.execute(
                    'INSERT INTO position_index VALUES (?,?,?,?,?)',
                    (
                        counter, d['xs'][0], d['xs'][1],
                        d['ys'][0], d['ys'][1]
                    )  # add counter as a primary key
                )
                conn.commit()

                counter += 1
                break

            curr_zoom += 1

    return
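Once built, the rtree-backed position_index supports efficient 2-D range queries. A hypothetical lookup of all intervals overlapping a query window (table and column names as created above; the database path is a placeholder):

import sqlite3

conn = sqlite3.connect("output.multires.db")
c = conn.cursor()

# find intervals whose bounding boxes overlap the query window
c.execute(
    """
    SELECT i.uid, i.fields
    FROM intervals i, position_index p
    WHERE i.id = p.id
      AND p.rToX >= ? AND p.rFromX <= ?
      AND p.rToY >= ? AND p.rFromY <= ?
    """,
    (1_000_000, 2_000_000, 1_000_000, 2_000_000),
)
print(c.fetchall())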
Example #8
def bigwigs_to_multivec(
    filepaths,
    output_file,
    assembly,
    chromsizes_filename,
    row_infos_filename,
    tile_size,
):
    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)

        temp_file = op.join(td, "temp.mv5")
        f_out = h5py.File(temp_file, "w")

        (chrom_info, chrom_names,
         chrom_lengths) = cch.load_chromsizes(chromsizes_filename, assembly)

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as f:
                row_infos = [line.strip().encode("utf8") for line in f]

        else:
            row_infos = None

        starting_resolution = 1
        resolution = starting_resolution
        for chrom in chrom_info.chrom_order:
            f_out.create_dataset(
                chrom,
                (
                    math.ceil(
                        chrom_info.chrom_lengths[chrom] / starting_resolution),
                    len(filepaths),
                ),
                fillvalue=np.nan,
                compression="gzip",
            )

        # Fill in data for each bigwig file.
        for bw_index, bw_file in tqdm(list(enumerate(filepaths)),
                                      desc="bigwigs"):
            if bbi.is_bigwig(bw_file):
                chromsizes = bbi.chromsizes(bw_file)
                matching_chromosomes = set(chromsizes.keys()).intersection(
                    set(chrom_names))

                # Fill in base-resolution data for each matching chromosome.
                for chr_name in matching_chromosomes:
                    print("chr_name:", chr_name, resolution)
                    chr_len = chrom_info.chrom_lengths[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution),
                                 len(filepaths))
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    chr_shape[0],
                                    summary="sum")
                    f_out[chr_name][:, bw_index] = arr
            else:
                print(f"{bw_file} not is_bigwig")

        f_out.flush()

        f_out.close()
        f_in = h5py.File(temp_file, "r")

        def agg(x):
            return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_lengths),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos,
        )
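Since starting_resolution is 1 here, each aggregation pass doubles the effective resolution. The number of halvings needed before a whole chromosome fits in a single tile mirrors the max_zoom computation in the earlier examples; a sketch with toy numbers:

import math

chrom_length = 248_956_422  # roughly the length of hg38 chr1
tile_size = 256             # hypothetical tile size

levels = math.ceil(math.log2(chrom_length / tile_size))
print(levels)  # 20 halvings until the chromosome fits in one tile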