Example #1
    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print(
            "position: {} progress: {:.2f} elapsed: {:.2f} "
            "remaining: {:.2f}".format(
                positions[curr_zoom] + 1,
                percent_progress,
                curr_time, curr_time / (percent_progress) - curr_time
            )
        )

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it, converting nans to 0
            print("len(data_buffers[curr_zoom])", len(data_buffers[curr_zoom]))
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

            print("positions[curr_zoom]:", positions[curr_zoom])

            curr_pos = positions[curr_zoom]
            dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = curr_chunk
            nan_dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = nan_curr_chunk

            # aggregate and store aggregated values in the next zoom_level's
            # data
            data_buffers[curr_zoom+1] += list(
                ct.aggregate(curr_chunk, 2 ** zoom_step)
            )
            nan_data_buffers[curr_zoom+1] += list(
                ct.aggregate(nan_curr_chunk, 2 ** zoom_step)
            )

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] =\
                nan_data_buffers[curr_zoom][chunk_size:]

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break
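
A minimal, self-contained sketch of the cascading-buffer pattern used above. It assumes, as the averaging logic in `get_data` (Example #4) suggests, that `ct.aggregate(arr, n)` sums each run of `n` adjacent values; `aggregate` here is a stand-in for it, and all names and sizes are illustrative:

import numpy as np

def aggregate(arr, n):
    # stand-in for ct.aggregate: sum every n adjacent values
    return arr.reshape(-1, n).sum(axis=1)

zoom_step = 1
chunk_size = 8                    # must be a multiple of 2**zoom_step
buffers = [[], []]                # one buffer per stored zoom level
stored = [[], []]                 # stand-in for the hdf5 datasets

buffers[0] += list(np.arange(16, dtype=float))

zoom = 0
while zoom + 1 < len(buffers) and len(buffers[zoom]) >= chunk_size:
    chunk = np.array(buffers[zoom][:chunk_size])
    stored[zoom] += list(chunk)                                # flush
    buffers[zoom + 1] += list(aggregate(chunk, 2**zoom_step))  # cascade up
    buffers[zoom] = buffers[zoom][chunk_size:]
    zoom += 1

print(stored[0])    # the 8 values flushed at zoom 0
print(buffers[1])   # their 4 pairwise sums, waiting at the next level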
Example #2
def _bedgraph(filepath, output_file, assembly, chrom_col, from_pos_col,
              to_pos_col, value_col, has_header, chromosome, tile_size,
              chunk_size, method, nan_value, transform, count_nan,
              closed_interval, chromsizes_filename, zoom_step):

    if output_file is None:
        output_file = op.splitext(filepath)[0] + '.hitile'

    print("output file:", output_file)

    # Overwrite the output file if it exists
    if op.exists(output_file):
        os.remove(output_file)
    f = h5py.File(output_file, 'w')

    # get the information about the chromosomes in this assembly
    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_order = [
            a.encode('utf-8')
            for a in nc.get_chromorder_from_file(chromsizes_filename)
        ]
        chrom_sizes = nc.get_chromsizes_from_file(chromsizes_filename)
    else:
        chrom_info = nc.get_chrominfo(assembly)
        chrom_order = [a.encode('utf-8') for a in nc.get_chromorder(assembly)]
        chrom_sizes = nc.get_chromsizes(assembly)

    assembly_size = chrom_info.total_length
    print('assembly_size:', assembly_size)

    chunk_size = tile_size * 2**chunk_size  # number of values to read at a time while tiling

    dsets = []  # data sets at each zoom level
    nan_dsets = []  # store nan values

    # initialize the arrays which will store the values at each stored zoom level
    z = 0
    positions = []  # store where we are at the current dataset
    data_buffers = [[]]
    nan_data_buffers = [[]]

    while assembly_size / 2**z > tile_size:
        dset_length = math.ceil(assembly_size / 2**z)
        dsets += [
            f.create_dataset('values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]
        nan_dsets += [
            f.create_dataset('nan_values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]

        data_buffers += [[]]
        nan_data_buffers += [[]]

        positions += [0]
        z += zoom_step

    #print("dsets[0][-10:]", dsets[0][-10:])

    # load the bigWig file
    #print("filepath:", filepath)

    # store some meta data
    d = f.create_dataset('meta', (1, ), dtype='f')

    print("assembly:", assembly)
    #print("chrom_info:", nc.get_chromorder(assembly))

    d.attrs['zoom-step'] = zoom_step
    d.attrs['max-length'] = assembly_size
    d.attrs['assembly'] = assembly
    d.attrs['chrom-names'] = chrom_order
    d.attrs['chrom-sizes'] = chrom_sizes
    d.attrs['chrom-order'] = chrom_order
    d.attrs['tile-size'] = tile_size
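    # e.g. for hg19 (max-length 3137161264) and tile_size 1024:
    #   max-zoom = ceil(log2(3137161264 / 1024)) = ceil(~21.55) = 22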
    d.attrs['max-zoom'] = max_zoom = math.ceil(
        math.log(d.attrs['max-length'] / tile_size) / math.log(2))
    d.attrs['max-width'] = tile_size * 2**max_zoom
    d.attrs['max-position'] = 0

    print("assembly size (max-length)", d.attrs['max-length'])
    print("max-width", d.attrs['max-width'])
    print("max_zoom:", d.attrs['max-zoom'])
    print("chunk-size:", chunk_size)
    print("chrom-order", d.attrs['chrom-order'])

    t1 = time.time()

    # are we reading the input from stdin or from a file? (use a separate
    # name so the h5py handle `f` isn't shadowed)

    if filepath == '-':
        infile = sys.stdin
    else:
        if filepath.endswith('.gz'):
            import gzip
            infile = gzip.open(filepath, 'rt')
        else:
            infile = open(filepath, 'r')

    curr_zoom = 0

    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print(
            "position: {} progress: {:.2f} elapsed: {:.2f} remaining: {:.2f}".
            format(positions[curr_zoom] + 1, percent_progress, curr_time,
                   curr_time / (percent_progress) - curr_time))

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it, converting nans to 0
            print("len(data_buffers[curr_zoom])", len(data_buffers[curr_zoom]))
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])
            print("positions[curr_zoom]:", positions[curr_zoom])

            curr_pos = positions[curr_zoom]
            dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = curr_chunk
            nan_dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = nan_curr_chunk

            # aggregate and store aggregated values in the next zoom_level's data
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2**zoom_step))
            nan_data_buffers[curr_zoom + 1] += list(
                ct.aggregate(nan_curr_chunk, 2**zoom_step))

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][
                chunk_size:]

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break

    values = []
    nan_values = []

    if has_header:
        infile.readline()

    # the genome position up to which we've filled in values
    curr_genome_pos = 0

    for line in infile:
        # each line gives a chromosome, a start position and an end position
        parts = line.strip().split()

        chrom = parts[chrom_col - 1]
        start_genome_pos = (chrom_info.cum_chrom_lengths[chrom]
                            + int(parts[from_pos_col - 1]))

        if start_genome_pos - curr_genome_pos > 1:
            values += [np.nan] * (start_genome_pos - curr_genome_pos - 1)
            nan_values += [1] * (start_genome_pos - curr_genome_pos - 1)

            curr_genome_pos += (start_genome_pos - curr_genome_pos - 1)

        raw_value = parts[value_col - 1]

        # count how many nan values there are in the dataset
        nan_count = 1 if raw_value == nan_value else 0

        # if the provided values are log2-transformed, we have to un-transform them
        if raw_value == nan_value:
            value = np.nan
        elif transform == 'exp2':
            value = 2**float(raw_value)
        else:
            value = float(raw_value)

        # print("pos:", int(parts[to_pos_col-1]) - int(parts[from_pos_col-1]))
        # we're going to add as many values are as specified in the bedfile line
        values_to_add = [value] * (int(parts[to_pos_col - 1]) -
                                   int(parts[from_pos_col - 1]))
        nan_counts_to_add = [nan_count] * (int(parts[to_pos_col - 1]) -
                                           int(parts[from_pos_col - 1]))

        if closed_interval:
            values_to_add += [value]
            nan_counts_to_add += [nan_count]

        # print("values_to_add", values_to_add)

        values += values_to_add
        nan_values += nan_counts_to_add

        d.attrs['max-position'] = start_genome_pos + len(values_to_add)

        #print("values:", values[:30])

        curr_genome_pos += len(values_to_add)

        while len(values) > chunk_size:
            print("len(values):", len(values), chunk_size)
            print("line:", line)
            add_values_to_data_buffers(values[:chunk_size],
                                       nan_values[:chunk_size])
            values = values[chunk_size:]
            nan_values = nan_values[chunk_size:]

    add_values_to_data_buffers(values, nan_values)

    # store the remaining data
    while True:
        # get the current chunk and store it
        chunk_size = len(data_buffers[curr_zoom])
        curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
        nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

        curr_pos = positions[curr_zoom]
        dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = curr_chunk
        nan_dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = nan_curr_chunk

        #print("chunk_size:", chunk_size, "len(curr_chunk):", len(curr_chunk), "len(nan_curr_chunk)", len(nan_curr_chunk))

        # aggregate and store aggregated values in the next zoom_level's data
        data_buffers[curr_zoom + 1] += list(
            ct.aggregate(curr_chunk, 2**zoom_step))
        nan_data_buffers[curr_zoom + 1] += list(
            ct.aggregate(nan_curr_chunk, 2**zoom_step))

        data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
        nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

        positions[curr_zoom] += chunk_size
        curr_zoom += 1

        # we've created enough tile levels to cover the entire maximum width
        if curr_zoom * zoom_step >= max_zoom:
            break
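
A hypothetical invocation of `_bedgraph` for a four-column bedGraph file (all argument values are illustrative; the column indices are 1-based, and `method` and `count_nan` are accepted but unused in the body shown):

_bedgraph('input.bedgraph', output_file=None, assembly='hg19',
          chrom_col=1, from_pos_col=2, to_pos_col=3, value_col=4,
          has_header=False, chromosome=None, tile_size=1024,
          chunk_size=14, method='sum', nan_value='NA',
          transform=None, count_nan=True, closed_interval=False,
          chromsizes_filename=None, zoom_step=8)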
Example #3
def _bigwig(filepath,
            chunk_size=14,
            zoom_step=8,
            tile_size=1024,
            output_file=None,
            assembly='hg19',
            chromsizes_filename=None,
            chromosome=None):

    if output_file is None:
        if chromosome is None:
            output_file = op.splitext(filepath)[0] + '.hitile'
        else:
            output_file = op.splitext(
                filepath)[0] + '.' + chromosome + '.hitile'

    # Overwrite the output file if it exists
    if op.exists(output_file):
        os.remove(output_file)
    f = h5py.File(output_file, 'w')

    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_order = list(nc.get_chromorder_from_file(chromsizes_filename))
        chrom_sizes = nc.get_chromsizes_from_file(chromsizes_filename)
    else:
        print("there")
        chrom_info = nc.get_chrominfo(assembly)
        chrom_order = list(nc.get_chromorder(assembly))
        chrom_sizes = nc.get_chromsizes(assembly)

    print("chrom_order:", chrom_order)
    assembly_size = chrom_info.total_length

    chunk_size = tile_size * 2**chunk_size  # number of values to read at a time while tiling

    dsets = []  # data sets at each zoom level
    nan_dsets = []

    # initialize the arrays which will store the values at each stored zoom level
    z = 0
    positions = []  # store where we are at the current dataset
    data_buffers = [[]]
    nan_data_buffers = [[]]

    while assembly_size / 2**z > tile_size:
        dset_length = math.ceil(assembly_size / 2**z)
        dsets += [
            f.create_dataset('values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]
        nan_dsets += [
            f.create_dataset('nan_values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]

        data_buffers += [[]]
        nan_data_buffers += [[]]

        positions += [0]
        z += zoom_step

    # load the bigWig file
    bwf = pbw.open(filepath)

    # store some meta data
    d = f.create_dataset('meta', (1, ), dtype='f')

    if chromosome is not None:
        d.attrs['min-pos'] = chrom_info.cum_chrom_lengths[chromosome]
        d.attrs['max-pos'] = chrom_info.cum_chrom_lengths[
            chromosome] + bwf.chroms()[chromosome]
    else:
        d.attrs['min-pos'] = 0
        d.attrs['max-pos'] = assembly_size

    d.attrs['zoom-step'] = zoom_step
    d.attrs['max-length'] = assembly_size
    d.attrs['assembly'] = assembly
    d.attrs['chrom-names'] = [a.encode('utf-8') for a in chrom_order]
    d.attrs['chrom-sizes'] = chrom_sizes
    d.attrs['chrom-order'] = [a.encode('utf-8') for a in chrom_order]
    d.attrs['tile-size'] = tile_size
    d.attrs['max-zoom'] = max_zoom = math.ceil(
        math.log(d.attrs['max-length'] / tile_size) / math.log(2))
    d.attrs['max-width'] = tile_size * 2**max_zoom
    d.attrs['max-position'] = 0

    print("assembly size (max-length)", d.attrs['max-length'])
    print("max-width", d.attrs['max-width'])
    print("max_zoom:", d.attrs['max-zoom'])
    print("chunk-size:", chunk_size)
    print("chrom-order", d.attrs['chrom-order'])

    t1 = time.time()

    curr_zoom = 0

    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print(
            "position: {} progress: {:.2f} elapsed: {:.2f} remaining: {:.2f}".
            format(positions[curr_zoom] + 1, percent_progress, curr_time,
                   curr_time / (percent_progress) - curr_time))

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it, converting nans to 0
            print("len(data_buffers[curr_zoom])", len(data_buffers[curr_zoom]))
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])
            print("positions[curr_zoom]:", positions[curr_zoom])

            curr_pos = positions[curr_zoom]
            dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = curr_chunk
            nan_dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = nan_curr_chunk

            # aggregate and store aggregated values in the next zoom_level's data
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2**zoom_step))
            nan_data_buffers[curr_zoom + 1] += list(
                ct.aggregate(nan_curr_chunk, 2**zoom_step))

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][
                chunk_size:]

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break

    # Do we only want values from a single chromosome?
    if chromosome is not None:
        chroms_to_use = [chromosome]
    else:
        chroms_to_use = chrom_order

    for chrom in chroms_to_use:
        print("chrom:", chrom)

        counter = 0
        chrom_size = chrom_info.chrom_lengths[chrom]

        # print("chrom_size:", chrom_size, bwf.chroms()[chrom])
        d.attrs['max-position'] += chrom_size

        while counter < chrom_size:
            remaining = min(chunk_size, chrom_size - counter)

            if chrom not in bwf.chroms():
                values = [np.nan] * remaining
                nan_values = [1] * remaining
            else:
                values = bwf.values(chrom, counter, counter + remaining)
                nan_values = np.isnan(values).astype('i4')

            # print("counter:", counter, "remaining:", remaining,
            # "counter + remaining:", counter + remaining)
            counter += remaining
            curr_zoom = 0

            add_values_to_data_buffers(list(values), list(nan_values))

    while True:
        # get the current chunk and store it
        chunk_size = len(data_buffers[curr_zoom])
        curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
        nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

        curr_pos = positions[curr_zoom]
        dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = curr_chunk
        nan_dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = nan_curr_chunk

        # aggregate and store aggregated values in the next zoom_level's data
        data_buffers[curr_zoom + 1] += list(
            ct.aggregate(curr_chunk, 2**zoom_step))
        nan_data_buffers[curr_zoom + 1] += list(
            ct.aggregate(nan_curr_chunk, 2**zoom_step))

        data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
        nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

        positions[curr_zoom] += chunk_size
        curr_zoom += 1

        # we've created enough tile levels to cover the entire maximum width
        if curr_zoom * zoom_step >= max_zoom:
            break

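A hypothetical call, with placeholder file names (per the defaults above, the output lands next to the input):

_bigwig('signal.bigWig')                     # writes signal.hitile
_bigwig('signal.bigWig', chromosome='chr1')  # writes signal.chr1.hitile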
Example #4
def get_data(hdf_file, z, x):
    '''
    Return a tile from an hdf_file.

    :param hdf_file: A file handle for an HDF5 file (h5py.File('...'))
    :param z: The zoom level
    :param x: The x position of the tile
    '''

    # is the tile within the range of possible tiles (0 .. 2**z - 1)?
    if x >= 2**z:
        print("OUT OF RIGHT RANGE")
        return []
    if x < 0:
        print("OUT OF LEFT RANGE")
        return []

    d = hdf_file['meta']

    tile_size = int(d.attrs['tile-size'])
    zoom_step = int(d.attrs['zoom-step'])
    max_length = int(d.attrs['max-length'])
    max_zoom = int(d.attrs['max-zoom'])

    if 'min-pos' in d.attrs:
        min_pos = d.attrs['min-pos']
    else:
        min_pos = 0

    max_width = tile_size * 2**max_zoom

    if 'max-position' in d.attrs:
        max_position = int(d.attrs['max-position'])
    else:
        max_position = max_width

    rz = max_zoom - z
    tile_width = max_width / 2**z

    # because we only store a subset of the zoom levels
    next_stored_zoom = zoom_step * math.floor(rz / zoom_step)
    zoom_offset = rz - next_stored_zoom

    # the number of entries to aggregate for each new value
    num_to_agg = 2**zoom_offset
    total_in_length = tile_size * num_to_agg
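    # e.g. with tile_size=1024, zoom_step=8, max_zoom=22 and z=3:
    #   rz = 19, next_stored_zoom = 16, zoom_offset = 3,
    #   num_to_agg = 8 and total_in_length = 8192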

    # which positions we need to retrieve in order to dynamically aggregate
    start_pos = int(x * 2**zoom_offset * tile_size)
    end_pos = int(start_pos + total_in_length)

    #print("max_position:", max_position)
    max_position = int(max_position / 2**next_stored_zoom)
    #print("new max_position:", max_position)

    f = hdf_file['values_' + str(int(next_stored_zoom))]

    if start_pos > max_position:
        # the tile lies entirely past the last bit of data
        a = np.full(end_pos - start_pos, np.nan)
        ret_array = ct.aggregate(a, int(num_to_agg))
    elif start_pos < max_position and max_position < end_pos:
        # the tile straddles the end of the data; mask everything past it
        # (index relative to the start of the slice, not the dataset)
        a = f[start_pos:end_pos][:]
        a[max_position + 1 - start_pos:] = np.nan
        ret_array = ct.aggregate(a, int(num_to_agg))
    else:
        ret_array = ct.aggregate(f[start_pos:end_pos], int(num_to_agg))

    # check to see if we counted the number of NaN values in the given
    # interval

    f_nan = None
    if "nan_values_" + str(int(next_stored_zoom)) in hdf_file:
        f_nan = hdf_file['nan_values_' + str(int(next_stored_zoom))]
        nan_array = ct.aggregate(f_nan[start_pos:end_pos], int(num_to_agg))
        num_aggregated = 2**(max_zoom - z)

        num_vals_array = np.full(len(nan_array), num_aggregated)
        num_summed_array = num_vals_array - nan_array

        averages_array = ret_array / num_summed_array

        return averages_array

    return ret_array
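
A short usage sketch, assuming `signal.hitile` was produced by one of the aggregators above:

import h5py

with h5py.File('signal.hitile', 'r') as hdf_file:
    tile = get_data(hdf_file, 0, 0)  # the single tile at the top zoom level
    print(len(tile))                 # tile-size values (1024 by default)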
Example #5
def main():
    parser = argparse.ArgumentParser(
        description="Read (start, end, value) lines from stdin and store "
                    "multi-zoom aggregates in an HDF5 file.")

    parser.add_argument('-f', '--filepath', default=None)
    parser.add_argument('-c', '--chunk-size', default=14, type=int)
    parser.add_argument('-z', '--zoom-step', default=8, type=int)
    parser.add_argument('-t', '--tile-size', default=1024, type=int)
    parser.add_argument('-o', '--output-file', default='/tmp/tmp.hdf5')
    args = parser.parse_args()
    last_end = 0

    max_zoom = 24
    if op.exists(args.output_file):
        os.remove(args.output_file)
    f = h5py.File(args.output_file, 'w')

    hum_size = 3137161264  # total assembly length used here for hg19
    tile_size = args.tile_size

    chunk_size = tile_size * 2**args.chunk_size

    dsets = []

    # initialize the datasets
    z = 0
    positions = []  # store where we are at the current dataset
    data_buffers = [[]]
    while hum_size / 2**z > tile_size:
        dset_length = math.ceil(hum_size / 2**z)
        dsets += [
            f.create_dataset('values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]
        data_buffers += [[]]
        positions += [0]
        z += args.zoom_step
    d = f.create_dataset('meta', (1, ), dtype='f')

    d.attrs['zoom-step'] = args.zoom_step
    d.attrs['max-length'] = hum_size
    d.attrs['assembly'] = 'hg19'
    d.attrs['tile-size'] = tile_size
    d.attrs['max-zoom'] = math.ceil(
        math.log(d.attrs['max-length'] / tile_size) / math.log(2))

    print("max_zoom:", d.attrs['max-zoom'])

    if args.filepath is None:
        print("Waiting for input...")
        for line in sys.stdin:
            parts = line.split()
            start = int(parts[0], 10)
            end = int(parts[1], 10)
            val = float(parts[2])

            if start > last_end:
                # fill in any positions skipped over in the input
                data_buffers[0] += [0] * (start - last_end)

            data_buffers[0] += [val] * (end - start)
            last_end = end
            curr_zoom = 0

            while len(data_buffers[curr_zoom]) > chunk_size:
                # get the current chunk and store it
                print("curr_zoom:", curr_zoom)
                curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
                curr_pos = positions[curr_zoom]
                dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = curr_chunk

                # aggregate and store aggregated values in the next zoom_level's data
                data_buffers[curr_zoom + 1] += list(
                    ct.aggregate(curr_chunk, 2**args.zoom_step))
                data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
                positions[curr_zoom] += chunk_size
                curr_zoom += 1

        # store the remaining data
        print("tile_size:", tile_size, positions[0])

        while True:
            # get the current chunk and store it
            chunk_size = len(data_buffers[curr_zoom])
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            curr_pos = positions[curr_zoom]
            dsets[curr_zoom][curr_pos:curr_pos+chunk_size] = curr_chunk

            print("curr_zoom:", curr_zoom, "position:",
                  positions[curr_zoom] + len(curr_chunk))
            print("len:", [len(d) for d in data_buffers])

            # aggregate and store aggregated values in the next zoom_level's data
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2**args.zoom_step))
            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            # we've created enough tile levels to cover the entire maximum width
            if curr_zoom * args.zoom_step >= max_zoom:
                break

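
A hypothetical stdin-driven run; the script expects whitespace-separated start/end/value lines:

printf '0 1000 1.5\n1000 2000 2.0\n' | python main.py -o /tmp/tmp.hdf5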