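# The imports below are a hedged reconstruction of what this excerpt relies
# on. The stdlib/h5py/numpy imports are implied by the code itself; the `nc`
# and `ct` aliases follow the usual clodius conventions (negspy coordinate
# helpers and the clodius aggregation module) but are assumptions, not
# confirmed by the excerpt.
import math
import os
import os.path as op
import sys
import time

import h5py
import numpy as np

import negspy.coordinates as nc  # assumed source of get_chrominfo() et al.
import clodius.tiles as ct  # assumed source of ct.aggregate()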
def _bedgraph(
    filepath,
    output_file,
    assembly,
    chrom_col,
    from_pos_col,
    to_pos_col,
    value_col,
    has_header,
    chromosome,
    tile_size,
    chunk_size,
    method,
    nan_value,
    transform,
    count_nan,
    closed_interval,
    chromsizes_filename,
    zoom_step,
):
    if output_file is None:
        output_file = op.splitext(filepath)[0] + ".hitile"

    print("output file:", output_file)

    # overwrite the output file if it exists
    if op.exists(output_file):
        os.remove(output_file)

    f = h5py.File(output_file, "w")

    # get the information about the chromosomes in this assembly
    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_order = [
            a.encode("utf-8")
            for a in nc.get_chromorder_from_file(chromsizes_filename)
        ]
        chrom_sizes = nc.get_chromsizes_from_file(chromsizes_filename)
    else:
        chrom_info = nc.get_chrominfo(assembly)
        chrom_order = [a.encode("utf-8") for a in nc.get_chromorder(assembly)]
        chrom_sizes = nc.get_chromsizes(assembly)

    assembly_size = chrom_info.total_length
    print("assembly_size:", assembly_size)

    # how many values to read in at once while tiling
    chunk_size = tile_size * 2 ** chunk_size

    dsets = []  # data sets at each zoom level
    nan_dsets = []  # nan counts at each zoom level

    # initialize the arrays which will store the values at each stored zoom
    # level
    z = 0
    positions = []  # store where we are at the current dataset
    data_buffers = [[]]
    nan_data_buffers = [[]]

    while assembly_size / 2 ** z > tile_size:
        dset_length = math.ceil(assembly_size / 2 ** z)
        dsets += [
            f.create_dataset(
                "values_" + str(z), (dset_length,), dtype="f", compression="gzip"
            )
        ]
        nan_dsets += [
            f.create_dataset(
                "nan_values_" + str(z), (dset_length,), dtype="f", compression="gzip"
            )
        ]

        data_buffers += [[]]
        nan_data_buffers += [[]]

        positions += [0]
        z += zoom_step

    # store some metadata
    d = f.create_dataset("meta", (1,), dtype="f")
    print("assembly:", assembly)

    d.attrs["zoom-step"] = zoom_step
    d.attrs["max-length"] = assembly_size
    d.attrs["assembly"] = assembly
    d.attrs["chrom-names"] = chrom_order
    d.attrs["chrom-sizes"] = chrom_sizes
    d.attrs["chrom-order"] = chrom_order
    d.attrs["tile-size"] = tile_size
    d.attrs["max-zoom"] = max_zoom = math.ceil(
        math.log(d.attrs["max-length"] / tile_size) / math.log(2)
    )
    d.attrs["max-width"] = tile_size * 2 ** max_zoom
    d.attrs["max-position"] = 0

    print("assembly size (max-length)", d.attrs["max-length"])
    print("max-width", d.attrs["max-width"])
    print("max_zoom:", d.attrs["max-zoom"])
    print("chunk-size:", chunk_size)
    print("chrom-order", d.attrs["chrom-order"])

    t1 = time.time()

    # are we reading the input from stdin or from a file?
    if filepath == "-":
        f_in = sys.stdin  # input stream, kept distinct from the HDF5 handle f
    else:
        if filepath.endswith(".gz"):
            import gzip

            f_in = gzip.open(filepath, "rt")
        else:
            f_in = open(filepath, "r")

    curr_zoom = 0

    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        # always start cascading aggregated values from the base zoom level
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print(
            "position: {} progress: {:.2f} elapsed: {:.2f} "
            "remaining: {:.2f}".format(
                positions[curr_zoom] + 1,
                percent_progress,
                curr_time,
                curr_time / percent_progress - curr_time,
            )
        )

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it
            print("len(data_buffers[curr_zoom])", len(data_buffers[curr_zoom]))
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

            print("positions[curr_zoom]:", positions[curr_zoom])

            curr_pos = positions[curr_zoom]
            dsets[curr_zoom][curr_pos : curr_pos + chunk_size] = curr_chunk
            nan_dsets[curr_zoom][curr_pos : curr_pos + chunk_size] = nan_curr_chunk

            # aggregate and store aggregated values in the next zoom level's
            # data, and do the same for the nan-count buffers
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2 ** zoom_step)
            )
            nan_data_buffers[curr_zoom + 1] += list(
                ct.aggregate(nan_curr_chunk, 2 ** zoom_step)
            )

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break

    values = []
    nan_values = []

    if has_header:
        f_in.readline()

    # the genome position up to which we've filled in values
    curr_genome_pos = 0

    for line in f_in:
        # each line should indicate a chromosome, a start position and an
        # end position
        parts = line.strip().split()

        start_genome_pos = chrom_info.cum_chrom_lengths[parts[chrom_col - 1]] + int(
            parts[from_pos_col - 1]
        )

        # fill any gap since the previous interval with NaNs
        if start_genome_pos - curr_genome_pos > 1:
            values += [np.nan] * (start_genome_pos - curr_genome_pos - 1)
            nan_values += [1] * (start_genome_pos - curr_genome_pos - 1)
            curr_genome_pos += start_genome_pos - curr_genome_pos - 1

        # count how many nan values there are in the dataset
        nan_count = 1 if parts[value_col - 1] == nan_value else 0

        # if the provided values are log2 transformed, we have to
        # un-transform them
        if transform == "exp2":
            value = (
                2 ** float(parts[value_col - 1])
                if not parts[value_col - 1] == nan_value
                else np.nan
            )
        else:
            value = (
                float(parts[value_col - 1])
                if not parts[value_col - 1] == nan_value
                else np.nan
            )

        # we're going to add as many values as are specified in the bedfile
        # line
        values_to_add = [value] * (
            int(parts[to_pos_col - 1]) - int(parts[from_pos_col - 1])
        )
        nan_counts_to_add = [nan_count] * (
            int(parts[to_pos_col - 1]) - int(parts[from_pos_col - 1])
        )

        if closed_interval:
            values_to_add += [value]
            nan_counts_to_add += [nan_count]

        values += values_to_add
        nan_values += nan_counts_to_add

        d.attrs["max-position"] = start_genome_pos + len(values_to_add)

        curr_genome_pos += len(values_to_add)

        while len(values) > chunk_size:
            print("len(values):", len(values), chunk_size)
            print("line:", line)
            add_values_to_data_buffers(
                values[:chunk_size], nan_values[:chunk_size]
            )

            values = values[chunk_size:]
            nan_values = nan_values[chunk_size:]

    # flush whatever is left in the read buffer
    add_values_to_data_buffers(values, nan_values)

    # store the remaining data
    while True:
        # get the current chunk and store it
        chunk_size = len(data_buffers[curr_zoom])
        curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
        nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

        curr_pos = positions[curr_zoom]
        dsets[curr_zoom][curr_pos : curr_pos + chunk_size] = curr_chunk
        nan_dsets[curr_zoom][curr_pos : curr_pos + chunk_size] = nan_curr_chunk

        # aggregate and store aggregated values in the next zoom level's data
        data_buffers[curr_zoom + 1] += list(
            ct.aggregate(curr_chunk, 2 ** zoom_step)
        )
        nan_data_buffers[curr_zoom + 1] += list(
            ct.aggregate(nan_curr_chunk, 2 ** zoom_step)
        )

        data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
        nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

        positions[curr_zoom] += chunk_size
        curr_zoom += 1

        # we've created enough tile levels to cover the entire maximum width
        if curr_zoom * zoom_step >= max_zoom:
            break
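# A minimal usage sketch for _bedgraph above. The input path, assembly, and
# column layout are illustrative assumptions (a four-column bedgraph of
# chrom/start/end/value with 1-based column indices, matching the
# `*_col - 1` lookups), not values taken from the original code.
def _example_bedgraph_usage():
    _bedgraph(
        filepath="signal.bedgraph",  # assumed input file
        output_file=None,  # defaults to "signal.hitile" next to the input
        assembly="hg19",
        chrom_col=1,
        from_pos_col=2,
        to_pos_col=3,
        value_col=4,
        has_header=False,
        chromosome=None,
        tile_size=1024,
        chunk_size=14,  # reads tile_size * 2**14 values at a time
        method="sum",
        nan_value="NA",
        transform=None,
        count_nan=True,
        closed_interval=False,
        chromsizes_filename=None,
        zoom_step=1,  # store every zoom level
    )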
def get_data(hdf_file, z, x):
    """
    Return a tile from an hdf_file.

    :param hdf_file: A file handle for an HDF5 file (h5py.File('...'))
    :param z: The zoom level
    :param x: The x position of the tile
    """
    # is the tile within the range of possible tiles? (tiles are
    # zero-indexed, so valid x values are 0 .. 2**z - 1)
    if x >= 2**z:
        print("OUT OF RIGHT RANGE")
        return []
    if x < 0:
        print("OUT OF LEFT RANGE")
        return []

    d = hdf_file["meta"]

    tile_size = int(d.attrs["tile-size"])
    zoom_step = int(d.attrs["zoom-step"])
    max_zoom = int(d.attrs["max-zoom"])
    max_width = tile_size * 2**max_zoom

    if "max-position" in d.attrs:
        max_position = int(d.attrs["max-position"])
    else:
        max_position = max_width

    rz = max_zoom - z

    # because we only store a subsection of the zoom levels, find the
    # nearest stored zoom level at or below rz
    next_stored_zoom = zoom_step * math.floor(rz / zoom_step)
    zoom_offset = rz - next_stored_zoom

    # the number of entries to aggregate for each new value
    num_to_agg = 2**zoom_offset
    total_in_length = tile_size * num_to_agg

    # which positions we need to retrieve in order to dynamically aggregate
    start_pos = int(x * 2**zoom_offset * tile_size)
    end_pos = int(start_pos + total_in_length)

    # rescale the last filled position to the resolution of the stored zoom
    # level
    max_position = int(max_position / 2**next_stored_zoom)

    f = hdf_file["values_" + str(int(next_stored_zoom))]

    if start_pos > max_position:
        # we want a tile that's after the last bit of data
        a = np.zeros(end_pos - start_pos)
        a.fill(np.nan)
        ret_array = ct.aggregate(a, int(num_to_agg))
    elif start_pos < max_position and max_position < end_pos:
        a = f[start_pos:end_pos][:]
        # mask everything past the last filled position; indices into `a`
        # are relative to start_pos
        a[max_position + 1 - start_pos :] = np.nan
        ret_array = ct.aggregate(a, int(num_to_agg))
    else:
        ret_array = ct.aggregate(f[start_pos:end_pos], int(num_to_agg))

    # check to see if we counted the number of NaN values in the given
    # interval; if so, return per-position averages instead of sums
    if "nan_values_" + str(int(next_stored_zoom)) in hdf_file:
        f_nan = hdf_file["nan_values_" + str(int(next_stored_zoom))]
        nan_array = ct.aggregate(f_nan[start_pos:end_pos], int(num_to_agg))

        num_aggregated = 2**(max_zoom - z)
        num_vals_array = np.zeros(len(nan_array))
        num_vals_array.fill(num_aggregated)
        num_summed_array = num_vals_array - nan_array

        averages_array = ret_array / num_summed_array
        return averages_array

    return ret_array
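# A short sketch (assuming a previously generated "signal.hitile") of reading
# tiles back out: tile (z, x) covers positions [x * max_width / 2**z,
# (x + 1) * max_width / 2**z) of the concatenated assembly.
def _example_get_data_usage():
    with h5py.File("signal.hitile", "r") as hf:
        tile = get_data(hf, 0, 0)  # the single lowest-resolution tile
        print(len(tile), np.nansum(tile))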