# Shared imports for the independent example snippets below (each snippet
# comes from a different project, but they all rely on the same core
# libraries).
import math

import bbi
import cooler
import numpy as np
import pandas as pd


def chunk(bigwig, window_size, step_size, aggregation, chroms, verbose=False):
    base_bins = math.ceil(window_size / aggregation)
    chromsizes = bbi.chromsizes(bigwig)

    chrom_values = []

    for chrom in chroms:
        if chrom not in chromsizes:
            print("Skipping chrom (not in bigWig file):", chrom)
            continue

        chrom_size = chromsizes[chrom]

        values = np.zeros(
            (math.ceil((chrom_size - step_size) / step_size), base_bins)
        )
        starts = np.arange(0, chrom_size - step_size, step_size)
        ends = np.append(
            np.arange(window_size, chrom_size, step_size), chrom_size
        )

        # Extract all but the last window in one batched call, which is
        # faster than calling `fetch` in a loop
        values[:-1] = bbi.stackup(
            bigwig,
            [chrom] * (starts.size - 1),
            starts[:-1],
            ends[:-1],
            bins=base_bins,
            missing=0.0,
        )

        final_bins = math.ceil((ends[-1] - starts[-1]) / aggregation)
        # Extract the last window separately because its size is likely to
        # be different from the others
        values[-1, :final_bins] = bbi.fetch(
            bigwig, chrom, starts[-1], ends[-1], bins=final_bins, missing=0.0
        )

        if verbose:
            print(
                "Chrom: {}".format(chrom),
                "# win: {}".format(values.shape[0]),
                "Max: {}".format(np.max(values)),
            )

        chrom_values.append(values)

    return chrom_values
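# Usage sketch for `chunk` (illustrative only; the bigWig path and the
# chromosome list are hypothetical placeholders, not from the original
# source).
windows = chunk(
    "signal.bigWig",    # hypothetical input file
    window_size=12000,  # 12 kb sliding windows
    step_size=6000,     # 50% overlap between consecutive windows
    aggregation=100,    # aggregate the signal into 100 bp bins
    chroms=["chr1", "chr2"],
    verbose=True,
)
# `windows` holds one (num_windows, 120) array per chromosome, since
# ceil(12000 / 100) = 120 bins per window.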
def tileset_info(bwpath):
    '''
    Get the tileset info for a bigWig file

    Parameters
    ----------
    bwpath: string
        The path to the bigwig file from which to retrieve data

    Returns
    -------
    tileset_info: {'min_pos': [],
                   'max_pos': [],
                   'max_width': 131072,
                   'tile_size': 1024,
                   'max_zoom': 7}
    '''
    TILE_SIZE = 1024

    chromsizes = bbi.chromsizes(bwpath)
    chromosomes = cooler.util.natsorted(chromsizes.keys())
    chromsizes = pd.Series(chromsizes)[chromosomes]

    min_tile_cover = np.ceil(sum(chromsizes) / TILE_SIZE)
    max_zoom = int(np.ceil(np.log2(min_tile_cover)))

    tileset_info = {
        'min_pos': [0],
        'max_pos': [TILE_SIZE * 2**max_zoom],
        'max_width': TILE_SIZE * 2**max_zoom,
        'tile_size': TILE_SIZE,
        'max_zoom': max_zoom,
    }

    return tileset_info
def tileset_info(bwpath): """Get the tileset info for a bigWig file Parameters ---------- bwpath: string Path to the bigwig file Returns ------- tileset_info: { 'min_pos': [], 'max_pos': [], 'max_width': 131072 'tile_size': 1024, 'max_zoom': 7 } """ TILE_SIZE = 1024 chromsizes = bbi.chromsizes(bwpath) chromosomes = cooler.util.natsorted(chromsizes.keys()) chromsizes = pd.Series(chromsizes)[chromosomes] min_tile_cover = np.ceil(sum(chromsizes) / TILE_SIZE) max_zoom = int(np.ceil(np.log2(min_tile_cover))) tileset_info = { "min_pos": [0], "max_pos": [TILE_SIZE * 2**max_zoom], "max_width": TILE_SIZE * 2**max_zoom, "tile_size": TILE_SIZE, "max_zoom": max_zoom, } return tileset_info
def get_chromsizes(bwpath):
    """TODO: replace this with negspy.

    Also, return NaNs from any missing chromosomes in bbi.fetch
    """
    chromsizes = bbi.chromsizes(bwpath)
    chromosomes = cooler.util.natsorted(chromsizes.keys())
    return pd.Series(chromsizes)[chromosomes]
# This variant uses a bare `natsorted`; the import below is an assumption
# (cooler.util.natsorted works, as would natsort.natsorted).
from cooler.util import natsorted


def get_chromsizes(bwpath):
    """
    TODO: replace this with negspy

    Also, return NaNs from any missing chromosomes in bbi.fetch
    """
    chromsizes = bbi.chromsizes(bwpath)
    chromosomes = natsorted(chromsizes.keys())
    chrom_series = pd.Series(chromsizes)[chromosomes]
    return chrom_series
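# Usage sketch (the path is a hypothetical placeholder):
sizes = get_chromsizes("signal.bigWig")
print(sizes.head())
# Natural sorting puts chr2 before chr10, whereas plain lexicographic
# sorting of the names would order chr10 first.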
def test_fetch_oob(path):
    x = bbi.fetch(path, 'chr21', -10, 1000, oob=np.nan)
    assert np.all(np.isnan(x[:10]))
    x = bbi.fetch(path, 'chr21', -10, 1000, oob=0)
    assert np.all(x[:10] == 0)

    n = bbi.chromsizes(path)['chr21']
    x = bbi.fetch(path, 'chr21', n - 1000, n + 10, oob=np.nan)
    assert np.all(np.isnan(x[-10:]))
    x = bbi.fetch(path, 'chr21', n - 1000, n + 10, oob=0)
    assert np.all(x[-10:] == 0)
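# The tests above pin down the `oob` semantics: bbi.fetch pads positions
# that fall outside the chromosome with the given fill value instead of
# raising. A minimal sketch (the file path is a placeholder):
#
#   vals = bbi.fetch("test.bigWig", "chr21", -5, 5, oob=np.nan)
#   # vals[:5] is NaN padding for positions -5..-1; vals[5:] is real signal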
def test_chromsizes(uri):
    chromsizes = bbi.chromsizes(uri)
    assert len(chromsizes) == 1 and 'chr21' in chromsizes
# Additional imports for this snippet. `cch` and `cmv` are assumed to be
# clodius helper modules, e.g.:
#   import clodius.chromosomes as cch  # provides load_chromsizes
#   import clodius.multivec as cmv     # provides create_multivec_multires
import os.path as op
import tempfile

import h5py
from tqdm import tqdm


def bigwigs_to_multivec(
    filepaths,
    output_file,
    assembly,
    chromsizes_filename,
    row_infos_filename,
    tile_size,
):
    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)

        temp_file = op.join(td, "temp.mv5")
        f_out = h5py.File(temp_file, "w")

        (chrom_info, chrom_names, chrom_lengths) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as f:
                row_infos = [line.strip().encode("utf8") for line in f]
        else:
            row_infos = None

        starting_resolution = 1
        resolution = starting_resolution

        for chrom in chrom_info.chrom_order:
            f_out.create_dataset(
                chrom,
                (
                    math.ceil(
                        chrom_info.chrom_lengths[chrom] / starting_resolution
                    ),
                    len(filepaths),
                ),
                fillvalue=np.nan,
                compression="gzip",
            )

        # Fill in data for each bigwig file.
        for bw_index, bw_file in tqdm(list(enumerate(filepaths)), desc="bigwigs"):
            if bbi.is_bigwig(bw_file):
                chromsizes = bbi.chromsizes(bw_file)
                matching_chromosomes = set(chromsizes.keys()).intersection(
                    set(chrom_names)
                )

                # Fill in data for each matching chromosome of a bigwig file.
                for chr_name in matching_chromosomes:
                    print("chr_name:", chr_name, resolution)
                    chr_len = chrom_info.chrom_lengths[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), len(filepaths))
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, chr_shape[0], summary="sum"
                    )
                    f_out[chr_name][:, bw_index] = arr
            else:
                print(f"{bw_file} not is_bigwig")

        f_out.flush()
        f_out.close()

        tf = temp_file
        f_in = h5py.File(tf, "r")

        def agg(x):
            # Sum adjacent pairs of bins along the genomic axis, halving the
            # resolution at each aggregation step.
            return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_lengths),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos,
        )
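# Hypothetical invocation of the converter above; all file names are
# placeholders, and "hg19.chrom.sizes" stands for the usual two-column
# name/length file.
bigwigs_to_multivec(
    filepaths=["a.bigWig", "b.bigWig"],
    output_file="out.mv5",
    assembly="hg19",
    chromsizes_filename="hg19.chrom.sizes",
    row_infos_filename=None,  # no per-row metadata
    tile_size=256,
)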
# Additional imports and module-level constants assumed by this snippet:
import json
import os
import resource

import negspy.coordinates as nc

GENOME_BUILD = "hg19"          # illustrative; not defined in the snippet
GENOME_LENGTH = 3_100_000_000  # approximate hg19 total length, illustrative


def bigwigs_to_multivec(input_bigwig_files, output_file, starting_resolution):
    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder(GENOME_BUILD)
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    chroms_length_arr = np.array(
        # Use GENOME_BUILD for consistency with get_chromorder above
        [nc.get_chrominfo(GENOME_BUILD).chrom_lengths[x] for x in chromosomes],
        dtype="i8",
    )
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    num_zoom_levels = math.floor(math.log2(GENOME_LENGTH / starting_resolution))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(num_zoom_levels)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(
                chr_name,
                chr_shape,
                dtype="f4",
                fillvalue=np.nan,
                compression='gzip',
            )

    # Fill in data for each bigwig file.
    for bw_index, bw_file in enumerate(input_bigwig_files):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(
                chromosomes_set
            )

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a
                # bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    num_bins = math.ceil(chr_len / resolution)
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, num_bins, summary="sum"
                    )
                    resolutions_group[str(resolution)]["values"][chr_name][
                        :, bw_index
                    ] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    f.flush()
    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for input_bigwig_file in input_bigwig_files:
        _, filename = os.path.split(input_bigwig_file)
        name, _ = os.path.splitext(filename)
        row_infos.append({'id': name})

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')
    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded
    f.close()
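# Worked example of the zoom-level count above: with the illustrative
# GENOME_LENGTH of ~3.1e9 and starting_resolution = 25,
#   num_zoom_levels = floor(log2(3.1e9 / 25)) = floor(log2(1.24e8)) = 26
# yielding resolutions 25, 50, 100, ..., 25 * 2**25.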
# Additional imports for this snippet:
import zarr
from numcodecs import Zlib


def bigwigs_to_zarr(input_bigwig_files, output_file, starting_resolution, name):
    # Short-hand for creating a DirectoryStore with a root group.
    f = zarr.open(output_file, mode='w')
    compressor = Zlib(level=1)

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    chromosomes_group = f.create_group("chromosomes")

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    # TODO: should more than chr1-chrM be used?
    chromosomes = [str(chr_name) for chr_name in chromosomes[:25]]
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8",
    )
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr))
    )

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(16)]

    # Create each chromosome dataset.
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)

        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_samples, math.ceil(chr_len / resolution))
            chr_group.create_dataset(
                str(resolution),
                shape=chr_shape,
                dtype="f4",
                fill_value=np.nan,
                compressor=compressor,
            )

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(
        list(enumerate(input_bigwig_files)), desc='bigwigs'
    ):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(
                chromosomes_set
            )

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a
                # bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (num_samples, math.ceil(chr_len / resolution))
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, chr_shape[1],
                        summary="sum",
                    )
                    chromosomes_group[chr_name][str(resolution)][
                        bw_index, :
                    ] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for bw_index, bw_file in enumerate(input_bigwig_files):
        row_infos.append({
            "cluster": int(bw_index + 1),
            "file": os.path.basename(bw_file),
        })

    # f.attrs should contain all tileset_info properties
    # For zarr, more attributes are used here to allow "serverless"
    f.attrs['row_infos'] = row_infos
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_samples, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = "hg38"

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [
        {
            "version": "0.1",
            "name": chr_name,
            "datasets": [
                {"path": f"chromosomes/{chr_name}/{resolution}"}
                for resolution in sorted(resolutions, reverse=True)
            ],
            "type": "zarr-multivec",
            "metadata": {
                "chromoffset": int(chrom_name_to_cumsum[chr_name]),
                "chromsize": int(chr_len),
            },
        }
        for (chr_name, chr_len) in zip(chromosomes, chroms_length_arr)
    ]
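# Hypothetical invocation (paths and name are placeholders). Because
# zarr.open is handed a directory path, the result is a DirectoryStore on
# disk, i.e. a "clusters.zarr" folder:
bigwigs_to_zarr(
    input_bigwig_files=["cluster1.bigWig", "cluster2.bigWig"],
    output_file="clusters.zarr",
    starting_resolution=25,
    name="my multivec",
)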
# count_peaks, peak_widths, peak_distances, and peak_heights are helpers
# defined elsewhere in the source project.
def get_stats(bigwig, bigbed, norm_vals, window_size, step_size, aggregation,
              chrom):
    base_bins = math.ceil(window_size / aggregation)

    if chrom not in bbi.chromsizes(bigwig):
        print("Skipping chrom (not in bigWig file):", chrom)
        return None

    chrom_size = bbi.chromsizes(bigwig)[chrom]

    intervals = np.zeros(
        (math.ceil((chrom_size - step_size) / step_size), base_bins)
    )
    starts = np.arange(0, chrom_size - step_size, step_size)
    ends = np.append(np.arange(window_size, chrom_size, step_size), chrom_size)

    # Extract all but the last window in one batched call, which is faster
    # than calling `fetch` in a loop
    intervals[:-1] = bbi.stackup(
        bigbed,
        [chrom] * (starts.size - 1),
        starts[:-1],
        ends[:-1],
        bins=base_bins,
    )

    final_bins = math.ceil((ends[-1] - starts[-1]) / aggregation)
    # Extract the last window separately because its size is likely to be
    # different from the others
    intervals[-1, :final_bins] = bbi.fetch(
        bigbed, chrom, starts[-1], ends[-1], bins=final_bins, missing=0.0
    )

    intervals = np.round(intervals).astype(int)

    # Stats columns:
    #  0. Number of intervals
    #  1. Min width of peaks
    #  2. Max width of peaks
    #  3. Median width of peaks
    #  4. Min distance of peaks
    #  5. Max distance of peaks
    #  6. Median distance of peaks
    #  7. Sum of height of peaks
    #  8. Max height of peaks
    #  9. Median height of peaks
    # 10. Median signal
    # 11. Total signal
    # 12. Peak coverage
    stats = np.zeros((norm_vals.shape[0], 13))

    stats[:, 0] = count_peaks(intervals)
    stats[:, 1] = peak_widths(intervals, np.min)
    stats[:, 2] = peak_widths(intervals, np.max)
    stats[:, 3] = peak_widths(intervals, np.median)
    stats[:, 4] = peak_distances(intervals, np.min)
    stats[:, 5] = peak_distances(intervals, np.max)
    stats[:, 6] = peak_distances(intervals, np.median)
    stats[:, 7] = peak_heights(intervals, norm_vals, stats[:, 0], np.nansum)
    stats[:, 8] = peak_heights(intervals, norm_vals, stats[:, 0], np.nanmax)
    stats[:, 9] = peak_heights(intervals, norm_vals, stats[:, 0], np.nanmedian)
    stats[:, 10] = np.median(norm_vals, axis=1)
    stats[:, 11] = np.sum(norm_vals, axis=1)
    stats[:, 12] = peak_widths(intervals, np.sum) / base_bins

    # `intervals` was already rounded and cast to int above
    return stats, intervals
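# Usage sketch for `get_stats` (all inputs hypothetical): `norm_vals` is the
# per-window signal matrix for `chrom`, e.g. as produced by a chunker like
# `chunk` above, with one row per sliding window.
#
#   stats, intervals = get_stats(
#       "signal.bigWig", "peaks.bigBed", norm_vals,
#       window_size=12000, step_size=6000, aggregation=100, chrom="chr1",
#   )
#   peak_coverage = stats[:, 12]  # fraction of each window covered by peaks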
# `metadata_json_to_row_info` is a helper defined elsewhere in the source
# project.
def bigwigs_to_multivec(input_bigwig_files, input_metadata_files, output_file,
                        starting_resolution):
    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Zip the input to create (bw, metadata) tuples
    zipped_input = zip(input_bigwig_files, input_metadata_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8",
    )
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(16)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(
                chr_name,
                chr_shape,
                dtype="f4",
                fillvalue=np.nan,
                compression='gzip',
            )

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(
        list(enumerate(input_bigwig_files)), desc='bigwigs'
    ):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(
                chromosomes_set
            )

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a
                # bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), num_samples)
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, chr_shape[0],
                        summary="sum",
                    )
                    resolutions_group[str(resolution)]["values"][chr_name][
                        :, bw_index
                    ] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    f.flush()
    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for metadata_index, metadata_file in enumerate(input_metadata_files):
        with open(metadata_file) as mf:
            try:
                metadata_json = json.load(mf)
            except Exception as e:
                print(f"Error loading metadata file: {metadata_file}")
                print(e)
                metadata_json = None
        row_info = metadata_json_to_row_info(metadata_json)
        row_infos.append(row_info)

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')
    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded
    f.close()
# `data.normalize` below is a helper from the surrounding project.
def chunk(
    bigwig,
    window_size,
    resolution,
    step_size,
    chroms,
    normalize=True,
    verbose=False,
):
    base_bins = np.ceil(window_size / resolution).astype(int)
    chromsizes = bbi.chromsizes(bigwig)

    num_total_windows = 0
    for chrom in chroms:
        if chrom not in chromsizes:
            continue  # warned about below
        chrom_size = chromsizes[chrom]
        num_total_windows += np.ceil(
            (chrom_size - step_size) / step_size
        ).astype(int)

    values = np.zeros((num_total_windows, base_bins))

    start = 0
    for chrom in chroms:
        if chrom not in chromsizes:
            print("Skipping chrom (not in bigWig file):", chrom)
            continue

        chrom_size = chromsizes[chrom]
        num_windows = np.ceil((chrom_size - step_size) / step_size).astype(int)
        start_bps = np.arange(0, chrom_size - step_size, step_size)
        end_bps = np.append(
            np.arange(window_size, chrom_size, step_size), chrom_size
        )

        end = start + num_windows

        # Extract all but the last window in one batched call, which is
        # faster than calling `fetch` in a loop
        values[start : end - 1] = bbi.stackup(
            bigwig,
            [chrom] * (start_bps.size - 1),
            start_bps[:-1],
            end_bps[:-1],
            bins=base_bins,
            missing=0.0,
        )

        final_bins = np.ceil(
            (end_bps[-1] - start_bps[-1]) / resolution
        ).astype(int)
        # Extract the last window separately because its size is likely to
        # be different from the others
        values[end - 1, :final_bins] = bbi.fetch(
            bigwig,
            chrom,
            start_bps[-1],
            end_bps[-1],
            bins=final_bins,
            missing=0.0,
        )

        if normalize:
            values[start:end] = data.normalize(values[start:end])

        if verbose:
            print(
                "LOADING ::",
                "Chrom: {}".format(chrom),
                "| Num windows: {}".format(num_windows),
                "| Max value: {}".format(np.max(values[start:end])),
            )

        # Advance the write offset so the next chromosome's windows don't
        # overwrite this one's
        start = end

    return values
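# Usage sketch (placeholders only). Unlike the list-returning `chunk` at the
# top of this page, this variant stacks all chromosomes into a single array:
values = chunk(
    "signal.bigWig",
    window_size=12000,
    resolution=100,
    step_size=6000,
    chroms=["chr1", "chr2"],
    normalize=False,
    verbose=True,
)
# values.shape == (total_windows_over_all_chroms, 120)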