# NOTE: these functions rely on module-level imports that are assumed to exist
# elsewhere in this file, e.g.:
#   import math, os, tempfile
#   import os.path as op
#   import h5py
#   import numpy as np
#   import bbi
#   from tqdm import tqdm
# plus the project helpers aliased here as cch (chromsizes loading),
# cmv (multivec creation) and sm (providing logsumexp).


def _multivec(filepath, output_file, assembly, tile_size, chromsizes_filename,
              starting_resolution, row_infos_filename=None, method='sum'):
    '''
    Aggregate a multivec file. This is a file containing n x m data that is
    aggregated along only one axis (the genomic position axis). The data
    should be in an HDF5 file where each dataset is named for a chromosome
    and contains a 'resolutions' group holding the values at the base-level
    resolution.

    Example: f['chr1']['resolutions']['1000'] = [[1,2,3],[4,5,6]]

    The resulting data will be organized by resolution and chromosome.

    Example: f_out['chr1']['resolutions']['5000'] = [[1000,2000,3000],[4000,5000,6000]]

    Aggregation is currently done by summing adjacent values.
    '''
    f_in = h5py.File(filepath, 'r')

    if output_file is None:
        output_file = op.splitext(filepath)[0] + ".multires.mv5"

    (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
        chromsizes_filename, assembly)

    if method == 'maxtotal':
        # 'maxtotal' aggregation is not implemented yet; fall through to the
        # default summing aggregator below.
        pass

    if method == 'logsumexp':
        def agg(x):
            # Group adjacent positions in pairs and combine them with
            # log-sum-exp instead of a plain sum.
            a = x.T.reshape((x.shape[1], -1, 2))
            return sm.logsumexp(a, axis=2).T
    else:
        def agg(x):
            # Default: sum adjacent positions, ignoring NaN values.
            return np.nansum(x.T.reshape((x.shape[1], -1, 2)), axis=2).T

    if row_infos_filename is not None:
        with open(row_infos_filename, 'r') as fr:
            row_infos = [line.strip().encode('utf8') for line in fr]
    else:
        row_infos = None

    cmv.create_multivec_multires(
        f_in,
        chromsizes=zip(chrom_names, chrom_sizes),
        agg=agg,
        starting_resolution=starting_resolution,
        tile_size=tile_size,
        output_file=output_file,
        row_infos=row_infos)
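

# A minimal, self-contained sketch (not part of the original module) of the
# pairwise aggregation trick used by the `agg` functions above: transposing to
# (rows, positions) and reshaping to (rows, positions // 2, 2) groups adjacent
# bins so that reducing over the last axis halves the resolution. The array
# values below are illustrative only.
def _example_pairwise_sum_agg():
    import numpy as np

    x = np.array([[1., 2., 3.],
                  [4., 5., 6.],
                  [7., 8., 9.],
                  [10., 11., 12.]])        # 4 positions x 3 rows
    a = x.T.reshape((x.shape[1], -1, 2))   # (3 rows, 2 pairs, 2)
    coarser = a.sum(axis=2).T              # (2 positions, 3 rows)
    # coarser == [[ 5.,  7.,  9.],
    #             [17., 19., 21.]]
    return coarser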


def _bedgraph_to_multivec(filepaths, output_file, assembly, chrom_col,
                          from_pos_col, to_pos_col, value_col, has_header,
                          chunk_size, nan_value, chromsizes_filename,
                          starting_resolution, num_rows, format,
                          row_infos_filename, tile_size, method):
    with tempfile.TemporaryDirectory() as td:
        print('temporary dir:', td)

        temp_file = op.join(td, 'temp.mv5')
        f_out = h5py.File(temp_file, 'w')

        (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
            chromsizes_filename, assembly)

        if row_infos_filename is not None:
            with open(row_infos_filename, 'r') as fr:
                row_infos = [line.strip().encode('utf8') for line in fr]
        else:
            row_infos = None

        # Create one dataset per chromosome, sized for the starting resolution
        # and filled with NaN until values are written in.
        for chrom in chrom_info.chrom_order:
            f_out.create_dataset(
                chrom,
                (math.ceil(chrom_info.chrom_lengths[chrom] / starting_resolution),
                 num_rows * len(filepaths)),
                fillvalue=np.nan,
                compression='gzip')

        def bedline_to_chrom_start_end_vector(bedlines, row_infos=None):
            # Each chunk contains one line per input file, all describing the
            # same interval; their value columns are concatenated into a
            # single row vector.
            chrom_set = set()
            start_set = set()
            end_set = set()
            all_vector = []

            for bedline in bedlines:
                parts = bedline.strip().split()
                chrom = parts[chrom_col - 1]
                start = int(parts[from_pos_col - 1])
                end = int(parts[to_pos_col - 1])
                vector = [
                    float(f) if not f == 'NA' else np.nan
                    for f in parts[value_col - 1:value_col - 1 + num_rows]
                ]

                chrom_set.add(chrom)
                start_set.add(start)
                end_set.add(end)

                if len(chrom_set) > 1:
                    raise ValueError(
                        "Chromosomes don't match in these lines:", bedlines)
                if len(start_set) > 1:
                    raise ValueError(
                        "Start positions don't match in these lines:", bedlines)
                if len(end_set) > 1:
                    raise ValueError(
                        "End positions don't match in these lines:", bedlines)

                all_vector += vector

            return (list(chrom_set)[0], list(start_set)[0],
                    list(end_set)[0], all_vector)

        if format == 'epilogos':
            cmv.bedfile_to_multivec(filepaths, f_out,
                                    epilogos_bedline_to_vector,
                                    starting_resolution, has_header,
                                    chunk_size)
        elif format == 'states':
            assert row_infos is not None, \
                "A row_infos file must be provided for --format = 'states'"
            states_dic = {state: i for i, state in enumerate(row_infos)}
            cmv.bedfile_to_multivec(filepaths, f_out,
                                    states_bedline_to_vector,
                                    starting_resolution, has_header,
                                    chunk_size, states_dic)
        else:
            cmv.bedfile_to_multivec(filepaths, f_out,
                                    bedline_to_chrom_start_end_vector,
                                    starting_resolution, has_header,
                                    chunk_size)

        f_out.close()
        f_in = h5py.File(temp_file, 'r')

        if output_file is None:
            output_file = op.splitext(filepaths[0])[0] + '.multires.mv5'
        print('output_file:', output_file)

        # Overwrite the output file if it already exists.
        if op.exists(output_file):
            os.remove(output_file)

        if method == 'logsumexp':
            def agg(x):
                # Pair up adjacent positions along the genomic axis; copy so
                # that the NaN replacement below cannot alias the input array.
                a = x.T.reshape((x.shape[1], -1, 2)).copy()

                # logsumexp has no NaN-aware variant, so temporarily replace
                # NaNs with a very small number and convert aggregated values
                # that fall below the threshold back to NaN afterwards.
                SMALL_NUM = -1e8
                NAN_THRESHOLD_NUM = SMALL_NUM / 100

                if np.nanmin(a) < NAN_THRESHOLD_NUM:
                    raise ValueError(
                        "Error removing NaNs when running logsumexp aggregation")

                a[np.isnan(a)] = SMALL_NUM
                res = sm.logsumexp(a, axis=2).T
                res[res < NAN_THRESHOLD_NUM] = np.nan
                return res
        else:
            def agg(x):
                # Default: sum adjacent positions.
                return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_sizes),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos)
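

# A standalone sketch (not part of the original module) of what the inner
# bedline_to_chrom_start_end_vector above does for the default format: a chunk
# holds one line per input file for the same interval, and each file's value
# columns are concatenated into one row vector, with 'NA' mapped to NaN. The
# column indices, num_rows and input lines below are illustrative assumptions.
def _example_parse_bedgraph_chunk():
    import numpy as np

    chrom_col, from_pos_col, to_pos_col, value_col, num_rows = 1, 2, 3, 4, 2
    bedlines = ["chr1\t0\t1000\t1.5\t2.5",
                "chr1\t0\t1000\tNA\t4.0"]

    all_vector = []
    for bedline in bedlines:
        parts = bedline.strip().split()
        chrom = parts[chrom_col - 1]
        start = int(parts[from_pos_col - 1])
        end = int(parts[to_pos_col - 1])
        all_vector += [float(f) if f != 'NA' else np.nan
                       for f in parts[value_col - 1:value_col - 1 + num_rows]]

    # Returns ('chr1', 0, 1000, [1.5, 2.5, nan, 4.0])
    return chrom, start, end, all_vector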


def bigwigs_to_multivec(
    filepaths,
    output_file,
    assembly,
    chromsizes_filename,
    row_infos_filename,
    tile_size,
):
    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)

        temp_file = op.join(td, "temp.mv5")
        f_out = h5py.File(temp_file, "w")

        (chrom_info, chrom_names, chrom_lengths) = cch.load_chromsizes(
            chromsizes_filename, assembly)

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as f:
                row_infos = [line.strip().encode("utf8") for line in f]
        else:
            row_infos = None

        starting_resolution = 1
        resolution = starting_resolution

        # Create one dataset per chromosome with one column per bigwig file.
        for chrom in chrom_info.chrom_order:
            f_out.create_dataset(
                chrom,
                (
                    math.ceil(
                        chrom_info.chrom_lengths[chrom] / starting_resolution),
                    len(filepaths),
                ),
                fillvalue=np.nan,
                compression="gzip",
            )

        # Fill in data for each bigwig file.
        for bw_index, bw_file in tqdm(list(enumerate(filepaths)), desc="bigwigs"):
            if bbi.is_bigwig(bw_file):
                chromsizes = bbi.chromsizes(bw_file)
                matching_chromosomes = set(chromsizes.keys()).intersection(
                    set(chrom_names))

                # Fill in this bigwig's column, one chromosome at a time.
                for chr_name in matching_chromosomes:
                    print("chr_name:", chr_name, resolution)
                    chr_len = chrom_info.chrom_lengths[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), len(filepaths))
                    arr = bbi.fetch(bw_file, chr_name, 0, chr_len,
                                    chr_shape[0], summary="sum")
                    f_out[chr_name][:, bw_index] = arr
            else:
                print(f"{bw_file} is not a bigwig file; skipping")

        f_out.flush()
        f_out.close()

        f_in = h5py.File(temp_file, "r")

        def agg(x):
            # Sum adjacent positions to halve the resolution at each level.
            return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_lengths),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos,
        )
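

# A hypothetical usage sketch (file names, assembly and tile size are
# assumptions, not taken from the original source): aggregate two bigwig
# tracks into a single multi-resolution multivec file, starting at 1 bp
# resolution and letting the chromosome sizes be resolved from the named
# assembly rather than an explicit chromsizes file.
def _example_bigwigs_to_multivec():
    bigwigs_to_multivec(
        filepaths=["track1.bigWig", "track2.bigWig"],  # hypothetical inputs
        output_file="tracks.multires.mv5",             # hypothetical output
        assembly="hg38",
        chromsizes_filename=None,  # assumed to fall back to the assembly name
        row_infos_filename=None,   # no per-row labels
        tile_size=256,             # illustrative tile size
    )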