def createCooler(pSparseMatrix, pChromosome, pChromSize, pResolution, pOutfile, pMetadata):
    # get indices of upper triangular matrix
    triu_Indices = np.triu_indices(pSparseMatrix.shape[0])
    # create the bins for cooler
    bins = pd.DataFrame(columns=['chrom', 'start', 'end'])
    binStartList = list(range(0, pChromSize, int(pResolution)))
    binEndList = list(range(int(pResolution), pChromSize, int(pResolution)))
    binEndList.append(pChromSize)
    bins['start'] = binStartList
    bins['end'] = binEndList
    bins['chrom'] = str(pChromosome)
    # create the pixels for cooler
    pixels = pd.DataFrame(columns=['bin1_id', 'bin2_id', 'count'])
    pixels['bin1_id'] = triu_Indices[0]
    pixels['bin2_id'] = triu_Indices[1]
    readCounts = np.array(pSparseMatrix[triu_Indices])[0]
    pixels['count'] = np.float64(readCounts)
    pixels.sort_values(by=['bin1_id', 'bin2_id'], inplace=True)
    # write out the cooler
    cooler.create_cooler(pOutfile, bins=bins, pixels=pixels,
                         dtypes={'count': np.float64}, metadata=pMetadata)
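# Usage sketch (not from the original repo): write a small random symmetric matrix
# with the createCooler helper above. The chromosome name, size, resolution, and
# output path below are illustrative assumptions.
import numpy as np
from scipy import sparse

rng = np.random.default_rng(42)
dense = rng.integers(0, 100, size=(10, 10))
sym = np.triu(dense) + np.triu(dense, k=1).T  # symmetrize via the upper triangle
createCooler(sparse.csr_matrix(sym), pChromosome='chr1', pChromSize=100000,
             pResolution=10000, pOutfile='example_10kb.cool', pMetadata=None)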
def test_load_cool(self):
    """Test loading of matrices in cool format"""
    # Build a dummy cool file (basically a diagonal matrix)
    res, n_bins = 5000, 100000
    chrom_names = ["c1", "c2", "c3"]
    bins_per_chrom = [n_bins // 3, n_bins // 3, n_bins // 3 + n_bins % 3]
    bins = pd.DataFrame(
        {
            "chrom": np.repeat(chrom_names, bins_per_chrom),
            "start": range(0, res * n_bins, res),
            "end": range(res, res * (n_bins + 1), res),
        }
    )
    pixels = pd.DataFrame(
        {
            "bin1_id": range(n_bins),
            "bin2_id": range(n_bins),
            "count": np.random.randint(0, 100, n_bins),
        }
    )
    # Save dataframes into a cool file using the cooler API
    cooler.create_cooler(self.tmp_path, bins, pixels)
    # Load cool and check whether it was parsed correctly
    mat, chroms, bins, bin_size = cio.load_cool(self.tmp_path)
    # Median should work to estimate resolution if n_bins >> n_chroms
    assert res == abs(int(np.nanmedian(bins.start.shift(1) - bins.start)))
    assert res == bin_size
    assert n_bins == bins.shape[0]
    assert np.all(bins.columns == BIN_COLS)
    assert np.all(chroms.columns == CHR_COLS)
    assert mat.sum() == pixels["count"].sum()
def get_threshold(combos, cull_by_cis, bedfile):
    reads = {}
    rawgini = {}
    adjustedgini = {}
    bins_df = make_df(bedfile)
    if cull_by_cis:
        cis_threshold = int(sys.argv[3])
    for pair in tqdm(combos):
        if pair[0] == pair[1]:
            continue
        pair = tuple(pair)
        cool1, cool2 = get_cools(pair)
        try:
            matrix1 = np.array(cool1.matrix(as_pixels=True, balance=False)[:])
            matrix2 = np.array(cool2.matrix(as_pixels=True, balance=False)[:])
        except Exception:  # skip pairs whose matrices cannot be loaded
            continue
        numreads1 = sum(matrix1[:, -1])
        numreads2 = sum(matrix2[:, -1])
        totalreads = numreads1 + numreads2
        if cull_by_cis and (calculate_cistrans(matrix1) < cis_threshold
                            or calculate_cistrans(matrix2) < cis_threshold):
            continue
        if numreads1 == 0 or numreads2 == 0 or totalreads < 50000:
            continue
        numtoselect = int(abs(np.random.normal(totalreads / 2, totalreads / 20)))
        rands = np.random.choice(np.arange(1, totalreads), numtoselect, replace=False)
        rands.sort()
        pixel_df = fill_pixel_df(rands, matrix1, matrix2, numreads1, numreads2)
        cooler.create_cooler("temp.cool", bins=bins_df, pixels=pixel_df,
                             dtypes={'bin1_id': int, 'bin2_id': int, 'count': int},
                             ordered=True)
        newcool = cooler.Cooler("temp.cool")
        normalized, reads[pair], cis, trans = normalize_matrix(newcool)
        rawgini[pair] = gini(normalized)
        adjustedgini[pair] = adjust(rawgini[pair], reads[pair])
        os.unlink("temp.cool")
    return reads, rawgini, adjustedgini
def gather_high_low_cool(cooler_file='Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
                         path='./data/raw/',
                         chromosome='22',
                         scale=4,
                         output_path='./experiment/evaluation/'):
    file = os.path.join(path, cooler_file)
    cool_hic = cooler.Cooler(file)
    resolution = cool_hic.binsize
    mat = cool_hic.matrix(balance=True).fetch('chr' + chromosome)
    high_hic, idx = remove_zeros(mat)  # idx: {true, false}; its length is not changed/shrunk
    bool_idx = np.array(idx).flatten()
    num_idx = np.array(np.where(idx)).flatten()
    low_hic = sampling_hic(high_hic, scale**2, fix_seed=True)
    print('high hic shape: {}.'.format(high_hic.shape), end=' ')
    print('low hic shape: {}.'.format(low_hic.shape))

    b = {'chrom': ['chr{}'.format(chromosome)] * len(bool_idx),
         'start': resolution * np.arange(len(bool_idx)),
         'end': resolution * np.arange(1, len(bool_idx) + 1),
         'weight': 1.0 * bool_idx}
    bins = pd.DataFrame(data=b)

    high_hic = ICE_normalization(high_hic)
    low_hic = ICE_normalization(low_hic)

    high_hic = triu(high_hic, format='coo')
    low_hic = triu(low_hic, format='coo')

    output_path = os.path.join(output_path, 'chr{}'.format(chromosome))
    os.makedirs(output_path, exist_ok=True)

    outfile = 'high_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {'bin1_id': num_idx[high_hic.row],
         'bin2_id': num_idx[high_hic.col],
         'count': high_hic.data}
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)

    outfile = 'low_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {'bin1_id': num_idx[low_hic.row],
         'bin2_id': num_idx[low_hic.col],
         'count': low_hic.data}
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)
def generate_cool(input_path='./experiment/significant_interactions',
                  chromosomes=['22', '21', '20', '19', 'X'],
                  resolution=10000,
                  genomic_distance=2000000):
    k = np.ceil(genomic_distance / resolution).astype(int)
    for chro in chromosomes:
        path = os.path.join(input_path, 'chr{}'.format(chro))
        hicfile = 'sample_high_chr{}.cool'.format(chro)
        cool_hic = cooler.Cooler(os.path.join(path, hicfile))
        mat = cool_hic.matrix(balance=True).fetch('chr' + chro)
        bins = cool_hic.bins().fetch('chr' + chro)
        num_idx = np.array(np.where(np.array(bins['weight']))).flatten()
        high_mat = mat[num_idx, :]
        high_mat = high_mat[:, num_idx]
        high_mat = filter_diag_boundary(high_mat, diag_k=0, boundary_k=k)

        files = [f for f in os.listdir(path) if '.npz' in f]
        for file in files:
            if 'high' in file or 'low' in file:
                continue
            print(file)
            data = np.load(os.path.join(path, file), allow_pickle=True)
            mat = data['hic']
            namelist = file.split('_')
            if len(namelist) == 3:
                name = namelist[0]
            else:
                model = namelist[1]
                win_len = namelist[3]
                if model == 'hicgan':
                    # true_hic = np.log1p(true_hic)
                    mat = np.expm1(mat)
                elif model == 'deephic':
                    minv = high_mat.min()
                    maxv = high_mat.max()
                    # true_hic = np.divide((true_hic - minv), (maxv - minv), dtype=float,
                    #                      out=np.zeros_like(true_hic), where=(maxv - minv) != 0)
                    mat = mat * (maxv - minv) + minv
                    mat = (mat + np.transpose(mat)) / 2
                elif model == 'hicsr':
                    log_mat = np.log2(high_mat + 1)
                    # true_hic = 2 * (log_mat / np.max(log_mat)) - 1
                    maxv = np.max(log_mat)
                    log_predict_hic = (mat + 1) / 2 * maxv
                    mat = np.expm1(log_predict_hic)
                '''elif model == 'ours':
                    scn, dh = scn_normalization(high_mat, max_iter=3000)
                    mat = scn_recover(mat, dh)'''
                name = '_'.join([model, win_len])
            mat = filter_diag_boundary(mat, diag_k=0, boundary_k=k)
            # mat = ICE_normalization(mat)
            print('{} matrix shape: {}'.format(name, mat.shape))
            uri = os.path.join(path, 'sample_{}_chr{}.cool'.format(name, chro))
            mat = triu(mat, format='coo')
            # p = {'bin1_id': mat.row, 'bin2_id': mat.col, 'count': mat.data}
            p = {'bin1_id': num_idx[mat.row],
                 'bin2_id': num_idx[mat.col],
                 'count': mat.data}
            pixels = pd.DataFrame(data=p)
            cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)
def save_cool(cool_out, mat, frags, metadata={}):
    """
    Writes a .cool file from graal style tables.

    Parameters
    ----------
    cool_out : str
        Path to the output cool file.
    mat : scipy coo_matrix
        The Hi-C contact matrix in sparse COO format.
    frags : pandas DataFrame
        The graal style 'fragments_list' table.
    metadata : dict
        Potential metadata to associate with the cool file.
    """
    up_tri = False
    # Check whether the matrix stores only the upper triangle
    # (asymmetric content) or the full symmetric matrix
    if (abs(mat - mat.T) > 1e-10).nnz != 0:
        up_tri = True
    # Drop useless column
    try:
        bins = frags.drop("id", axis=1)
    except KeyError:
        bins = frags
    # Get column names right
    bins.rename(
        columns={"seq": "chrom", "start_pos": "start", "end_pos": "end"},
        inplace=True,
    )
    mat_dict = {"bin1_id": mat.row, "bin2_id": mat.col, "count": mat.data}
    pixels = pd.DataFrame(mat_dict)
    cooler.create_cooler(  # pylint: disable=undefined-variable
        cool_out,
        bins,
        pixels,
        metadata=metadata,
        symmetric_upper=up_tri,
        triucheck=False,
    )
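# Hedged example of calling save_cool on a toy graal-style fragments table;
# every value here is a made-up assumption for illustration.
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

frags = pd.DataFrame({
    "id": [1, 2, 3],
    "seq": ["contig_1"] * 3,
    "start_pos": [0, 100, 200],
    "end_pos": [100, 200, 300],
})
# upper-triangular toy matrix, so symmetric_upper=True will be inferred
toy_mat = coo_matrix(np.triu([[0, 1, 2], [0, 4, 5], [0, 0, 8]]))
save_cool("toy.cool", toy_mat, frags)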
def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
    log.debug('Save in cool format')

    bins_data_frame, matrix_data_frame, dtype_pixel, info = self.create_cooler_input(
        pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)

    local_temp_dir = os.path.dirname(os.path.realpath(pFileName))
    cooler.create_cooler(cool_uri=pFileName,
                         bins=bins_data_frame,
                         pixels=matrix_data_frame,
                         mode=self.appendData,
                         dtypes=dtype_pixel,
                         ordered=True,
                         metadata=self.hic_metadata,
                         temp_dir=local_temp_dir)

    if self.appendData == 'w':
        fileName = pFileName.split('::')[0]
        with h5py.File(fileName, 'r+') as h5file:
            h5file.attrs.update(info)
            h5file.close()
def to_cooler(self, store, normalise=False, **normalise_options):

    capture_bins = self.capture_bins
    capture_name = self.cooler.info["metadata"]["capture_name"]
    capture_coords = self.cooler.info["metadata"]["capture_coords"]
    capture_chrom = self.cooler.info["metadata"]["capture_chrom"]

    metadata = {
        "capture_bins": [int(x) for x in self.capture_bins],
        "capture_name": capture_name,
        "capture_coords": capture_coords,
        "capture_chrom": capture_chrom,
        "n_cis_interactions": self.n_cis_interactions,
    }

    if normalise:
        self.normalise_pixels(**normalise_options)

    if os.path.exists(store):  # Will append to a pre-existing file if one is supplied
        cooler_fn = f"{store}::/{capture_name}/resolutions/{self.binsize}"
    else:
        cooler_fn = f"{store.replace('.hdf5', '')}.{capture_name}.{self.binsize}.hdf5"

    cooler.create_cooler(
        cooler_fn,
        bins=self.bins,
        pixels=self.pixels,
        metadata=metadata,
        mode="w" if not os.path.exists(store) else "a",
        columns=self.pixels.columns[2:],
    )

    return cooler_fn
def createCoolersFromDf(pResultsDf, pResolution, pPredictionOutfile, pTargetOutfile):
    # create the bins for cooler
    bins = pd.DataFrame(columns=['chrom', 'start', 'end'])
    maxPos = max(pResultsDf['bin1_id'].max(), pResultsDf['bin2_id'].max()) * pResolution + pResolution
    minPos = 0
    binStartList = list(range(minPos, maxPos, pResolution))
    binEndList = list(range(minPos + pResolution, maxPos, pResolution))
    binEndList.append(maxPos)
    bins['start'] = binStartList
    bins['end'] = binEndList
    bins['chrom'] = pResultsDf.loc[0, 'chromosome']
    # create the pixels / counts for predicted cooler
    pixels = pd.DataFrame(columns=['bin1_id', 'bin2_id', 'count'])
    pixels['bin1_id'] = pResultsDf['bin1_id']
    pixels['bin2_id'] = pResultsDf['bin2_id']
    pixels['count'] = pResultsDf['PredictedValue']
    pixels.sort_values(by=['bin1_id', 'bin2_id'], inplace=True)
    # create the pixels / counts for target cooler
    targetPixels = pixels.copy(deep=True)
    targetPixels['count'] = pResultsDf['TrueValue']
    targetPixels.sort_values(by=['bin1_id', 'bin2_id'], inplace=True)
    # store the coolers
    cooler.create_cooler(pPredictionOutfile, bins=bins, pixels=pixels, dtypes={'count': np.float64})
    cooler.create_cooler(pTargetOutfile, bins=bins, pixels=targetPixels, dtypes={'count': np.float64})
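# Illustrative call (assumed data): a tiny results frame holding predicted and
# true counts for three pixels on one chromosome; names and values are made up.
import pandas as pd

resultsDf = pd.DataFrame({
    'bin1_id': [0, 0, 1],
    'bin2_id': [0, 1, 1],
    'chromosome': ['chr17'] * 3,
    'PredictedValue': [1.5, 2.0, 3.5],
    'TrueValue': [1.0, 2.0, 4.0],
})
createCoolersFromDf(resultsDf, pResolution=5000,
                    pPredictionOutfile='predicted.cool',
                    pTargetOutfile='target.cool')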
def create_cool(bins, pixels, resolution, cool_file, genome_assembly):
    metadata = {
        'format': 'HDF5::Cooler',
        'format-version': '0.8.10',
        'bin-type': 'fixed',
        'bin-size': resolution,
        'storage-mode': 'symmetric-upper',
        'genome-assembly': genome_assembly,
        'generated-by': 'boost-hic',
        # 'creation-date': datetime.date.today()
    }
    count_dtypes = {'count': 'float64'}
    bins = bins.astype({'chrom': str, 'start': int, 'end': int})
    pixels = pixels.astype({'bin1_id': int, 'bin2_id': int, 'count': float})
    cooler.create_cooler(cool_file, bins=bins, pixels=pixels, dtypes=count_dtypes,
                         ordered=True, ensure_sorted=True, metadata=metadata)
    # cooler.create_cooler(cool_file, bins=bins, pixels=pixels, dtypes=count_dtypes,
    #                      ordered=True, metadata=metadata)
    return cool_file
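# Minimal sketch of a call to create_cool; the bins/pixels contents and the
# assembly label are assumptions for illustration, not project data.
import pandas as pd

toy_bins = pd.DataFrame({'chrom': ['chr1'] * 3,
                         'start': [0, 10000, 20000],
                         'end': [10000, 20000, 30000]})
toy_pixels = pd.DataFrame({'bin1_id': [0, 0, 1, 2],
                           'bin2_id': [0, 2, 1, 2],
                           'count': [10.0, 2.0, 5.0, 7.0]})  # already sorted for ordered=True
create_cool(toy_bins, toy_pixels, resolution=10000,
            cool_file='boosted.cool', genome_assembly='hg19')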
def test_create_custom_cols():
    with isolated_filesystem():
        df = pd.DataFrame(
            {
                "bin1_id": [0, 1, 1, 1, 2, 2, 3, 4, 5],
                "bin2_id": [1, 1, 3, 4, 5, 6, 7, 8, 9],
                "foo": [1, 1, 1, 1, 1, 2, 2, 2, 2],
                "bar": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
            },
            columns=["bin1_id", "bin2_id", "foo", "bar"],
        )
        bins = pd.DataFrame(
            {
                "chrom": ["chr1"] * 5 + ["chr2"] * 5,
                "start": list(range(5)) * 2,
                "end": list(range(1, 6)) * 2,
            }
        )

        # works in unordered mode
        cooler.create_cooler("test.cool", bins, df, columns=["foo", "bar"])
        clr = cooler.Cooler("test.cool")
        assert len(clr.pixels().columns) == 4
        assert np.allclose(df, clr.pixels()[["bin1_id", "bin2_id", "foo", "bar"]][:])

        # works in ordered mode
        cooler.create_cooler("test.cool", bins, df, columns=["foo", "bar"], ordered=True)
        clr = cooler.Cooler("test.cool")
        assert len(clr.pixels().columns) == 4
        assert np.allclose(df, clr.pixels()[["bin1_id", "bin2_id", "foo", "bar"]][:])

        # raises if no custom columns specified and 'count' does not exist
        with pytest.raises(ValueError):
            cooler.create_cooler("test.cool", bins, df, columns=None, ordered=True)
def create_from_unordered(cool_uri, bins, chunks, columns=None, dtypes=None,
                          mergebuf=int(20e6), delete_temp=True, temp_dir=None,
                          **kwargs):
    """
    Create a Cooler in two passes via an external sort mechanism. In the first
    pass, a sequence of data chunks are processed and sorted in memory and
    saved to temporary Coolers. In the second pass, the temporary Coolers are
    merged into the output. This way the individual chunks do not need to be
    provided in any particular order.

    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group. If the file does not
        exist, it will be created.
    bins : DataFrame
        Segmentation of the chromosomes into genomic bins. May contain
        additional columns.
    chunks : iterable of DataFrames
        Sequence of chunks that get processed and written to separate Coolers
        and then subsequently merged.
    columns : sequence of str, optional
        Specify here the names of any additional value columns from the input
        besides 'count' to store in the Cooler. The standard columns
        ['bin1_id', 'bin2_id', 'count'] can be provided, but are already
        assumed and don't need to be given explicitly. Additional value
        columns provided here will be stored as np.float64 unless otherwise
        specified using ``dtypes``.
    dtypes : dict, optional
        Dictionary mapping column names to dtypes. Can be used to override the
        default dtypes of ``bin1_id``, ``bin2_id`` or ``count`` or assign
        dtypes to custom value columns. Non-standard value columns given in
        ``dtypes`` must also be provided in the ``columns`` argument or they
        will be ignored.
    assembly : str, optional
        Name of genome assembly.
    mode : {'w' , 'a'}, optional [default: 'w']
        Write mode for the output file. 'a': if the output file exists, append
        the new cooler to it. 'w': if the output file exists, it will be
        truncated. Default is 'w'.
    metadata : dict, optional
        Experiment metadata to store in the file. Must be JSON compatible.
    mergebuf : int, optional
        Maximum number of records to buffer in memory at any given time during
        the merge step.
    delete_temp : bool, optional
        Whether to delete temporary files when finished. Default is True; set
        to False to keep them for debugging.
    temp_dir : str, optional
        Create temporary files in this directory.

    See also
    --------
    sanitize_records
    sanitize_pixels

    """
    bins = bins.copy()
    bins['chrom'] = bins['chrom'].astype(object)

    tf = tempfile.NamedTemporaryFile(
        suffix='.multi.cool', delete=delete_temp, dir=temp_dir)

    uris = []
    for i, chunk in enumerate(chunks):
        uri = tf.name + '::' + str(i)
        uris.append(uri)
        log.info('Writing chunk {}: {}'.format(i, uri))
        create_cooler(uri, bins, chunk, columns=columns, mode='a',
                      boundscheck=False, triucheck=False, dupcheck=False,
                      ensure_sorted=False, ordered=True, dtypes=dtypes)

    chunks = CoolerMerger([Cooler(uri) for uri in uris], mergebuf)

    log.info('Merging into {}'.format(cool_uri))
    create_cooler(cool_uri, bins, chunks, columns=columns, dtypes=dtypes,
                  ordered=True, **kwargs)
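# Illustrative call (assumed data): two chunks, each internally sorted but
# supplied in no particular genomic order, merged into a single cooler.
import pandas as pd

toy_bins = pd.DataFrame({'chrom': ['chr1'] * 4,
                         'start': [0, 10, 20, 30],
                         'end': [10, 20, 30, 40]})
chunk_a = pd.DataFrame({'bin1_id': [2, 3], 'bin2_id': [2, 3], 'count': [5, 6]})
chunk_b = pd.DataFrame({'bin1_id': [0, 1], 'bin2_id': [1, 2], 'count': [3, 4]})
create_from_unordered('unordered.cool', toy_bins, [chunk_a, chunk_b])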
def __init__(self, datasets, outfil, assembly='hg38', chromsizes_file=None,
             chroms=['#', 'X'], onlyIntra=True, dtype='int'):

    self.outfil = os.path.abspath(os.path.expanduser(outfil))
    if os.path.exists(self.outfil):
        log.error('Cooler file {} already exists, exit ...'.format(self.outfil))
        sys.exit(1)
    self.chroms = set(chroms)
    self.onlyIntra = onlyIntra
    data = datasets

    ## Ready for data loading
    if chromsizes_file is not None:
        chromsizes_path = os.path.abspath(os.path.expanduser(chromsizes_file))
        log.info('Read chromosome sizes from {}'.format(chromsizes_path))
        chromsizes = readChromSizes(chromsizes_path, self.chroms)
    else:
        log.info('Fetch chromosome sizes from UCSC ...')
        chromsizes = fetchChromSizes(assembly, self.chroms)

    chromlist = chromsizes.keys()
    # sort chromosome labels: numeric first, then X/Y/M, then the rest
    tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
    nondigits = [i for i in chromlist if not i.isdigit()]
    for i in ['X', 'Y', 'M']:
        if i in nondigits:
            tmp.append(nondigits.pop(nondigits.index(i)))
    chromlist = tmp + sorted(nondigits)
    lengths = [chromsizes[i] for i in chromlist]
    self.chromsizes = pd.Series(data=lengths, index=chromlist)
    log.info('Done')

    ## We don't read data into memory at this point.
    ## Note: this assumes there is no '_' sign in any chromosome label.
    self.Map = {}
    for res in data:
        if data[res].endswith('.npz'):
            self.Map[res] = {}
            lib = np.load(data[res])
            for i in lib.files:
                if (not '_' in i) and ((not self.chroms) or (i.isdigit() and '#' in self.chroms) or (i in self.chroms)):
                    # Compatible with TADLib and old versions of runHiC
                    c1 = c2 = i
                    self.Map[res][(c1, c2)] = lib
                else:
                    tmp = i.split('_')
                    if len(tmp) != 2:
                        continue
                    c1, c2 = tmp
                    check1 = ((not self.chroms) or (c1.isdigit() and '#' in self.chroms) or (c1 in self.chroms))
                    check2 = ((not self.chroms) or (c2.isdigit() and '#' in self.chroms) or (c2 in self.chroms))
                    if check1 and check2:
                        self.Map[res][(c1, c2)] = lib
        else:
            self.Map[res] = self._scanFolder(data[res])

    # note: the deprecated np.int/np.float aliases were replaced with the builtins
    self._intertype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                                'formats': [int, int, float]})

    log.info('Extract and save data into cooler format for each resolution ...')
    for res in self.Map:
        log.info('Current resolution: {}bp'.format(res))
        byres = self.Map[res]
        # Extract parts of chromsizes
        subset = []
        for c1, c2 in byres:
            subset.extend([c1, c2])
        subset = set(subset)
        Bool = [(i in subset) for i in self.chromsizes.index]
        chromsizes = self.chromsizes[Bool]
        bin_cumnums = self.binCount(chromsizes, res)
        log.info('Generate bin table ...')
        bintable = binnify(chromsizes, res)
        pixels = self._generator(byres, chromsizes, bin_cumnums)
        if os.path.exists(self.outfil):
            mode = 'a'
        else:
            mode = 'w'
        if dtype == 'int':
            dtypes = {'count': np.int32}
        else:
            dtypes = {'count': np.float64}
        cooler_uri = '{}::{}'.format(self.outfil, res)
        if self.onlyIntra:
            create_cooler(cooler_uri, bintable, pixels, assembly=assembly,
                          mode=mode, boundscheck=False, triucheck=False,
                          dupcheck=False, ensure_sorted=False, ordered=True,
                          metadata={'onlyIntra': str(self.onlyIntra)},
                          dtypes=dtypes)
        else:
            create_from_unordered(cooler_uri, bintable, pixels,
                                  assembly=assembly, mode=mode,
                                  metadata={'onlyIntra': str(self.onlyIntra)},
                                  delete_temp=True, boundscheck=False,
                                  triucheck=False, dupcheck=False,
                                  ensure_sorted=False, dtypes=dtypes)
def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
    log.debug('Save in cool format')

    self.matrix.eliminate_zeros()

    if self.nan_bins is not None and len(self.nan_bins) > 0 and self.fileWasH5:
        # remove nan_bins
        correction_factors = np.ones(self.matrix.shape[0])
        correction_factors[self.nan_bins] = 0
        self.matrix.sort_indices()
        _instances, _features = self.matrix.nonzero()

        instances_factors = correction_factors[_instances]
        features_factors = correction_factors[_features]

        instances_factors = np.logical_not(np.logical_or(instances_factors, features_factors))
        self.matrix.data[instances_factors] = 0
        self.matrix.eliminate_zeros()

    # set possible nans in data to 0
    mask = np.isnan(self.matrix.data)
    self.matrix.data[mask] = 0
    self.matrix.eliminate_zeros()

    # save only the upper triangle of the symmetric matrix
    if pSymmetric:
        self.matrix = triu(self.matrix, format='csr')
    else:
        self.matrix = self.matrix
    self.matrix.eliminate_zeros()

    # create data frame for bins
    # self.cut_intervals holds 4-tuples, but bins_data_frame should have 3 columns;
    # it looks like it is faster to create it with 4 and drop the last one
    # instead of handling this beforehand
    bins_data_frame = pd.DataFrame(self.cut_intervals,
                                   columns=['chrom', 'start', 'end', 'interactions']).drop('interactions', axis=1)
    dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': np.int32}

    if self.correction_factors is not None and pApplyCorrection:
        dtype_pixel['weight'] = np.float32

        # if the correction was applied by a division, invert it because the cool
        # format expects multiplicative factors if the table name is 'weight':
        # https://cooler.readthedocs.io/en/latest/api.html#cooler.Cooler.matrix
        if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5 or self.correctionOperator == '/':
            log.debug('h5 true')
            self.correction_factors = np.array(self.correction_factors).flatten()
            self.correction_factors = 1 / self.correction_factors
            mask = np.isnan(self.correction_factors)
            self.correction_factors[mask] = 0
            mask = np.isinf(self.correction_factors)
            self.correction_factors[mask] = 0
            self.correctionOperator = '*'
            log.debug('inverted correction factors')

        weight = convertNansToOnes(np.array(self.correction_factors).flatten())
        bins_data_frame = bins_data_frame.assign(weight=weight)

        log.debug("Reverting correction factors on matrix...")
        instances, features = self.matrix.nonzero()
        self.correction_factors = np.array(self.correction_factors)

        # do not apply if correction factors are just 1's
        instances_factors = self.correction_factors[instances]
        features_factors = self.correction_factors[features]

        instances_factors *= features_factors
        self.matrix.data = self.matrix.data.astype(float)

        # apply the inverse operation to get the original data
        if self.correctionOperator == '*' or self.correctionOperator is None:
            self.matrix.data /= instances_factors

        instances_factors = None
        features_factors = None

        self.matrix.eliminate_zeros()

    if self.correction_factors is not None and pApplyCorrection is False:
        dtype_pixel['weight'] = np.float32
        weight = convertNansToOnes(np.array(self.correction_factors).flatten())
        bins_data_frame = bins_data_frame.assign(weight=weight)

    instances, features = self.matrix.nonzero()

    matrix_data_frame = pd.DataFrame(instances, columns=['bin1_id'], dtype=np.int32)
    del instances
    matrix_data_frame = matrix_data_frame.assign(bin2_id=features)
    del features

    if self.enforceInteger:
        dtype_pixel['count'] = np.int32
        data = np.rint(self.matrix.data)
        matrix_data_frame = matrix_data_frame.assign(count=data)
    else:
        matrix_data_frame = matrix_data_frame.assign(count=self.matrix.data)

    if not self.enforceInteger and self.matrix.dtype not in [np.int32, int]:
        log.debug("Writing non-standard cooler matrix. Datatype of matrix['count'] is: {}".format(self.matrix.dtype))
        dtype_pixel['count'] = self.matrix.dtype

    split_factor = 1
    if len(self.matrix.data) > 1e7:
        split_factor = 1e4
        matrix_data_frame = np.array_split(matrix_data_frame, split_factor)

    if self.appendData:
        self.appendData = 'a'
    else:
        self.appendData = 'w'

    info = {}
    # these fields are created by the cooler lib; they can cause errors if not deleted
    if 'metadata' in info:
        if self.hic_metadata is None:
            self.hic_metadata = info['metadata']
        del info['metadata']
    if 'bin-size' in info:
        del info['bin-size']
    if 'bin-type' in info:
        del info['bin-type']

    info['format'] = str('HDF5::Cooler')
    info['format-url'] = str('https://github.com/mirnylab/cooler')
    info['generated-by'] = str('HiCMatrix-' + __version__)
    info['generated-by-cooler-lib'] = str('cooler-' + cooler.__version__)
    info['tool-url'] = str('https://github.com/deeptools/HiCMatrix')

    # info['nchroms'] = int(bins_data_frame['chrom'][:].nunique())
    # info['chromosomes'] = list(bins_data_frame['chrom'][:].unique())
    # info['nnz'] = np.string_(str(self.matrix.nnz * 2))
    # info['min-value'] = np.string_(str(matrix_data_frame['count'].min()))
    # info['max-value'] = np.string_(str(matrix_data_frame['count'].max()))
    # info['sum-elements'] = int(matrix_data_frame['count'].sum())

    if self.hic_metadata is not None and 'matrix-generated-by' in self.hic_metadata:
        info['matrix-generated-by'] = str(self.hic_metadata['matrix-generated-by'])
        del self.hic_metadata['matrix-generated-by']
    if self.hic_metadata is not None and 'matrix-generated-by-url' in self.hic_metadata:
        info['matrix-generated-by-url'] = str(self.hic_metadata['matrix-generated-by-url'])
        del self.hic_metadata['matrix-generated-by-url']
    if self.hic_metadata is not None and 'genome-assembly' in self.hic_metadata:
        info['genome-assembly'] = str(self.hic_metadata['genome-assembly'])
        del self.hic_metadata['genome-assembly']

    local_temp_dir = os.path.dirname(os.path.realpath(pFileName))

    cooler.create_cooler(cool_uri=pFileName,
                         bins=bins_data_frame,
                         pixels=matrix_data_frame,
                         mode=self.appendData,
                         dtypes=dtype_pixel,
                         ordered=True,
                         metadata=self.hic_metadata,
                         temp_dir=local_temp_dir)

    if self.appendData == 'w':
        fileName = pFileName.split('::')[0]
        with h5py.File(fileName, 'r+') as h5file:
            h5file.attrs.update(info)
            h5file.close()
def test_dump():
    runner = CliRunner()
    with runner.isolated_filesystem():
        f_in = op.join(datadir, "toy.symm.upper.2.cool")
        result = runner.invoke(dump, [f_in])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-t", "chroms", "--columns", "length"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-t", "bins", "--columns", "chrom,start"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-r", "chr1"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-r", "chr1:0-16", "-r2", "chr1:10-25"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-r", "chr1:10-25", "-r2", "chr1:0-5"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--join"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--join", "--one-based-ids"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--join", "--one-based-starts"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--annotate", "chrom", "--one-based-starts"])
        assert result.exit_code == 0

        # unbalanced file
        result = runner.invoke(dump, [f_in, "-b"])
        assert result.exit_code == 1

        # roundtrip symm-upper data
        result = runner.invoke(dump, [f_in, "-H", "-t", "bins"])
        bins = pd.read_csv(StringIO(result.output), sep="\t")
        result = runner.invoke(dump, [f_in, "-H"])
        pixels = pd.read_csv(StringIO(result.output), sep="\t")
        cooler.create_cooler("out.cool", bins, pixels, symmetric_upper=True)
        cooler_cmp(f_in, "out.cool")

        # duplexed output
        result = runner.invoke(dump, [f_in, "--matrix", "-H"])
        pixels2 = pd.read_csv(StringIO(result.output), sep="\t")
        assert len(pixels2) > len(pixels)
        upper = pixels2[pixels2["bin1_id"] <= pixels2["bin2_id"]].reset_index(drop=True)
        assert np.allclose(pixels, upper)

        # lower triangle
        result = runner.invoke(dump, [f_in, "-H", "-r", "chr2", "-r2", "chr1"])
        trans_lower = pd.read_csv(StringIO(result.output), sep="\t")
        assert len(trans_lower) == 0
        result = runner.invoke(dump, [f_in, "-m", "-H", "-r", "chr2", "-r2", "chr1"])
        trans_lower = pd.read_csv(StringIO(result.output), sep="\t")
        assert len(trans_lower) > 0

        # roundtrip square data
        f_in = op.join(datadir, "toy.asymm.2.cool")
        result = runner.invoke(dump, [f_in, "-H", "-t", "bins"])
        bins = pd.read_csv(StringIO(result.output), sep="\t")
        result = runner.invoke(dump, [f_in, "-H"])
        pixels = pd.read_csv(StringIO(result.output), sep="\t")
        cooler.create_cooler("out.cool", bins, pixels, symmetric_upper=False)
        cooler_cmp(f_in, "out.cool")
        result = runner.invoke(dump, [f_in, "--matrix", "-H"])
        pixels2 = pd.read_csv(StringIO(result.output), sep="\t")
        assert np.allclose(pixels, pixels2)

        # for square data, -m is a no-op
        result = runner.invoke(dump, [f_in, "-H", "-r", "chr2", "-r2", "chr1"])
        lower1 = pd.read_csv(StringIO(result.output), sep="\t")
        result = runner.invoke(dump, [f_in, "-m", "-H", "-r", "chr2", "-r2", "chr1"])
        lower2 = pd.read_csv(StringIO(result.output), sep="\t")
        assert np.allclose(lower1, lower2)
def to_cooler(hic, path, balance=True, multires=True, resolutions=None,
              n_zooms=10, threads=1, chunksize=100000, max_resolution=5000000,
              natural_order=True, chromosomes=None, **kwargs):
    """
    Export Hi-C data as Cooler file. Only contacts that have not been
    filtered are exported. https://github.com/mirnylab/cooler/

    Single resolution files: If the input Hi-C matrix is uncorrected, the
    uncorrected matrix is stored. If it is corrected, the uncorrected matrix
    is stored along with the bias vector. Cooler always calculates the
    corrected matrix on-the-fly from the uncorrected matrix and the bias
    vector.

    Multi-resolution files (default): the single-resolution cooler is built
    first, then zoomified (and optionally balanced) at each requested
    resolution.

    :param hic: Hi-C file in any compatible (RegionMatrixContainer) format
    :param path: Output path for cooler file
    :param balance: Include bias vector in cooler output (single res) or
                    perform iterative correction (multi res)
    :param multires: Generate a multi-resolution cooler file
    :param resolutions: Resolutions in bp (int) for multi-resolution cooler output
    :param chunksize: Number of pixels processed at a time in cooler
    :param kwargs: Additional arguments passed to cooler.iterative_correction
    """
    base_resolution = hic.bin_size

    tmp_files = []
    try:
        if multires:
            if resolutions is None:
                resolutions = [base_resolution * 2 ** i for i in range(n_zooms)
                               if base_resolution * 2 ** i < max_resolution]
            else:
                for r in resolutions:
                    if r % base_resolution != 0:
                        raise ValueError("Resolution {} must be a multiple of "
                                         "base resolution {}!".format(r, base_resolution))

            single_path = tempfile.NamedTemporaryFile(delete=False, suffix='.cool').name
            tmp_files.append(single_path)
            multi_path = path
        else:
            single_path = path
            multi_path = None

        natural_key = cmp_to_key(natural_cmp)
        if chromosomes is None:
            chromosomes = hic.chromosomes()
            if natural_order:
                chromosomes = sorted(chromosomes,
                                     key=lambda x: natural_key(x.encode('utf-8')))

        logger.info("Loading genomic regions")
        ix_converter = dict()
        regions = []
        region_order = []
        new_region_index = 0
        for chromosome in chromosomes:
            for region in hic.regions(chromosome, lazy=True):
                regions.append((region.chromosome, region.start - 1, region.end))
                ix_converter[region.ix] = new_region_index
                region_order.append(region.ix)
                new_region_index += 1
        region_df = pandas.DataFrame(regions, columns=['chrom', 'start', 'end'])

        def pixel_iter():
            for chri in range(len(chromosomes)):
                chromosome1 = chromosomes[chri]
                for chrj in range(chri, len(chromosomes)):
                    chromosome2 = chromosomes[chrj]
                    logger.info("{} - {}".format(chromosome1, chromosome2))

                    def chromosome_pixel_iter():
                        for edge in hic.edges((chromosome1, chromosome2), norm=False, lazy=True):
                            source, sink = ix_converter[edge.source], ix_converter[edge.sink]
                            if sink < source:
                                source, sink = sink, source
                            yield source, sink, edge.weight

                    pixels = np.fromiter(chromosome_pixel_iter(),
                                         dtype=[("bin1_id", np.int_),
                                                ("bin2_id", np.int_),
                                                ("count", np.float_)])
                    pixels = np.sort(pixels, order=("bin1_id", "bin2_id"))
                    if len(pixels) > 0:
                        yield pandas.DataFrame(pixels)

        logger.info("Writing cooler")
        cooler.create_cooler(cool_uri=single_path, bins=region_df,
                             pixels=pixel_iter(), ordered=False)

        cool_path, group_path = cooler.util.parse_cooler_uri(single_path)

        if not multires:
            if balance:
                logger.info("Writing bias vector from FAN-C matrix")
                bias = hic.bias_vector()[np.array(region_order)]

                # Copied this section from
                # https://github.com/mirnylab/cooler/blob/356a89f6a62e2565f42ff13ec103352f20d251be/cooler/cli/balance.py#L195
                with h5py.File(cool_path, 'r+') as h5:
                    grp = h5[group_path]
                    # add the bias column to the file
                    h5opts = dict(compression='gzip', compression_opts=6)
                    grp['bins'].create_dataset("weight", data=bias, **h5opts)
            return CoolerHic(single_path)
        else:
            cooler.zoomify_cooler(single_path, multi_path, resolutions, chunksize, nproc=threads)
            if balance:
                logger.info("Balancing zoom resolutions...")
                for resolution in resolutions:
                    uri = multi_path + "::resolutions/" + str(resolution)
                    cool_path, group_path = cooler.util.parse_cooler_uri(uri)
                    cool = cooler.Cooler(uri)
                    bias, stats = cooler.balance_cooler(cool, chunksize=chunksize, **kwargs)
                    with h5py.File(cool_path, 'r+') as h5:
                        grp = h5[group_path]
                        # add the bias column to the file
                        h5opts = dict(compression='gzip', compression_opts=6)
                        grp['bins'].create_dataset("weight", data=bias, **h5opts)
                        grp['bins']['weight'].attrs.update(stats)
            return CoolerHic(multi_path + '::resolutions/{}'.format(base_resolution))
    finally:
        for tmp_file in tmp_files:
            os.remove(tmp_file)
def pixel_iter():
    chr_bin = 0
    for ch_no in range(1, 23):
        ch = f"chr{ch_no}"
        print(ch, chr_bin)
        with open(f"Hippo_{ch}") as hic_file:
            counts = {"bin1_id": list(), "bin2_id": list(), "count": list()}
            i = 0
            for line in hic_file:
                if line.strip():
                    row = line.split("\t")
                    for j, cnt in enumerate(row):
                        if j >= i and cnt.strip() != "":
                            counts["bin1_id"].append(chr_bin + i)
                            counts["bin2_id"].append(chr_bin + j)
                            counts["count"].append(int(cnt))
                    i += 1
            yield DataFrame(data=counts, copy=True)
            chr_bin += i + 1


bin_size = 40000
bins = read_bins(bin_size)
pixels = pixel_iter()
cooler.create_cooler(f"hippo.mcool::resolutions/{bin_size}", bins, pixels, ordered=True)
def export_to_cooler(
    contact_table,
    output_prefix,
    cooler_resolution,
    fragment_table,
    chromsizes,
    query,
    query_columns=None,
    by_haplotype=False,
):
    results = []
    if query_columns:
        columns = query_columns[:]
    else:
        columns = []
    columns.extend(["align1_fragment_id", "align2_fragment_id"])
    if by_haplotype:
        columns.extend(["align1_haplotype", "align2_haplotype"])
    contact_df = dd.read_parquet(contact_table, engine=PQ_ENGINE,
                                 version=PQ_VERSION, columns=columns, index=False)
    if query:
        contact_df = contact_df.query(query)

    chrom_dict = pd.read_csv(chromsizes, sep="\t", header=None,
                             names=["chrom", "size"], index_col=["chrom"],
                             squeeze=True)
    # create even-width bins using cooler
    bins_df = binnify(chrom_dict, cooler_resolution)
    bins_df.index.name = "bin_id"
    # convert to ranges for overlap
    bins = pr.PyRanges(bins_df.reset_index().rename(
        columns={"start": "Start", "end": "End", "chrom": "Chromosome"}))

    fragment_df = dd.read_parquet(fragment_table, engine=PQ_ENGINE, version=PQ_VERSION).compute()
    midpoint_df = pr.PyRanges(
        fragment_df.reset_index()[["chrom", "start", "end", "fragment_id"]]
        .assign(start=lambda x: ((x.start + x.end) * 0.5).round(0).astype(int))
        .eval("end = start + 1")
        .rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"})
    )
    # use a pyranges join to assign fragments to bins
    fragment_to_bin = midpoint_df.join(bins, how="left").df[["fragment_id", "bin_id"]]
    fragment_to_bin = fragment_to_bin.set_index("fragment_id").sort_index()  # .astype(np.uint32)
    nulls = fragment_to_bin["bin_id"] == -1
    if nulls.any():
        logger.warning(
            "Some fragments did not overlap bins, removing from analysis:\n{}".format(
                fragment_to_bin[nulls].join(fragment_df)
            )
        )
        fragment_to_bin = fragment_to_bin[~nulls]

    # use a join to assign each end of a contact to a bin
    binned_contacts = (
        contact_df.merge(fragment_to_bin, how="inner", right_index=True,
                         left_on="align1_fragment_id")
        .merge(fragment_to_bin, how="inner", right_index=True,
               left_on="align2_fragment_id", suffixes=[None, "_2"])
        .rename(columns={"bin_id": "bin1_id", "bin_id_2": "bin2_id"})
    )

    if not by_haplotype:
        cooler_path = output_prefix + ".cool"
        # group size == number of contacts per bin_pair
        pixels = (
            binned_contacts.groupby(["bin1_id", "bin2_id"])
            .size()
            .rename("count")
            .astype(np.int32)
            .reset_index()
        )
        create_cooler(cooler_path, bins_df, pixels, ordered=True,
                      symmetric_upper=True, ensure_sorted=True)
        c = Cooler(cooler_path)
        logger.info(f"Created cooler: {c.info}")
        results.append(cooler_path)
    else:
        tmp_parquet = output_prefix + ".tmp.pq"
        pixels = (
            # create a key to group by haplotype pair; the order of haplotypes doesn't matter
            binned_contacts.assign(
                hap_key=lambda x: x[["align1_haplotype", "align2_haplotype"]].apply(
                    lambda y: "{}_{}".format(*sorted(y)).replace("-1", "nohap"),
                    axis=1,
                    meta="object",
                )
            )
            .groupby(["hap_key", "bin1_id", "bin2_id"])
            .size()
            .rename("count")
            .astype(np.int32)
            .reset_index()
            .astype({"hap_key": "category"})
        )

        # save to a temporary parquet file; this might not be necessary,
        # but we want to avoid the whole contact matrix hitting memory
        pixels.to_parquet(
            tmp_parquet,
            write_metadata_file=True,
            partition_on=["hap_key"],
            write_index=False,
            engine=PQ_ENGINE,
            version=PQ_VERSION,
        )

        pixels = dd.read_parquet(tmp_parquet, engine=PQ_ENGINE, version=PQ_VERSION,
                                 columns=["hap_key"], index=False)
        hap_keys = pixels["hap_key"].unique().compute()
        # create a cooler for each haplotype pair
        for hap_key in hap_keys:
            cooler_path = f"{output_prefix}.{hap_key}.cool"
            pixels = dd.read_parquet(
                tmp_parquet,
                filters=[("hap_key", "==", hap_key)],
                index=False,
                engine=PQ_ENGINE,
                version=PQ_VERSION,
                columns=["bin1_id", "bin2_id", "count"],
            )
            create_cooler(cooler_path, bins_df, pixels, ordered=True,
                          symmetric_upper=True, ensure_sorted=True)
            c = Cooler(cooler_path)
            logger.info(f"Created cooler: {c.info}")
            results.append(cooler_path)
        shutil.rmtree(tmp_parquet)

    return results
vec_of_prob = [p / total_num_reads for p in vec_of_prob]
print("start of sampling...")
down_sampled_counts = np.random.multinomial(num_sample_reads, vec_of_prob)
print("sampling finished!")

if not os.path.exists(args['output_folder_path']):
    os.makedirs(args['output_folder_path'])

start_ind = 0
for chr_file in chr_files_list:
    chr_data = pd.read_csv(os.path.join(COO_folder_path, chr_file), delimiter="\t", header=None)
    pixel_size = chr_data.shape[0]
    new_pixel = np.column_stack((chr_data.iloc[:, 0],
                                 chr_data.iloc[:, 1],
                                 down_sampled_counts[start_ind:start_ind + pixel_size]))
    start_ind = start_ind + pixel_size
    np.savetxt(os.path.join(args['output_folder_path'], chr_file), new_pixel,
               delimiter="\t", fmt="%i")
    print(chr_file + " is done!")

"""
Obs1: when we fetch a specific chromosome, the first column refers to regions in
that chromosome, but the second column can refer to regions across the whole genome.
Obs2: reads are not counted twice across files; for example, when we fetch chr2
pixels, there are no interactions between chr2 and chr1 any more.
Obs3: number of intra reads: 125015861, whole reads: 153752070 (in the
low-resolution sample).

new_bins = high_res_cool.bins()
cooler.create_cooler(cool_uri="/Users/neda/prostate-samples/PCa13266.down-sample.cool",
                     bins=new_bins, pixels=new_pixel)
"""
def create_cooler_cc(
    output_prefix: str,
    bins: pd.DataFrame,
    pixels: pd.DataFrame,
    capture_name: str,
    capture_oligos: os.PathLike,
    capture_bins: Union[int, list] = None,
    suffix=None,
    **cooler_kwargs,
) -> os.PathLike:
    """
    Creates a cooler hdf5 file or cooler-formatted group within an hdf5 file.

    Args:
        output_prefix (str): Output path for hdf5 file. If this already exists,
            will append a new group to the file.
        bins (pd.DataFrame): DataFrame containing the genomic coordinates of
            all bins in the pixels table.
        pixels (pd.DataFrame): DataFrame with columns: bin1_id, bin2_id, count.
        capture_name (str): Name of capture probe to store.
        capture_oligos (os.PathLike): Path to capture oligos used for the analysis.
        capture_bins (Union[int, list], optional): Bins containing capture oligos.
            Can be determined from oligos if not supplied. Defaults to None.
        suffix (str, optional): Suffix to append before the .hdf5 file extension.
            Defaults to None.

    Raises:
        ValueError: Capture name must exactly match the name of a supplied
            capture oligo.

    Returns:
        os.PathLike: Path of cooler hdf5 file.
    """
    # Gets capture coordinates
    capture_coords = get_capture_coords(capture_oligos, capture_name)

    # Make sure capture coordinates are returned correctly; if not, error.
    if capture_coords is None:
        raise ValueError(f"Incorrect capture name specified: {capture_name}.")

    # If capture bins are not provided, get them using the coordinates.
    if not capture_bins:
        capture_bins = get_capture_bins(
            bins,
            capture_coords["chrom"],
            capture_coords["start"],
            capture_coords["end"],
        )
        capture_bins = [int(x) for x in capture_bins]
    # Need to store bins as a list, so make sure it's not just a single int.
    elif isinstance(capture_bins, int):
        capture_bins = [int(capture_bins), ]
    # cooler.create_cooler will not accept np.arrays, so convert to a python list.
    # Note: np.array is a function, not a type, so isinstance needs np.ndarray here.
    elif isinstance(capture_bins, (np.ndarray, pd.Series)):
        capture_bins = [int(x) for x in capture_bins]

    # Get the number of cis interactions, required for normalisation.
    bins_cis = bins.query(f'chrom == "{capture_coords["chrom"]}"')["name"]
    pixels_cis = pixels.loc[lambda df: (df["bin1_id"].isin(bins_cis)) | (df["bin2_id"].isin(bins_cis))]
    n_cis_interactions = pixels_cis["count"].sum()

    # Metadata for cooler file.
    metadata = {
        "capture_bins": capture_bins,
        "capture_name": capture_name,
        "capture_chrom": capture_coords["chrom"],
        "capture_coords": f'{capture_coords["chrom"]}:{capture_coords["start"]}-{capture_coords["end"]}',
        "n_cis_interactions": int(n_cis_interactions),
    }

    if os.path.exists(output_prefix):  # Will append to a pre-existing file if one is supplied
        append_to_file = True
        cooler_fn = f"{output_prefix}::/{capture_name}"
    else:
        append_to_file = False
        cooler_fn = f"{output_prefix.replace('.hdf5', '')}.{capture_name}{'.' + suffix if suffix else ''}.hdf5"

    cooler.create_cooler(
        cooler_fn,
        bins=bins,
        pixels=pixels,
        metadata=metadata,
        mode="w" if not append_to_file else "a",
        **cooler_kwargs,
    )

    return cooler_fn
def createFakeMatrices(outfile, peakpos, peakwidth, length, resolution, count, chromosome):
    errorMsg = ""
    if not outfile.endswith('.cool'):
        errorMsg += "Matrix output file must be in cooler format. Aborting\n"
    if peakwidth > length / 2:
        errorMsg += "peak width must not be more than half the peak length\n"
    if peakpos - peakwidth / 2 < 0 or peakpos + peakwidth / 2 > length:
        errorMsg += "Peak is not fully inside the range (0...length). Reduce peak width or adjust peak position\n"
    if errorMsg != "":
        sys.exit(errorMsg)

    adjustedLength = length - length % resolution
    binStartList = list(range(0, adjustedLength, resolution))
    binEndList = list(range(resolution, adjustedLength, resolution))
    binEndList.append(adjustedLength)
    if len(binStartList) != len(binEndList):
        errorMsg = "bug while creating bins. Start and end bin lists not equally long"
        sys.exit(errorMsg)
    bins = pd.DataFrame(columns=['chrom', 'start', 'end'])
    bins['start'] = binStartList
    bins['end'] = binEndList
    bins['chrom'] = chromosome

    bin1List = []
    bin2List = []
    for bin1Id in range(len(binStartList)):
        for bin2Id in range(len(binStartList)):
            bin1List.append(bin1Id)
            bin2List.append(bin2Id)
    pixels = pd.DataFrame(columns=['bin1_id', 'bin2_id', 'count'])
    pixels['bin1_id'] = bin1List
    pixels['bin2_id'] = bin2List
    pixels['count'] = 0

    adjustedPeakWidth = peakwidth - peakwidth % resolution
    peakStartBin = int((peakpos - adjustedPeakWidth / 2) / resolution)
    peakEndBin = peakStartBin + int(adjustedPeakWidth / resolution)
    m1 = pixels['bin1_id'] >= peakStartBin
    m2 = pixels['bin1_id'] < peakEndBin
    m3 = pixels['bin2_id'] >= peakStartBin
    m4 = pixels['bin2_id'] < peakEndBin
    mask = m1 & m2 & m3 & m4
    pixels.loc[mask, 'count'] = count
    pixels.sort_values(by=['bin1_id', 'bin2_id'], inplace=True)

    # assert that the resulting matrix is symmetric
    matIdx = (list(pixels['bin1_id']), list(pixels['bin2_id']))
    data = list(pixels['count'])
    mtrx = sparse.csr_matrix((data, matIdx)).todense()
    symmetric = np.allclose(mtrx, mtrx.T, rtol=1e-20, atol=1e-20)
    if not symmetric:
        errorMsg = 'bug: resulting matrix should be symmetric, but is not'
        sys.exit(errorMsg)

    cooler.create_cooler(outfile, bins=bins, pixels=pixels, triucheck=False, symmetric_upper=False)
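# Example invocation with assumed toy parameters: a 10 kb-wide peak centred at
# 50 kb on a 100 kb fake chromosome binned at 5 kb. All values are illustrative.
createFakeMatrices(outfile='fake.cool', peakpos=50000, peakwidth=10000,
                   length=100000, resolution=5000, count=10,
                   chromosome='chrFake')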
def writeCooler(pMatrixList, pBinSizeInt, pOutfile, pChromosomeList,
                pChromSizeList=None, pMetadata=None):
    # takes matrices as numpy arrays or sparse matrices and writes a cooler matrix from them
    # modified from study project such that multiple chroms can be written to a single matrix

    def pixelGenerator(pMatrixList, pOffsetList):
        '''
        yields pixel dataframes per matrix

        Parameters:
            pMatrixList: list of matrices as np.ndarray or sparse.csr_matrix
            pOffsetList: list of integers that specify the offset into the bins dataframe

        Yields:
            pixels: pixels dataframe for all Hi-C matrices in the input list
        '''
        for matrix, offset in zip(pMatrixList, pOffsetList):
            # create the pixels for cooler
            triu_Indices = np.triu_indices(matrix.shape[0])
            pixels_tmp = pd.DataFrame(columns=['bin1_id', 'bin2_id', 'count'])
            pixels_tmp['bin1_id'] = (triu_Indices[0] + offset).astype("uint32")
            pixels_tmp['bin2_id'] = (triu_Indices[1] + offset).astype("uint32")
            readCounts = matrix[triu_Indices]
            if sparse.isspmatrix_csr(matrix):  # for sparse matrices, slicing is different
                readCounts = np.transpose(readCounts)
            pixels_tmp['count'] = np.float64(readCounts)
            pixels_tmp.sort_values(by=['bin1_id', 'bin2_id'], inplace=True)
            yield pixels_tmp

    if pMatrixList is None or pChromosomeList is None or pBinSizeInt is None or pOutfile is None:
        msg = "input empty. No cooler matrix written"
        print(msg)
        return
    if len(pMatrixList) != len(pChromosomeList):
        msg = "number of input arrays and chromosomes must be the same"
        print(msg)
        return
    if pChromSizeList is not None and len(pChromSizeList) != len(pChromosomeList):
        msg = "if chrom sizes are given, they must be provided for ALL chromosomes"
        print(msg)
        return

    bins = pd.DataFrame(columns=['chrom', 'start', 'end'])
    offsetList = [0]
    for i, (matrix, chrom) in enumerate(zip(pMatrixList, pChromosomeList)):
        # the chromosome size may not be integer-divisible by the bin size,
        # so specifying the real chrom size is possible, but the
        # number of bins must still correspond to the matrix size
        chromSizeInt = int(matrix.shape[0] * pBinSizeInt)
        if pChromSizeList is not None \
                and pChromSizeList[i] is not None \
                and pChromSizeList[i] > (chromSizeInt - pBinSizeInt) \
                and pChromSizeList[i] < chromSizeInt:
            chromSizeInt = int(pChromSizeList[i])  # was pChromSizeList[0]: use the current chromosome's size
        # create the bins for cooler
        bins_tmp = pd.DataFrame(columns=['chrom', 'start', 'end'])
        binStartList = list(range(0, chromSizeInt, int(pBinSizeInt)))
        binEndList = list(range(int(pBinSizeInt), chromSizeInt, int(pBinSizeInt)))
        binEndList.append(chromSizeInt)
        bins_tmp['start'] = np.uint32(binStartList)
        bins_tmp['end'] = np.uint32(binEndList)
        bins_tmp["chrom"] = str(chrom)
        bins = bins.append(bins_tmp, ignore_index=True)  # note: DataFrame.append requires pandas < 2.0
        offsetList.append(offsetList[-1] + bins_tmp.shape[0])

    # correct dtypes for joint dataframe
    bins["start"] = bins["start"].astype("uint32")
    bins["end"] = bins["end"].astype("uint32")
    offsetList = offsetList[:-1]  # don't need the last one, no more matrix to follow

    # write out the cooler
    cooler.create_cooler(pOutfile, bins=bins,
                         pixels=pixelGenerator(pMatrixList=pMatrixList, pOffsetList=offsetList),
                         dtypes={'count': np.float64},
                         ordered=True,
                         metadata=pMetadata)
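# Sketch of stitching two toy per-chromosome matrices into one cooler with
# writeCooler; the sizes and chromosome names are assumptions, and since the
# helper above relies on DataFrame.append, this requires pandas < 2.0.
import numpy as np

mat_chr1 = np.ones((3, 3))
mat_chr2 = np.full((2, 2), 2.0)
writeCooler([mat_chr1, mat_chr2], pBinSizeInt=10000, pOutfile='two_chroms.cool',
            pChromosomeList=['chr1', 'chr2'])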
def sample_cooler(clr, out_clr_path, count=None, frac=None, exact=False,
                  map_func=map, chunksize=int(1e7)):
    """
    Pick a random subset of contacts from a Hi-C map.

    Parameters
    ----------
    clr : cooler.Cooler or str
        A Cooler or a path/URI to a Cooler with input data.
    out_clr_path : str
        A path/URI to the output.
    count : float
        The target number of contacts in the sample.
        Mutually exclusive with `frac`.
    frac : float
        The target sample size as a fraction of contacts in the original
        dataset. Mutually exclusive with `count`.
    exact : bool
        If True, the resulting sample size will exactly match the target value.
        Exact sampling will load the whole pixel table into memory!
        If False, binomial sampling will be used instead and the sample size
        will be randomly distributed around the target value.
    map_func : function
        A map implementation.
    chunksize : int
        The number of pixels loaded and processed per step of computation.
    """
    if issubclass(type(clr), str):
        clr = cooler.Cooler(clr)

    if count is not None and frac is None:
        frac = count / clr.info["sum"]
    elif count is None and frac is not None:
        count = np.round(frac * clr.info["sum"])
    else:
        raise ValueError("Exactly one of count or frac must be specified!")

    if frac >= 1.0:
        raise ValueError(
            "The number of contacts in a sample cannot exceed "
            "that in the original dataset."
        )

    if exact:
        pixels = sample_pixels_exact(clr.pixels()[:], count)
        cooler.create_cooler(out_clr_path, clr.bins()[:], pixels, ordered=True)
    else:
        pipeline = (
            cooler.tools.split(clr, include_bins=False, map=map_func, chunksize=chunksize)
            .pipe(_extract_pixel_chunk)
            .pipe(sample_pixels_approx, frac=frac)
        )
        cooler.create_cooler(out_clr_path, clr.bins()[:], iter(pipeline), ordered=True)
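# Hedged usage sketch ('input.cool' is a placeholder path): downsample a cooler
# to half of its contacts with binomial sampling, or draw an exact count.
sample_cooler('input.cool', 'half.cool', frac=0.5)
sample_cooler('input.cool', 'exact_1M.cool', count=1_000_000, exact=True)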
def toCooler(pixels, bins, outfile):
    # check if the inputs are as expected
    try:
        df_pixels = pd.read_csv(pixels, sep="\t", index_col=False)
        df_bins = pd.read_csv(bins, sep="\t", index_col=False)
    except Exception as e:
        msg = str(e) + "\nCould not read infiles, wrong files/format etc.?"
        raise SystemExit(msg)
    pixels_columns = {"cbin1", "cbin2", "expected_count", "observed_count"}
    bins_columns = {"cbin", "chr", "from.coord", "to.coord", "count"}
    if len(pixels_columns.intersection(set(df_pixels.columns))) != len(pixels_columns):
        msg = "pixels: not the expected column names"
        raise SystemExit(msg)
    if len(bins_columns.intersection(set(df_bins.columns))) != len(bins_columns):
        msg = "bins: not the expected column names"
        raise SystemExit(msg)

    # prepare the pixels for cooler
    df_pixels.rename(columns={"cbin1": "bin1_id", "cbin2": "bin2_id",
                              "observed_count": "count"}, inplace=True)
    df_pixels.drop(columns=["expected_count"], inplace=True)
    gt = df_pixels["bin1_id"] > df_pixels["bin2_id"]
    # drop duplicate entries, keep the upper triangular part of the matrix
    df_pixels = df_pixels[~gt]

    # prepare the bins for cooler
    df_bins.rename(columns={"chr": "chrom", "from.coord": "start",
                            "to.coord": "end"}, inplace=True)
    df_bins.drop(columns=["count"], inplace=True)
    binsize = df_bins.iloc[0, :]["end"] - df_bins.iloc[0, :]["start"]
    chromnames = list(df_bins["chrom"].unique())
    print("chromnames:", chromnames)
    print("detected binsize:", binsize)

    # sometimes the last bin is present, but "end" does not point to the chrom size;
    # instead it points to the max. size implied by the bin (maxbin * binsize).
    # We need to set the max. chrom size in this case, otherwise the last bin will be
    # duplicated, once with end == chromsize and once with end == maxbin * binsize.
    for chrom in chromnames:
        max_allowed_val = int(np.ceil(DM3_CHROM_SIZES[chrom] / binsize) * binsize)
        chromfltr = df_bins["chrom"] == chrom
        max_given_val = df_bins[chromfltr]["end"].values[-1]
        if max_given_val == max_allowed_val:
            idx = df_bins.loc[chromfltr, "end"].index[-1]
            df_bins.loc[idx, "end"] = DM3_CHROM_SIZES[chrom]
            msg = "INFO: reset size of chromosome {:s} from {:d} to {:d}".format(
                chrom, max_given_val, DM3_CHROM_SIZES[chrom])
            print(msg)
        elif max_given_val < max_allowed_val:
            pass
        else:
            msg = "Chrom {:s} is larger than expected from ref. genome dm3.".format(chrom)
            raise SystemExit(msg)

    # The provided bins dataframe is sparse (missing bins at start, end, and in-between),
    # so create a new one which contains all bins
    df_bins_cpl = pd.DataFrame()
    for chr in chromnames:
        chromsize = DM3_CHROM_SIZES[chr]
        start_list = [x for x in range(0, chromsize, binsize)]
        end_list = [x for x in range(binsize, chromsize, binsize)] + [chromsize]
        df1 = pd.DataFrame()
        df1["start"] = start_list
        df1["end"] = end_list
        df1["chrom"] = chr
        df_bins_cpl = df_bins_cpl.append(df1, ignore_index=True)
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2R"].tail())
    df_bins_cpl.reset_index(inplace=True, drop=True)

    # get the old indices into the complete bins dataframe;
    # they are needed to update the pixels dataframe later
    df_bins_cpl = df_bins_cpl.merge(df_bins, on=["start", "end", "chrom"], how="outer")
    df_bins_cpl["cbin"].fillna(-1, inplace=True)
    df_bins_cpl["cbin"] = df_bins_cpl["cbin"].astype("int64")
    df_bins_cpl.sort_values(by=["chrom", "start", "end"], inplace=True)
    df_bins_cpl.reset_index(inplace=True, drop=True)
    df_bins_cpl["new_index"] = df_bins_cpl.index

    # update the bin ids in the pixels df
    df_pixels = df_pixels.merge(df_bins_cpl, left_on="bin1_id", right_on="cbin", how="inner")
    df_pixels["bin1_id"] = df_pixels["new_index"]
    df_pixels.drop(columns=["new_index", "cbin", "start", "end", "chrom"], inplace=True)
    df_pixels = df_pixels.merge(df_bins_cpl, left_on="bin2_id", right_on="cbin", how="inner")
    df_pixels["bin2_id"] = df_pixels["new_index"]
    df_pixels.drop(columns=["new_index", "cbin", "start", "end", "chrom"], inplace=True)
    df_bins_cpl.drop(columns=["cbin", "new_index"], inplace=True)

    print("\nsome lines of bins df:")
    print(df_bins_cpl.head(10))
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2L"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2R"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2R"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3L"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3L"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3R"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3R"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "4"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "4"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "X"].head())
    print(df_bins_cpl.tail())
    print("\nsome lines of pixels df:")
    print(df_pixels.head())
    print(df_pixels.tail())

    # write the cooler file
    cooler.create_cooler(outfile, bins=df_bins_cpl, pixels=df_pixels,
                         ordered=True, metadata={"fromFilenames": [pixels, bins]})