def _diagsum_asymm(clr, fields, transforms, regions1, regions2, span):
    """
    Calculates a diagonal summary for a collection of rectangular regions
    defined as combinations of regions1 and regions2. Returns a dictionary
    of DataFrames with diagonal sums as values, and 0-based indexes of
    rectangular genomic regions as keys.
    """
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    # this could be further expanded to allow for custom groupings:
    pixels["dist"] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    diag_sums = {}
    # r1 and r2 define rectangular block i:
    for i, (r1, r2) in enumerate(zip(regions1, regions2)):
        r1 = assign_supports(pixels, [r1], suffix="1")
        r2 = assign_supports(pixels, [r2], suffix="2")
        # calculate diag_sums on the spot to allow for overlapping blocks:
        diag_sums[i] = pixels[(r1 == r2)].groupby("dist")[fields].sum()

    return diag_sums
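# --- Usage sketch (added for illustration, not from the original source) ---
# A minimal, hedged example of driving _diagsum_asymm over a single span that
# covers the whole pixel table. Assumptions: `clr` is an open cooler with a
# 'weight' column, assign_supports() is in scope as above, and regions are
# (chrom, start, end) tuples; the "balanced" transform is a common choice,
# not something _diagsum_asymm itself mandates.
def _example_diagsum_asymm(clr):
    transforms = {"balanced": lambda p: p["count"] * p["weight1"] * p["weight2"]}
    diag_sums = _diagsum_asymm(
        clr,
        fields=["count", "balanced"],
        transforms=transforms,
        regions1=[("chr1", 0, 10_000_000)],
        regions2=[("chr1", 10_000_000, 20_000_000)],
        span=(0, len(clr.pixels())),  # a single chunk covering all pixels
    )
    return diag_sums[0]  # per-diagonal sums for rectangular block 0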
def getData3(cooler_matrix, zoomLevel, startPos1, endPos1, startPos2, endPos2):
    c = cooler_matrix['cooler']

    i0 = absCoord2bin(c, startPos1)
    i1 = absCoord2bin(c, endPos1)
    j0 = absCoord2bin(c, startPos2)
    j1 = absCoord2bin(c, endPos2)

    if (i1 - i0) == 0 or (j1 - j0) == 0:
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    pixels = c.matrix(as_pixels=True, max_chunk=np.inf)[i0:i1, j0:j1]
    if not len(pixels):
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    lo = min(i0, j0)
    hi = max(i1, j1)
    bins = c.bins()[['chrom', 'start', 'end', 'weight']][lo:hi]
    bins['chrom'] = bins['chrom'].cat.codes

    pixels = cooler.annotate(pixels, bins)
    pixels['genome_start'] = cumul_lengths[pixels['chrom1']] + pixels['start1']
    pixels['genome_end'] = cumul_lengths[pixels['chrom2']] + pixels['end2']
    pixels['balanced'] = pixels['count'] * pixels['weight1'] * pixels['weight2']

    return pixels[['genome_start', 'genome_end', 'balanced']]
def getData3(fpath, zoomLevel, startPos1, endPos1, startPos2, endPos2):
    f = h5py.File(fpath, 'r')
    c = cooler.Cooler(f[str(zoomLevel)])

    i0 = absCoord2bin(c, startPos1)
    i1 = absCoord2bin(c, endPos1)
    j0 = absCoord2bin(c, startPos2)
    j1 = absCoord2bin(c, endPos2)

    if (i1 - i0) == 0 or (j1 - j0) == 0:
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    pixels = c.matrix(as_pixels=True, max_chunk=np.inf)[i0:i1, j0:j1]
    if not len(pixels):
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    lo = min(i0, j0)
    hi = max(i1, j1)
    bins = c.bins()[['chrom', 'start', 'end', 'weight']][lo:hi]
    bins['chrom'] = bins['chrom'].cat.codes

    pixels = cooler.annotate(pixels, bins)
    pixels['genome_start'] = cumul_lengths[pixels['chrom1']] + pixels['start1']
    pixels['genome_end'] = cumul_lengths[pixels['chrom2']] + pixels['end2']
    pixels['balanced'] = pixels['count'] * pixels['weight1'] * pixels['weight2']

    return pixels[['genome_start', 'genome_end', 'balanced']]
def getData3(cooler_matrix, zoomLevel, startPos1, endPos1, startPos2, endPos2):
    c = cooler_matrix["cooler"]

    i0 = absCoord2bin(c, startPos1)
    i1 = absCoord2bin(c, endPos1)
    j0 = absCoord2bin(c, startPos2)
    j1 = absCoord2bin(c, endPos2)

    if (i1 - i0) == 0 or (j1 - j0) == 0:
        return pd.DataFrame(columns=["genome_start", "genome_end", "balanced"])

    pixels = c.matrix(as_pixels=True, max_chunk=np.inf)[i0:i1, j0:j1]
    if not len(pixels):
        return pd.DataFrame(columns=["genome_start", "genome_end", "balanced"])

    lo = min(i0, j0)
    hi = max(i1, j1)
    bins = c.bins()[["chrom", "start", "end", "weight"]][lo:hi]
    bins["chrom"] = bins["chrom"].cat.codes

    pixels = cooler.annotate(pixels, bins)
    pixels["genome_start"] = cumul_lengths[pixels["chrom1"]] + pixels["start1"]
    pixels["genome_end"] = cumul_lengths[pixels["chrom2"]] + pixels["end2"]
    pixels["balanced"] = pixels["count"] * pixels["weight1"] * pixels["weight2"]

    return pixels[["genome_start", "genome_end", "balanced"]]
def _accum_by_cisdiag(c, bins, span):
    """Sum properties along the diagonals of the intrachromosomal matrices"""
    lo, hi = span
    pixels = c.pixels()[lo:hi]

    # assign chroms and filter for cis records
    pixels = cooler.annotate(pixels, bins[['chrom', 'weight']], replace=False)
    pixels = pixels[pixels.chrom1 == pixels.chrom2].copy()

    # assign diagonal indices
    pixels = pixels.rename(columns={'chrom1': 'chrom'})
    pixels['diag'] = pixels['bin2_id'] - pixels['bin1_id']

    # balance
    pixels['balanced'] = pixels['count'] * pixels['weight1'] * pixels['weight2']
    pixels['balanced2'] = pixels['balanced'] * pixels['balanced']

    # group by diagonal and accumulate
    grouped = pixels.groupby(['chrom', 'diag'], sort=False)
    agg = grouped.aggregate({
        'balanced': np.sum,
        'balanced2': np.sum,
    })
    return agg.reset_index()
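# --- Usage sketch (added for illustration, not from the original source) ---
# Hedged example: accumulate per-diagonal sums over the whole pixel table in
# one span. Assumes `clr` is an open cooler whose bins carry a 'weight'
# column; a chunked driver would instead loop over (lo, hi) spans and
# aggregate the partial results.
def _example_accum_by_cisdiag(clr):
    bins = clr.bins()[:]
    # the result has columns ['chrom', 'diag', 'balanced', 'balanced2']
    return _accum_by_cisdiag(clr, bins, span=(0, len(clr.pixels())))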
def _blocksum_asymm(clr, fields, transforms, regions1, regions2, span):
    """
    Calculates a block summary for a collection of rectangular regions
    defined as combinations of regions1 and regions2. Returns a dictionary
    with block sums as values, and 0-based indexes of rectangular genomic
    regions as keys.
    """
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    for field, t in transforms.items():
        pixels[field] = t(pixels)

    block_sums = {}
    # r1 and r2 define rectangular block i:
    for i, (r1, r2) in enumerate(zip(regions1, regions2)):
        r1 = assign_supports(pixels, [r1], suffix="1")
        r2 = assign_supports(pixels, [r2], suffix="2")
        # calculate sum on the spot to allow for overlapping blocks:
        block_sums[i] = pixels[(r1 == r2)][fields].sum()

    return block_sums
def get_count_df(cool_mat):
    logger.debug(f"Creating counts dataframe for {cool_mat}")
    df = (
        annotate(cool_mat.pixels()[:], cool_mat.bins()[:]["chrom"])
        .eval("is_cis = (chrom1 == chrom2)")
        .pipe(get_distance, cool_mat.binsize)
        .set_index(["bin1_id", "bin2_id"])
        .sort_index()
    )
    num_pixels = len(df)
    logger.info(f"Read {num_pixels} pixels from {cool_mat}")
    return df
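# --- Illustration (added, not from the original source) ---
# get_count_df pipes through a get_distance() helper whose definition is not
# shown here. The sketch below is only a plausible reconstruction under the
# assumption that it reports genomic separation in base pairs for cis pixels
# (NaN for trans); the real helper may differ. Relies on numpy as np, as
# elsewhere in this module.
def get_distance_sketch(df, binsize):
    df = df.copy()
    df["distance"] = (df["bin2_id"] - df["bin1_id"]) * binsize
    df.loc[~df["is_cis"], "distance"] = np.nan  # separation undefined in trans
    return df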
def get_data(f, zoom_level, start_pos_1, end_pos_1, start_pos_2, end_pos_2):
    """Get balanced pixel data.

    Args:
        f (File): File pointer to a .cool file.
        zoom_level (int): Zoom level of the matrix to query.
        start_pos_1 (int): Absolute genomic start position along the first axis.
        end_pos_1 (int): Absolute genomic end position along the first axis.
        start_pos_2 (int): Absolute genomic start position along the second axis.
        end_pos_2 (int): Absolute genomic end position along the second axis.

    Returns:
        DataFrame: Annotated cooler pixels.
    """
    c = cooler.Cooler(f[str(zoom_level)])

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths, chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    pixels = c.matrix(as_pixels=True, balance=False,
                      max_chunk=np.inf)[i0:i1 + 1, j0:j1 + 1]

    if not len(pixels):
        return pd.DataFrame(
            columns=['genome_start1', 'genome_start2', 'balanced'])

    if 'weight' in c.bins():
        bins = c.bins(convert_enum=False)[['chrom', 'start', 'end', 'weight']]
    else:
        bins = c.bins(convert_enum=False)[['chrom', 'start', 'end']]

    pixels = cooler.annotate(pixels, bins)

    pixels['genome_start1'] = chrom_cum_lengths[pixels['chrom1']] + pixels['start1']
    pixels['genome_start2'] = chrom_cum_lengths[pixels['chrom2']] + pixels['start2']

    if 'weight' in c.bins():
        pixels['balanced'] = (
            pixels['count'] * pixels['weight1'] * pixels['weight2']
        )
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    else:
        return pixels[['genome_start1', 'genome_start2', 'count']]
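# --- Usage sketch (added for illustration, not from the original source) ---
# Hedged example: assumes a HiGlass-style multires .cool file in which each
# zoom level is a top-level HDF5 group, as implied by f[str(zoom_level)]
# above. The file path and coordinates are placeholders.
def _example_get_data(path="matrix.multires.cool"):
    import h5py
    with h5py.File(path, "r") as f:
        # zoom level 0, absolute genomic coordinates for axis 1 then axis 2
        return get_data(f, 0, 0, 1_000_000, 0, 1_000_000)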
def _diagsum_symm(clr, fields, transforms, supports, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]

    pixels = cooler.annotate(pixels, bins, replace=False)
    pixels = pixels[pixels['chrom1'] == pixels['chrom2']].copy()
    pixels['diag'] = pixels['bin2_id'] - pixels['bin1_id']
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixels['support'] = assign_supports(pixels, supports, suffix='1')
    pixel_groups = dict(iter(pixels.groupby('support')))
    return {int(i): group.groupby('diag')[fields].sum()
            for i, group in pixel_groups.items()}
def _blocksum_asymm(clr, fields, transforms, supports1, supports2, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]

    pixels = cooler.annotate(pixels, bins, replace=False)
    pixels = pixels[pixels["chrom1"] != pixels["chrom2"]].copy()
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixels["support1"] = assign_supports(pixels, supports1, suffix="1")
    pixels["support2"] = assign_supports(pixels, supports2, suffix="2")
    pixels = pixels.dropna()

    pixel_groups = dict(iter(pixels.groupby(["support1", "support2"])))
    return {(int(i), int(j)): group[fields].sum()
            for (i, j), group in pixel_groups.items()}
def _diagsum_symm(clr, fields, transforms, supports, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]

    pixels = cooler.annotate(pixels, bins, replace=False)
    pixels["support1"] = assign_supports(pixels, supports, suffix="1")
    pixels["support2"] = assign_supports(pixels, supports, suffix="2")
    pixels = pixels[pixels["support1"] == pixels["support2"]].copy()

    pixels["diag"] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixelgroups = dict(iter(pixels.groupby("support1")))
    return {
        int(i): group.groupby("diag")[fields].sum()
        for i, group in pixelgroups.items()
    }
def _diagsum_asymm(clr, fields, transforms, contact_type, supports1,
                   supports2, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]

    pixels = cooler.annotate(pixels, bins, replace=False)
    if contact_type == 'cis':
        pixels = pixels[pixels['chrom1'] == pixels['chrom2']].copy()
    elif contact_type == 'trans':
        pixels = pixels[pixels['chrom1'] != pixels['chrom2']].copy()

    pixels['diag'] = pixels['bin2_id'] - pixels['bin1_id']
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixels['support1'] = assign_supports(pixels, supports1, suffix='1')
    pixels['support2'] = assign_supports(pixels, supports2, suffix='2')

    pixel_groups = dict(iter(pixels.groupby(['support1', 'support2'])))
    return {(int(i), int(j)): group.groupby('diag')[fields].sum()
            for (i, j), group in pixel_groups.items()}
def _diagsum_asymm(clr, fields, transforms, contact_type, supports1,
                   supports2, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]

    pixels = cooler.annotate(pixels, bins, replace=False)
    if contact_type == "cis":
        pixels = pixels[pixels["chrom1"] == pixels["chrom2"]].copy()
    elif contact_type == "trans":
        pixels = pixels[pixels["chrom1"] != pixels["chrom2"]].copy()

    pixels["diag"] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixels["support1"] = assign_supports(pixels, supports1, suffix="1")
    pixels["support2"] = assign_supports(pixels, supports2, suffix="2")

    pixel_groups = dict(iter(pixels.groupby(["support1", "support2"])))
    return {(int(i), int(j)): group.groupby("diag")[fields].sum()
            for (i, j), group in pixel_groups.items()}
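# --- Driver sketch (added for illustration, not from the original source) ---
# The _diagsum_*/_blocksum_* workers each score one (lo, hi) span of the
# pixel table, which implies a chunked driver like the hedged sketch below:
# split the pixels into spans, score each span, then add the per-span
# DataFrames together. This is the pattern, not the original pipeline code.
def _example_chunked_diagsum(clr, fields, transforms, supports1, supports2,
                             contact_type="cis", chunksize=10_000_000):
    n_pixels = len(clr.pixels())
    total = {}
    for lo in range(0, n_pixels, chunksize):
        span = (lo, min(lo + chunksize, n_pixels))
        partial = _diagsum_asymm(clr, fields, transforms, contact_type,
                                 supports1, supports2, span)
        for key, df in partial.items():
            total[key] = (df if key not in total
                          else total[key].add(df, fill_value=0))
    return total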
def get_data(f, zoom_level, start_pos_1, end_pos_1, start_pos_2, end_pos_2,
             transform='default'):
    """Get balanced pixel data.

    Args:
        f (File): File pointer to a .cool file.
        zoom_level (int): Zoom level of the matrix to query.
        start_pos_1 (int): Absolute genomic start position along the first axis.
        end_pos_1 (int): Absolute genomic end position along the first axis.
        start_pos_2 (int): Absolute genomic start position along the second axis.
        end_pos_2 (int): Absolute genomic end position along the second axis.
        transform (str): Which normalization vector to apply: 'default',
            'weight', 'KR', 'VC' or 'VC_SQRT'.

    Returns:
        DataFrame: Annotated cooler pixels.
    """
    c = cooler.Cooler(f[str(zoom_level)])

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths, chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    pixels = c.matrix(as_pixels=True, balance=False,
                      max_chunk=np.inf)[i0:i1 + 1, j0:j1 + 1]

    if not len(pixels):
        return pd.DataFrame(
            columns=['genome_start1', 'genome_start2', 'balanced'])

    # select bin columns to extract
    cols = ['chrom', 'start', 'end']
    if (transform == 'default' and 'weight' in c.bins()) or transform == 'weight':
        cols.append('weight')
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        cols.append(transform)

    bins = c.bins(convert_enum=False)[cols]
    pixels = cooler.annotate(pixels, bins)

    pixels['genome_start1'] = chrom_cum_lengths[pixels['chrom1']] + pixels['start1']
    pixels['genome_start2'] = chrom_cum_lengths[pixels['chrom2']] + pixels['start2']

    # apply transform
    if (transform == 'default' and 'weight' in c.bins()) or transform == 'weight':
        pixels['balanced'] = (
            pixels['count'] * pixels['weight1'] * pixels['weight2']
        )
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        pixels['balanced'] = (
            pixels['count'] / pixels[transform + '1'] / pixels[transform + '2']
        )
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    else:
        return pixels[['genome_start1', 'genome_start2', 'count']]
def trans_expected(clr, chromosomes, chunksize=1000000, use_dask=False):
    """
    Aggregate the signal in interchromosomal blocks.
    Can be used as a background for contact frequencies between chromosomes.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    chromosomes : list of str
        List of chromosome names
    chunksize : int, optional
        Size of dask chunks
    use_dask : bool, optional
        Option to use dask

    Returns
    -------
    pandas.DataFrame that stores the total number of interactions between a
    pair of chromosomes: 'balanced.sum', the corresponding number of bins
    involved in the inter-chromosomal interactions: 'n_valid', and the ratio
    'balanced.avg = balanced.sum / n_valid', which is the actual value of
    expected for every interchromosomal pair.
    """

    def n_total_trans_elements(clr, chromosomes):
        n = len(chromosomes)
        x = [clr.extent(chrom)[1] - clr.extent(chrom)[0]
             for chrom in chromosomes]
        pairblock_list = []
        for i in range(n):
            for j in range(i + 1, n):
                # appending to the list of tuples
                pairblock_list.append(
                    (chromosomes[i], chromosomes[j], x[i] * x[j]))
        return pd.DataFrame(pairblock_list,
                            columns=['chrom1', 'chrom2', 'n_total'])

    def n_bad_trans_elements(clr, chromosomes):
        # bad bins are the ones where the weight vector is NaN:
        x = [np.sum(clr.bins()['weight']
                    .fetch(chrom)
                    .isnull()
                    .astype(int)
                    .values)
             for chrom in chromosomes]
        pairblock_list = []
        for i in range(len(x)):
            for j in range(i + 1, len(x)):
                # appending to the list of tuples
                pairblock_list.append(
                    (chromosomes[i], chromosomes[j], x[i] * x[j]))
        return pd.DataFrame(pairblock_list,
                            columns=['chrom1', 'chrom2', 'n_bad'])

    if use_dask:
        # pixels = daskify(clr.filename, clr.root + '/pixels', chunksize=chunksize)
        raise NotImplementedError(
            "To be implemented once dask supports MultiIndex")
    else:
        pixels = clr.pixels()[:]

    # getting pixels that belong to the trans-area,
    # defined by the list of chromosomes:
    pixels = cooler.annotate(pixels, clr.bins(), replace=False)
    pixels = pixels[
        (pixels.chrom1.isin(chromosomes))
        & (pixels.chrom2.isin(chromosomes))
        & (pixels.chrom1 != pixels.chrom2)
    ]
    pixels['balanced'] = pixels['count'] * pixels['weight1'] * pixels['weight2']

    ntot = (n_total_trans_elements(clr, chromosomes)
            .groupby(['chrom1', 'chrom2'])['n_total'].sum())
    nbad = (n_bad_trans_elements(clr, chromosomes)
            .groupby(['chrom1', 'chrom2'])['n_bad'].sum())
    trans_area = ntot - nbad
    trans_area.name = 'n_valid'

    # processing with use_dask=True is different:
    if use_dask:
        # trans_sum = pixels.groupby(['chrom1', 'chrom2'])['balanced'].sum().compute()
        pass
    else:
        trans_sum = pixels.groupby(['chrom1', 'chrom2'])['balanced'].sum()
    # for consistency with the cis_expected function:
    trans_sum.name = trans_sum.name + '.sum'

    # returning a DataFrame with a MultiIndex, that stores
    # pairs of 'balanced.sum' and 'n_valid' values for each
    # pair of chromosomes.
    dtable = pd.merge(
        trans_sum.to_frame(),
        trans_area.to_frame(),
        left_index=True,
        right_index=True)

    # the actual expected is balanced.sum/n_valid:
    dtable['balanced.avg'] = dtable['balanced.sum'] / dtable['n_valid']
    return dtable
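# --- Usage sketch (added for illustration, not from the original source) ---
# The expected value for a chromosome pair is the 'balanced.avg' entry of the
# returned MultiIndex DataFrame; the chromosome names are placeholders.
def _example_trans_expected(clr):
    dtable = trans_expected(clr, ["chr1", "chr2", "chr3"])
    return dtable.loc[("chr1", "chr2"), "balanced.avg"]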
def get_data(f, start_pos_1, end_pos_1, start_pos_2, end_pos_2,
             transform='default', resolution=None):
    """Get balanced pixel data.

    Args:
        f: h5py.File
            An HDF5 Group that contains the cooler for this resolution
        start_pos_1 (int): Absolute genomic start position along the first axis.
        end_pos_1 (int): Absolute genomic end position along the first axis.
        start_pos_2 (int): Absolute genomic start position along the second axis.
        end_pos_2 (int): Absolute genomic end position along the second axis.
        transform (str): Which normalization vector to apply: 'default',
            'weight', 'KR', 'VC' or 'VC_SQRT'.
        resolution (int): Resolution of the cooler (currently unused).

    Returns:
        DataFrame: Annotated cooler pixels.
    """
    c = cooler.Cooler(f)

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths, chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    matrix = c.matrix(as_pixels=True, balance=False, max_chunk=np.inf)

    if i0 >= matrix.shape[0] or j0 >= matrix.shape[1]:
        # query beyond the bounds of the matrix: return an empty result
        i0, i1, j0, j1 = 0, 0, 0, 0
        return (pd.DataFrame(
            columns=['genome_start1', 'genome_start2', 'balanced']),
            (pd.DataFrame({
                'genome_start': [],
                'genome_end': [],
                'weight': []
            }),
             pd.DataFrame({
                 'genome_start': [],
                 'genome_end': [],
                 'weight': []
             })))
    else:
        # limit the range of the query to be within bounds
        i1 = min(i1, matrix.shape[0] - 1)
        j1 = min(j1, matrix.shape[1] - 1)

    pixels = matrix[i0:i1 + 1, j0:j1 + 1]

    # select bin columns to extract
    cols = ['chrom', 'start', 'end']
    if (transform == 'default' and 'weight' in c.bins()) or transform == 'weight':
        cols.append('weight')
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        cols.append(transform)

    bins = c.bins(convert_enum=False)[cols]
    pixels = cooler.annotate(pixels, bins)

    pixels['genome_start1'] = chrom_cum_lengths[pixels['chrom1']] + pixels['start1']
    pixels['genome_start2'] = chrom_cum_lengths[pixels['chrom2']] + pixels['start2']

    bins1 = bins[i0:i1 + 1]
    bins2 = bins[j0:j1 + 1]

    bins1['genome_start'] = chrom_cum_lengths[bins1['chrom']] + bins1['start']
    bins2['genome_start'] = chrom_cum_lengths[bins2['chrom']] + bins2['start']
    bins1['genome_end'] = chrom_cum_lengths[bins1['chrom']] + bins1['end']
    bins2['genome_end'] = chrom_cum_lengths[bins2['chrom']] + bins2['end']

    # apply transform
    if (transform == 'default' and 'weight' in c.bins()) or transform == 'weight':
        pixels['balanced'] = (
            pixels['count'] * pixels['weight1'] * pixels['weight2']
        )
        return (pixels[['genome_start1', 'genome_start2', 'balanced']],
                (bins1, bins2))
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        pixels['balanced'] = (
            pixels['count'] / pixels[transform + '1'] / pixels[transform + '2']
        )
        bins1['weight'] = bins1[transform]
        bins2['weight'] = bins2[transform]
        return (pixels[['genome_start1', 'genome_start2', 'balanced']],
                (bins1, bins2))
    else:
        return (pixels[['genome_start1', 'genome_start2', 'count']],
                (None, None))
def get_frag(c: cooler.api.Cooler,
             resolution: int,
             offsets: pd.core.series.Series,
             chrom1: str,
             start1: int,
             end1: int,
             chrom2: str,
             start2: int,
             end2: int,
             width: int = 22,
             height: int = -1,
             padding: int = 10,
             normalize: bool = True,
             balanced: bool = True,
             percentile: float = 100.0,
             ignore_diags: int = 0,
             no_normalize: bool = False) -> np.ndarray:
    """
    Retrieves a matrix fragment.

    Args:
        c: Cooler object.
        resolution: Bin size of the cooler in base pairs.
        offsets: Pandas Series of chromosome offsets in bins.
        chrom1: Chromosome 1. E.g.: `1` or `chr1`.
        start1: First start position in base pairs relative to `chrom1`.
        end1: First end position in base pairs relative to `chrom1`.
        chrom2: Chromosome 2. E.g.: `1` or `chr1`.
        start2: Second start position in base pairs relative to `chrom2`.
        end2: Second end position in base pairs relative to `chrom2`.
        width: Width of the fragment in pixels.
        height: Height of the fragment in pixels. If `-1`, `height` will
            equal `width`. Defaults to `-1`.
        padding: Percental padding related to the dimension of the fragment.
            E.g., 10 = 10% padding (5% per side). Defaults to `10`.
        normalize: If `True` the fragment will be normalized to [0, 1].
            Defaults to `True`.
        balanced: If `True` the fragment will be balanced using Cooler.
            Defaults to `True`.
        percentile: Percentile clip. E.g., for 99 the maximum will be capped
            at the 99th percentile. Defaults to `100.0`.
        ignore_diags: Number of diagonals to be ignored, i.e., set to 0.
            Defaults to `0`.
        no_normalize: If `True` the returned matrix is not normalized.
            Defaults to `False`.

    Returns:
        np.ndarray: The extracted matrix fragment.
    """
    if height == -1:
        height = width

    # Restrict padding to be [0, 100]%
    padding = min(100, max(0, padding)) / 100

    try:
        offset1 = offsets[chrom1]
        offset2 = offsets[chrom2]
    except KeyError:
        # One more try before we fail miserably
        offset1 = offsets['chr{}'.format(chrom1)]
        offset2 = offsets['chr{}'.format(chrom2)]

    start_bin1 = offset1 + int(round(float(start1) / resolution))
    end_bin1 = offset1 + int(round(float(end1) / resolution)) + 1
    start_bin2 = offset2 + int(round(float(start2) / resolution))
    end_bin2 = offset2 + int(round(float(end2) / resolution)) + 1

    # Apply percentile padding
    padding1 = int(round(((end_bin1 - start_bin1) / 2) * padding))
    padding2 = int(round(((end_bin2 - start_bin2) / 2) * padding))
    start_bin1 -= padding1
    start_bin2 -= padding2
    end_bin1 += padding1
    end_bin2 += padding2

    # Get the size of the region
    dim1 = end_bin1 - start_bin1
    dim2 = end_bin2 - start_bin2

    # Get additional absolute padding if needed
    padding1 = 0
    if dim1 < width:
        padding1 = int((width - dim1) / 2)
        start_bin1 -= padding1
        end_bin1 += padding1

    padding2 = 0
    if dim2 < height:
        padding2 = int((height - dim2) / 2)
        start_bin2 -= padding2
        end_bin2 += padding2

    # In case the final dimension does not match the desired dimension we
    # increase the end bin. This can be caused when the padding is not
    # divisible by 2, since the padding is rounded to the nearest integer.
    abs_dim1 = abs(start_bin1 - end_bin1)
    if abs_dim1 < width:
        end_bin1 += width - abs_dim1
        abs_dim1 = width

    abs_dim2 = abs(start_bin2 - end_bin2)
    if abs_dim2 < height:
        end_bin2 += height - abs_dim2
        abs_dim2 = height

    # Maximum width / height is 512
    if abs_dim1 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()
    if abs_dim2 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()

    # Finally, adjust for negative values. Since relative bin IDs are
    # adjusted by the start this will lead to a white offset.
    real_start_bin1 = start_bin1 if start_bin1 >= 0 else 0
    real_start_bin2 = start_bin2 if start_bin2 >= 0 else 0

    # Get the data
    data = c.matrix(as_pixels=True, balance=False,
                    max_chunk=np.inf)[real_start_bin1:end_bin1,
                                      real_start_bin2:end_bin2]

    # Annotate pixels for balancing
    bins = c.bins(convert_enum=False)[['weight']]
    data = cooler.annotate(data, bins, replace=False)

    # Calculate relative bin IDs
    rel_bin1 = np.add(data['bin1_id'].values, -start_bin1)
    rel_bin2 = np.add(data['bin2_id'].values, -start_bin2)

    # Balance counts
    if balanced:
        values = data['count'].values.astype(np.float32)
        values *= data['weight1'].values * data['weight2'].values
    else:
        values = data['count'].values

    # Get pixel IDs for the upper triangle
    idx1 = np.add(np.multiply(rel_bin1, abs_dim1), rel_bin2)

    # Mirror matrix
    idx2_1 = np.add(data['bin2_id'].values, -start_bin1)
    idx2_2 = np.add(data['bin1_id'].values, -start_bin2)
    idx2 = np.add(np.multiply(idx2_1, abs_dim1), idx2_2)
    validBins = np.where((idx2_1 < abs_dim1) & (idx2_2 >= 0))

    # Ignore diagonals
    diags_start_row = None
    if ignore_diags > 0:
        try:
            diags_start_idx = np.min(
                np.where(data['bin1_id'].values == data['bin2_id'].values))
            diags_start_row = (rel_bin1[diags_start_idx]
                               - rel_bin2[diags_start_idx])
        except ValueError:
            pass

    # Copy pixel values onto the final array
    frag_len = abs_dim1 * abs_dim2
    frag = np.zeros(frag_len, dtype=np.float32)
    # Make sure we're within the bounds
    idx1_f = np.where(idx1 < frag_len)
    frag[idx1[idx1_f]] = values[idx1_f]
    frag[idx2[validBins]] = values[validBins]
    frag = frag.reshape((abs_dim1, abs_dim2))

    # Store low quality bins
    low_quality_bins = np.where(np.isnan(frag))

    # Assign 0 for now to avoid influencing the max values
    frag[low_quality_bins] = 0

    # Scale fragment down if needed
    scaled = False
    scale_x = width / frag.shape[0]
    if frag.shape[0] > width or frag.shape[1] > height:
        scaledFrag = np.zeros((width, height), float)
        frag = scaledFrag + zoomArray(frag, scaledFrag.shape, order=1)
        scaled = True

    # Normalize by minimum
    if not no_normalize:
        min_val = np.min(frag)
        frag -= min_val

    ignored_idx = None

    # Remove diagonals
    if ignore_diags > 0 and diags_start_row is not None:
        if width == height:
            scaled_row = int(np.rint(diags_start_row / scale_x))

            idx = np.diag_indices(width)
            scaled_idx = (idx if scaled_row == 0
                          else [idx[0][scaled_row:], idx[0][:-scaled_row]])

            for i in range(ignore_diags):
                # First set all cells to be ignored to `-1` so that we can
                # easily query for them later.
                if i == 0:
                    frag[scaled_idx] = -1
                else:
                    dist_to_diag = scaled_row - i
                    dist_neg = min(0, dist_to_diag)
                    off = 0 if dist_to_diag >= 0 else i - scaled_row

                    # Above diagonal
                    frag[((scaled_idx[0] - i)[off:],
                          (scaled_idx[1])[off:])] = -1

                    # Extra cutoff at the bottom right
                    frag[(range(
                        scaled_idx[0][-1] - i,
                        scaled_idx[0][-1] + 1 + dist_neg,
                    ),
                          range(scaled_idx[1][-1],
                                scaled_idx[1][-1] + i + 1 + dist_neg))] = -1

                    # Below diagonal
                    frag[((scaled_idx[0] + i)[:-i],
                          (scaled_idx[1])[:-i])] = -1

            # Save the final selection of ignored cells for fast access
            # later and set those values to `0` now.
            ignored_idx = np.where(frag == -1)
            frag[ignored_idx] = 0
        else:
            logger.warning(
                'Ignoring the diagonal is only supported for square fragments')

    # Cap by percentile
    max_val = np.percentile(frag, percentile)
    frag = np.clip(frag, 0, max_val)

    # Normalize by maximum
    if not no_normalize and max_val > 0:
        frag /= max_val

    # Set the ignored diagonal to the maximum
    if ignored_idx is not None:
        frag[ignored_idx] = 1.0

    if not scaled:
        # Recover low quality bins
        frag[low_quality_bins] = -1

    return frag
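# --- Usage sketch (added for illustration, not from the original source) ---
# Hedged example of calling get_frag: bin offsets are derived from the
# cooler's own chromosome offsets, and `resolution` must match the cooler's
# binsize. The coordinates are placeholders.
def _example_get_frag(c):
    offsets = pd.Series({chrom: c.offset(chrom) for chrom in c.chromnames})
    return get_frag(
        c, c.binsize, offsets,
        chrom1="chr1", start1=1_000_000, end1=2_000_000,
        chrom2="chr1", start2=3_000_000, end2=4_000_000,
        width=22, height=-1, padding=10)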
def insul_diamond(
    pixel_query,
    bins,
    window=10,
    ignore_diags=2,
    norm_by_median=True,
    clr_weight_name="weight",
):
    """
    Calculates the insulation score of a Hi-C interaction matrix.

    Parameters
    ----------
    pixel_query : RangeQuery object <TODO: update description>
        A table of Hi-C interactions. Must follow the Cooler columnar format:
        bin1_id, bin2_id, count, balanced (optional).
    bins : pandas.DataFrame
        A table of bins; used to determine the span of the matrix and the
        locations of bad bins.
    window : int
        The width (in bins) of the diamond window to calculate the insulation
        score.
    ignore_diags : int
        If > 0, the interactions at separations < `ignore_diags` are ignored
        when calculating the insulation score. Typically, the first few
        diagonals of the Hi-C map should be ignored due to contamination with
        Hi-C artifacts.
    norm_by_median : bool
        If True, normalize the insulation score by its NaN-median.
    clr_weight_name : str or None
        Name of the balancing weight column from the cooler to use.
        Using raw unbalanced data is not supported for insulation.
    """
    lo_bin_id = bins.index.min()
    hi_bin_id = bins.index.max() + 1
    N = hi_bin_id - lo_bin_id
    sum_counts = np.zeros(N)
    sum_balanced = np.zeros(N)

    if clr_weight_name is None:
        # define n_pixels
        n_pixels = get_n_pixels(np.repeat(False, len(bins)),
                                window=window,
                                ignore_diags=ignore_diags)
    else:
        # calculate n_pixels
        n_pixels = get_n_pixels(
            bins[clr_weight_name].isnull().values,
            window=window,
            ignore_diags=ignore_diags,
        )
        # define transform - balanced and raw ('count') for now
        weight1 = clr_weight_name + "1"
        weight2 = clr_weight_name + "2"
        transform = lambda p: p["count"] * p[weight1] * p[weight2]

    for chunk_dict in pixel_query.read_chunked():
        chunk = pd.DataFrame(chunk_dict,
                             columns=["bin1_id", "bin2_id", "count"])
        diag_pixels = chunk[chunk.bin2_id - chunk.bin1_id <= (window - 1) * 2]

        if clr_weight_name:
            diag_pixels = cooler.annotate(diag_pixels,
                                          bins[[clr_weight_name]])
            diag_pixels["balanced"] = transform(diag_pixels)
            valid_pixel_mask = ~diag_pixels["balanced"].isnull().values

        i = diag_pixels.bin1_id.values - lo_bin_id
        j = diag_pixels.bin2_id.values - lo_bin_id

        for i_shift in range(0, window):
            for j_shift in range(0, window):
                if i_shift + j_shift < ignore_diags:
                    continue

                mask = ((i + i_shift == j - j_shift)
                        & (i + i_shift < N)
                        & (j - j_shift >= 0))

                sum_counts += np.bincount(i[mask] + i_shift,
                                          diag_pixels["count"].values[mask],
                                          minlength=N)

                if clr_weight_name:
                    sum_balanced += np.bincount(
                        i[mask & valid_pixel_mask] + i_shift,
                        diag_pixels["balanced"].values[mask & valid_pixel_mask],
                        minlength=N,
                    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if clr_weight_name:
            score = sum_balanced / n_pixels
        else:
            score = sum_counts / n_pixels
        if norm_by_median:
            score /= np.nanmedian(score)

    return score, n_pixels, sum_balanced, sum_counts
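# --- Adapter sketch (added for illustration, not from the original source) ---
# insul_diamond only requires pixel_query to expose read_chunked() yielding
# dicts of bin1_id/bin2_id/count columns, so a thin wrapper over clr.pixels()
# suffices for a small cooler. This is an assumed stand-in, not the original
# RangeQuery implementation.
# e.g. insul_diamond(_PixelQuerySketch(clr), clr.bins()[:], window=10)
class _PixelQuerySketch:
    def __init__(self, clr, chunksize=10_000_000):
        self._clr = clr
        self._chunksize = chunksize

    def read_chunked(self):
        n = len(self._clr.pixels())
        for lo in range(0, n, self._chunksize):
            chunk = self._clr.pixels()[lo:min(lo + self._chunksize, n)]
            yield {col: chunk[col].values
                   for col in ("bin1_id", "bin2_id", "count")}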
def dots(
    cool_path,
    expected_path,
    view,
    clr_weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap that are not larger than max_loci_separation.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The path to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH,
    and therefore these chromosomes must be a subset of chromosomes referred
    to in COOL_PATH. Also chromosomes referred to in EXPECTED_PATH must be
    non-trivial, i.e., contain non-NaN signal. Thus, make sure to prune your
    EXPECTED_PATH before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis
    contacts: 'region1/2', 'dist', 'n_valid', value_name. value_name is
    controlled using options. A header must be present in the file.
    """
    clr = cooler.Cooler(cool_path)
    expected_path, expected_value_col = expected_path

    #### Generate viewframes ####
    # 1: cooler_view_df. Generate viewframe from clr.chromsizes:
    cooler_view_df = make_cooler_view(clr)

    # 2: view_df. Define the global view for calling dots,
    # using the input "view" BED file or all chromosomes:
    if view is None:
        view_df = cooler_view_df
    else:
        view_df = read_viewframe_from_file(view, clr, check_sorting=True)

    #### Read expected: ####
    expected_summary_cols = [
        expected_value_col,
    ]
    expected = read_expected_from_file(
        expected_path,
        contact_type="cis",
        expected_value_cols=expected_summary_cols,
        verify_view=view_df,
        verify_cooler=clr,
    )
    # add checks to make sure cis-expected is symmetric

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with bases-units for now, so suppress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = api.dotfinder.recommend_kernel_params(binsize)
        logging.info(
            f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}"
        )
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        if not w > p:
            raise ValueError(f"Wrong inner/outer kernel parameters w={w}, p={p}")
        logging.info(f"Using kernel parameters w={w}, p={p} provided by user")

    # once kernel parameters are set up, check max_nans_tolerated
    # to make sure kernel footprints overlapping 1 side with the
    # NaN-filled row/column are not "allowed";
    # this requires dynamic adjustment for the "shrinking donut"
    if not max_nans_tolerated <= 2 * w:
        raise ValueError("Too many NaNs allowed!")
    # may lead to scoring the same pixel twice, i.e. duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: api.dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        api.dotfinder.heatmap_tiles_generator_diag(clr, view_df, w,
                                                   tile_size_bins,
                                                   loci_separation_bins))

    # lambda-chunking edges ...
    if not 40 <= num_lambda_chunks <= 50:
        raise ValueError("Incompatible num_lambda_chunks")
    base = 2 ** (1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,
        ),
        [np.inf],
    ))

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = api.dotfinder.scoring_and_histogramming_step(
        clr,
        expected.set_index(["region1", "region2", "dist"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        logging.info("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = api.dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = api.dotfinder.scoring_and_extraction_step(
        clr,
        expected.set_index(["region1", "region2", "dist"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix),
                op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        logging.info(
            f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        logging.info("preparing to extract needed q-values ...")

    filtered_pixels_qvals = api.dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using an annotated DataFrame of filtered
    # pixels. Why? Because clustering has to be done independently for
    # every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated,
                                               view_df)
    # consider resetting the index here
    centroids = api.dotfinder.clustering_step(
        filtered_pixels_annotated,
        view_df["name"],
        dots_clustering_radius,
        verbose,
    )

    # 4b. filter by enrichment and qval
    postprocessed_calls = api.dotfinder.thresholding_step(centroids)

    # Final post-processed result
    if out_prefix is not None:
        postprocessed_fname = op.join(
            op.dirname(out_prefix),
            op.basename(out_prefix) + ".postproc.bedpe")
        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)
def cis_expected(clr, regions, field="balanced", chunksize=1000000,
                 use_dask=True, ignore_diags=2):
    """
    Compute the mean signal along diagonals of one or more regional blocks of
    intra-chromosomal contact matrices. Typically used as a background model
    for contact frequencies on the same polymer chain.

    Parameters
    ----------
    clr : cooler.Cooler
        Input Cooler
    regions : iterable of genomic regions or pairs of regions
        Iterable of genomic region strings or 3-tuples, or 5-tuples for pairs
        of regions
    field : str, optional
        Which values of the contact matrix to aggregate. This is currently a
        no-op. *FIXME*
    chunksize : int, optional
        Size of dask chunks.

    Returns
    -------
    Dataframe of diagonal statistics, indexed by region and diagonal number
    """
    warnings.warn(
        "`cooltools.expected.cis_expected()` is deprecated in 0.3.2, will be "
        "removed subsequently. Use `cooltools.expected.diagsum()` and "
        "`cooltools.expected.diagsum_asymm()` instead.",
        category=FutureWarning,
        stacklevel=2,
    )

    def _bg2slice_frame(bg2, region1, region2):
        """
        Slice a dataframe with columns ['chrom1', 'start1', 'end1', 'chrom2',
        'start2', 'end2']. Assumes no proper nesting of intervals.

        [Warning] this function does not follow the same logic as
        cooler.matrix.fetch when start/end are at the edges of the bins.
        """
        chrom1, start1, end1 = region1
        chrom2, start2, end2 = region2
        if end1 is None:
            end1 = np.inf
        if end2 is None:
            end2 = np.inf
        out = bg2[(bg2["chrom1"] == chrom1)
                  & (bg2["start1"] >= start1)
                  & (bg2["end1"] < end1)
                  & (bg2["chrom2"] == chrom2)
                  & (bg2["start2"] >= start2)
                  & (bg2["end2"] < end2)]
        return out

    import dask.dataframe as dd
    from cooler.sandbox.dask import read_table

    if use_dask:
        pixels = read_table(clr.uri + "/pixels", chunksize=chunksize)
    else:
        pixels = clr.pixels()[:]
    pixels = cooler.annotate(pixels, clr.bins(), replace=False)
    pixels = pixels[pixels.chrom1 == pixels.chrom2]

    named_regions = False
    if isinstance(regions, pd.DataFrame):
        named_regions = True
        chroms = regions["chrom"].values
        names = regions["name"].values
        regions = regions[["chrom", "start", "end"]].to_records(index=False)
    else:
        chroms = [region[0] for region in regions]
        names = chroms

    cis_maps = {chrom: pixels[pixels.chrom1 == chrom] for chrom in chroms}

    diag_tables = []
    data_sums = []
    for region in regions:
        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError("Regions must be sequences of length 1, 3 or 5")

        bins = clr.bins().fetch(chrom).reset_index(drop=True)
        bad_mask = np.array(bins["weight"].isnull())
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        dt = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])

        sel = _bg2slice_frame(cis_maps[chrom],
                              (chrom, start1, end1),
                              (chrom, start2, end2)).copy()
        sel["diag"] = sel["bin2_id"] - sel["bin1_id"]
        sel["balanced"] = sel["count"] * sel["weight1"] * sel["weight2"]
        agg = _sum_diagonals(sel, field)

        diag_tables.append(dt)
        data_sums.append(agg)

    # run dask scheduler
    if len(data_sums) and isinstance(data_sums[0], dd.Series):
        data_sums = dd.compute(*data_sums)

    # append to tables
    for dt, agg in zip(diag_tables, data_sums):
        dt[agg.name] = 0
        dt[agg.name] = dt[agg.name].add(agg, fill_value=0)
        dt.iloc[:ignore_diags, dt.columns.get_loc(agg.name)] = np.nan

    # merge and return
    if named_regions:
        dtable = pd.concat(diag_tables,
                           keys=zip(names, chroms),
                           names=["name", "chrom"])
    else:
        dtable = pd.concat(diag_tables, keys=list(chroms), names=["chrom"])

    # the actual expected is balanced.sum/n_valid:
    dtable["balanced.avg"] = dtable["balanced.sum"] / dtable["n_valid"]
    return dtable
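# --- Usage sketch (added for illustration, not from the original source) ---
# Per the docstring, regions may be 1-tuples (whole chromosomes), 3-tuples,
# or 5-tuples (pairs of intervals on one chromosome); the coordinates below
# are placeholders.
def _example_cis_expected(clr):
    regions = [("chr1", 0, 50_000_000), ("chr2", 0, 50_000_000)]
    dtable = cis_expected(clr, regions, use_dask=False)
    return dtable["balanced.avg"]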
def score_tile(tile_cij, clr, cis_exp, exp_v_name, bal_v_name, kernels,
               nans_tolerated, band_to_cover, verbose):
    """
    The main working function that, given a tile of a heatmap, applies
    kernels to perform convolution to calculate the locally-adjusted expected
    and then calculates a p-value for every meaningful pixel against these
    locally-adjusted expected values.

    Parameters
    ----------
    tile_cij : tuple
        Tuple of 3: chromosome name, tile span row-wise, tile span
        column-wise: (chrom, tile_i, tile_j), where tile_i = (start_i, end_i)
        and tile_j = (start_j, end_j).
    clr : cooler
        Cooler object to use to extract Hi-C heatmap data.
    cis_exp : pandas.DataFrame
        DataFrame with 1-dimensional expected, indexed with 'chrom' and
        'diag'.
    exp_v_name : str
        Name of a value column in the expected DataFrame.
    bal_v_name : str
        Name of a value column with balancing weights in a cooler.bins()
        DataFrame. Typically 'weight'.
    kernels : dict
        A dictionary with keys being kernel names and values being ndarrays
        representing those kernels.
    nans_tolerated : int
        Number of NaNs tolerated in the footprint of every kernel.
    band_to_cover : int
        Results will be stored only for pixels connecting loci closer than
        'band_to_cover'.
    verbose : bool
        Enable verbose output.

    Returns
    -------
    res_df : pandas.DataFrame
        Annotated pixels with the calculated locally-adjusted expected for
        every kernel, observed values, precalculated p-values, and the number
        of NaNs in the footprint of every kernel, all in the form of an
        annotated pixels DataFrame for eligible pixels of a given tile.
    """
    # unpack tile's coordinates
    chrom, tilei, tilej = tile_cij
    origin = (tilei[0], tilej[0])

    # we have to do it for every tile, because
    # chrom is not known a priori (maybe move outside):
    lazy_exp = LazyToeplitz(cis_exp.loc[chrom][exp_v_name].values)

    # RAW observed matrix slice:
    observed = clr.matrix(balance=False)[slice(*tilei), slice(*tilej)]
    # expected as a rectangular tile:
    expected = lazy_exp[slice(*tilei), slice(*tilej)]
    # slice of balance_weight for row-span and column-span:
    bal_weight_i = clr.bins()[slice(*tilei)][bal_v_name].values
    bal_weight_j = clr.bins()[slice(*tilej)][bal_v_name].values

    # do the convolutions
    result = dotfinder.get_adjusted_expected_tile_some_nans(
        origin=origin,
        observed=observed,
        expected=expected,
        bal_weight=(bal_weight_i, bal_weight_j),
        kernels=kernels,
        verbose=verbose)

    # Post-processing filters
    # (1) exclude pixels that connect loci further than 'band_to_cover' apart:
    is_inside_band = result["bin1_id"] > (result["bin2_id"] - band_to_cover)
    # (2) identify pixels that pass the number-of-NaNs compliance test for
    #     ALL kernels:
    does_comply_nans = np.all(
        result[["la_exp." + k + ".nnans" for k in kernels]] < nans_tolerated,
        axis=1)
    # so, selecting inside-band and nNaNs-compliant results:
    # (consider dropping the index, maybe?)
    res_df = result[is_inside_band & does_comply_nans].reset_index(drop=True)

    # do Poisson tests:
    get_pval = lambda la_exp: 1.0 - poisson.cdf(res_df["obs.raw"], la_exp)
    for k in kernels:
        res_df["la_exp." + k + ".pval"] = get_pval(res_df["la_exp." + k + ".value"])

    # annotate and return
    return cooler.annotate(res_df.reset_index(drop=True), clr.bins()[:])
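# --- Fan-out sketch (added for illustration, not from the original source) ---
# score_tile is written to be mapped over many tiles; a multiprocessing
# fan-out like the hedged sketch below is one plausible driver (the actual
# surrounding pipeline is not shown here). Parameter values are placeholders;
# band_to_cover is in bins.
def _example_score_tiles(clr, cis_exp, tiles, kernels, nproc=4):
    from functools import partial
    from multiprocessing import Pool
    job = partial(score_tile, clr=clr, cis_exp=cis_exp,
                  exp_v_name="balanced.avg", bal_v_name="weight",
                  kernels=kernels, nans_tolerated=1,
                  band_to_cover=200, verbose=False)
    with Pool(nproc) as pool:
        results = pool.map(job, tiles)
    return pd.concat(results, ignore_index=True)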
def cis_expected(clr, regions, field='balanced', chunksize=1000000,
                 use_dask=True, ignore_diags=2):
    """
    Compute the mean signal along diagonals of one or more regional blocks of
    intra-chromosomal contact matrices. Typically used as a background model
    for contact frequencies on the same polymer chain.

    Parameters
    ----------
    clr : cooler.Cooler
        Input Cooler
    regions : iterable of genomic regions or pairs of regions
        Iterable of genomic region strings or 3-tuples, or 5-tuples for pairs
        of regions
    field : str, optional
        Which values of the contact matrix to aggregate. This is currently a
        no-op. *FIXME*
    chunksize : int, optional
        Size of dask chunks.

    Returns
    -------
    Dataframe of diagonal statistics, indexed by region and diagonal number
    """
    if use_dask:
        pixels = daskify(clr.filename, clr.root + '/pixels',
                         chunksize=chunksize)
    else:
        pixels = clr.pixels()[:]
    pixels = cooler.annotate(pixels, clr.bins(), replace=False)
    pixels = pixels[pixels.chrom1 == pixels.chrom2]

    named_regions = False
    if isinstance(regions, pd.DataFrame):
        named_regions = True
        chroms = regions['chrom'].values
        names = regions['name'].values
        regions = regions[['chrom', 'start', 'end']].to_records(index=False)
    else:
        chroms = [region[0] for region in regions]
        names = chroms

    cis_maps = {chrom: pixels[pixels.chrom1 == chrom] for chrom in chroms}

    diag_tables = []
    data_sums = []
    for region in regions:
        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError("Regions must be sequences of length 1, 3 or 5")

        bins = clr.bins().fetch(chrom).reset_index(drop=True)
        bad_mask = np.array(bins['weight'].isnull())
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        dt = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])

        sel = bg2slice_frame(
            cis_maps[chrom],
            (chrom, start1, end1),
            (chrom, start2, end2)
        ).copy()
        sel['diag'] = sel['bin2_id'] - sel['bin1_id']
        sel['balanced'] = sel['count'] * sel['weight1'] * sel['weight2']
        agg = _sum_diagonals(sel, field)

        diag_tables.append(dt)
        data_sums.append(agg)

    # run dask scheduler
    if len(data_sums) and isinstance(data_sums[0], dd.Series):
        data_sums = dd.compute(*data_sums)

    # append to tables
    for dt, agg in zip(diag_tables, data_sums):
        dt[agg.name] = 0
        dt[agg.name] = dt[agg.name].add(agg, fill_value=0)
        dt.iloc[:ignore_diags, dt.columns.get_loc(agg.name)] = np.nan

    # merge and return
    if named_regions:
        dtable = pd.concat(diag_tables,
                           keys=zip(names, chroms),
                           names=['name', 'chrom'])
    else:
        dtable = pd.concat(diag_tables,
                           keys=list(chroms),
                           names=['chrom'])

    # the actual expected is balanced.sum/n_valid:
    dtable['balanced.avg'] = dtable['balanced.sum'] / dtable['n_valid']
    return dtable
def insul_diamond(pixel_query, bins, window=10, ignore_diags=2,
                  norm_by_median=True):
    """
    Calculates the insulation score of a Hi-C interaction matrix.

    Parameters
    ----------
    pixel_query : RangeQuery object <TODO: update description>
        A table of Hi-C interactions. Must follow the Cooler columnar format:
        bin1_id, bin2_id, count, balanced (optional).
    bins : pandas.DataFrame
        A table of bins; used to determine the span of the matrix and the
        locations of bad bins.
    window : int
        The width (in bins) of the diamond window to calculate the insulation
        score.
    ignore_diags : int
        If > 0, the interactions at separations < `ignore_diags` are ignored
        when calculating the insulation score. Typically, the first few
        diagonals of the Hi-C map should be ignored due to contamination with
        Hi-C artifacts.
    norm_by_median : bool
        If True, normalize the insulation score by its NaN-median.
    """
    lo_bin_id = bins.index.min()
    hi_bin_id = bins.index.max() + 1
    N = hi_bin_id - lo_bin_id
    sum_counts = np.zeros(N)
    sum_balanced = np.zeros(N)

    n_pixels = get_n_pixels(bins.weight.isnull().values,
                            window=window,
                            ignore_diags=ignore_diags)

    for chunk_dict in pixel_query.read_chunked():
        chunk = pd.DataFrame(chunk_dict,
                             columns=["bin1_id", "bin2_id", "count"])
        diag_pixels = chunk[chunk.bin2_id - chunk.bin1_id <= (window - 1) * 2]

        diag_pixels = cooler.annotate(diag_pixels, bins[["weight"]])
        diag_pixels["balanced"] = (diag_pixels["count"]
                                   * diag_pixels["weight1"]
                                   * diag_pixels["weight2"])
        valid_pixel_mask = ~diag_pixels["balanced"].isnull().values

        i = diag_pixels.bin1_id.values - lo_bin_id
        j = diag_pixels.bin2_id.values - lo_bin_id

        for i_shift in range(0, window):
            for j_shift in range(0, window):
                if i_shift + j_shift < ignore_diags:
                    continue

                mask = ((i + i_shift == j - j_shift)
                        & (i + i_shift < N)
                        & (j - j_shift >= 0))

                sum_counts += np.bincount(i[mask] + i_shift,
                                          diag_pixels["count"].values[mask],
                                          minlength=N)

                sum_balanced += np.bincount(
                    i[mask & valid_pixel_mask] + i_shift,
                    diag_pixels["balanced"].values[mask & valid_pixel_mask],
                    minlength=N,
                )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        score = sum_balanced / n_pixels
        if norm_by_median:
            score /= np.nanmedian(score)

    return score, n_pixels, sum_balanced, sum_counts
def get_data(
    f,
    start_pos_1,
    end_pos_1,
    start_pos_2,
    end_pos_2,
    transform="default",
    resolution=None,
):
    """Get balanced pixel data.

    Args:
        f: h5py.File
            An HDF5 Group that contains the cooler for this resolution
        start_pos_1 (int): Absolute genomic start position along the first axis.
        end_pos_1 (int): Absolute genomic end position along the first axis.
        start_pos_2 (int): Absolute genomic start position along the second axis.
        end_pos_2 (int): Absolute genomic end position along the second axis.
        transform (str): Which normalization vector to apply: 'default',
            'weight', 'KR', 'VC' or 'VC_SQRT'.
        resolution (int): Resolution of the cooler (currently unused).

    Returns:
        DataFrame: Annotated cooler pixels.
    """
    c = cooler.Cooler(f)

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths, chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    matrix = c.matrix(as_pixels=True, balance=False, max_chunk=np.inf)

    if i0 >= matrix.shape[0] or j0 >= matrix.shape[1]:
        # query beyond the bounds of the matrix: return an empty result
        i0, i1, j0, j1 = 0, 0, 0, 0
        return (
            pd.DataFrame(columns=["genome_start1", "genome_start2", "balanced"]),
            (
                pd.DataFrame({"genome_start": [], "genome_end": [], "weight": []}),
                pd.DataFrame({"genome_start": [], "genome_end": [], "weight": []}),
            ),
        )
    else:
        # limit the range of the query to be within bounds
        i1 = min(i1, matrix.shape[0] - 1)
        j1 = min(j1, matrix.shape[1] - 1)

    pixels = matrix[i0:i1 + 1, j0:j1 + 1]

    # select bin columns to extract
    cols = ["chrom", "start", "end"]
    if (transform == "default" and "weight" in c.bins()) or transform == "weight":
        cols.append("weight")
    elif transform in ("KR", "VC", "VC_SQRT"):
        cols.append(transform)

    bins = c.bins(convert_enum=False)[cols]
    pixels = cooler.annotate(pixels, bins)

    pixels["genome_start1"] = chrom_cum_lengths[pixels["chrom1"]] + pixels["start1"]
    pixels["genome_start2"] = chrom_cum_lengths[pixels["chrom2"]] + pixels["start2"]

    bins1 = bins[i0:i1 + 1]
    bins2 = bins[j0:j1 + 1]

    bins1["genome_start"] = chrom_cum_lengths[bins1["chrom"]] + bins1["start"]
    bins2["genome_start"] = chrom_cum_lengths[bins2["chrom"]] + bins2["start"]
    bins1["genome_end"] = chrom_cum_lengths[bins1["chrom"]] + bins1["end"]
    bins2["genome_end"] = chrom_cum_lengths[bins2["chrom"]] + bins2["end"]

    # apply transform
    if (transform == "default" and "weight" in c.bins()) or transform == "weight":
        pixels["balanced"] = pixels["count"] * pixels["weight1"] * pixels["weight2"]
        return (pixels[["genome_start1", "genome_start2", "balanced"]],
                (bins1, bins2))
    elif transform in ("KR", "VC", "VC_SQRT"):
        pixels["balanced"] = (pixels["count"]
                              / pixels[transform + "1"]
                              / pixels[transform + "2"])
        bins1["weight"] = bins1[transform]
        bins2["weight"] = bins2[transform]
        return (pixels[["genome_start1", "genome_start2", "balanced"]],
                (bins1, bins2))
    else:
        return (pixels[["genome_start1", "genome_start2", "count"]],
                (None, None))
def call_dots(
    cool_path,
    expected_path,
    expected_name,
    weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    output_scores,
    output_hists,
    output_calls,
    score_dump_mode,
    temp_dir,
    no_delete_temp,
):
    """
    Call dots on a Hi-C heatmap that are not larger than max_loci_separation.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The path to a tsv-like file with expected signal.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH,
    and therefore these chromosomes must be a subset of chromosomes referred
    to in COOL_PATH. Also chromosomes referred to in EXPECTED_PATH must be
    non-trivial, i.e., contain non-NaN signal. Thus, make sure to prune your
    EXPECTED_PATH before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis
    contacts: 'chrom', 'diag', 'n_valid', value_name. value_name is
    controlled using options. A header must be present in the file.
    """
    clr = cooler.Cooler(cool_path)

    expected_columns = ["chrom", "diag", "n_valid", expected_name]
    expected_index = ["chrom", "diag"]
    expected_dtypes = {
        "chrom": str,
        "diag": np.int64,
        "n_valid": np.int64,
        expected_name: np.float64,
    }

    expected = pd.read_table(
        expected_path,
        usecols=expected_columns,
        dtype=expected_dtypes,
        comment=None,
        verbose=verbose,
    )
    expected.set_index(expected_index, inplace=True)

    # Input validation:
    # get a unique list of chroms mentioned in expected_path and
    # do simple column-name validation for now
    get_exp_chroms = lambda df: df.index.get_level_values("chrom").unique()
    expected_chroms = get_exp_chroms(expected)
    if not set(expected_chroms).issubset(clr.chromnames):
        raise ValueError(
            "Chromosomes in {} must be subset of ".format(expected_path)
            + "chromosomes in cooler {}".format(cool_path))

    # check the number of bins:
    # compute # of bins by comparing matching indexes
    get_exp_bins = lambda df, ref_chroms: (
        df.index.get_level_values("chrom").isin(ref_chroms).sum())
    expected_bins = get_exp_bins(expected, expected_chroms)
    cool_bins = clr.bins()[:]["chrom"].isin(expected_chroms).sum()
    if not (expected_bins == cool_bins):
        raise ValueError(
            "Number of bins is not matching: ",
            "{} in {}, and {} in {} for chromosomes {}".format(
                expected_bins, expected_path, cool_bins, cool_path,
                expected_chroms),
        )
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with bases-units for now, so suppress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print("Using kernel parameters w={}, p={} recommended for binsize {}"
              .format(w, p, binsize))
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        assert w > p, "Wrong inner/outer kernel parameters w={}, p={}".format(w, p)
        print("Using kernel parameters w={}, p={} provided by user".format(w, p))

    # once kernel parameters are set up, check max_nans_tolerated
    # to make sure kernel footprints overlapping 1 side with the
    # NaN-filled row/column are not "allowed";
    # this requires dynamic adjustment for the "shrinking donut"
    assert max_nans_tolerated <= 2 * w, "Too many NaNs allowed!"
    # may lead to scoring the same pixel twice, i.e. duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(clr, expected_chroms, w,
                                               tile_size_bins,
                                               loci_separation_bins))

    # lambda-chunking edges ...
    assert dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50
    base = 2 ** (1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,
        ),
        [np.inf],
    ))

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        output_calls,
        nproc,
        verbose,
    )

    # 4. Post-processing
    if verbose:
        print("Begin post-processing of {} filtered pixels".format(
            len(filtered_pixels)))
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using an annotated DataFrame of filtered
    # pixels. Why? Because clustering has to be done chromosome by
    # chromosome!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    centroids = dotfinder.clustering_step(filtered_pixels_annotated,
                                          expected_chroms,
                                          dots_clustering_radius, verbose)

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final post-processed result
    if output_calls is not None:
        postprocessed_fname = op.join(op.dirname(output_calls),
                                      op.basename(output_calls) + ".postproc")
        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)
def call_dots(
    cool_path,
    expected_path,
    regions,
    expected_name,
    weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap that are not larger than max_loci_separation.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The path to a tsv-like file with cis-expected signal.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH,
    and therefore these chromosomes must be a subset of chromosomes referred
    to in COOL_PATH. Also chromosomes referred to in EXPECTED_PATH must be
    non-trivial, i.e., contain non-NaN signal. Thus, make sure to prune your
    EXPECTED_PATH before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis
    contacts: 'region', 'diag', 'n_valid', value_name. value_name is
    controlled using options. A header must be present in the file.
    """
    clr = cooler.Cooler(cool_path)

    # preliminary SCHEMA for cis-expected
    region_column_name = "region"
    expected_columns = [region_column_name, "diag", "n_valid", expected_name]
    expected_dtypes = {
        region_column_name: str,
        "diag": np.int64,
        "n_valid": np.int64,
        expected_name: np.float64,
    }
    try:
        expected = pd.read_table(
            expected_path,
            usecols=expected_columns,
            dtype=expected_dtypes,
            comment=None,
            verbose=verbose,
        )
    except ValueError as e:
        raise ValueError(
            "input expected does not match the schema\n"
            "tab-separated expected file must have a header as well") from e

    expected_index = [
        region_column_name,
        "diag",
    ]
    expected.set_index(expected_index, inplace=True)
    # end of SCHEMA for cis-expected

    # Optional reading of the region table provided by the user:
    if regions is None:
        try:
            uniq_regions = expected.index.get_level_values(
                region_column_name).unique()
            regions_table = bioframe.parse_regions(uniq_regions,
                                                   clr.chromsizes)
            regions_table["name"] = regions_table["chrom"]
        except ValueError as e:
            print(e)
            raise ValueError(
                "Cannot interpret regions from EXPECTED_PATH\n"
                "specify region definitions using the --regions option.")
    else:
        # Flexible reading of the regions table:
        regions_buf, names = util.sniff_for_header(regions)
        regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions_table.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited "
                "columns. We expect a bed file with columns chrom, start, "
                "end, and an optional name")
        if regions_table.shape[1] == 4:
            regions_table = regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end",
                3: "name"
            })
            regions_table = bioframe.parse_regions(regions_table)
        else:
            regions_table = regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end"
            })
            regions_table = bioframe.parse_regions(regions_table)
        regions_table = regions_table[regions_table["chrom"].isin(
            clr.chromnames)].reset_index(drop=True)

    # Verify appropriate column order (required for heatmap_tiles_generator_diag):
    regions_table = regions_table[["chrom", "start", "end", "name"]]

    # Input validation
    get_exp_regions = lambda df: df.index.get_level_values(
        region_column_name).unique()
    expected_regions = get_exp_regions(expected)

    # the unique list of regions mentioned in expected_path
    # must also be in the regions table
    if not set(expected_regions).issubset(regions_table["name"]):
        raise ValueError(
            "Regions in {} must be subset of ".format(expected_path)
            + "regions in {}".format(
                "regions table " + regions if regions is not None else "cooler"))

    # check the number of bins per region in the cooler and the expected table
    # compute # of bins by comparing matching indexes
    try:
        for region_name, group in expected.reset_index().groupby(
                region_column_name):
            n_diags = group.shape[0]
            region = regions_table.set_index("name").loc[region_name]
            lo, hi = clr.extent(region)
            assert n_diags == (hi - lo)
    except AssertionError:
        raise ValueError("Region shape mismatch between expected and cooler. "
                         "Are they using the same resolution?")

    # All the checks have passed:
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))
    # by now we have a usable region_table and expected for most scenarios

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with bases-units for now, so suppress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print(f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}")
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        assert w > p, f"Wrong inner/outer kernel parameters w={w}, p={p}"
        print(f"Using kernel parameters w={w}, p={p} provided by user")

    # once kernel parameters are set up, check max_nans_tolerated
    # to make sure kernel footprints overlapping 1 side with the
    # NaN-filled row/column are not "allowed";
    # this requires dynamic adjustment for the "shrinking donut"
    assert max_nans_tolerated <= 2 * w, "Too many NaNs allowed!"
    # may lead to scoring the same pixel twice, i.e. duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(clr, regions_table, w,
                                               tile_size_bins,
                                               loci_separation_bins))

    # lambda-chunking edges ...
    assert dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50
    base = 2 ** (1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,
        ),
        [np.inf],
    ))

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix),
                op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        print(f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using an annotated DataFrame of filtered
    # pixels. Why? Because clustering has to be done independently for
    # every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated,
                                               regions_table)
    # consider resetting the index here
    centroids = dotfinder.clustering_step(filtered_pixels_annotated,
                                          expected_regions,
                                          dots_clustering_radius, verbose)

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final post-processed result
    if out_prefix is not None:
        postprocessed_fname = op.join(
            op.dirname(out_prefix),
            op.basename(out_prefix) + ".postproc.bedpe")
        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)