def digitize_track(binedges, track, regions=None):
    """
    Digitize genomic signal tracks into integers between `1` and `n`.

    Parameters
    ----------
    binedges : 1D array (length n + 1)
        Bin edges for quantization of signal. For `n` bins, there are `n + 1`
        edges. They are passed to ``numpy.digitize`` with ``right=False``.
        See encoding details in Notes.
    track : tuple of (DataFrame, str)
        bedGraph-like dataframe along with the name of the value column.
    regions : sequence of str or tuples, optional
        List of genomic regions to include. Each one is handed to
        ``bioframe.parse_region``. When given, the output rows follow the
        order of ``regions``.

    Returns
    -------
    digitized : DataFrame
        New bedGraph-like dataframe with value column and an additional
        digitized value column with name suffixed by '.d'. The input
        dataframe is not modified.
    hist : 1D array (length n + 2)
        Histogram of digitized signal values. Its length is `n + 2` because
        the first and last elements count the left and right outliers.
        Missing values (code `-1`) are not counted.

    Raises
    ------
    ValueError
        If ``track`` is not a tuple.

    Notes
    -----
    The digital encoding is as follows:

    - `1..n` <-> values assigned to histogram bins
    - `0` <-> left outlier values
    - `n+1` <-> right outlier values
    - `-1` <-> missing data (NaNs)

    """
    if not isinstance(track, tuple):
        raise ValueError(
            "``track`` should be a tuple of (dataframe, column_name)")
    track, name = track

    # subset and re-order chromosome groups
    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
        grouped = track.groupby('chrom')
        track = pd.concat(
            bioframe.bedslice(grouped, chrom, st, end)
            for (chrom, st, end) in regions)

    # Quantize the signal; NaNs are overwritten with the dedicated -1 code
    # afterwards, whatever code np.digitize assigned to them.
    codes = np.digitize(track[name].values, binedges, right=False)
    codes[track[name].isnull().values] = -1

    digitized = track.copy()
    digitized[name + '.d'] = codes

    # Histogram every valid code, *including* 0 (left outliers). Only the
    # -1 missing-data code is excluded, so hist[0] and hist[-1] really are
    # the two outlier counts promised above.
    hist = np.bincount(codes[codes >= 0], minlength=len(binedges) + 1)
    return digitized, hist
def bedpeslice(df, chrom, start, end):
    """
    Slice a BEDPE-like dataframe by a single genomic region.

    A row is kept only if ``bioframe.bedslice`` selects it both when the
    frame is viewed through its first anchor (``chrom1/start1/end1``) and
    when it is viewed through its second anchor (``chrom2/start2/end2``).

    Parameters
    ----------
    df : DataFrame
        Frame with columns ``chrom1, start1, end1, chrom2, start2, end2``.
    chrom, start, end
        Region forwarded unchanged to ``bioframe.bedslice``.

    Returns
    -------
    DataFrame
        Copy of the selected rows of ``df``.
    """
    row_labels = df.index.values

    def _anchor_mask(side):
        # View one anchor as a plain BED frame by stripping the trailing
        # "1"/"2" from the column names, then slice it.
        anchor = df[['chrom' + side, 'start' + side, 'end' + side]].rename(
            columns=lambda col: col[0:-1])
        hits = bioframe.bedslice(anchor.groupby('chrom'), chrom, start, end)
        # Membership is decided by index label, as bedslice keeps labels.
        return np.isin(row_labels, hits.index.values)

    left_ok = _anchor_mask('1')
    right_ok = _anchor_mask('2')
    return df.loc[left_ok & right_ok].copy()
def sort_by_eigenvalue(lams, vectors):
    """
    Reorder eigenvalues and the E1/E2/E3 columns by decreasing |eigenvalue|.

    Parameters
    ----------
    lams : DataFrame
        One row per region, indexed by region name; the three columns hold
        that region's eigenvalues. A region name is either a chromosome name
        or a string of the form ``chrom:start-end``.
    vectors : DataFrame
        Frame with a ``chrom`` column and eigenvector columns ``E1``, ``E2``
        and ``E3``.

    Returns
    -------
    sorted_lams : DataFrame
        Eigenvalues reordered per region, indexed by ``region``, with the
        same column names as ``lams``.
    sorted_vectors : DataFrame
        Rows of ``vectors`` for every region with the E-columns permuted the
        same way, followed by untouched rows of chromosomes that no region
        covered; exact duplicate rows are dropped.

    Notes
    -----
    If any eigenvalue of a region is NaN, that region keeps its original
    order.
    """
    eig_cols = ['E1', 'E2', 'E3']
    reordered_lams = []
    pieces = []

    for region, row in lams.iterrows():
        lam_values = row.values

        if fnmatch.fnmatch(region, '*:*-*'):
            # "chrom:start-end": split on the first ':' and the first '-'.
            colon = region.find(':')
            dash = region.find('-')
            chrom = region[0:colon]
            start = int(region[colon + 1:dash])
            end = int(region[dash + 1:])
            piece = bedslice(vectors.groupby('chrom'), chrom, start, end)
        else:
            # Whole chromosome.
            piece = vectors[vectors.chrom == region].copy(deep=True)

        if np.any(np.isnan(lam_values)):
            order = np.array([0, 1, 2])
        else:
            order = np.argsort(-np.abs(lam_values))

        piece[eig_cols] = piece[eig_cols].values[:, order]
        reordered_lams.append(lam_values[order])
        pieces.append(piece)

    # Chromosomes that no region touched are passed through unchanged.
    covered = pd.concat(pieces).chrom.unique()
    for chrom in vectors.chrom.unique():
        if chrom not in covered:
            pieces.append(vectors[vectors.chrom == chrom].copy(deep=True))

    sorted_lams = pd.DataFrame(
        data=np.concatenate(tuple(reordered_lams)).reshape(-1, 3),
        columns=lams.columns)
    sorted_lams['region'] = lams.index
    sorted_lams = sorted_lams.set_index('region')

    sorted_vectors = pd.concat(pieces).drop_duplicates()
    return sorted_lams, sorted_vectors
def _mask_fetcher(region):
    """
    Build a per-bin 0/1 mask for ``region``.

    Relies on ``cool``, ``vector``, ``val`` and ``bedslice`` from the
    enclosing scope. A bin is set to 1 when it lies entirely inside an
    interval of ``vector`` whose ``label`` equals ``val`` and the bin's
    ``weight`` is not NaN.

    Parameters
    ----------
    region : sequence
        ``(chrom, start, end)``; passed to ``cool.bins().fetch`` as-is.

    Returns
    -------
    1D array
        One entry per fetched bin. An integer 0/1 array in the normal case;
        an all-zero float array when the chromosome is absent from
        ``vector`` or no interval is selected for the region.
    """
    bins = cool.bins().fetch(region)
    n_bins = len(bins)
    bins['label'] = np.nan

    # Nothing is known about this chromosome: empty mask.
    if region[0] not in vector.chrom.unique():
        return np.zeros(n_bins)

    region_eig = bedslice(vector, region[0], region[1], region[2])
    if len(region_eig) == 0:
        return np.zeros(n_bins)

    # Stamp each interval's label on the bins fully contained in it.
    for _, interval in region_eig.iterrows():
        inside = (bins.start >= interval.start) & (bins.end <= interval.end)
        bins.loc[bins.index[inside.values], 'label'] = interval.label

    selected = ((bins.label == val) & ~np.isnan(bins.weight)).values
    mask = selected.astype(int)

    # Sanity checks on the rows the mask selects.
    chosen = bins[mask.astype(bool)]
    assert not np.any(np.isnan(chosen['weight']))
    assert np.all(chosen['label'] == val)
    return mask
def exclude_regions(df, regions_to_keep=None, genome=None, print_final=False):
    """
    Keep only the rows of ``df`` selected by the given genomic regions.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``region`` column whose values can be parsed by
        ``bioframe.parse_region``. It is never modified by this function.
    regions_to_keep : sequence, optional
        Regions to keep, each parsed with ``bioframe.parse_region``. When
        omitted or empty, ``df`` itself is returned unchanged.
    genome : optional
        Passed to ``bioframe.fetch_chromsizes``; required whenever
        ``regions_to_keep`` is non-empty.
    print_final : bool, optional
        If True, print the unique ``region`` values of the result.

    Returns
    -------
    DataFrame
        ``df`` itself when there is nothing to filter; otherwise a new frame
        made of the ``bioframe.bedslice`` selections for each kept region,
        concatenated in the order of ``regions_to_keep``, with the same
        columns as ``df`` minus any named ``chrom``, ``start`` or ``end``.

    Raises
    ------
    AssertionError
        If regions are given without ``genome``, or ``df`` has no ``region``
        column.
    """
    if regions_to_keep is None or len(regions_to_keep) == 0:
        if print_final:
            print(np.asarray(df.region.unique()))
        return df

    assert genome is not None, 'Please provide valid genome'
    chromsizes = bioframe.fetch_chromsizes(genome)
    regions_to_keep = [
        bioframe.parse_region(reg, chromsizes) for reg in regions_to_keep
    ]

    assert 'region' in df.columns
    parsed = [bioframe.parse_region(reg, chromsizes) for reg in df['region']]

    # Attach the helper coordinate columns to a copy so the caller's frame
    # does not silently gain chrom/start/end columns.
    annotated = df.copy()
    annotated['chrom'] = [reg[0] for reg in parsed]
    annotated['start'] = [reg[1] for reg in parsed]
    annotated['end'] = [reg[2] for reg in parsed]

    kept = pd.concat([
        bioframe.bedslice(annotated, (chrom, start, end))
        for chrom, start, end in regions_to_keep
    ])

    if print_final:
        print(np.asarray(kept.region.unique()))

    # The coordinate columns were only needed for slicing.
    return kept.drop(columns=['chrom', 'start', 'end'])
def make_saddle(getmatrix, binedges, digitized, contact_type, regions=None,
                min_diag=3, max_diag=-1, trim_outliers=False, verbose=False):
    """
    Accumulate summed interactions and pair counts between digitized bins.

    The genomic track must already be quantized into integer codes
    (see :func:`digitize_track`).

    Parameters
    ----------
    getmatrix : function
        Forwarded to ``_accumulate`` to obtain the interaction matrix for a
        pair of regions.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal. For `n` bins, there are `n + 1`
        edges.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of digitized signal along with the name of
        the digitized value column.
    contact_type : str
        'cis' pairs every region with itself; 'trans' uses every unordered
        pair of distinct regions.
    regions : sequence of str or tuple, optional
        Genomic regions to use, each parsed by ``bioframe.parse_region``.
        Defaults to one region per chromosome of the digitized dataframe,
        spanning its smallest ``start`` to its largest ``end``.
    min_diag : int
        Smallest diagonal to include in computation (forwarded to
        ``_accumulate``).
    max_diag : int
        Biggest diagonal to include in computation (forwarded to
        ``_accumulate``).
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrices.
    verbose : bool, optional
        Forwarded to ``_accumulate``.

    Returns
    -------
    interaction_sum : 2D array
        Symmetrized matrix of summed interactions, indexed by the digitized
        values of the two bins.
    interaction_count : 2D array
        Symmetrized matrix of the number of bin pairs contributing to each
        pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If ``contact_type`` is neither 'cis' nor 'trans'.
    """
    digitized_df, name = digitized

    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
    else:
        regions = [
            (chrom, chrom_df.start.min(), chrom_df.end.max())
            for chrom, chrom_df in digitized_df.groupby('chrom')
        ]

    # Digitized value column of each region, keyed by the region tuple.
    digitized_tracks = {}
    for reg in regions:
        region_rows = bioframe.bedslice(
            digitized_df.groupby('chrom'), reg[0], reg[1], reg[2])
        digitized_tracks[reg] = region_rows[name]

    if contact_type == 'cis':
        supports = [(reg, reg) for reg in regions]
    elif contact_type == 'trans':
        supports = list(combinations(regions, 2))
    else:
        raise ValueError("The allowed values for the contact_type "
                         "argument are 'cis' or 'trans'.")

    # n_bins here includes 2 open bins
    # for values <lo and >hi.
    n_bins = len(binedges) + 1
    shape = (n_bins, n_bins)
    interaction_sum = np.zeros(shape)
    interaction_count = np.zeros(shape)

    for reg1, reg2 in supports:
        _accumulate(interaction_sum, interaction_count, getmatrix,
                    digitized_tracks, reg1, reg2,
                    min_diag, max_diag, verbose)

    # Symmetrize both accumulators.
    interaction_sum = interaction_sum + interaction_sum.T
    interaction_count = interaction_count + interaction_count.T

    if trim_outliers:
        inner = slice(1, -1)
        interaction_sum = interaction_sum[inner, inner]
        interaction_count = interaction_count[inner, inner]

    return interaction_sum, interaction_count
print(name) os.makedirs(f'{savepath}{name}/100000/cis', exist_ok=True) os.makedirs(f'{savepath}{name}/100000/trans', exist_ok=True) c = row['cooler_100000'] lams = row['lams_100000'] vector = row['vectors_100000'] # supports = {'cis': DNA_info.get_chromosome_arms(genome), # 'trans': [(chrom, 0, c.chromsizes[chrom]) # for chrom in c.chromnames[0:22]]} for region in DNA_info.get_chromosome_arms(genome): print(region) chrom, start, end = region mat = c.matrix(balance=True).fetch(region) vec = bioframe.bedslice(vector.groupby('chrom'), chrom, start, end) vec = vec['E1_cis'].values if np.all(np.isnan(vec)): continue if len(vec) == 0: continue S, C = saddleplot.construct_cis_saddleplot(mat, vec, num_percentile=20) np.save(f'{savepath}{name}/100000/cis/{chrom}:{start}-{end}.npy', np.dstack((S, C))) for i in np.arange(len(c.chromnames[0:22])): for j in np.arange(i + 1, len(c.chromnames[0:22])): chrom1 = c.chromnames[i] start1, end1 = 0, c.chromsizes[chrom1]