def _each(region):
    """
    Eigendecompose the cis contact matrix of one region.

    This helper relies on names from its enclosing scope (``clr``, ``bins``,
    ``smooth``, ``bad_bins``, ...); all input validation is expected to
    have been done by the caller.

    Parameters
    ----------
    region : tuple-like
        Tuple of the form (chrom, start, end, *); only the first three
        entries are used.

    Returns
    -------
    coords, eigvals, eigvecs
        The (chrom, start, end) part of ``region`` followed by the
        eigenvalues and eigenvectors returned by ``cis_eig``.
    """
    coords = region[:3]  # drop everything after (chrom, start, end)
    print("now doing region:", coords)

    if not smooth:
        A = clr.matrix(balance=balance).fetch(coords)
    else:
        # coarse-grain the balanced matrix, guided by the raw counts
        balanced = clr.matrix(balance=True).fetch(coords)
        raw = clr.matrix(balance=False).fetch(coords)
        A = numutils.adaptive_coarsegrain(
            balanced,
            raw,
            cutoff=cutoff,
            max_levels=max_levels)

    if bad_bins is not None:
        # keep only the bad bins inside this region, as region-relative offsets
        first_bin, end_bin = clr.extent(coords)
        inside = (bad_bins >= first_bin) & (bad_bins < end_bin)
        local_bad = bad_bins[inside]
        local_bad -= first_bin
        if len(local_bad) > 0:
            # the matrix is symmetric: blank both columns and rows
            A[:, local_bad] = np.nan
            A[local_bad, :] = np.nan

    # phasing values restricted to this region, if phasing was requested
    track_values = None
    if phasing_track_col:
        track_values = bioframe.select(bins, coords)[phasing_track_col].values

    eigvals, eigvecs = cis_eig(
        A,
        n_eigs=n_eigs,
        ignore_diags=ignore_diags,
        phasing_track=track_values,
        clip_percentile=clip_percentile,
        sort_metric=sort_metric,
        OE_log=OE_log
    )
    return coords, eigvals, eigvecs
def digitize_track(binedges, track, regions=None):
    """
    Digitize genomic signal tracks into integers between `1` and `n`.

    Parameters
    ----------
    binedges : 1D array (length n + 1)
        Bin edges for quantization of signal. For `n` bins, there are
        `n + 1` edges. See encoding details in Notes.
    track : tuple of (DataFrame, str)
        bedGraph-like dataframe along with the name of the value column.
    regions : sequence of str or tuples, optional
        List of genomic regions to include. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. If given, the track is
        subset and re-ordered to follow this list.

    Returns
    -------
    digitized : DataFrame
        New bedGraph-like dataframe with value column and an additional
        digitized value column with name suffixed by '.d'
    hist : 1D array (length n + 2)
        Histogram of digitized signal values. Its length is `n + 2` because
        the first and last elements count the left and right outliers.
        Missing values (code `-1`) are not counted.

    Raises
    ------
    ValueError
        If ``track`` is not a tuple.

    Notes
    -----
    The digital encoding is as follows:

    - `1..n` <-> values assigned to histogram bins
    - `0` <-> left outlier values
    - `n+1` <-> right outlier values
    - `-1` <-> missing data (NaNs)
    """
    if not isinstance(track, tuple):
        raise ValueError(
            "``track`` should be a tuple of (dataframe, column_name)")
    track, name = track

    # subset and re-order chromosome groups
    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
        track = pd.concat(bioframe.select(track, region) for region in regions)

    # digitize the signal; np.digitize puts NaNs into the last bin, so they
    # are re-labelled as -1 afterwards
    digitized_name = name + ".d"
    digitized = track.copy()
    digitized[digitized_name] = np.digitize(
        track[name].values, binedges, right=False)
    digitized.loc[track[name].isnull(), digitized_name] = -1

    # histogram over every valid code 0..n+1; only missing data (-1) is
    # dropped, so hist[0] really counts the left outliers
    codes = digitized[digitized_name].values
    hist = np.bincount(codes[codes >= 0], minlength=len(binedges) + 1)
    return digitized, hist
def _each(region):
    """
    Run the eigendecomposition for a single region of the view.

    Uses names from the enclosing scope (``clr``, ``clr_weight_name``,
    ``bad_bins``, ``phasing_track``, ...); safety checks are assumed to be
    done outside of this function.

    Parameters
    ----------
    region : tuple-like
        Tuple of the form (chrom, start, end, *); anything after the first
        three entries is ignored.

    Returns
    -------
    coords, eigvals, eigvecs
        The (chrom, start, end) slice of ``region`` plus the eigenvalues and
        eigenvectors produced by ``cis_eig``.
    """
    coords = region[:3]
    A = clr.matrix(balance=clr_weight_name).fetch(coords)

    if bad_bins is not None:
        # restrict the absolute bad-bin ids to this region and make them
        # relative to the first bin of the region
        first_bin, end_bin = clr.extent(coords)
        inside = (bad_bins >= first_bin) & (bad_bins < end_bin)
        local_bad = bad_bins[inside]
        local_bad -= first_bin
        if len(local_bad) > 0:
            # symmetric matrix: mask columns and rows alike
            A[:, local_bad] = np.nan
            A[local_bad, :] = np.nan

    # phasing values for this region only (None disables phasing)
    if phasing_track is None:
        track_values = None
    else:
        track_values = bioframe.select(phasing_track, coords)["value"].values

    eigvals, eigvecs = cis_eig(
        A,
        n_eigs=n_eigs,
        ignore_diags=ignore_diags,
        phasing_track=track_values,
        clip_percentile=clip_percentile,
        sort_metric=sort_metric,
    )
    return coords, eigvals, eigvecs
def plot_insulation(clr, insulation, windows, resolution, out_path,
                    exclude_chroms, title):
    """
    Plot, per chromosome, a rotated contact map with insulation tracks below.

    One figure is written for every sacCer3 chromosome except ``chrM`` into
    ``<dirname(out_path)>/<title>/<chrom>_<basename(out_path)>``.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler to fetch balanced matrices from.
    insulation : DataFrame
        Insulation table with ``start``, ``end`` and one
        ``log2_insulation_score_<window>`` column per entry of ``windows``.
    windows : iterable
        Window sizes whose insulation scores are plotted.
    resolution : int
        Bin size passed on to ``plot_45_mat``.
    out_path : str
        Output path; its directory and base name are used to build the
        per-chromosome file names.
    exclude_chroms : iterable
        Currently unused by this function.
        NOTE(review): the assembly ('sacCer3') and the dropped chromosome
        ('chrM') are hard-coded - confirm whether this should be honoured.
    title : str
        Figure title prefix and name of the output sub-directory.
    """
    dir_path = os.path.join(os.path.dirname(out_path), title)
    # exist_ok avoids the check-then-create race and handles nested paths
    os.makedirs(dir_path, exist_ok=True)

    chromsizes = bioframe.fetch_chromsizes('sacCer3', filter_chroms=False)
    # Series.iteritems() was removed in pandas 2.0; items() is equivalent
    regions = [(chrom, 0, size)
               for chrom, size in chromsizes.drop('chrM').items()]

    for region in regions:
        norm = LogNorm(vmax=0.1, vmin=0.001)
        data = clr.matrix(balance=True).fetch(region)

        fig, ax = plt.subplots(figsize=(20, 4))
        img = plot_45_mat(ax, data, start=0, resolution=resolution,
                          norm=norm, cmap='fall')
        ax.set_aspect(0.5)
        ax.set_ylim(0, 30000)
        format_ticks(ax, rotate=False)
        ax.xaxis.set_visible(False)

        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', size='1%', pad=0.1, aspect=6)
        plt.colorbar(img, cax=cax)

        # insulation tracks share the x axis with the contact map
        insul_region = bioframe.select(insulation, region)
        ins_ax = divider.append_axes('bottom', size='50%', pad=0.0,
                                     sharex=ax)
        ins_ax.set_prop_cycle(
            plt.cycler('color', plt.cm.plasma(np.linspace(0, 1, 5))))
        bin_mids = insul_region[['start', 'end']].mean(axis=1)
        for window in windows:
            ins_ax.plot(bin_mids,
                        insul_region[f'log2_insulation_score_{window}'],
                        label=f'{window} bp window',
                        lw=1)
        ins_ax.legend(bbox_to_anchor=(1.125, 1.05), loc='upper right')

        fig.suptitle(f'{title}: {region[0]}')
        path = os.path.join(
            dir_path, '_'.join((region[0], os.path.basename(out_path))))
        fig.savefig(path, dpi=300)
        # release the figure; otherwise one figure per chromosome piles up
        plt.close(fig)
def _each(region):
    """
    Compute eigenvalues and eigenvectors of the cis matrix of ``region``.

    Names such as ``clr``, ``bins``, ``balance`` and ``phasing_track_col``
    come from the enclosing scope.

    Returns
    -------
    eigvals, eigvecs
        Output of ``cis_eig`` for the region.

    Raises
    ------
    ValueError
        If phasing is requested but ``phasing_track_col`` is not a column of
        the bin table.
    """
    A = clr.matrix(balance=balance).fetch(region)

    track_values = None
    if phasing_track_col:
        if phasing_track_col not in bins:
            raise ValueError(
                'No column "{}" in the bin table'.format(phasing_track_col)
            )
        # phasing values of the bins that fall into this region
        track_values = bioframe.select(bins, region)[phasing_track_col].values

    return cis_eig(
        A,
        n_eigs=n_eigs,
        ignore_diags=ignore_diags,
        phasing_track=track_values,
        clip_percentile=clip_percentile,
        sort_metric=sort_metric,
    )
def get_saddle(
    clr,
    expected,
    digitized_track,
    contact_type,
    view_df=None,
    clr_weight_name="weight",
    expected_value_col="balanced.avg",
    view_name_col="name",
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Get a matrix of average interactions between genomic bin
    pairs as a function of a specified genomic track.

    The provided genomic track must be a dataframe with a categorical
    column, as generated by `get_digitized()`.

    Parameters
    ----------
    clr : cooler.Cooler
        Observed matrix.
    expected : DataFrame in expected format
        Diagonal summary statistics for each chromosome, and name of the
        column with the values of expected to use.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    digitized_track : DataFrame with digitized value column
        A track, i.e. BedGraph-like dataframe, of digitized signal.
        The value column (fourth column) specifies a category for every
        position in the track. Generated by get_digitized() from track.
    view_df : viewframe
        Viewframe with genomic regions. If None, generate from track
        chromosomes.
    clr_weight_name : str
        Name of the column in the clr.bins to use as balancing weights.
    expected_value_col : str
        Name of the column in expected used for normalizing.
    view_name_col : str
        Name of column in view_df with region names.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If the fourth column of ``digitized_track`` is not categorical, if
        region names in ``expected`` are not cataloged in the view (cis), or
        if ``contact_type`` is neither 'cis' nor 'trans'.
    """
    # TODO: add input validation for track and expected
    # .iloc avoids deprecated positional lookup on a label-indexed Series;
    # isinstance also accepts CategoricalDtype subclasses
    if not isinstance(digitized_track.dtypes.iloc[3], pd.CategoricalDtype):
        raise ValueError(
            "a digitized track, where the value column is a "
            "pandas categorical must be provided as input. "
            "see get_digitized()."
        )
    digitized_col = digitized_track.columns[3]
    cats = digitized_track[digitized_col].dtype.categories.values
    # categories above -1 are the n_bins regular bins plus 2 outlier bins
    n_bins = len(cats[cats > -1]) - 2

    if view_df is None:
        view_df = _view_from_track(digitized_track)
    else:
        view_df = _make_cooler_view(view_df, clr)

    # digitized values per view region, keyed by region name
    digitized_tracks = {}
    for _, reg in view_df.iterrows():
        digitized_reg = bioframe.select(digitized_track, reg)
        digitized_tracks[reg[view_name_col]] = digitized_reg[digitized_col]

    # set "cis" or "trans" for supports (regions to iterate over)
    # and the matching matrix fetcher
    if contact_type == "cis":
        supports = list(zip(view_df[view_name_col], view_df[view_name_col]))
        if not bioframe.is_cataloged(
            expected, view_df, df_view_col="region1",
            view_name_col=view_name_col
        ):
            raise ValueError(
                "Region names in expected are not cataloged in view_df.")
        getmatrix = _make_cis_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    elif contact_type == "trans":
        # keep only pairs of regions that lie on different chromosomes;
        # one name -> chrom lookup replaces a DataFrame scan per pair
        chrom_of = dict(zip(view_df[view_name_col], view_df["chrom"]))
        supports = [
            (name1, name2)
            for name1, name2 in combinations(view_df[view_name_col], 2)
            if chrom_of[name1] != chrom_of[name2]
        ]
        getmatrix = _make_trans_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    else:
        raise ValueError(
            "Allowed values for contact_type are 'cis' or 'trans'.")

    # the output has n_bins + 2 rows/columns: the regular bins plus
    # 2 open bins for values <lo and >hi.
    interaction_sum = np.zeros((n_bins + 2, n_bins + 2))
    interaction_count = np.zeros((n_bins + 2, n_bins + 2))

    for reg1, reg2 in supports:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            digitized_tracks,
            reg1,
            reg2,
            min_diag=min_diag,
            max_diag=max_diag,
            verbose=verbose,
        )

    # symmetrize the accumulated matrices
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if trim_outliers:
        interaction_sum = interaction_sum[1:-1, 1:-1]
        interaction_count = interaction_count[1:-1, 1:-1]

    return interaction_sum, interaction_count
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    smooth=False,
    cutoff = 3,
    max_levels = 8,
    OE_log=False,
    map=map,
):
    """
    Compute compartment eigenvector for a given cooler `clr` in a number of
    symmetric intra chromosomal regions (cis-regions), or for each chromosome.
    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    bins : DataFrame
        table of bins derived from clr with phasing track added
    regions : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions only,
        otherwise chromosome-wide eigenvectors are computed, for chromosomes
        specified in bins.
    n_eigs : int
        number of eigenvectors to compute
    phasing_track_col : str, optional
        name of the columns in `bins` table, if provided, eigenvectors are
        flipped to achieve a positive correlation with
        `bins[phasing_track_col]`.
    balance : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
        Any array-like (list, tuple, ndarray) is accepted.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using
        the specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median
        Absolute Deviation from the median of `eigvecs` explained by
        `phasing_track` (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative
        eigenvectors. Off by default.
    smooth : boolean, optional
        This option lets you coarsegrain the matrix prior to calling
        eigendecomposition.
    cutoff : int, optional
        Cutoff to pass to adaptive_coarsegrain's cutoff argument
    max_levels : int, optional
        Max level to pass to adaptive_coarsegrain's max_levels argument
    OE_log : boolean, optional
        Pass OE_log to cis_eig's OE_log argument. This works only if matrix
        does not contain zeroes (eg. after using adaptive_coarsegrain)
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors filled in the `bins` table.

    Raises
    ------
    ValueError
        If phasing is requested and `phasing_track_col` is not in `bins`.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible
              mitigations: employ `regions` (e.g. arms) to avoid issues with
              chromosomal arms, use `bad_bins` to ignore small
              translocations.
    """
    # get chromosomes from bins, if regions not specified:
    if regions is None:
        regions = list(bins["chrom"].unique())

    # make sure phasing_track_col is in bins, if phasing is requested
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(f'No column "{phasing_track_col}" in the bin table')

    # regions to dataframe
    regions = bioframe.make_viewframe(regions)

    # ignore diags as in cooler unless specified
    # NOTE(review): the attribute path is fixed to "bins/weight" even when
    # `balance` names another column - confirm whether that is intended.
    ignore_diags = (
        clr._load_attrs("bins/weight").get("ignore_diags", 2)
        if ignore_diags is None
        else ignore_diags
    )

    # accept any array-like for bad_bins; the comparisons below need an array
    bad_bins_arr = None if bad_bins is None else np.asarray(bad_bins)

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = regions.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this function.

        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)

        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            array of eigenvalues and an array eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        print("now doing region:", _region)
        if smooth:
            # NOTE(review): the smoothing branch always uses the default
            # balancing (balance=True), not `balance` - confirm intent.
            A = numutils.adaptive_coarsegrain(
                clr.matrix(balance=True).fetch(_region),
                clr.matrix(balance=False).fetch(_region),
                cutoff=cutoff,
                max_levels=max_levels)
        else:
            A = clr.matrix(balance=balance).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins_arr is not None:
            # filter bad_bins for the _region and turn relative:
            lo, hi = clr.extent(_region)
            in_region = (bad_bins_arr >= lo) & (bad_bins_arr < hi)
            bad_bins_region = bad_bins_arr[in_region] - lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        phasing_track = (
            bioframe.select(bins, _region)[phasing_track_col].values
            if phasing_track_col
            else None
        )

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
            OE_log=OE_log
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, regions.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table
    for _region, _eigvals, _eigvecs in results:
        # .loc, not .at: .at addresses a single cell, whereas a whole
        # rows-by-columns block is assigned here
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.loc[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
def make_saddle(
    getmatrix,
    binedges,
    digitized,
    contact_type,
    regions=None,
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Build the saddle matrices: summed interaction values and pair counts
    between genomic bins, grouped by their digitized track value.

    The track has to be pre-quantized into integers (i.e. digitized).

    Parameters
    ----------
    getmatrix : function
        A function returning a matrix of interaction between two chromosomes
        given their names/indicies.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal. For `n` bins, there are `n + 1`
        edges. See :func:`digitize_track`.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of digitized signal along with the name of
        the digitized value column.
    contact_type : str
        'cis' to use only cis interactions, 'trans' to use only trans
        interactions.
    regions : sequence of str or tuple, optional
        A list of genomic regions to use. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. Defaults to one region
        per chromosome of the track, spanning its min start to max end.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.
    """
    track_df, value_col = digitized
    track_df = track_df[["chrom", "start", "end", value_col]]

    if regions is None:
        # one region per chromosome, covering the extent of the track
        regions = []
        for chrom, chrom_df in track_df.groupby("chrom"):
            regions.append((chrom, chrom_df.start.min(), chrom_df.end.max()))

    regions = bioframe.parse_regions(regions)

    # digitized values per region, keyed by region name (4th column)
    digitized_tracks = {
        row[3]: bioframe.select(track_df, row)[value_col]
        for row in regions.values
    }

    region_names = regions["name"]
    if contact_type == "trans":
        supports = list(combinations(region_names, 2))
    elif contact_type == "cis":
        supports = list(zip(region_names, region_names))
    else:
        raise ValueError("The allowed values for the contact_type "
                         "argument are 'cis' or 'trans'.")

    # matrix side: the regular bins plus 2 open bins
    # for values <lo and >hi.
    side = len(binedges) + 1
    interaction_sum = np.zeros((side, side))
    interaction_count = np.zeros((side, side))

    for first, second in supports:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            digitized_tracks,
            first,
            second,
            min_diag,
            max_diag,
            verbose,
        )

    # symmetrize in place
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if not trim_outliers:
        return interaction_sum, interaction_count

    inner = slice(1, -1)
    return interaction_sum[inner, inner], interaction_count[inner, inner]
def compute_scaling(pairs_paths, out_path, region, exclude_chroms, assembly,
                    centromeres_path, split_arms, normalized, plot_slope,
                    show_average_trans, labels, title, no_cache):
    """
    Compute and plot contact frequency vs genomic separation curves for one
    or more pairs files.

    Parameters
    ----------
    pairs_paths : sequence of str
        Paths of the pairs files to process.
    out_path : str
        Output path of the plot; with ``split_arms`` its directory and base
        name are used to build per-chromosome file names.
    region : str or tuple or None
        Region to restrict the analysis to. If falsy, chromosome arms are
        used as regions.
    exclude_chroms : iterable of str
        Chromosomes to exclude, either ``chrom`` (whole chromosome) or
        ``chrom:left`` / ``chrom:right`` (a single arm).
    assembly : str
        Assembly name used to fetch chromosome sizes (and centromeres).
    centromeres_path : str or None
        Optional space-separated file with ``chrom start end`` per line;
        the centromere position is taken as the midpoint of start and end.
    split_arms : bool
        If True, plot left and right arm scalings separately per chromosome.
    normalized, show_average_trans : bool
        Passed on to ``calc_pair_freqs``.
    plot_slope : bool
        Passed on to ``plot_scalings``.
    labels : iterable of str
        Curve labels; if empty, the pairs file names are used.
    title : str
        Plot title.
    no_cache : bool
        If True, neither read nor write cached scalings.
    """
    labels = list(labels)

    # parse left/right arm parameter of chromosomes to exclude
    exclude_chroms = [chrom.split(':') for chrom in exclude_chroms]
    # only entries without an arm suffix remove a whole chromosome; isin()
    # needs plain chromosome names, not the [chrom, arm] lists
    excluded_whole = [spec[0] for spec in exclude_chroms if len(spec) == 1]

    chromsizes = bioframe.fetch_chromsizes(assembly, filter_chroms=False,
                                           as_bed=True)
    chromsizes = chromsizes[~chromsizes.chrom.isin(excluded_whole)]

    if centromeres_path:
        centromeres = {}
        with open(centromeres_path) as file:
            for line in file:
                cols = line.split(' ')
                centromeres[cols[0]] = (int(cols[1]) + int(cols[2])) // 2
    else:
        centromeres = bioframe.fetch_centromeres(assembly)
        centromeres.set_index('chrom', inplace=True)
        centromeres = centromeres.mid.to_dict()

    if len(labels) != 0 and len(pairs_paths) != len(labels) and not split_arms:
        sys.exit('Please provide as many labels as pairs paths.')

    if region:
        regions = bioframe.select(chromsizes, region).reset_index()
    else:
        # use chromosomal arms as separate regions if no regions are specified
        arms = bioframe.split(chromsizes, centromeres)
        # remove user-excluded chromosomes/arms
        for chrom in exclude_chroms:
            if len(chrom) == 1:
                # no arm specified, remove entire chromosome
                arms = arms[arms.chrom != chrom[0]]
            elif chrom[1] == 'left':
                # remove specified chromosome with start == 0 (left arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start == 0))]
            elif chrom[1] == 'right':
                # remove specified chromosome with start != 0 (right arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start != 0))]

        # shrink every arm by ARM_PADDING on each side to remove centromere
        # and telomere regions
        arms = bioframe.ops.expand(arms, -ARM_PADDING)
        # drop arms that are empty after the padding was removed
        regions = arms[arms.start < arms.end].reset_index()

    all_scalings = []
    all_avg_trans_levels = []

    for idx, path in enumerate(pairs_paths):
        cis_scalings, avg_trans = None, None

        if split_arms:
            # calculate scalings per arm per chromosome
            cis_scalings, trans_levels = pairlib.scalings.compute_scaling(
                path,
                regions,
                chromsizes,
                dist_range=(int(1e1), int(1e9)),
                n_dist_bins=128,
                chunksize=int(1e7))

            # remove unassigned pairs with start/end positions < 0
            cis_scalings = cis_scalings[(cis_scalings.start1 > 0)
                                        & (cis_scalings.end1 > 0)
                                        & (cis_scalings.start2 > 0)
                                        & (cis_scalings.end2 > 0)]

            sc_agg = (cis_scalings.groupby(
                ['chrom1', 'start1', 'min_dist', 'max_dist']).agg({
                    'n_pairs': 'sum',
                    'n_bp2': 'sum'
                }).reset_index())

            avail_chroms = set(sc_agg.chrom1)

            for chrom in avail_chroms:
                # left arms start at position 0 + ARM_PADDING
                on_chrom = sc_agg.chrom1 == chrom
                is_left = sc_agg.start1 == ARM_PADDING
                sc_left, avg_trans_left = calc_pair_freqs(
                    sc_agg[on_chrom & is_left],
                    trans_levels, show_average_trans, normalized)
                sc_right, avg_trans_right = calc_pair_freqs(
                    sc_agg[on_chrom & ~is_left],
                    trans_levels, show_average_trans, normalized)

                dir_path = os.path.join(os.path.dirname(out_path),
                                        os.path.basename(path))
                # exist_ok avoids the check-then-create race
                os.makedirs(dir_path, exist_ok=True)

                chrom_path = os.path.join(
                    dir_path, '_'.join((chrom, os.path.basename(out_path))))

                plot_scalings(
                    scalings=[sc_left, sc_right],
                    avg_trans_levels=[avg_trans_left, avg_trans_right],
                    plot_slope=plot_slope,
                    labels=['left', 'right'],
                    title=chrom,
                    out_path=chrom_path)
        else:
            if not no_cache:
                # get cached values
                cached = cache.get(path)
                if cached is not None:
                    # cached scalings are only valid for the same
                    # normalization setting
                    if cached['normalized'] == normalized:
                        cis_scalings = cached['cis_scalings']
                    avg_trans = cached['avg_trans']

            if no_cache or cis_scalings is None or (
                    avg_trans is None and show_average_trans):
                print(
                    f'Computing scalings for file {idx + 1}/{len(pairs_paths)} ...',
                    end='\r')
                # caching disabled or no cached values found
                cis_scalings, trans_levels = pairlib.scalings.compute_scaling(
                    path,
                    regions,
                    chromsizes,
                    dist_range=(int(1e1), int(1e9)),
                    n_dist_bins=128,
                    chunksize=int(1e7))

                # remove unassigned pairs with start/end positions < 0
                cis_scalings = cis_scalings[(cis_scalings.start1 >= 0)
                                            & (cis_scalings.end1 >= 0)
                                            & (cis_scalings.start2 >= 0)
                                            & (cis_scalings.end2 >= 0)]

                sc_agg = (cis_scalings.groupby(['min_dist', 'max_dist']).agg({
                    'n_pairs': 'sum',
                    'n_bp2': 'sum'
                }).reset_index())

                cis_scalings, avg_trans = calc_pair_freqs(
                    sc_agg, trans_levels, show_average_trans, normalized)

                if not no_cache:
                    cache.set(
                        path, {
                            'cis_scalings': cis_scalings,
                            'avg_trans': avg_trans,
                            'normalized': normalized
                        })
            else:
                print(
                    f'Retrieved cached values for file {idx + 1}/{len(pairs_paths)}.',
                    end='\r')

        # use file names as labels if labels have not been provided
        # (append the file's base name, not the basename function itself)
        if len(labels) < len(pairs_paths):
            labels.append(os.path.basename(path))

        all_scalings.append(cis_scalings)
        if avg_trans is not None:
            all_avg_trans_levels.append(avg_trans)

    if len(all_scalings) > 0 and not split_arms:
        plot_scalings(all_scalings, all_avg_trans_levels, plot_slope, labels,
                      title, out_path)
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    clip_percentile=99.9,
    sort_metric=None,
):
    """
    Compute cis eigenvectors of a cooler per region (or per chromosome).

    Parameters
    ----------
    clr : cooler
        Cooler object to fetch matrices from.
    bins : DataFrame
        Bin table; must contain `phasing_track_col` if phasing is requested.
    regions : iterable, optional
        Regions to decompose; each is parsed with `bioframe.parse_region`.
        If None, every chromosome found in `bins` is used in full.
    n_eigs : int
        Number of eigenvectors to compute.
    phasing_track_col : str, optional
        Column of `bins` passed to `cis_eig` as phasing track. A falsy value
        disables phasing.
    balance : str
        Passed to `clr.matrix(balance=...)`.
    ignore_diags : int, optional
        Number of diagonals to ignore; read from the cooler's "bins/weight"
        attributes (default 2) if not given.
    clip_percentile : float
        Passed on to `cis_eig`.
    sort_metric : str
        Passed on to `cis_eig`.

    Returns
    -------
    eigvals : DataFrame
        One row per region (index named "region"), columns `eigval1..`.
    eigvec_table : DataFrame
        Copy of `bins` with added columns `E1..`.

    Raises
    ------
    ValueError
        If `regions` is None and `bins` contains chromosomes missing from
        the cooler, or if `phasing_track_col` is not a column of `bins`.
    """
    # Perform consistency checks.
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(
            'No column "{}" in the bin table'.format(phasing_track_col)
        )

    if regions is None:
        chroms = bins["chrom"].unique()
        chroms_not_in_clr = [
            chrom for chrom in chroms if chrom not in clr.chromsizes
        ]
        if len(chroms_not_in_clr) > 0:
            raise ValueError(
                "The following chromosomes are found in the bin table, but not "
                "in the cooler: " + str(chroms_not_in_clr)
            )
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in chroms]
    else:
        # user-supplied regions are normalized to (chrom, start, end) tuples;
        # chromsizes fill in the bounds of chromosome-only regions
        regions = [
            bioframe.parse_region(r, chromsizes=clr.chromsizes)
            for r in regions
        ]

    ignore_diags = (
        clr._load_attrs("bins/weight").get("ignore_diags", 2)
        if ignore_diags is None
        else ignore_diags
    )

    eigvec_table = bins.copy()
    for i in range(n_eigs):
        eigvec_table["E" + str(i + 1)] = np.nan

    def _each(region):
        """Return (eigvals, eigvecs) of the cis matrix of `region`."""
        A = clr.matrix(balance=balance).fetch(region)
        phasing_track = (
            bioframe.select(bins, region)[phasing_track_col].values
            if phasing_track_col
            else None
        )
        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )
        return eigvals, eigvecs

    eigvals_per_reg, eigvecs_per_reg = zip(*map(_each, regions))

    # write every eigenvector into the rows of its region
    for region, eigvecs in zip(regions, eigvecs_per_reg):
        idx = bioframe.select(bins, region).index
        for i, eigvec in enumerate(eigvecs):
            eigvec_table.loc[idx, "E" + str(i + 1)] = eigvec

    # whole chromosomes are labelled by name, sub-regions in UCSC style
    region_strs = [
        (
            chrom
            if (start == 0 and end == clr.chromsizes[chrom])
            else "{}:{}-{}".format(chrom, start, end)
        )
        for chrom, start, end in regions
    ]

    eigvals = pd.DataFrame(
        index=region_strs,
        data=np.vstack(eigvals_per_reg),
        columns=["eigval" + str(i + 1) for i in range(n_eigs)],
    )
    eigvals.index.name = "region"

    return eigvals, eigvec_table
def eigs_cis(
    clr,
    phasing_track=None,
    view_df=None,
    n_eigs=3,
    clr_weight_name="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    map=map,
):
    """
    Compute compartment eigenvectors of a cooler for every (symmetric,
    intra-chromosomal) region of a view, or for every chromosome.

    The amplitude of each eigenvector is weighted by its eigenvalue.
    Eigenvectors can be oriented with a binned `phasing_track` of the same
    resolution as the cooler.

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    phasing_track : DataFrame
        binned track with the same resolution as cooler bins, the fourth
        column is used to phase the eigenvectors, flipping them to achieve a
        positive correlation.
    view_df : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions of the view
        only, otherwise chromosome-wide eigenvectors are computed.
    n_eigs : int
        number of eigenvectors to compute
    clr_weight_name : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using
        the specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median
        Absolute Deviation from the median of `eigvecs` explained by
        `phasing_track` (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative
        eigenvectors. Off by default.
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors filled in the `bins` table.

    Raises
    ------
    ValueError
        If `view_df` fails the viewframe compatibility check or the cooler
        fails the balancing check for `clr_weight_name`.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible
              mitigations: employ `view_df` (e.g. arms) to avoid issues
              with chromosomal arms, use `bad_bins` to ignore small
              translocations.
    """
    if view_df is not None:
        # a user-supplied view has to be a proper, compatible viewframe
        try:
            _ = is_compatible_viewframe(
                view_df,
                clr,
                check_sorting=True,
                raise_errors=True,
            )
        except Exception as e:
            raise ValueError("view_df is not a valid viewframe or incompatible") from e
    else:
        # no view given: take the chromosomes from the cooler
        view_df = make_cooler_view(clr)

    # the cooler has to be balanced with the requested weights
    try:
        _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True)
    except Exception as e:
        raise ValueError(
            f"provided cooler is not balanced or {clr_weight_name} is missing"
        ) from e

    # ignore diags as in cooler unless specified
    if ignore_diags is None:
        weight_attrs = clr._load_attrs(f"bins/{clr_weight_name}")
        ignore_diags = weight_attrs.get("ignore_diags", 2)

    bins = clr.bins()[:]

    if phasing_track is not None:
        phasing_track = align_track_with_cooler(
            phasing_track,
            clr,
            view_df=view_df,
            clr_weight_name=clr_weight_name,
            mask_bad_bins=True,
        )

    # output table for eigenvectors: the bin table plus NaN-filled E columns
    eigvec_columns = [f"E{k}" for k in range(1, n_eigs + 1)]
    eigvec_table = bins.copy()
    for column in eigvec_columns:
        eigvec_table[column] = np.nan

    # output table for eigenvalues: the view plus NaN-filled eigval columns
    eigval_columns = [f"eigval{k}" for k in range(1, n_eigs + 1)]
    eigvals_table = view_df.copy()
    for column in eigval_columns:
        eigvals_table[column] = np.nan

    def _each(region):
        """
        Eigendecompose one region; safety checks are assumed to be done
        outside of this function.

        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)

        Returns
        -------
        coords, eigvals, eigvecs
            (chrom, start, end), array of eigenvalues, array of eigenvectors
        """
        coords = region[:3]  # take only (chrom, start, end)
        A = clr.matrix(balance=clr_weight_name).fetch(coords)

        if bad_bins is not None:
            # bad bins of this region, relative to its first bin
            first_bin, end_bin = clr.extent(coords)
            inside = (bad_bins >= first_bin) & (bad_bins < end_bin)
            local_bad = bad_bins[inside]
            local_bad -= first_bin
            if len(local_bad) > 0:
                # symmetric matrix: mask columns and rows
                A[:, local_bad] = np.nan
                A[local_bad, :] = np.nan

        # phasing values for this region, if a track was provided
        if phasing_track is None:
            track_values = None
        else:
            track_values = bioframe.select(phasing_track, coords)["value"].values

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=track_values,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )
        return coords, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed);
    # every result carries its own region, which is used to place it
    for coords, region_eigvals, region_eigvecs in map(_each, view_df.values):
        vec_rows = bioframe.select(eigvec_table, coords).index
        eigvec_table.loc[vec_rows, eigvec_columns] = region_eigvecs.T
        val_rows = bioframe.select(eigvals_table, coords).index
        eigvals_table.loc[val_rows, eigval_columns] = region_eigvals

    return eigvals_table, eigvec_table
def align_track_with_cooler(
    track, clr, view_df=None, clr_weight_name="weight", mask_bad_bins=True
):
    """
    Sync a track dataframe with a cooler bintable.

    Checks that bin sizes match between a track and a cooler,
    merges the cooler bintable with the track, and
    propagates masked regions from a cooler bintable to a track.

    Parameters
    ----------
    track : pd.DataFrame
        bedGraph-like track DataFrame to check. Its first four columns are
        interpreted, in order, as chrom, start, end and value, whatever
        their actual names are.
    clr : cooler
        cooler object to check against
    view_df : bioframe.viewframe or None
        Optional viewframe of regions to check for their number of bins
        with assigned track values. If None, a view is built with
        ``make_cooler_view(clr)``.
    clr_weight_name : str
        Name of the column in the bin table with weight. If falsy, every
        bin is treated as valid and no balancing check is performed.
    mask_bad_bins : bool
        Whether to propagate null bins from cooler bintable column
        clr_weight_name to the 'value' column of the output clr_track.
        Default True.

    Returns
    -------
    clr_track
        track dataframe that has been aligned with the cooler bintable
        and has columns ['chrom','start','end','value']

    Raises
    ------
    ValueError
        If the track fails ``is_track``, if the median track bin width
        differs from ``clr.binsize``, if ``clr_weight_name`` is given but the
        cooler fails ``is_cooler_balanced``, if no valid bin received a
        track value, or if some region of the view has no track values.
    """
    # kept as a function-scope import, exactly as in the original
    from .checks import is_track, is_cooler_balanced

    try:
        is_track(track, raise_errors=True)
    except Exception as e:
        raise ValueError("invalid input track") from e

    # since tracks are currently allowed to have flexible column names;
    # descriptive names also avoid clashing with the ``as e`` exception
    # variable, which Python unbinds when an except block exits
    chrom_col, start_col, end_col, value_col = track.columns[:4]

    # using median to allow for shorter / longer last bin on any chromosome
    track_bin_width = int((track[end_col] - track[start_col]).median())
    if track_bin_width != clr.binsize:
        raise ValueError(
            "mismatch between track and cooler bin size, check track resolution"
        )

    renamed_track = track.rename(
        columns={
            chrom_col: "chrom",
            start_col: "start",
            end_col: "end",
            value_col: "value",
        }
    )
    # left join keeps every cooler bin; clashing track columns (e.g. "end")
    # receive the "_" suffix so the bintable columns keep their names
    clr_track = (
        clr.bins()[:]
        .copy()
        .merge(
            renamed_track,
            how="left",
            on=["chrom", "start"],
            suffixes=("", "_"),
        )
    )

    if clr_weight_name:
        try:
            is_cooler_balanced(
                clr, clr_weight_name=clr_weight_name, raise_errors=True
            )
        except Exception as e:
            raise ValueError(
                f"no column {clr_weight_name} detected in input cooler bintable"
            ) from e
        valid_bins = clr_track[clr_weight_name].notna()
    else:
        # no weights requested: every bin counts as valid; use an explicit
        # mask instead of writing a column keyed by a falsy name
        valid_bins = np.ones(len(clr_track), dtype=bool)

    num_valid_bins = valid_bins.sum()
    num_assigned_bins = clr_track["value"][valid_bins].notna().sum()
    if num_assigned_bins == 0:
        raise ValueError("no track values assigned to cooler bintable")
    elif num_assigned_bins < 0.5 * num_valid_bins:
        warnings.warn("less than 50% of valid bins have been assigned a value")

    view_df = make_cooler_view(clr) if view_df is None else view_df
    for region in view_df.itertuples(index=False):
        track_region = bioframe.select(clr_track, region)
        # NOTE: this per-region count includes bins that are not valid,
        # unlike the global count above
        num_assigned_region_bins = track_region["value"].notna().sum()
        if num_assigned_region_bins == 0:
            raise ValueError(
                f"no track values assigned to region {bioframe.to_ucsc_string(region)}"
            )

    if mask_bad_bins:
        clr_track.loc[~valid_bins, "value"] = np.nan

    return clr_track[["chrom", "start", "end", "value"]]
def saddle(
    clr,
    expected,
    track,
    contact_type,
    n_bins,
    vrange=None,
    qrange=None,
    view_df=None,
    clr_weight_name="weight",
    expected_value_col="balanced.avg",
    view_name_col="name",
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Get a matrix of average interactions between genomic bin
    pairs as a function of a specified genomic track.

    The provided genomic track is either:
    (a) digitized inside this function by passing 'n_bins', and one of
    'vrange' or 'qrange'
    (b) passed as a pre-digitized track with a categorical value column,
    as generated by `get_digitized()`, together with ``n_bins=None``.

    Parameters
    ----------
    clr : cooler.Cooler
        Observed matrix.
    expected : DataFrame in expected format
        Diagonal summary statistics for each chromosome, and name of the
        column with the values of expected to use.
    track : DataFrame
        A track, i.e. BedGraph-like dataframe, which is digitized with
        the options n_bins, vrange and qrange. Can optionally be passed
        as a pre-digitized dataFrame with a categorical value column,
        as generated by get_digitized(), also passing n_bins as None.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    n_bins : int or None
        number of bins for signal quantization. Python ints and NumPy
        integer scalars are accepted (booleans are not). If None, then
        track must be passed as a pre-digitized track.
    vrange : tuple
        Low and high values used for binning track values.
        See get_digitized().
    qrange : tuple
        Low and high values for quantile binning track values.
        Low must be 0.0 or more, high must be 1.0 or less.
        Only one of vrange or qrange can be passed. See get_digitized().
    view_df: viewframe
        Viewframe with genomic regions. If none, generate from track
        chromosomes.
    clr_weight_name : str
        Name of the column in the clr.bins to use as balancing weights.
        Using raw unbalanced data is not supported for saddles.
    expected_value_col : str
        Name of the column in expected used for normalizing.
    view_name_col : str
        Name of column in view_df with region names.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic
        bins given their values of the provided genomic track. Shape is
        ``(n_bins + 2, n_bins + 2)``, or ``(n_bins, n_bins)`` when
        ``trim_outliers`` is set.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to
        the corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If ``n_bins`` is neither an integer nor None, if a pre-digitized
        track does not have a categorical value column, if ``view_df`` or
        ``expected`` fail their compatibility checks, if the cooler is not
        balanced under ``clr_weight_name``, or if ``contact_type`` is not
        'cis' or 'trans'.

    Notes
    -----
    Both output matrices are symmetrized by adding their own transpose
    before being (optionally) trimmed and returned.
    """
    # bool is an int subclass but is not a meaningful bin count, so it is
    # rejected just like any other non-integer value
    n_bins_is_integer = isinstance(n_bins, (int, np.integer)) and not isinstance(
        n_bins, bool
    )

    if n_bins_is_integer:
        # hand a plain Python int downstream regardless of the input flavour
        n_bins = int(n_bins)
        # perform digitization
        track = align_track_with_cooler(
            track,
            clr,
            view_df=view_df,
            clr_weight_name=clr_weight_name,
            mask_bad_bins=True,
        )
        digitized_track, binedges = digitize(
            track.iloc[:, :4],
            n_bins,
            vrange=vrange,
            qrange=qrange,
            digitized_suffix=".d",
        )
        digitized_col = digitized_track.columns[3]

    elif n_bins is None:
        # assume and test if track is pre-digitized
        digitized_track = track
        digitized_col = digitized_track.columns[3]
        is_track(track.astype({digitized_col: "float"}), raise_errors=True)
        # explicit positional lookup of the 4th column dtype, checked
        # against the public CategoricalDtype class
        value_dtype = digitized_track.dtypes.iloc[3]
        if not isinstance(value_dtype, pd.CategoricalDtype):
            raise ValueError(
                "when n_bins=None, saddle assumes the track has been "
                "pre-digitized and the value column is a "
                "pandas categorical. See get_digitized()."
            )
        cats = digitized_track[digitized_col].dtype.categories.values
        # cats has two additional categories, 0 and n_bins+1, for values
        # falling outside range, as well as -1 for NAs.
        n_bins = len(cats[cats > -1]) - 2
    else:
        raise ValueError("n_bins must be provided as int or None")

    if view_df is None:
        view_df = view_from_track(digitized_track)
    else:
        # Make sure view_df is a proper viewframe
        try:
            _ = is_compatible_viewframe(
                view_df,
                clr,
                check_sorting=True,  # just in case
                raise_errors=True,
            )
        except Exception as e:
            raise ValueError(
                "view_df is not a valid viewframe or incompatible"
            ) from e

    # make sure provided expected is compatible
    try:
        _ = is_valid_expected(
            expected,
            contact_type,
            view_df,
            verify_cooler=clr,
            expected_value_cols=[
                expected_value_col,
            ],
            raise_errors=True,
        )
    except Exception as e:
        raise ValueError("provided expected is not compatible") from e

    # check if cooler is balanced
    if clr_weight_name:
        try:
            _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True)
        except Exception as e:
            raise ValueError(
                f"provided cooler is not balanced or {clr_weight_name} is missing"
            ) from e

    # per-region digitized values, keyed by the region name from the view
    digitized_tracks = {}
    for _, reg in view_df.iterrows():
        digitized_reg = bioframe.select(digitized_track, reg)
        digitized_tracks[reg[view_name_col]] = digitized_reg[digitized_col]

    # set "cis" or "trans" for supports (regions to iterate over) and
    # matrix fetcher
    if contact_type == "cis":
        # only symmetric intra-chromosomal regions :
        supports = list(zip(view_df[view_name_col], view_df[view_name_col]))

        getmatrix = _make_cis_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    elif contact_type == "trans":
        # asymmetric inter-chromosomal regions : keep only pairs of regions
        # located on different chromosomes. One name -> chrom lookup avoids
        # re-scanning view_df for every candidate pair.
        chrom_of = dict(zip(view_df[view_name_col], view_df["chrom"]))
        supports = [
            (name1, name2)
            for name1, name2 in combinations(view_df[view_name_col], 2)
            if chrom_of[name1] != chrom_of[name2]
        ]

        getmatrix = _make_trans_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    else:
        raise ValueError("Allowed values for contact_type are 'cis' or 'trans'.")

    # n_bins here includes 2 open bins for values <lo and >hi.
    interaction_sum = np.zeros((n_bins + 2, n_bins + 2))
    interaction_count = np.zeros((n_bins + 2, n_bins + 2))

    for reg1, reg2 in supports:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            digitized_tracks,
            reg1,
            reg2,
            min_diag=min_diag,
            max_diag=max_diag,
            verbose=verbose,
        )

    # symmetrize both accumulators
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if trim_outliers:
        # drop the two open-ended outlier bins on each axis
        interaction_sum = interaction_sum[1:-1, 1:-1]
        interaction_count = interaction_count[1:-1, 1:-1]

    return interaction_sum, interaction_count