def _fetch_trans_oe(reg1, reg2):
    """
    Fetch the matrix block for a pair of regions and divide it by the
    value returned by ``_fetch_trans_exp`` for that pair.

    ``clr`` and ``_fetch_trans_exp`` are not parameters; they are resolved
    from the surrounding scope.

    Parameters
    ----------
    reg1, reg2 : str or tuple
        Genomic regions in any form accepted by ``bioframe.parse_region``.

    Returns
    -------
    The fetched block divided by ``_fetch_trans_exp(chrom1, chrom2)``,
    where ``chrom1``/``chrom2`` are the first elements of the parsed regions.
    """
    first = bioframe.parse_region(reg1)
    second = bioframe.parse_region(reg2)
    observed = clr.matrix().fetch(first, second)
    expected = _fetch_trans_exp(first[0], second[0])
    return observed / expected
def cooler_global_scaling(cool, genome, trans=True, mapper=map,
                          balance='weight', thres=None, ignore_diags=2):
    """
    Run cis binning over chromosome arms and, optionally, trans binning
    over all chromosome pairs of a cooler.

    Parameters
    ----------
    cool : cooler.Cooler
        Input cooler; its ``info['bin-size']`` and ``chromnames`` are used.
    genome : str
        Genome name passed to ``DNA_info.get_chromosome_arms`` and
        ``bioframe.fetch_chromsizes``.
    trans : bool
        If True, also compute the trans table.
    mapper : callable
        Map implementation forwarded to the binning helpers.
    balance : str
        Forwarded as ``header`` to ``cooler_mask`` and
        ``cooler_matrix_generator``.
    thres :
        Forwarded to ``cooler_mask``.
    ignore_diags : int
        Forwarded to ``cis_binning``.

    Returns
    -------
    cis_results : DataFrame
        Concatenated cis results indexed by ``('region', 'diag')``.
    trans_results : DataFrame
        Only returned (as the second element of a tuple) when ``trans`` is
        True; concatenated trans results indexed by ``('chrom1', 'chrom2')``.
    """
    # The same mask object serves both rows and columns.
    mask = cooler_mask(cool, header=balance, thres=thres)
    row_masker = mask
    col_masker = mask
    matrix_fetcher = cooler_matrix_generator(cool, header=balance)
    resolution = cool.info['bin-size']

    arms = DNA_info.get_chromosome_arms(genome)
    arm_pairs = [(arm, arm) for arm in arms]
    cis_chunks = cis_binning(arm_pairs, matrix_fetcher, row_masker,
                             col_masker, resolution, ignore_diags,
                             mapper=mapper)
    cis_results = pd.concat(cis_chunks).reset_index()
    cis_results = cis_results.rename(columns={'region1': 'region'})
    # Both regions of a cis pair are the same arm, so 'region2' is dropped.
    cis_results = cis_results.drop('region2', axis=1)
    cis_results = cis_results.set_index(['region', 'diag'], drop=True)

    if not trans:
        return cis_results

    print('Computing trans expected')
    chromsizes = bioframe.fetch_chromsizes(genome)
    parsed = [bioframe.parse_region(name, chromsizes=chromsizes)
              for name in cool.chromnames]
    # Every unordered pair of distinct chromosomes, in chromnames order.
    trans_regions = [(first, second)
                     for pos, first in enumerate(parsed)
                     for second in parsed[pos + 1:]]

    trans_chunks = trans_binning(trans_regions, matrix_fetcher, row_masker,
                                 col_masker, resolution, mapper=mapper)
    kept = [chunk for chunk in trans_chunks if chunk is not None]
    trans_results = pd.concat(kept)

    # Index entries are (region1, region2); each region starts with its chrom.
    pair_index = trans_results.index
    trans_results['chrom1'] = pair_index.map(lambda pair: pair[0][0]).values
    trans_results['chrom2'] = pair_index.map(lambda pair: pair[1][0]).values
    trans_results = trans_results.set_index(['chrom1', 'chrom2'])
    return cis_results, trans_results
def make_diag_tables(clr, supports):
    """
    Build one diagonal table per support region.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler. Bins whose ``'weight'`` value is null are treated
        as bad.
    supports : iterable
        Support regions. Each can be a region string (parsed with
        ``bioframe.parse_region``) or a sequence of length

        - 1: ``(chrom,)`` -- the whole chromosome,
        - 2: ``((chrom, start1, end1), (_, start2, end2))`` -- the
          chromosome of the second element is ignored,
        - 3: ``(chrom, start, end)`` -- used for both axes,
        - 5: ``(chrom, start1, end1, start2, end2)``.

    Returns
    -------
    diag_tables : dict
        Maps each region (strings are keyed by their parsed tuple) to the
        result of ``make_diag_table`` for that region.

    Raises
    ------
    ValueError
        If a region has a length other than 1, 2, 3 or 5.
    """
    diag_tables = {}
    for region in supports:
        if isinstance(region, str):
            region = bioframe.parse_region(region)

        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 2:
            chrom, start1, end1 = region[0]
            _, start2, end2 = region[1]
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError(
                "Regions must be sequences of length 1, 2, 3 or 5")

        bins = clr.bins().fetch(chrom).reset_index(drop=True)
        bad_mask = np.array(bins['weight'].isnull())

        # Convert genome-wide bin extents into chromosome-relative bin ids.
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        diag_tables[region] = make_diag_table(
            bad_mask, [lo1, hi1], [lo2, hi2])
    return diag_tables
def digitize_track(binedges, track, regions=None):
    """
    Digitize genomic signal tracks into integers between `1` and `n`.

    Parameters
    ----------
    binedges : 1D array (length n + 1)
        Bin edges for quantization of signal. For `n` bins, there are
        `n + 1` edges. See encoding details in Notes.
    track : tuple of (DataFrame, str)
        bedGraph-like dataframe along with the name of the value column.
    regions : sequence of str or tuples, optional
        List of genomic regions to include. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. If given, the track is
        subset to these regions and re-ordered to follow them.

    Returns
    -------
    digitized : DataFrame
        New bedGraph-like dataframe with value column and an additional
        digitized value column with name suffixed by '.d'
    hist : 1D array (length n + 2)
        Histogram of digitized signal values. Its length is `n + 2` because
        the first and last elements correspond to outliers. Missing values
        (code `-1`) are not counted. See Notes.

    Raises
    ------
    ValueError
        If ``track`` is not a tuple.

    Notes
    -----
    The digital encoding is as follows:

    - `1..n` <-> values assigned to histogram bins
    - `0` <-> left outlier values
    - `n+1` <-> right outlier values
    - `-1` <-> missing data (NaNs)

    """
    if not isinstance(track, tuple):
        raise ValueError(
            "``track`` should be a tuple of (dataframe, column_name)")
    track, name = track

    # subset and re-order chromosome groups
    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
        grouped = track.groupby('chrom')
        track = pd.concat([
            bioframe.bedslice(grouped, chrom, st, end)
            for (chrom, st, end) in regions
        ])

    # digitize the signal; NaNs get the dedicated code -1
    code_col = name + '.d'
    digitized = track.copy()
    digitized[code_col] = np.digitize(
        track[name].values, binedges, right=False)
    digitized.loc[track[name].isnull(), code_col] = -1

    # Histogram every valid code, 0..n+1. Only the missing-data code is
    # excluded, so hist[0] really counts left outliers as documented.
    codes = digitized[code_col].values
    codes = codes[codes >= 0]
    hist = np.bincount(codes, minlength=len(binedges) + 1)
    return digitized, hist
def exclude_regions(df, regions_to_keep=None, genome=None, print_final=False):
    """
    Keep only the rows of ``df`` whose ``'region'`` falls inside one of
    ``regions_to_keep``.

    Parameters
    ----------
    df : DataFrame
        Table with a ``'region'`` column holding region strings. It is not
        modified.
    regions_to_keep : sequence of str or tuple, optional
        Regions to retain. If None or empty, ``df`` is returned unchanged.
    genome : str, optional
        Genome name used to fetch chromosome sizes; required whenever
        ``regions_to_keep`` is non-empty.
    print_final : bool
        If True, print the unique regions of the returned table.

    Returns
    -------
    DataFrame
        ``df`` itself when there is nothing to filter, otherwise a new
        table with the rows selected by ``bioframe.bedslice`` for each
        region in ``regions_to_keep``, concatenated in that order.
    """
    if regions_to_keep is None or len(regions_to_keep) == 0:
        if print_final:
            print(np.asarray(df.region.unique()))
        return df

    assert genome is not None, 'Please provide valid genome'
    chromsizes = bioframe.fetch_chromsizes(genome)
    regions_to_keep = [
        bioframe.parse_region(reg, chromsizes) for reg in regions_to_keep
    ]

    assert 'region' in df.columns
    regions = df['region'].apply(
        lambda x: bioframe.parse_region(x, chromsizes)).values
    chrom, start, end = list(zip(*regions))

    # Add the temporary coordinate columns to a copy so the caller's
    # dataframe is not left with extra 'chrom'/'start'/'end' columns.
    annotated = df.copy()
    annotated['chrom'] = chrom
    annotated['start'] = start
    annotated['end'] = end

    new_df = pd.concat([
        bioframe.bedslice(annotated, (keep_chrom, keep_start, keep_end))
        for keep_chrom, keep_start, keep_end in regions_to_keep
    ])
    if print_final:
        print(np.asarray(new_df.region.unique()))
    return new_df.drop(['chrom', 'start', 'end'], axis=1)
def make_diag_tables(clr, supports):
    """
    Build one diagonal table per support region.

    Bad bins are taken from the ``'weight'`` column of the bin table (null
    weight means bad). If the cooler has no ``'weight'`` column, no bin is
    considered bad.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler.
    supports : iterable
        Support regions. Each can be a region string (parsed with
        ``bioframe.parse_region``) or a sequence of length

        - 1: ``(chrom,)`` -- the whole chromosome,
        - 2: ``((chrom, start1, end1), (_, start2, end2))`` -- the
          chromosome of the second element is ignored,
        - 3: ``(chrom, start, end)`` -- used for both axes,
        - 5: ``(chrom, start1, end1, start2, end2)``.

    Returns
    -------
    diag_tables : dict
        Maps each region (strings are keyed by their parsed tuple) to the
        result of ``make_diag_table`` for that region.

    Raises
    ------
    ValueError
        If a region has a length other than 1, 2, 3 or 5.
    """
    # Load the bin table once and precompute a per-chromosome bad-bin mask.
    bins = clr.bins()[:]
    if 'weight' in bins.columns:
        groups = dict(iter(bins.groupby('chrom')['weight']))
        bad_bin_dict = {
            chrom: np.array(weights.isnull())
            for chrom, weights in groups.items()
        }
    else:
        sizes = dict(bins.groupby('chrom').size())
        bad_bin_dict = {
            chrom: np.zeros(size, dtype=bool)
            for chrom, size in sizes.items()
        }

    diag_tables = {}
    for region in supports:
        if isinstance(region, str):
            region = bioframe.parse_region(region)

        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 2:
            chrom, start1, end1 = region[0]
            _, start2, end2 = region[1]
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError(
                "Regions must be sequences of length 1, 2, 3 or 5")

        # Convert genome-wide bin extents into chromosome-relative bin ids.
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        bad_mask = bad_bin_dict[chrom]
        diag_tables[region] = make_diag_table(
            bad_mask, [lo1, hi1], [lo2, hi2])
    return diag_tables
def _fetch_trans_oe(reg1, reg2):
    """
    Fetch the matrix block for a pair of regions and divide it by the
    value returned by ``_fetch_trans_exp`` for that pair.

    ``clr``, ``weight_name`` and ``_fetch_trans_exp`` are not parameters;
    they are resolved from the surrounding scope. The matrix is requested
    with ``balance=weight_name``.

    Parameters
    ----------
    reg1, reg2 : str or tuple
        Genomic regions in any form accepted by ``bioframe.parse_region``.

    Returns
    -------
    The fetched block divided by ``_fetch_trans_exp(chrom1, chrom2)``,
    where ``chrom1``/``chrom2`` are the first elements of the parsed regions.
    """
    first = bioframe.parse_region(reg1)
    second = bioframe.parse_region(reg2)
    observed = clr.matrix(balance=weight_name).fetch(first, second)
    expected = _fetch_trans_exp(first[0], second[0])
    return observed / expected
def make_saddle(getmatrix, binedges, digitized, contact_type, regions=None,
                min_diag=3, max_diag=-1, trim_outliers=False, verbose=False):
    """
    Make a matrix of average interaction probabilities between genomic bin
    pairs as a function of a specified genomic track.

    The provided genomic track must be pre-quantized as integers (i.e.
    digitized).

    Parameters
    ----------
    getmatrix : function
        A function returning a matrix of interaction between two
        chromosomes given their names/indices.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal. For `n` bins, there are `n + 1`
        edges. See :func:`digitize_track`.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of digitized signal along with the name of
        the digitized value column.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    regions : sequence of str or tuple, optional
        A list of genomic regions to use. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. By default, one region
        per chromosome of the digitized track, spanning its smallest start
        to its largest end.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic
        bins given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to
        the corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If ``contact_type`` is neither 'cis' nor 'trans'.

    """
    digitized_df, name = digitized

    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
    else:
        regions = [
            (chrom, chrom_df.start.min(), chrom_df.end.max())
            for chrom, chrom_df in digitized_df.groupby('chrom')
        ]

    # Digitized signal of every region, keyed by the region tuple.
    digitized_tracks = {}
    for reg in regions:
        by_chrom = digitized_df.groupby('chrom')
        region_rows = bioframe.bedslice(by_chrom, reg[0], reg[1], reg[2])
        digitized_tracks[reg] = region_rows[name]

    if contact_type == 'cis':
        supports = [(reg, reg) for reg in regions]
    elif contact_type == 'trans':
        supports = list(combinations(regions, 2))
    else:
        raise ValueError("The allowed values for the contact_type "
                         "argument are 'cis' or 'trans'.")

    # n_bins here includes 2 open bins
    # for values <lo and >hi.
    n_bins = len(binedges) + 1
    shape = (n_bins, n_bins)
    interaction_sum = np.zeros(shape)
    interaction_count = np.zeros(shape)

    # _accumulate receives both accumulator arrays on every call.
    for reg1, reg2 in supports:
        _accumulate(interaction_sum, interaction_count, getmatrix,
                    digitized_tracks, reg1, reg2, min_diag, max_diag, verbose)

    # Symmetrize by adding the transpose.
    interaction_sum = interaction_sum + interaction_sum.T
    interaction_count = interaction_count + interaction_count.T

    if trim_outliers:
        inner = slice(1, -1)
        interaction_sum = interaction_sum[inner, inner]
        interaction_count = interaction_count[inner, inner]

    return interaction_sum, interaction_count
def cooler_cis_eig(
        clr,
        bins,
        regions=None,
        n_eigs=3,
        phasing_track_col='GC',
        balance='weight',
        ignore_diags=None,
        clip_percentile=99.9,
        sort_metric=None):
    """
    Run ``cis_eig`` on the matrix of every region of a cooler and collect
    the results into tables.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler.
    bins : DataFrame
        Bin table with a ``'chrom'`` column and, if ``phasing_track_col``
        is set, a column of that name.
    regions : sequence of str or tuple, optional
        Regions to process, each parsed with ``bioframe.parse_region``.
        By default, every chromosome found in ``bins`` is used in full.
    n_eigs : int
        Number of eigenvectors/eigenvalues to store per region.
    phasing_track_col : str or None
        Name of the column of ``bins`` passed to ``cis_eig`` as the
        phasing track; a falsy value disables phasing.
    balance : str or bool
        Forwarded to ``clr.matrix(balance=...)``.
    ignore_diags : int, optional
        Forwarded to ``cis_eig``. If None, read from the ``ignore_diags``
        attribute of ``bins/weight`` in the cooler, defaulting to 2.
    clip_percentile, sort_metric :
        Forwarded to ``cis_eig``.

    Returns
    -------
    eigvals : DataFrame
        One row per region (index named ``'region'``) with columns
        ``eigval1..eigval<n_eigs>``.
    eigvec_table : DataFrame
        Copy of ``bins`` with added columns ``E1..E<n_eigs>``; rows outside
        the processed regions stay NaN.

    Raises
    ------
    ValueError
        If ``regions`` is None and ``bins`` contains chromosomes absent
        from the cooler, or if ``phasing_track_col`` is set but missing
        from ``bins``.
    """
    if regions is None:
        # Perform consistency checks before defaulting to whole chromosomes.
        chroms = bins['chrom'].unique()
        chroms_not_in_clr = [
            chrom for chrom in chroms if chrom not in clr.chromsizes]
        if len(chroms_not_in_clr) > 0:
            raise ValueError(
                'The following chromosomes are found in the bin table, but not '
                'in the cooler: '+str(chroms_not_in_clr)
            )
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in chroms]
    else:
        # User-supplied regions were previously left unparsed because this
        # branch sat behind an `if regions is None` guard.
        regions = [bioframe.parse_region(r) for r in regions]

    # Validate once, before any matrix is fetched.
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError('No column "{}" in the bin table'.format(
            phasing_track_col))

    if ignore_diags is None:
        ignore_diags = clr._load_attrs('bins/weight').get('ignore_diags', 2)

    eigvec_table = bins.copy()
    for i in range(n_eigs):
        eigvec_table['E'+str(i+1)] = np.nan

    def _each(region):
        A = clr.matrix(balance=balance).fetch(region)
        phasing_track = (
            bioframe.slice_bedframe(bins, region)[phasing_track_col].values
            if phasing_track_col else None)
        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric)
        return eigvals, eigvecs

    eigvals_per_reg, eigvecs_per_reg = zip(*map(_each, regions))

    # Write each region's eigenvectors into the matching rows of the table.
    for region, eigvecs in zip(regions, eigvecs_per_reg):
        lo, hi = bioframe.bisect_bedframe(bins, region)
        for i, eigvec in enumerate(eigvecs):
            eigvec_table.iloc[
                lo:hi,
                eigvec_table.columns.get_loc('E'+str(i+1))] = eigvec

    # Whole chromosomes are labelled by name, sub-regions as chrom:start-end.
    region_strs = [
        (chrom
         if (start == 0 and end == clr.chromsizes[chrom])
         else '{}:{}-{}'.format(chrom, start, end))
        for chrom, start, end in regions
    ]

    eigvals = pd.DataFrame(
        index=region_strs,
        data=np.vstack(eigvals_per_reg),
        columns=['eigval'+str(i+1) for i in range(n_eigs)],
    )
    eigvals.index.name = 'region'

    return eigvals, eigvec_table
def make_diag_tables(clr, supports, weight_name="weight", bad_bins=None):
    """
    For every support region infer diagonals that intersect this region
    and calculate the size of these intersections in pixels, both "total"
    and "n_valid", where "n_valid" does not include "bad" bins into
    counting.

    "Bad" pixels are inferred from the balancing weight column
    `weight_name`. Passing `bad_bins` directly is not implemented yet.

    Setting `weight_name` and `bad_bins` to `None` yields 0 "bad" pixels
    per diagonal per support region.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler
    supports : list
        a list of genomic support regions. Each can be a region string
        (parsed with ``bioframe.parse_region``) or a sequence of length

        - 1: ``(chrom,)`` -- the whole chromosome,
        - 2: ``((chrom, start1, end1), (_, start2, end2))`` -- the
          chromosome of the second element is ignored,
        - 3: ``(chrom, start, end)`` -- used for both axes,
        - 5: ``(chrom, start1, end1, start2, end2)``.
    weight_name : str or None
        name of the weight vector in the "bins" table; bins with a null
        weight are treated as bad. If `weight_name` is None, no bin is
        treated as bad.
    bad_bins : array-like
        a list of bins to ignore per support region. Intended to override
        inference of bad bins from the balancing weight [to be
        implemented]; any value other than None raises.

    Returns
    -------
    diag_tables : dict
        dictionary with DataFrames of relevant diagonals for every support
        (string supports are keyed by their parsed tuple).

    Raises
    ------
    NotImplementedError
        If `bad_bins` is not None.
    KeyError
        If `weight_name` is a string that is not a column of the bin table.
    ValueError
        If `weight_name` is neither a string nor None, or if a region has a
        length other than 1, 2, 3 or 5.
    """
    if bad_bins is not None:
        raise NotImplementedError(
            "providing external list of bad bins is not implemented.")

    bins = clr.bins()[:]
    if weight_name is None:
        # ignore bad bins
        sizes = dict(bins.groupby("chrom").size())
        bad_bin_dict = {
            chrom: np.zeros(size, dtype=bool)
            for chrom, size in sizes.items()
        }
    elif isinstance(weight_name, str):
        # using balancing weight to infer bad bins
        if weight_name not in bins.columns:
            raise KeyError(
                "Balancing weight {} not found!".format(weight_name))
        groups = dict(iter(bins.groupby("chrom")[weight_name]))
        bad_bin_dict = {
            chrom: np.array(weights.isnull())
            for chrom, weights in groups.items()
        }
    else:
        raise ValueError("`weight_name` can be `str` or `None`")

    diag_tables = {}
    for region in supports:
        # parse region if str
        if isinstance(region, str):
            region = bioframe.parse_region(region)

        # unpack region(s) into chroms,starts,ends
        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 2:
            chrom, start1, end1 = region[0]
            _, start2, end2 = region[1]
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError(
                "Regions must be sequences of length 1, 2, 3 or 5")

        # translate regions into relative bin id-s:
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        bad_mask = bad_bin_dict[chrom]
        diag_tables[region] = make_diag_table(
            bad_mask, [lo1, hi1], [lo2, hi2])

    return diag_tables
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    clip_percentile=99.9,
    sort_metric=None,
):
    """
    Run ``cis_eig`` on the matrix of every region of a cooler and collect
    the results into tables.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler.
    bins : DataFrame
        Bin table with a ``"chrom"`` column and, if ``phasing_track_col``
        is set, a column of that name.
    regions : sequence of str or tuple, optional
        Regions to process, each parsed with ``bioframe.parse_region``.
        By default, every chromosome found in ``bins`` is used in full.
    n_eigs : int
        Number of eigenvectors/eigenvalues to store per region.
    phasing_track_col : str or None
        Name of the column of ``bins`` passed to ``cis_eig`` as the
        phasing track; a falsy value disables phasing.
    balance : str or bool
        Forwarded to ``clr.matrix(balance=...)``.
    ignore_diags : int, optional
        Forwarded to ``cis_eig``. If None, read from the ``ignore_diags``
        attribute of ``bins/weight`` in the cooler, defaulting to 2.
    clip_percentile, sort_metric :
        Forwarded to ``cis_eig``.

    Returns
    -------
    eigvals : DataFrame
        One row per region (index named ``"region"``) with columns
        ``eigval1..eigval<n_eigs>``.
    eigvec_table : DataFrame
        Copy of ``bins`` with added columns ``E1..E<n_eigs>``; rows outside
        the processed regions stay NaN.

    Raises
    ------
    ValueError
        If ``regions`` is None and ``bins`` contains chromosomes absent
        from the cooler, or if ``phasing_track_col`` is set but missing
        from ``bins``.
    """
    if regions is None:
        # Perform consistency checks before defaulting to whole chromosomes.
        chroms = bins["chrom"].unique()
        chroms_not_in_clr = [
            chrom for chrom in chroms if chrom not in clr.chromsizes
        ]
        if len(chroms_not_in_clr) > 0:
            raise ValueError(
                "The following chromosomes are found in the bin table, but not "
                "in the cooler: " + str(chroms_not_in_clr)
            )
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in chroms]
    else:
        # User-supplied regions were previously left unparsed because this
        # branch sat behind an `if regions is None` guard.
        regions = [bioframe.parse_region(r) for r in regions]

    # Validate once, before any matrix is fetched.
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(
            'No column "{}" in the bin table'.format(phasing_track_col)
        )

    if ignore_diags is None:
        ignore_diags = clr._load_attrs("bins/weight").get("ignore_diags", 2)

    eigvec_table = bins.copy()
    for i in range(n_eigs):
        eigvec_table["E" + str(i + 1)] = np.nan

    def _each(region):
        A = clr.matrix(balance=balance).fetch(region)
        phasing_track = (
            bioframe.slice_bedframe(bins, region)[phasing_track_col].values
            if phasing_track_col
            else None
        )
        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )
        return eigvals, eigvecs

    eigvals_per_reg, eigvecs_per_reg = zip(*map(_each, regions))

    # Write each region's eigenvectors into the matching rows of the table.
    for region, eigvecs in zip(regions, eigvecs_per_reg):
        idx = bioframe.select(bins, region).index
        for i, eigvec in enumerate(eigvecs):
            eigvec_table.loc[idx, "E" + str(i + 1)] = eigvec

    # Whole chromosomes are labelled by name, sub-regions as chrom:start-end.
    region_strs = [
        (
            chrom
            if (start == 0 and end == clr.chromsizes[chrom])
            else "{}:{}-{}".format(chrom, start, end)
        )
        for chrom, start, end in regions
    ]

    eigvals = pd.DataFrame(
        index=region_strs,
        data=np.vstack(eigvals_per_reg),
        columns=["eigval" + str(i + 1) for i in range(n_eigs)],
    )
    eigvals.index.name = "region"

    return eigvals, eigvec_table