def diagsum_asymm(clr,
                  supports1,
                  supports2,
                  contact_type='cis',
                  transforms=None,
                  chunksize=10000000,
                  ignore_diags=2,
                  map=map):
    """
    Intra-chromosomal diagonal summary statistics.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    supports1 : sequence of genomic range tuples
        First support region of every area. Paired element-wise with
        ``supports2``.
    supports2 : sequence of genomic range tuples
        Second support region of every area.
    contact_type : str, optional
        Forwarded unchanged to ``_diagsum_asymm``.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> dataframe of diagonal statistics

    """
    # The default is ``None``; without this guard ``transforms.keys()``
    # below raises AttributeError on a plain default call.
    if transforms is None:
        transforms = {}

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ['count'] + list(transforms.keys())
    agg_names = ['{}.sum'.format(field) for field in fields]
    areas = list(zip(supports1, supports2))
    dtables = make_diag_tables(clr, areas)

    # Start every aggregate column at zero so per-chunk results can be
    # accumulated into it.
    for dt in dtables.values():
        for agg_name in agg_names:
            dt[agg_name] = 0

    job = partial(_diagsum_asymm, clr, fields, transforms, contact_type,
                  supports1, supports2)
    results = map(job, spans)
    for result in results:
        for (i, j), agg in result.items():
            dt = dtables[supports1[i], supports2[j]]
            for field, agg_name in zip(fields, agg_names):
                # fill_value=0 keeps diagonals that are absent from this
                # chunk's aggregate unchanged instead of turning them NaN.
                dt[agg_name] = dt[agg_name].add(agg[field], fill_value=0)

    if ignore_diags:
        for dt in dtables.values():
            for agg_name in agg_names:
                j = dt.columns.get_loc(agg_name)
                dt.iloc[:ignore_diags, j] = np.nan

    return dtables
def blocksum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=1000000,
    map=map,
):
    """
    Summary statistics on rectangular blocks of genomic regions.

    Matching elements of `regions1` and `regions2` define the blocks.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions for block summation
    regions2 : sequence of genomic range tuples
        "right"-side support regions for block summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` is accepted and treated as an empty dict.
    weight_name : str
        name of the balancing weight vector used to count "bad" (masked)
        pixels per block. Use `None` to avoid masking "bad" pixels.
        Forwarded to ``make_block_table``.
    bad_bins : array-like
        a list of bins to ignore per support region. Combines with the
        list of bad bins from the balancing weight. When given, every
        field value is additionally multiplied by a 0/1 mask so pixels
        touching a listed bin contribute zero to the sums.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame with one row per block and the columns
    region1, region2, n_valid, count.sum, plus one ``<key>.sum`` column for
    every key of `transforms`.

    """
    # The shared ``{}`` default is never mutated below (only rebound), so it
    # is safe to keep; ``None`` is tolerated for parity with sibling helpers.
    if transforms is None:
        transforms = {}

    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    # Output column names are fixed here, BEFORE ``transforms`` may be swapped
    # for its masked version. The masked dict also carries a "count" entry, and
    # deriving the columns from it used to list "count.sum" twice.
    sum_columns = ["{}.sum".format(field) for field in fields]

    # similar with diagonal summations, pre-generate a block_table listing
    # all of the rectangular blocks and "n_valid" number of pixels per block:
    records = make_block_table(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    # combine masking with existing transforms and add a "count" transform:
    if bad_bins is not None:
        # turn bad_bins into a mask of size clr.bins (1 = keep, 0 = drop):
        mask_size = len(clr.bins())
        bad_bins_mask = np.ones(mask_size, dtype=int)
        bad_bins_mask[bad_bins] = 0

        masked_transforms = {}
        bin1 = "bin1_id"
        bin2 = "bin2_id"
        for field in fields:
            if field in transforms:
                # combine masking and transform; default arguments bind the
                # current loop values instead of the late-bound names:
                t = transforms[field]
                masked_transforms[field] = (
                    lambda p, t=t, m=bad_bins_mask: t(p) * m[p[bin1]] * m[p[bin2]]
                )
            else:
                # presumably field == "count", mind the scope as well:
                masked_transforms[field] = (
                    lambda p, f=field, m=bad_bins_mask: p[f] * m[p[bin1]] * m[p[bin2]]
                )
        # substitute transforms to the masked_transforms:
        transforms = masked_transforms

    job = partial(
        _blocksum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    results = map(job, spans)
    for result in results:
        for i, agg in result.items():
            n1 = regions1.loc[i, "name"]
            n2 = regions2.loc[i, "name"]
            for field, agg_name in zip(fields, sum_columns):
                s = agg[field].item()
                # NaN partial sums are skipped so they cannot poison the total.
                if not np.isnan(s):
                    records[n1, n2][agg_name] += s

    # returning a dataframe for API consistency:
    return pd.DataFrame(
        [{"region1": n1, "region2": n2, **rec} for (n1, n2), rec in records.items()],
        columns=["region1", "region2", "n_valid"] + sum_columns,
    )
def diagsum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    map=map,
):
    """
    Diagonal summary statistics over asymmetric blocks.

    Matching elements of `regions1` and `regions2` define asymmetric
    rectangular blocks for calculating diagonal summary statistics.
    Only intra-chromosomal blocks are supported.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions for diagonal summation
    regions2 : sequence of genomic range tuples
        "right"-side support regions for diagonal summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
    weight_name : str
        name of the balancing weight vector used to count "bad" (masked)
        pixels per diagonal. Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region. Combines with the
        list of bad bins from the balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame with summary statistic of every diagonal of every block:
    region1, region2, diag, n_valid, count.sum

    """
    pixel_spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count", *transforms.keys()]

    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    dtables = make_diag_tables(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    if bad_bins is not None:
        # 0/1 vector over all bins of the cooler: 0 marks a bin to ignore.
        keep = np.ones(len(clr.bins()), dtype=int)
        keep[bad_bins] = 0

        def _mask_transform(func):
            # Wrap a user transform so masked pixels evaluate to zero.
            return lambda pix: func(pix) * keep[pix["bin1_id"]] * keep[pix["bin2_id"]]

        def _mask_column(name):
            # Same masking applied to a raw pixel column (i.e. "count").
            return lambda pix: pix[name] * keep[pix["bin1_id"]] * keep[pix["bin2_id"]]

        # Every field, including "count", gets a masked callable.
        transforms = {
            name: (
                _mask_transform(transforms[name])
                if name in transforms
                else _mask_column(name)
            )
            for name in fields
        }

    sum_columns = [(name, "{}.sum".format(name)) for name in fields]

    # Zero-initialise the accumulators of every block table.
    for table in dtables.values():
        for _, column in sum_columns:
            table[column] = 0

    worker = partial(
        _diagsum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    for chunk_result in map(worker, pixel_spans):
        for idx, agg in chunk_result.items():
            key = (regions1.loc[idx, "name"], regions2.loc[idx, "name"])
            table = dtables[key]
            for name, column in sum_columns:
                table[column] = table[column].add(agg[name], fill_value=0)

    # Flatten the per-block tables into one dataframe for API consistency.
    frames = []
    for (name1, name2), table in dtables.items():
        flat = table.reset_index()
        flat.insert(0, "region1", name1)
        flat.insert(1, "region2", name2)
        frames.append(flat)

    return pd.concat(frames).reset_index(drop=True)
def diagsum(
    clr,
    regions,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    ignore_diags=2,
    map=map,
):
    """
    Intra-chromosomal diagonal summary statistics.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions : sequence of genomic range tuples
        Support regions for intra-chromosomal diagonal summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
    weight_name : str
        name of the balancing weight vector used to count "bad" (masked)
        pixels per diagonal. Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region. Combines with the
        list of bad bins from the balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    Dataframe of diagonal statistics for all regions

    """
    pixel_spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count", *transforms.keys()]

    regions = bioframe.parse_regions(regions, clr.chromsizes)

    dtables = make_diag_tables(clr, regions, weight_name=weight_name, bad_bins=bad_bins)

    if bad_bins is not None:
        # 0/1 vector over all bins of the cooler: 0 marks a bin to ignore.
        keep = np.ones(len(clr.bins()), dtype=int)
        keep[bad_bins] = 0

        def _mask_transform(func):
            # Wrap a user transform so masked pixels evaluate to zero.
            return lambda pix: func(pix) * keep[pix["bin1_id"]] * keep[pix["bin2_id"]]

        def _mask_column(name):
            # Same masking applied to a raw pixel column (i.e. "count").
            return lambda pix: pix[name] * keep[pix["bin1_id"]] * keep[pix["bin2_id"]]

        # Every field, including "count", gets a masked callable.
        transforms = {
            name: (
                _mask_transform(transforms[name])
                if name in transforms
                else _mask_column(name)
            )
            for name in fields
        }

    sum_columns = [(name, "{}.sum".format(name)) for name in fields]

    # Zero-initialise the accumulators of every region table.
    for table in dtables.values():
        for _, column in sum_columns:
            table[column] = 0

    worker = partial(_diagsum_symm, clr, fields, transforms, regions.values)
    for chunk_result in map(worker, pixel_spans):
        for idx, agg in chunk_result.items():
            table = dtables[regions.loc[idx, "name"]]
            for name, column in sum_columns:
                table[column] = table[column].add(agg[name], fill_value=0)

    if ignore_diags:
        # Blank the first `ignore_diags` rows of every aggregate column.
        for table in dtables.values():
            for _, column in sum_columns:
                position = table.columns.get_loc(column)
                table.iloc[:ignore_diags, position] = np.nan

    # Flatten the per-region tables into one dataframe for API consistency.
    frames = []
    for region_name, table in dtables.items():
        flat = table.reset_index()
        flat.insert(0, "region", region_name)
        frames.append(flat)

    return pd.concat(frames).reset_index(drop=True)
def compute_expected(c, binsize, drop_diags, chunksize, map_impl=map,
                     regions=None, smooth_factor=None):
    """
    Build a per-(chrom, diag) table of aggregated and averaged signal.

    Parameters
    ----------
    c : cooler-like object
        Must provide ``bins()`` and ``pixels()``.
    binsize : int
        Divisor turning a bin ``start`` into its ``diag`` index.
    drop_diags : int
        Rows with ``diag < drop_diags`` get NaN in ``average`` and ``std``.
    chunksize : int
        Size of the pixel table chunks handed to ``partition``.
    map_impl : callable, optional
        Map functor implementation.
    regions : DataFrame, optional
        Rows with ``chrom``, ``start`` and ``end``; when given, bins are
        grouped per region via ``bedslice`` instead of per chromosome.
    smooth_factor : optional
        When not None, ``balanced`` and ``balanced2`` are passed through
        ``logsmooth`` chromosome by chromosome before averaging.

    Returns
    -------
    DataFrame with (at least) the columns chrom, diag, balanced, bad,
    total, average and std.

    Notes
    -----
    The ``balanced2`` column is never created here; it presumably comes
    from the frames returned by ``_accum_by_cisdiag`` -- TODO confirm.

    """
    bins = c.bins()[:]
    by_chrom = bins.groupby('chrom', sort=False)

    if regions is None:
        groups = [frame for _, frame in by_chrom]
    else:
        groups = [
            bedslice(by_chrom, row['chrom'], row['start'], row['end'])
            for _, row in regions.iterrows()
        ]

    group_sizes = [len(frame) for frame in groups]
    group_bad_bins = [
        where(np.isnan(frame['weight'].values)) for frame in groups
    ]

    # initialize the output table, one row per bin
    ex = bins[['chrom']].copy()
    ex['diag'] = bins['start'] // binsize
    ex['balanced'] = 0
    bad_counts = map_impl(count_bad_pixels_per_diag, group_sizes, group_bad_bins)
    ex['bad'] = list(concat(bad_counts))
    total_counts = map_impl(count_all_pixels_per_diag, group_sizes)
    ex['total'] = list(concat(total_counts))

    # split the pixel records into chunks, accumulate each, then combine
    spans = partition(0, len(c.pixels()), chunksize)
    accumulate = partial(_accum_by_cisdiag, c, bins)
    combined = pandas.concat(map_impl(accumulate, spans),
                             axis=0, ignore_index=True)
    combined = combined.groupby(['chrom', 'diag']).sum()

    ex = ex.set_index(['chrom', 'diag']).add(combined, fill_value=0)
    ex = ex.reset_index()

    if smooth_factor is not None:
        def _smoothed(column):
            return apply_by_chrom(
                ex, lambda c, g: logsmooth(g[column], smooth_factor))

        ex['balanced'] = _smoothed('balanced')
        ex['balanced2'] = _smoothed('balanced2')

    # average over valid elements
    n_valid = ex['total'] - ex['bad']
    mean = ex['balanced'] / n_valid
    ex['average'] = mean
    ex['std'] = np.sqrt(ex['balanced2'] / n_valid - mean**2)

    # mask out bad diagonals
    near_diagonal = ex['diag'] < drop_diags
    ex.loc[near_diagonal, 'average'] = np.nan
    ex.loc[near_diagonal, 'std'] = np.nan

    return ex
def blocksum_pairwise(clr,
                      supports,
                      transforms=None,
                      weight_name="weight",
                      bad_bins=None,
                      chunksize=1000000,
                      map=map):
    """
    Summary statistics on inter-chromosomal rectangular blocks.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    supports : sequence of genomic range tuples
        Support regions for summation. Blocks for all pairs of support
        regions will be used.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    weight_name : str
        name of the balancing weight vector used to count "bad" (masked)
        pixels per block. Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region. Overwrites inference
        of bad bins from the balancing weight [to be implemented].
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> (field name -> summary).
    Empty when fewer than two supports are given.

    """
    # The default is ``None``; without this guard ``transforms.keys()``
    # below raises AttributeError on a plain default call.
    if transforms is None:
        transforms = {}

    blocks = list(combinations(supports, 2))
    if not blocks:
        # Fewer than two supports means there is no pair to summarise.
        # Unpacking ``zip(*[])`` below would otherwise fail with an
        # unhelpful "not enough values to unpack" error.
        return {}
    supports1, supports2 = list(zip(*blocks))
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())

    n_tot = count_all_pixels_per_block(clr, supports)
    n_bad = count_bad_pixels_per_block(
        clr, supports, weight_name=weight_name, bad_bins=bad_bins)
    records = {(c1, c2): defaultdict(int) for (c1, c2) in blocks}
    for c1, c2 in blocks:
        records[c1, c2]["n_valid"] = n_tot[c1, c2] - n_bad[c1, c2]

    job = partial(_blocksum_asymm, clr, fields, transforms,
                  supports1, supports2)
    results = map(job, spans)
    for result in results:
        for (i, j), agg in result.items():
            record = records[supports1[i], supports2[j]]
            for field in fields:
                s = agg[field].item()
                # NaN partial sums are skipped so they cannot poison the total.
                if not np.isnan(s):
                    record["{}.sum".format(field)] += s

    return records
def diagsum_asymm(clr,
                  supports1,
                  supports2,
                  contact_type="cis",
                  transforms=None,
                  weight_name="weight",
                  bad_bins=None,
                  chunksize=10000000,
                  ignore_diags=2,
                  map=map):
    """
    Intra-chromosomal diagonal summary statistics.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    supports1 : sequence of genomic range tuples
        First support region of every area. Paired element-wise with
        ``supports2``.
    supports2 : sequence of genomic range tuples
        Second support region of every area.
    contact_type : str, optional
        Forwarded unchanged to ``_diagsum_asymm``.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    weight_name : str
        name of the balancing weight vector used to count "bad" (masked)
        pixels per diagonal. Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region. Overwrites inference
        of bad bins from the balancing weight [to be implemented].
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> dataframe of diagonal statistics

    """
    # The default is ``None``; without this guard ``transforms.keys()``
    # below raises AttributeError on a plain default call.
    if transforms is None:
        transforms = {}

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    agg_names = ["{}.sum".format(field) for field in fields]
    areas = list(zip(supports1, supports2))
    dtables = make_diag_tables(
        clr, areas, weight_name=weight_name, bad_bins=bad_bins)

    # Start every aggregate column at zero so per-chunk results can be
    # accumulated into it.
    for dt in dtables.values():
        for agg_name in agg_names:
            dt[agg_name] = 0

    job = partial(_diagsum_asymm, clr, fields, transforms, contact_type,
                  supports1, supports2)
    results = map(job, spans)
    for result in results:
        for (i, j), agg in result.items():
            dt = dtables[supports1[i], supports2[j]]
            for field, agg_name in zip(fields, agg_names):
                # fill_value=0 keeps diagonals that are absent from this
                # chunk's aggregate unchanged instead of turning them NaN.
                dt[agg_name] = dt[agg_name].add(agg[field], fill_value=0)

    if ignore_diags:
        for dt in dtables.values():
            for agg_name in agg_names:
                j = dt.columns.get_loc(agg_name)
                dt.iloc[:ignore_diags, j] = np.nan

    return dtables
def blocksum_pairwise(clr,
                      supports,
                      transforms=None,
                      chunksize=1000000,
                      map=map):
    """
    Summary statistics on inter-chromosomal rectangular blocks.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    supports : sequence of genomic range tuples
        Support regions for summation. Blocks for all pairs of support
        regions will be used.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> (field name -> summary).
    Empty when fewer than two supports are given.

    """

    def pairwise_products(per_support):
        # Map every unordered pair of supports (in input order) to the
        # product of their per-support counts.
        return {
            (s1, s2): x1 * x2
            for (s1, x1), (s2, x2) in combinations(
                list(zip(supports, per_support)), 2)
        }

    def n_total_block_elements(clr, supports):
        # Number of bins per support, taken from the cooler extent.
        x = []
        for region in supports:
            lo, hi = clr.extent(region)[:2]
            x.append(hi - lo)
        return pairwise_products(x)

    def n_bad_block_elements(clr, supports):
        # bad bins are ones with the weight vector being NaN:
        x = [np.sum(clr.bins()['weight']
                    .fetch(region)
                    .isnull()
                    .astype(int)
                    .values)
             for region in supports]
        return pairwise_products(x)

    # The default is ``None``; without this guard ``transforms.keys()``
    # below raises AttributeError on a plain default call.
    if transforms is None:
        transforms = {}

    blocks = list(combinations(supports, 2))
    if not blocks:
        # Fewer than two supports means there is no pair to summarise.
        # Unpacking ``zip(*[])`` below would otherwise fail with an
        # unhelpful "not enough values to unpack" error.
        return {}
    supports1, supports2 = list(zip(*blocks))
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ['count'] + list(transforms.keys())

    n_tot = n_total_block_elements(clr, supports)
    n_bad = n_bad_block_elements(clr, supports)
    records = {(c1, c2): defaultdict(int) for (c1, c2) in blocks}
    for c1, c2 in blocks:
        records[c1, c2]['n_valid'] = n_tot[c1, c2] - n_bad[c1, c2]

    job = partial(_blocksum_asymm, clr, fields, transforms,
                  supports1, supports2)
    results = map(job, spans)
    for result in results:
        for (i, j), agg in result.items():
            record = records[supports1[i], supports2[j]]
            for field in fields:
                # ``np.asscalar`` was removed from NumPy; ``.item()`` is
                # the replacement it always delegated to.
                s = agg[field].item()
                # NaN partial sums are skipped so they cannot poison the total.
                if not np.isnan(s):
                    record['{}.sum'.format(field)] += s

    return records