def do_pileup_iccf(
    clr: cooler.Cooler,
    snipping_windows: pd.DataFrame,
    proc: int = 5,
    collapse: bool = True,
) -> np.ndarray:
    """Pile up corrected Hi-C counts (ICCF) over a set of snipping windows.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler file handle the snippets are extracted from.
    snipping_windows : pd.DataFrame
        Snipping windows as constructed by ``assign_regions``.
    proc : int
        Number of worker processes in the pool used for snipping.
    collapse : bool
        If True, return the NaN-aware mean over the third axis of the
        pile (the average window); if False, return the pile of
        individual windows unchanged.

    Returns
    -------
    np.ndarray
        The averaged window (``collapse=True``) or the full pile.
    """
    # the supports for the snipper are derived from the windows themselves
    supports = bioframe.parse_regions(
        get_regions_from_snipping_windows(snipping_windows)
    )
    snipper = cooltools.snipping.CoolerSnipper(clr, regions=supports)
    with multiprocess.Pool(proc) as workers:
        pile = cooltools.snipping.pileup(
            snipping_windows, snipper.select, snipper.snip, map=workers.map
        )
    if not collapse:
        return pile
    # average over all windows, ignoring NaN pixels
    return np.nanmean(pile[:, :, :], axis=2)
def do_pileup_obs_exp(
    clr: cooler.Cooler,
    expected_df: pd.DataFrame,
    snipping_windows: pd.DataFrame,
    proc: int = 5,
    collapse: bool = True,
) -> np.ndarray:
    """Pile up observed/expected values over a set of snipping windows.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler file handle the snippets are extracted from.
    expected_df : pd.DataFrame
        Expected dataframe as constructed by ``getExpected``. It is also
        the input from which the snipper's support regions are derived.
    snipping_windows : pd.DataFrame
        Snipping windows as constructed by ``assign_regions``.
    proc : int
        Number of worker processes in the pool used for snipping.
    collapse : bool
        If True, return the NaN-aware mean over the third axis of the
        pile (the average window); if False, return the pile of
        individual windows unchanged.

    Returns
    -------
    np.ndarray
        The averaged window (``collapse=True``) or the full pile.
    """
    # NOTE(review): the regions come from expected_df, not from
    # snipping_windows — presumably intentional; verify against the helper.
    supports = bioframe.parse_regions(
        get_regions_from_snipping_windows(expected_df)
    )
    snipper = cooltools.snipping.ObsExpSnipper(clr, expected_df, regions=supports)
    # RuntimeWarnings are silenced because cooltools does not check for
    # inf or 0 values in the expected dataframe
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        with multiprocess.Pool(proc) as workers:
            # one obs/exp matrix per snipping window
            pile = cooltools.snipping.pileup(
                snipping_windows, snipper.select, snipper.snip, map=workers.map
            )
    if not collapse:
        return pile
    # average over all windows, ignoring NaN pixels
    return np.nanmean(pile[:, :, :], axis=2)
def extract_windows_different_sizes_iccf(regions, arms, cooler_file, processes=2):
    """Extract corrected Hi-C counts for regions that may differ in size.

    Parameters
    ----------
    regions : DataFrame
        Regions with chrom, start, end (start/end in genomic coordinates).
    arms : DataFrame
        Chromosomal arms used as supports.
    cooler_file : cooler.Cooler
        Opened cooler file.
    processes : int
        Number of worker processes in the pool used for snipping.

    Returns
    -------
    Whatever ``flexible_pileup`` returns for the assigned windows.
    """
    # attach the chromosomal arm to every region; regions without one are dropped
    windows = _assign_supports(regions, bioframe.parse_regions(arms)).dropna()
    snipper = cooltools.snipping.CoolerSnipper(
        cooler_file, regions=bioframe.parse_regions(arms)
    )
    with multiprocess.Pool(processes) as workers:
        return flexible_pileup(
            windows, snipper.select, snipper.snip, mapper=workers.map
        )
def assign_regions(
    window: int,
    binsize: int,
    chroms: pd.Series,
    positions: pd.Series,
    arms: pd.DataFrame,
) -> pd.DataFrame:
    """Build 2d regions around a series of chromosomal locations.

    Each region spans from ``pos - window`` to ``pos + window`` and is
    aligned to Hi-C bins of size ``binsize``. The region centers are given
    by the paired ``chroms`` and ``positions`` series. Every resulting
    window is then annotated with the chromosomal arm from ``arms``.
    """
    # bin-aligned windows around the requested positions
    aligned_windows = cooltools.snipping.make_bin_aligned_windows(
        binsize, chroms.values, positions.values, window
    )
    # annotate each window with its chromosomal arm
    return _assign_supports(aligned_windows, bioframe.parse_regions(arms))
def compute_expected(
    cool_path,
    nproc,
    chunksize,
    output,
    hdf,
    contact_type,
    regions,
    balance,
    weight_name,
    blacklist,
    ignore_diags,
):
    """
    Calculate expected Hi-C signal either for cis or for trans regions
    of chromosomal interaction map.

    When balancing weights are not applied to the data, there is no
    masking of bad bins performed.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    """
    # NOTE(review): the docstring looks like CLI help text (click-style
    # argument names), so parameter notes are kept in comments instead:
    #   cool_path    - path handed to cooler.Cooler
    #   nproc        - >1 runs the computation through an mp.Pool of that size
    #   chunksize    - forwarded to the expected.* summation routines
    #   output       - path of the TSV to write; falsy prints TSV to stdout
    #   hdf          - truthy raises NotImplementedError (after TSV output)
    #   contact_type - "cis" (diagsum) or "trans" (blocksum_asymm)
    #   regions      - None for whole chromosomes, otherwise a tab-delimited
    #                  file with 3 or 4 columns (chrom, start, end[, name])
    #   balance      - truthy adds a "balanced" transform using weight_name
    #   weight_name  - name of the balancing weight column
    #   blacklist    - must be None (custom masking is not implemented)
    #   ignore_diags - forwarded to expected.diagsum (cis only)
    #
    # Raises NotImplementedError for blacklist / hdf, and ValueError for a
    # malformed region file, an unknown contact_type, or fewer than two
    # regions in trans mode.
    if blacklist is not None:
        raise NotImplementedError(
            "Custom genomic regions for masking from calculation of expected "
            "are not implemented."
        )
    # Validate before any file or pool is opened; previously an unknown value
    # only surfaced later as an unbound `result`.
    if contact_type not in ("cis", "trans"):
        raise ValueError(
            "Incorrect contact_type: {}, "
            "allowed values are 'cis' or 'trans'.".format(contact_type)
        )

    # use blacklist-ing from cooler balance module
    # https://github.com/mirnylab/cooler/blob/843dadca5ef58e3b794dbaf23430082c9a634532/cooler/cli/balance.py#L175
    clr = cooler.Cooler(cool_path)

    if regions is None:
        # default: one region per whole chromosome, named after the chromosome
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames]
        regions = parse_regions(regions)
        regions["name"] = clr.chromnames
    else:
        regions_buf, _ = util.sniff_for_header(regions)
        regions = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns. "
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        if regions.shape[1] == 4:
            regions = regions.rename(
                columns={0: "chrom", 1: "start", 2: "end", 3: "name"}
            )
        else:
            regions = regions.rename(columns={0: "chrom", 1: "start", 2: "end"})
            # no name column supplied: synthesize UCSC-style "chrom:start-end"
            regions["name"] = list(
                regions.apply(lambda x: "{}:{}-{}".format(*x), axis=1)
            )
        regions = parse_regions(regions)

    # define transforms - balanced and raw ('count') for now
    if balance:
        weight1 = weight_name + "1"
        weight2 = weight_name + "2"
        transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
    else:
        # no masking bad bins of any kind, when balancing is not applied
        weight_name = None
        transforms = {}

    # execution details
    pool = mp.Pool(nproc) if nproc > 1 else None
    map_ = pool.map if pool is not None else map

    # using try-clause to close mp.Pool properly
    try:
        if contact_type == "cis":
            result = expected.diagsum(
                clr,
                regions,
                transforms=transforms,
                weight_name=weight_name,
                bad_bins=None,
                chunksize=chunksize,
                ignore_diags=ignore_diags,
                map=map_,
            )
        else:
            # prepare pairwise combinations of regions for trans-expected
            # (blocksum):
            regions_pairwise = list(
                combinations(regions.itertuples(index=False), 2)
            )
            if not regions_pairwise:
                raise ValueError(
                    "At least two regions are required to compute trans-expected."
                )
            regions1, regions2 = zip(*regions_pairwise)
            result = expected.blocksum_asymm(
                clr,
                regions1=pd.DataFrame(regions1),
                regions2=pd.DataFrame(regions2),
                transforms=transforms,
                weight_name=weight_name,
                bad_bins=None,
                chunksize=chunksize,
                map=map_,
            )
    finally:
        if pool is not None:
            pool.close()

    # calculate actual averages by dividing sum by n_valid:
    result["count.avg"] = result["count.sum"] / result["n_valid"]
    for key in transforms:
        result[key + ".avg"] = result[key + ".sum"] / result["n_valid"]

    # output to file if specified, or print into stdout otherwise:
    if output:
        result.to_csv(output, sep="\t", index=False, na_rep="nan")
    else:
        print(result.to_csv(sep="\t", index=False, na_rep="nan"))

    # would be nice to have some binary output to preserve precision.
    # to_hdf/read_hdf should work in this case as the file is small.
    # still debated as to how should we store it - store in cooler seems
    # to be consensus:
    if hdf:
        raise NotImplementedError("hdf output is to be implemented")
def make_saddle(
    getmatrix,
    binedges,
    digitized,
    contact_type,
    regions=None,
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Build a matrix of summed interaction probabilities between genomic bin
    pairs, grouped by the value of a digitized genomic track.

    The genomic track must already be quantized to integers (digitized).

    Parameters
    ----------
    getmatrix : function
        Returns the interaction matrix between two chromosomes given their
        names/indices.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal; `n` bins have `n + 1` edges.
        See :func:`digitize_track`.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of the digitized signal and the name of the
        column that holds the digitized values.
    contact_type : str
        'cis' uses only cis interactions, 'trans' only trans interactions.
    regions : sequence of str or tuple, optional
        Genomic regions to use: chromosome names, UCSC-style region strings
        or tuples. Defaults to the extent of each chromosome in the track.
    min_diag : int
        Smallest diagonal to include. Ignored with contact_type=trans.
    max_diag : int
        Biggest diagonal to include. Ignored with contact_type=trans.
    trim_outliers : bool, optional
        Drop the first and last row and column of the output matrices.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        Summed interaction probability between pairs of track-value bins.
    interaction_count : 2D array
        Number of genomic bin pairs that contributed to each pixel of
        ``interaction_sum``.

    Raises
    ------
    ValueError
        If `contact_type` is neither 'cis' nor 'trans'.
    """
    track_df, value_col = digitized
    track_df = track_df[["chrom", "start", "end", value_col]]

    if regions is None:
        # fall back to the span covered by the track on every chromosome
        regions = [
            (chrom, grp.start.min(), grp.end.max())
            for chrom, grp in track_df.groupby("chrom")
        ]
    regions = bioframe.parse_regions(regions)

    # map region name (4th field of a parsed region) -> digitized values
    tracks_by_region = {
        row[3]: bioframe.select(track_df, row)[value_col] for row in regions.values
    }

    if contact_type == "cis":
        pairs = list(zip(regions["name"], regions["name"]))
    elif contact_type == "trans":
        pairs = list(combinations(regions["name"], 2))
    else:
        raise ValueError(
            "The allowed values for the contact_type "
            "argument are 'cis' or 'trans'."
        )

    # two extra open-ended bins hold values <lo and >hi
    size = len(binedges) + 1
    interaction_sum = np.zeros((size, size))
    interaction_count = np.zeros((size, size))

    for name_a, name_b in pairs:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            tracks_by_region,
            name_a,
            name_b,
            min_diag,
            max_diag,
            verbose,
        )

    # symmetrize both accumulators in place
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if not trim_outliers:
        return interaction_sum, interaction_count
    return interaction_sum[1:-1, 1:-1], interaction_count[1:-1, 1:-1]
def blocksum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=1000000,
    map=map,
):
    """
    Summary statistics on rectangular blocks of genomic regions.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions for block summation
    regions2 : sequence of genomic range tuples
        "right"-side support regions for block summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        The default dict is never mutated here.
    weight_name : str
        name of the balancing weight vector used to count
        "bad"(masked) pixels per block.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Combines with the list of bad bins from balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame with one row per block and the columns
    region1, region2, n_valid, count.sum, followed by one
    "<key>.sum" column per entry of `transforms`.

    """
    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    # Output columns come from `fields`, fixed before `transforms` may be
    # rebound to the masked versions below. The masked dict also carries a
    # "count" entry, so deriving the columns from it listed "count.sum" twice.
    sum_columns = ["{}.sum".format(field) for field in fields]

    # similar with diagonal summations, pre-generate a block_table listing
    # all of the rectangular blocks and "n_valid" number of pixels per each block:
    records = make_block_table(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    # combine masking with existing transforms and add a "count" transform:
    if bad_bins is not None:
        # turn bad_bins into a 0/1 mask of size clr.bins (0 marks a bad bin):
        mask_size = len(clr.bins())
        bad_bins_mask = np.ones(mask_size, dtype=int)
        bad_bins_mask[bad_bins] = 0
        masked_transforms = {}
        bin1 = "bin1_id"
        bin2 = "bin2_id"
        for field in fields:
            if field in transforms:
                # combine masking and transform; defaults bind the current
                # values to dodge late-binding closures:
                t = transforms[field]
                masked_transforms[field] = (
                    lambda p, t=t, m=bad_bins_mask: t(p) * m[p[bin1]] * m[p[bin2]]
                )
            else:
                # presumably field == "count", mind the scope as well:
                masked_transforms[field] = (
                    lambda p, f=field, m=bad_bins_mask: p[f] * m[p[bin1]] * m[p[bin2]]
                )
        # substitute transforms to the masked_transforms:
        transforms = masked_transforms

    job = partial(
        _blocksum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    results = map(job, spans)
    for result in results:
        for i, agg in result.items():
            n1 = regions1.loc[i, "name"]
            n2 = regions2.loc[i, "name"]
            for field, agg_name in zip(fields, sum_columns):
                s = agg[field].item()
                # NaN partial sums are skipped so they do not poison the total
                if not np.isnan(s):
                    records[n1, n2][agg_name] += s

    # returning a dataframe for API consistency:
    return pd.DataFrame(
        [{"region1": n1, "region2": n2, **rec} for (n1, n2), rec in records.items()],
        columns=["region1", "region2", "n_valid"] + sum_columns,
    )
def diagsum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    map=map,
):
    """
    Per-diagonal summary statistics over asymmetric rectangular blocks.

    Matching elements of `regions1` and `regions2` define the blocks.
    Only intra-chromosomal blocks are supported.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions for diagonal summation
    regions2 : sequence of genomic range tuples
        "right"-side support regions for diagonal summation
    transforms : dict of str -> callable, optional
        Transformations applied to pixels. Each result is stored in a
        temporary column named by the key. A callable receives the current
        chunk of the (annotated) pixel dataframe.
    weight_name : str
        Balancing weight vector used to count "bad" (masked) pixels per
        diagonal. Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        Bins to ignore per support region; combined with the bad bins
        implied by the balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame with summary statistic of every diagonal of every block:
    region1, region2, diag, n_valid, count.sum

    """
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    sum_names = {field: "{}.sum".format(field) for field in fields}

    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    dtables = make_diag_tables(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    if bad_bins is not None:
        # 0/1 vector over all bins of the cooler; 0 marks a bad bin
        keep = np.ones(len(clr.bins()), dtype=int)
        keep[bad_bins] = 0
        masked = {}
        for field in fields:
            if field in transforms:
                # wrap the user transform; defaults freeze the current values
                masked[field] = (
                    lambda p, t=transforms[field], m=keep: t(p)
                    * m[p["bin1_id"]]
                    * m[p["bin2_id"]]
                )
            else:
                # presumably the raw "count" column
                masked[field] = (
                    lambda p, f=field, m=keep: p[f]
                    * m[p["bin1_id"]]
                    * m[p["bin2_id"]]
                )
        # from here on every field goes through its masked transform
        transforms = masked

    # start every accumulator column at zero
    for table in dtables.values():
        for field in fields:
            table[sum_names[field]] = 0

    job = partial(
        _diagsum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    for chunk_result in map(job, spans):
        for i, agg in chunk_result.items():
            key = (regions1.loc[i, "name"], regions2.loc[i, "name"])
            table = dtables[key]
            for field in fields:
                column = sum_names[field]
                table[column] = table[column].add(agg[field], fill_value=0)

    # flatten into a single dataframe for API consistency
    frames = []
    for (name1, name2), table in dtables.items():
        flat = table.reset_index()
        flat.insert(0, "region1", name1)
        flat.insert(1, "region2", name2)
        frames.append(flat)
    return pd.concat(frames).reset_index(drop=True)
def diagsum(
    clr,
    regions,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    ignore_diags=2,
    map=map,
):
    """
    Per-diagonal summary statistics within intra-chromosomal regions.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions : sequence of genomic range tuples
        Support regions for intra-chromosomal diagonal summation
    transforms : dict of str -> callable, optional
        Transformations applied to pixels. Each result is stored in a
        temporary column named by the key. A callable receives the current
        chunk of the (annotated) pixel dataframe.
    weight_name : str
        Balancing weight vector used to count "bad" (masked) pixels per
        diagonal. Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        Bins to ignore per support region; combined with the bad bins
        implied by the balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals whose sums are set to NaN
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    Dataframe of diagonal statistics for all regions

    """
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    sum_names = {field: "{}.sum".format(field) for field in fields}

    regions = bioframe.parse_regions(regions, clr.chromsizes)

    dtables = make_diag_tables(clr, regions, weight_name=weight_name, bad_bins=bad_bins)

    if bad_bins is not None:
        # 0/1 vector over all bins of the cooler; 0 marks a bad bin
        keep = np.ones(len(clr.bins()), dtype=int)
        keep[bad_bins] = 0
        masked = {}
        for field in fields:
            if field in transforms:
                # wrap the user transform; defaults freeze the current values
                masked[field] = (
                    lambda p, t=transforms[field], m=keep: t(p)
                    * m[p["bin1_id"]]
                    * m[p["bin2_id"]]
                )
            else:
                # presumably the raw "count" column
                masked[field] = (
                    lambda p, f=field, m=keep: p[f]
                    * m[p["bin1_id"]]
                    * m[p["bin2_id"]]
                )
        # from here on every field goes through its masked transform
        transforms = masked

    # start every accumulator column at zero
    for table in dtables.values():
        for field in fields:
            table[sum_names[field]] = 0

    job = partial(_diagsum_symm, clr, fields, transforms, regions.values)
    for chunk_result in map(job, spans):
        for i, agg in chunk_result.items():
            table = dtables[regions.loc[i, "name"]]
            for field in fields:
                column = sum_names[field]
                table[column] = table[column].add(agg[field], fill_value=0)

    if ignore_diags:
        # blank out the sums of the first `ignore_diags` diagonals
        for table in dtables.values():
            for field in fields:
                col_pos = table.columns.get_loc(sum_names[field])
                table.iloc[:ignore_diags, col_pos] = np.nan

    # flatten into a single dataframe for API consistency
    frames = []
    for region_name, table in dtables.items():
        flat = table.reset_index()
        flat.insert(0, "region", region_name)
        frames.append(flat)
    return pd.concat(frames).reset_index(drop=True)
def make_block_table(clr, regions1, regions2, weight_name="weight", bad_bins=None):
    """
    Creates a table that characterizes a set of rectangular genomic blocks
    formed by combining regions from regions1 and regions2.
    For every block the number of "valid" pixels ("n_valid") is stored,
    computed as the block's total pixel count minus its "bad" pixels.
    "Bad" pixels are inferred from the balancing weight column
    `weight_name` and/or provided directly as an array of `bad_bins`.

    Setting `weight_name` and `bad_bins` to `None` yields 0 "bad" pixels per
    block.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler
    regions1 : iterable
        a collection of genomic regions
    regions2 : iterable
        a collection of genomic regions
    weight_name : str
        name of the weight vector in the "bins" table; bins with a null
        weight are treated as bad. If `None`, only `bad_bins` are used.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        "bad_bins" will be combined with the bad bins masked by balancing
        if there are any.

    Returns
    -------
    block_table : dict
        maps (name1, name2) of each block to a defaultdict(int) that
        holds the block's "n_valid".

    Raises
    ------
    KeyError
        If `weight_name` is not a column of the bins table.
    ValueError
        If `weight_name` is neither a `str` nor `None`.
    """
    bad_bins = np.asarray([] if bad_bins is None else bad_bins).astype(int)

    regions1 = bioframe.parse_regions(regions1, clr.chromsizes).values
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes).values

    # should we check for nestedness here, or that each region1 is < region2 ?

    block_table = {}
    for r1, r2 in zip(regions1, regions2):
        chrom1, start1, end1, name1 = r1
        chrom2, start2, end2, name2 = r2
        # translate regions into bin-id extents:
        lo1, hi1 = clr.extent((chrom1, start1, end1))
        lo2, hi2 = clr.extent((chrom2, start2, end2))
        # width and height of a block:
        x = hi1 - lo1
        y = hi2 - lo2
        # get "regional" bad_bins for each of the regions, offset to the
        # start of the region:
        bx = bad_bins[(bad_bins >= lo1) & (bad_bins < hi1)] - lo1
        by = bad_bins[(bad_bins >= lo2) & (bad_bins < hi2)] - lo2

        # now we need to combine it with the balancing weights
        if weight_name is None:
            bad_bins_x = len(bx)
            bad_bins_y = len(by)
        elif isinstance(weight_name, str):
            if weight_name not in clr.bins().columns:
                # f-string so the offending name actually shows up
                raise KeyError(f"Balancing weight {weight_name} not found!")
            # extract "bad" bins filtered by balancing; np.array makes a
            # private writable copy for the in-place update below:
            cb_bins_x = np.array(clr.bins()[weight_name][lo1:hi1].isnull())
            cb_bins_y = np.array(clr.bins()[weight_name][lo2:hi2].isnull())
            # combine with "bad_bins" using assignment:
            cb_bins_x[bx] = True
            cb_bins_y[by] = True
            # count the final number of bad bins per side:
            bad_bins_x = np.count_nonzero(cb_bins_x)
            bad_bins_y = np.count_nonzero(cb_bins_y)
        else:
            raise ValueError("`weight_name` can be `str` or `None`")

        # calculate total and bad pixels per block:
        n_tot = count_all_pixels_per_block(x, y)
        n_bad = count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y)

        # fill in "block_table" with number of valid pixels; defaultdict(int)
        # lets callers accumulate further sums with "+=":
        block_table[name1, name2] = defaultdict(int)
        block_table[name1, name2]["n_valid"] = n_tot - n_bad

    return block_table
def make_diag_tables(clr, regions, regions2=None, weight_name="weight", bad_bins=None):
    """
    For every support region infer diagonals that intersect this region
    and calculate the size of these intersections in pixels, both "total" and
    "n_valid", where "n_valid" does not include "bad" bins into counting.

    "Bad" pixels are inferred from the balancing weight column `weight_name` or
    provided directly in the form of an array `bad_bins`.

    Setting `weight_name` and `bad_bins` to `None` yields 0 "bad" pixels per
    diagonal per support region.

    When `regions2` are provided, all intersecting diagonals are reported for
    each rectangular and asymmetric block defined by combinations of matching
    elements of `regions` and `regions2`.
    Otherwise only `regions`-based symmetric square blocks are considered.
    Only intra-chromosomal regions are supported.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler
    regions : list
        a list of genomic support regions
    regions2 : list
        a list of genomic support regions for asymmetric regions
    weight_name : str
        name of the weight vector in the "bins" table; bins with a null
        weight are treated as bad. If `None`, no bin is bad by weight.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        "bad_bins" will be combined with the bad bins masked by balancing
        if there are any.

    Returns
    -------
    diag_tables : dict
        dictionary with DataFrames of relevant diagonals for every support,
        keyed by region name, or by (name1, name2) when `regions2` is given.

    Raises
    ------
    KeyError
        If `weight_name` is not a column of the bins table.
    ValueError
        If `weight_name` is neither `str` nor `None`, or if `bad_bins`
        cannot be used to index the bins table.
    """

    regions = bioframe.parse_regions(regions, clr.chromsizes).values
    if regions2 is not None:
        regions2 = bioframe.parse_regions(regions2, clr.chromsizes).values

    bins = clr.bins()[:]
    if weight_name is None:
        # ignore bad bins: an all-False mask per chromosome
        sizes = dict(bins.groupby("chrom").size())
        bad_bin_dict = {
            chrom: np.zeros(sizes[chrom], dtype=bool) for chrom in sizes
        }
    elif isinstance(weight_name, str):
        # using balancing weight to infer bad bins
        if weight_name not in clr.bins().columns:
            # f-string so the offending name actually shows up
            raise KeyError(f"Balancing weight {weight_name} not found!")
        groups = dict(iter(bins.groupby("chrom")[weight_name]))
        bad_bin_dict = {
            chrom: np.array(groups[chrom].isnull()) for chrom in groups
        }
    else:
        raise ValueError("`weight_name` can be `str` or `None`")

    # combine custom "bad_bins" with "bad_bin_dict":
    if bad_bins is not None:
        # check if "bad_bins" are legit:
        try:
            bad_bins_chrom = bins.iloc[bad_bins].reset_index(drop=False)
        except IndexError as err:
            raise ValueError(
                "Provided `bad_bins` are incorrect or out-of-bound"
            ) from err
        # group them by observed chromosomes only
        bad_bins_grp = bad_bins_chrom[["index", "chrom"]].groupby(
            "chrom", observed=True
        )
        # update "bad_bin_dict" with "bad_bins" for each chrom:
        for chrom, bin_ids in bad_bins_grp["index"]:
            co = clr.offset(chrom)
            # adjust by chromosome offset
            bad_bin_dict[chrom][bin_ids.values - co] = True

    diag_tables = {}
    for i, (chrom, start1, end1, name1) in enumerate(regions):
        if regions2 is not None:
            chrom2, start2, end2, name2 = regions2[i]
            # cis-only for now:
            assert chrom2 == chrom
            newname = (name1, name2)
        else:
            start2, end2 = start1, end1
            newname = name1

        # translate regions into bin ids relative to the chromosome start:
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        bad_mask = bad_bin_dict[chrom]
        diag_tables[newname] = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])

    return diag_tables
def compute_saddle(
    cool_path,
    track_path,
    expected_path,
    contact_type,
    min_dist,
    max_dist,
    n_bins,
    quantiles,
    range_,
    qrange,
    weight_name,
    strength,
    regions,
    out_prefix,
    fig,
    scale,
    cmap,
    vmin,
    vmax,
    hist_color,
    verbose,
):
    """
    Calculate saddle statistics and generate saddle plots for an arbitrary
    signal track on the genomic bins of a contact matrix.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : The path to bedGraph-like file with a binned compartment
    track (eigenvector), including a header. Use the '::' syntax to specify
    a column name.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in TRACK_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH and EXPECTED_PATH.

    COOL_PATH, TRACK_PATH and EXPECTED_PATH must be binned at the same
    resolution (expect for EXPECTED_PATH in case of trans contact type).

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'chrom', 'diag', 'n_valid', value_name and the following columns for trans
    contacts: 'chrom1', 'chrom2', 'n_valid', value_name value_name is controlled
    using options. Header must be present in a file.

    """
    # NOTE(review): the docstring looks like CLI help text (click-style
    # argument names), so parameter notes are kept in comments instead:
    #   cool_path     - path handed to cooler.Cooler; its basename titles the plot
    #   track_path    - pair (path, value column name)
    #   expected_path - pair (path, value column name)
    #   contact_type  - "cis" or "trans"
    #   min_dist/max_dist - genomic distances converted to diagonals via
    #                   clr.binsize; negative values select the defaults 3 / -1
    #   n_bins        - number of points passed to np.linspace for the edges
    #   quantiles     - truthy: bin edges are quantiles of the track
    #   range_/qrange - (lo, hi) pairs in track units / quantiles; empty = unset
    #   weight_name   - forwarded to saddle.mask_bad_bins and the fetchers
    #   strength      - truthy: also save the saddle strength profile
    #   regions       - None for whole chromosomes, otherwise a tab-delimited
    #                   file with 3 or 4 columns (chrom, start, end[, name])
    #   out_prefix    - prefix for ".saddledump", ".digitized.tsv" and figures
    #   fig           - sequence of figure file extensions; empty = no figure
    #   scale, cmap, vmin, vmax, hist_color - plotting options; vmin and vmax
    #                   must be positive even when no figure is requested
    #   verbose       - forwarded to pd.read_table
    clr = cooler.Cooler(cool_path)
    expected_path, expected_name = expected_path
    track_path, track_name = track_path

    if regions is None:
        # default: one region per whole chromosome, named after the chromosome
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames]
        regions = parse_regions(regions)
        regions["name"] = clr.chromnames
    else:
        regions_buf, _ = util.sniff_for_header(regions)
        regions = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns. "
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        if regions.shape[1] == 4:
            regions = regions.rename(
                columns={0: "chrom", 1: "start", 2: "end", 3: "name"}
            )
        else:
            regions = regions.rename(columns={0: "chrom", 1: "start", 2: "end"})
            # no name column supplied: synthesize UCSC-style "chrom:start-end"
            regions["name"] = list(
                regions.apply(lambda x: "{}:{}-{}".format(*x), axis=1)
            )
        regions = parse_regions(regions)

    if vmin <= 0 or vmax <= 0:
        raise ValueError(
            "vmin and vmax values are in original units irrespective "
            "of used scale, and therefore should be positive"
        )

    # read expected and make preparations for validation,
    # it's contact_type dependent. The builtin `str` replaces the `np.str`
    # alias, which was removed from NumPy.
    if contact_type == "cis":
        # that's what we expect as column names:
        expected_columns = ["region", "diag", "n_valid", expected_name]
        # what would become a MultiIndex:
        expected_index = ["region", "diag"]
        # expected dtype as a rudimentary form of validation:
        expected_dtype = {
            "region": str,
            "diag": np.int64,
            "n_valid": np.int64,
            expected_name: np.float64,
        }
    elif contact_type == "trans":
        # that's what we expect as column names:
        expected_columns = ["region1", "region2", "n_valid", expected_name]
        # what would become a MultiIndex:
        expected_index = ["region1", "region2"]
        # expected dtype as a rudimentary form of validation:
        expected_dtype = {
            "region1": str,
            "region2": str,
            "n_valid": np.int64,
            expected_name: np.float64,
        }
    else:
        raise ValueError(
            "Incorrect contact_type: {}, ".format(contact_type)
            + "Should have been caught by click."
        )

    # negative distances mean "not specified" and select the defaults
    if min_dist < 0:
        min_diag = 3
    else:
        min_diag = int(np.ceil(min_dist / clr.binsize))

    if max_dist >= 0:
        max_diag = int(np.floor(max_dist / clr.binsize))
    else:
        max_diag = -1

    # use 'usecols' as a rudimentary form of validation,
    # and dtype. Keep 'comment' and 'verbose' - explicit,
    # as we may use them later:
    expected = pd.read_table(
        expected_path,
        usecols=expected_columns,
        index_col=expected_index,
        dtype=expected_dtype,
        comment=None,
        verbose=verbose,
    )

    # read bedGraph-file :
    track_columns = ["chrom", "start", "end", track_name]
    # specify dtype as a rudimentary form of validation:
    track_dtype = {
        "chrom": str,
        "start": np.int64,
        "end": np.int64,
        track_name: np.float64,
    }
    track = pd.read_table(
        track_path,
        usecols=track_columns,
        dtype=track_dtype,
        comment=None,
        verbose=verbose,
    )

    #############################################
    # CROSS-VALIDATE COOLER, EXPECTED AND TRACK:
    #############################################
    # TRACK vs COOLER:
    track_chroms = track["chrom"].unique()
    # We might want to try this eventually:
    # https://github.com/TMiguelT/PandasSchema
    # do simple column-name validation for now:
    if not set(track_chroms).issubset(clr.chromnames):
        raise ValueError(
            "Chromosomes in {} must be subset of ".format(track_path)
            + "chromosomes in cooler {}".format(cool_path)
        )
    # check number of bins:
    track_bins = len(track)
    cool_bins = clr.bins()[:]["chrom"].isin(track_chroms).sum()
    if track_bins != cool_bins:
        raise ValueError(
            "Number of bins is not matching: "
            "{} in {}, and {} in {} for chromosomes {}".format(
                track_bins, track_path, cool_bins, cool_path, track_chroms
            )
        )
    # TODO: cross-validate EXPECTED vs TRACK as well (track chromosomes must
    # be a subset of those in expected and, for cis only, the bin counts
    # must match). The previous implementation of this check was disabled.
    #############################################
    # CROSS-VALIDATION IS COMPLETE.
    #############################################

    track = saddle.mask_bad_bins((track, track_name), (clr.bins()[:], weight_name))
    if contact_type == "cis":
        getmatrix = saddle.make_cis_obsexp_fetcher(
            clr, (expected, expected_name), weight_name=weight_name
        )
    elif contact_type == "trans":
        getmatrix = saddle.make_trans_obsexp_fetcher(
            clr, (expected, expected_name), weight_name=weight_name
        )

    # bin edges: either quantiles of the track or evenly spaced track values
    if quantiles:
        if len(range_):
            qlo, qhi = saddle.ecdf(track[track_name], range_)
        elif len(qrange):
            qlo, qhi = qrange
        else:
            qlo, qhi = 0.0, 1.0
        q_edges = np.linspace(qlo, qhi, n_bins)
        binedges = saddle.quantile(track[track_name], q_edges)
    else:
        if len(range_):
            lo, hi = range_
        elif len(qrange):
            lo, hi = saddle.quantile(track[track_name], qrange)
        else:
            lo, hi = track[track_name].min(), track[track_name].max()
        binedges = np.linspace(lo, hi, n_bins)

    digitized, hist = saddle.digitize_track(
        binedges, track=(track, track_name), regions=track_chroms
    )

    S, C = saddle.make_saddle(
        getmatrix,
        binedges,
        (digitized, track_name + ".d"),
        contact_type=contact_type,
        regions=regions,
        min_diag=min_diag,
        max_diag=max_diag,
    )

    saddledata = S / C

    to_save = dict(saddledata=saddledata, binedges=binedges, hist=hist)

    if strength:
        ratios = saddle.saddle_strength(S, C)
        ratios = ratios[1:-1]  # drop outlier bins
        to_save["saddle_strength"] = ratios

    # Save data
    np.savez(out_prefix + ".saddledump", **to_save)  # .npz auto-added
    digitized.to_csv(out_prefix + ".digitized.tsv", sep="\t", index=False)

    # Generate figure
    if len(fig):
        try:
            import matplotlib as mpl

            mpl.use("Agg")  # savefig only for now:
            import matplotlib.pyplot as plt
        except ImportError:
            print("Install matplotlib to use ", file=sys.stderr)
            sys.exit(1)

        if hist_color is None:
            color = (
                0.41568627450980394,
                0.8,
                0.39215686274509803,
            )  # sns.color_palette('muted')[2]
        else:
            color = mpl.colors.colorConverter.to_rgb(hist_color)
        title = op.basename(cool_path) + " ({})".format(contact_type)
        if quantiles:
            edges = q_edges
            track_label = track_name + " quantiles"
        else:
            edges = binedges
            track_label = track_name
        clabel = "(contact frequency / expected)"

        saddle.saddleplot(
            edges,
            hist,
            saddledata,
            scale=scale,
            vmin=vmin,
            vmax=vmax,
            color=color,
            title=title,
            xlabel=track_label,
            ylabel=track_label,
            clabel=clabel,
            cmap=cmap,
        )

        for ext in fig:
            plt.savefig(out_prefix + "." + ext, bbox_inches="tight")
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    map=map,
):
    """
    Compute compartment eigenvector for a given cooler `clr` in a number of
    symmetric intra chromosomal regions (cis-regions), or for each chromosome.

    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    bins : DataFrame
        table of bins derived from clr with phasing track added
    regions : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions only,
        otherwise chromosome-wide eigenvectors are computed, for chromosomes
        specified in bins.
    n_eigs : int
        number of eigenvectors to compute
    phasing_track_col : str, optional
        name of the columns in `bins` table, if provided, eigenvectors are
        flipped to achieve a positive correlation with
        `bins[phasing_track_col]`.
    balance : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using the
        specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median Absolute
        Deviation from the median of `eigvecs` explained by `phasing_track`
        (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative eigenvectors.
        Off by default.
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors filled in the `bins` table.

    Raises
    ------
    ValueError
        if `phasing_track_col` is requested but is not a column of `bins`.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible mitigations:
              employ `regions` (e.g. arms) to avoid issues with chromosomal
              arms, use `bad_bins` to ignore small translocations.
    """
    # get chromosomes from bins, if regions not specified:
    if regions is None:
        regions = list(bins["chrom"].unique())  # parse_regions fill in the rest

    # make sure phasing_track_col is in bins, if phasing is requested
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(f'No column "{phasing_track_col}" in the bin table')

    # regions to dataframe
    regions = bioframe.parse_regions(regions, clr.chromsizes)

    # ignore diags as in cooler unless specified
    # NOTE(review): the attribute path is hard-coded to "bins/weight" even
    # when `balance` names a different weight column - confirm this is intended.
    if ignore_diags is None:
        ignore_diags = clr._load_attrs("bins/weight").get("ignore_diags", 2)

    # `bad_bins` is documented as array-like: coerce it once so that the
    # vectorized comparisons in `_each` also work for plain lists/tuples.
    if bad_bins is not None:
        bad_bins = np.asarray(bad_bins)

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = regions.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this function.

        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)

        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            the (chrom, start, end) part of the region, an array of
            eigenvalues and an array of eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        A = clr.matrix(balance=balance).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins is not None:
            # filter bad_bins for the _region and turn relative:
            lo, hi = clr.extent(_region)
            in_region = (bad_bins >= lo) & (bad_bins < hi)
            bad_bins_region = bad_bins[in_region] - lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        phasing_track = (
            bioframe.select(bins, _region)[phasing_track_col].values
            if phasing_track_col
            else None
        )

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, regions.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table
    for _region, _eigvals, _eigvecs in results:
        # `.loc` (not the scalar-only `.at`) is required here because whole
        # row/column blocks are assigned at once.
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.loc[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
def call_compartments(
    cool_path,
    reference_track,
    regions,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.

    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    Parameters
    ----------
    cool_path : str
        path handed to ``cooler.Cooler``.
    reference_track : tuple or None
        ``(track_path, col)`` pair, where ``col`` is either an integer column
        position or a column name; ``None`` disables phasing.
    regions : str or None
        path to a tab-separated file with three or four columns
        (chrom, start, end and optional name); if ``None``, whole chromosomes
        referred to in the track are used.
    contact_type : str
        ``"cis"`` or ``"trans"``.
    n_eigs : int
        number of eigenvectors to compute.
    verbose : bool
        forwarded to ``pd.read_table`` when reading the reference track.
    out_prefix : str
        prefix of the output files
        ``<out_prefix>.<contact_type>.lam.txt`` (eigenvalues) and
        ``<out_prefix>.<contact_type>.vecs.tsv`` (eigenvectors).
    bigwig : bool
        if true, the ``E1`` column is additionally written to
        ``<out_prefix>.<contact_type>.bw``.

    Raises
    ------
    click.BadParameter
        if the requested track column cannot be located.
    ValueError
        if merging the track with the cooler bins produces extra rows, if the
        regions file does not have three or four columns, or if
        ``contact_type`` is neither ``"cis"`` nor ``"trans"``.
    NotImplementedError
        if ``regions`` is combined with the ``"trans"`` contact type.
    """
    clr = cooler.Cooler(cool_path)

    if reference_track is not None:
        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # without a header a column can only be addressed by position
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            if isinstance(col, int):
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            else:
                if col not in names:
                    raise click.BadParameter(
                        'Column "{}" not found in header "{}"'.format(
                            col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(
            buf,
            dtype={
                "chrom": str,
                "start": np.int64,
                "end": np.int64,
                track_name: np.float64,
            },
            comment="#",
            verbose=verbose,
            **kwargs,
        )

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(
            left=clr.bins()[:],
            right=track_df,
            how="left",
            on=["chrom", "start", "end"],
        )

        # sanity check would be to check if len(bins) becomes > than nbins ...
        # that would imply there was something in the track_df that didn't match
        # ["chrom", "start", "end"] - keys from the c.bins()[:] .
        if len(track) > len(clr.bins()):
            # the exception must actually be raised, otherwise the check is a no-op
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define regions for cis compartment-calling
    # use input "regions" BED file or all chromosomes mentioned in "track":
    track_chroms = track["chrom"].unique()
    if regions is None:
        # use full chromosomes referred to in the track :
        cis_regions_table = bioframe.parse_regions(track_chroms, clr.chromsizes)
        cis_regions_table["name"] = cis_regions_table["chrom"]
    else:
        if contact_type == "trans":
            raise NotImplementedError(
                "Regions not yet supported with trans contact type")
        # Flexible reading of the regions table:
        regions_buf, _ = sniff_for_header(regions)
        cis_regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        if cis_regions_table.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns."
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        # `rename` ignores mapping keys that are absent, so one mapping
        # covers both the three- and the four-column layout.
        cis_regions_table = cis_regions_table.rename(
            columns={0: "chrom", 1: "start", 2: "end", 3: "name"})
        cis_regions_table = bioframe.parse_regions(cis_regions_table)
        # make sure custom regions are compatible with the track:
        cis_regions_table = cis_regions_table[
            cis_regions_table["chrom"].isin(track_chroms)
        ].reset_index(drop=True)

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            regions=cis_regions_table,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    elif contact_type == "trans":
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )
    else:
        # fail with a clear message instead of a NameError further down
        raise ValueError(
            f'Unknown contact type "{contact_type}", expected "cis" or "trans"')

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t",
                   index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t",
                        index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )
def call_dots(
    cool_path,
    expected_path,
    regions,
    expected_name,
    weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap that are not larger than max_loci_separation.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The paths to a tsv-like file with expected cis-expected.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH. Also chromosomes refered to in EXPECTED_PATH must be non-trivial,
    i.e., contain not-NaN signal. Thus, make sure to prune your EXPECTED_PATH
    before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'region', 'diag', 'n_valid', value_name. value_name is controlled using
    options. Header must be present in a file.

    Parameters
    ----------
    cool_path : str
        path handed to ``cooler.Cooler``.
    expected_path : str
        path to the tab-separated expected table.
    regions : str or None
        path to a tab-separated file with three or four columns
        (chrom, start, end and optional name); if ``None``, regions are
        derived from the 'region' column of the expected table.
    expected_name : str
        name of the value column in the expected table.
    weight_name : str
        forwarded to the dotfinder scoring steps.
    nproc : int
        forwarded to the dotfinder scoring steps.
    max_loci_separation, tile_size : int
        distances that are converted to bins by dividing by the cooler
        bin size.
    max_nans_tolerated : int
        must not exceed ``2 * w``, where ``w`` is the kernel width in use.
    kernel_width, kernel_peak : int or None
        kernel parameters; if either is ``None``, values recommended by
        ``dotfinder.recommend_kernel_params`` are used.
    num_lambda_chunks : int
        number of lambda-chunks; must be between
        ``dotfinder.HiCCUPS_W1_MAX_INDX`` and 50.
    fdr : float
        forwarded to ``dotfinder.determine_thresholds``.
    dots_clustering_radius : int
        forwarded to ``dotfinder.clustering_step``.
    verbose : bool
        enables progress messages; also forwarded to the dotfinder steps.
    out_prefix : str
        prefix of the output files; the post-processed calls are written to
        ``<out_prefix>.postproc.bedpe`` and the ``<out_prefix>.enriched.tsv``
        path is passed to ``dotfinder.scoring_and_extraction_step``.

    Raises
    ------
    ValueError
        if the expected table does not match the schema, if regions cannot be
        interpreted, if the regions of the expected table are not a subset of
        the regions table, or if region sizes differ between the expected
        table and the cooler.
    """
    clr = cooler.Cooler(cool_path)

    # preliminary SCHEMA for cis-expected
    region_column_name = "region"
    expected_columns = [region_column_name, "diag", "n_valid", expected_name]
    # builtin `str` is used because the `np.str` alias was removed from NumPy
    expected_dtypes = {
        region_column_name: str,
        "diag": np.int64,
        "n_valid": np.int64,
        expected_name: np.float64,
    }

    try:
        expected = pd.read_table(
            expected_path,
            usecols=expected_columns,
            dtype=expected_dtypes,
            comment=None,
            verbose=verbose,
        )
    except ValueError as e:
        raise ValueError(
            "input expected does not match the schema\n"
            "tab-separated expected file must have a header as wel") from e
    expected_index = [
        region_column_name,
        "diag",
    ]
    expected.set_index(expected_index, inplace=True)
    # end of SCHEMA for cis-expected

    # Optional reading region table provided by the user:
    if regions is None:
        try:
            uniq_regions = expected.index.get_level_values(
                region_column_name).unique()
            regions_table = bioframe.parse_regions(uniq_regions, clr.chromsizes)
            regions_table["name"] = regions_table["chrom"]
        except ValueError as e:
            print(e)
            raise ValueError(
                "Cannot interpret regions from EXPECTED_PATH\n"
                "specify regions definitions using --regions option.") from e
    else:
        # Flexible reading of the regions table:
        regions_buf, _ = util.sniff_for_header(regions)
        regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions_table.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns."
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        # `rename` ignores mapping keys that are absent, so one mapping
        # covers both the three- and the four-column layout.
        regions_table = regions_table.rename(
            columns={0: "chrom", 1: "start", 2: "end", 3: "name"})
        regions_table = bioframe.parse_regions(regions_table)
        regions_table = regions_table[
            regions_table["chrom"].isin(clr.chromnames)
        ].reset_index(drop=True)

    # Verify appropriate columns order (required for heatmap_tiles_generator_diag):
    regions_table = regions_table[["chrom", "start", "end", "name"]]

    # Input validation
    expected_regions = expected.index.get_level_values(
        region_column_name).unique()

    # unique list of regions mentioned in expected_path
    # are also in regions table
    if not set(expected_regions).issubset(regions_table["name"]):
        region_source = ("cooler" if regions is None else
                         "regions table " + str(regions))
        raise ValueError(
            "Regions in {} must be subset of ".format(expected_path) +
            f"regions in {region_source}")

    # check number of bins per region in cooler and expected table
    # compute # of bins by comparing matching indexes
    regions_by_name = regions_table.set_index("name")
    for region_name, group in expected.reset_index().groupby(
            region_column_name):
        n_diags = group.shape[0]
        region = regions_by_name.loc[region_name]
        lo, hi = clr.extent(region)
        # explicit check instead of `assert`, which is stripped under -O
        if n_diags != (hi - lo):
            raise ValueError(
                "Region shape mismatch between expected and cooler. "
                "Are they using the same resolution?")

    # All the checks have passed:
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))

    # by now we have a usable region_table and expected for most scenarios

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with bases-units for now, so supress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print(
            f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}"
        )
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        assert w > p, f"Wrong inner/outer kernel parameters w={w}, p={p}"
        print(f"Using kernel parameters w={w}, p={p} provided by user")

    # once kernel parameters are setup check max_nans_tolerated
    # to make sure kernel footprints overlaping 1 side with the
    # NaNs filled row/column are not "allowed"
    # this requires dynamic adjustment for the "shrinking donut"
    assert max_nans_tolerated <= 2 * w, "Too many NaNs allowed!"
    # may lead to scoring the same pixel twice, - i.e. duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(clr, regions_table, w,
                                               tile_size_bins,
                                               loci_separation_bins))

    # lambda-chunking edges ...
    assert dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50
    base = 2**(1 / 3)
    # builtin `float` is used because the `np.float` alias was removed from NumPy
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=float,
        ),
        [np.inf],
    ))

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix),
                op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        print(
            f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using annotated DataFrame of filtered pixels
    # why ? - because - clustering has to be done independently for every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated,
                                               regions_table)
    # consider reseting index here
    centroids = dotfinder.clustering_step(filtered_pixels_annotated,
                                          expected_regions,
                                          dots_clustering_radius, verbose)

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final-postprocessed result
    if out_prefix is not None:
        postprocessed_fname = op.join(
            op.dirname(out_prefix),
            op.basename(out_prefix) + ".postproc.bedpe")

        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)