def test_diag_trim(matrix):
    """Trimmed matrices keep their shape and diagonals past the cutoff sum to zero."""
    n_rows = matrix.shape[0]
    for cutoff in range(n_rows):
        trimmed = preproc.diag_trim(matrix.tocsr(), cutoff)
        # Total signal held by each upper diagonal, indexed by its offset
        per_diag_totals = [
            trimmed.diagonal(offset).sum()
            for offset in range(trimmed.shape[0])
        ]
        assert trimmed.shape == matrix.shape
        # Nothing may remain on the diagonals strictly beyond the cutoff
        assert np.sum(per_diag_totals[cutoff + 1:]) == 0
def test_make_missing_mask(self):
    """Compare generated missing-bin masks with hand-built ones for each matrix type."""
    missing_bins = np.array([0, 4, 9])
    valid_bins = np.array([b for b in range(10) if b not in missing_bins])
    valid_cols = np.array([b for b in range(15) if b not in missing_bins])
    max_dist = 3

    def full_mask(shape):
        """Boolean mask of the given shape with all missing rows and columns set."""
        mask = np.zeros(shape, dtype=bool)
        mask[:, missing_bins] = True
        mask[missing_bins, :] = True
        return mask

    # Expected masks: whole symmetric matrix, whole asymmetric matrix,
    # upper triangle only, and upper triangle truncated at max_dist
    exp_mask_sym = full_mask((10, 10))
    exp_mask_asym = full_mask((10, 15))
    exp_mask_sym_upper = np.triu(exp_mask_sym)
    exp_mask_sym_upper_maxdist = preproc.diag_trim(
        exp_mask_sym_upper, max_dist + 1
    )

    # The right bins are masked over the whole symmetric matrix
    obs_mask_sym = preproc.make_missing_mask(
        exp_mask_sym.shape, valid_bins, valid_bins, sym_upper=False
    )
    assert np.all(obs_mask_sym == exp_mask_sym)

    # Only the upper triangle is masked when sym_upper is requested
    obs_mask_sym_upper = preproc.make_missing_mask(
        exp_mask_sym.shape, valid_bins, valid_bins, sym_upper=True
    )
    assert np.all(obs_mask_sym_upper == exp_mask_sym_upper)

    # Asymmetric matrices are masked on both axes
    obs_mask_asym = preproc.make_missing_mask(
        exp_mask_asym.shape, valid_bins, valid_cols
    )
    assert np.all(obs_mask_asym == exp_mask_asym)

    # An asymmetric shape combined with sym_upper must be rejected
    with self.assertRaises(ValueError):
        preproc.make_missing_mask(
            obs_mask_asym.shape, valid_bins, valid_bins, sym_upper=True
        )

    # max_dist must give the same mask as trimming the diagonals by hand
    obs_mask_sym_upper_maxdist = preproc.make_missing_mask(
        exp_mask_sym.shape,
        valid_bins,
        valid_bins,
        sym_upper=True,
        max_dist=max_dist,
    )
    assert np.all(obs_mask_sym_upper_maxdist == exp_mask_sym_upper_maxdist)
def pattern_detector(
    contact_map,
    kernel_config,
    kernel_matrix,
    coords=None,
    dump=None,
    full=False,
    tsvd=None,
):
    """
    Detect patterns in a contact map by kernel matching, and extract windows
    around the detected patterns. If coordinates are provided, detection is
    skipped and windows are extracted around those coordinates.

    Parameters
    ----------
    contact_map : ContactMap object
        An object containing an inter- or intra-chromosomal Hi-C contact map
        and additional metadata.
    kernel_config : dict
        The kernel configuration, as documented in
        chromosight.utils.io.load_kernel_config
    kernel_matrix : numpy.array
        The kernel matrix to use for convolution as a 2D numpy array
    coords : numpy.array of ints or None
        A table with coordinates of patterns, with one pattern per row and 2
        columns being the row and column number of the pattern in the input
        contact map. If this is provided, detection is skipped and
        quantification is performed on those coordinates. The input array is
        left untouched: the function works on a private copy.
    dump : str or None
        Folder in which dumps should be generated after each step of the
        detection process. If None, no dump is generated
    full : bool
        If True, a missing-bin mask is built and handed to the convolution,
        and the contact and correlation maps are zero-padded by the kernel
        margins before windows are extracted.
    tsvd : float or None
        If a float between 0 and 1 is given, the input kernel is factorised
        using truncated SVD, keeping enough singular vectors to retain this
        proportion of information. Factorisation speeds up convolution at
        the cost of a loss of information. Disabled by default (None). The
        value is forwarded unchanged to normxcorr2.

    Returns
    -------
    filtered_chrom_patterns : pandas.DataFrame or None
        The table of patterns returned by validate_patterns (it exposes bin1
        and bin2), with an added "pvalue" column. None if the contact map is
        not larger than the kernel, or if no focus was detected.
    chrom_pattern_windows : numpy array or None
        The windows around the patterns, as returned by validate_patterns.
        None in the same cases as above.
    """
    km, kn = kernel_matrix.shape
    kh, kw = (km - 1) // 2, (kn - 1) // 2

    def save_dump(base, mat):
        """Save a sparse matrix in the dump folder, prefixed by the map name."""
        sp.save_npz(
            pathlib.Path(dump) / f"{contact_map.name}_{base}", mat
        )

    # Define type of analysis.
    run_mode = "detect" if coords is None else "quantify"

    # Do not attempt pattern detection unless matrix is larger than the kernel
    if min(contact_map.matrix.shape) <= max(kernel_matrix.shape):
        return None, None

    if coords is not None:
        # Coordinates are shifted and edited in place further down: operate
        # on a copy so that the caller's array is never modified.
        coords = np.array(coords)

    # If full is specified, missing bins are accounted for using a mask
    if full:
        missing_mask = preproc.make_missing_mask(
            contact_map.matrix.shape,
            valid_rows=contact_map.detectable_bins[0],
            valid_cols=contact_map.detectable_bins[1],
            max_dist=contact_map.max_dist,
            sym_upper=not contact_map.inter,
        )
    else:
        missing_mask = None

    # Pattern matching operates here
    mat_conv, mat_log10_pvals = normxcorr2(
        contact_map.matrix.tocsr(),
        kernel_matrix,
        max_dist=contact_map.max_dist,
        sym_upper=not contact_map.inter,
        full=full,
        missing_mask=missing_mask,
        tsvd=tsvd,
        pval=True,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
    )
    if dump:
        save_dump("03_normxcorr2", mat_conv)

    # Clean potential missing values
    mat_conv.data[np.isnan(mat_conv.data)] = 0

    # Only keep corrcoefs in scannable range
    if not contact_map.inter:
        mat_conv = preproc.diag_trim(mat_conv.tocsr(), contact_map.max_dist)
        if dump:
            save_dump("04_diag_trim", mat_conv)
    mat_conv = mat_conv.tocoo()
    mat_conv.eliminate_zeros()

    # Only attempt detection if no input coordinates were given
    if run_mode == "detect":
        # Find foci of highly correlated pixels and pick local maxima
        coords, foci_mat = pick_foci(mat_conv, kernel_config["pearson"])
        # If nothing was detected, no point in resuming
        if coords is None:
            return None, None
        if dump:
            save_dump("05_foci", foci_mat)

    mat = contact_map.matrix.copy()
    det = [d.copy() for d in contact_map.detectable_bins]

    # Zero pad contact and convolution maps and shift missing bins and detected
    # pattern coords before validation if in full mode
    if full:
        mat = mat.tocoo()
        mat = preproc.zero_pad_sparse(mat, kh, kw, fmt="csr")
        mat_conv = preproc.zero_pad_sparse(mat_conv, kh, kw, fmt="csr")
        det[0] += kh
        det[1] += kw
        coords[:, 0] += kh
        coords[:, 1] += kw

    if not contact_map.inter:
        # Set the first max(km, kn) diagonals of the lower triangle to NaN
        # so that pileups do not count them
        big_k = max(km, kn)
        mat = mat.tocsr()
        mat += sp.diags(
            np.full(big_k, np.nan),
            -np.arange(1, big_k + 1),
            shape=mat.shape,
            format="csr",
        )

    # When detecting 1D pattern, enforce coordinates on diagonal
    # coordinates can be shifted by 1 since we keep the two first
    # diagonals to allow formation of foci via 4-way adjacency
    if kernel_config["max_dist"] == 0:
        coords[:, 0] = coords[:, 1]

    # Extract windows around coordinates and assign a correlation
    # to each pattern. In detection mode, we drop invalid patterns
    # in quantification mode, all input patterns are returned.
    filtered_coords, filtered_windows = validate_patterns(
        coords,
        mat,
        mat_conv.tocsr(),
        det,
        kernel_matrix,
        zero_tol=kernel_config["max_perc_zero"] / 100,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
        drop=run_mode == "detect",
    )

    # Shift coordinates of detected patterns back if padding was added
    if full:
        filtered_coords.bin1 -= kh
        filtered_coords.bin2 -= kw

    try:
        filtered_coords["pvalue"] = mat_log10_pvals[
            filtered_coords.bin1, filtered_coords.bin2
        ].A1
    # No coordinate passed the validation filters
    except AttributeError:
        filtered_coords["pvalue"] = None

    # Remove the log10 transform from the p-values
    filtered_coords["pvalue"] = 10 ** filtered_coords["pvalue"]
    return filtered_coords, filtered_windows
def _corrcoef2d_dense(
    signal, kernel, max_dist=None, sym_upper=False, scaling="pearson"
):
    """Implementation of signal-kernel 2D correlation for dense matrices

    Correlation coefficient between the signal and a sliding kernel. The
    signal is cross-correlated with the kernel and the result is normalised
    according to the requested scaling. Non-finite and negative coefficients
    are set to 0 in the output.

    Parameters
    ----------
    signal : numpy.array
        The input processed Hi-C matrix.
    kernel : numpy.array
        The pattern kernel to use for convolution.
    max_dist : int
        Maximum scan distance, in number of bins from the diagonal. If None,
        the whole matrix is convoluted. Otherwise, diagonals beyond this
        distance are trimmed from the result. Only applied when sym_upper is
        True (intrachromosomal matrices).
    sym_upper : False
        Whether the matrix is symmetric and upper triangle. True for
        intrachromosomal matrices.
    scaling : str
        Which metric to use when computing correlation coefficients. Either
        'pearson' for Pearson correlation, or 'cross' for cross correlation.

    Returns
    -------
    numpy.array
        The dense matrix of correlation coefficients

    Raises
    ------
    ValueError
        If scaling is neither 'pearson' nor 'cross', or if scaling is
        'pearson' and the kernel is flat (standard deviation not above 0).
    """
    # Fail early with a clear message instead of an unbound variable later on
    if scaling not in ("pearson", "cross"):
        raise ValueError(
            f"Unknown scaling: {scaling!r}. Use 'pearson' or 'cross'."
        )

    # Convert numpy matrices to array to avoid operator overloading
    if isinstance(signal, np.matrix):
        signal = np.array(signal)
    if isinstance(kernel, np.matrix):
        kernel = np.array(kernel)

    # If using only the upper triangle matrix, set diagonals that will
    # overlap the kernel in the lower triangle to their opposite diagonal
    # in the upper triangle
    if sym_upper:
        # Full matrix is stored for dense arrays anyway
        # -> make symmetric
        sys.stderr.write("Making dense matrix symmetric.\n")
        signal = signal + np.transpose(signal) - np.diag(np.diag(signal))

    kernel_size = kernel.shape[0] * kernel.shape[1]

    if scaling == "cross":
        # Compute convolution product
        conv = xcorr2(signal, kernel)
        # Generate constant kernel
        kernel1 = np.ones(kernel.shape)
        # Convolute squared signal with constant kernel
        signal2 = xcorr2(signal ** 2, kernel1)
        kernel2 = float(np.sum(kernel ** 2))
        denom = signal2 * kernel2
        denom = np.sqrt(denom)
    else:
        # scaling == "pearson"
        mean_kernel = float(kernel.mean())
        std_kernel = float(kernel.std())
        # Written as "not (> 0)" so that a NaN standard deviation is rejected
        if not (std_kernel > 0):
            raise ValueError(
                "Cannot have scaling=pearson when kernel "
                "is flat. Use scaling=cross."
            )
        kernel1 = np.ones(kernel.shape)
        # Local mean and standard deviation of the signal under the kernel
        mean_signal = xcorr2(signal, kernel1 / kernel_size)
        std_signal = (
            xcorr2(signal ** 2, kernel1 / kernel_size) - mean_signal ** 2
        )
        std_signal = np.sqrt(std_signal)
        conv = xcorr2(signal, kernel / kernel_size) - mean_signal * mean_kernel
        denom = std_signal * std_kernel

    # Divisions by zero yield non-finite values which are zeroed below
    conv /= denom

    if (max_dist is not None) and sym_upper:
        # Trim diagonals further than max_scan_distance
        conv = preproc.diag_trim(conv, max_dist)

    if sym_upper:
        conv = np.triu(conv)
    conv[~np.isfinite(conv)] = 0.0
    conv[conv < 0] = 0.0
    return conv
def _corrcoef2d_sparse(
    signal, kernel, max_dist=None, sym_upper=False, scaling="pearson"
):
    """Implementation of signal-kernel 2D correlation for sparse matrices

    Correlation coefficient between the signal and a sliding kernel. The
    signal is cross-correlated with the kernel and the result is normalised
    according to the requested scaling. Non-finite and negative coefficients
    are removed from the output.

    Parameters
    ----------
    signal : scipy.sparse.csr_matrix
        The input processed Hi-C matrix.
    kernel : numpy.array
        The pattern kernel to use for convolution.
    max_dist : int
        Maximum scan distance, in number of bins from the diagonal. If None,
        the whole matrix is convoluted. Otherwise, diagonals beyond this
        distance are trimmed from the result. Only applied when sym_upper is
        True (intrachromosomal matrices).
    sym_upper : False
        Whether the matrix is symmetric and upper triangle. True for
        intrachromosomal matrices.
    scaling : str
        Which metric to use when computing correlation coefficients. Either
        'pearson' for Pearson correlation, or 'cross' for cross correlation.

    Returns
    -------
    scipy.sparse.csr_matrix
        The sparse matrix of correlation coefficients

    Raises
    ------
    ValueError
        If scaling is neither 'pearson' nor 'cross', or if scaling is
        'pearson' and the kernel is flat (standard deviation not above 0).
    """
    # Fail early with a clear message instead of an unbound variable later on
    if scaling not in ("pearson", "cross"):
        raise ValueError(
            f"Unknown scaling: {scaling!r}. Use 'pearson' or 'cross'."
        )

    # If using only the upper triangle matrix, set diagonals that will
    # overlap the kernel in the lower triangle to their opposite diagonal
    # in the upper triangle
    if sym_upper:
        signal = signal.tolil()
        for i in range(1, kernel.shape[0]):
            signal.setdiag(signal.diagonal(i), -i)
        signal = signal.tocsr()

    kernel_size = kernel.shape[0] * kernel.shape[1]

    if scaling == "cross":
        # Compute convolution product
        conv = xcorr2(signal, kernel)
        # Generate constant kernel
        kernel1 = np.ones(kernel.shape)
        # Convolute squared signal with constant kernel
        signal2 = xcorr2(signal.power(2), kernel1)
        kernel2 = float(np.sum(np.power(kernel, 2)))
        denom = signal2 * kernel2
        denom = denom.sqrt()
    else:
        # scaling == "pearson"
        mean_kernel = float(kernel.mean())
        std_kernel = float(kernel.std())
        # Written as "not (> 0)" so that a NaN standard deviation is rejected
        if not (std_kernel > 0):
            raise ValueError(
                "Cannot have scaling=pearson when kernel "
                "is flat. Use scaling=cross."
            )
        kernel1 = np.ones(kernel.shape)
        # Local mean and standard deviation of the signal under the kernel
        mean_signal = xcorr2(signal, kernel1 / kernel_size)
        std_signal = xcorr2(
            signal.power(2), kernel1 / kernel_size
        ) - mean_signal.power(2)
        std_signal = std_signal.sqrt()
        conv = xcorr2(signal, kernel / kernel_size) - mean_signal * mean_kernel
        denom = std_signal * std_kernel

    # Since elementwise sparse matrices division is not implemented, compute
    # numerator and denominator and perform division on the 1D array of nonzero
    # values.
    # Get coords of non-zero (nz) values in the numerator
    nz_vals = conv.nonzero()
    # Divide them by corresponding entries in the denominator
    # NOTE(review): this assumes conv.data holds no explicitly stored zeros,
    # so that it lines up with nz_vals - confirm against xcorr2's output.
    denom = denom.tocsr()
    try:
        conv.data /= denom[nz_vals].A1
    # Case there are no nonzero corrcoef
    except AttributeError:
        pass

    if (max_dist is not None) and sym_upper:
        # Trim diagonals further than max_scan_distance
        conv = preproc.diag_trim(conv.todia(), max_dist)

    if sym_upper:
        conv = sp.triu(conv)
    conv = conv.tocoo()
    conv.data[~np.isfinite(conv.data)] = 0.0
    conv.data[conv.data < 0] = 0.0
    conv.eliminate_zeros()
    conv = conv.tocsr()
    return conv
def pattern_detector(contact_map, kernel_config, kernel_matrix, dump=None):
    """Pattern detector

    Detect patterns by iterated kernel matching, and extract windows around the
    detected patterns.

    Parameters
    ----------
    contact_map : ContactMap object
        An object containing an inter- or intra-chromosomal Hi-C contact map
        and additional metadata.
    kernel_config : dict
        The kernel configuration, as documented in
        chromosight.utils.io.load_kernel_config
    kernel_matrix : numpy.array
        The kernel matrix to use for convolution as a 2D numpy array
    dump : str or None
        Folder in which dumps should be generated after each step of the
        detection process. If None, no dump is generated

    Returns
    -------
    filtered_chrom_patterns : numpy.array or None
        A 2D array of detected patterns with 3 columns: x, y, score.
        None if the contact map is not larger than the kernel, or if no
        pattern coordinate was picked.
    chrom_pattern_windows : numpy array or None
        A 3D array containing the pile of windows around detected patterns.
        None in the same cases as above.
    """

    def save_dump(base, mat):
        """Save a sparse matrix in the dump folder, prefixed by the map name."""
        sp.save_npz(pathlib.Path(dump) / f"{contact_map.name}_{base}", mat)

    # Do not attempt pattern detection unless matrix is larger than the kernel
    if min(contact_map.matrix.shape) <= max(kernel_matrix.shape):
        return None, None

    # Pattern matching operates here
    # NOTE(review): an earlier comment mentioned falling back to a dense
    # implementation for inter matrices; no such switch is visible in this
    # function - presumably handled inside corrcoef2d, to be confirmed.
    mat_conv = corrcoef2d(
        contact_map.matrix,
        kernel_matrix,
        max_dist=kernel_config["max_dist"],
        sym_upper=not contact_map.inter,
    )
    if dump:
        save_dump("03_corrcoef2d", mat_conv)

    mat_conv = mat_conv.tocoo()
    # Clean potential missing values
    mat_conv.data[np.isnan(mat_conv.data)] = 0

    # Only keep corrcoefs in scannable range. Diagonals are only trimmed for
    # intra matrices (makes no sense for inter)
    if not contact_map.inter:
        mat_conv = preproc.diag_trim(mat_conv.todia(), contact_map.max_dist)
        if dump:
            save_dump("04_diag_trim", mat_conv)
    mat_conv = mat_conv.tocoo()
    mat_conv.eliminate_zeros()

    # Find foci of highly correlated pixels
    chrom_pattern_coords, foci_mat = picker(
        mat_conv, kernel_config["precision"]
    )
    # Nothing was picked: there is nothing to validate
    if chrom_pattern_coords is None:
        return None, None
    if dump:
        save_dump("05_foci", foci_mat)

    filtered_chrom_patterns, chrom_pattern_windows = validate_patterns(
        chrom_pattern_coords,
        contact_map.matrix,
        mat_conv.tocsr(),
        contact_map.detectable_bins,
        kernel_matrix,
        kernel_config["max_perc_undetected"],
    )
    return filtered_chrom_patterns, chrom_pattern_windows
def detection_matrix(
    samples: pd.DataFrame,
    kernel: np.ndarray,
    region: Optional[str] = None,
    subsample: Optional[int] = None,
    max_dist: Optional[int] = None,
    pearson_thresh: Optional[float] = None,
    density_thresh: Optional[float] = None,
    snr_thresh: Optional[float] = 1.0,
    n_cpus: int = 4,
) -> Tuple[Optional[sp.csr_matrix], Optional[sp.csr_matrix]]:
    """
    Run the detection process for a single chromosome or region. This is
    abstracted from all notions of chromosomes and genomic coordinates.

    Note that a "mat" column is written into the input `samples` table and
    overwritten at each processing step.

    Parameters
    ----------
    samples : pandas.DataFrame
        Table of samples. The "cool" column is read to load the matrices and
        the "cond" column to pick the control condition (first row).
    kernel : numpy.ndarray
        The 2D kernel matrix used to compute the correlation maps.
    region : str or None
        Region passed to the matrix fetching and preprocessing steps.
    subsample : int or None
        If truthy, the lowest number of contacts across samples is computed
        and handed to the preprocessing step.
    max_dist : int or None
        Maximum distance from the diagonal. If given, input matrices are
        trimmed at max_dist plus the largest kernel dimension, and the output
        is trimmed at max_dist + 2.
    pearson_thresh : float or None
        If given, pixels whose correlation is below this value in all samples
        are set to 0.
    density_thresh : float or None
        If given and above 0, a density filter is computed from all samples
        and multiplied with the output.
    snr_thresh : float or None
        Threshold forwarded to the median background subtraction.
    n_cpus : int
        If above 1, samples are processed through a multiprocessing pool of
        this size; otherwise they are processed serially.

    Returns
    -------
    diff, snr :
        The two values returned by the median background subtraction, with
        the density filter and distance trimming applied to diff. Both are
        None if the first matrix is not larger than the kernel.
    """
    # We consider the matrix is symmetric upper (i.e. intrachromosomal)
    sym_upper = True
    # Diagonals will be trimmed at max_dist with a margin for convolution
    if max_dist is None:
        trim_dist = None
    else:
        mat_size = samples.cool[0].matrix(sparse=True).fetch(region).shape[0]
        trim_dist = min(mat_size, max_dist + max(kernel.shape))
    # Compute number of contacts in the matrix with the lowest coverage
    if subsample:
        min_contacts = get_min_contacts(samples.cool, region=region)
    else:
        min_contacts = None
    # Define the condition of the first sample as the baseline condition
    control = samples.cond.values[0]

    # Samples are processed in parallel if requested
    pool = mp.Pool(n_cpus) if n_cpus > 1 else None
    if pool is not None:
        map_fun = pool.starmap
    else:

        def map_fun(func, arg_tuples):
            """Serial stand-in for Pool.starmap."""
            return [func(*args) for args in arg_tuples]

    # The pool is released in the finally clause so that neither the early
    # return nor an exception leaves worker processes behind.
    try:
        # Hi-C specific preprocessing of individual matrices
        # (subsample, balance, detrend)
        samples["mat"] = map_fun(
            preprocess_hic,
            zip(samples.cool, it.repeat(min_contacts), it.repeat(region)),
        )
        print(f"{region} preprocessed", file=sys.stderr)
        # Return nothing if the matrix is smaller than kernel
        if np.any(
            np.array(samples["mat"][0].shape) <= np.array(kernel.shape)
        ):
            return None, None
        # Retrieve the indices of bins which are valid in all samples (not
        # missing because of repeated sequences or low coverage)
        common_bins = pap.get_common_valid_bins(samples["mat"])
        # Trim diagonals beyond max_dist (with kernel margin for the
        # convolution) to spare resources
        if trim_dist is not None:
            samples["mat"] = map_fun(
                cup.diag_trim, zip(samples["mat"], it.repeat(trim_dist))
            )
        # Generate a missing mask from these bins
        missing_mask = cup.make_missing_mask(
            samples["mat"][0].shape,
            common_bins,
            common_bins,
            max_dist=trim_dist,
            sym_upper=sym_upper,
        )
        # Remove all missing values from each sample's matrix
        samples["mat"] = map_fun(
            cup.erase_missing,
            zip(
                map(sp.triu, samples["mat"]),
                it.repeat(common_bins),
                it.repeat(common_bins),
                it.repeat(sym_upper),
            ),
        )
        print(f"{region} missing bins erased", file=sys.stderr)
        # Compute a density filter: regions with sufficient proportion of
        # nonzero pixels in kernel windows, in all samples. It is applied to
        # the output further down.
        if (density_thresh is not None) and (density_thresh > 0):
            density_filter = make_density_filter(
                samples["mat"],
                density_thresh=density_thresh,
                win_size=kernel.shape[0],
                sym_upper=sym_upper,
            )
        # Generate correlation maps for all samples using chromosight's
        # algorithm. Arguments are passed positionally - presumably
        # (signal, kernel, max_dist, sym_upper, full, missing_mask,
        # missing_tol, tsvd, pval); verify against cud.normxcorr2.
        corrs = map_fun(
            cud.normxcorr2,
            zip(
                samples.mat.values,
                it.repeat(kernel),
                it.repeat(max_dist),
                it.repeat(True),
                it.repeat(True),
                it.repeat(missing_mask),
                it.repeat(0.75),
                it.repeat(None),
                it.repeat(False),
            ),
        )
        # Only the first element of each result is kept
        samples["mat"] = [tup[0] for tup in corrs]
        del corrs
        print(f"{region} correlation matrices computed", file=sys.stderr)
        # Get the union of nonzero coordinates across all samples
        total_nnz_set = pap.get_nnz_union(samples["mat"])
        # Fill zeros at these coordinates
        samples["mat"] = samples["mat"].apply(
            lambda cor: pap.fill_nnz(cor, total_nnz_set)
        )
        # Erase pixels where all samples are below pearson threshold
        if pearson_thresh is not None:
            pearson_fail = [
                (m.data < pearson_thresh).astype(bool) for m in samples["mat"]
            ]
            pearson_fail = np.bitwise_and.reduce(pearson_fail)
            # Threshold maps using pearson correlations to reduce noisy
            # detections. Matrices are edited in place, so nothing needs to
            # be written back into the table.
            for m in samples["mat"]:
                m.data[pearson_fail] = 0.0
        # Use median background
        diff, snr = _median_bg_subtraction(samples, control, snr_thresh)
        # Erase pixels which do not pass the density filter in all samples
        if (density_thresh is not None) and (density_thresh > 0):
            diff = diff.multiply(density_filter)
        # Remove all values beyond user-specified max_dist
        if max_dist is not None:
            diff = cup.diag_trim(diff, max_dist + 2)
        return diff, snr
    finally:
        if pool is not None:
            pool.close()
            pool.join()