def test_zero_pad_sparse():
    """
    Zero padding must grow each axis by twice its margin and leave the
    original matrix untouched in the center of the padded one.
    """
    mat = sp.coo_matrix(np.ones((10, 10)))
    margins = range(4)
    for hpad in margins:
        for vpad in margins:
            padded = preproc.zero_pad_sparse(
                mat, margin_h=hpad, margin_v=vpad
            )
            n_rows, n_cols = padded.shape
            # The vertical margin is added to rows, the horizontal to columns
            assert n_rows == mat.shape[0] + 2 * vpad
            assert n_cols == mat.shape[1] + 2 * hpad
            # Strip the margins: what remains must be the input itself
            center = padded.toarray()[
                vpad : n_rows - vpad, hpad : n_cols - hpad
            ]
            assert np.all(mat.toarray() == center)
def _xcorr2_sparse(signal, kernel, threshold=1e-6):
    """
    Cross correlate a sparse 2D signal with a dense 2D kernel.

    Parameters
    ----------
    signal: scipy.sparse.csr_matrix
        A 2-dimensional sparse matrix Ms x Ns acting as the detrended Hi-C map.
    kernel: numpy.array of floats or tuple of numpy.arrays
        A 2-dimensional numpy array Mk x Nk acting as the pattern template.
        Can also be a factorised kernel, given as a tuple (left, right) of
        2D arrays of shapes Mk x F and F x Nk.
    threshold : float
        Convolution score whose absolute value is below this threshold will
        be set back to zero to save on time and memory.

    Returns
    -------
    out: scipy.sparse.csr_matrix
        Convolution product of signal by kernel. The (Ms - Mk + 1) x
        (Ns - Nk + 1) product is zero-padded with margins of (Mk - 1) // 2
        rows and (Nk - 1) // 2 columns on each side.

    Raises
    ------
    ValueError
        If the signal is dense, if a non-factorised kernel is sparse, or if
        the factorised kernel is empty or has mismatching inner dimensions.
    """
    # Validate the signal first so that the factorised and plain kernel paths
    # both reject dense input with the same explicit error.
    if not sp.issparse(signal):
        raise ValueError("cannot handle signal in dense format")
    sm, sn = signal.shape

    if isinstance(kernel, tuple):
        kernel_l, kernel_r = kernel
        km = kernel_l.shape[0]
        kn = kernel_r.shape[1]
        if kernel_l.shape[1] != kernel_r.shape[0]:
            raise ValueError("Kernel factorisation is invalid")
        n_factors = kernel_l.shape[1]
        if n_factors == 0:
            # Without any factor there is nothing to accumulate
            raise ValueError("Kernel factorisation is empty")
        # Each factor is a pair of 1D filters: one applied along rows (left
        # product) and one along columns (right product). Their contributions
        # are summed.
        for f in range(n_factors):
            subkernel_l = sp.diags(
                kernel_l[:, f],
                np.arange(km),
                shape=(sm - km + 1, sm),
                format="dia",
            )
            subkernel_r = sp.diags(
                kernel_r[f, :],
                -np.arange(kn),
                shape=(sn, sn - kn + 1),
                format="dia",
            )
            if f == 0:
                out = (subkernel_l @ signal) @ subkernel_r
            else:
                out += (subkernel_l @ signal) @ subkernel_r
    else:
        km, kn = kernel.shape
        # Sanity check
        if sp.issparse(kernel):
            raise ValueError("cannot handle kernel in sparse format")

        # Check if kernel is constant (uniform) with a finite value
        first_value = kernel[0, 0]
        is_constant = np.allclose(
            kernel, first_value, rtol=1e-08
        ) and np.isfinite(first_value)

        if is_constant:
            # Simplified convolution for the special case where kernel is
            # constant: a single pair of banded matrices sums the window.
            l_subkernel_sp = sp.diags(
                first_value * np.ones(km),
                np.arange(km),
                shape=(sm - km + 1, sm),
                format="dia",
            )
            r_subkernel_sp = sp.diags(
                np.ones(kn),
                -np.arange(kn),
                shape=(sn, sn - kn + 1),
                format="dia",
            )
            out = (l_subkernel_sp @ signal) @ r_subkernel_sp
        else:
            # Convolution code for the general case:
            # 1. The 2D kernel is seen as a set of 1D filters (its rows or
            #    its columns).
            # 2. The input remains unchanged, and each 1D filter is unrolled
            #    into a sparse banded (toeplitz) matrix.
            # 3. Each 1D convolution is computed as a sparse matrix product
            #    and accumulated into the output.
            out = sp.csc_matrix((sm - km + 1, sn - kn + 1), dtype=np.float64)
            # In case the kernel is rectangle, it is faster to scan
            # the largest dimension first
            if kn < km:
                for kj in range(kn):
                    subkernel_sp = sp.diags(
                        kernel[:, kj],
                        np.arange(km),
                        shape=(sm - km + 1, sm),
                        format="csr",
                    )
                    out += subkernel_sp.dot(signal[:, kj : sn - kn + 1 + kj])
            else:
                for ki in range(km):
                    subkernel_sp = sp.diags(
                        np.array(kernel[ki, :]).flatten(),
                        np.arange(kn),
                        shape=(sn - kn + 1, sn),
                        format="csr",
                    )
                    out += signal[ki : sm - km + 1 + ki, :].dot(subkernel_sp.T)

    # Set very low pixels to 0
    out.data[np.abs(out.data) < threshold] = 0
    out.eliminate_zeros()

    # Pad the product with zeros to compensate for the kernel margins
    out = preproc.zero_pad_sparse(
        out, margin_h=(kn - 1) // 2, margin_v=(km - 1) // 2, fmt="csr"
    )
    return out
def pattern_detector(
    contact_map,
    kernel_config,
    kernel_matrix,
    coords=None,
    dump=None,
    full=False,
    tsvd=None,
):
    """
    Detect patterns in a contact map by kernel matching, and extract windows
    around the detected patterns. If coordinates are provided, detection is
    skipped and windows are extracted around those coordinates.

    Parameters
    ----------
    contact_map : ContactMap object
        An object containing an inter- or intra-chromosomal Hi-C contact map
        and additional metadata.
    kernel_config : dict
        The kernel configuration, as documented in
        chromosight.utils.io.load_kernel_config
    kernel_matrix : numpy.array
        The kernel matrix to use for convolution as a 2D numpy array
    coords : numpy.array of ints or None
        A table with coordinates of patterns, with one pattern per row and 2
        columns being the row and column number of the pattern in the input
        contact map. If this is provided, detection is skipped and
        quantification is performed on those coordinates. The input array is
        left unmodified.
    dump : str or None
        Folder in which dumps should be generated after each step of the
        detection process. If None, no dump is generated
    full : bool
        If True, missing bins are accounted for using a mask during
        convolution, and the contact and convolution maps are zero-padded
        by half the kernel size before patterns are validated.
    tsvd : float or None
        If a float between 0 and 1 is given, the input kernel is factorised
        using truncated SVD, keeping enough singular vectors to retain this
        proportion of information. Factorisation speeds up convolution at
        the cost of a loss of information. Disabled by default (None).

    Returns
    -------
    filtered_chrom_patterns : pandas.DataFrame or None
        The table of patterns returned by validate_patterns (with columns
        including bin1 and bin2), with an additional pvalue column. None if
        the contact map is not larger than the kernel, or if no pattern was
        detected.
    chrom_pattern_windows : numpy array or None
        A 3D array containing the pile of windows around detected patterns.
        None under the same conditions as above.
    """
    km, kn = kernel_matrix.shape
    # Half height and half width of the kernel: margins lost by convolution
    kh, kw = (km - 1) // 2, (kn - 1) // 2

    def save_dump(base, mat):
        """Save a sparse matrix in the dump folder, prefixed by map name."""
        sp.save_npz(
            pathlib.Path(dump) / f"{contact_map.name}_{base}", mat
        )

    # Define type of analysis.
    run_mode = "detect" if coords is None else "quantify"

    # Do not attempt pattern detection unless matrix is larger than the kernel
    if min(contact_map.matrix.shape) <= max(kernel_matrix.shape):
        return None, None

    # If full is specified, missing bins are accounted for using a mask
    if full:
        missing_mask = preproc.make_missing_mask(
            contact_map.matrix.shape,
            valid_rows=contact_map.detectable_bins[0],
            valid_cols=contact_map.detectable_bins[1],
            max_dist=contact_map.max_dist,
            sym_upper=not contact_map.inter,
        )
    else:
        missing_mask = None

    # Pattern matching operates here
    mat_conv, mat_log10_pvals = normxcorr2(
        contact_map.matrix.tocsr(),
        kernel_matrix,
        max_dist=contact_map.max_dist,
        sym_upper=not contact_map.inter,
        full=full,
        missing_mask=missing_mask,
        tsvd=tsvd,
        pval=True,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
    )
    if dump:
        save_dump("03_normxcorr2", mat_conv)

    # Clean potential missing values
    mat_conv.data[np.isnan(mat_conv.data)] = 0

    # Only keep corrcoefs in scannable range
    if not contact_map.inter:
        mat_conv = preproc.diag_trim(mat_conv.tocsr(), contact_map.max_dist)
        if dump:
            save_dump("04_diag_trim", mat_conv)
    mat_conv = mat_conv.tocoo()
    mat_conv.eliminate_zeros()

    if run_mode == "detect":
        # Only attempt detection if no input coordinates were given:
        # find foci of highly correlated pixels and pick local maxima
        coords, foci_mat = pick_foci(mat_conv, kernel_config["pearson"])
        # If nothing was detected, no point in resuming
        if coords is None:
            return None, None
        if dump:
            save_dump("05_foci", foci_mat)
    else:
        # Coordinates are shifted in place below; work on a private copy so
        # the caller's array is not silently modified.
        coords = coords.copy()

    mat = contact_map.matrix.copy()
    det = [d.copy() for d in contact_map.detectable_bins]

    # Zero pad contact and convolution maps and shift missing bins and detected
    # pattern coords before validation if in full mode
    if full:
        mat = mat.tocoo()
        # Rows are padded by kh (vertical margin) and columns by kw
        # (horizontal margin), matching the shifts applied to det and coords.
        mat = preproc.zero_pad_sparse(
            mat, margin_h=kw, margin_v=kh, fmt="csr"
        )
        mat_conv = preproc.zero_pad_sparse(
            mat_conv, margin_h=kw, margin_v=kh, fmt="csr"
        )
        det[0] += kh
        det[1] += kw
        coords[:, 0] += kh
        coords[:, 1] += kw

    if not contact_map.inter:
        # Set the first max(km, kn) diagonals below the main diagonal to NaN
        # so that pileups do not count them
        big_k = max(km, kn)
        mat = mat.tocsr()
        mat += sp.diags(
            np.full(big_k, np.nan),
            -np.arange(1, big_k + 1),
            shape=mat.shape,
            format="csr",
        )

    # When detecting 1D pattern, enforce coordinates on diagonal
    # coordinates can be shifted by 1 since we keep the two first
    # diagonals to allow formation of foci via 4-way adjacency
    if kernel_config["max_dist"] == 0:
        coords[:, 0] = coords[:, 1]

    # Extract windows around coordinates and assign a correlation
    # to each pattern. In detection mode, we drop invalid patterns
    # in quantification mode, all input patterns are returned.
    filtered_coords, filtered_windows = validate_patterns(
        coords,
        mat,
        mat_conv.tocsr(),
        det,
        kernel_matrix,
        zero_tol=kernel_config["max_perc_zero"] / 100,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
        drop=run_mode == "detect",
    )

    # Shift coordinates of detected patterns back if padding was added
    if full:
        filtered_coords.bin1 -= kh
        filtered_coords.bin2 -= kw

    try:
        filtered_coords["pvalue"] = mat_log10_pvals[
            filtered_coords.bin1, filtered_coords.bin2
        ].A1
    # No coordinate passed the validation filters
    except AttributeError:
        filtered_coords["pvalue"] = None

    # Remove log10 transform from the p-values
    filtered_coords["pvalue"] = 10 ** filtered_coords["pvalue"]
    return filtered_coords, filtered_windows