def test_get_detectable_bins(self):
    """Verify that get_detectable_bins flags emptied bins as undetectable.

    Covers three scenarios: a square matrix in intra (``inter=False``) mode,
    a rectangular matrix in inter (``inter=True``) mode, and a rectangular
    matrix wrongly passed in intra mode, which is expected to raise.
    """
    # Square case: random sparse matrix with values drawn in [0, 1).
    square_mat = sp.random(1000, 1000, density=0.1, format="csr")
    # Empty row and column 10 so that this bin becomes an outlier.
    square_mat[10, :] = 0.0
    square_mat[:, 10] = 0.0
    square_mat = square_mat.tocoo()
    square_mat.eliminate_zeros()
    square_bins = preproc.get_detectable_bins(
        square_mat, inter=False, n_mads=1
    )
    # In symmetric mode, rows and columns must report the same bins.
    assert np.all(square_bins[0] == square_bins[1])
    # The emptied bin must not be listed as detectable.
    assert 10 not in square_bins[0]

    # Rectangular case: a different bin is emptied on each axis.
    rect_mat = sp.random(100, 1000, 0.1, format="csr")
    rect_mat[10, :] = 0
    rect_mat[:, 6] = 0
    rect_mat = rect_mat.tocoo()
    rect_mat.eliminate_zeros()
    rect_mat.data *= 1000
    rect_bins = preproc.get_detectable_bins(rect_mat, inter=True, n_mads=1)
    # Each axis must have dropped its own emptied index.
    assert 10 not in rect_bins[0]
    assert 6 not in rect_bins[1]

    # A rectangular matrix is not acceptable when inter=False.
    self.assertRaises(
        ValueError, preproc.get_detectable_bins, rect_mat, inter=False
    )
def test_hic_genome_normalize(path):
    """Check that normalizing a HicGenome brings valid bin sums close to 1."""
    genome = ccm.HicGenome(path)
    # Detectable bins are computed on the matrix *before* normalization.
    detectable = preproc.get_detectable_bins(genome.matrix, n_mads=5)
    genome.normalize(iterations=100)

    # Restrict the normalized matrix to detectable rows, then columns.
    normalized = genome.matrix.tocsr()
    kept = normalized[detectable[0], :][:, detectable[1]]

    sums = preproc.sum_mat_bins(kept)
    # Every retained bin should sum to 1 within a 5% relative tolerance.
    assert np.allclose(sums, 1, rtol=0.05)
def get_common_valid_bins(
    mats: Iterable[sp.csr_matrix],
    n_mads: float = 5,
) -> np.ndarray:
    """
    Generates an array of valid bins indices, using the intersection of
    valid bins from all input sparse matrices. All input matrices must be
    square and have the same shape.

    Parameters
    ----------
    mats : Iterable of sp.csr_matrix
        Sparse matrices to inspect, one per sample.
    n_mads : float
        Threshold forwarded unchanged to ``cup.get_detectable_bins`` for
        each matrix.

    Returns
    -------
    np.ndarray :
        A 1D array with the indices of bins reported as detectable in every
        input matrix. The array is built from a set, so the indices are not
        guaranteed to be sorted.

    Raises
    ------
    NotImplementedError
        If any input matrix is not square.
    """
    common_valid = None
    for mat in mats:
        if mat.shape[0] != mat.shape[1]:
            # The exception must actually be raised: merely instantiating it
            # would let rectangular matrices through silently.
            raise NotImplementedError("Only square matrices are valid input.")
        # Get the list of valid bins in the current matrix
        valid = cup.get_detectable_bins(mat, n_mads=n_mads)
        # Matrices are square, so only the first axis of the result is used.
        current_valid = set(valid[0])
        if common_valid is None:
            # Initialize set of common bins with the first matrix
            common_valid = current_valid
        else:
            # Remove elements absent from current matrix from the common set
            common_valid = common_valid.intersection(current_valid)
    return np.array(list(common_valid))
def get_common_valid_bins(
    mats: Iterable["sp.csr_matrix[float]"],
    n_mads: float = 5,
) -> "np.ndarray[int]":
    """
    Compute the bins that are valid in every input matrix.

    The valid bins of each sparse matrix are computed independently and
    intersected. All matrices must be square and share the same shape.
    Valid bins are defined based on their proportion of nonzero pixels.

    Parameters
    ----------
    mats : Iterable of sp.csr_matrix
        Sparse Hi-C contact matrices, one per sample.
    n_mads : float
        A bin is considered missing if its proportion of nonzero pixels is
        lower than n_mads median absolute deviations below the median of
        the bin distribution for the whole matrix.

    Returns
    -------
    np.ndarray of ints :
        A 1D array containing the indices of valid (non-missing) bins.

    Raises
    ------
    NotImplementedError
        If one of the matrices is not square.
    """
    shared_bins = None
    for sample in mats:
        n_rows, n_cols = sample.shape[0], sample.shape[1]
        if n_rows != n_cols:
            raise NotImplementedError("Only square matrices are valid input.")
        # Detectable bins of this sample; only the first axis is needed.
        sample_bins = set(cup.get_detectable_bins(sample, n_mads=n_mads)[0])
        # The first sample seeds the set, later ones can only shrink it.
        shared_bins = (
            sample_bins
            if shared_bins is None
            else shared_bins.intersection(sample_bins)
        )
    return np.array(list(shared_bins))
def preprocess_hic(
    clr: cooler.Cooler,
    min_contacts: Optional[int] = None,
    region: Optional[str] = None,
) -> sp.csr_matrix:
    """
    Build the preprocessed Hi-C matrix from a cooler object.

    Steps are applied in this order: region subsetting, contact
    subsampling, normalisation, detrending (obs / exp). Balancing weights
    must be pre-computed in the referenced cool file, and region must be
    given in UCSC format.
    """
    # Selectors for the raw (unbalanced) contacts and the bin table.
    mat_selector = clr.matrix(sparse=True, balance=False)
    bin_selector = clr.bins()
    if region is not None:
        mat = mat_selector.fetch(region)
        bins = bin_selector.fetch(region)
    else:
        mat = mat_selector[:]
        bins = bin_selector[:]

    # Balancing weights live in the "weight" column of the bin table.
    try:
        biases = bins["weight"].values
    except KeyError as exc:
        sys.stderr.write("Error: Input cooler must be balanced.\n")
        raise exc

    # Bring the matrix to the requested coverage, unless it is empty.
    subsampling_requested = min_contacts is not None
    if mat.sum() and subsampling_requested:
        mat = cup.subsample_contacts(mat, min_contacts).tocoo()

    detectable = cup.get_detectable_bins(mat, n_mads=5)

    # Balance the region using weights precomputed on the whole matrix.
    row_bias = biases[mat.row]
    col_bias = biases[mat.col]
    mat.data = mat.data * row_bias * col_bias

    # Remove the distance-dependent contact decay, P(s).
    mat = cup.detrend(
        mat.tocsr(), smooth=False, detectable_bins=detectable[0]
    )

    # NaN pixels become zeros, which are then dropped from storage.
    mat.data = np.nan_to_num(mat.data)
    mat.eliminate_zeros()
    return mat