def preprocess_hic(
    clr: cooler.Cooler,
    min_contacts: Optional[int] = None,
    region: Optional[str] = None,
) -> sp.csr_matrix:
    """
    Build the preprocessed Hi-C matrix from an input cooler object.

    The steps are applied in this order: region subsetting, contact
    subsampling, normalisation with the stored balancing weights, and
    detrending (obs / exp).

    Parameters
    ----------
    clr :
        Input cooler. Balancing weights must be pre-computed in the
        referenced cool file (a "weight" column in the bins table).
    min_contacts :
        When not None and the matrix is not empty, the raw contacts are
        subsampled to this number before normalisation.
    region :
        Optional region, in UCSC format, to which the matrix and bins are
        restricted. When None, the whole matrix is used.

    Returns
    -------
    The detrended matrix, after ``np.nan_to_num`` has been applied to its
    data and explicit zeros have been removed.

    Raises
    ------
    KeyError
        If the bins have no "weight" column; a message is written to
        stderr before the error is re-raised.
    """
    # Selectors over the raw (unbalanced) sparse contacts and the bin table
    matrix_selector = clr.matrix(sparse=True, balance=False)
    bin_selector = clr.bins()
    if region is not None:
        contacts = matrix_selector.fetch(region)
        bin_table = bin_selector.fetch(region)
    else:
        contacts = matrix_selector[:]
        bin_table = bin_selector[:]

    try:
        weights = bin_table["weight"].values
    except KeyError as err:
        sys.stderr.write("Error: Input cooler must be balanced.\n")
        raise err

    # Bring the matrix down to the requested coverage, unless it is empty
    total_contacts = contacts.sum()
    if total_contacts and (min_contacts is not None):
        subsampled = cup.subsample_contacts(contacts, min_contacts)
        contacts = subsampled.tocoo()

    # Detectable bins are computed on the raw counts, before balancing
    detectable = cup.get_detectable_bins(contacts, n_mads=5)

    # Apply the precomputed balancing weights to every stored contact
    row_weights = weights[contacts.row]
    col_weights = weights[contacts.col]
    contacts.data = contacts.data * row_weights * col_weights

    # Detrend for P(s); only the first element of `detectable` is forwarded
    detrended = cup.detrend(
        contacts.tocsr(),
        smooth=False,
        detectable_bins=detectable[0],
    )

    # NaNs become 0s, then the resulting explicit zeros are dropped
    detrended.data = np.nan_to_num(detrended.data)
    detrended.eliminate_zeros()
    return detrended
def test_subsample_contacts_count(n_contacts):
    """Subsampling to a raw contact count should yield about that many contacts.

    The summed data of the subsampled matrix is compared to ``n_contacts``
    with a relative tolerance of 0.1.
    """
    coo_input = mat.tocoo()
    subsampled = preproc.subsample_contacts(coo_input, n_contacts)
    observed_total = subsampled.data.sum()
    assert np.isclose(observed_total, n_contacts, rtol=0.1)
def test_subsample_contacts_prop(prop):
    """Subsampling a proportion of contacts should keep about that proportion.

    The requested count is ``prop`` times the total of the module-level
    matrix, truncated to an int; the subsampled total is then compared to
    the untruncated product with a relative tolerance of 0.1.
    """
    requested = int(prop * mat.data.sum())
    subsampled = preproc.subsample_contacts(mat.tocoo(), requested)
    observed_total = subsampled.data.sum()
    # Expected value is evaluated after subsampling, as a fresh sum of `mat`
    expected_total = mat.data.sum() * prop
    assert np.isclose(observed_total, expected_total, rtol=0.1)
def test_subsample_contacts_exceed(self, n_contacts):
    """Requesting more contacts than available must raise a ValueError."""
    # Callable form of assertRaises: invokes the function with the given
    # arguments and fails unless ValueError is raised.
    self.assertRaises(
        ValueError,
        preproc.subsample_contacts,
        mat,
        n_contacts,
    )