Ejemplo n.º 1
0
def preprocess_hic(
    clr: cooler.Cooler,
    min_contacts: Optional[int] = None,
    region: Optional[str] = None,
) -> sp.csr_matrix:
    """
    Build the preprocessed Hi-C matrix from a cooler object.

    The steps are applied in this order: optional region subsetting,
    optional contact subsampling, balancing with the weights stored in the
    cooler, and detrending. NaNs left in the result are replaced by zeros
    and explicit zeros are then dropped from the sparse matrix.

    Parameters
    ----------
    clr :
        Input cooler. Its bins table must contain a "weight" column, i.e.
        balancing weights must have been pre-computed in the cool file.
    min_contacts :
        If given and the raw matrix is not empty, the matrix is subsampled
        through ``cup.subsample_contacts`` with this value.
    region :
        Region to fetch (UCSC format). If None, the whole matrix is used.

    Returns
    -------
    The matrix returned by ``cup.detrend`` after NaN replacement and
    removal of explicit zeros.

    Raises
    ------
    KeyError
        If the bins table has no "weight" column. A message is written to
        stderr before the exception is re-raised.
    """
    # Lazy selectors on the raw (unbalanced) contacts and on the bins table
    contact_selector = clr.matrix(sparse=True, balance=False)
    bin_selector = clr.bins()
    if region is not None:
        contacts = contact_selector.fetch(region)
        bin_table = bin_selector.fetch(region)
    else:
        contacts = contact_selector[:]
        bin_table = bin_selector[:]

    # Balancing weights are mandatory: fail loudly when they are missing
    try:
        weights = bin_table["weight"].values
    except KeyError:
        sys.stderr.write("Error: Input cooler must be balanced.\n")
        raise

    # Bring the matrix to the requested coverage, unless it is empty
    if contacts.sum() and (min_contacts is not None):
        subsampled = cup.subsample_contacts(contacts, min_contacts)
        contacts = subsampled.tocoo()

    # Detectable bins are computed on raw counts, before balancing
    detectable = cup.get_detectable_bins(contacts, n_mads=5)

    # Apply the precomputed weights: first along rows, then along columns
    row_scaled = contacts.data * weights[contacts.row]
    contacts.data = row_scaled * weights[contacts.col]

    # Detrend, using only the first element of the detectable bins result
    detrended = cup.detrend(
        contacts.tocsr(), smooth=False, detectable_bins=detectable[0]
    )

    # NaNs become zeros, which are then removed from the sparse structure
    detrended.data = np.nan_to_num(detrended.data)
    detrended.eliminate_zeros()
    return detrended
Ejemplo n.º 2
0
def test_subsample_contacts_count(n_contacts):
    """Subsampling to a raw contact count should yield about that count."""
    coo_input = mat.tocoo()
    subsampled = preproc.subsample_contacts(coo_input, n_contacts)
    observed_total = subsampled.data.sum()
    # The total is only required to match within a 10% relative tolerance
    assert np.isclose(observed_total, n_contacts, rtol=0.1)
Ejemplo n.º 3
0
def test_subsample_contacts_prop(prop):
    """Subsampling to a proportion of contacts should yield about that share."""
    coo_input = mat.tocoo()
    # The proportion is converted to an absolute (truncated) contact count
    requested = int(prop * mat.data.sum())
    subsampled = preproc.subsample_contacts(coo_input, requested)
    observed_total = subsampled.data.sum()
    # Expected value is computed after the call, from the untruncated product
    expected_total = mat.data.sum() * prop
    assert np.isclose(observed_total, expected_total, rtol=0.1)
Ejemplo n.º 4
0
 def test_subsample_contacts_exceed(self, n_contacts):
     """Oversampling requests are expected to raise ValueError."""
     # Callable form of assertRaises: it invokes the function with the
     # given arguments and fails unless ValueError is raised.
     self.assertRaises(
         ValueError, preproc.subsample_contacts, mat, n_contacts
     )