예제 #1
0
 def test_get_detectable_bins(self):
     """Verify that blanked-out bins are excluded from the detectable bins.

     Two scenarios are exercised: a square random matrix processed with
     ``inter=False`` and a rectangular random matrix processed with
     ``inter=True``. Finally, a rectangular matrix passed with
     ``inter=False`` is expected to raise ``ValueError``.
     """
     # --- Square matrix, inter=False ---
     blank_bin = 10
     square = sp.random(1000, 1000, density=0.1, format="csr")
     # Wipe one row and the matching column so that bin becomes an outlier
     square[blank_bin, :] = 0.0
     square[:, blank_bin] = 0.0
     square = square.tocoo()
     square.eliminate_zeros()
     sym_bins = preproc.get_detectable_bins(square, inter=False, n_mads=1)
     # Rows and columns must be reported identically in symmetric mode
     assert np.all(sym_bins[0] == sym_bins[1])
     # The wiped bin must not be listed as detectable
     assert blank_bin not in sym_bins[0]

     # --- Rectangular matrix, inter=True ---
     blank_row, blank_col = 10, 6
     rect = sp.random(100, 1000, density=0.1, format="csr")
     rect[blank_row, :] = 0
     rect[:, blank_col] = 0
     rect = rect.tocoo()
     rect.eliminate_zeros()
     rect.data *= 1000
     asym_bins = preproc.get_detectable_bins(rect, inter=True, n_mads=1)
     # Each dimension must have lost its own wiped index
     assert blank_row not in asym_bins[0]
     assert blank_col not in asym_bins[1]

     # A rectangular matrix is rejected when inter=False
     with self.assertRaises(ValueError):
         preproc.get_detectable_bins(rect, inter=False)
예제 #2
0
def test_hic_genome_normalize(path):
    """Check that bin sums are close to 1 after normalizing a HicGenome.

    Detectable bins are computed on the matrix before ``normalize`` is
    called; after normalization the matrix is restricted to those rows and
    columns and every bin sum must equal 1 within a 5% relative tolerance.
    """
    genome = ccm.HicGenome(path)
    detectable = preproc.get_detectable_bins(genome.matrix, n_mads=5)
    genome.normalize(iterations=100)

    # Keep only detectable rows, then only detectable columns
    normalized = genome.matrix.tocsr()
    row_subset = normalized[detectable[0], :]
    subset = row_subset[:, detectable[1]]

    sums = preproc.sum_mat_bins(subset)
    assert np.allclose(sums, 1, rtol=0.05)
예제 #3
0
def get_common_valid_bins(
    mats: Iterable[sp.csr_matrix],
    n_mads: float = 5,
) -> np.ndarray:
    """
    Generates an array of valid bins indices, using the intersection
    of valid bins from all input sparse matrices. All input matrices must
    be square and have the same shape.

    Parameters
    ----------
    mats : Iterable of sp.csr_matrix
        The matrices from which valid bins are extracted.
    n_mads : float
        Threshold forwarded to ``cup.get_detectable_bins`` for each matrix.

    Returns
    -------
    np.ndarray :
        A 1D array with the indices of bins valid in every matrix, sorted in
        increasing order. An empty integer array is returned when ``mats`` is
        empty or when no bin is valid in all matrices.

    Raises
    ------
    NotImplementedError
        If any input matrix is not square.
    """
    common_valid = None
    for mat in mats:
        if mat.shape[0] != mat.shape[1]:
            # The exception must actually be raised; merely instantiating
            # it would let non-square matrices through unnoticed.
            raise NotImplementedError("Only square matrices are valid input.")
        # Valid bins of the current matrix (first element of the result)
        valid = set(cup.get_detectable_bins(mat, n_mads=n_mads)[0])
        if common_valid is None:
            # Initialize the common set with the first matrix
            common_valid = valid
        else:
            # Drop bins that are not valid in the current matrix
            common_valid &= valid
    # No matrices at all, or an empty intersection: return an empty index
    # array instead of failing on list(None).
    if not common_valid:
        return np.array([], dtype=int)
    # Sort so the output does not depend on set iteration order
    return np.array(sorted(common_valid))
예제 #4
0
def get_common_valid_bins(
    mats: Iterable["sp.csr_matrix[float]"],
    n_mads: float = 5,
) -> "np.ndarray[int]":
    """
    Generates an array of valid bins indices, using the intersection
    of valid bins from all input sparse matrices. All input matrices must
    be square and have the same shape. Valid bins are defined based on their
    proportion of nonzero pixels.

    Parameters
    ----------
    mats : Iterable of sp.csr_matrix
        A list of sparse matrices representing Hi-C contacts, each matrix
        represents a sample.
    n_mads : float
        A bin is considered missing if its proportion of nonzero pixels is lower
        than n_mads median absolute deviations below the median of the bin
        distribution for the whole matrix.

    Returns
    -------
    np.ndarray of ints :
        A 1D array containing the indices of valid (non-missing) bins, sorted
        in increasing order. The array is empty if ``mats`` is empty or if no
        bin is valid in all matrices.

    Raises
    ------
    NotImplementedError
        If any input matrix is not square.
    """
    common_valid = None
    for mat in mats:
        if mat.shape[0] != mat.shape[1]:
            raise NotImplementedError("Only square matrices are valid input.")
        # Valid bins of the current matrix (first element of the result)
        valid = set(cup.get_detectable_bins(mat, n_mads=n_mads)[0])
        if common_valid is None:
            # Initialize the common set with the first matrix
            common_valid = valid
        else:
            # Remove elements absent from the current matrix
            common_valid &= valid
    # An empty iterable leaves common_valid as None; return an empty index
    # array rather than crashing with a TypeError on list(None).
    if not common_valid:
        return np.array([], dtype=int)
    # Sort so the output does not depend on set iteration order
    return np.array(sorted(common_valid))
예제 #5
0
def preprocess_hic(
    clr: cooler.Cooler,
    min_contacts: Optional[int] = None,
    region: Optional[str] = None,
) -> sp.csr_matrix:
    """
    Load a Hi-C matrix from a cooler object and return it preprocessed.

    The steps are applied in this order: subsetting to ``region`` (if given),
    subsampling contacts to ``min_contacts`` (if given and the matrix is not
    empty), normalisation with the stored balancing weights, and detrending
    (obs / exp). Balancing weights must be pre-computed in the referenced
    cool file. Region must be in UCSC format.

    Raises ``KeyError`` (after writing a message to stderr) when the bins
    table has no "weight" column.
    """
    # Raw (unbalanced) counts and bin table, restricted to region if requested
    mat_selector = clr.matrix(sparse=True, balance=False)
    bin_selector = clr.bins()
    if region is not None:
        mat = mat_selector.fetch(region)
        bins = bin_selector.fetch(region)
    else:
        mat = mat_selector[:]
        bins = bin_selector[:]

    try:
        weights = bins["weight"].values
    except KeyError as exc:
        sys.stderr.write("Error: Input cooler must be balanced.\n")
        raise exc

    # Bring the matrix down to the requested coverage, unless it is empty
    total_contacts = mat.sum()
    if total_contacts and (min_contacts is not None):
        mat = cup.subsample_contacts(mat, min_contacts).tocoo()
    detectable = cup.get_detectable_bins(mat, n_mads=5)

    # Apply the stored weights to each pixel (row weight times column weight)
    mat.data = mat.data * weights[mat.row] * weights[mat.col]

    # Detrend for P(s), using only the detectable bins
    detrended = cup.detrend(
        mat.tocsr(), smooth=False, detectable_bins=detectable[0]
    )

    # NaNs become zeros, which are then dropped from the sparse structure
    detrended.data = np.nan_to_num(detrended.data)
    detrended.eliminate_zeros()
    return detrended