Ejemplo n.º 1
0
def test_zero_pad_sparse():
    """
    Check that zero padding produces the expected shape and keeps the input
    matrix unchanged in the centre of the padded output.
    """
    mat = sp.coo_matrix(np.ones((10, 10)))
    n_rows, n_cols = mat.shape
    for hpad in range(4):
        for vpad in range(4):
            padded = preproc.zero_pad_sparse(mat, margin_h=hpad, margin_v=vpad)
            padded_rows, padded_cols = padded.shape
            # Each margin is added on both sides of its axis
            assert padded_rows == n_rows + 2 * vpad
            assert padded_cols == n_cols + 2 * hpad
            # Stripping the margins must give back the original content
            centre = padded.toarray()[
                vpad:padded_rows - vpad, hpad:padded_cols - hpad
            ]
            assert np.all(mat.toarray() == centre)
Ejemplo n.º 2
0
def _xcorr2_sparse(signal, kernel, threshold=1e-6):
    """
    Cross correlate a sparse 2D signal with a dense 2D kernel.

    Parameters
    ----------
    signal: scipy.sparse.csr_matrix
        A 2-dimensional sparse matrix Ms x Ns acting as the detrended Hi-C map.
    kernel: numpy.array of floats or tuple of numpy.arrays
        A 2-dimensional numpy array Mk x Nk acting as the pattern template. Can
        also be a factorised kernel, given as a tuple (left, right) of arrays
        of shapes Mk x F and F x Nk. Each factor f contributes the product of
        column left[:, f] and row right[f, :].
    threshold : float
        Convolution score below which pixels will be set back to zero to save
        on time and memory.

    Returns
    -------
    out: scipy.sparse.csr_matrix
        Convolution product of signal by kernel. The valid part of the
        product is padded with zero margins of (Mk - 1) // 2 rows and
        (Nk - 1) // 2 columns on each side.

    Raises
    ------
    ValueError
        If the signal is dense, if the kernel is sparse, or if the factorised
        kernel has incompatible dimensions or contains no factor.
    """
    factorised = isinstance(kernel, tuple)

    # Sanity checks. They are done up front so that both kernel
    # representations fail early with the same explicit errors.
    if not factorised and sp.issparse(kernel):
        raise ValueError("cannot handle kernel in sparse format")
    if not sp.issparse(signal):
        raise ValueError("cannot handle signal in dense format")

    sm, sn = signal.shape

    if factorised:
        kernel_l, kernel_r = kernel
        km = kernel_l.shape[0]
        kn = kernel_r.shape[1]
        n_factors = kernel_l.shape[1]
        if n_factors != kernel_r.shape[0]:
            raise ValueError("Kernel factorisation is invalid")
        if n_factors == 0:
            # Without this guard, the loop below would never define `out`
            raise ValueError("Kernel factorisation contains no factor")
        out = None
        for f in range(n_factors):
            # Each 1D filter is unrolled into a banded matrix, so that the
            # 2D correlation becomes two sparse matrix products.
            subkernel_l = sp.diags(
                kernel_l[:, f],
                np.arange(km),
                shape=(sm - km + 1, sm),
                format="dia",
            )
            subkernel_r = sp.diags(
                kernel_r[f, :],
                -np.arange(kn),
                shape=(sn, sn - kn + 1),
                format="dia",
            )
            term = (subkernel_l @ signal) @ subkernel_r
            out = term if out is None else out + term
    else:
        km, kn = kernel.shape

        # Check if kernel is constant (uniform). Non-finite constants are
        # sent through the general code path.
        first_value = kernel[0, 0]
        constant_kernel = np.allclose(
            kernel, first_value, rtol=1e-08
        ) and np.isfinite(first_value)

        # Simplified convolution for the special case where kernel is constant:
        if constant_kernel:
            l_subkernel_sp = sp.diags(
                first_value * np.ones(km),
                np.arange(km),
                shape=(sm - km + 1, sm),
                format="dia",
            )
            r_subkernel_sp = sp.diags(
                np.ones(kn),
                -np.arange(kn),
                shape=(sn, sn - kn + 1),
                format="dia",
            )
            out = (l_subkernel_sp @ signal) @ r_subkernel_sp
        # convolution code for general case
        # 1. 2D kernel composed of 1D filters, each col being a 1D filter.
        # 2. input remains unchanged, and each 1D kernel is unrolled into
        # a sparse toeplitz matrix.
        # 3. Each 1D conv is computed via a sparse matrix product.
        else:
            out = sp.csc_matrix((sm - km + 1, sn - kn + 1), dtype=np.float64)
            # In case the kernel is rectangle, loop over the smallest
            # dimension so that fewer sparse products are needed
            if kn < km:
                for kj in range(kn):
                    subkernel_sp = sp.diags(
                        kernel[:, kj],
                        np.arange(km),
                        shape=(sm - km + 1, sm),
                        format="csr",
                    )
                    out += subkernel_sp.dot(signal[:, kj : sn - kn + 1 + kj])
            else:
                for ki in range(km):
                    subkernel_sp = sp.diags(
                        np.array(kernel[ki, :]).flatten(),
                        np.arange(kn),
                        shape=(sn - kn + 1, sn),
                        format="csr",
                    )
                    out += signal[ki : sm - km + 1 + ki, :].dot(subkernel_sp.T)

    # Set very low pixels to 0
    out.data[np.abs(out.data) < threshold] = 0
    out.eliminate_zeros()

    # Resize matrix to original dimensions
    out = preproc.zero_pad_sparse(
        out, margin_h=(kn - 1) // 2, margin_v=(km - 1) // 2, fmt="csr"
    )
    return out
Ejemplo n.º 3
0
def pattern_detector(
    contact_map,
    kernel_config,
    kernel_matrix,
    coords=None,
    dump=None,
    full=False,
    tsvd=None,
):
    """
    Detect patterns in a contact map by kernel matching, and extract windows
    around the detected patterns. If coordinates are provided, detection is
    skipped and windows are extracted around those coordinates.

    Parameters
    ----------
    contact_map : ContactMap object
        An object containing an inter- or intra-chromosomal Hi-C contact map
        and additional metadata. The attributes matrix, detectable_bins,
        max_dist, inter and name are used here.
    kernel_config : dict
        The kernel configuration, as documented in
        chromosight.utils.io.load_kernel_config
    kernel_matrix : numpy.array
        The kernel matrix to use for convolution as a 2D numpy array
    coords : numpy.array of ints or None
        A table with coordinates of patterns, with one pattern per row
        and 2 columns being the row and column number of the pattern in
        the input contact map. If this is provided, detection is skipped
        and quantification is performed on those coordinates. The input
        array is not modified.
    dump : str or None
        Folder in which dumps should be generated after each step of the
        detection process. If None, no dump is generated
    full : bool
        If True, missing bins are accounted for using a mask during the
        convolution, and the contact and convolution maps are zero-padded
        by half the kernel size before patterns are validated.
    tsvd : float or None
        If a float between 0 and 1 is given, the input kernel is factorised
        using truncated SVD, keeping enough singular vectors to retain this
        proportion of information. Factorisation speeds up convolution at
        the cost of a loss of information. Disabled by default (None).

    Returns
    -------
    filtered_chrom_patterns : pandas.DataFrame or None
        A table of patterns as returned by validate_patterns (with columns
        bin1 and bin2), to which a pvalue column is added.
    chrom_pattern_windows : numpy array or None
        A 3D array containing the pile of windows around detected patterns.

    Both values are None if the contact map is not larger than the kernel,
    or if no pattern was found in detection mode.
    """
    km, kn = kernel_matrix.shape
    # Half-height (rows) and half-width (columns) of the kernel
    kh, kw = (km - 1) // 2, (kn - 1) // 2

    def save_dump(base, mat):
        """Save a sparse matrix into the dump folder, named after the map."""
        sp.save_npz(
            pathlib.Path(dump) / f"{contact_map.name}_{base}", mat
        )

    # Define type of analysis.
    run_mode = "detect" if coords is None else "quantify"

    # Do not attempt pattern detection unless matrix is larger than the kernel
    if min(contact_map.matrix.shape) <= max(kernel_matrix.shape):
        return None, None

    # Coordinates are shifted and overwritten in place further down: work on
    # a private copy so that the caller's array is left untouched.
    if run_mode == "quantify":
        coords = coords.copy()

    # If full is specified, missing bins are accounted for using a mask
    if full:
        missing_mask = preproc.make_missing_mask(
            contact_map.matrix.shape,
            valid_rows=contact_map.detectable_bins[0],
            valid_cols=contact_map.detectable_bins[1],
            max_dist=contact_map.max_dist,
            sym_upper=not contact_map.inter,
        )
    else:
        missing_mask = None

    # Pattern matching operates here
    mat_conv, mat_log10_pvals = normxcorr2(
        contact_map.matrix.tocsr(),
        kernel_matrix,
        max_dist=contact_map.max_dist,
        sym_upper=not contact_map.inter,
        full=full,
        missing_mask=missing_mask,
        tsvd=tsvd,
        pval=True,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
    )
    if dump:
        save_dump("03_normxcorr2", mat_conv)
    # Clean potential missing values
    mat_conv.data[np.isnan(mat_conv.data)] = 0
    # Only keep corrcoefs in scannable range
    if not contact_map.inter:
        mat_conv = preproc.diag_trim(mat_conv.tocsr(), contact_map.max_dist)
        if dump:
            save_dump("04_diag_trim", mat_conv)
    mat_conv = mat_conv.tocoo()
    mat_conv.eliminate_zeros()

    # Only attempt detection if no input coordinates were given
    if run_mode == "detect":
        # Find foci of highly correlated pixels and pick local maxima
        coords, foci_mat = pick_foci(mat_conv, kernel_config["pearson"])
        # If nothing was detected, no point in resuming
        if coords is None:
            return None, None
        if dump:
            save_dump("05_foci", foci_mat)
    mat = contact_map.matrix.copy()
    det = [d.copy() for d in contact_map.detectable_bins]

    # Zero pad contact and convolution maps and shift missing bins and detected
    # pattern coords before validation if in full mode
    if full:
        mat = mat.tocoo()
        # Margins are passed by keyword: the vertical margin (rows) must be
        # kh and the horizontal one (columns) kw, to match the shifts applied
        # to rows and columns right below.
        mat = preproc.zero_pad_sparse(
            mat, margin_h=kw, margin_v=kh, fmt="csr"
        )
        mat_conv = preproc.zero_pad_sparse(
            mat_conv, margin_h=kw, margin_v=kh, fmt="csr"
        )
        det[0] += kh
        det[1] += kw
        coords[:, 0] += kh
        coords[:, 1] += kw

    if not contact_map.inter:
        # Set the first max(km, kn) diagonals below the main diagonal to NaN
        # so that pileups do not count them
        big_k = max(km, kn)
        mat = mat.tocsr()
        mat += sp.diags(
            np.full(big_k, np.nan),
            -np.arange(1, big_k + 1),
            shape=mat.shape,
            format="csr",
        )
        # When detecting 1D pattern, enforce coordinates on diagonal
        # coordinates can be shifted by 1 since we keep the two first
        # diagonals to allow formation of foci via 4-way adjacency
        if kernel_config["max_dist"] == 0:
            coords[:, 0] = coords[:, 1]

    # Extract windows around coordinates and assign a correlation
    # to each pattern. In detection mode, we drop invalid patterns
    # in quantification mode, all input patterns are returned.
    filtered_coords, filtered_windows = validate_patterns(
        coords,
        mat,
        mat_conv.tocsr(),
        det,
        kernel_matrix,
        zero_tol=kernel_config["max_perc_zero"] / 100,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
        drop=run_mode == "detect",
    )

    # Shift coordinates of detected patterns back if padding was added
    if full:
        filtered_coords.bin1 -= kh
        filtered_coords.bin2 -= kw

    try:
        filtered_coords["pvalue"] = mat_log10_pvals[
            filtered_coords.bin1, filtered_coords.bin2
        ].A1
    # No coordinate passed the validation filters
    except AttributeError:
        filtered_coords["pvalue"] = None
    # Remove log10 transform (no multiple testing correction is applied here)
    filtered_coords["pvalue"] = 10 ** filtered_coords["pvalue"]
    return filtered_coords, filtered_windows