Example #1
def test_connectivity(seed=36):
    # Test that graph connectivity test works as expected
    graph = np.array([[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [0, 0, 0, 1, 1]])
    assert_equal(_graph_is_connected(graph), False)
    assert_equal(_graph_is_connected(csr_matrix(graph)), False)
    assert_equal(_graph_is_connected(csc_matrix(graph)), False)
    graph = np.array([[1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [0, 0, 0, 1, 1]])
    assert_equal(_graph_is_connected(graph), True)
    assert_equal(_graph_is_connected(csr_matrix(graph)), True)
    assert_equal(_graph_is_connected(csc_matrix(graph)), True)
Example #2
    def compute_markov_matrix(self, skip_checks=False, overwrite=False):

        """
        Slightly modified code originally written by Satrajit Ghosh ([email protected] github.com/satra/mapalign)
        """

        import numpy as np
        import scipy.sparse as sps

        L = self.current_cmat_
        alpha = self.diff_alpha

        use_sparse = False
        if sps.issparse(L):
            use_sparse = True

        if not skip_checks:
            from sklearn.manifold.spectral_embedding_ import _graph_is_connected
            if not _graph_is_connected(L):
                raise ValueError('Graph is disconnected')

        ndim = L.shape[0]
        if overwrite:
            L_alpha = L
        else:
            L_alpha = L.copy()

        if alpha > 0:
            # Step 2
            d = np.array(L_alpha.sum(axis=1)).flatten()
            d_alpha = np.power(d, -alpha)
            if use_sparse:
                # Scale rows by d_alpha, transpose, and repeat so that both
                # rows and columns are scaled (note: toarray() densifies)
                L_alpha.data *= d_alpha[L_alpha.indices]
                L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
                L_alpha.data *= d_alpha[L_alpha.indices]
                L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            else:
                L_alpha = d_alpha[:, np.newaxis] * L_alpha
                L_alpha = L_alpha * d_alpha[np.newaxis, :]

        # Step 3
        d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
        if use_sparse:
            L_alpha.data *= d_alpha[L_alpha.indices]
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha

        return L_alpha
Example #3
def compute_markov_matrix(L,
                          alpha=0.5,
                          diffusion_time=0,
                          skip_checks=False,
                          overwrite=False):
    """
    Computes a markov transition matrix from the affinity matrix.
    Code written by Satra Ghosh (https://github.com/satra/mapalign)
    """

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    ndim = L.shape[0]
    if overwrite:
        L_alpha = L
    else:
        L_alpha = L.copy()

    if alpha > 0:
        # Step 2
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    return L_alpha
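
A minimal usage sketch for the function above (not part of the original example; the Gaussian affinity construction is an arbitrary illustrative choice, and skip_checks=True sidesteps the private scikit-learn import): build a small affinity matrix and confirm the result is row-stochastic.

import numpy as np
import scipy.sparse as sps  # module-level alias the function above expects

rng = np.random.RandomState(0)
X = rng.rand(20, 3)

# Gaussian affinity from pairwise squared distances
sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
L = np.exp(-sq_dists / sq_dists.mean())

P = compute_markov_matrix(L, alpha=0.5, skip_checks=True)
assert np.allclose(P.sum(axis=1), 1.0)  # each row of a Markov matrix sums to 1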
Example #4
def test_connectivity(seed=36):
    # Test that graph connectivity test works as expected
    graph = np.array([[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 1, 1, 1, 0],
                      [0, 0, 1, 1, 1], [0, 0, 0, 1, 1]])
    assert not _graph_is_connected(graph)
    assert not _graph_is_connected(sparse.csr_matrix(graph))
    assert not _graph_is_connected(sparse.csc_matrix(graph))
    graph = np.array([[1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [0, 1, 1, 1, 0],
                      [0, 0, 1, 1, 1], [0, 0, 0, 1, 1]])
    assert _graph_is_connected(graph)
    assert _graph_is_connected(sparse.csr_matrix(graph))
    assert _graph_is_connected(sparse.csc_matrix(graph))
Example #5
def test_connectivity(seed=36):
    # Test that graph connectivity test works as expected
    graph = np.array([[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 1, 1, 1, 0],
                      [0, 0, 1, 1, 1], [0, 0, 0, 1, 1]])
    assert_equal(_graph_is_connected(graph), False)
    assert_equal(_graph_is_connected(csr_matrix(graph)), False)
    assert_equal(_graph_is_connected(csc_matrix(graph)), False)
    graph = np.array([[1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [0, 1, 1, 1, 0],
                      [0, 0, 1, 1, 1], [0, 0, 0, 1, 1]])
    assert_equal(_graph_is_connected(graph), True)
    assert_equal(_graph_is_connected(csr_matrix(graph)), True)
    assert_equal(_graph_is_connected(csc_matrix(graph)), True)
Example #6
def compute_markov_matrix(L, alpha=0.5, diffusion_time=0, skip_checks=False, overwrite=False):
    import numpy as np
    import scipy.sparse as sps

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    ndim = L.shape[0]
    if overwrite:
        L_alpha = L
    else:
        L_alpha = L.copy()

    if alpha > 0:
        # Step 2
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    return L_alpha
Example #7
def diffusemapReduction(dist,
                        epi=0.5,
                        mode='manual',
                        n_neighbors=10,
                        n_components=10):
    if mode == 'auto':
        dist_thresh, epi = thresholdDistanceMatrix(dist)
    else:
        dist_thresh, epi = thresholdDistanceMatrix(dist,
                                                   epi=epi,
                                                   mode='manual')
    plt.figure(10)
    plt.imshow(dist_thresh)
    plt.colorbar()
    if not _graph_is_connected(dist_thresh):
        raise ValueError('Graph is disconnected')
    embedding, result = compute_diffusion_map(dist_thresh,
                                              alpha=0.,
                                              n_components=n_components)

    result['epi'] = epi

    return embedding, result
Example #8
def compute_diffusion_map(L,
                          alpha=0.5,
                          n_components=None,
                          diffusion_time=0,
                          skip_checks=False,
                          overwrite=False):
    """Compute the diffusion maps of a symmetric similarity matrix

        L : matrix N x N
           L is symmetric and L(x, y) >= 0

        alpha: float [0, 1]
            Setting alpha=1, the diffusion operator approximates the
            Laplace-Beltrami operator. We then recover the Riemannian geometry
            of the data set regardless of the distribution of the points. To
            describe the long-term behavior of the point distribution of a
            system of stochastic differential equations, we can use alpha=0.5
            and the resulting Markov chain approximates the Fokker-Planck
            diffusion. With alpha=0, it reduces to the classical graph Laplacian
            normalization.

        n_components: int
            The number of diffusion map components to return. Due to the
            spectrum decay of the eigenvalues, only a few terms are necessary to
            achieve a given relative accuracy in the sum M^t.

        diffusion_time: float >= 0
            use the diffusion_time (t) step transition matrix M^t

            t not only serves as a time parameter, but also has the dual role of
            a scale parameter. One of the main ideas of the diffusion framework is
            that running the chain forward in time (taking larger and larger
            powers of M) reveals the geometric structure of X at larger and
            larger scales (the diffusion process).

            t = 0 empirically provides a reasonable balance from a clustering
            perspective. Specifically, the notion of a cluster in the data set
            is quantified as a region in which the probability of escaping this
            region is low (within a certain time t).

        skip_checks: bool
            Avoid expensive pre-checks on input data. The caller has to make
            sure that input data is valid or results will be undefined.

        overwrite: bool
            Optimize memory usage by re-using input matrix L as scratch space.

        References
        ----------

        [1] https://en.wikipedia.org/wiki/Diffusion_map
        [2] Coifman, R.R.; S. Lafon. (2006). "Diffusion maps". Applied and
        Computational Harmonic Analysis 21: 5-30. doi:10.1016/j.acha.2006.04.006
    """

    import numpy as np
    import scipy.sparse as sps

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    ndim = L.shape[0]
    if overwrite:
        L_alpha = L
    else:
        L_alpha = L.copy()

    if alpha > 0:
        # Step 2
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    M = L_alpha

    from scipy.sparse.linalg import eigsh, eigs

    # Step 4
    func = eigs
    if n_components is not None:
        lambdas, vectors = func(M, k=n_components + 1)
    else:
        lambdas, vectors = func(M, k=max(2, int(np.sqrt(ndim))))
    del M

    if func == eigsh:
        lambdas = lambdas[::-1]
        vectors = vectors[:, ::-1]
    else:
        lambdas = np.real(lambdas)
        vectors = np.real(vectors)
        lambda_idx = np.argsort(lambdas)[::-1]
        lambdas = lambdas[lambda_idx]
        vectors = vectors[:, lambda_idx]

    # Step 5
    psi = vectors / vectors[:, [0]]
    diffusion_times = diffusion_time
    if diffusion_time == 0:
        # Multi-scale map: report an effective per-eigenvalue diffusion time
        # and use lambda / (1 - lambda), which sums the chain over all t >= 1
        diffusion_times = np.exp(1. -
                                 np.log(1 - lambdas[1:]) / np.log(lambdas[1:]))
        lambdas = lambdas[1:] / (1 - lambdas[1:])
    else:
        lambdas = lambdas[1:] ** float(diffusion_time)
    lambda_ratio = lambdas / lambdas[0]
    threshold = max(0.05, lambda_ratio[-1])

    n_components_auto = np.amax(np.nonzero(lambda_ratio > threshold)[0])
    n_components_auto = min(n_components_auto, ndim)
    if n_components is None:
        n_components = n_components_auto
    embedding = psi[:, 1:(n_components + 1)] * lambdas[:n_components][None, :]

    result = dict(lambdas=lambdas,
                  vectors=vectors,
                  n_components=n_components,
                  diffusion_time=diffusion_times,
                  n_components_auto=n_components_auto)
    return embedding, result
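
A minimal usage sketch (illustrative assumptions: noisy circle data and a fixed Gaussian kernel width; skip_checks=True avoids the private scikit-learn import): embed 100 points into two diffusion components.

import numpy as np

rng = np.random.RandomState(42)
theta = rng.uniform(0, 2 * np.pi, 100)
X = np.c_[np.cos(theta), np.sin(theta)] + 0.05 * rng.randn(100, 2)

sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
L = np.exp(-sq_dists / (2 * 0.5 ** 2))  # Gaussian affinity, sigma = 0.5

embedding, result = compute_diffusion_map(L, alpha=0.5, n_components=2,
                                          skip_checks=True)
print(embedding.shape)      # (100, 2)
print(result['lambdas'])    # eigenvalue multipliers after time scaling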
Example #9
def diffusion_mapping(adj,
                      n_components=10,
                      alpha=0.5,
                      diffusion_time=0,
                      random_state=None):
    """Compute diffusion map of affinity matrix.

    Parameters
    ----------
    adj : ndarray or sparse matrix, shape = (n, n)
        Affinity matrix.
    n_components : int or None, optional
        Number of eigenvectors. If None, selection of `n_components` is based
        on 95% drop-off in eigenvalues. When `n_components` is None,
        the maximum number of eigenvectors is restricted to
        ``n_components <= sqrt(n)``. Default is 10.
    alpha : float, optional
        Anisotropic diffusion parameter, ``0 <= alpha <= 1``. Default is 0.5.
    diffusion_time : int, optional
        Diffusion time or scale. If ``diffusion_time == 0`` use multi-scale
        diffusion maps. Default is 0.
    random_state : int or None, optional
        Random state. Default is None.

    Returns
    -------
    v : ndarray, shape (n, n_components)
        Eigenvectors of the affinity matrix, in the same order as `w`.
    w : ndarray, shape (n_components,)
        Eigenvalues of the affinity matrix in descending order.

    References
    ----------
    * Coifman, R.R.; S. Lafon. (2006). "Diffusion maps". Applied and
      Computational Harmonic Analysis 21: 5-30. doi:10.1016/j.acha.2006.04.006
    * Joseph W.R., Peter E.F., Ann B.L., Chad M.S. Accurate parameter
      estimation for star formation history in galaxies using SDSS spectra.
    """

    rs = check_random_state(random_state)
    use_sparse = ssp.issparse(adj)

    # Make symmetric
    if not is_symmetric(adj, tol=1E-10):
        warnings.warn('Affinity is not symmetric. Making symmetric.')
        adj = make_symmetric(adj, check=False, copy=True, sparse_format='coo')
    else:  # Copy anyways because we will be working on the matrix
        adj = adj.tocoo(copy=True) if use_sparse else adj.copy()

    # Check connected
    if not _graph_is_connected(adj):
        warnings.warn('Graph is not fully connected.')

    ###########################################################
    # Step 2
    ###########################################################
    # When α=0, you get back the diffusion map based on the random walk-style
    # diffusion operator (and Laplacian Eigenmaps). For α=1, the diffusion
    # operator approximates the Laplace-Beltrami operator and for α=0.5,
    # you get Fokker-Planck diffusion. The anisotropic diffusion
    # parameter: alpha in [0, 1]
    # W(α) = D^{−α} W D^{−α}
    if alpha > 0:
        if use_sparse:
            d = np.power(adj.sum(axis=1).A1, -alpha)
            adj.data *= d[adj.row]
            adj.data *= d[adj.col]
        else:
            d = adj.sum(axis=1, keepdims=True)
            d = np.power(d, -alpha)
            adj *= d.T
            adj *= d

    ###########################################################
    # Step 3
    ###########################################################
    # Diffusion operator
    # P(α) = D(α)^{−1}W(α)
    if use_sparse:
        d_alpha = np.power(adj.sum(axis=1).A1, -1)
        adj.data *= d_alpha[adj.row]
    else:
        adj *= np.power(adj.sum(axis=1, keepdims=True), -1)

    ###########################################################
    # Step 4
    ###########################################################
    if n_components is None:
        n_components = max(2, int(np.sqrt(adj.shape[0])))
        auto_n_comp = True
    else:
        auto_n_comp = False

    # For repeatability of results
    v0 = rs.uniform(-1, 1, adj.shape[0])

    # Find largest eigenvalues and eigenvectors
    w, v = eigsh(adj, k=n_components + 1, which='LM', tol=0, v0=v0)

    # Sort descending
    w, v = w[::-1], v[:, ::-1]

    ###########################################################
    # Step 5
    ###########################################################
    # Force first eigenvector to be all ones.
    v /= v[:, [0]]

    # Largest eigenvalue should be equal to one too
    w /= w[0]

    # Discard first (largest) eigenvalue and eigenvector
    w, v = w[1:], v[:, 1:]

    if diffusion_time <= 0:
        # use multi-scale diffusion map, ref [4]
        # considers all scales: t=1,2,3,...
        w /= (1 - w)
    else:
        # Raise eigenvalues to the power of diffusion time
        w **= diffusion_time

    if auto_n_comp:
        # Choose n_components to coincide with a 95% drop-off
        # in the eigenvalue multipliers, ref [4]
        lambda_ratio = w / w[0]

        # If all eigenvalues larger than 0.05, select all
        # (i.e., sqrt(adj.shape[0]))
        threshold = max(0.05, lambda_ratio[-1])
        n_components = np.argmin(lambda_ratio > threshold)

        w = w[:n_components]
        v = v[:, :n_components]

    # Rescale eigenvectors with eigenvalues
    v *= w[None, :]

    # Consistent sign (s.t. largest value of element eigenvector is pos)
    v *= np.sign(v[np.abs(v).argmax(axis=0), range(v.shape[1])])
    return v, w
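
A minimal usage sketch (assuming the module-level helpers this function references, such as is_symmetric, make_symmetric, check_random_state and eigsh, are importable as in BrainSpace's embedding module; the random affinity is purely illustrative): map a symmetric affinity onto 5 diffusion components.

import numpy as np

rng = np.random.RandomState(0)
A = rng.rand(50, 50)
A = (A + A.T) / 2          # symmetric, non-negative affinity
np.fill_diagonal(A, 1.0)

v, w = diffusion_mapping(A, n_components=5, alpha=0.5, random_state=0)
print(v.shape, w.shape)               # (50, 5) (5,)
print(bool(np.all(np.diff(w) <= 0)))  # eigenvalues returned in descending order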
Example #10
def laplacian_eigenmaps(adj,
                        n_components=10,
                        norm_laplacian=True,
                        random_state=None):
    """Compute embedding using Laplacian eigenmaps.

    Adapted from Scikit-learn to also provide eigenvalues.

    Parameters
    ----------
    adj : 2D ndarray or sparse matrix
        Affinity matrix.
    n_components : int, optional
        Number of eigenvectors. Default is 10.
    norm_laplacian : bool, optional
        If True use normalized Laplacian. Default is True.
    random_state : int or None, optional
        Random state. Default is None.

    Returns
    -------
    v : 2D ndarray, shape (n, n_components)
        Eigenvectors of the affinity matrix, in the same order as `w`,
        where `n` is the number of rows of the affinity matrix.
    w : 1D ndarray, shape (n_components,)
        Eigenvalues of the affinity matrix in ascending order.

    References
    ----------
    * Belkin, M. and Niyogi, P. (2003). Laplacian Eigenmaps for
      dimensionality reduction and data representation.
      Neural Computation 15(6): 1373-96. doi:10.1162/089976603321780317

    """

    rs = check_random_state(random_state)

    # Make symmetric
    if not is_symmetric(adj, tol=1E-10):
        warnings.warn('Affinity is not symmetric. Making symmetric.')
        adj = make_symmetric(adj, check=False)

    # Check connected
    if not _graph_is_connected(adj):
        warnings.warn('Graph is not fully connected.')

    lap, dd = laplacian(adj, normed=norm_laplacian, return_diag=True)
    if norm_laplacian:
        if ssp.issparse(lap):
            lap.setdiag(1)
        else:
            np.fill_diagonal(lap, 1)

    lap *= -1
    v0 = rs.uniform(-1, 1, lap.shape[0])
    w, v = eigsh(lap, k=n_components + 1, sigma=1, which='LM', tol=0, v0=v0)

    # Sort descending and change sign of eigenvalues
    w, v = -w[::-1], v[:, ::-1]

    if norm_laplacian:
        v /= dd[:, None]

    # Drop smallest
    w, v = w[1:], v[:, 1:]

    # Consistent sign (s.t. largest value of element eigenvector is pos)
    v *= np.sign(v[np.abs(v).argmax(axis=0), range(v.shape[1])])
    return v, w
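
A minimal usage sketch under the same module-level assumptions as the sketch above; the kNN graph built with scikit-learn's kneighbors_graph is an illustrative choice, densified to keep the example simple.

import numpy as np
from sklearn.neighbors import kneighbors_graph

rng = np.random.RandomState(0)
X = rng.rand(60, 4)

A = kneighbors_graph(X, n_neighbors=8, mode='connectivity').toarray()
A = 0.5 * (A + A.T)        # symmetrize the kNN connectivity graph

v, w = laplacian_eigenmaps(A, n_components=5, random_state=0)
print(v.shape, w.shape)    # (60, 5) (5,)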

Example #11

thresholdDistanceMatrix(dist_Y)

# using diffusion mapping to do this
epi = 100
dist_thresh = np.exp(-dist_Y**2 / epi / 2.0)
M = dist_thresh
for i in range(M.shape[0]):
    M[i] = M[i] / np.sum(M[i])

from sklearn.manifold.spectral_embedding_ import _graph_is_connected
plt.figure(10)
plt.imshow(dist_thresh)
plt.colorbar()
if not _graph_is_connected(dist_thresh):
    raise ValueError('Graph is disconnected')

embedding, result = compute_diffusion_map(dist_thresh, n_components=5)

# w,v = np.linalg.eig(M)

# U,S,V = np.linalg.svd(M)

# plt.figure(1)
# plt.plot(np.real(v[:,1]))

# plt.figure(2)
# plt.plot(np.real(v[:,2]))
Example #12
def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0, skip_checks=False, overwrite=False):
    """Compute the diffusion maps of a symmetric similarity matrix

        L : matrix N x N
           L is symmetric and L(x, y) >= 0

        alpha: float [0, 1]
            Setting alpha=1, the diffusion operator approximates the
            Laplace-Beltrami operator. We then recover the Riemannian geometry
            of the data set regardless of the distribution of the points. To
            describe the long-term behavior of the point distribution of a
            system of stochastic differential equations, we can use alpha=0.5
            and the resulting Markov chain approximates the Fokker-Planck
            diffusion. With alpha=0, it reduces to the classical graph Laplacian
            normalization.

        n_components: int
            The number of diffusion map components to return. Due to the
            spectrum decay of the eigenvalues, only a few terms are necessary to
            achieve a given relative accuracy in the sum M^t.

        diffusion_time: float >= 0
            use the diffusion_time (t) step transition matrix M^t

            t not only serves as a time parameter, but also has the dual role of
            a scale parameter. One of the main ideas of the diffusion framework is
            that running the chain forward in time (taking larger and larger
            powers of M) reveals the geometric structure of X at larger and
            larger scales (the diffusion process).

            t = 0 empirically provides a reasonable balance from a clustering
            perspective. Specifically, the notion of a cluster in the data set
            is quantified as a region in which the probability of escaping this
            region is low (within a certain time t).

        skip_checks: bool
            Avoid expensive pre-checks on input data. The caller has to make
            sure that input data is valid or results will be undefined.

        overwrite: bool
            Optimize memory usage by re-using input matrix L as scratch space.

        References
        ----------

        [1] https://en.wikipedia.org/wiki/Diffusion_map
        [2] Coifman, R.R.; S. Lafon. (2006). "Diffusion maps". Applied and
        Computational Harmonic Analysis 21: 5-30. doi:10.1016/j.acha.2006.04.006
    """

    import numpy as np
    import scipy.sparse as sps

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected

        if not _graph_is_connected(L):
            raise ValueError("Graph is disconnected")

    ndim = L.shape[0]
    if overwrite:
        L_alpha = L
    else:
        L_alpha = L.copy()

    if alpha > 0:
        # Step 2
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    M = L_alpha

    from scipy.sparse.linalg import eigsh, eigs

    # Step 4
    func = eigs
    if n_components is not None:
        lambdas, vectors = func(M, k=n_components + 1)
    else:
        lambdas, vectors = func(M, k=max(2, int(np.sqrt(ndim))))
    del M

    if func == eigsh:
        lambdas = lambdas[::-1]
        vectors = vectors[:, ::-1]
    else:
        lambdas = np.real(lambdas)
        vectors = np.real(vectors)
        lambda_idx = np.argsort(lambdas)[::-1]
        lambdas = lambdas[lambda_idx]
        vectors = vectors[:, lambda_idx]

    # Step 5
    psi = vectors / vectors[:, [0]]
    if diffusion_time == 0:
        lambdas = lambdas[1:] / (1 - lambdas[1:])
    else:
        lambdas = lambdas[1:] ** float(diffusion_time)
    lambda_ratio = lambdas / lambdas[0]
    threshold = max(0.05, lambda_ratio[-1])

    n_components_auto = np.amax(np.nonzero(lambda_ratio > threshold)[0])
    n_components_auto = min(n_components_auto, ndim)
    if n_components is None:
        n_components = n_components_auto
    embedding = psi[:, 1 : (n_components + 1)] * lambdas[:n_components][None, :]

    result = dict(
        lambdas=lambdas,
        vectors=vectors,
        n_components=n_components,
        diffusion_time=diffusion_time,
        n_components_auto=n_components_auto,
    )
    return embedding, result
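
A short sketch (same assumptions as the sketch under Example #8) contrasting the two eigenvalue weightings above: diffusion_time=0 applies the multi-scale lambda / (1 - lambda) multipliers, while a positive diffusion_time raises the eigenvalues to that power.

import numpy as np

rng = np.random.RandomState(1)
A = rng.rand(30, 30)
A = (A + A.T) / 2          # symmetric, non-negative affinity
np.fill_diagonal(A, 1.0)

emb_ms, res_ms = compute_diffusion_map(A, n_components=3, diffusion_time=0,
                                       skip_checks=True)
emb_t2, res_t2 = compute_diffusion_map(A, n_components=3, diffusion_time=2,
                                       skip_checks=True)
print(res_ms['lambdas'])   # lambda / (1 - lambda) weighting
print(res_t2['lambdas'])   # lambda ** 2 weighting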
Example #13
def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0, verbose=False):
    # from https://github.com/satra/mapalign/blob/master/mapalign/embed.py
    import numpy as np
    import scipy.sparse as sps

    from sklearn.manifold.spectral_embedding_ import _graph_is_connected

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not _graph_is_connected(L):
        raise ValueError('Graph is disconnected')

    if verbose:
        print('checked conditions')

    ndim = L.shape[0]
    L_alpha = L.copy()
    if alpha > 0:
        if verbose:
            print('step 2')
        # Step 2
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, None] * L_alpha * d_alpha[None, :]

    # Step 3
    if verbose:
        print('step 3')
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, None] * L_alpha

    M = L_alpha

    from scipy.sparse.linalg import eigsh, eigs

    # Step 4
    if verbose:
        print('step 4')
    func = eigs
    if n_components is not None:
        lambdas, vectors = func(M, k=n_components + 1)
    else:
        lambdas, vectors = func(M, k=max(2, int(np.sqrt(ndim))))
    del M

    if func == eigsh:
        lambdas = lambdas[::-1]
        vectors = vectors[:, ::-1]
    else:
        lambdas = np.real(lambdas)
        vectors = np.real(vectors)
        lambda_idx = np.argsort(lambdas)[::-1]
        lambdas = lambdas[lambda_idx]
        vectors = vectors[:, lambda_idx]

    # Step 5
    if verbose:
        print('step 5')
    psi = vectors / vectors[:, [0]]
    if diffusion_time == 0:
        lambdas = lambdas[1:] / (1 - lambdas[1:])
    else:
        lambdas = lambdas[1:] ** float(diffusion_time)
    lambda_ratio = lambdas / lambdas[0]
    threshold = max(0.05, lambda_ratio[-1])

    n_components_auto = np.amax(np.nonzero(lambda_ratio > threshold)[0])
    n_components_auto = min(n_components_auto, ndim)
    if n_components is None:
        n_components = n_components_auto
    embedding = psi[:, 1:(n_components + 1)] * lambdas[:n_components][None, :]

    result = dict(lambdas=lambdas, vectors=vectors,
                  n_components=n_components, diffusion_time=diffusion_time,
                  n_components_auto=n_components_auto)
    return embedding, result
Example #14
    def spectral_embedding(self,
                           adjacency,
                           n_components=8,
                           eigen_solver=None,
                           random_state=None,
                           eigen_tol=0.0,
                           drop_first=True):
        """
        see original at https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/manifold/spectral_embedding_.py#L133
        custermize1: return lambdas with the embedded matrix.
        custermize2: norm_laplacian is always True
        """
        norm_laplacian = True
        adjacency = check_symmetric(adjacency)

        try:
            from pyamg import smoothed_aggregation_solver
        except ImportError:
            if eigen_solver == "amg":
                raise ValueError(
                    "The eigen_solver was set to 'amg', but pyamg is "
                    "not available.")

        if eigen_solver is None:
            eigen_solver = 'arpack'
        elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
            raise ValueError("Unknown value for eigen_solver: '%s'."
                             "Should be 'amg', 'arpack', or 'lobpcg'" %
                             eigen_solver)

        random_state = check_random_state(random_state)

        n_nodes = adjacency.shape[0]
        # Whether to drop the first eigenvector
        if drop_first:
            n_components = n_components + 1

        if not _graph_is_connected(adjacency):
            warnings.warn("Graph is not fully connected, spectral embedding"
                          " may not work as expected.")

        laplacian, dd = graph_laplacian(adjacency,
                                        normed=norm_laplacian,
                                        return_diag=True)
        if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
            (not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components)):
            # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
            # for details see the source code in scipy:
            # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
            # /lobpcg/lobpcg.py#L237
            # or matlab:
            # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
            laplacian = _set_diag(laplacian, 1, norm_laplacian)

            # Here we'll use shift-invert mode for fast eigenvalues
            # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
            #  for a short explanation of what this means)
            # Because the normalized Laplacian has eigenvalues between 0 and 2,
            # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
            # when finding eigenvalues of largest magnitude (keyword which='LM')
            # and when these eigenvalues are very large compared to the rest.
            # For very large, very sparse graphs, I - L can have many, many
            # eigenvalues very near 1.0.  This leads to slow convergence.  So
            # instead, we'll use ARPACK's shift-invert mode, asking for the
            # eigenvalues near 1.0.  This effectively spreads-out the spectrum
            # near 1.0 and leads to much faster convergence: potentially an
            # orders-of-magnitude speedup over simply using keyword which='LA'
            # in standard mode.
            try:
                # We are computing the opposite of the laplacian inplace so as
                # to spare a memory allocation of a possibly very large array
                laplacian *= -1
                lambdas, diffusion_map = eigsh(laplacian,
                                               k=n_components,
                                               sigma=1.0,
                                               which='LM',
                                               tol=eigen_tol)
                embedding = diffusion_map.T[n_components::-1] * dd

            except RuntimeError:
                # When submatrices are exactly singular, an LU decomposition
                # in arpack fails. We fallback to lobpcg
                eigen_solver = "lobpcg"
                # Revert the laplacian to its opposite to have lobpcg work
                laplacian *= -1

        if eigen_solver == 'amg':
            # Use AMG to get a preconditioner and speed up the eigenvalue
            # problem.
            if not sparse.issparse(laplacian):
                warnings.warn("AMG works better for sparse matrices")
            # lobpcg needs double precision floats
            laplacian = check_array(laplacian,
                                    dtype=np.float64,
                                    accept_sparse=True)
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
            M = ml.aspreconditioner()
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian,
                                            X,
                                            M=M,
                                            tol=1.e-12,
                                            largest=False)
            embedding = diffusion_map.T * dd
            if embedding.shape[0] == 1:
                raise ValueError

        elif eigen_solver == "lobpcg":
            # lobpcg needs double precision floats
            laplacian = check_array(laplacian,
                                    dtype=np.float64,
                                    accept_sparse=True)
            if n_nodes < 5 * n_components + 1:
                # see note above under arpack why lobpcg has problems with small
                # number of nodes
                # lobpcg will fallback to eigh, so we short circuit it
                if sparse.isspmatrix(laplacian):
                    laplacian = laplacian.toarray()
                lambdas, diffusion_map = eigh(laplacian)
                embedding = diffusion_map.T[:n_components] * dd
            else:
                laplacian = _set_diag(laplacian, 1, norm_laplacian)
                # We increase the number of eigenvectors requested, as lobpcg
                # doesn't behave well in low dimension
                X = random_state.rand(laplacian.shape[0], n_components + 1)
                X[:, 0] = dd.ravel()
                lambdas, diffusion_map = lobpcg(laplacian,
                                                X,
                                                tol=1e-15,
                                                largest=False,
                                                maxiter=2000)
                embedding = diffusion_map.T[:n_components] * dd
                if embedding.shape[0] == 1:
                    raise ValueError

        embedding = _deterministic_vector_sign_flip(embedding)
        if drop_first:
            return embedding[1:n_components].T, lambdas
        else:
            return embedding[:n_components].T, lambdas
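
A minimal usage sketch (hypothetical: SpectralEmbeddingWithLambdas stands in for whatever class hosts the method above, which never touches self; module-level imports such as graph_laplacian, eigsh and _set_diag are assumed in place as in the original scikit-learn module): embed a dense RBF affinity and get the eigenvalues back alongside the embedding.

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X = rng.rand(40, 3)
adjacency = rbf_kernel(X, gamma=1.0)    # dense, symmetric, fully connected

model = SpectralEmbeddingWithLambdas()  # hypothetical host class
embedding, lambdas = model.spectral_embedding(adjacency, n_components=4,
                                              random_state=0)
print(embedding.shape, lambdas.shape)   # (40, 4) (5,)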