Example #1
0
def define_clusts(similarity_matrix,
                  threshold=0.05,
                  max_iter=200,
                  method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed',
                                 max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)

            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)

            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
    return np.array(extra.flatten(clusters))
Example #2
0
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
                  method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)

            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)

            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
    return np.array(extra.flatten(clusters))
Example #3
0
def _fix_connectivity(X,
                      connectivity,
                      n_components=None,
                      affinity="euclidean"):
    """
    Fixes the connectivity matrix

        - copies it
        - makes it symmetric
        - converts it to LIL if necessary
        - completes it if necessary
    """
    n_samples = X.shape[0]
    if (connectivity.shape[0] != n_samples
            or connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.isspmatrix_lil(connectivity):
        if not sparse.isspmatrix(connectivity):
            connectivity = sparse.lil_matrix(connectivity)
        else:
            connectivity = connectivity.tolil()

    # Compute the number of nodes
    n_components, labels = connected_components(connectivity)

    if n_components > 1:
        warnings.warn("the number of connected components of the "
                      "connectivity matrix is %d > 1. Completing it to avoid "
                      "stopping the tree early." % n_components,
                      stacklevel=2)
        # XXX: Can we do without completing the matrix?
        for i in xrange(n_components):
            idx_i = np.where(labels == i)[0]
            Xi = X[idx_i]
            for j in xrange(i):
                idx_j = np.where(labels == j)[0]
                Xj = X[idx_j]
                D = pairwise_distances(Xi, Xj, metric=affinity)
                ii, jj = np.where(D == np.min(D))
                ii = ii[0]
                jj = jj[0]
                connectivity[idx_i[ii], idx_j[jj]] = True
                connectivity[idx_j[jj], idx_i[ii]] = True

    return connectivity, n_components
Example #4
0
def _fix_connectivity(X, connectivity, n_components=None,
                      affinity="euclidean"):
    """
    Fixes the connectivity matrix

        - copies it
        - makes it symmetric
        - converts it to LIL if necessary
        - completes it if necessary
    """
    n_samples = X.shape[0]
    if (connectivity.shape[0] != n_samples or
            connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.isspmatrix_lil(connectivity):
        if not sparse.isspmatrix(connectivity):
            connectivity = sparse.lil_matrix(connectivity)
        else:
            connectivity = connectivity.tolil()

    # Compute the number of nodes
    n_components, labels = connected_components(connectivity)

    if n_components > 1:
        warnings.warn("the number of connected components of the "
                      "connectivity matrix is %d > 1. Completing it to avoid "
                      "stopping the tree early." % n_components,
                      stacklevel=2)
        # XXX: Can we do without completing the matrix?
        for i in xrange(n_components):
            idx_i = np.where(labels == i)[0]
            Xi = X[idx_i]
            for j in xrange(i):
                idx_j = np.where(labels == j)[0]
                Xj = X[idx_j]
                D = pairwise_distances(Xi, Xj, metric=affinity)
                ii, jj = np.where(D == np.min(D))
                ii = ii[0]
                jj = jj[0]
                connectivity[idx_i[ii], idx_j[jj]] = True
                connectivity[idx_j[jj], idx_i[ii]] = True

    return connectivity, n_components
Example #5
0
def _graph_is_connected(graph):
    """ Return whether the graph is connected (True) or Not (False)

    Parameters
    ----------
    graph : array-like or sparse matrix, shape: (n_samples, n_samples)
        adjacency matrix of the graph, non-zero weight means an edge
        between the nodes

    Returns
    -------
    is_connected : bool
        True means the graph is fully connected and False means not
    """
    if sparse.isspmatrix(graph):
        # sparse graph, find all the connected components
        n_connected_components, _ = connected_components(graph)
        return n_connected_components == 1
    else:
        # dense graph, find all connected components start from node 0
        return _graph_connected_component(graph, 0).sum() == graph.shape[0]
Example #6
0
def _graph_is_connected(graph):
    """ Return whether the graph is connected (True) or Not (False)

    Parameters
    ----------
    graph : array-like or sparse matrix, shape: (n_samples, n_samples)
        adjacency matrix of the graph, non-zero weight means an edge
        between the nodes

    Returns
    -------
    is_connected : bool
        True means the graph is fully connected and False means not
    """
    if sparse.isspmatrix(graph):
        # sparse graph, find all the connected components
        n_connected_components, _ = connected_components(graph)
        return n_connected_components == 1
    else:
        # dense graph, find all connected components start from node 0
        return _graph_connected_component(graph, 0).sum() == graph.shape[0]
Example #7
0
    dist_matrix[i] = 0
    dist_matrix.flat[::N + 1] = 0
    return dist_matrix


dist_matrix = sparse.csr_matrix(generate_graph(20))

# auto
graph_sk = graph_shortest_path(dist_matrix, directed = False)
graph_sp = shortest_path(dist_matrix, directed = False)
assert_array_almost_equal(graph_sk, graph_sp)

# Floyd-Warshall
graph_sk = graph_shortest_path(dist_matrix, directed = False, method = 'FW')
graph_sp = shortest_path(dist_matrix, directed = False, method = 'FW')
assert_array_almost_equal(graph_sk, graph_sp)


# Dijkstra's
graph_sk = graph_shortest_path(dist_matrix, directed = False, method = 'D')
graph_sp = shortest_path(dist_matrix, directed = False, method = 'D')
assert_array_almost_equal(graph_sk, graph_sp)


from sklearn.utils.sparsetools import connected_components
from scipy.sparse.csgraph import connected_components as c_c

dist_matrix = sparse.csr_matrix(generate_graph(100))
(n_sk, labs_sk) = connected_components(dist_matrix)
(n_sp, labs_sp) = c_c(dist_matrix)
import numpy as np
from sklearn.utils.arpack import eigsh

app = service.prodbox.CinemaService()

X = app.getWeightedSearchFeatures(15)

graph = kneighbors_graph(X, 10)
lap = graph_laplacian(graph, True)

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components = 30, algorithm="arpack")
lap = spectral_embedding_._set_diag(lap, 1)
svd.fit(-lap)

eigenvalues = np.diag(svd.components_ * (-lap).todense() * svd.components_.T)

eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
print(eigenvalues)

print(eigenvalues2)

se = SpectralEmbedding(n_components = 30, eigen_solver='arpack', affinity="nearest_neighbors")
se.fit(X)

app.quit()

# TODO : check budget distribution, draw budget conditionnaly
out = connected_components(graph)

Example #9
0
from sklearn.utils.arpack import eigsh

app = service.prodbox.CinemaService()

X = app.getWeightedSearchFeatures(15)

graph = kneighbors_graph(X, 10)
lap = graph_laplacian(graph, True)

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=30, algorithm="arpack")
lap = spectral_embedding_._set_diag(lap, 1)
svd.fit(-lap)

eigenvalues = np.diag(svd.components_ * (-lap).todense() * svd.components_.T)

eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
print(eigenvalues)

print(eigenvalues2)

se = SpectralEmbedding(n_components=30,
                       eigen_solver='arpack',
                       affinity="nearest_neighbors")
se.fit(X)

app.quit()

# TODO : check budget distribution, draw budget conditionnaly
out = connected_components(graph)