Beispiel #1
0
def test_isomap_reconstruction_error():
    """Check Isomap's reported reconstruction error against a manual one.

    Same 5x5 grid as test_isomap_simple_grid, extended with a small random
    third coordinate; every other point is used as a neighbor.
    """
    side = 5
    n_points = side ** 2
    k = n_points - 1

    # Regular 2D grid, then noise appended as a third column.
    grid = np.array(list(product(range(side), repeat=2)))
    jitter = 0.1 * np.random.RandomState(0).randn(n_points, 1)
    X = np.concatenate((grid, jitter), 1)

    # Centered kernel derived from the input-space neighbor distances.
    dist_in = neighbors.kneighbors_graph(X, k, mode="distance").toarray()
    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * dist_in ** 2)

    for eigen_solver, path_method in product(eigen_solvers, path_methods):
        model = manifold.Isomap(
            n_neighbors=k,
            out_dim=2,
            eigen_solver=eigen_solver,
            path_method=path_method,
        )
        model.fit(X)

        # Same kernel construction, this time on the embedded points.
        dist_out = neighbors.kneighbors_graph(
            model.embedding_, k, mode="distance"
        ).toarray()
        K_iso = centerer.fit_transform(-0.5 * dist_out ** 2)

        # The manual error must match what the estimator reports.
        expected_error = np.linalg.norm(K - K_iso) / n_points
        assert_almost_equal(expected_error, model.reconstruction_error())
Beispiel #2
0
def test_kneighbors_graph():
    """Check kneighbors_graph output on three 2D points for k = 1, 2, 3."""
    points = np.array([[0, 1], [1.01, 1.], [2, 0]])

    def dense_graph(k, mode):
        # Dense view of the k-neighbor graph in the requested mode.
        return neighbors.kneighbors_graph(points, k, mode=mode).toarray()

    # k = 1
    one_conn = dense_graph(1, 'connectivity')
    assert_array_equal(one_conn, np.eye(one_conn.shape[0]))

    assert_array_almost_equal(
        dense_graph(1, 'distance'),
        [[0.00, 1.01, 0.],
         [1.01, 0., 0.],
         [0.00, 1.40716026, 0.]])

    # k = 2
    assert_array_equal(
        dense_graph(2, 'connectivity'),
        [[1., 1., 0.],
         [1., 1., 0.],
         [0., 1., 1.]])

    assert_array_almost_equal(
        dense_graph(2, 'distance'),
        [[0., 1.01, 2.23606798],
         [1.01, 0., 1.40716026],
         [2.23606798, 1.40716026, 0.]])

    # k = 3
    assert_array_almost_equal(
        dense_graph(3, 'connectivity'),
        [[1, 1, 1], [1, 1, 1], [1, 1, 1]])
Beispiel #3
0
def test_include_self_neighbors_graph():
    """Test the include_self parameter of the neighbors graph builders.

    Uses ``.toarray()`` rather than the ``.A`` shortcut, which is not
    available on every SciPy sparse container.
    """
    X = [[2, 3], [4, 5]]

    # k-nearest-neighbors graph with and without self-connections
    kng = neighbors.kneighbors_graph(X, 1, include_self=True).toarray()
    kng_not_self = neighbors.kneighbors_graph(
        X, 1, include_self=False).toarray()
    assert_array_equal(kng, [[1.0, 0.0], [0.0, 1.0]])
    assert_array_equal(kng_not_self, [[0.0, 1.0], [1.0, 0.0]])

    # radius-neighbors graph with and without self-connections
    rng = neighbors.radius_neighbors_graph(
        X, 5.0, include_self=True).toarray()
    rng_not_self = neighbors.radius_neighbors_graph(
        X, 5.0, include_self=False).toarray()
    assert_array_equal(rng, [[1.0, 1.0], [1.0, 1.0]])
    assert_array_equal(rng_not_self, [[0.0, 1.0], [1.0, 0.0]])
Beispiel #4
0
def test_kneighbors_graph_sparse(seed=36):
    """Check that kneighbors_graph gives the same graph for dense and
    CSR input."""
    dense = np.random.RandomState(seed).randn(10, 10)
    sparse_input = csr_matrix(dense)

    for k in [1, 2, 3]:
        for mode in ["connectivity", "distance"]:
            from_dense = neighbors.kneighbors_graph(dense, k, mode=mode)
            from_sparse = neighbors.kneighbors_graph(
                sparse_input, k, mode=mode)
            assert_array_almost_equal(
                from_dense.toarray(), from_sparse.toarray())
Beispiel #5
0
  def _fit_transform(self, X):
    """Fit the neighbor index on X and compute the embedding.

    Sets ``training_data_``, ``kernel_pca_``, ``dist_matrix_`` and
    ``embedding_`` on ``self``. The distance matrix is filled by running
    ``_shortest_path_mapper`` over the tiles of a helper array, then the
    kernel ``-0.5 * dist ** 2`` is passed to a precomputed-kernel KernelPCA.
    """
    self.nbrs_.fit(X)
    self.training_data_ = self.nbrs_._fit_X
    self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                  kernel="precomputed",
                                  eigen_solver=self.eigen_solver,
                                  tol=self.tol, max_iter=self.max_iter)

    kng = kneighbors_graph(self.nbrs_, self.n_neighbors, mode="distance")
    n_points = X.shape[0]
    n_workers = blob_ctx.get().num_workers

    # Floor division keeps the tile hint an integer on Python 3 as well;
    # on Python 2 it matches the original integer division.
    if n_points < n_workers:
      tile_hint = (1, )
    else:
      tile_hint = (n_points // n_workers, )

    # task_array is used for deciding the index of the starting points and
    # the index of the ending points that each tile needs to find the
    # shortest path among.
    task_array = expr.ndarray((n_points,), tile_hint=tile_hint)
    task_array = task_array.force()

    # dist_matrix holds the result; overlapping writes are summed.
    dist_matrix = expr.ndarray((n_points, n_points), reduce_fn=lambda a,b:a+b).force()
    task_array.foreach_tile(mapper_fn = _shortest_path_mapper,
                            kw = {'kng' : kng,
                                  'directed' : False,
                                  'dist_matrix' : dist_matrix})
    self.dist_matrix_ = dist_matrix.glom()
    G = self.dist_matrix_ ** 2
    G *= -0.5
    self.embedding_ = self.kernel_pca_.fit_transform(G)
Beispiel #6
0
    def _get_affinity_matrix(self, X, Y=None):
        """Calculate the affinity matrix from data
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples in the number of samples
            and n_features is the number of features.

            If affinity is "precomputed"
            X : array-like, shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        Returns
        -------
        affinity_matrix, shape (n_samples, n_samples)
        """
        if self.affinity == 'precomputed':
            self.affinity_matrix_ = X
            print( type(             self.affinity_matrix_))
            return self.affinity_matrix_
            
        # nearest_neigh kept for backward compatibility 
        if self.affinity == 'nearest_neighbors':
            if sparse.issparse(X):
                warnings.warn("Nearest neighbors affinity currently does "
                              "not support sparse input, falling back to "
                              "rbf affinity")
                self.affinity = "rbf"
            else:
                self.n_neighbors_ = (self.n_neighbors
                                     if self.n_neighbors is not None
                                     else max(int(X.shape[0] / 10), 1))
                self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_)
                # currently only symmetric affinity_matrix supported
                self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
                                               self.affinity_matrix_.T)
                return self.affinity_matrix_
        if self.affinity == 'radius_neighbors':
            if self.neighbors_radius is None:
                self.neighbors_radius_ =  np.sqrt(X.shape[1])
                # to put another defaault value, like diam(X)/sqrt(dimensions)/10
            else:
                self.neighbors_radius_ = self.neighbors_radius
                
            self.gamma_ = (self.gamma
                           if self.gamma is not None else 1.0 / X.shape[1])
            self.affinity_matrix_ = radius_neighbors_graph(X, self.neighbors_radius_, mode='distance')
            
            self.affinity_matrix_.data **= 2              
            self.affinity_matrix_.data /= -self.neighbors_radius_**2
            self.affinity_matrix_.data = np.exp( self.affinity_matrix_.data, self.affinity_matrix_.data )
            return self.affinity_matrix_
        if self.affinity == 'rbf':
            self.gamma_ = (self.gamma
                           if self.gamma is not None else 1.0 / X.shape[1])
            self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
            return self.affinity_matrix_
        self.affinity_matrix_ = self.affinity(X)
        return self.affinity_matrix_
 def knn_connectivity(self, X):
     """Plot agglomerative clusterings of X with and without a k-NN graph.

     One figure is drawn without connectivity and one with a 30-neighbor
     graph; each figure shows the 'average', 'complete' and 'ward'
     linkages side by side, titled with the fit time.
     """
     n_clusters = 4
     linkages = ('average', 'complete', 'ward')
     neighbor_graph = kneighbors_graph(X, 30, include_self=False)

     for connectivity in (None, neighbor_graph):
         plt.figure(figsize=(10, 4))
         for position, linkage in enumerate(linkages, 1):
             plt.subplot(1, 3, position)
             model = AgglomerativeClustering(linkage=linkage,
                                             connectivity=connectivity,
                                             n_clusters=n_clusters)
             # Time only the fit itself.
             started = time.time()
             model.fit(X)
             elapsed_time = time.time() - started

             plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                         cmap=plt.cm.spectral)
             plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                       fontdict=dict(verticalalignment='top'))
             plt.axis('equal')
             plt.axis('off')

             plt.subplots_adjust(bottom=0, top=.89, wspace=0,
                                 left=0, right=1)
             plt.suptitle('n_cluster=%i, connectivity=%r' %
                          (n_clusters, connectivity is not None), size=17)

     plt.show()
Beispiel #8
0
def call_spectral(num_cluster, mode_, data, update_flag):
    """Run spectral clustering on a symmetrized 10-nearest-neighbor graph.

    Parameters
    ----------
    num_cluster : int
        Number of clusters passed to SpectralClustering.
    mode_ : any
        Not used by this function.
    data : array-like
        Samples; standardized before the neighbor graph is built.
    update_flag : bool
        When true, return the raw label array only.

    Returns
    -------
    labels
        The fitted ``labels_`` array, when ``update_flag`` is true.
    (label_dict, unique_dict)
        Otherwise two dicts keyed by the stringified position: the label of
        every sample, and the distinct labels. Both are also printed.
    """
    X = StandardScaler().fit_transform(data)
    spectral = SpectralClustering(n_clusters=num_cluster,
                                  eigen_solver='arpack',
                                  affinity='precomputed')
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # symmetrize the graph before using it as a precomputed affinity
    connectivity = 0.5 * (connectivity + connectivity.T)
    spectral.fit(connectivity)
    labels = spectral.labels_

    if update_flag:
        return labels

    # print(x) with a single argument behaves the same on Python 2 and 3.
    label_dict = {str(i): float(label) for i, label in enumerate(labels)}
    print(label_dict)

    unique_dict = {}
    for i, uniq in enumerate(np.unique(labels)):
        print(uniq)
        unique_dict[str(i)] = float(uniq)
    print(unique_dict)

    return label_dict, unique_dict
def cluster_data(data,clustering_method,num_clusters):
    """Cluster ``data`` with the named scikit-learn algorithm.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    clustering_method : str
        One of 'KMeans', 'MeanShift', 'AffinityPropagation',
        'AgglomerativeClustering' or 'DBSCAN'. Any other value performs
        no clustering.
    num_clusters : int
        Number of clusters; only used by KMeans and
        AgglomerativeClustering.

    Returns
    -------
    (labels, cluster_centers, labels_unique, extra)
        Entries the chosen method does not provide are None. ``extra`` is
        the boolean core-sample mask for DBSCAN. All four are None for an
        unrecognized method.
    """
    cluster_centers = labels_unique = labels = extra = None
    if clustering_method == 'KMeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
        # NOTE(review): precompute_distances and n_jobs are not accepted by
        # recent scikit-learn releases - confirm the pinned version.
        k_means = KMeans(n_clusters=num_clusters,init='k-means++',n_init=10,max_iter=100,tol=0.0001,
                                precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1)
        k_means.fit(data)
        labels = k_means.labels_
        cluster_centers = k_means.cluster_centers_
    elif clustering_method == 'MeanShift':
        ms = MeanShift(bin_seeding=True, cluster_all=False)
        ms.fit(data)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
    elif clustering_method == 'AffinityPropagation':
        af = AffinityPropagation().fit(data)
        cluster_centers = [data[i] for i in af.cluster_centers_indices_]
        labels = af.labels_
    elif clustering_method == "AgglomerativeClustering":
        # Floor division: n_neighbors must stay an integer on Python 3
        # (identical to the original "/" on Python 2 integers).
        n_neighbors = min(10, len(data) // 2)
        connectivity = kneighbors_graph(data, n_neighbors=n_neighbors)
        ward = AgglomerativeClustering(n_clusters=num_clusters, connectivity=connectivity,
                               linkage='ward').fit(data)
        labels = ward.labels_
    elif clustering_method == "DBSCAN":
        db = DBSCAN().fit(data)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        extra = core_samples_mask
        labels = db.labels_

    if labels is not None:
        labels_unique = np.unique(labels)
    return labels,cluster_centers,labels_unique,extra
Beispiel #10
0
def agglom(data, n_clusters):
    """Ward agglomerative clustering constrained by a 30-neighbor graph.

    Returns the cluster label of every sample in ``data``. The model is
    fitted once via ``fit_predict`` (the earlier separate ``fit`` call was
    redundant).
    """
    knn_graph = kneighbors_graph(data, 30, include_self=False)

    # use ward / average / complete for different results
    cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                      connectivity=knn_graph,
                                      linkage='ward')
    return cluster.fit_predict(data)
Beispiel #11
0
def test_non_euclidean_kneighbors():
    """Check the graph helpers against NearestNeighbors for non-Euclidean
    metrics, and that mismatching metrics are rejected."""
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius. np.sort returns a sorted copy, so the
    # result has to be kept for the index below to be a rank.
    dist_array = np.sort(pairwise_distances(X).flatten())
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(
            n_neighbors=3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radiusneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph,
                           nbrs1.radius_neighbors_graph(X).toarray())

    # Raise error when wrong parameters are supplied,
    X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
Beispiel #12
0
	def agglomerative_clusters(self, word_vectors):
		"""Cluster word vectors into 100 groups with average linkage.

		A BallTree and a 1-nearest-neighbor connectivity graph are built
		first; the elapsed time of each stage is printed. Returns the
		fitted cluster labels.
		"""
		# Stage 1: spatial index over the vectors
		tick = time.time()
		tree = BallTree(word_vectors, leaf_size = 200, metric = "minkowski")
		print("BallTree object in " + str(time.time() - tick))

		# Stage 2: neighbor graph built from the index
		tick = time.time()
		graph_options = dict(
			n_neighbors = 1,
			mode = "connectivity",
			metric = "minkowski",
			p = 2,
			include_self = False,
			n_jobs = workers,
		)
		knn_graph = kneighbors_graph(tree, **graph_options)
		print("Pre-compute connectivity graph in " + str(time.time() - tick))

		# Stage 3: the clustering itself, constrained by the graph
		tick = time.time()
		model = AgglomerativeClustering(
			n_clusters = 100,
			affinity = "minkowski",
			connectivity = knn_graph,
			compute_full_tree = True,
			linkage = "average",
		)
		model.fit(word_vectors)
		print("Agglomerative clustering in " + str(time.time() - tick))

		return model.labels_
Beispiel #13
0
    def agglomerative(self, connect=True, linkage='ward'):
        """Run agglomerative clustering on ``self.X`` and evaluate it.

        Parameters
        ----------
        connect : bool
            When true, constrain merges with a 10-nearest-neighbor graph
            of ``self.X``.
        linkage : str
            'ward', 'average' or 'complete' for a single run, or 'all' to
            run the three in that order.

        Returns
        -------
        The result of ``self.evaluate`` on the tuple of label arrays
        (also stored in ``self.agglo``), or None after printing an error
        for an unrecognized ``linkage``.
        """
        # connectivity constraint
        if connect:
            knn_graph = kneighbors_graph(self.X, 10)
        else:
            knn_graph = None

        if linkage in ('ward', 'average', 'complete'):
            model = AgglomerativeClustering(linkage=linkage,
                                            n_clusters=self.n_clusters,
                                            connectivity=knn_graph)
            model.fit(self.X)
            self.agglo = (model.labels_,)

        elif linkage == 'all':
            label_list = []
            for link in ('ward', 'average', 'complete'):
                model = AgglomerativeClustering(linkage=link,
                                                n_clusters=self.n_clusters,
                                                connectivity=knn_graph)
                # Single-argument print() works on Python 2 and 3 alike.
                print(link)
                print(self.X.shape)
                model.fit(self.X)
                label_list.append(model.labels_)
            self.agglo = tuple(label_list)

        else:
            print("Error: Wrong linkage argument")
            return

        return self.evaluate(self.agglo)
Beispiel #14
0
    def _fit_process(self, X):
        """
        Computes the Laplacian score for the attributes and stores one
        score per column of X in ``self.scores_``.

        :param X: 2D array of samples (rows) by attributes (columns)
        :return: None
        """
        n_samples = X.shape[0]
        n_features = X.shape[1]
        self.scores_ = np.zeros(n_features)

        # Weight matrix: negated squared neighbor distance over the
        # bandwidth; entries without an edge stay at zero.
        # NOTE(review): no exp() is applied, so this is not a heat-kernel
        # similarity - confirm this is intended.
        dist = kneighbors_graph(X, n_neighbors=self._n_neighbors,
                                mode='distance').toarray()
        S = -(dist * dist / self._bandwidth)

        ones = np.ones(n_samples)
        D = np.diag(np.dot(S, ones))   # row sums on the diagonal
        L = D - S
        total = D.sum()

        for col in range(n_features):
            feature = X[:, col]
            # Subtract the D-weighted mean of the attribute.
            centered = feature - np.dot(np.dot(feature, D) / total, ones)

            numerator = np.dot(np.dot(centered, L), centered)
            denominator = np.dot(np.dot(centered, D), centered)
            self.scores_[col] = numerator / denominator
Beispiel #15
0
def _ward(X, k=2):
    """Ward clustering of X into k clusters on a 10-neighbor graph.

    Returns the integer cluster label of every sample.
    """
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # AgglomerativeClustering with ward linkage replaces the removed
    # cluster.Ward estimator.
    ward_five = cluster.AgglomerativeClustering(n_clusters=k, linkage='ward',
                                                connectivity=connectivity)
    ward_five.fit(X)
    # The numpy.int alias is gone from recent NumPy; builtin int is the
    # same type.
    y_pred = ward_five.labels_.astype(int)
    return y_pred
def clustering_tweets_hc(labeled_tweets, num_cluster):
    """Group tweets by Ward clustering and concatenate each group.

    The tweets are vectorized with StemmedTfidfVectorizer, clustered on a
    1-nearest-neighbor graph, and one string per cluster id is returned.
    Each string holds the cluster's tweets in reverse index order, every
    tweet followed by a single space.
    """
    tfidf = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = tfidf.fit_transform(labeled_tweets).toarray()

    from sklearn.neighbors import kneighbors_graph

    connectivity = kneighbors_graph(tweet_vec, 1, include_self=False)

    from sklearn.cluster import AgglomerativeClustering

    model = AgglomerativeClustering(linkage='ward',
                                    connectivity=connectivity,
                                    n_clusters=num_cluster)
    model.fit(tweet_vec)
    assignments = model.labels_

    clustered_tweets = []
    for cluster_id in range(0, num_cluster):
        members = (assignments == cluster_id).nonzero()[0]
        # Later tweets come first, matching a repeated prepend.
        joined = ''.join(labeled_tweets[sid] + ' ' for sid in members[::-1])
        clustered_tweets.append(joined)
    return clustered_tweets
Beispiel #17
0
def latent_cluster(SAMObject, n_clusters=10, X=None, plot=True,which_indices=(0,1)):
    """
    Use Agglomerative clustering to cluster the latent space by having a given number of clusters.
    ARG SAMObject: The SAMObject to operate on.
    ARG n_clusters: The number of clusters to find.
    ARG X: If None, we'll use the SAMObject's latent space, otherwise the provided one.
    ARG plot: Whether to plot the result or not.
    ARG which_indices: If plotting, which indices to plot.
    RETURN Y_: The cluster assignments for each component in the latent space.
    """
    from sklearn.cluster import AgglomerativeClustering

    if X is None:
        X = SAMObject._get_latent()

    # Define the structure A of the data. Here a 10 nearest neighbors
    from sklearn.neighbors import kneighbors_graph
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)

    # Compute clustering
    print("Compute structured hierarchical clustering...")
    ward = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity,
                                   linkage='ward', compute_full_tree=True).fit(X)
    Y_ = ward.labels_

    if plot:
        # At least 20 colors as before, but never fewer than the number of
        # clusters, so the stride below cannot become zero.
        n_colors = max(20, n_clusters)
        color_iter = cm.rainbow(np.linspace(0, 1, n_colors))

        #---- a silly way to get maximal separation in colors for the n_cluster first elements... move to separate function
        index_all = np.arange(n_colors)
        space = n_colors // n_clusters
        index_first = index_all[::space][:n_clusters]
        # dtype=int keeps the index array usable even when it is empty
        # (an empty float array would make the fancy indexing fail).
        index_rest = np.array(list(set(index_all)-set(index_first)), dtype=int)
        myperm = np.random.permutation(index_rest.shape[0])
        index_rest = index_rest[myperm]
        inds = np.hstack((index_first, index_rest))

        color_iter = color_iter[inds,:]

        marker_iter = itertools.cycle((',', '+', '.', 'o', '*','v','x','>'))
        pb.subplot(1, 1, 1)

        for i, (color,marker) in enumerate(zip(color_iter,marker_iter)):
            # Stop before plotting: labels only run from 0 to n_clusters-1,
            # so index n_clusters would just add an empty scatter.
            if i >= n_clusters:
                break
            pb.scatter(X[Y_ == i, which_indices[0]], X[Y_ == i, which_indices[1]], s=40, color=color,marker=marker)

        pb.legend(np.unique(Y_))
        pb.show()
        pb.draw()
        pb.show()
    return Y_
Beispiel #18
0
def makeWard(X, k=2):
    """Build (without fitting) a Ward clusterer structured by a symmetric
    10-nearest-neighbor graph of X."""
    knn = kneighbors_graph(X, n_neighbors=10)
    # Average the graph with its transpose so that it becomes symmetric.
    symmetric = 0.5 * (knn + knn.T)
    model = cluster.AgglomerativeClustering(
        n_clusters=k,
        linkage='ward',
        connectivity=symmetric,
    )
    return model
Beispiel #19
0
def makeMaxLinkage(X=None, k=2):
    """Build (without fitting) a complete-linkage, cityblock-affinity
    clusterer structured by a symmetric 10-nearest-neighbor graph of X."""
    knn = kneighbors_graph(X, n_neighbors=10)
    # Average the graph with its transpose so that it becomes symmetric.
    symmetric = 0.5 * (knn + knn.T)
    model = cluster.AgglomerativeClustering(
        linkage="complete",
        affinity="cityblock",
        n_clusters=k,
        connectivity=symmetric,
    )
    return model
def median_min_distance(data, metric):
    """Return the median nearest-neighbor distance of a data-set.

    A 1-nearest-neighbor distance graph (self excluded) is computed for
    the sample points of 'data' with the given 'metric', and the median
    of the stored distances is returned.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        The data-set, a fraction of whose sample points will be extracted
        by density sampling.

    metric : string
        The distance metric used to determine the nearest-neighbor to each
        data-point. The DistanceMetric class defined in scikit-learn's
        library lists all available metrics.

    Returns
    -------
    median_min_dist : float
        The median of the distribution of distances between
        nearest-neighbors, rounded to 4 decimals.
    """
    samples = np.atleast_2d(data)

    nn_graph = kneighbors_graph(samples, 1, mode='distance',
                                metric=metric, include_self=False)

    # The stored values of the sparse graph are the neighbor distances;
    # they may be reordered in place by the median.
    median_value = np.median(nn_graph.data, overwrite_input=True)
    return round(median_value, 4)
def cluster_documents(documents, num_clusters=10, num_terms=30, clust_alg='kmeans', verbose_docs=True):
    '''Cluster documents on their TF-IDF vectors.

    A document is a dict-like object with a 'tokens' entry holding a list
    of tokens. Documents is a list of these document objects.

    clust_alg is one of 'minibatch', 'kmeans' or 'agglomerative'; any
    other value raises ValueError. verbose_docs is accepted for
    compatibility; it does not change the output.

    Returns (by_cluster, clusters): by_cluster maps a cluster id to the
    documents assigned to it (each copied with an added 'cluster' key),
    and clusters lists, per cluster, its num_terms top-weighted terms.
    '''
    if clust_alg not in ('minibatch', 'kmeans', 'agglomerative'):
        raise ValueError("Unknown clust_alg: %r" % (clust_alg,))

    true_k = len(range(num_clusters))

    texts = [' '.join(doc['tokens']) for doc in documents]

    vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
        min_df=2, stop_words='english', use_idf=True)

    vector_space = vectorizer.fit_transform(texts)

    if clust_alg == 'minibatch':
        clusterer = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000)
    elif clust_alg == 'kmeans':
        clusterer = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    else:
        #Note this doesn't work atm. Or rather its output (and input) is shaped
        #differently from kmeans and minibatchkmeans
        connectivity = kneighbors_graph(vector_space.toarray(), n_neighbors=true_k)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward', connectivity=connectivity)

    clusterer.fit(vector_space)

    # Re-attach the cluster results to the original documents. Keys of the
    # document win over the added 'cluster' key, as before; update() also
    # works on Python 3, where dict item views cannot be concatenated.
    clustered_docs = []
    for label, doc in zip(clusterer.labels_, documents):
        merged = {'cluster': label.item()}
        merged.update(doc)
        clustered_docs.append(merged)

    by_cluster = defaultdict(list)
    for doc in clustered_docs:
        by_cluster[doc['cluster']].append(doc)
    by_cluster = dict(by_cluster)

    # Top terms in each cluster. Newer scikit-learn only offers
    # get_feature_names_out; fall back to the older accessor otherwise.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    terms = get_names()

    clusters = []
    order_centroids = clusterer.cluster_centers_.argsort()[:, ::-1]
    for i in range(true_k):
        cluster_terms = [terms[ind] for ind in order_centroids[i, :num_terms]]
        clusters.append({'cluster': i, 'terms': cluster_terms})

    return (by_cluster, clusters)
def example1():
    """Print the k-nearest-neighbor graph of six 1D samples.

    The k closest samples are treated as neighbors (k = 2 here). The
    sparse graph is printed first, then its dense form.
    """
    samples = np.array([1, 2, 4, 7, 9, 10]).reshape(-1, 1)
    knn = kneighbors_graph(samples, 2)  # k = 2
    for view in (knn, knn.toarray()):
        print(view)
Beispiel #23
0
def build_kneighbors_table(features, k_neighbors):
    """Return, for every sample, the column indices of its neighbors.

    A (k_neighbors + 1)-neighbor graph is requested and the first stored
    index of each row is dropped - presumably the sample itself; confirm
    against the include_self default of the installed scikit-learn.

    Returns a list with one list of neighbor indices per sample.
    """
    sparse_connections = kneighbors_graph(features, k_neighbors + 1)
    # range() instead of xrange() so the function also runs on Python 3.
    return [
        sparse_connections[ridx].nonzero()[1].tolist()[1:]
        for ridx in range(len(features))
    ]
Beispiel #24
0
    def test_kneighbors_graph(self):
        """ModelFrame's neighbors accessor must match
        neighbors.kneighbors_graph."""
        samples = [[0], [3], [1]]
        frame = pdml.ModelFrame(samples)

        actual = frame.neighbors.kneighbors_graph(2)
        reference = neighbors.kneighbors_graph(samples, 2)

        dense_actual = actual.toarray()
        dense_reference = reference.toarray()
        self.assert_numpy_array_almost_equal(dense_actual, dense_reference)
def hierarchical_clustering(corpus_fn, n_clusters=2, linkage='complete'):
    """Cluster a Matrix Market corpus file after a 100-component SVD.

    The corpus is loaded from ``corpus_fn``, converted to a transposed
    sparse matrix and reduced with TruncatedSVD; agglomerative clustering
    is then run on the reduced data with a 10-neighbor Euclidean graph.

    Returns the transposed sparse corpus and the cluster labels.
    """
    mm_corpus = corpora.MmCorpus(corpus_fn)
    corpus = matutils.corpus2csc(
        mm_corpus, num_terms=mm_corpus.num_terms).transpose()

    reduced = TruncatedSVD(n_components=100).fit_transform(corpus)

    knn_graph = kneighbors_graph(reduced, 10, metric='euclidean')
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        affinity='euclidean',
        linkage=linkage,
        connectivity=knn_graph,
    )
    model.fit(reduced)
    return corpus, model.labels_
def agglomerative_clustering(crime_rows, column_names, num_clusters):
    """Cluster crime rows on their first two fields and format the result.

    The first two entries of every row are clustered with a 2-neighbor
    connectivity graph; the remaining entries are passed along to
    ``_format_clustering`` together with the labels.
    """
    crime_xy = [row[:2] for row in crime_rows]
    crime_info = [row[2:] for row in crime_rows]

    print("Running Agglomerative Clustering")
    knn_graph = neighbors.kneighbors_graph(crime_xy, n_neighbors=2)
    model = AgglomerativeClustering(n_clusters=num_clusters,
                                    connectivity=knn_graph)
    labels = model.fit_predict(crime_xy)

    print("formatting....")
    return _format_clustering(labels, crime_xy, crime_info, column_names)
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None,
                         method='ward', verbose=False):
    """Cluster the data using Ward's algorithm

    Parameters
    ==========
    X: array of shape(n_voxels, n_subjects)
       the functional data, across subjects
    n_parcels: int, the desired number of parcels
    xyz: array of shape (n_voxels, 3), optional
         positions of the voxels in grid coordinates
    shape: tuple: the domain shape (assuming a grid structure), optional
          alternative specification of positions
    mask: arbitrary array of arbitrary dimension,optional
          alternative specification of positions; when given without
          `shape`, the grid shape is taken from the mask itself
    method: string, one of ['ward', 'spectral', 'kmeans'], optional
            clustering method
    verbose: not used by this function

    Returns
    =======
    label: array of shape(n_voxels): the resulting cluster assignment

    Raises
    ======
    ValueError: if none of mask, shape, xyz is given, or if method is
                not one of the supported names

    Note
    ====
    One of xyz, shape or mask needs to be provided
    """
    if mask is not None:
        # A mask alone is enough: fall back to its own shape instead of
        # unpacking a missing `shape`.
        grid_shape = shape if shape is not None else np.shape(mask)
        connectivity = grid_to_graph(*grid_shape, mask=mask)
    elif shape is not None:
        connectivity = grid_to_graph(*shape)
    elif xyz is not None:
        from sklearn.neighbors import kneighbors_graph
        n_neighbors = 2 * xyz.shape[1]
        connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors)
    else:
        raise ValueError('One of mask, shape or xyz has to be provided')

    if n_parcels == 1:
        return np.zeros(X.shape[0])
    if method == 'ward':
        # NOTE(review): Ward is not available in recent scikit-learn
        # releases - confirm the pinned version.
        connectivity = connectivity.tocsr()
        ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X)
        label = ward.labels_
    elif method == 'spectral':
        from sklearn.cluster import spectral_clustering
        i, j = connectivity.nonzero()
        # Squared feature distance along every edge, computed once.
        sq_dist = np.sum((X[i] - X[j]) ** 2, 1)
        sigma = sq_dist.mean()
        connectivity.data = np.exp(- sq_dist / (2 * sigma))
        label = spectral_clustering(connectivity, n_clusters=n_parcels)
    elif method == 'kmeans':
        from sklearn.cluster import k_means
        _, label, _ = k_means(X, n_parcels)
    else:
        raise ValueError('Unknown method for parcellation')
    return label
Beispiel #28
0
def clustering(X, algorithm, n_clusters):
    """Standardize X and cluster it with the named algorithm.

    Parameters
    ----------
    X : array-like
        Samples; standardized with StandardScaler before clustering.
    algorithm : str
        One of 'MiniBatchKMeans', 'Birch', 'DBSCAN', 'AffinityPropagation',
        'MeanShift', 'SpectralClustering', 'Ward' or
        'AgglomerativeClustering'.
    n_clusters : int
        Number of clusters, for the algorithms that take one.

    Returns
    -------
    (X, y_pred)
        The standardized data and the predicted label of every sample.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    def symmetric_knn(data):
        # connectivity matrix for structured clustering, made symmetric
        graph = kneighbors_graph(data, n_neighbors=10, include_self=False)
        return 0.5 * (graph + graph.T)

    # One factory per algorithm. Each receives the standardized data so the
    # bandwidth estimate and the neighbor graph are only computed for the
    # algorithms that actually use them.
    builders = {
        'MiniBatchKMeans': lambda data: cluster.MiniBatchKMeans(
            n_clusters=n_clusters),
        'Birch': lambda data: cluster.Birch(n_clusters=n_clusters),
        'DBSCAN': lambda data: cluster.DBSCAN(eps=.2),
        'AffinityPropagation': lambda data: cluster.AffinityPropagation(
            damping=.9, preference=-200),
        'MeanShift': lambda data: cluster.MeanShift(
            bandwidth=cluster.estimate_bandwidth(data, quantile=0.3),
            bin_seeding=True),
        'SpectralClustering': lambda data: cluster.SpectralClustering(
            n_clusters=n_clusters,
            eigen_solver='arpack',
            affinity="nearest_neighbors"),
        'Ward': lambda data: cluster.AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage='ward',
            connectivity=symmetric_knn(data)),
        'AgglomerativeClustering': lambda data: cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=n_clusters,
            connectivity=symmetric_knn(data)),
    }
    if algorithm not in builders:
        raise ValueError("Unknown clustering algorithm: %r" % (algorithm,))

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    model = builders[algorithm](X)
    model.fit(X)

    if hasattr(model, 'labels_'):
        # builtin int: the np.int alias is gone from recent NumPy
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    return X, y_pred
Beispiel #29
0
def agglomerative(num_clusters, similarity, dataset, header, text_sim=False):
    """Agglomerative clustering constrained by a 5-nearest-neighbor graph.

    The graph is built from ``similarity`` when ``text_sim`` is true and
    from ``dataset[header]`` otherwise. Returns the predicted labels.

    NOTE(review): the model is always fitted on ``similarity``, even when
    the graph comes from ``dataset[header]`` - confirm this is intended.
    """
    graph_source = similarity if text_sim else dataset[header]
    connectivity = kneighbors_graph(graph_source, 5)

    # An earlier, disabled variant built the connectivity from shared user
    # ids ("uid") instead of nearest neighbors.
    model = AgglomerativeClustering(
        n_clusters=num_clusters,
        connectivity=connectivity,
        compute_full_tree=True,
    )
    return model.fit_predict(similarity)
Beispiel #30
0
def test_isomap_simple_grid():
    """Isomap must reproduce the input distances on a regular 2-D grid.

    Every point uses all other points as neighbours, and the embedding has
    the same dimension as the input, so the neighbour-distance matrices of
    the input and of the embedding are expected to agree.
    """
    side = 5
    n_points = side ** 2
    n_neighbors = n_points - 1

    # Regular grid of points in 2-D; the embedding dimension equals n_dim.
    X = np.array(list(product(range(side), repeat=2)))

    # Distance from every point to all the others.
    expected = neighbors.kneighbors_graph(
        X, n_neighbors, mode="distance").toarray()

    for eigen_solver, path_method in product(eigen_solvers, path_methods):
        clf = manifold.Isomap(
            n_neighbors=n_neighbors,
            out_dim=2,
            eigen_solver=eigen_solver,
            path_method=path_method,
        )
        clf.fit(X)

        embedded = neighbors.kneighbors_graph(
            clf.embedding_, n_neighbors, mode="distance").toarray()
        assert_array_almost_equal(expected, embedded)
Beispiel #31
0
def Laplacian_matrix(data, knn, gamma):
    """Build weight, degree and (unnormalised) Laplacian matrices for ``data``.

    Exactly one construction is used, ``knn`` taking precedence:

    * ``knn != -1``: the weight matrix is the dense k-nearest-neighbour
      distance graph and the degree matrix is ``knn`` on the diagonal.
    * ``gamma != -1``: the weight matrix is the RBF kernel of ``data`` and
      the degree matrix holds the per-row count of non-zero kernel entries.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
    knn : int
        Number of neighbours, or ``-1`` to disable the k-NN construction.
    gamma : float
        RBF kernel coefficient, or ``-1`` to disable the RBF construction.

    Returns
    -------
    (A, D, L) : the weight matrix, degree matrix and ``L = D - A``.

    Raises
    ------
    ValueError
        If both ``knn`` and ``gamma`` are ``-1`` (previously this surfaced
        as an UnboundLocalError after partial output had been printed).
    """
    if knn == -1 and gamma == -1:
        raise ValueError(
            "Either knn or gamma must be set (both were -1)")

    print(data.shape)
    n_samples = data.shape[0]
    if knn != -1:
        A = kneighbors_graph(data, n_neighbors=knn, mode='distance').toarray()
        D = np.diagflat(np.full((1, n_samples), knn, dtype=np.float64))
        print("Weight Matrix created using KNN : ", knn)
    else:
        # RBF kernel for constructing the similarity matrix.
        A = sklearn.metrics.pairwise.rbf_kernel(data, gamma=gamma)
        D = np.diagflat(np.count_nonzero(A, axis=1))
        print("Weight Matrix created using gamma : ", gamma)
    print("Dimensions of Similarity Matrix:\t", A.shape)
    print(A)
    print("Dimensions of Degree Matrix:\t", D.shape)
    print(D)
    L = np.subtract(D, A)
    print("Dimensions of Laplacian Matrix:\t", L.shape)
    print(L)
    print("Laplacian Matrix created . . .")
    return A, D, L
Beispiel #32
0
    def fit(self, X):
        """Fit the clustering model.

        The input is converted to a float array, a k-nearest-neighbour
        distance graph is built from it, the minimum spanning tree of that
        graph is stored, and ``compute_clusters`` derives the labels.

        Parameters
        ----------
        X : array_like
            the data to be clustered: shape = [n_samples, n_features]

        Returns
        -------
        self
        """
        data = np.asarray(X, dtype=float)
        self.X_train_ = data

        # Sparse distance graph over the k nearest neighbours of each point.
        knn_distances = kneighbors_graph(
            data, n_neighbors=self.n_neighbors, mode='distance')

        # Minimum spanning tree of that graph (the graph may be overwritten).
        self.full_tree_ = minimum_spanning_tree(knn_distances, overwrite=True)

        # Derive component count, labels and the cluster graph.
        clusters = self.compute_clusters()
        self.n_components_, self.labels_, self.cluster_graph_ = clusters

        return self
def hierarchical_clustering(nb_clust, nb_feat, centroid, cluster_init,
                            dataCentroid):
    """Merge initial clusters with Ward clustering constrained by contiguity.

    Parameters
    ----------
    nb_clust : int
        Number of clusters requested from ``AgglomerativeClustering``.
    nb_feat : int
        Number of feature columns taken from each ``dataCentroid`` value
        (elements ``1 .. nb_feat`` inclusive).
    centroid : dict
        Maps a row index to a 2-element position; used to build the
        8-nearest-neighbour contiguity graph.
    cluster_init : iterable
        Initial cluster ids, used as indices into the fitted labels.
    dataCentroid : dict
        Maps a row index to a sequence whose slice ``[1:nb_feat + 1]``
        holds the features that are clustered.

    Returns
    -------
    list
        The new cluster label for every entry of ``cluster_init``.
    """
    # Preparation of the contiguity matrix
    X = np.zeros(shape=(len(centroid), 2))
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for key, value in centroid.items():
        X[key] = value
    knn_graph = kneighbors_graph(X, 8, include_self=False)
    linkage = 'ward'
    dataModel = np.zeros(shape=(len(centroid), nb_feat))
    for key, value in dataCentroid.items():
        dataModel[key] = value[1:(nb_feat + 1)]

    model = AgglomerativeClustering(linkage=linkage,
                                    connectivity=knn_graph,
                                    n_clusters=nb_clust)
    model.fit(dataModel)
    new_id_clust = []
    for row in cluster_init:
        clust = model.labels_[row]
        new_id_clust.append(clust)
        print(clust)
    return new_id_clust
def link_clustering(x, inverselengthscale, n_clusters, n_neighbors):
    """Complete-linkage agglomerative clustering constrained by a k-NN graph.

    ``n_neighbors`` is a fraction of ``len(x)``; it is converted to an
    absolute neighbour count before the graph is built. Two lines describing
    the configuration are appended to the module-level ``log`` list.
    ``inverselengthscale`` is accepted but not used.

    Returns the cluster labels of the fitted model.
    """
    linkage = 'complete'

    log.append('Linkage : {}'.format(linkage))
    log.append('n_clusters : {} , n_neighbors : {}'.format(
        n_clusters, n_neighbors))

    # Turn the neighbour fraction into an absolute count.
    neighbor_count = int(len(x) * n_neighbors)
    knn_graph = kneighbors_graph(x, neighbor_count, include_self=False)

    model = AgglomerativeClustering(linkage=linkage,
                                    n_clusters=n_clusters,
                                    connectivity=knn_graph)
    model.fit(x)
    return model.labels_
Beispiel #35
0
def generate_edges(X, mode='kneighbors_graph', n_neighbors=3, radius=0.1):
    """Build node features, edge features and an edge index from ``X``.

    The first three columns of ``X`` are used as coordinates for the
    neighbour graph and column 3 is used as the node feature.

    Parameters
    ----------
    X : 2-D array
        One row per vertex.
    mode : {'kneighbors_graph', 'radius_neighbors_graph'}
        How the adjacency (distance) matrix is built.
    n_neighbors : int
        Neighbour count for the k-NN mode; capped at ``len(X) - 1``.
    radius : float
        Radius for the radius-neighbours mode.

    Returns
    -------
    nodes_features : array of shape (n_vertices, 1)
        Column 3 of ``X``.
    edges_features : array
        For every edge, the row difference ``X[from] - X[to]`` followed by
        the edge weight (distance).
    edges : int array of shape (2, n_edges)
        Row 0 holds ``vertex_from`` and row 1 ``vertex_to``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    n_neighbors = min(n_neighbors, len(X) - 1)
    # Zero or one vertex: no neighbour can exist, return placeholders.
    if n_neighbors <= 0:
        return X[:, 3].reshape(-1, 1), np.zeros((1, 5)), np.zeros((2, 1))

    coordinates = X[:, :3]
    if mode == 'kneighbors_graph':
        graph = kneighbors_graph(X=coordinates,
                                 n_neighbors=n_neighbors, mode='distance')
    elif mode == 'radius_neighbors_graph':
        graph = radius_neighbors_graph(X=coordinates,
                                       radius=radius, mode='distance')
    else:
        # Raising a plain string is itself a TypeError on Python 3.
        raise ValueError('Unknown mode {}'.format(mode))
    adjacency_matrix = np.array(graph.todense())

    rows, cols = np.where(adjacency_matrix > 0)
    edges = np.vstack([rows, cols])
    weights = adjacency_matrix[rows, cols]

    nodes_features = X[:, 3].reshape(-1, 1)
    edges_features = X[rows] - X[cols]

    return nodes_features, np.c_[edges_features, weights], edges.astype(int)
Beispiel #36
0
    def GeoDesicMatrix(self, X):
        """Return the element-wise squared graph shortest-path distances of X.

        A ``NearestNeighbors`` index is fitted on ``X`` and stored in
        ``self.nbrs_``; the k-NN distance graph built from it is fed to
        ``graph_shortest_path`` (undirected), whose result is stored in
        ``self.dist_matrix_``.
        """
        # Settings shared by the neighbour index and the graph construction.
        shared = dict(metric=self.metric,
                      p=self.p,
                      metric_params=self.metric_params,
                      n_jobs=self.n_jobs)

        self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                      algorithm=self.neighbors_algorithm,
                                      **shared)
        self.nbrs_.fit(X)

        neighbor_graph = kneighbors_graph(self.nbrs_,
                                          self.n_neighbors,
                                          mode='distance',
                                          **shared)

        self.dist_matrix_ = graph_shortest_path(neighbor_graph,
                                                method=self.path_method,
                                                directed=False)
        return self.dist_matrix_**2
Beispiel #37
0
def diffusion_mapping(X,
                      n_components=2,
                      n_neighbors=5,
                      alpha=1.0,
                      t=1,
                      gamma=0.5,
                      metric='minkowski',
                      p=2,
                      metric_params=None,
                      n_jobs=1):
    """Embed ``X`` with a diffusion map built on a k-NN graph.

    Parameters
    ----------
    X : array-like
        Input passed to ``kneighbors_graph``.
    n_components : int
        Number of embedding coordinates returned.
    n_neighbors, metric, p, metric_params, n_jobs
        Forwarded to ``kneighbors_graph``.
    alpha : float
        Exponent of the degree normalisation applied on both sides.
    t : int or float
        Power applied to the eigenvalues.
    gamma : float
        Coefficient of the kernel ``exp(-gamma * d**2)``.

    Returns
    -------
    The real part of the transition matrix applied to all eigenvectors but
    the first, scaled by the matching eigenvalues raised to ``t``.
    """
    distances = kneighbors_graph(X,
                                 n_neighbors,
                                 mode='distance',
                                 metric=metric,
                                 metric_params=metric_params,
                                 p=p,
                                 n_jobs=n_jobs)

    # Kernel weights on the same sparsity pattern as the distance graph.
    weights = np.exp(-gamma * distances.data**2)
    K = sparse.csr_matrix((weights, distances.indices, distances.indptr))

    # Symmetrise: where an edge exists in both directions, count it once.
    mutual = (K != 0).multiply(K.T != 0)
    L = K + K.T - K.multiply(mutual)

    # Degree normalisation with exponent -alpha on both sides.
    degrees = np.asarray(L.sum(axis=0)).reshape(-1)
    D = sparse.diags(degrees)
    L_a = D.power(-alpha) @ L @ D.power(-alpha)

    # Row-normalise to obtain the transition matrix.
    row_sums = np.asarray(L_a.sum(axis=1)).reshape(-1)
    D_a = sparse.diags(row_sums)
    m = D_a.power(-1) @ L_a

    w, v = eigs(m, n_components + 1)

    # eigs returns complex numbers, but for Markov matrices, all eigenvalues
    # are real and in [0, 1].
    embedding = m.dot(v[:, 1:]) * (w[1:]**t)
    return embedding.real
Beispiel #38
0
    def _fit_transform(self, X):
        """Fit the neighbour index and compute the embedding of ``X``.

        Steps: fit ``self.nbrs_``, build the k-NN distance graph, fill a
        distributed all-pairs shortest-path matrix tile by tile, then run
        precomputed-kernel PCA on ``-0.5 * dist**2``. Results are stored in
        ``self.training_data_``, ``self.kernel_pca_``, ``self.dist_matrix_``
        and ``self.embedding_``; nothing is returned.
        """
        self.nbrs_.fit(X)
        self.training_data_ = self.nbrs_._fit_X
        self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                     kernel="precomputed",
                                     eigen_solver=self.eigen_solver,
                                     tol=self.tol,
                                     max_iter=self.max_iter)

        kng = kneighbors_graph(self.nbrs_, self.n_neighbors, mode="distance")
        n_points = X.shape[0]
        n_workers = blob_ctx.get().num_workers

        if n_points < n_workers:
            tile_hint = (1, )
        else:
            # Floor division keeps the tile size an integer on Python 3 too
            # (on Python 2 "/" between ints already floored).
            tile_hint = (n_points // n_workers, )

        # task_array is used for deciding the index of the starting points
        # and ending points that each tile needs to find the shortest path
        # among.
        task_array = expr.ndarray((n_points, ), tile_hint=tile_hint)
        task_array = task_array.evaluate()

        # dist_matrix is used to hold the result
        dist_matrix = expr.ndarray((n_points, n_points),
                                   reduce_fn=lambda a, b: a + b).evaluate()
        # The mapper's return value is not used here; the distances are
        # read back from dist_matrix below.
        task_array.foreach_tile(mapper_fn=_shortest_path_mapper,
                                kw={
                                    'kng': kng,
                                    'directed': False,
                                    'dist_matrix': dist_matrix
                                })
        self.dist_matrix_ = dist_matrix.glom()
        G = self.dist_matrix_**2
        G *= -0.5
        self.embedding_ = self.kernel_pca_.fit_transform(G)
Beispiel #39
0
    def get_RF_avgRList_byAggloCluster(self, cluster_ratio):
        """Cluster the regions of all trees with connectivity-constrained Ward.

        The region lists of every tree in ``self.trees`` are stacked, their
        centers are linked by a k-nearest-neighbour graph and, when
        ``cluster_ratio`` is a float, clustered into
        ``int(cluster_ratio * n_regions)`` groups. If that raises
        ``ValueError`` the fit is retried with one more cluster.

        NOTE(review): nothing is returned and a non-float ``cluster_ratio``
        is ignored; the snippet looks truncated.
        """
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.neighbors import kneighbors_graph

        trees = self.trees
        m, n = self.X_train.shape
        # Collect the region list of every tree.
        RF_RList = []
        for tree in trees:
            tree_RList = tree.tree.get_RList()
            tree_RMat = np.array(tree_RList)
            RF_RList.extend(tree_RMat)

        RF_R_Mat = np.array(RF_RList)    # last axis: col0=center, col1=radius
        RF_R_centers = RF_R_Mat[:, :, 0]
        RF_R_radius = RF_R_Mat[:, :, 1]

        # Total number of regions over all trees.
        avg_num_R = int(RF_R_Mat.shape[0])
        # Connectivity graph of the region centers: 1 for neighbours, else 0.
        connect_graph = kneighbors_graph(RF_R_centers,
                                         n_neighbors=int(0.7 * len(trees)),
                                         include_self=False)

        if isinstance(cluster_ratio, float):
            n_clusters = int(cluster_ratio * avg_num_R)
            try:
                R_cluster = AgglomerativeClustering(
                    n_clusters=n_clusters,
                    connectivity=connect_graph,
                    linkage='ward').fit(RF_R_centers)
            except ValueError as e:
                # Single-argument print call: same output on Python 2 and 3.
                print('ValueError  {}'.format(e))
                R_cluster = AgglomerativeClustering(
                    n_clusters=n_clusters + 1,
                    connectivity=connect_graph,
                    linkage='ward').fit(RF_R_centers)
Beispiel #40
0
def test_non_euclidean_kneighbors():
    """Graph helpers must agree with NearestNeighbors for non-Euclidean metrics.

    Also checks that passing a fitted estimator together with a conflicting
    ``metric`` raises ``ValueError``.
    """
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius. np.sort returns a sorted copy, so the result
    # has to be kept; otherwise the radius is picked from unsorted distances.
    dist_array = np.sort(pairwise_distances(X).flatten())
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(X, 3, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(n_neighbors=3,
                                           metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radius_neighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(X, radius,
                                                      metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph,
                           nbrs1.radius_neighbors_graph(X).toarray())

    # Raise error when wrong parameters are supplied,
    X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError,
                  neighbors.kneighbors_graph,
                  X_nbrs,
                  3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError,
                  neighbors.radius_neighbors_graph,
                  X_nbrs,
                  radius,
                  metric='euclidean')
Beispiel #41
0
def compute_propagation(order,idx_train,labels,emb,exp):
    """Propagate ``labels`` over a blend of two graph Laplacians.

    A k-NN connectivity graph is built from the module-level
    ``gene_feature``, with k chosen by a 10-fold grid search over 1..11.
    Its Laplacian, weighted by ``emb``, is added to the Laplacian of the
    module-level ``get_network``, weighted by ``exp``. The function then
    solves ``(I + lamb * L) f = labels`` with ``lamb = 1.0``.

    ``order`` and ``idx_train`` are accepted but not used here.

    Returns the solution vector ``f``.
    """
    # Pick the neighbour count k by cross-validated accuracy.
    k_range = range(1,12)
    param_grid = dict(n_neighbors =k_range)
    knn = KNeighborsClassifier()
    grid = GridSearchCV(knn,param_grid, cv = 10, scoring = "accuracy")
    grid.fit(gene_feature,labels)
    GF = kneighbors_graph(gene_feature,grid.best_params_['n_neighbors'], mode='connectivity',include_self=False)
    G = nx.from_numpy_matrix(GF.A)
    nds = range(G.number_of_nodes())
    Laplacian_matrtix = nx.laplacian_matrix(G, nodelist= nds, weight='weight')
    L_exp = nx.laplacian_matrix(get_network, nodelist = nds, weight='weight')

    # Harmonic part. The former "y = labels.copy" bound the method without
    # calling it and was never read, so it has been removed.
    I = identity(G.number_of_nodes())
    lamb = 1.0
    Laplacian_matrtix = np.add(Laplacian_matrtix*emb, L_exp*exp)
    fu = spsolve((I + Laplacian_matrtix*lamb), labels)

    return fu
Beispiel #42
0
def _affinity_propagation(feature, ground_truth, config):
    """Grid-search AffinityPropagation settings by adjusted Rand index.

    Parameters
    ----------
    feature : array-like
        Samples to cluster.
    ground_truth : array-like
        Reference labels used to score each candidate.
    config : dict
        Reads ``'affinity'``, ``'preference'``, ``'damping_factor'`` and
        ``'n_neighbors'`` (each a sequence of candidates).

    Returns
    -------
    dict
        Best ``preference``, ``damping_factor``, ``affinity`` and
        ``n_neighbors`` together with the achieved ``ari``.
    """
    ref_sc = -1
    optimal_preference = 0
    optimal_damping_factor = -1
    optimal_affinity = 'euclidean'
    optimal_n_neighbors = config['n_neighbors'][0]

    if 'euclidean' in config['affinity']:
        for p in config['preference']:
            for d in config['damping_factor']:
                af = cluster.AffinityPropagation(preference=p, damping=d).fit(feature)
                ars_af = metrics.adjusted_rand_score(ground_truth, af.labels_)
                if ars_af > ref_sc:
                    ref_sc = ars_af
                    optimal_preference = p
                    optimal_damping_factor = d

    if 'precomputed' in config['affinity']:
        for n_neighbors in config['n_neighbors']:
            # The affinity matrix depends only on n_neighbors, so build it
            # once per value: a symmetrised k-NN connectivity matrix.
            connectivity = kneighbors_graph(feature, n_neighbors=n_neighbors,
                                            include_self=True)
            affinity_matrix = 0.5 * (connectivity + connectivity.T)
            affinity_matrix = np.asarray(affinity_matrix.todense(), dtype=float)
            for p in config['preference']:
                for d in config['damping_factor']:
                    # The candidate preference is now passed on; before, the
                    # loop over p had no effect on the fitted model.
                    af = cluster.AffinityPropagation(
                        preference=p, damping=d,
                        affinity='precomputed').fit(affinity_matrix)
                    ars_af = metrics.adjusted_rand_score(ground_truth, af.labels_)
                    if ars_af > ref_sc:
                        ref_sc = ars_af
                        optimal_preference = p
                        optimal_damping_factor = d
                        optimal_affinity = 'precomputed'
                        optimal_n_neighbors = n_neighbors

    logging.info('ari %.3f'% ref_sc)
    return {'preference': optimal_preference, 'damping_factor': optimal_damping_factor, 'ari': ref_sc,
        'affinity': optimal_affinity, 'n_neighbors': optimal_n_neighbors
        }
Beispiel #43
0
def buildAdjacencyGraph3(matrix, top_k):
    """Build an undirected graph from the cosine k-NN graph of ``matrix``.

    Parameters
    ----------
    matrix : array-like
        One row per node; rows are compared with the cosine metric.
    top_k : int
        Number of nearest neighbours kept per node.

    Returns
    -------
    networkx graph
        Undirected graph whose edge weights are cosine similarities
        (``1 - cosine distance``) of the retained neighbour pairs.
    """
    n_jobs = multiprocessing.cpu_count()
    nn = NearestNeighbors(n_neighbors=top_k,
                          metric='cosine',
                          n_jobs=n_jobs)
    nn.fit(matrix)
    adjMatrix = kneighbors_graph(nn,
                                 top_k,
                                 mode='distance',
                                 metric='cosine',
                                 n_jobs=n_jobs).toarray()
    # Convert the stored distances to similarities; non-edges stay 0.
    # numpy.where returns a new array, so the result must be kept --
    # previously it was discarded and the graph kept raw distances.
    adjMatrix = numpy.where(adjMatrix > 0, 1 - adjMatrix, 0)
    graph = nx.convert_matrix.from_numpy_matrix(
        adjMatrix, parallel_edges=False,
        create_using=nx.DiGraph()).to_undirected()
    return graph
Beispiel #44
0
    def customNcuts(self):
        """Return segmentation labels from spectral clustering on a k-NN graph.

        The k-NN distance graph of ``self.values`` (``self.k`` neighbours,
        self excluded) is re-weighted: every existing edge ``(i, j)`` is
        multiplied by the larger of column 7 of rows ``i`` and ``j``. The
        resulting matrix is used as a precomputed affinity for a 4-cluster
        ``SpectralClustering``.
        """
        # Neighbour graph as a dense distance matrix.
        A = kneighbors_graph(self.values,
                             self.k,
                             mode='distance',
                             include_self=False).toarray()

        # Only existing edges are re-weighted. The former per-edge angle
        # computation between columns 3..5 was never used and is dropped.
        for i, j in zip(*np.nonzero(A > 0)):
            A[i][j] = max(self.values[i][7], self.values[j][7]) * A[i][j]

        # init SpectralClustering
        sc = SpectralClustering(4,
                                affinity='precomputed',
                                n_init=10,
                                assign_labels='discretize')

        # cluster
        labels = sc.fit_predict(A)

        return labels
Beispiel #45
0
    def iterative_nearest_neighbor(self, cluster_labels):
        """Relabel points whose 4-NN neighbourhood mixes several clusters.

        Works on a copy of ``cluster_labels``. Rows whose neighbours carry
        more than one distinct label are processed one at a time: the row's
        neighbours are all assigned the most frequent label found among the
        neighbours (and among the neighbours of those neighbours that are
        still pending).

        Returns
        -------
        (labels, sizes) : the updated labels and a ``value_counts`` Series
        with the size of each label.
        """

        labels = cluster_labels.copy()
        # Dense neighbour matrix over self.model_data (4 neighbours, self
        # excluded).
        connectivity = kneighbors_graph(self.model_data,
                                        n_neighbors=4,
                                        include_self=False).toarray()
        conn_df = pd.DataFrame(connectivity)
        # Labels are shifted by +1 so that 0 can stand for "not a neighbour".
        # NOTE(review): assumes labels is a 1-D array aligned with the
        # columns of conn_df -- confirm against callers.
        df = conn_df * (labels + 1)

        # Rows with more than two distinct values, i.e. 0 plus at least two
        # different neighbour labels.
        a = df.apply(lambda row: row.nunique() > 2, axis=1)
        k_neighs = a[a].index.tolist()

        while len(k_neighs) > 0:
            # Take the next pending row off the front of the work list.
            x = k_neighs[0]
            k_neighs.pop(0)

            # Shifted labels of the neighbours of x.
            cls_neighs = df.loc[x][df.loc[x] > 0]
            # Labels seen around those neighbours of x that are themselves
            # still pending.
            cross_region = [
                df.loc[r][df.loc[r] > 0] - 1 for r in cls_neighs.index
                if r in k_neighs
            ]
            cls_region = [item for elem in cross_region for item in elem]
            cls_region.extend(cls_neighs.values - 1)
            # Most frequent label in the collected region.
            densed = pd.Series(cls_region).value_counts().index[0]

            #print(labels[cls_neighs.index] , int(densed))
            labels[cls_neighs.index] = int(densed)
            #print(int(densed), labels[region])
            #labels[region] = int(densed)
            # Refresh the labelled neighbour table with the new labels.
            df = conn_df * (labels + 1)

            # Neighbours that were just relabelled leave the work list.
            k_neighs = list(set(k_neighs) - set(cls_neighs.index))

        sizes = pd.Series(labels).value_counts()

        return (labels, sizes)
Beispiel #46
0
def build_edges(Points, K=16):
    '''
    Turn batched point coordinates into edge lists and per-edge features.

        input:  Points: 3D tensor (batch_size, num_points, dim)
                K: number of nearest neighbours (each point counts itself)
        output: Edgelist: 3D tensor (batch_size, num_edges, 2) of (i, j)
                Edge_info: 3D tensor (batch_size, num_edges, 2 * dim); the
                coordinates of point j followed by those of point i
    '''
    # Unpacking also enforces that Points is three-dimensional.
    batch_size, num_points, dim = Points.shape

    from sklearn.neighbors import kneighbors_graph

    edge_lists = []
    edge_features = []
    for sample_idx in range(batch_size):
        sample = Points[sample_idx, :, :]
        adjacency = kneighbors_graph(
            sample, K, mode='connectivity', include_self=True)
        # One (i, j) row per stored entry of the adjacency matrix.
        pairs = np.stack(np.nonzero(adjacency), axis=1)
        targets = sample[pairs[:, 1], :]
        sources = sample[pairs[:, 0], :]
        edge_lists.append(pairs)
        edge_features.append(np.concatenate([targets, sources], axis=1))

    return np.stack(edge_lists), np.stack(edge_features)
def doiteration(new_data, dataframe):
    """Cluster ``new_data`` three ways and print each average similarity.

    K-means (6 clusters), spectral clustering (default settings) and
    connectivity-constrained Ward clustering (8 clusters, 10-NN graph) are
    fitted on ``new_data``. For each labelling the cosine-similarity helpers
    of ``tsc`` are run against ``dataframe`` and the average per-cluster
    similarity is printed. Nothing is returned.
    """
    # K-means
    kmeans = KMeans(n_clusters=6, random_state=0).fit(new_data)
    labels_kmeans = kmeans.labels_
    set_lk = set(labels_kmeans)
    # Spectral
    spectral = SpectralClustering()
    spectral.fit(new_data)
    spectral_labels = spectral.labels_
    set_ls = set(spectral_labels)
    # Hierarchical
    connectivity = kneighbors_graph(new_data, n_neighbors=10, include_self=False)
    ward = AgglomerativeClustering(n_clusters=8, connectivity=connectivity,
                                   linkage='ward').fit(new_data)
    h_labels = ward.labels_
    set_lh = set(h_labels)

    colNames = list(dataframe.columns.values)

    # Single-argument print calls behave the same on Python 2 and 3.
    labels_dict_kc = tsc.getLabelsDict(set_lk, labels_kmeans)
    print("\nkmeans\n")
    userdata = tsc.cosine_computations(labels_dict_kc, set_lk, labels_kmeans, colNames, dataframe)
    print(tsc.average_sim_cluster(userdata))

    print("\nSpectral\n")
    labels_dict_sc = tsc.getLabelsDict(set_ls, spectral_labels)
    userdata = tsc.cosine_computations(labels_dict_sc, set_ls, spectral_labels, colNames, dataframe)
    print(tsc.average_sim_cluster(userdata))

    print("\nHeirarchial\n")
    labels_dict_hc = tsc.getLabelsDict(set_lh, h_labels)
    userdata = tsc.cosine_computations(labels_dict_hc, set_lh, h_labels, colNames, dataframe)
    print(tsc.average_sim_cluster(userdata))
Beispiel #48
0
    def create_agglomerative_models(self,
                                    n_cluster_list,
                                    linkage_methods=None):
        """
            Fit one agglomerative model per combination of cluster count,
            connectivity setting (none, or a k-NN graph linking every
            sample to all others) and linkage method. Each fitted model is
            stored in the cluster-model dictionary under a descriptive key.

            n_cluster_list: a single int or a list of 'n_clusters' values.
            linkage_methods: linkage names; defaults to ward, complete,
                average and single when empty or None.
        """
        # Accept a bare integer as a one-element list.
        if isinstance(n_cluster_list, int):
            n_cluster_list = [n_cluster_list]

        if not linkage_methods:
            linkage_methods = ["ward", "complete", "average", "single"]

        data = self.__scaled
        knn_graph = kneighbors_graph(data, len(data) - 1, include_self=False)

        key_template = ("AgglomerativeClustering_{0}_"
                        "cluster{1}_Connectivity{2}")
        done_template = ("Successfully generate Agglomerative model with "
                         "linkage {0} on n_clusters={1}")

        for n_clusters in n_cluster_list:
            for connectivity in (None, knn_graph):
                uses_graph = connectivity is not None
                for linkage in linkage_methods:
                    model = AgglomerativeClustering(linkage=linkage,
                                                    connectivity=connectivity,
                                                    n_clusters=n_clusters)
                    model.fit(self.__scaled)

                    key = key_template.format(linkage, n_clusters, uses_graph)
                    self.__all_cluster_models[key] = model

                    print(done_template.format(linkage, n_clusters))
Beispiel #49
0
def configuraciones_agglomerative(subset):
    """Return four unfitted Ward clustering configurations for ``subset``.

    For 10 and for 20 clusters, one plain Ward model and one constrained by
    a symmetrised 10-nearest-neighbour graph of the L2-normalised data.

    Returns
    -------
    tuple of (name, estimator) pairs, in the order
    ``Ward-10``, ``Ward-10-con``, ``Ward-20``, ``Ward-20-con``.
    """
    normalized_set = preprocessing.normalize(subset, norm='l2')

    # Connectivity matrix for structured Ward, made symmetric.
    knn = kneighbors_graph(normalized_set,
                           n_neighbors=10,
                           include_self=False)
    connectivity = 0.5 * (knn + knn.T)

    # Collect the configurations in a list.
    clustering_algorithms = []
    for n_clusters in (10, 20):
        plain = cl.AgglomerativeClustering(n_clusters=n_clusters,
                                           linkage='ward')
        structured = cl.AgglomerativeClustering(n_clusters=n_clusters,
                                                linkage='ward',
                                                connectivity=connectivity)
        clustering_algorithms.append(('Ward-%d' % n_clusters, plain))
        clustering_algorithms.append(('Ward-%d-con' % n_clusters, structured))

    return tuple(clustering_algorithms)
Beispiel #50
0
def snn(X, neighbor_num, min_shared_neighbor_num):
    """Cluster ``X`` with Shared Nearest Neighbor (SNN) clustering.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        A feature array; ``len(X)`` must give the number of samples.
    neighbor_num : int
        Number K of nearest neighbors considered per sample.
    min_shared_neighbor_num : int
        Passed to DBSCAN as ``min_samples``.

    Returns
    -------
    (core_sample_indices, labels) as produced by the fitted DBSCAN model.
    """
    # K nearest neighbors of every sample, as a set of column indices.
    knn_graph = kneighbors_graph(X, n_neighbors=neighbor_num, include_self=False)
    neighbor_sets = []
    for row in range(len(X)):
        neighbor_sets.append(set(knn_graph[row].nonzero()[1]))
    neighbors = np.array(neighbor_sets)

    # Pairwise SNN distance between every two neighbor sets.
    rows = []
    for left in neighbors:
        rows.append([get_snn_distance(left, right) for right in neighbors])
    snn_distance_matrix = np.asarray(rows)

    # DBSCAN on the precomputed shared-neighbor distances.
    dbscan = DBSCAN(min_samples=min_shared_neighbor_num, metric="precomputed")
    dbscan = dbscan.fit(snn_distance_matrix)
    return dbscan.core_sample_indices_, dbscan.labels_
Beispiel #51
0
def calc_graph(X, k, sigma):
    """Build a k-nearest-neighbour graph on X weighted by a Gaussian kernel.

    Parameters
    ----------
    X     - array - T timepoints along the first axis; any remaining axes
                    are flattened into one feature axis
    k     - int   - number of nearest neighbours (0: complete graph)
    sigma - float - standard deviation of the Gaussian kernel
                    (0: unweighted)

    Returns
    -------
    TxT adjacency matrix of the weighted graph

    Notes
    -----
    k and sigma must not both be 0; that combination is rejected by an
    assertion, as is a non-integer k."""
    assert isinstance(k, int), 'k must be an integer'
    T = X.shape[0]
    X = X.reshape(T, np.prod(X.shape[1:]))
    assert not (k == 0 and sigma == 0), "Can't have k and sigma both equal to 0 - thats a complete unweighted graph"

    # Graph structure: symmetrised k-NN connectivity, or 1 for "all edges".
    if k != 0:
        knn = kneighbors_graph(X, k, include_self=False)
        G = 0.5 * (knn + knn.T).toarray()
    else:
        G = 1.

    # Edge weights: Gaussian of the pairwise distance, zero on the diagonal.
    if sigma != 0:
        dist_G = squareform(pdist(X))
        W = np.exp(-(dist_G**2) / (2 * sigma * sigma)) - np.identity(T)
    else:
        W = 1.

    return G * W
    def __init__(self, data, regex, embedding):
        """Prepare the payload handed to the parent constructor.

        ``data`` (missing values replaced by '') becomes a list of records,
        ``regex`` becomes a dict, ``embedding`` is projected with UMAP, and
        the edges of the 1-nearest-neighbour graph of that projection are
        listed as source/target index pairs.
        """
        records = data.fillna('').to_dict(orient="records")
        regex_dict = regex.to_dict()

        xy = UMAP().fit_transform(embedding)

        # Edges of the 1-nearest-neighbour graph of the projected points.
        adjacency = kneighbors_graph(xy, n_neighbors=1)
        graph = nx.from_scipy_sparse_matrix(adjacency)

        coords = []
        for x, y in xy.tolist():
            coords.append({'x': x, 'y': y})

        nearest = []
        for source, target in graph.edges():
            nearest.append({'source': int(source), 'target': int(target)})

        super().__init__(data=records,
                         regex=regex_dict,
                         coords=coords,
                         nearestNeighbors=nearest)
Beispiel #53
0
def clusterer_sklearn_ward(X, n_clusters):
    """Cluster ``X`` with connectivity-constrained Ward linkage.

    The neighbour count and the number of clusters are both read from the
    module-level ``params`` dict (keys ``'n_neighbors'`` and
    ``'n_clusters'``).

    NOTE(review): the ``n_clusters`` argument is accepted but not used; the
    value in ``params`` wins. Confirm whether that is intended.

    Returns
    -------
    tuple
        ``(X, labels)`` where ``labels`` is an integer array with one
        cluster label per sample.
    """
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph
    print('clusterer_sklearn_ward')

    connectivity = kneighbors_graph(X,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward = AgglomerativeClustering(n_clusters=params['n_clusters'],
                                   linkage='ward',
                                   connectivity=connectivity).fit(X)
    # np.int was only an alias of the builtin int and no longer exists in
    # NumPy >= 1.24; the builtin gives the same dtype.
    clusterAlgLabelAssignmentsSW = ward.labels_.astype(int)

    XY = (X, clusterAlgLabelAssignmentsSW)
    return (XY)
def compute_propagation(order,idx_train,labels,emb,exp):
    """Harmonic label propagation from labelled to unlabelled nodes.

    A k-NN connectivity graph is built from the module-level
    ``gene_feature``, with k chosen by a 10-fold grid search over 1..11.
    Its Laplacian, weighted by ``emb``, is added to the Laplacian of the
    module-level ``get_network``, weighted by ``exp``; both use the node
    order ``order``. The first ``len(idx_train)`` rows and columns are
    treated as the labelled block.

    Returns
    -------
    ``-pinv(L_uu) . L_ul . labels[idx_train]``, the propagated values for
    the remaining nodes.
    """
    # Pick the neighbour count k by cross-validated accuracy.
    k_range = range(1,12)
    param_grid = dict(n_neighbors = k_range)
    knn = KNeighborsClassifier()
    grid = GridSearchCV(knn,param_grid, cv = 10, scoring = "accuracy")
    grid.fit(gene_feature,labels)
    GF = kneighbors_graph(gene_feature,grid.best_params_['n_neighbors'], mode='connectivity',include_self=False)
    G = nx.from_numpy_matrix(GF.A)

    Laplacian_matrtix = nx.laplacian_matrix(G, nodelist=order, weight='weight')
    L_exp = nx.laplacian_matrix(get_network,nodelist = order, weight='weight')
    Laplacian_matrtix = np.add(Laplacian_matrtix * emb, L_exp * exp)

    # Split into labelled (first l rows) and unlabelled parts. Only the two
    # blocks with unlabelled rows are needed for the solution.
    l = len(idx_train)
    r = Laplacian_matrtix.shape[0]
    Lul = Laplacian_matrtix[l:r,0:l]
    Luu = Laplacian_matrtix[l:r,l:r]
    yl = labels[idx_train]
    fu = -linalg.pinv(Luu.A).dot(Lul.A).dot(yl)
    return fu
Beispiel #55
0
    def generate_adjacency_matrix(feature_vectors, mode='knn'):
        """Build one binary adjacency matrix per batch element.

        feature_vectors is indexed as (batch, node, ...). With mode 'cov'
        the covariance between nodes is thresholded at 0.5; with any other
        mode each node is linked to its single nearest neighbour and the
        result is symmetrised.

        Returns a torch tensor of shape (batch, nodes, nodes).
        """
        n_batches = feature_vectors.shape[0]
        n_nodes = feature_vectors.shape[1]
        covariances = torch.zeros([n_batches, n_nodes, n_nodes])

        if mode != 'cov':
            for batch in range(n_batches):
                nearest = kneighbors_graph(feature_vectors[batch],
                                           n_neighbors=1).toarray()
                # Symmetrise and clamp back to {0, 1}.
                symmetric = np.clip(nearest + nearest.T, a_min=0, a_max=1)
                covariances[batch] = torch.tensor(symmetric)
            return covariances

        for batch in range(n_batches):
            covariances[batch] = torch.tensor(np.cov(feature_vectors[batch]))

        # Binarise: covariance of at least 0.5 becomes an edge.
        covariances[covariances >= 0.5] = 1.
        covariances[covariances < 0.5] = 0.
        return covariances
    def _build_graph(self):
        """Compute the graph Laplacian.

        The adjacency matrix ``self.A_`` is built according to
        ``self.sparsify``:

        * ``'epsilonNN'``: radius-neighbours graph with ``self.radius``;
        * ``'kNN'``: union of the k-NN graph and its transpose;
        * ``'MkNN'``: element-wise product of the k-NN graph and its
          transpose (mutual neighbours only).

        When ``self.reweight == 'rbf'`` the entries are multiplied by an RBF
        kernel with ``gamma=self.t``.

        NOTE(review): any other ``sparsify`` value leaves ``self.A_`` as it
        was before the call (or unset).

        Returns
        -------
        The Laplacian of ``self.A_`` (normalised if ``self.normed``).
        """
        # Graph sparsification
        if self.sparsify == 'epsilonNN':
            self.A_ = radius_neighbors_graph(self.X_, self.radius,
                                             include_self=False)
        else:
            # The np.bool alias no longer exists in NumPy >= 1.24; the
            # builtin bool gives the same dtype.
            Q = kneighbors_graph(
                self.X_,
                self.n_neighbors,
                include_self=False
            ).astype(bool)

            if self.sparsify == 'kNN':
                self.A_ = (Q + Q.T).astype(np.float64)
            elif self.sparsify == 'MkNN':
                self.A_ = (Q.multiply(Q.T)).astype(np.float64)

        # Edge re-weighting
        if self.reweight == 'rbf':
            W = rbf_kernel(self.X_, gamma=self.t)
            self.A_ = self.A_.multiply(W)

        return sp.csgraph.laplacian(self.A_, normed=self.normed)
Beispiel #57
0
def clusterer_sklearn_agglomerative(X, n_clusters):
    """Cluster ``X`` with kNN-constrained average-linkage agglomerative clustering.

    Parameters
    ----------
    X : array-like
        Data passed to ``kneighbors_graph`` and to
        ``AgglomerativeClustering.fit``.
    n_clusters : int
        Number of clusters to find.

    Returns
    -------
    tuple
        ``(X, labels)`` where ``labels`` is the fitted ``labels_`` array cast
        to the builtin ``int`` dtype.

    Notes
    -----
    The neighbour count is read from ``params['n_neighbors']``; ``params`` is
    not defined in this function and must exist at module level.
    """
    # "_args": [{"type": "numpy.ndarray","dtype": "float32"} ],
    #   "_return": [{ "type": "numpy.ndarray","dtype": "int32"}

    # in this case we want to try different numbers of clusters, so it is a parameter

    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph
    print('clusterer_sklearn_agglomerative')

    connectivity = kneighbors_graph(X,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)

    # Use the caller-supplied ``n_clusters``; it was previously ignored in
    # favour of ``params['n_clusters']``.
    # NOTE(review): ``affinity`` is deprecated in recent scikit-learn releases
    # in favour of ``metric`` -- confirm the target version.
    average_linkage = AgglomerativeClustering(linkage="average",
                                              affinity="cosine",
                                              n_clusters=n_clusters,
                                              connectivity=connectivity).fit(X)
    # ``np.int`` was removed in NumPy 1.24; it was an alias of builtin ``int``.
    labels = average_linkage.labels_.astype(int)

    return (X, labels)
Beispiel #58
0
    def fit(self, X, y=None):
        """Fit the clustering model

        Builds a distance graph from ``X``, computes its minimum spanning
        tree, removes tree edges according to ``self.cutoff`` and
        ``self.cutoff_scale``, and labels the remaining connected components.

        Parameters
        ----------
        X : array_like
            the data to be clustered: shape = [n_samples, n_features].
            When ``self.metric == 'precomputed'`` it is passed to
            ``validate_graph`` as the graph itself.
        y : ignored
            Not referenced by this method.

        Returns
        -------
        self
            With ``X_fit_``, ``full_tree_``, ``labels_`` and
            ``cluster_graph_`` set.  Points in components smaller than
            ``self.min_cluster_size`` get label -1.

        Raises
        ------
        ValueError
            If both ``self.cutoff`` and ``self.cutoff_scale`` are None, or if
            ``self.cutoff`` is negative.
        """
        if self.cutoff is None and self.cutoff_scale is None:
            # The message names the attribute that is actually checked
            # (it previously referred to a non-existent ``cutoff_frac``).
            raise ValueError("Must specify either cutoff or cutoff_scale")

        # Compute the distance-based graph G from the points in X
        if self.metric == 'precomputed':
            # Input is already a graph. Copy if sparse
            # so we can overwrite for efficiency below.
            self.X_fit_ = None
            G = validate_graph(X,
                               directed=True,
                               csr_output=True,
                               dense_output=False,
                               copy_if_sparse=True,
                               null_value_in=np.inf)
        elif not self.approximate:
            # exact mode: full pairwise distance matrix
            X = check_array(X)
            self.X_fit_ = X
            kwds = self.metric_params or {}
            G = pairwise_distances(X, metric=self.metric, **kwds)
            G = validate_graph(G,
                               directed=True,
                               csr_output=True,
                               dense_output=False,
                               copy_if_sparse=True,
                               null_value_in=np.inf)
        else:
            # generate a sparse graph using n_neighbors of each point
            X = check_array(X)
            self.X_fit_ = X
            # a point cannot have more neighbors than there are other points
            n_neighbors = min(self.n_neighbors, X.shape[0] - 1)
            G = kneighbors_graph(X,
                                 n_neighbors=n_neighbors,
                                 mode='distance',
                                 metric=self.metric,
                                 metric_params=self.metric_params)

        # HACK to keep explicit zeros (minimum spanning tree removes them)
        # NOTE(review): ``.min()`` raises on an empty selection, i.e. when G
        # stores no strictly positive entry -- confirm callers cannot hit this.
        zero_fillin = G.data[G.data > 0].min() * 1E-8
        G.data[G.data == 0] = zero_fillin

        # Compute the minimum spanning tree of this graph
        self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

        # undo the hack to bring back explicit zeros
        self.full_tree_[self.full_tree_ == zero_fillin] = 0

        # Partition the data by the cutoff.
        # i_cut is the number of smallest tree edges to keep:
        #   cutoff None     -> keep N edges
        #   0 <= cutoff < 1 -> cutoff is the fraction of N edges to cut
        #   cutoff >= 1     -> cutoff is the number of edges to cut
        N = G.shape[0] - 1
        if self.cutoff is None:
            i_cut = N
        elif 0 <= self.cutoff < 1:
            i_cut = int((1 - self.cutoff) * N)
        elif self.cutoff >= 1:
            i_cut = int(N - self.cutoff)
        else:
            raise ValueError('self.cutoff must be positive, not {0}'
                             ''.format(self.cutoff))

        # create the mask; we zero-out values where the mask is True
        # (N is rebound here to the number of stored tree entries)
        N = len(self.full_tree_.data)
        if i_cut < 0:
            mask = np.ones(N, dtype=bool)
        elif i_cut >= N:
            mask = np.zeros(N, dtype=bool)
        else:
            mask = np.ones(N, dtype=bool)
            # the first i_cut positions of the partition hold the smallest
            # i_cut edge weights; those edges are kept
            part = np.argpartition(self.full_tree_.data, i_cut)
            mask[part[:i_cut]] = False

        # additionally cut values above the ``cutoff_scale``
        if self.cutoff_scale is not None:
            mask |= (self.full_tree_.data > self.cutoff_scale)

        # Trim the tree
        cluster_graph = self.full_tree_.copy()

        # Eliminate zeros from cluster_graph for efficiency.
        # We want to do this:
        #    cluster_graph.data[mask] = 0
        #    cluster_graph.eliminate_zeros()
        # but there could be explicit zeros in our data!
        # So we call eliminate_zeros() with a stand-in data array,
        # then replace the data when we're finished.
        original_data = cluster_graph.data
        cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
        cluster_graph.data[mask] = 0
        cluster_graph.eliminate_zeros()
        # surviving stand-in values are 1-based positions into original_data
        cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

        # find connected components
        n_components, labels = connected_components(cluster_graph,
                                                    directed=False)

        # remove clusters with fewer than min_cluster_size
        counts = np.bincount(labels)
        to_remove = np.where(counts < self.min_cluster_size)[0]

        if len(to_remove) > 0:
            for i in to_remove:
                labels[labels == i] = -1
            # re-number the remaining labels consecutively
            _, labels = np.unique(labels, return_inverse=True)
            labels -= 1  # keep -1 labels the same

        # update cluster_graph by eliminating non-clusters
        # operationally, this means zeroing-out rows & columns where
        # the label is negative.
        I = sparse.eye(len(labels))
        I.data[0, labels < 0] = 0

        # we could just do this:
        #   cluster_graph = I * cluster_graph * I
        # but we want to be able to eliminate the zeros, so we use
        # the same indexing trick as above
        original_data = cluster_graph.data
        cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
        cluster_graph = I * cluster_graph * I
        cluster_graph.eliminate_zeros()
        cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

        self.labels_ = labels
        self.cluster_graph_ = cluster_graph
        return self
            ]


# One row of axes per dataset, three columns; all panels share both axes
# and have their ticks removed and limits fixed.
fig, axes = plt.subplots(figsize=(12, 12), ncols=3,
                         nrows=len(datasets), sharey=True, sharex=True)
plt.setp(axes, xticks=[], yticks=[], xlim=(-2.5, 2.5), ylim=(-2.5, 2.5))

for d, (dataset_label, dataset, algo_params) in enumerate(datasets):
    # Start from the defaults and apply this dataset's overrides.
    params = default_params.copy()
    params.update(algo_params)

    X, y = dataset
    X = StandardScaler().fit_transform(X)

    # Connectivity constraint for the hierarchical clustering: the
    # k-nearest-neighbour graph averaged with its transpose (symmetric).
    connectivity = kneighbors_graph(
        X, n_neighbors=params['n_neighbors'], include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)

    # The three clustering models
    kmeans = KMeans(n_clusters=params['n_clusters'])
    dbscan = DBSCAN(eps=params['eps'])
    average_linkage = AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)
    
    # (display name, estimator) pairs
    clustering_algorithms = (
        ('KMeans', kmeans),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan)
    )
    # Plotting
# Seed the global RNG so the generated data is reproducible.
np.random.seed(0)
# t has shape (1, n_samples) with values in [1.5*pi, 6*pi); it serves as both
# the angle and the radius, so (x, y) lies on a spiral.
t = 1.5 * np.pi * (1 + 3 * np.random.rand(1, n_samples))
x = t * np.cos(t)
y = t * np.sin(t)


# Stack to shape (2, n_samples), add Gaussian noise scaled by 0.7, then
# transpose to (n_samples, 2).
X = np.concatenate((x, y))
X += .7 * np.random.randn(2, n_samples)
X = X.T

# Create a graph capturing local connectivity. A larger number of neighbors
# will give more homogeneous clusters at the cost of computation
# time. A very large number of neighbors gives more evenly distributed
# cluster sizes, but may not impose the local manifold structure of
# the data
knn_graph = kneighbors_graph(X, 30, include_self=False)

for connectivity in (None, knn_graph):
    for n_clusters in (30, 3):
        plt.figure(figsize=(10, 4))
        for index, linkage in enumerate(('average', 'complete', 'ward')):
            plt.subplot(1, 3, index + 1)
            model = AgglomerativeClustering(linkage=linkage,
                                            connectivity=connectivity,
                                            n_clusters=n_clusters)
            t0 = time.time()
            model.fit(X)
            elapsed_time = time.time() - t0
            plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                        cmap=plt.cm.spectral)
            plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),