def test_isomap_reconstruction_error():
    # Same setup as in test_isomap_simple_grid, with an added dimension
    N_per_side = 5
    Npts = N_per_side ** 2
    n_neighbors = Npts - 1

    # grid of equidistant points in 2D, out_dim = n_dim
    X = np.array(list(product(range(N_per_side), repeat=2)))

    # add noise in a third dimension
    rng = np.random.RandomState(0)
    noise = 0.1 * rng.randn(Npts, 1)
    X = np.concatenate((X, noise), 1)

    # compute input kernel
    G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()
    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * G ** 2)

    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(n_neighbors=n_neighbors, out_dim=2,
                                  eigen_solver=eigen_solver,
                                  path_method=path_method)
            clf.fit(X)

            # compute output kernel
            G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors,
                                               mode="distance").toarray()
            K_iso = centerer.fit_transform(-0.5 * G_iso ** 2)

            # make sure error agrees
            reconstruction_error = np.linalg.norm(K - K_iso) / Npts
            assert_almost_equal(reconstruction_error,
                                clf.reconstruction_error())
def test_kneighbors_graph():
    """Test kneighbors_graph to build the k-Nearest Neighbor graph."""
    X = np.array([[0, 1], [1.01, 1.], [2, 0]])

    # n_neighbors = 1
    A = neighbors.kneighbors_graph(X, 1, mode='connectivity')
    assert_array_equal(A.toarray(), np.eye(A.shape[0]))

    A = neighbors.kneighbors_graph(X, 1, mode='distance')
    assert_array_almost_equal(
        A.toarray(),
        [[0.00, 1.01, 0.],
         [1.01, 0., 0.],
         [0.00, 1.40716026, 0.]])

    # n_neighbors = 2
    A = neighbors.kneighbors_graph(X, 2, mode='connectivity')
    assert_array_equal(
        A.toarray(),
        [[1., 1., 0.],
         [1., 1., 0.],
         [0., 1., 1.]])

    A = neighbors.kneighbors_graph(X, 2, mode='distance')
    assert_array_almost_equal(
        A.toarray(),
        [[0., 1.01, 2.23606798],
         [1.01, 0., 1.40716026],
         [2.23606798, 1.40716026, 0.]])

    # n_neighbors = 3
    A = neighbors.kneighbors_graph(X, 3, mode='connectivity')
    assert_array_almost_equal(
        A.toarray(),
        [[1, 1, 1], [1, 1, 1], [1, 1, 1]])
def test_include_self_neighbors_graph():
    """Test include_self parameter in neighbors_graph"""
    X = [[2, 3], [4, 5]]
    kng = neighbors.kneighbors_graph(X, 1, include_self=True).A
    kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).A
    assert_array_equal(kng, [[1.0, 0.0], [0.0, 1.0]])
    assert_array_equal(kng_not_self, [[0.0, 1.0], [1.0, 0.0]])

    rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).A
    rng_not_self = neighbors.radius_neighbors_graph(X, 5.0, include_self=False).A
    assert_array_equal(rng, [[1.0, 1.0], [1.0, 1.0]])
    assert_array_equal(rng_not_self, [[0.0, 1.0], [1.0, 0.0]])
def test_kneighbors_graph_sparse(seed=36):
    """Test kneighbors_graph to build the k-Nearest Neighbor graph
    for sparse input."""
    rng = np.random.RandomState(seed)
    X = rng.randn(10, 10)
    Xcsr = csr_matrix(X)

    for n_neighbors in [1, 2, 3]:
        for mode in ["connectivity", "distance"]:
            assert_array_almost_equal(
                neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(),
                neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(),
            )
def _fit_transform(self, X):
    self.nbrs_.fit(X)
    self.training_data_ = self.nbrs_._fit_X
    self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                 kernel="precomputed",
                                 eigen_solver=self.eigen_solver,
                                 tol=self.tol, max_iter=self.max_iter)

    kng = kneighbors_graph(self.nbrs_, self.n_neighbors, mode="distance")

    n_points = X.shape[0]
    n_workers = blob_ctx.get().num_workers
    if n_points < n_workers:
        tile_hint = (1, )
    else:
        tile_hint = (n_points / n_workers, )

    # task_array is used for deciding the indices of the starting points and
    # ending points among which each tile needs to find the shortest path.
    task_array = expr.ndarray((n_points,), tile_hint=tile_hint)
    task_array = task_array.force()

    # dist_matrix is used to hold the result
    dist_matrix = expr.ndarray((n_points, n_points),
                               reduce_fn=lambda a, b: a + b).force()

    results = task_array.foreach_tile(mapper_fn=_shortest_path_mapper,
                                      kw={'kng': kng,
                                          'directed': False,
                                          'dist_matrix': dist_matrix})
    self.dist_matrix_ = dist_matrix.glom()

    G = self.dist_matrix_ ** 2
    G *= -0.5
    self.embedding_ = self.kernel_pca_.fit_transform(G)
def _get_affinity_matrix(self, X, Y=None):
    """Calculate the affinity matrix from data

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples
        and n_features is the number of features.

        If affinity is "precomputed"
        X : array-like, shape (n_samples, n_samples)
        Interpret X as a precomputed adjacency graph computed from samples.

    Returns
    -------
    affinity_matrix, shape (n_samples, n_samples)
    """
    if self.affinity == 'precomputed':
        self.affinity_matrix_ = X
        print(type(self.affinity_matrix_))
        return self.affinity_matrix_

    # nearest_neighbors kept for backward compatibility
    if self.affinity == 'nearest_neighbors':
        if sparse.issparse(X):
            warnings.warn("Nearest neighbors affinity currently does "
                          "not support sparse input, falling back to "
                          "rbf affinity")
            self.affinity = "rbf"
        else:
            self.n_neighbors_ = (self.n_neighbors
                                 if self.n_neighbors is not None
                                 else max(int(X.shape[0] / 10), 1))
            self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_)
            # currently only symmetric affinity_matrix supported
            self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
                                           self.affinity_matrix_.T)
            return self.affinity_matrix_

    if self.affinity == 'radius_neighbors':
        if self.neighbors_radius is None:
            # another possible default value: diam(X) / sqrt(dimensions) / 10
            self.neighbors_radius_ = np.sqrt(X.shape[1])
        else:
            self.neighbors_radius_ = self.neighbors_radius

        self.gamma_ = (self.gamma
                       if self.gamma is not None else 1.0 / X.shape[1])
        self.affinity_matrix_ = radius_neighbors_graph(X, self.neighbors_radius_,
                                                       mode='distance')
        self.affinity_matrix_.data **= 2
        self.affinity_matrix_.data /= -self.neighbors_radius_ ** 2
        self.affinity_matrix_.data = np.exp(self.affinity_matrix_.data,
                                            self.affinity_matrix_.data)
        return self.affinity_matrix_

    if self.affinity == 'rbf':
        self.gamma_ = (self.gamma
                       if self.gamma is not None else 1.0 / X.shape[1])
        self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
        return self.affinity_matrix_

    self.affinity_matrix_ = self.affinity(X)
    return self.affinity_matrix_
def knn_connectivity(self, X): knn_graph = kneighbors_graph(X, 30, include_self=False) for connectivity in (None, knn_graph): n_clusters = 4 plt.figure(figsize=(10, 4)) for index, linkage in enumerate(('average', 'complete', 'ward')): plt.subplot(1, 3, index + 1) model = AgglomerativeClustering(linkage=linkage, connectivity=connectivity, n_clusters=n_clusters) t0 = time.time() model.fit(X) elapsed_time = time.time() - t0 plt.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.spectral) plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time), fontdict=dict(verticalalignment='top')) plt.axis('equal') plt.axis('off') plt.subplots_adjust(bottom=0, top=.89, wspace=0, left=0, right=1) plt.suptitle('n_cluster=%i, connectivity=%r' % (n_clusters, connectivity is not None), size=17) plt.show()
def call_spectral(num_cluster, mode_, data, update_flag):
    X = StandardScaler().fit_transform(data)
    spectral = SpectralClustering(n_clusters=num_cluster,
                                  eigen_solver='arpack',
                                  affinity='precomputed')
    connectivity = kneighbors_graph(X, n_neighbors=10)
    connectivity = 0.5 * (connectivity + connectivity.T)
    spectral.fit(connectivity)

    labels = spectral.labels_
    if update_flag:
        return labels

    label_dict = {}
    label_dict_count = 0
    for label in labels:
        label_dict[str(label_dict_count)] = float(label)
        label_dict_count = label_dict_count + 1
    print(label_dict)

    unique_dict = {}
    unique_dict_count = 0
    for uniq in np.unique(labels):
        print(uniq)
        unique_dict[str(unique_dict_count)] = float(uniq)
        unique_dict_count = unique_dict_count + 1
    print(unique_dict)

    return label_dict, unique_dict
def cluster_data(data,clustering_method,num_clusters): cluster_centers = labels_unique = labels = extra = None if clustering_method == 'KMeans': # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans k_means = KMeans(n_clusters=num_clusters,init='k-means++',n_init=10,max_iter=100,tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1) k_means.fit(data) labels = k_means.labels_ cluster_centers = k_means.cluster_centers_ elif clustering_method == 'MeanShift': ms = MeanShift( bin_seeding=True,cluster_all=False) ms.fit(data) labels = ms.labels_ cluster_centers = ms.cluster_centers_ elif clustering_method == 'AffinityPropagation': af = AffinityPropagation().fit(data) cluster_centers = [data[i] for i in af.cluster_centers_indices_] labels = af.labels_ elif clustering_method == "AgglomerativeClustering": n_neighbors=min(10,len(data)/2) connectivity = kneighbors_graph(data, n_neighbors=n_neighbors) ward = AgglomerativeClustering(n_clusters=num_clusters, connectivity=connectivity, linkage='ward').fit(data) labels = ward.labels_ elif clustering_method == "DBSCAN": db = DBSCAN().fit(data) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True extra = core_samples_mask labels = db.labels_ if labels is not None: labels_unique = np.unique(labels) return labels,cluster_centers,labels_unique,extra
def agglom(data, n_clusters):
    knn_graph = kneighbors_graph(data, 30, include_self=False)
    # use ward / average / complete for different results
    cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                      connectivity=knn_graph,
                                      linkage='ward')
    model = cluster.fit(data)
    return cluster.fit_predict(data)
def test_non_euclidean_kneighbors(): rng = np.random.RandomState(0) X = rng.rand(5, 5) # Find a reasonable radius. dist_array = pairwise_distances(X).flatten() np.sort(dist_array) radius = dist_array[15] # Test kneighbors_graph for metric in ['manhattan', 'chebyshev']: nbrs_graph = neighbors.kneighbors_graph( X, 3, metric=metric).toarray() nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X) assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) # Test radiusneighbors_graph for metric in ['manhattan', 'chebyshev']: nbrs_graph = neighbors.radius_neighbors_graph( X, radius, metric=metric).toarray() nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X) assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).toarray()) # Raise error when wrong parameters are supplied, X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, metric='euclidean') X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs, radius, metric='euclidean')
def agglomerative_clusters(self, word_vectors): #Pre-calculate BallTree object starting = time.time() Ball_Tree = BallTree(word_vectors, leaf_size = 200, metric = "minkowski") print("BallTree object in " + str(time.time() - starting)) #Pre-calculate k_neighbors graph starting = time.time() connectivity_graph = kneighbors_graph(Ball_Tree, n_neighbors = 1, mode = "connectivity", metric = "minkowski", p = 2, include_self = False, n_jobs = workers ) print("Pre-compute connectivity graph in " + str(time.time() - starting)) #Agglomerative clustering starting = time.time() Agl = AgglomerativeClustering(n_clusters = 100, affinity = "minkowski", connectivity = connectivity_graph, compute_full_tree = True, linkage = "average" ) Agl.fit(word_vectors) print("Agglomerative clustering in " + str(time.time() - starting)) clusters = Agl.labels_ return clusters
def agglomerative(self, connect=True, linkage='ward'):
    # connectivity constraint
    if connect:
        knn_graph = kneighbors_graph(self.X, 10)
    else:
        knn_graph = None

    if linkage in ('ward', 'average', 'complete'):
        model = AgglomerativeClustering(linkage=linkage,
                                        n_clusters=self.n_clusters,
                                        connectivity=knn_graph)
        model.fit(self.X)
        self.agglo = (model.labels_,)
    ### END - if linkage

    elif linkage == 'all':
        label_list = []
        for link in ('ward', 'average', 'complete'):
            model = AgglomerativeClustering(linkage=link,
                                            n_clusters=self.n_clusters,
                                            connectivity=knn_graph)
            print(link)
            print(self.X.shape)
            model.fit(self.X)
            label_list.append(model.labels_)
        ### END - for linkage
        self.agglo = tuple(label_list)
    ### END - elif linkage

    else:
        print("Error: Wrong linkage argument")
        return
    ### END - else

    return self.evaluate(self.agglo)
def _fit_process(self, X):
    """
    Computes the Laplacian score for the attributes

    :param X:
    :return:
    """
    self.scores_ = np.zeros(X.shape[1])

    # Similarity matrix
    S = kneighbors_graph(X, n_neighbors=self._n_neighbors, mode='distance')
    S = S.toarray()
    S *= S
    S /= self._bandwidth
    S = -S

    ones = np.ones(X.shape[0])
    D = np.diag(np.dot(S, ones))
    L = D - S
    qt = D.sum()

    for at in range(X.shape[1]):
        Fr = X[:, at]
        Fr_hat = Fr - np.dot(np.dot(Fr, D) / qt, ones)
        score1 = np.dot(np.dot(Fr_hat, L), Fr_hat)
        score2 = np.dot(np.dot(Fr_hat, D), Fr_hat)
        self.scores_[at] = score1 / score2
def _ward(X, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward_five = cluster.Ward(n_clusters=k, connectivity=connectivity)
    ward_five.fit(X)
    y_pred = ward_five.labels_.astype(numpy.int)
    return y_pred
def clustering_tweets_hc(labeled_tweets, num_cluster): vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param) tweet_vec = vectorizer.fit_transform(labeled_tweets).toarray() # print(tweet_vec) n_clusters = num_cluster from sklearn.neighbors import kneighbors_graph knn_graph = kneighbors_graph(tweet_vec, 1, include_self=False) # print(knn_graph) connectivity = knn_graph from sklearn.cluster import AgglomerativeClustering model = AgglomerativeClustering(linkage='ward', connectivity=connectivity, n_clusters=n_clusters) model.fit(tweet_vec) c = model.labels_ # print(c,len(c)) clustered_tweets = [] for i in range(0, num_cluster): similar_indices = (c == i).nonzero()[0] sent = '' for sid in similar_indices: sent = labeled_tweets[sid] + ' ' + sent clustered_tweets.append(sent) return clustered_tweets
def latent_cluster(SAMObject, n_clusters=10, X=None, plot=True, which_indices=(0, 1)):
    """
    Use Agglomerative clustering to cluster the latent space into a given number of clusters.
    ARG SAMObject: The SAMObject to operate on.
    ARG n_clusters: The number of clusters to find.
    ARG X: If None, we'll use the SAMObject's latent space, otherwise the provided one.
    ARG plot: Whether to plot the result or not.
    ARG which_indices: If plotting, which indices to plot.
    RETURN Y_: The cluster assignments for each component in the latent space.
    """
    from sklearn.cluster import AgglomerativeClustering

    if X is None:
        X = SAMObject._get_latent()

    # Define the structure A of the data. Here the 10 nearest neighbors
    from sklearn.neighbors import kneighbors_graph
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)

    # Compute clustering
    print("Compute structured hierarchical clustering...")
    ward = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity,
                                   linkage='ward', compute_full_tree=True).fit(X)
    #ward = AgglomerativeClustering(n_clusters=8, linkage='ward', compute_full_tree=True).fit(X)
    Y_ = ward.labels_

    if plot:
        color_iter = colors = cm.rainbow(np.linspace(0, 1, 20))
        # ---- a silly way to get maximal separation in colors for the n_clusters
        # first elements... move to separate function
        index_all = np.linspace(0, 19, 20).astype(int)
        space = np.floor(color_iter.shape[0] / float(n_clusters)).astype(int)
        index_first = index_all[::space][:n_clusters]
        index_rest = np.array(list(set(index_all) - set(index_first)))
        myperm = np.random.permutation(index_rest.shape[0])
        index_rest = index_rest[myperm]
        inds = np.hstack((index_first, index_rest))
        color_iter = color_iter[inds, :]
        marker_iter = itertools.cycle((',', '+', '.', 'o', '*', 'v', 'x', '>'))

        splot = pb.subplot(1, 1, 1)
        for i, (color, marker) in enumerate(zip(color_iter, marker_iter)):
            # as the method will not use every component it has access to unless it
            # needs it, we shouldn't plot the redundant components.
            #if not np.any(Y_ == i):
            #    continue
            ###### tmp
            #cc = ['b', 'g', 'r']
            #mm = ['<', '^', '>']
            #pb.scatter(X[Y_ == i, which_indices[0]], X[Y_ == i, which_indices[1]], s=40, color=cc[i], marker=mm[i])
            #######
            pb.scatter(X[Y_ == i, which_indices[0]], X[Y_ == i, which_indices[1]],
                       s=40, color=color, marker=marker)  # UNCOMMENT
            if i >= n_clusters:
                break
        pb.legend(np.unique(Y_))
        pb.show()
        pb.draw()
        pb.show()
    return Y_
def makeWard(X, k=2):
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(n_clusters=k, linkage='ward',
                                           connectivity=connectivity)
def makeMaxLinkage(X=None, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(linkage="complete",
                                           affinity="cityblock",
                                           n_clusters=k,
                                           connectivity=connectivity)
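# A minimal usage sketch for the two factory helpers above, assuming `cluster`
# refers to sklearn.cluster and kneighbors_graph is imported as in the snippets;
# the toy data and variable names here are illustrative only.
import numpy as np

X_demo = np.random.RandomState(0).rand(50, 2)
ward_model = makeWard(X_demo, k=3)            # structured Ward linkage
complete_model = makeMaxLinkage(X_demo, k=3)  # complete linkage, cityblock affinity
print(ward_model.fit_predict(X_demo)[:10])
print(complete_model.fit_predict(X_demo)[:10])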
def median_min_distance(data, metric):
    """This function computes a graph of nearest-neighbors for each sample point in
    'data' and returns the median of the distribution of distances between those
    nearest-neighbors, the distance metric being specified by 'metric'.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        The data-set, a fraction of whose sample points will be extracted
        by density sampling.

    metric : string
        The distance metric used to determine the nearest-neighbor to each data-point.
        The DistanceMetric class defined in scikit-learn's library lists all available metrics.

    Returns
    -------
    median_min_dist : float
        The median of the distribution of distances between nearest-neighbors.
    """
    data = np.atleast_2d(data)
    nearest_distances = kneighbors_graph(data, 1, mode='distance',
                                         metric=metric, include_self=False).data
    median_min_dist = np.median(nearest_distances, overwrite_input=True)
    return round(median_min_dist, 4)
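# Minimal usage sketch for median_min_distance, assuming numpy and
# sklearn.neighbors.kneighbors_graph are imported as in the snippet above;
# the random sample below is made up for illustration.
import numpy as np

pts = np.random.RandomState(0).rand(100, 3)
print(median_min_distance(pts, metric='euclidean'))  # median nearest-neighbour distance of this sample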
def cluster_documents(documents, num_clusters=10, num_terms=30, clust_alg='kmeans', verbose_docs=True): '''A document is an object with a tokens attribute where tokens is a list of tokens. Documents is a list of these document objects''' labels = range(num_clusters) true_k = len(labels) texts = [' '.join(doc['tokens']) for doc in documents] vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2, stop_words='english', use_idf=True) vector_space = vectorizer.fit_transform(texts) if clust_alg == 'minibatch': clusterer = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000) elif clust_alg == 'kmeans': clusterer = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1) elif clust_alg == 'agglomerative': #Note this doesn't work atm. Or rather its output (and input) is shaped #differently from kmeans and minibatchkmeans connectivity = kneighbors_graph(vector_space.toarray(), n_neighbors=true_k) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) clusterer = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward', connectivity=connectivity) clusterer.fit(vector_space) #Re-attach the cluster results to the original documents if verbose_docs: clustered_docs = [dict({'cluster': doc[0].item()}.items() + doc[1].items()) for doc in zip(clusterer.labels_, documents)] else: clustered_docs = [dict({'cluster': doc[0].item()}.items() + doc[1].items()) for doc in zip(clusterer.labels_, documents)] by_cluster = defaultdict(list) for doc in clustered_docs: by_cluster[doc['cluster']].append(doc) by_cluster = dict(by_cluster) # Top terms in each cluster clusters = [] terms = vectorizer.get_feature_names() order_centroids = clusterer.cluster_centers_.argsort()[:, ::-1] for i in range(true_k): cluster_info = {'cluster': i} cluster_terms = [] for ind in order_centroids[i, :num_terms]: cluster_terms.append(terms[ind]) cluster_info['terms'] = cluster_terms clusters.append(cluster_info) # pp(by_cluster) # pp(clusters) # by cluster is a list of return (by_cluster, clusters)
def example1():
    """Plot the k-nearest-neighbor graph.

    The k samples closest to each point are treated as its neighbors.
    """
    train = np.array([[1, 2, 4, 7, 9, 10]]).transpose()
    graph = kneighbors_graph(train, 2)  # k = 2
    print(graph)
    print(graph.toarray())
def build_kneighbors_table(features, k_neighbors):
    sparse_connections = kneighbors_graph(features, k_neighbors + 1)

    # Iterate to unpack neighbors from sparse connections
    connections = list()
    for ridx in range(len(features)):
        connections.append(sparse_connections[ridx].nonzero()[1].tolist()[1:])

    return connections
def test_kneighbors_graph(self):
    x = [[0], [3], [1]]
    df = pdml.ModelFrame(x)
    result = df.neighbors.kneighbors_graph(2)
    expected = neighbors.kneighbors_graph(x, 2)
    self.assert_numpy_array_almost_equal(result.toarray(), expected.toarray())
def hierarchical_clustering(corpus_fn, n_clusters=2, linkage='complete'):
    corpus = corpora.MmCorpus(corpus_fn)
    corpus = matutils.corpus2csc(corpus, num_terms=corpus.num_terms).transpose()
    svd = TruncatedSVD(n_components=100)
    new_corpus = svd.fit_transform(corpus)
    knn_graph = kneighbors_graph(new_corpus, 10, metric='euclidean')
    agg = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',
                                  linkage=linkage, connectivity=knn_graph)
    agg.fit(new_corpus)
    return corpus, agg.labels_
def agglomerative_clustering(crime_rows, column_names, num_clusters):
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    print("Running Agglomerative Clustering")
    agglo_clustering = AgglomerativeClustering(
        n_clusters=num_clusters,
        connectivity=neighbors.kneighbors_graph(crime_xy, n_neighbors=2))
    agglomerative_clustering_labels = agglo_clustering.fit_predict(crime_xy)
    print("formatting....")
    return _format_clustering(agglomerative_clustering_labels,
                              crime_xy, crime_info, column_names)
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None,
                         method='ward', verbose=False):
    """Cluster the data using Ward's algorithm

    Parameters
    ==========
    X: array of shape (n_voxels, n_subjects)
       the functional data, across subjects
    n_parcels: int, the desired number of parcels
    xyz: array of shape (n_voxels, 3), optional
         positions of the voxels in grid coordinates
    shape: tuple, the domain shape (assuming a grid structure), optional
           alternative specification of positions
    mask: arbitrary array of arbitrary dimension, optional
          alternative specification of positions
    method: string, one of ['ward', 'spectral', 'kmeans'], optional
            clustering method

    Returns
    =======
    label: array of shape (n_voxels): the resulting cluster assignment

    Note
    ====
    One of xyz, shape or mask needs to be provided
    """
    from sklearn.cluster import spectral_clustering, k_means
    if mask is not None:
        connectivity = grid_to_graph(*shape, mask=mask)
    elif shape is not None:
        connectivity = grid_to_graph(*shape)
    elif xyz is not None:
        from sklearn.neighbors import kneighbors_graph
        n_neighbors = 2 * xyz.shape[1]
        connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors)
    else:
        raise ValueError('One of mask, shape or xyz has to be provided')

    if n_parcels == 1:
        return np.zeros(X.shape[0])
    if method == 'ward':
        connectivity = connectivity.tocsr()
        ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X)
        label = ward.labels_
    elif method == 'spectral':
        i, j = connectivity.nonzero()
        sigma = np.sum((X[i] - X[j]) ** 2, 1).mean()
        connectivity.data = np.exp(- np.sum((X[i] - X[j]) ** 2, 1) / (2 * sigma))
        label = spectral_clustering(connectivity, n_clusters=n_parcels)
    elif method == 'kmeans':
        _, label, _ = k_means(X, n_parcels)
    else:
        raise ValueError('Unknown method for parcellation')
    return label
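# Minimal usage sketch for cluster_spatial_data under the 'shape' code path,
# assuming numpy and sklearn.feature_extraction.image.grid_to_graph are
# available as in the snippet above; the fake data below is illustrative only.
import numpy as np

n_x, n_y, n_subjects = 10, 10, 5
fake_X = np.random.RandomState(0).randn(n_x * n_y, n_subjects)
labels = cluster_spatial_data(fake_X, n_parcels=4, shape=(n_x, n_y), method='kmeans')
print(np.bincount(labels.astype(int)))  # parcel sizes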
def clustering(X, algorithm, n_clusters): # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(X) # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=0.3) # connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # Generate the new colors: if algorithm=='MiniBatchKMeans': model = cluster.MiniBatchKMeans(n_clusters=n_clusters) elif algorithm=='Birch': model = cluster.Birch(n_clusters=n_clusters) elif algorithm=='DBSCAN': model = cluster.DBSCAN(eps=.2) elif algorithm=='AffinityPropagation': model = cluster.AffinityPropagation(damping=.9, preference=-200) elif algorithm=='MeanShift': model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) elif algorithm=='SpectralClustering': model = cluster.SpectralClustering(n_clusters=n_clusters, eigen_solver='arpack', affinity="nearest_neighbors") elif algorithm=='Ward': model = cluster.AgglomerativeClustering(n_clusters=n_clusters, linkage='ward', connectivity=connectivity) elif algorithm=='AgglomerativeClustering': model = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=n_clusters, connectivity=connectivity) model.fit(X) if hasattr(model, 'labels_'): y_pred = model.labels_.astype(np.int) else: y_pred = model.predict(X) return X, y_pred
def agglomerative(num_clusters, similarity, dataset, header, text_sim=False): if text_sim: connectivity = kneighbors_graph(similarity, 5) else: values = dataset[header] connectivity = kneighbors_graph(values, 5) """ #Based on images of each users? from scipy.sparse import csr_matrix users = set(dataset_now["uid"]) connectivity = np.zeros([ dataset_now.shape[0], dataset_now.shape[0] ]) for i, user1 in enumerate(dataset_now["uid"]): for j, user2 in enumerate(dataset_now["uid"]): if user1 == user2: connectivity[i][j] = 1 connectivity = csr_matrix(connectivity) """ return AgglomerativeClustering( n_clusters=num_clusters, connectivity=connectivity, compute_full_tree=True ).fit_predict(similarity)
def test_isomap_simple_grid():
    # Isomap should preserve distances when all neighbors are used
    N_per_side = 5
    Npts = N_per_side ** 2
    n_neighbors = Npts - 1

    # grid of equidistant points in 2D, out_dim = n_dim
    X = np.array(list(product(range(N_per_side), repeat=2)))

    # distances from each point to all others
    G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()

    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(n_neighbors=n_neighbors, out_dim=2,
                                  eigen_solver=eigen_solver,
                                  path_method=path_method)
            clf.fit(X)

            G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors,
                                               mode="distance").toarray()
            assert_array_almost_equal(G, G_iso)
def Laplacian_matrix(data, knn, gamma):
    print(data.shape)
    A = np.full((data.shape[0], data.shape[0]), 0.0, dtype=np.float64)
    #L = np.full((data.shape[0], data.shape[0]), 0.0, dtype=np.float64)
    if knn != -1:
        A = kneighbors_graph(data, n_neighbors=knn, mode='distance').toarray()
        D = np.diagflat((np.full((1, data.shape[0]), knn, dtype=np.float64)))
        print("Weight Matrix created using KNN : ", knn)
    elif gamma != -1:
        # RBF kernel for constructing the similarity matrix
        A = sklearn.metrics.pairwise.rbf_kernel(data, gamma=gamma)
        #D = np.diagflat(np.full((1, data.shape[0]), (data.shape[0]) - 1, dtype=np.float64))
        D = np.diagflat(np.count_nonzero(A, axis=1))
        print("Weight Matrix created using gamma : ", gamma)

    print("Dimensions of Similarity Matrix: ", A.shape)
    print(A)
    print("Dimensions of Degree Matrix: ", D.shape)
    print(D)

    L = np.subtract(D, A)
    print("Dimensions of Laplacian Matrix: ", L.shape)
    print(L)
    print("Laplacian Matrix created . . .")
    return A, D, L
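# Minimal usage sketch for Laplacian_matrix, assuming numpy, sklearn.metrics.pairwise
# and kneighbors_graph are imported as in the snippet above; the random data and
# parameter values are illustrative only.
import numpy as np

pts = np.random.RandomState(0).rand(20, 4)
A, D, L = Laplacian_matrix(pts, knn=3, gamma=-1)        # k-NN weighting
A2, D2, L2 = Laplacian_matrix(pts, knn=-1, gamma=0.5)   # RBF-kernel weighting
print(L.shape, L2.shape)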
def fit(self, X):
    """Fit the clustering model

    Parameters
    ----------
    X : array_like
        the data to be clustered: shape = [n_samples, n_features]
    """
    X = np.asarray(X, dtype=float)
    self.X_train_ = X

    # generate a sparse graph using the k nearest neighbors of each point
    G = kneighbors_graph(X, n_neighbors=self.n_neighbors, mode='distance')

    # Compute the minimum spanning tree of this graph
    self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

    # Find the cluster labels
    self.n_components_, self.labels_, self.cluster_graph_ = \
        self.compute_clusters()

    return self
def hierarchical_clustering(nb_clust, nb_feat, centroid, cluster_init, dataCentroid):
    # Preparation of the contiguity matrix
    X = np.zeros(shape=(len(centroid), 2))
    for key, value in centroid.items():
        X[key] = value
    knn_graph = kneighbors_graph(X, 8, include_self=False)

    linkage = 'ward'
    dataModel = np.zeros(shape=(len(centroid), nb_feat))
    for key, value in dataCentroid.items():
        dataModel[key] = value[1:(nb_feat + 1)]
        #dataModel[key] = value

    model = AgglomerativeClustering(linkage=linkage,
                                    connectivity=knn_graph,
                                    n_clusters=nb_clust)
    model.fit(dataModel)

    new_id_clust = []
    for row in cluster_init:
        clust = model.labels_[row]
        new_id_clust.append(clust)
        print(clust)
    return new_id_clust
def link_clustering(x, inverselengthscale, n_clusters, n_neighbors):
    global log
    linkage = 'complete'
    log.append('Linkage : {}'.format(linkage))
    log.append('n_clusters : {} , n_neighbors : {}'.format(n_clusters, n_neighbors))
    # print n_neighbors, x.shape
    n_neighbors = int(len(x) * n_neighbors)
    knn_graph = kneighbors_graph(x, n_neighbors, include_self=False)
    clustering = AgglomerativeClustering(linkage=linkage,
                                         n_clusters=n_clusters,
                                         connectivity=knn_graph)
    clustering.fit(x)
    labels = clustering.labels_
    return labels
def generate_edges(X, mode='kneighbors_graph', n_neighbors=3, radius=0.1):
    """
    returns array with pairs of indices [vertex_from, vertex_to] and weight vector
    """
    n_neighbors = min(n_neighbors, len(X) - 1)
    if n_neighbors == 0:
        return X[:, 3].reshape(-1, 1), np.zeros((1, 5)), np.zeros((2, 1))

    if mode == 'kneighbors_graph':
        adjacency_matrix = np.array((kneighbors_graph(X=X[:, :3],
                                                      n_neighbors=n_neighbors,
                                                      mode='distance')).todense())
    elif mode == 'radius_neighbors_graph':
        adjacency_matrix = np.array((radius_neighbors_graph(X=X[:, :3],
                                                            radius=radius,
                                                            mode='distance')).todense())
    else:
        raise ValueError('Unknown mode {}'.format(mode))

    rows, cols = np.where(adjacency_matrix > 0)
    edges = np.vstack([rows, cols])
    weights = adjacency_matrix[rows, cols]

    nodes_features = X[:, 3].reshape(-1, 1)
    edges_features = X[edges.T[:, 0]] - X[edges.T[:, 1]]

    return nodes_features, np.c_[edges_features, weights], edges.astype(int)
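# Minimal usage sketch for generate_edges, assuming numpy, kneighbors_graph and
# radius_neighbors_graph are imported as in the snippet above. The input is
# expected to carry xyz coordinates in columns 0-2 and one extra feature in
# column 3; the random point cloud below is illustrative only.
import numpy as np

cloud = np.random.RandomState(0).rand(10, 4)  # 10 points: x, y, z, feature
node_feats, edge_feats, edge_index = generate_edges(cloud, mode='kneighbors_graph',
                                                    n_neighbors=3)
print(node_feats.shape, edge_feats.shape, edge_index.shape)  # (10, 1), (n_edges, 5), (2, n_edges)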
def GeoDesicMatrix(self, X):
    self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                  algorithm=self.neighbors_algorithm,
                                  metric=self.metric, p=self.p,
                                  metric_params=self.metric_params,
                                  n_jobs=self.n_jobs)
    self.nbrs_.fit(X)

    kng = kneighbors_graph(self.nbrs_, self.n_neighbors,
                           metric=self.metric, p=self.p,
                           metric_params=self.metric_params,
                           mode='distance', n_jobs=self.n_jobs)

    self.dist_matrix_ = graph_shortest_path(kng,
                                            method=self.path_method,
                                            directed=False)
    G = self.dist_matrix_ ** 2
    return G
def diffusion_mapping(X, n_components=2, n_neighbors=5, alpha=1.0, t=1,
                      gamma=0.5, metric='minkowski', p=2, metric_params=None,
                      n_jobs=1):
    knn = kneighbors_graph(X, n_neighbors, mode='distance', metric=metric,
                           metric_params=metric_params, p=p, n_jobs=n_jobs)
    K = sparse.csr_matrix(
        (np.exp(-gamma * knn.data ** 2), knn.indices, knn.indptr))
    mask = (K != 0).multiply(K.T != 0)
    L = K + K.T - K.multiply(mask)
    D = sparse.diags(np.asarray(L.sum(axis=0)).reshape(-1))
    L_a = D.power(-alpha) @ L @ D.power(-alpha)
    D_a = sparse.diags(np.asarray(L_a.sum(axis=1)).reshape(-1))
    m = D_a.power(-1) @ L_a
    w, v = eigs(m, n_components + 1)
    # eigs returns complex numbers, but for Markov matrices, all eigenvalues are
    # real and in [0, 1].
    return (m.dot(v[:, 1:]) * (w[1:] ** t)).real
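# Minimal usage sketch for diffusion_mapping, assuming numpy, scipy.sparse (as
# `sparse`) and scipy.sparse.linalg.eigs are imported as in the snippet above;
# the random data and parameter choices are illustrative only.
import numpy as np

pts = np.random.RandomState(0).randn(200, 3)
emb = diffusion_mapping(pts, n_components=2, n_neighbors=10, gamma=1.0)
print(emb.shape)  # (200, 2): one diffusion coordinate per retained eigenvector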
def _fit_transform(self, X):
    self.nbrs_.fit(X)
    self.training_data_ = self.nbrs_._fit_X
    self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                 kernel="precomputed",
                                 eigen_solver=self.eigen_solver,
                                 tol=self.tol, max_iter=self.max_iter)

    kng = kneighbors_graph(self.nbrs_, self.n_neighbors, mode="distance")

    n_points = X.shape[0]
    n_workers = blob_ctx.get().num_workers
    if n_points < n_workers:
        tile_hint = (1, )
    else:
        tile_hint = (n_points / n_workers, )

    # task_array is used for deciding the indices of the starting points and
    # ending points among which each tile needs to find the shortest path.
    task_array = expr.ndarray((n_points, ), tile_hint=tile_hint)
    task_array = task_array.evaluate()

    # dist_matrix is used to hold the result
    dist_matrix = expr.ndarray((n_points, n_points),
                               reduce_fn=lambda a, b: a + b).evaluate()

    results = task_array.foreach_tile(mapper_fn=_shortest_path_mapper,
                                      kw={'kng': kng,
                                          'directed': False,
                                          'dist_matrix': dist_matrix})
    self.dist_matrix_ = dist_matrix.glom()

    G = self.dist_matrix_ ** 2
    G *= -0.5
    self.embedding_ = self.kernel_pca_.fit_transform(G)
def get_RF_avgRList_byAggloCluster(self, cluster_ratio):
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph

    trees = self.trees
    m, n = self.X_train.shape

    # get_RF_RList
    RF_RList = []
    for tree in trees:
        tree_RList = tree.tree.get_RList()
        tree_RMat = np.array(tree_RList)
        # tree_new_RMat = np.zeros((tree_RMat.shape[0], n, 2))
        # tree_new_RMat[:, tree.feat_ind] = tree_RMat
        RF_RList.extend(tree_RMat)  # len = m

    RF_R_Mat = np.array(RF_RList)     # (m, n, 2), col0=center, col1=radius
    RF_R_centers = RF_R_Mat[:, :, 0]  # (m, n)
    RF_R_radius = RF_R_Mat[:, :, 1]   # (m, n)

    # get the number of clusters
    avg_num_R = int(RF_R_Mat.shape[0])  # total R divided by number of trees

    # get the connectivity graph of R_list
    connect_graph = kneighbors_graph(RF_R_centers,
                                     n_neighbors=int(0.7 * len(trees)),
                                     include_self=False)
    # connect_graph shape = (m, m); value = 1 if neighbor, else 0
    if isinstance(cluster_ratio, float):
        try:
            R_cluster = AgglomerativeClustering(n_clusters=int(cluster_ratio * avg_num_R),
                                                connectivity=connect_graph,
                                                linkage='ward').fit(RF_R_centers)
        except ValueError as e:
            print('ValueError ', e)
            R_cluster = AgglomerativeClustering(n_clusters=int(cluster_ratio * avg_num_R) + 1,
                                                connectivity=connect_graph,
                                                linkage='ward').fit(RF_R_centers)
def test_non_euclidean_kneighbors(): rng = np.random.RandomState(0) X = rng.rand(5, 5) # Find a reasonable radius. dist_array = pairwise_distances(X).flatten() np.sort(dist_array) radius = dist_array[15] # Test kneighbors_graph for metric in ['manhattan', 'chebyshev']: nbrs_graph = neighbors.kneighbors_graph(X, 3, metric=metric).toarray() nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X) assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) # Test radiusneighbors_graph for metric in ['manhattan', 'chebyshev']: nbrs_graph = neighbors.radius_neighbors_graph(X, radius, metric=metric).toarray() nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X) assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A) # Raise error when wrong parameters are supplied, X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, metric='euclidean') X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs, radius, metric='euclidean')
def compute_propagation(order, idx_train, labels, emb, exp):
    ### here we need to get the optimum k ####
    k_range = range(1, 12)
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    grid = GridSearchCV(knn, param_grid, cv=10, scoring="accuracy")
    grid.fit(gene_feature, labels)
    #print grid.best_params_, grid.best_params_['n_neighbors']

    GF = kneighbors_graph(gene_feature, grid.best_params_['n_neighbors'],
                          mode='connectivity', include_self=False)
    G = nx.from_numpy_matrix(GF.A)
    nds = range(G.number_of_nodes())
    #print nds
    #print "nx.info embeddings:", nx.info(G)
    Laplacian_matrtix = nx.laplacian_matrix(G, nodelist=nds, weight='weight')
    L_exp = nx.laplacian_matrix(get_network, nodelist=nds, weight='weight')

    #### harmonic part ####
    y = labels.copy()
    I = identity(G.number_of_nodes())
    lamb = 1.0
    Laplacian_matrtix = np.add(Laplacian_matrtix * emb, L_exp * exp)
    fu = spsolve((I + Laplacian_matrtix * lamb), labels)
    return fu
def _affinity_propagation(feature, ground_truth, config): ref_sc = -1 optimal_preference = 0 optimal_damping_factor = -1 optimal_affinity = 'euclidean' optimal_n_neighbors = config['n_neighbors'][0] if(config['affinity'].count('euclidean')>0): for p in config['preference']: for d in config['damping_factor']: af = cluster.AffinityPropagation(preference=p, damping=d).fit(feature) y_pred_af = af.labels_ ars_af = metrics.adjusted_rand_score(ground_truth, y_pred_af) if(ars_af > ref_sc): ref_sc = ars_af optimal_preference = p optimal_damping_factor = d if(config['affinity'].count('precomputed')>0): for p in config['preference']: for d in config['damping_factor']: for n_neighbors in config['n_neighbors']: connectivity = kneighbors_graph(feature, n_neighbors=n_neighbors,include_self=True) affinity_matrix = 0.5 * (connectivity + connectivity.T) affinity_matrix = np.asarray(affinity_matrix.todense(),dtype=float) af = cluster.AffinityPropagation(damping=d, affinity='precomputed').fit(affinity_matrix) y_pred_af = af.labels_ ars_af = metrics.adjusted_rand_score(ground_truth, y_pred_af) if(ars_af > ref_sc): ref_sc = ars_af optimal_preference = p optimal_damping_factor = d optimal_affinity = 'precomputed' optimal_n_neighbors = n_neighbors logging.info('ari %.3f'% ref_sc) return {'preference': optimal_preference, 'damping_factor': optimal_damping_factor, 'ari': ref_sc, 'affinity': optimal_affinity, 'n_neighbors': optimal_n_neighbors }
def buildAdjacencyGraph3(matrix, top_k):
    nn = NearestNeighbors(n_neighbors=top_k, metric='cosine',
                          n_jobs=multiprocessing.cpu_count())
    nn.fit(matrix)
    adjMatrix = kneighbors_graph(nn, top_k, mode='distance', metric='cosine',
                                 n_jobs=multiprocessing.cpu_count()).toarray()
    [rows, cols] = adjMatrix.shape

    # Set the diagonal to be zero, there is no edge from a node to itself
    # if (rows == cols):
    #     for r in range(rows):
    #         adjMatrix[r][r] = 0
    # for row in range(rows):
    #     for ind in range(cols):
    #         if (adjMatrix[row][ind] != 0):
    #             adjMatrix[row][ind] = 1 - adjMatrix[row][ind]

    # turn cosine distances into similarities (np.where returns a new array,
    # so the result must be assigned back)
    adjMatrix = numpy.where(adjMatrix > 0, 1 - adjMatrix, 0)

    graph = nx.convert_matrix.from_numpy_matrix(
        adjMatrix, parallel_edges=False, create_using=nx.DiGraph()).to_undirected()
    return graph
def customNcuts(self): """ Return segmentation label using classic Ncuts """ # computing neighboors graph A = kneighbors_graph(self.values, self.k, mode='distance', include_self=False).toarray() for i in range(self.values.shape[0]): for j in range(self.values.shape[0]): if A[i][j] > 0: v1 = (self.values[i][3], self.values[i][4], self.values[i][5]) v2 = (self.values[j][3], self.values[j][4], self.values[j][5]) magnitude1 = np.sqrt(v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2]) magnitude2 = np.sqrt(v2[0] * v2[0] + v2[1] * v2[1] + v2[2] * v2[2]) ang = np.arccos(np.dot(v1, v2) / (magnitude1 * magnitude2)) A[i][j] = max(self.values[i][7], self.values[j][7]) * A[i][j] # init SpectralClustering sc = SpectralClustering(4, affinity='precomputed', n_init=10, assign_labels='discretize') # cluster labels = sc.fit_predict(A) return labels
def iterative_nearest_neighbor(self, cluster_labels): labels = cluster_labels.copy() connectivity = kneighbors_graph(self.model_data, n_neighbors=4, include_self=False).toarray() conn_df = pd.DataFrame(connectivity) df = conn_df * (labels + 1) a = df.apply(lambda row: row.nunique() > 2, axis=1) k_neighs = a[a].index.tolist() while len(k_neighs) > 0: x = k_neighs[0] k_neighs.pop(0) cls_neighs = df.loc[x][df.loc[x] > 0] cross_region = [ df.loc[r][df.loc[r] > 0] - 1 for r in cls_neighs.index if r in k_neighs ] cls_region = [item for elem in cross_region for item in elem] cls_region.extend(cls_neighs.values - 1) densed = pd.Series(cls_region).value_counts().index[0] #print(labels[cls_neighs.index] , int(densed)) labels[cls_neighs.index] = int(densed) #print(int(densed), labels[region]) #labels[region] = int(densed) df = conn_df * (labels + 1) k_neighs = list(set(k_neighs) - set(cls_neighs.index)) sizes = pd.Series(labels).value_counts() return (labels, sizes)
def build_edges(Points, K=16):
    '''
    from point coordinates to edgelist and edge information
    input:
        Data: 3D tensor (batch_size, num_points, dim=3)
    output:
        Edgelist: 3D tensor (batch_size, num_edges, 2)
        Edge_info: 3D tensor (batch_size, num_edges, dim=3)
    '''
    Edgelist = []
    Edge_info = []
    [batch_size, num_points, dim] = Points.shape

    from sklearn.neighbors import kneighbors_graph
    for i_sample in range(batch_size):
        data = Points[i_sample, :, :]
        A = kneighbors_graph(data, K, mode='connectivity', include_self=True)
        edgelist = np.transpose(np.stack(np.nonzero(A)))
        edge_info = np.concatenate(
            [data[edgelist[:, 1], :], data[edgelist[:, 0], :]], axis=1)
        Edgelist.append(edgelist)
        Edge_info.append(edge_info)

    Edgelist = np.stack(Edgelist)    # (i, j)
    Edge_info = np.stack(Edge_info)  # x_j - x_i \in R^3
    return Edgelist, Edge_info
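# Minimal usage sketch for build_edges, assuming numpy is imported as np as in
# the snippets above; the random batch of point clouds is illustrative only.
import numpy as np

batch = np.random.RandomState(0).rand(2, 50, 3)  # 2 clouds of 50 points in 3D
edge_index, edge_info = build_edges(batch, K=8)
print(edge_index.shape, edge_info.shape)  # (2, 400, 2) and (2, 400, 6) for K=8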
def doiteration(new_data, dataframe):
    # kmeans
    kmeans = KMeans(n_clusters=6, random_state=0).fit(new_data)
    labels_kmeans = kmeans.labels_
    set_lk = set(labels_kmeans)

    # spectral
    spectral = SpectralClustering()
    spectral.fit(new_data)
    spectral_labels = spectral.labels_
    set_ls = set(spectral_labels)

    # hierarchical
    connectivity = kneighbors_graph(new_data, n_neighbors=10, include_self=False)
    ward = AgglomerativeClustering(n_clusters=8, connectivity=connectivity,
                                   linkage='ward').fit(new_data)
    h_labels = ward.labels_
    set_lh = set(h_labels)

    colNames = list(dataframe.columns.values)

    labels_dict_kc = tsc.getLabelsDict(set_lk, labels_kmeans)
    # pass into cosine similarity computations
    print("\nkmeans\n")
    userdata = tsc.cosine_computations(labels_dict_kc, set_lk, labels_kmeans,
                                       colNames, dataframe)
    print(tsc.average_sim_cluster(userdata))

    print("\nSpectral\n")
    labels_dict_sc = tsc.getLabelsDict(set_ls, spectral_labels)
    # pass into cosine similarity computations
    userdata = tsc.cosine_computations(labels_dict_sc, set_ls, spectral_labels,
                                       colNames, dataframe)
    # print userdata
    print(tsc.average_sim_cluster(userdata))

    print("\nHierarchical\n")
    labels_dict_hc = tsc.getLabelsDict(set_lh, h_labels)
    # pass into cosine similarity computations
    userdata = tsc.cosine_computations(labels_dict_hc, set_lh, h_labels,
                                       colNames, dataframe)
    # print userdata
    print(tsc.average_sim_cluster(userdata))
def create_agglomerative_models(self, n_cluster_list, linkage_methods=None): """ Create multiple agglomerative models based on a list of 'n_clusters' values and defined linkage methods. """ if isinstance(n_cluster_list, int): n_cluster_list = [n_cluster_list] if not linkage_methods: linkage_methods = ["ward", "complete", "average", "single"] knn_graph = kneighbors_graph( self.__scaled, len( self.__scaled) - 1, include_self=False) for n_clusters in n_cluster_list: for connectivity in (None, knn_graph): for _, linkage in enumerate(linkage_methods): model = AgglomerativeClustering(linkage=linkage, connectivity=connectivity, n_clusters=n_clusters) model.fit(self.__scaled) self.__all_cluster_models[ "AgglomerativeClustering_{0}_" "cluster{1}_Connectivity{2}".format( linkage, n_clusters, connectivity is not None)] = model print( "Successfully generate Agglomerative model with " "linkage {0} on n_clusters={1}".format( linkage, n_clusters))
def configuraciones_agglomerative(subset):
    normalized_set = preprocessing.normalize(subset, norm='l2')

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(normalized_set, n_neighbors=10,
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    ward_10 = cl.AgglomerativeClustering(n_clusters=10, linkage='ward')
    ward_10_connectivity = cl.AgglomerativeClustering(
        n_clusters=10, linkage='ward', connectivity=connectivity)
    ward_20 = cl.AgglomerativeClustering(n_clusters=20, linkage='ward')
    ward_20_connectivity = cl.AgglomerativeClustering(
        n_clusters=20, linkage='ward', connectivity=connectivity)

    # add them to a list
    clustering_algorithms = (('Ward-10', ward_10),
                             ('Ward-10-con', ward_10_connectivity),
                             ('Ward-20', ward_20),
                             ('Ward-20-con', ward_20_connectivity))
    return clustering_algorithms
def snn(X, neighbor_num, min_shared_neighbor_num):
    """Perform the Shared Nearest Neighbor (SNN) clustering algorithm.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array
        of shape (n_samples, n_samples)
        A feature array

    neighbor_num : int
        K number of neighbors to consider for shared nearest neighbor similarity

    min_shared_neighbor_num : int
        Number of nearest neighbors that two data points need to share to be
        considered part of the same cluster
    """
    # for each data point, find its set of K nearest neighbors
    knn_graph = kneighbors_graph(X, n_neighbors=neighbor_num, include_self=False)
    neighbors = np.array([set(knn_graph[i].nonzero()[1]) for i in range(len(X))])

    # the distance matrix is computed as the complement of the proportion of
    # shared neighbors between each pair of data points
    snn_distance_matrix = np.asarray(
        [[get_snn_distance(neighbors[i], neighbors[j])
          for j in range(len(neighbors))] for i in range(len(neighbors))])

    # perform DBSCAN with the shared-neighbor distance criterion for density estimation
    dbscan = DBSCAN(min_samples=min_shared_neighbor_num, metric="precomputed")
    dbscan = dbscan.fit(snn_distance_matrix)
    return dbscan.core_sample_indices_, dbscan.labels_
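# Minimal usage sketch for snn above. get_snn_distance is defined elsewhere in
# the original project; the version below is an assumed stand-in (one minus the
# fraction of shared neighbors), and the toy data is illustrative only.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph

def get_snn_distance(neighbors_i, neighbors_j):
    # complement of the proportion of shared neighbors (assumed definition)
    shared = len(neighbors_i & neighbors_j)
    return 1.0 - shared / float(len(neighbors_i))

X_demo, _ = make_blobs(n_samples=60, centers=3, random_state=0)
core_idx, labels = snn(X_demo, neighbor_num=10, min_shared_neighbor_num=5)
print(np.unique(labels))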
def calc_graph(X, k, sigma):
    """
    Given data X construct graphs with k nearest neighbours and weighted by
    Gaussian kernel with std sigma

    Parameters
    ----------
    X - array - TxQ array of T timepoints with Q features each
    k - int - number of nearest neighbours
    sigma - float - standard deviation of Gaussian kernel

    Returns
    -------
    TxT adjacency matrix of weighted graph

    Notes
    -----
    k=0 means complete graph
    sigma=0 means unweighted
    Can't do both
    """
    assert isinstance(k, int), 'k must be an integer'

    T = X.shape[0]
    X = X.reshape(T, np.prod(X.shape[1:]))

    if k == 0 and sigma == 0:
        assert False, "Can't have k and sigma both equal to 0 - thats a complete unweighted graph"

    if k == 0:
        G = 1.
    else:
        G = kneighbors_graph(X, k, include_self=False)
        G = 0.5 * (G + G.T).toarray()

    if sigma == 0:
        W = 1.
    else:
        dist_G = squareform(pdist(X))
        W = np.exp(-(dist_G ** 2) / (2 * sigma * sigma)) - np.identity(T)

    WG = G * W
    return WG
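# Minimal usage sketch for calc_graph, assuming numpy and
# scipy.spatial.distance.pdist / squareform are imported as in the snippet
# above; the random time series is illustrative only.
import numpy as np

series = np.random.RandomState(0).randn(30, 5)  # 30 timepoints, 5 features
W = calc_graph(series, k=4, sigma=1.0)
print(W.shape)              # (30, 30) weighted adjacency matrix
print(np.allclose(W, W.T))  # symmetric by construction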
def __init__(self, data, regex, embedding):
    dataToDict = data.fillna('').to_dict(orient="records")
    regexToDict = regex.to_dict()

    xy = UMAP().fit_transform(embedding)
    A = kneighbors_graph(xy, n_neighbors=1)
    G = nx.from_scipy_sparse_matrix(A)
    E = G.edges()

    coords = [{'x': x, 'y': y} for (x, y) in xy.tolist()]
    nearestNeighbors = [{'source': int(s), 'target': int(t)} for (s, t) in E]

    super().__init__(**{
        'data': dataToDict,
        'regex': regexToDict,
        'coords': coords,
        'nearestNeighbors': nearestNeighbors
    })
def clusterer_sklearn_ward(X, n_clusters):
    # "_args": [{"type": "numpy.ndarray", "dtype": "float32"}],
    # "_return": [{"type": "numpy.ndarray", "dtype": "int32"}]
    # in this case we want to try different numbers of clusters, so it is a parameter
    import sklearn
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph
    import numpy as np

    print('clusterer_sklearn_ward')
    connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward = AgglomerativeClustering(n_clusters=params['n_clusters'],
                                   linkage='ward',
                                   connectivity=connectivity).fit(X)
    clusterAlgLabelAssignmentsSW = ward.labels_.astype(np.int)
    XY = (X, clusterAlgLabelAssignmentsSW)
    return (XY)
def compute_propagation(order, idx_train, labels, emb, exp):
    ### here we need to get the optimum k from the range 1 to 12 ####
    k_range = range(1, 12)
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    grid = GridSearchCV(knn, param_grid, cv=10, scoring="accuracy")
    grid.fit(gene_feature, labels)

    GF = kneighbors_graph(gene_feature, grid.best_params_['n_neighbors'],
                          mode='connectivity', include_self=False)
    G = nx.from_numpy_matrix(GF.A)
    #print "nx.info embeddings:", nx.info(G)
    Laplacian_matrtix = nx.laplacian_matrix(G, nodelist=order, weight='weight')
    L_exp = nx.laplacian_matrix(get_network, nodelist=order, weight='weight')
    Laplacian_matrtix = np.add(Laplacian_matrtix * emb, L_exp * exp)

    l = len(idx_train)
    u = len(idx_test)
    r, c = Laplacian_matrtix.shape
    Lll = Laplacian_matrtix[0:l, 0:l]
    Llu = Laplacian_matrtix[0:l, l:r]
    Lul = Laplacian_matrtix[l:r, 0:l]
    Luu = Laplacian_matrtix[l:r, l:r]
    yl = labels[idx_train]
    fu = -linalg.pinv(Luu.A).dot(Lul.A).dot(yl)
    return fu
def generate_adjacency_matrix(feature_vectors, mode='knn'):
    covariances = torch.zeros([feature_vectors.shape[0],
                               feature_vectors.shape[1],
                               feature_vectors.shape[1]])
    if mode == 'cov':
        for batch in range(feature_vectors.shape[0]):
            cov = np.cov(feature_vectors[batch])
            covariances[batch] = torch.tensor(cov)
        covariances[covariances >= 0.5] = 1.
        covariances[covariances < 0.5] = 0.
    else:
        for batch in range(feature_vectors.shape[0]):
            matrix = kneighbors_graph(feature_vectors[batch],
                                      n_neighbors=1).toarray()
            covariances[batch] = torch.tensor(
                np.clip(matrix + matrix.T, a_min=0, a_max=1))

    # np.save('sample_graphs.npy', covariances)
    # exit()
    return covariances
def _build_graph(self):
    """Compute the graph Laplacian."""
    # Graph sparsification
    if self.sparsify == 'epsilonNN':
        self.A_ = radius_neighbors_graph(self.X_, self.radius, include_self=False)
    else:
        Q = kneighbors_graph(
            self.X_,
            self.n_neighbors,
            include_self=False
        ).astype(np.bool)

        if self.sparsify == 'kNN':
            self.A_ = (Q + Q.T).astype(np.float64)
        elif self.sparsify == 'MkNN':
            self.A_ = (Q.multiply(Q.T)).astype(np.float64)

    # Edge re-weighting
    if self.reweight == 'rbf':
        W = rbf_kernel(self.X_, gamma=self.t)
        self.A_ = self.A_.multiply(W)

    return sp.csgraph.laplacian(self.A_, normed=self.normed)
def clusterer_sklearn_agglomerative(X, n_clusters):
    # "_args": [{"type": "numpy.ndarray", "dtype": "float32"}],
    # "_return": [{"type": "numpy.ndarray", "dtype": "int32"}]
    # in this case we want to try different numbers of clusters, so it is a parameter
    import sklearn
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph
    import numpy as np

    print('clusterer_sklearn_agglomerative')
    connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'],
                                    include_self=False)
    average_linkage = AgglomerativeClustering(linkage="average",
                                              affinity="cosine",
                                              n_clusters=params['n_clusters'],
                                              connectivity=connectivity).fit(X)
    clusterAlgLabelAssignmentsSAG = average_linkage.labels_.astype(np.int)
    XY = (X, clusterAlgLabelAssignmentsSAG)
    return (XY)
def fit(self, X, y=None): """Fit the clustering model Parameters ---------- X : array_like the data to be clustered: shape = [n_samples, n_features] """ if self.cutoff is None and self.cutoff_scale is None: raise ValueError("Must specify either cutoff or cutoff_frac") # Compute the distance-based graph G from the points in X if self.metric == 'precomputed': # Input is already a graph. Copy if sparse # so we can overwrite for efficiency below. self.X_fit_ = None G = validate_graph(X, directed=True, csr_output=True, dense_output=False, copy_if_sparse=True, null_value_in=np.inf) elif not self.approximate: X = check_array(X) self.X_fit_ = X kwds = self.metric_params or {} G = pairwise_distances(X, metric=self.metric, **kwds) G = validate_graph(G, directed=True, csr_output=True, dense_output=False, copy_if_sparse=True, null_value_in=np.inf) else: # generate a sparse graph using n_neighbors of each point X = check_array(X) self.X_fit_ = X n_neighbors = min(self.n_neighbors, X.shape[0] - 1) G = kneighbors_graph(X, n_neighbors=n_neighbors, mode='distance', metric=self.metric, metric_params=self.metric_params) # HACK to keep explicit zeros (minimum spanning tree removes them) zero_fillin = G.data[G.data > 0].min() * 1E-8 G.data[G.data == 0] = zero_fillin # Compute the minimum spanning tree of this graph self.full_tree_ = minimum_spanning_tree(G, overwrite=True) # undo the hack to bring back explicit zeros self.full_tree_[self.full_tree_ == zero_fillin] = 0 # Partition the data by the cutoff N = G.shape[0] - 1 if self.cutoff is None: i_cut = N elif 0 <= self.cutoff < 1: i_cut = int((1 - self.cutoff) * N) elif self.cutoff >= 1: i_cut = int(N - self.cutoff) else: raise ValueError('self.cutoff must be positive, not {0}' ''.format(self.cutoff)) # create the mask; we zero-out values where the mask is True N = len(self.full_tree_.data) if i_cut < 0: mask = np.ones(N, dtype=bool) elif i_cut >= N: mask = np.zeros(N, dtype=bool) else: mask = np.ones(N, dtype=bool) part = np.argpartition(self.full_tree_.data, i_cut) mask[part[:i_cut]] = False # additionally cut values above the ``cutoff_scale`` if self.cutoff_scale is not None: mask |= (self.full_tree_.data > self.cutoff_scale) # Trim the tree cluster_graph = self.full_tree_.copy() # Eliminate zeros from cluster_graph for efficiency. # We want to do this: # cluster_graph.data[mask] = 0 # cluster_graph.eliminate_zeros() # but there could be explicit zeros in our data! # So we call eliminate_zeros() with a stand-in data array, # then replace the data when we're finished. original_data = cluster_graph.data cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1) cluster_graph.data[mask] = 0 cluster_graph.eliminate_zeros() cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1] # find connected components n_components, labels = connected_components(cluster_graph, directed=False) # remove clusters with fewer than min_cluster_size counts = np.bincount(labels) to_remove = np.where(counts < self.min_cluster_size)[0] if len(to_remove) > 0: for i in to_remove: labels[labels == i] = -1 _, labels = np.unique(labels, return_inverse=True) labels -= 1 # keep -1 labels the same # update cluster_graph by eliminating non-clusters # operationally, this means zeroing-out rows & columns where # the label is negative. 
I = sparse.eye(len(labels)) I.data[0, labels < 0] = 0 # we could just do this: # cluster_graph = I * cluster_graph * I # but we want to be able to eliminate the zeros, so we use # the same indexing trick as above original_data = cluster_graph.data cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1) cluster_graph = I * cluster_graph * I cluster_graph.eliminate_zeros() cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1] self.labels_ = labels self.cluster_graph_ = cluster_graph return self
]

fig, axes = plt.subplots(figsize=(12, 12), ncols=3, nrows=len(datasets),
                         sharey=True, sharex=True)
plt.setp(axes, xticks=[], yticks=[], xlim=(-2.5, 2.5), ylim=(-2.5, 2.5))

for d, (dataset_label, dataset, algo_params) in enumerate(datasets):
    params = default_params.copy()
    params.update(algo_params)

    X, y = dataset
    X = StandardScaler().fit_transform(X)

    # connectivity graph for structured hierarchical clustering (Ward, minimum variance)
    connectivity = kneighbors_graph(
        X, n_neighbors=params['n_neighbors'], include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)

    # three clustering models
    kmeans = KMeans(n_clusters=params['n_clusters'])
    dbscan = DBSCAN(eps=params['eps'])
    average_linkage = AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)

    clustering_algorithms = (
        ('KMeans', kmeans),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan)
    )

    # plotting
np.random.seed(0) t = 1.5 * np.pi * (1 + 3 * np.random.rand(1, n_samples)) x = t * np.cos(t) y = t * np.sin(t) X = np.concatenate((x, y)) X += .7 * np.random.randn(2, n_samples) X = X.T # Create a graph capturing local connectivity. Larger number of neighbors # will give more homogeneous clusters to the cost of computation # time. A very large number of neighbors gives more evenly distributed # cluster sizes, but may not impose the local manifold structure of # the data knn_graph = kneighbors_graph(X, 30, include_self=False) for connectivity in (None, knn_graph): for n_clusters in (30, 3): plt.figure(figsize=(10, 4)) for index, linkage in enumerate(('average', 'complete', 'ward')): plt.subplot(1, 3, index + 1) model = AgglomerativeClustering(linkage=linkage, connectivity=connectivity, n_clusters=n_clusters) t0 = time.time() model.fit(X) elapsed_time = time.time() - t0 plt.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.spectral) plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),