def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
    print "ClusterBalancing..."
    indexesPicked = []
    obs1 = self.observations[indexesToPick]
    obs = normalize(obs1, axis=0)
    if len(indexesToPick) != 0:
        if kmeansFlag:
            if(len(indexesToPick) < self.numClusters):
                cluster = KMeans(init='k-means++', n_clusters=len(obs), n_init=10)
            else:
                cluster = KMeans(init='k-means++', n_clusters=self.numClusters, n_init=10)
        else:
            # estimator form: spectral_clustering() is a function and has no .fit()/.labels_
            if(len(indexesToPick) < self.numClusters):
                cluster = SpectralClustering(n_clusters=len(obs), n_init=10)
            else:
                cluster = SpectralClustering(n_clusters=self.numClusters, n_init=10)
        cluster.fit(obs)
        labels = cluster.labels_

        whenToStop = max(2, stopCount)
        count = 0
        while count != whenToStop:
            cluster_list = range(self.numClusters)
            index = 0
            for j in labels:
                if j in cluster_list:
                    indexesPicked.append(indexesToPick[index])
                    cluster_list.remove(j)
                    count += 1
                    if count == whenToStop:
                        break
                    labels[index] = -1
                if len(cluster_list) == 0:
                    break
                index += 1
    return indexesPicked
def test_spectral_clustering_with_arpack_amg_solvers():
    # Test that spectral_clustering is the same for arpack and amg solver
    # Based on toy example from plot_segmentation_toy.py

    # a small two coin image
    x, y = np.indices((40, 40))

    center1, center2 = (14, 12), (20, 25)
    radius1, radius2 = 8, 7

    circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1 ** 2
    circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2 ** 2

    circles = circle1 | circle2
    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    labels_arpack = spectral_clustering(
        graph, n_clusters=2, eigen_solver='arpack', random_state=0)

    assert len(np.unique(labels_arpack)) == 2

    if amg_loaded:
        labels_amg = spectral_clustering(
            graph, n_clusters=2, eigen_solver='amg', random_state=0)
        assert adjusted_rand_score(labels_arpack, labels_amg) == 1
    else:
        assert_raises(
            ValueError, spectral_clustering,
            graph, n_clusters=2, eigen_solver='amg', random_state=0)
def image_features_labels(img, n_clusters, maxPixel):
    # X is the feature vector with one row of features per image
    imageSize = maxPixel * maxPixel
    img = resize(img, (maxPixel, maxPixel))
    mask = img.astype(bool)
    # Convert the image into a graph with the value of the gradient on the
    # edges.
    graph = s_im.img_to_graph(img, mask=mask)
    # Take a decreasing function of the gradient: we take it weakly
    # dependent from the gradient the segmentation is close to a voronoi
    graph.data = np.exp(-graph.data / graph.data.std())
    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters, eigen_solver='arpack')
    label_im = -np.ones(mask.shape)
    label_im[mask] = labels
    X = np.zeros(imageSize, dtype=float)
    # Store the rescaled image pixels
    X[0:imageSize] = np.reshape(label_im, (1, imageSize))
    return X
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
def speclu(data_matrix, k):
    # use spectral clustering
    print 'using spectral clustering......'
    E_matrix = getEMatrix(data_matrix)
    result_total = spectral_clustering(E_matrix, n_clusters=k)
    result = result_total[:len(data_matrix)]
    return result
def compute(n):
    G, nodes, ego = build_graph(n)
    A = nx.to_numpy_matrix(G)
    C = connectedness(A)
    row, col = A.shape
    if row >= 350:
        clus = 10
    else:
        clus = 6
    L = spectral_clustering(C, n_clusters=clus)
    circles = []
    for x in range(0, clus):
        circles += [[]]
    tmp = 0
    for node in nodes:
        circles[L[tmp]] += [node]
        tmp += 1
    final_circle = []
    for circle in circles:
        if len(circles) == 1:
            final_circle += [circle]
            continue
        den = compute_density(circle, nodes, A)
        if den + 1e-9 < .250:
            continue
        final_circle += [circle]
    # print(final_circle)
    return ego, final_circle
def classifySpeCluLsa(self, class_num):
    from draw_data import draw_data
    draw_title = draw_data()
    lsa = models.LsiModel.load('model.lsa', mmap='r')
    logging.info("load lsa model!!")
    index = similarities.MatrixSimilarity.load('model_lsa.index')
    self.get_data(num=3000)
    (tfidf, dictionary) = self.get_tfidf(True, num=3000)
    hash_id2list = dict()  # maps id -> index; used to look up rows of similar_matrix
    for i in range(len(self.title_id)):
        hash_id2list[self.title_id[i]] = i
    logging.info('start building the similarity matrix...')
    similar_matrix = np.zeros((len(tfidf), len(tfidf)))  # holds pairwise similarities
    for i in range(len(tfidf)):
        sims = index[lsa[tfidf[i]]]
        for j, v in enumerate(sims):
            similar_matrix[i][j] = v
            similar_matrix[j][i] = v
    logging.info('done, similarity matrix built; classifying with spectral clustering...')
    labels = spectral_clustering(similar_matrix, n_clusters=class_num, eigen_solver='arpack')
    self.vector_table = [[] for i in range(class_num)]
    for i in range(len(labels)):
        self.vector_table[labels[i]].append(self.title_id[i])
    logging.info("print set... " + str(len(self.vector_table)))
    self.printTitleTOfile(hash_id2list)
    draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-12-25')
def community_clustering():
    path = settings.COMMUNITY_PATH
    index = 0
    communities = []
    merged_communities = {}
    for root, dirs, files in os.walk(path):
        for year in files:
            merged_communities[int(year)] = [[] for i in range(200)]
            comm_dict = {}
            input = open(os.path.join(path, year))
            for line in input:
                x = line.strip().split(' ')
                author = int(x[0])
                id = int(x[1])
                if not comm_dict.has_key(id):
                    comm_dict[id] = Community(int(year), id, index)
                    index += 1
                comm_dict[id].append_member(author)
            for id in comm_dict.keys():
                communities.append(comm_dict[id])
    verbose.debug("num of communities: " + str(len(communities)))
    adjacency = np.ndarray(shape=(len(communities), len(communities)), dtype=int)
    for i in range(len(communities)):
        for j in range(i + 1, len(communities)):
            affinity = communities[i].intersect(communities[j])
            adjacency[i, j] = affinity
            adjacency[j, i] = affinity
    labels = spectral_clustering(adjacency, n_clusters=200)
    verbose.debug("clustering finished")
    for i in range(len(labels)):
        merged_communities[communities[i].year][labels[i]].extend(communities[i].members)
    for year in merged_communities.keys():
        cluster_file = open(settings.DATA_PATH + "\\clusters\\" + str(year), 'w')
        for i in range(len(merged_communities[year])):
            [cluster_file.write(str(member) + ',') for member in merged_communities[year][i]]
def spectral(tweetfile, npmifile, dictfile, k, noc):
    Ptmp = textscan(npmifile, '([^ ]*) ([^ ]*) ([^ ]*)')
    PP = textscan(dictfile, '(.*) (.*)', (int, str))
    PP[0] -= 1
    PMI = ssp.coo_matrix(
        (Ptmp[2], (Ptmp[0] - 1, Ptmp[1] - 1)),
        (PP[0].shape[0], PP[0].shape[0])
    ).tocsr()
    W = knnmatrix(PMI, k)
    # This is hideous and wrong and it must be fixed
    W = ssp.csr_matrix(minimum(W.todense(), W.T.todense()))
    s, comp = ssp.csgraph.connected_components(W, directed=False)
    comp_mode = mstats.mode(comp)[0]
    inds = comp == comp_mode
    inds = [x for x in range(W.shape[0]) if inds[x]]
    WW = W[inds, :][:, inds]
    P = PP[1][inds]
    ids = P
    X = WW
    c = spectral_clustering(X, n_clusters=noc, eigen_solver='arpack')
    fid = file("".join(['cl.', tweetfile, '-', str(noc)]), 'w')
    for i in range(max(c) + 1):
        cl = [x for x in range(len(c)) if c[x] == i]
        b, wordsix = centralityn(cl, X, ids)
        for j in range(len(b)):
            word = wordsix[j]
            fid.write('%s %d %.5f\n' % (word, i, b[j]))
def __init__(self, laplacian, ncluster, classesnames):
    self.laplacian = laplacian
    self.ncluster = ncluster
    m, n = laplacian.shape
    print 'size Laplacian_matrix: ', m, n
    labels = spectral_clustering(laplacian, n_clusters=ncluster)
    x = range(n + 1)
    wordsall = zip(x, classesnames)
    lc = zip(labels, x)
    print "labels", lc
    allwordsclustered = []
    for m in range(ncluster):
        sort = [item[1] for item in lc if item[0] == m]
        wordsclustered = []
        for y in sort:
            for item in wordsall:
                if item[0] == y:
                    wordsclustered.append(item[1])
        if len(wordsclustered) > 1:
            allwordsclustered.append(wordsclustered)
    print 'clusteredwords'
    print allwordsclustered
    self.cluster = len(allwordsclustered), allwordsclustered
def getPairwiseDistanceMatrix(self):
    """
    It is slightly slower but memory efficient; the fast implementation is not
    tractable in terms of memory at such a scale.
    """
    self.clusters = []
    dataSize = self.data_points.shape
    self.PDistMat = sp.sparse.csr_matrix((dataSize[0], dataSize[0]))
    for k in range(dataSize[0]):
        CurrentPoint = self.data_points[k, :]
        Dist = sp.spatial.distance.cdist(np.reshape(CurrentPoint, (1, dataSize[1])),
                                         self.data_points, 'euclidean')
        kMins = []
        kDists = []
        maxD = np.max(Dist) + 1
        while len(kMins) < 5:
            cMins = np.argmin(Dist)
            kMins.append(cMins)
            kDists.append(Dist[0, cMins])
            Dist[0, cMins] = maxD
        for pt in range(len(kMins)):
            # print kMins[pt], k, self.PDistMat.shape, kDists[pt], pt, kDists
            self.PDistMat[k, kMins[pt]] = kDists[pt]
            self.PDistMat[kMins[pt], k] = kDists[pt]
    SM = self.PDistMat.data.mean()
    # Here we go a bit low-level and apply e^(-x) directly to the data array
    self.PDistMat.data[:] = np.exp(((-1) * self.PDistMat.data) / SM)
    # self.PDistMat.data = np.exp((-1)*self.PDistMat.data)
    pickle.dump(self.PDistMat, open('pdist.bnbb', 'wb'))
    labs = spectral_clustering(self.PDistMat, n_clusters=20)
    pickle.dump(labs, open('labs.bnbb', 'wb'))
def cluster_nodes(dist_laplacian, clusters=3, show=False):
    norm_laplacian = Lapl_normalize(dist_laplacian)
    norm_laplacian.setdiag(0)
    norm_laplacian = -norm_laplacian
    if show:
        plt.imshow(norm_laplacian.toarray(), cmap='jet', interpolation="nearest")
        plt.colorbar()
        plt.show()
    labels = spectral_clustering(norm_laplacian, n_clusters=clusters, eigen_solver='arpack')
    return np.reshape(labels, (dist_laplacian.shape[0], 1))
def spectralClusteringTest01():
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.feature_extraction import image
    from sklearn.cluster import spectral_clustering

    l = 100
    x, y = np.indices((l, l))  # x, y are 2-D arrays giving each point's x and y coordinates

    center1 = (28, 24)
    center2 = (40, 50)
    center3 = (67, 58)
    center4 = (24, 70)

    radius1, radius2, radius3, radius4 = 16, 14, 15, 14

    circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1 ** 2
    circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2 ** 2
    circle3 = (x - center3[0]) ** 2 + (y - center3[1]) ** 2 < radius3 ** 2
    circle4 = (x - center4[0]) ** 2 + (y - center4[1]) ** 2 < radius4 ** 2

    img = circle1 + circle2 + circle3 + circle4
    mask = img.astype(bool)
    img = img.astype(float)
    img += 1 + 0.2 * np.random.randn(*img.shape)

    # Convert the image into a graph with the value of the gradient on the edges
    # img is a 100 x 100 picture, mask is a 100 x 100 boolean template,
    # graph is a sparse matrix (the gradient is presumably computed in this step)
    graph = image.img_to_graph(img, mask=mask)
    print graph.shape
    graph.data = np.exp(-graph.data / graph.data.std())

    # The number of clusters is still specified here;
    # only the points inside the mask are clustered
    labels = spectral_clustering(graph, n_clusters=4, eigen_solver="arpack")
    print labels

    label_im = -np.ones(mask.shape)
    label_im[mask] = labels

    plt.matshow(img)
    plt.matshow(label_im)
    plt.show()
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None,
                         method='ward', verbose=False):
    """Cluster the data using Ward, spectral, or k-means clustering

    Parameters
    ==========
    X: array of shape(n_voxels, n_subjects)
       the functional data, across subjects
    n_parcels: int, the desired number of parcels
    xyz: array of shape (n_voxels, 3), optional
         positions of the voxels in grid coordinates
    shape: tuple: the domain shape (assuming a grid structure), optional
           alternative specification of positions
    mask: arbitrary array of arbitrary dimension, optional
          alternative specification of positions
    method: string, one of ['ward', 'spectral', 'kmeans'], optional
            clustering method

    Returns
    =======
    label: array of shape(n_voxels): the resulting cluster assignment

    Note
    ====
    One of xyz, shape or mask needs to be provided
    """
    from sklearn.cluster import spectral_clustering, k_means
    if mask is not None:
        connectivity = grid_to_graph(*shape, mask=mask)
    elif shape is not None:
        connectivity = grid_to_graph(*shape)
    elif xyz is not None:
        from sklearn.neighbors import kneighbors_graph
        n_neighbors = 2 * xyz.shape[1]
        connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors)
    else:
        raise ValueError('One of mask, shape or xyz has to be provided')

    if n_parcels == 1:
        return np.zeros(X.shape[0])
    if method == 'ward':
        connectivity = connectivity.tocsr()
        ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X)
        label = ward.labels_
    elif method == 'spectral':
        i, j = connectivity.nonzero()
        sigma = np.sum((X[i] - X[j]) ** 2, 1).mean()
        connectivity.data = np.exp(- np.sum((X[i] - X[j]) ** 2, 1) / (2 * sigma))
        label = spectral_clustering(connectivity, n_clusters=n_parcels)
    elif method == 'kmeans':
        _, label, _ = k_means(X, n_parcels)
    else:
        raise ValueError('Unknown method for parcellation')
    return label
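# A minimal usage sketch for cluster_spatial_data above (not from the original source):
# the grid shape, the all-true mask, and the random data are illustrative assumptions,
# and the function plus the helpers it calls (grid_to_graph, Ward, k_means) are assumed
# to be in scope.
import numpy as np

shape = (10, 10, 10)                  # small 3-D grid domain (assumed)
mask = np.ones(shape, dtype=bool)     # keep every voxel
X = np.random.randn(mask.sum(), 5)    # fake data: 1000 voxels x 5 "subjects"

labels = cluster_spatial_data(X, n_parcels=8, shape=shape, mask=mask, method='spectral')
print(labels.shape)                   # one parcel label per voxel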
def cluster_and_rank_demos(sm, n_clusters, eigen_solver='arpack', assign_labels='discretize'):
    """
    Clusters demos based on similarity matrix.
    """
    labels = spectral_clustering(sm, n_clusters=n_clusters,
                                 eigen_solver=eigen_solver, assign_labels=assign_labels)
    clusters = {i: [] for i in xrange(n_clusters)}
    for i, l in enumerate(labels):
        clusters[l].append(i)
    # Maybe re-cluster large demos
    return rank_demos_in_cluster(clusters, sm)
def test_spectral_clustering(self):
    N = 50
    m = np.random.random_integers(1, 200, size=(N, N))
    m = (m + m.T) / 2

    df = pdml.ModelFrame(m)
    result = df.cluster.spectral_clustering(random_state=self.random_state)
    expected = cluster.spectral_clustering(m, random_state=self.random_state)

    self.assertIsInstance(result, pdml.ModelSeries)
    tm.assert_index_equal(result.index, df.index)
    tm.assert_numpy_array_equal(result.values, expected)
def test_spectral_lobpcg_mode():
    # Test the lobpcg mode of SpectralClustering
    # We need a fairly big data matrix, as lobpcg does not work with
    # small data matrices
    centers = np.array([[0.0, 0.0], [10.0, 10.0]])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=0.1, random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    labels = spectral_clustering(S, n_clusters=len(centers),
                                 random_state=0, eigen_solver="lobpcg")
    # We don't care too much that it's good, just that it *worked*.
    # There does have to be some lower limit on the performance though.
    assert_greater(np.mean(labels == true_labels), 0.3)
def __speclu(self):
    # use spectral clustering
    print 'using spectral clustering......'
    data_matrix = self.data_matrix
    if len(data_matrix) == len(data_matrix[0]):
        print "Donot need to use E_matrix"
        E_matrix = data_matrix
    else:
        E_matrix = self.__getEMatrix()
    result_total = spectral_clustering(E_matrix, n_clusters=self.k)
    result = result_total[:len(data_matrix)]
    return result
def spectralcluster(correlations, n_clusters, names):
    labels = cluster.spectral_clustering(correlations, n_clusters=n_clusters,
                                         eigen_solver=None, random_state=0,
                                         n_init=10, k=None, eigen_tol=0.0,
                                         assign_labels='kmeans', mode=None)
    # print labels
    clusdict = []
    print ""
    print "Spectral Clustering - shape: " + str(correlations.shape)
    for i in range(labels.max() + 1):
        print 'Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))
        clusdict.append(names[labels == i])
    # print clusdict
    return clusdict
def run_snf2(w1, w2, wall_label):
    Dist1 = dist2(w1.values, w1.values)
    Dist2 = dist2(w2.values, w2.values)
    S1 = snf.compute.affinity_matrix(Dist1, K=args.neighbor_size, mu=args.mu)
    S2 = snf.compute.affinity_matrix(Dist2, K=args.neighbor_size, mu=args.mu)

    # Do SNF2 diffusion
    (
        dicts_common,
        dicts_commonIndex,
        dict_sampleToIndexs,
        dicts_unique,
        original_order,
    ) = data_indexing([w1, w2])
    S1_df = pd.DataFrame(data=S1, index=original_order[0], columns=original_order[0])
    S2_df = pd.DataFrame(data=S2, index=original_order[1], columns=original_order[1])

    fused_networks = snf2(
        args,
        [S1_df, S2_df],
        dicts_common=dicts_common,
        dicts_unique=dicts_unique,
        original_order=original_order,
    )
    S1_fused = fused_networks[0]
    S2_fused = fused_networks[1]

    # S2_fused = S2_fused.reindex(wall_label.index.tolist())
    # labels_final = spectral_clustering(S2_fused.values, n_clusters=10)
    # score = v_measure_score(wall_label["label"].tolist(), labels_final)
    # print("SNF2 for clustering union 832 samples NMI score:", score)

    S_final = tsne_p_deep(
        args,
        dicts_commonIndex,
        dict_sampleToIndexs,
        [S1_fused.values, S2_fused.values],
    )
    S_final_df = pd.DataFrame(data=S_final, index=dict_sampleToIndexs.keys())
    S_final_df = S_final_df.reindex(wall_label.index.tolist())

    Dist_final = dist2(S_final_df.values, S_final_df.values)
    Wall_final = snf.compute.affinity_matrix(
        Dist_final, K=args.neighbor_size, mu=args.mu
    )
    labels_final = spectral_clustering(Wall_final, n_clusters=10)
    score = v_measure_score(wall_label["label"].tolist(), labels_final)
    print("SNF2 for clustering union 832 samples NMI score:", score)
    return score
def test_spectral_clustering(self):
    N = 50
    m = np.random.random_integers(1, 200, size=(N, N))
    m = (m + m.T) / 2

    df = pdml.ModelFrame(m)
    result = df.cluster.spectral_clustering(random_state=self.random_state)
    expected = cluster.spectral_clustering(m, random_state=self.random_state)

    self.assertTrue(isinstance(result, pdml.ModelSeries))
    self.assert_index_equal(result.index, df.index)
    self.assert_numpy_array_equal(result.values, expected)
def cluster_financial_indexs(self, k):
    # fecth_indexs = FIR.fetch_selected_financial_indexs(indexs, self.dates)
    # date = '2017-12-31'
    date = dates[0]
    print('cluster is', date, 'alg is', self.alg)
    # k = 10
    X = self.fetch_factors.values
    if self.alg == 'kmean':
        km = KMeans(n_clusters=k, random_state=42)
        km.fit(X)
        labels = km.labels_
    elif self.alg == 'agglomerative':
        ward = AgglomerativeClustering(n_clusters=k, linkage='ward')
        ward.fit(X)
        labels = ward.labels_
    elif self.alg == 'DBSCAN':
        # Compute DBSCAN
        db = DBSCAN(eps=10, min_samples=10).fit(X)
        labels = db.labels_
    elif self.alg == 'spectral':
        labels = spectral_clustering(X, n_clusters=k, eigen_solver='arpack')
        # labels = db.labels_
    elif self.alg == 'birch':
        brc = Birch(threshold=50, branching_factor=50, n_clusters=300,
                    compute_labels=True)
        labels = brc.fit(X)
        labels = labels.labels_
    elif self.alg == 'affinity':
        # af = AffinityPropagation(affinity='precomputed').fit(X)
        af = AffinityPropagation(max_iter=500, affinity='euclidean').fit(X)
        labels = af.labels_
    else:
        print('not support this cluster')
        exit(-1)
    # labels = spectral_clustering(self.fetch_factors[date].values, n_clusters=k,
    #                              assign_labels='discretize', random_state=1)
    self.fetch_factors["Cluster"] = labels
    # self.fetch_factors[date]["Cluster"].sort(key='Cluster', reverse=False)
    self.fetch_factors = self.fetch_factors.sort_values("Cluster", axis=0, ascending=True)
    self.fetch_factors.to_csv(self.path_cluster.format(date))
    print('save folder is', self.path_cluster.format(date))
def affin_sclustering(X, n_clust, distance='euclid', gamma=0.1, std=1):
    print 'Basic spectral clustering using affinity matrix'
    if distance == 'cosine':
        similarity = cos(X)  # pairwise_distances(X, metric='cosine')
    elif distance == 'euclid':
        dist = euclidean_distances(X)
        if std:
            similarity = np.exp(-gamma * dist / dist.std())
        else:
            similarity = np.exp(-gamma * dist)
    labels = cluster.spectral_clustering(similarity, n_clusters=n_clust, eigen_solver='arpack')
    return labels
def cluster_spectral(X):
    similarity_matrix = compute_similarity_matrix(X)
    labels = spectral_clustering(similarity_matrix)
    classes = {idx: str(v) for idx, v in enumerate(labels)}
    graph = create_knn_graph(similarity_matrix, 8)
    # export clustered graph as json
    nx.set_node_attributes(graph, classes, 'group')
    graph_json = json_graph.node_link_data(graph)
    return list(labels), graph_json
def gen_codebook(graphs, W_matrix, group_num=16):
    m = len(W_matrix)
    res = spectral_clustering(W_matrix, n_clusters=group_num)
    group_res = []
    for i in range(group_num):
        group_res.append([])
    for i in range(m):
        group_res[res[i]].append(i)
    centers = processing_grouping(group_res, W_matrix)
    codebook = []
    for i in centers:
        codebook.append(graphs[i])
    return codebook
def clustering_preGraph(self):
    hardLabelDict, softLabelDict = self.getLabel()
    for key in self.edgeDict:
        groundTrues = hardLabelDict[key]
        clusterNum = 12
        A = self.edgeDict[key]
        nt = NetworkTool()
        nt.initNetwork(A, nodeIndexDict[key])
        X = self.initX(A)
        labels_ajen = spectral_clustering(X, n_clusters=clusterNum, eigen_solver='arpack')
        nmi_sc = self.NMI(labels_ajen.tolist(), groundTrues, clusterNum)
        print nmi_sc
        # counter = self.counter(labels, clusterNum)
        Y = self.refexFeature[key]
        pca = PCA(n_components=50, svd_solver='full')
        Y_50 = pca.fit_transform(Y)
        S = cosine_similarity(Y_50)
        S = (S + 1.0) / 2.0
        labels = spectral_clustering(S, n_clusters=clusterNum, eigen_solver='arpack')
        counter = self.counter(labels, clusterNum)
        nmi_sc = self.NMI(labels.tolist(), groundTrues, clusterNum)
        print nmi_sc
        # self.draw(nodeIndexDict[key], nt, labels, str(key)+'_spectral_'+str(clusterNum)+'.png')
        # self.output(nodeIndexDict[key], labels, str(key)+'_spectral')
        kmeans = KMeans(n_clusters=clusterNum, random_state=0).fit(Y)
        labels_km = kmeans.labels_.tolist()
        counter = self.counter(labels_km, clusterNum)
        nmi_km = self.NMI(labels_km, groundTrues, clusterNum)
        print nmi_km
def sp_clustering(img):
    graph = image.img_to_graph(img)
    # Take a decreasing function of the gradient: we take it weakly
    # dependent from the gradient the segmentation is close to a voronoi
    graph.data = np.exp(-graph.data / graph.data.std())
    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters=64, eigen_solver='arpack')
    plt.matshow(img)
    plt.matshow(labels)
def graph_cuts(fg_embed, edge_index, num_cg, bandwidth=1.0, kernel='rbf',
               device=torch.device(0)):
    affinity = compute_affinity(fg_embed, edge_index, bandwidth, kernel, device)
    pred_cg_idx = spectral_clustering(affinity.cpu().numpy(), n_clusters=num_cg,
                                      assign_labels='discretize')
    return pred_cg_idx, affinity
def _cluster_model(self, model_name, c):
    if model_name == 'KMeans':
        model = KMeans(n_clusters=c, init='k-means++')
    elif model_name == 'HAC':
        model = AgglomerativeClustering(n_clusters=c, affinity='euclidean', linkage='ward')
    elif model_name == 'Spectral':
        # estimator form: spectral_clustering() is a function, not a model object
        model = SpectralClustering(n_clusters=c)
    else:
        print("Options for models are KMeans, HAC or Spectral.")
        exit(-1)
    return model
def sp(data, class_num, data_nm, label):
    n_clusters = class_num
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    m = euclidean_distances(data, squared=True)
    # print(m)
    sigma = np.median(m)
    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'Spectral clustering', fontsize=20)
    clrs = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#FAEBD7', '#8B5A00', '#EEEE00', '#0000FF', '#ABABAB', '#8B8B00'
    ]
    # print(len(clrs))
    assess = []
    for i, s in enumerate(np.logspace(-2, 0, 6)):
        af = np.exp(-m**2 / (s**2)) + 1e-6
        y_hat = spectral_clustering(af, n_clusters=n_clusters,
                                    assign_labels='kmeans', random_state=1)
        # assess.append(y_hat)
        plt.subplot(2, 3, i + 1)
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0], data[cur, 1], s=40, color=clr, edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        plt.title(u'sigma = %.2f' % s, fontsize=16)
        # print(y_hat)
        print("NMI  accuracy  purity  silhouette  Rand index")
        nmi, acc, purity, Sc, ARI = evaluate.eva(y_hat, label, data)
        print(nmi, acc, purity, Sc, ARI)
    plt.tight_layout()
    plt.title("SC1+" + data_nm)
    plt.subplots_adjust(top=0.9)
    plt.savefig(
        '.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
def global_clustering_by_spectral(self):
    num_clusters = self.num_global_clusters
    X = self.build_global_feature_vectors_by_jaccard_with_weight()
    logging.info("Global spectral clustering...")
    spectral = spectral_clustering(X, n_clusters=num_clusters, eigen_solver='arpack')
    logging.info("Global spectral finished")
    self.global_clusters = [[[] for i in range(num_clusters)]
                            for j in range(self.num_time_slides)]
    self.global_cluster_labels = [[None for i in range(self.num_local_clusters)]
                                  for j in range(self.num_time_slides)]
    labels = spectral
    for time in range(self.num_time_slides):
        for i, cluster in enumerate(self.local_clusters[time]):
            l = labels[self.gloabl_feature_vectors_index[time][i]]
            self.global_clusters[time][l].append(i)
            self.global_cluster_labels[time][i] = l
def get_spectralClustering(similarity, cluster_num):
    """
    :param similarity: similarity matrix
    :param cluster_num: number of clusters (if it is 0, calculate by spectral clustering)
    :return: labels
    """
    similarity = pd.DataFrame(similarity)
    similarity = similarity.values
    similarity[np.isnan(similarity)] = 0
    labels = cl.spectral_clustering(affinity=similarity, n_clusters=cluster_num)
    return labels
def unify_communities_spectral_mean(params, GT):
    # This is the technique compared against by Han, Xu and Airoldi (ICML 2015),
    # who propose the variational profile MLE algorithm
    adj_matrix_summed = sp.sparse.csr_matrix(
        np.zeros((len(GT[0].nodes), len(GT[0].nodes))), dtype=int)
    for G in GT:
        adj_matrix_summed += nx.adjacency_matrix(G)
    spout = spectral_clustering(adj_matrix_summed, n_clusters=params['k']) + 1
    gfinal = {}
    for i in GT[0].nodes():
        gfinal[i] = spout[i - 1]
    return gfinal, {}
def clustering(mat, k, names, size=2):
    labels = spectral_clustering(mat, n_clusters=k)
    clusters = dict()
    for a, clu_id in enumerate(labels):
        clusters.setdefault(clu_id, set())
        clusters[clu_id].add(a)
    name_clusters = list()
    for c_id in clusters:
        cluster = clusters[c_id]
        name_cluster = [names[c] for c in cluster]
        if len(name_cluster) < size:
            continue
        name_clusters.append(name_cluster)
    return name_clusters
def clust(vectorfile, matrixfile, clusted):
    fid2fname = {}
    for line in open(vectorfile):
        line = line.strip().split('\t')
        fid2fname.setdefault(int(line[0]), line[1:])
    N = len(fid2fname)

    rowlist = []
    collist = []
    datalist = []
    for line in open(matrixfile):
        line = line.strip().split('\t')
        if len(line) < 3:
            continue
        f1, f2, sim = line[:3]
        rowlist.append(int(f1))
        collist.append(int(f2))
        datalist.append(float(sim))
    for id in fid2fname:
        rowlist.append(int(id))
        collist.append(int(id))
        datalist.append(1.0)

    row = np.array(rowlist)
    col = np.array(collist)
    data = np.array(datalist)
    graph = coo_matrix((data, (row, col)), shape=(N, N))

    ###########################################################################
    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters=550, eigen_solver='arpack')

    cluster2fid = {}
    for index, lab in enumerate(labels):
        cluster2fid.setdefault(lab, [])
        cluster2fid[lab].append(index)

    normal_data = open("normal-data.txt", 'w')
    easy_data = open("spectal_easy-data-550.txt", 'w')
    for index, lab in enumerate(cluster2fid):
        for fid in cluster2fid[lab]:
            strx = ""
            for i in range(0, len(fid2fname[fid])):
                strx += str(fid2fname[fid][i]) + "\t"
            print >> normal_data, strx + '\t' + str(index)
            print >> easy_data, strx + '\t' + str(fid) + '\t' + str(index)
def LSTClustering(self):
    # Based on the "Segmenting the picture of greek coins in regions" example,
    # Author: Gael Varoquaux <*****@*****.**>, Brian Cheung
    # License: BSD 3 clause
    orig_coins = self.LST
    # these were introduced in skimage-0.14
    if LooseVersion(skimage.__version__) >= '0.14':
        rescale_params = {'anti_aliasing': False, 'multichannel': False}
    else:
        rescale_params = {}
    smoothened_coins = gaussian_filter(orig_coins, sigma=2)
    rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect", **rescale_params)

    # Convert the image into a graph with the value of the gradient on the
    # edges.
    graph = image.img_to_graph(rescaled_coins)

    # Take a decreasing function of the gradient: an exponential
    # The smaller beta is, the more independent the segmentation is of the
    # actual image. For beta=1, the segmentation is close to a voronoi
    beta = 10
    eps = 1e-6
    graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps

    # Apply spectral clustering (this step goes much faster if you have pyamg
    # installed)
    N_REGIONS = 200

    for assign_labels in ('discretize', ):
        # for assign_labels in ('kmeans', 'discretize'):
        t0 = time.time()
        labels = spectral_clustering(graph, n_clusters=N_REGIONS,
                                     assign_labels=assign_labels, random_state=42)
        t1 = time.time()
        labels = labels.reshape(rescaled_coins.shape)

        plt.figure(figsize=(5 * 3, 5 * 3))
        plt.imshow(rescaled_coins, cmap=plt.cm.gray)
        for l in range(N_REGIONS):
            plt.contour(labels == l,
                        colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))])
        plt.xticks(())
        plt.yticks(())
        title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0))
        print(title)
        plt.title(title)
    plt.show()
def consensus(clusterings, nclus, weights=None, method='hier', refclus=None):
    """
    Consensus by clustering of the pairings matrix, using hierarchical or
    spectral clustering

    Parameters
    ----------
    clusterings : ndarray
        ndata x nreal array of cluster realizations
    nclus : int
        the number of clusters to generate
    weights : ndarray
        nreal-long array of weights for each clustering
    method : str
        clustering method for the pairings matrix, either `hier` or `spec`
    refclus : ndarray
        A reference clustering for this dataset that the target will be
        recoded to

    Returns
    -------
    final_clusterings : ndarray
        1D array of final cluster labels given the passed parameters
    clusterprobs : ndarray
        ndata x nclus array of likelihood to be in each cluster
    pairings : ndarray
        ndata x ndata pairings (co-association) matrix
    """
    from sklearn.cluster import spectral_clustering
    try:
        clusterings = clusterings.clusterings
    except AttributeError:
        pass
    pairings = pairings_matrix(clusterings, weights)
    # use the selected nd x nd matrix clustering method
    if method == 'hier':
        final_clusters = hierarchical_clustering(pairings, nclus, method='ward')
    else:
        final_clusters = spectral_clustering(pairings, n_clusters=nclus)
    if refclus is not None:
        final_clusters, _ = reclass_clusters(refclus, final_clusters)
        final_ensemble, _ = reclass_clusters(refclus, clusterings)
        # if a reference clustering is passed also recode the passed ensemble
        for i in range(final_ensemble.shape[1]):
            clusterings[:, i] = final_ensemble[:, i]
    clusterprobs = cluster_probability_bycount(final_clusters, clusterings)
    return final_clusters, clusterprobs, pairings
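# A minimal usage sketch for consensus() above (illustrative, not from the original source):
# it assumes the module's helpers (pairings_matrix, cluster_probability_bycount, etc.) are in
# scope and feeds a fake ensemble of label realizations.
import numpy as np

rng = np.random.RandomState(0)
clusterings = rng.randint(0, 3, size=(200, 25))   # 200 samples x 25 clustering realizations

final, probs, pairings = consensus(clusterings, nclus=3, method='spec')
print(final.shape, probs.shape, pairings.shape)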
def _spectral_clustering(self, samples):
    if sp_version < (0, 12):
        raise SkipTest("Skipping because SciPy version earlier than 0.12.0 and "
                       "thus does not include the scipy.misc.face() image.")

    # Convert the image into a graph with the value of the gradient on the
    # edges.
    graph = image.img_to_graph(samples)

    # Take a decreasing function of the gradient: an exponential
    # The smaller beta is, the more independent the segmentation is of the
    # actual image. For beta=1, the segmentation is close to a voronoi
    beta = 5
    eps = 1e-6
    graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps

    # Apply spectral clustering (this step goes much faster if you have pyamg
    # installed)
    N_REGIONS = 4

    ###########################################################################
    # Visualize the resulting regions
    for assign_labels in ('kmeans', 'discretize'):
        t0 = time.time()
        labels = spectral_clustering(graph, n_clusters=N_REGIONS,
                                     assign_labels=assign_labels, random_state=1)
        sample = pd.DataFrame(labels)
        sample.to_csv(os.path.join(OUTPUT_DIR, "spectral_result.csv"), sep=",")
        t1 = time.time()
        # classif = labels.fit(samples)
        # print classif
        print labels
        print sample
        labels = labels.reshape(samples.shape)

        plt.figure(figsize=(5, 5))
        plt.imshow(samples, cmap=plt.cm.gray)
        for l in range(N_REGIONS):
            plt.contour(labels == l, contours=1,
                        colors=[plt.cm.spectral(l / float(N_REGIONS))])
        plt.xticks(())
        plt.yticks(())
        title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0))
        print(title)
        plt.title(title)
    plt.show()
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    clustering the tag/terms and return the cluster ids for each tag
    :param alpha: parameter to combine visual and textual similarity matrix
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag
    """
    vis_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_textual_similarity_matrix.pkl")

    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # start spectral clustering: obtain cluster IDs for each word
    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat, n_clusters=num_clusters,
                                      eigen_solver='arpack')
    print("Done...")

    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # For the first 10 clusters
    for cluster in range(0, num_clusters):
        # print the cluster number
        print("\nCluster %d" % cluster)
        # Find all of the words for that cluster number, and print them out
        r_words = []
        for i in range(0, len(word_centroid_map.values())):
            if (word_centroid_map.values()[i] == cluster):
                r_words.append(word_centroid_map.keys()[i])
        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents, "Corel5k/cluster_contents.pkl")
    return cluster_ids
def unify_communities_CM(ghats, k):
    Qs = {}
    QQtotal = np.zeros((len(ghats[0]), len(ghats[0])))
    for idx in range(len(ghats)):
        Qs[idx] = np.zeros((len(ghats[idx]), k))
        for i, x in enumerate(ghats[idx]):
            Qs[idx][i, ghats[idx][x] - 1] = 1
        QQtotal += np.dot(Qs[idx], Qs[idx].transpose())
    spout = spectral_clustering(QQtotal, n_clusters=k) + 1
    gfinal = {}
    for i in ghats[0]:
        gfinal[i] = spout[i - 1]
    return gfinal
def clustering():
    cosMatrix_mat = sio.loadmat(
        '../data/result/cosMatrix.mat',
        struct_as_record=False, squeeze_me=True)['cosMatrix']
    userMatrix_mat = getFriendsMatrix()
    combinedMatrix_mat = userMatrix_mat + cosMatrix_mat
    clusterNumber = range(50, 60)
    sims = []
    for c in clusterNumber:
        labels = spectral_clustering(
            combinedMatrix_mat, n_clusters=c, eigen_solver='arpack')
        sim = clusterSimilarity(combinedMatrix_mat, labels, c)
        sims.append(sim)
        print "{} cluster: average simi={}".format(c, sim)
    print sims
def main():
    src_path = os.path.join(os.getcwd(), 'ratings.csv')
    res_path = os.path.join(os.getcwd(), 'preRatings.csv')
    predicted_data = pd.read_csv(res_path, header=0, index_col=0)
    int_col = []
    for col in predicted_data.columns:
        icol = int(col)
        int_col.append(icol)
    predicted_data.columns = int_col
    movie_rated_num = pd.Series(index=predicted_data.columns)
    for i in predicted_data.columns.values:
        movie_rated_num[i] = predicted_data[i].dropna().count()
    movie_rated_num.sort()
    cuted_data = predicted_data.loc[:, movie_rated_num[8500:].index]
    print cuted_data.shape
    data_matrix = cuted_data.fillna(0).values
    for i in range(0, len(data_matrix)):
        for j in range(0, len(data_matrix[i])):
            if data_matrix[i][j] > 3.5:
                data_matrix[i][j] = 2
            elif data_matrix[i][j] < 2.5:
                data_matrix[i][j] = 0
            else:
                data_matrix[i][j] = 1
    E_matrix = cs.getEMatrix(data_matrix)
    labels = spectral_clustering(E_matrix, n_clusters=20)
    print labels
    '''
    init_data = pd.read_csv(src_path, header=0, index_col=0)
    # Cause the type of columns that read in csv is str, we need to convert it into int
    int_col = [int(col) for col in init_data.columns]
    init_data.columns = int_col
    init_data = init_data.loc[predicted_data.index, predicted_data.columns]
    init_data_matrix = init_data.fillna(0).values
    '''
    columns = ['userID', 'movieID', 'rating', 'timestamp']
    ratings = pd.read_csv(src_path, header=1, names=columns)
    data = ratings.pivot(index='userID', columns='movieID', values='rating')
    init_data = data.loc[cuted_data.index, cuted_data.columns]
    init_data_matrix = init_data.fillna(0).values
    dp.drawPicture(init_data_matrix, labels)
def cluster_and_compare(n_clusters, data, labels_true):
    print(75 * '-')
    print('cluster\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI')

    kmeans_cluster = KMeans(n_clusters=n_clusters)
    kmeans_labels = evaluate_clustering(kmeans_cluster, "kmeans", data, labels_true)

    start_time = time()
    graph = cosine_similarity(data)
    spectral_labels = spectral_clustering(graph, n_clusters=n_clusters)
    execution_time = time() - start_time
    evaluate_with_predited_labels(labels_true, spectral_labels, execution_time, "spectral")

    dbscan_cluster = DBSCAN(eps=0.0595, min_samples=10, metric='cosine')
    dbscan_labels = evaluate_clustering(dbscan_cluster, "DBSCAN", data, labels_true)

    agg_cluster = AgglomerativeClustering(n_clusters=n_clusters)
    agg_labels = evaluate_clustering(agg_cluster, "Agglomerative", data, labels_true)

    print(75 * '-')

    pca_converter = PCA(n_components=2)
    data = pca_converter.fit_transform(data)

    plt.figure()
    plt.title('True labels')
    plt.scatter(data[:, 0], data[:, 1], c=labels_true)

    plt.figure()
    plt.title('Kmeans labels')
    plt.scatter(data[:, 0], data[:, 1], c=kmeans_labels)

    plt.figure()
    plt.title('Spectral labels')
    plt.scatter(data[:, 0], data[:, 1], c=spectral_labels)

    plt.figure()
    plt.title('DBSCAN labels')
    plt.scatter(data[:, 0], data[:, 1], c=dbscan_labels)

    plt.figure()
    plt.title('Agglomerative labels')
    plt.scatter(data[:, 0], data[:, 1], c=agg_labels)

    plt.show()
def spectral_clustering(G, n_clusters=8, node_map=[], no_conversion=False,
                        simple_conversion=False):
    """
    Cluster the given similarity matrix using spectral clustering.

    Assumes the given similarity network is connected.

    Args:
        G (ig.Graph) - the input network
        n_clusters (int) - number of clusters to look for

    Returns:
        clusters (list) - a list of lists of nodes, each sublist represents
        a cluster
    """
    # generate a numpy distance matrix from the given graph
    mat = G.get_adjacency(attribute='weight')
    dist_matrix = np.array(mat.data)

    if no_conversion:
        sim_matrix = dist_matrix
    elif simple_conversion:
        # take simple inverse to get similarity from distance
        sim_fn = np.vectorize(lambda x: 0 if x == 0 else 1 / float(x),
                              otypes=[np.float])
        sim_matrix = sim_fn(dist_matrix)
    else:
        # apply RBF kernel to generate similarity matrix from distance
        # matrix (i.e. lower DSD => higher similarity)
        std_dev = dist_matrix.std()
        sim_fn = np.vectorize(
            lambda x: 0 if x == 0 else np.exp(-(x) / (2 * (std_dev) ** 2)),
            otypes=[np.float])
        sim_matrix = sim_fn(dist_matrix)

    # now do the clustering, scikit-learn implements this
    # return a list of lists representing the clusters
    node_assignments = list(sc.spectral_clustering(sim_matrix, n_clusters))
    clusters = []
    for n in xrange(n_clusters):
        clusters.append([i for i, m in enumerate(node_assignments) if m == n])
    if node_map:
        return [[node_map[n] for n in cl] for cl in clusters]
    else:
        return clusters
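# A minimal usage sketch for the spectral_clustering() wrapper above (illustrative assumptions,
# not from the original source): requires python-igraph, and the tiny weighted graph below is
# made up, with edge weights treated as distances as the wrapper expects.
import igraph as ig

edges = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]
G = ig.Graph(edges)
G.es['weight'] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 5.0]   # last edge is the "long" bridge

clusters = spectral_clustering(G, n_clusters=2, simple_conversion=True)
print(clusters)   # expected: the two triangles end up in separate clusters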
def get_Finit(self, seed):
    """
    initialize factors A, B, C
    :param seed:
    :return:
    """
    agg_network = self.aggregated_network_matrix()
    # A_init = sparse.dok_matrix((len(self.node_ids), len(self.node_ids)), dtype=np.float32)
    A_init = np.zeros((len(self.node_ids), self.num_of_coms))
    clusters = spectral_clustering(agg_network, n_clusters=self.num_of_coms,
                                   n_init=10, eigen_solver='arpack',
                                   random_state=seed)
    for i, t in enumerate(clusters):
        A_init[i, t] = 1
    B_init = deepcopy(A_init)
    C_init = np.random.rand(self.tensor.shape[2], self.num_of_coms)
    Finit = [A_init, B_init, C_init]
    return Finit
def clustering_useFeature(self, f_list):
    i = 0
    hardLabelDict, softLabelDict = self.getLabel()
    for key in self.edgeDict:
        groundTrues = hardLabelDict[key]
        clusterNum = 12
        A = self.edgeDict[key]
        nt = NetworkTool()
        nt.initNetwork(A, nodeIndexDict[key])
        F = f_list[i]
        # X = self.initX(A)
        # labels = spectral_clustering(A, n_clusters=clusterNum, eigen_solver='arpack')
        # counter = self.counter(labels, clusterNum)
        S = cosine_similarity(F)
        S = (S + 1.0) / 2.0
        labels = spectral_clustering(S, n_clusters=clusterNum, eigen_solver='arpack')
        counter = self.counter(labels, clusterNum)
        nmi_sc = self.NMI(labels.tolist(), groundTrues, clusterNum)
        print nmi_sc
        # self.draw(nodeIndexDict[key], nt, labels, str(key)+'_spectral_'+str(clusterNum)+'.png')
        # self.output(nodeIndexDict[key], labels, str(key)+'_spectral')
        kmeans = KMeans(n_clusters=clusterNum, random_state=0).fit(F)
        labels_km = kmeans.labels_.tolist()
        counter = self.counter(labels_km, clusterNum)
        nmi_km = self.NMI(labels_km, groundTrues, clusterNum)
        # nmi_km_sk = normalized_mutual_info_score(groundTrues, labels_km)
        print nmi_km
        pca = PCA(n_components=clusterNum, svd_solver='full')
        F_pca = pca.fit_transform(F)
        kmeans = KMeans(n_clusters=clusterNum, random_state=0).fit(F_pca)
        labels_km = kmeans.labels_.tolist()
        counter = self.counter(labels_km, clusterNum)
        nmi_km = self.NMI(labels_km, groundTrues, clusterNum)
        # nmi_km_sk = normalized_mutual_info_score(groundTrues, labels_km)
        print nmi_km
        # self.draw(nodeIndexDict[key], nt, labels_km, str(key)+'_kmean_'+str(clusterNum)+'.png')
        # self.output(nodeIndexDict[key], labels_km, str(key)+'_kmean')
        i += 1
def test_spectral_lobpcg_mode():
    # Test the lobpcg mode of SpectralClustering
    # We need a fairly big data matrix, as lobpcg does not work with
    # small data matrices
    centers = np.array([
        [0., 0.],
        [10., 10.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    labels = spectral_clustering(S, n_clusters=len(centers),
                                 random_state=0, eigen_solver="lobpcg")
    # We don't care too much that it's good, just that it *worked*.
    # There does have to be some lower limit on the performance though.
    assert_greater(np.mean(labels == true_labels), .3)
def Discretize_Clustering(twoDimg, N_REGIONS):
    """Segment a 2-D image into N_REGIONS with spectral clustering ('discretize' assignment)."""
    graph = imp.img_to_graph(twoDimg)
    beta = 1
    eps = 1e-1
    graph.data = np.exp(-beta * graph.data / twoDimg.std()) + eps
    t0 = time.time()
    labels = spectral_clustering(graph, n_clusters=N_REGIONS,
                                 assign_labels='discretize', random_state=1)
    t1 = time.time()
    labels = labels.reshape(twoDimg.shape)
    print('time taken', t1 - t0)
    return labels, N_REGIONS
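# A quick usage sketch for Discretize_Clustering above (illustrative, not from the original
# source): assumes `imp` is bound to sklearn.feature_extraction.image as in the snippet and
# uses a small synthetic two-blob image.
import numpy as np

x, y = np.indices((40, 40))
img = ((x - 12) ** 2 + (y - 12) ** 2 < 64).astype(float) \
    + ((x - 28) ** 2 + (y - 28) ** 2 < 49).astype(float)

labels, n_regions = Discretize_Clustering(img, N_REGIONS=3)
print(labels.shape, n_regions)   # (40, 40) 3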
def dist_matrix(axis, clusters):
    # if axis:
    #     re_shards = shards[:, non_nilled]
    # else:
    #     re_shards = shards
    # print re_shards.shape
    # c_list = np.split(re_shards, re_shards.shape[axis], axis)
    # accumulator_matrix = np.zeros((re_shards.shape[axis], re_shards.shape[axis]))
    # est_len = re_shards.shape[axis]*(re_shards.shape[axis]-1)/2
    # for i, (i_a, i_b) in enumerate(combinations(range(0, re_shards.shape[axis]), 2)):
    #     if not i%100:
    #         pl = "{0:0.2f}".format(i/float(est_len)*100.0)
    #         print pl, '%'
    #     a = c_list[i_a]
    #     b = c_list[i_b]
    #     dist = distance(a, b)
    #     accumulator_matrix[i_a, i_b] = dist
    #     accumulator_matrix[i_b, i_a] = dist
    #
    # dump(accumulator_matrix, open('loc_dump.dmp', 'w'))

    ##########################################################################

    pre_accumulator_matrix = load(open('loc_dump.dmp', 'r'))
    accumulator_matrix = np.exp(
        - pre_accumulator_matrix * pre_accumulator_matrix / pre_accumulator_matrix.std())

    plt.imshow(accumulator_matrix, interpolation='nearest')
    plt.show()

    vals, vects = eigh(accumulator_matrix)
    plt.hist(vals, 1000, log=True)
    vals[vals**2 < 0.3] = 0
    print vals
    # accumulator_matrix = np.dot(vects, np.dot(np.diag(vals), vects.T))
    plt.show()

    labels = spectral_clustering(accumulator_matrix, n_clusters=clusters,
                                 eigen_solver='arpack')
    print labels

    stable_mappings = crible(10, labels, non_nilled)
    print 'stable mappings redundancy:', len(stable_mappings), len(set(stable_mappings))

    srt_idx = hierchical_clustering(accumulator_matrix, labels)
    dump((stable_mappings, accumulator_matrix, srt_idx, non_nilled),
         open('loc_dump2.dmp', 'w'))
def SpectralClusterImage(input_image, beta=5, eps=1e-6, n_regions=11,
                         assign_labels='discretize', downsample_factor=np.NaN,
                         order=3):
    """
    Spectral cluster an image

    Inputs:
        input_image: ndarray of image
        beta: Take a decreasing function of the gradient: an exponential.
            The smaller beta is, the more independent the segmentation is of
            the actual image. For beta=1, the segmentation is close to a
            voronoi. Default is 5.
        eps: error term. Default is 1E-6
        n_regions: number of regions to decompose into. Default is 11.
        assign_labels: ways of decomposition. Selecting from 'discretize' and
            'kmeans'. Default is 'discretize'.
        downsample_factor: downsampling before spectral decomposition. Default
            is to keep the original sampling. Enter a single number to apply
            the kernel to both dimensions of the image, or enter a sequence to
            apply a different kernel to each dimension.
        order: downsampling method, order of B-spline interpolation
    """
    # Downsample the image (keep the result; zoom() is not in-place)
    if not np.isnan(downsample_factor):
        input_image = zoom(input_image, zoom=downsample_factor, order=order)

    # Convert the image into a graph with the value of the gradient on the edges
    graph = image.img_to_graph(input_image)

    # Take a decreasing function of the gradient: an exponential
    # The smaller beta is, the more independent the segmentation is of the
    # actual image. For beta=1, the segmentation is close to a voronoi
    graph.data = np.exp(-beta * graph.data / input_image.std()) + eps

    # Apply spectral clustering (this step goes much faster if you have pyamg
    # installed)
    labels = spectral_clustering(graph, n_clusters=n_regions,
                                 assign_labels=assign_labels)
    labels = labels.reshape(input_image.shape)

    # Visualizing the resulting regions
    pl.figure(figsize=(5, 5))
    pl.imshow(input_image, cmap=pl.cm.gray)
    for lb in range(n_regions):
        pl.contour(labels == lb, contour=1,
                   color=[pl.cm.spectral(lb / float(n_regions)), ])
    # Get rid of x, y tick marks
    pl.xticks(())
    pl.yticks(())
def spectral_cluster(G, node_list):
    # G is a similarity matrix
    S = nx.to_scipy_sparse_matrix(G, nodelist=node_list)
    previous_sum_cut = 0
    previous_cluster_node = {}
    previous_cluster_label = {}
    for i in range(2, 100):
        labels = spectral_clustering(S, n_clusters=i)
        labels = labels.tolist()
        # print(labels)
        result_cluster_node = dict(zip(node_list, labels))
        result_cluster_label = {}
        for k in result_cluster_node:
            v = result_cluster_node[k]
            if v in result_cluster_label:
                result_cluster_label.get(v).add(k)
            else:
                result_cluster_label[v] = {k}
        # print(result_cluster_label)
        sum_cut = 0
        for k in result_cluster_label:
            cut_k = 0
            vol_k = 0
            v = result_cluster_label[k]
            for nk in v:
                set_not_k = set(node_list).difference(v)
                vol_k += csr_matrix.sum(S.getcol(node_list.index(nk)))
                # print(nk, S.getcol(cited_list.index(nk)).toarray().tolist())
                for notk in set_not_k:
                    cut_k += G.get_edge_data(nk, notk, default={"weight": 0})["weight"]
            # print(cut_k, vol_k)
            sum_cut += (cut_k / vol_k)
        if sum_cut > previous_sum_cut != 0 or i == 99:
            print(i, sum_cut, result_cluster_label)
            return {"result_by_node": previous_cluster_node,
                    "result_by_cluster": previous_cluster_label}
            break
        else:
            previous_cluster_node = result_cluster_node
            previous_cluster_label = result_cluster_label
            previous_sum_cut = sum_cut
def defficient_spectral_clustring(name_dict, z_depth, shape_dict):
    # requires re-implementing the distance definition on sparse images.
    z_stack = next(name_dict.itervalues())
    _3D_chan1 = np.zeros((shape_dict[1][0], shape_dict[1][1], z_depth))
    _3D_chan2 = np.zeros((shape_dict[2][0], shape_dict[2][1], z_depth))
    for depth, bi_image in z_stack.iteritems():
        img1 = bi_image[1]
        img2 = bi_image[2]
        _3D_chan1[:, :, depth - 1] = img1
        _3D_chan2[:, :, depth - 1] = img2
    # mlab.pipeline.volume(mlab.pipeline.scalar_field(_3D_chan1), vmin=0.1)
    # mlab.show()
    _3D_chan2[_3D_chan2 < 0.04] = 0
    mlab.pipeline.volume(mlab.pipeline.scalar_field(_3D_chan2))
    mlab.show()
    mask = _3D_chan2.astype(bool)
    img = _3D_chan2.astype(float)
    graph = image.img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())
    print graph.shape
    print len(graph.nonzero()[0])
    clusters = 4
    labels = spectral_clustering(graph, n_clusters=clusters, eigen_solver='arpack')
    label_im = -np.ones(mask.shape)
    label_im[mask] = labels
    for i in range(0, clusters):
        re_img = copy(_3D_chan2)
        re_img[label_im != i] = 0
        mlab.pipeline.volume(mlab.pipeline.scalar_field(re_img))
        mlab.show()