def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array(
        [
            (0.014, 0.120),
            (0.014, 0.099),
            (0.014, 0.097),
            (0.017, 0.153),
            (0.017, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.152),
            (0.018, 0.149),
            (0.018, 0.144),
        ]
    )
    nn = NearestNeighbors(n_neighbors=10).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
def hieclu(data_matrix, k):
    # use Hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=k)
    ac.fit(data_matrix)
    # Note: fit_predict refits the model, so the fit() call above is redundant
    result = ac.fit_predict(data_matrix)
    return result
def hierarchicalClustering(x, k):
    model = Ward(n_clusters=k)
    labels = model.fit_predict(np.asarray(x))
    # Centroids is a list of lists
    centroids = []
    for c in range(k):
        base = []
        for d in range(len(x[0])):
            base.append(0)
        centroids.append(base)
    # Stores number of examples per cluster
    ctrs = np.zeros(k)
    # Sum up all vectors for each cluster
    for c in range(len(x)):
        centDex = labels[c]
        for d in range(len(centroids[centDex])):
            centroids[centDex][d] += x[c][d]
        ctrs[centDex] += 1
    # Average the vectors in each cluster to get the centroids
    for c in range(len(centroids)):
        for d in range(len(centroids[c])):
            centroids[c][d] = centroids[c][d] / ctrs[c]
    return (centroids, labels)
def agglomerate(self, nodes, edges, clusters):
    if len(nodes) != len(clusters):
        print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(clusters)))
    neighbors = {}
    for edge in edges:
        if edge[0] in neighbors:
            neighbors[edge[0]].append(edge[1])
        else:
            neighbors[edge[0]] = [edge[1]]
    node_clusters = {}  # node: its cluster id
    communities = {}    # cluster id: all neighbors for its members
    for i in range(len(nodes)):
        if clusters[i] in communities:
            communities[clusters[i]].extend(neighbors[nodes[i]])
        else:
            communities[clusters[i]] = neighbors[nodes[i]]
        node_clusters[nodes[i]] = clusters[i]
    N = len(communities)
    affinity_matrix = sp.zeros([N, N])
    for comm in communities:
        members = [node_clusters[node] for node in communities[comm]]
        degree = dict(Counter(members))
        for key in degree:
            affinity_matrix[comm, key] = degree[key]
    ward = Ward(n_clusters=6)
    predicts = ward.fit_predict(affinity_matrix)
    return [predicts[node_clusters[node]] for node in nodes]
def constraint(self, nodes, edges, labels):
    if len(nodes) != len(labels):
        print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(labels)))
    N = len(nodes)
    circles = {}
    guidance_matrix = sp.zeros([N, N])
    # guidance_matrix = {}
    for i in range(len(nodes)):
        if labels[i] in circles:
            circles[labels[i]].append(nodes[i])
        else:
            circles[labels[i]] = [nodes[i]]
    for key in circles.keys():
        print(key, len(circles[key]))
    c = 36
    for ni in circles[c]:
        i = nodes.index(ni)
        for nj in circles[c]:
            j = nodes.index(nj)
            guidance_matrix[i, j] = 1.0
    guidance_matrix = sparse.lil_matrix(guidance_matrix)
    # pos = sum(x > 0 for x in guidance_matrix)
    print(guidance_matrix)
    ward = Ward(n_clusters=6, n_components=2, connectivity=guidance_matrix)
    predicts = ward.fit_predict(self.A)
    print(predicts)
def buildFromImageCollectionWard(self, pathTxtFile, pathDirImages,
                                 fileImageExtension, vocabularySize,
                                 maxNumImages=sys.maxint):
    # vocabularySize could be 4096
    # Read the image IDs
    imageIds = self.readImageIdsFromTxtFile(pathTxtFile)
    # If there are more images than the considered ones...
    if (len(imageIds) > maxNumImages):
        imageIds = random.sample(imageIds, maxNumImages)
    # Extract the SURF descriptors from a collection of images and save in a dictionary
    surfExtractor = SurfExtractor(True)
    surfExtractor.processCollectionFilesImage(imageIds, pathDirImages,
                                              fileImageExtension)
    # Create a numpy array from the descriptors
    descriptors = surfExtractor.getDescriptors()
    arr_descriptor = np.vstack(tuple(descriptors))
    #self.mbk = MiniBatchKMeans(init='k-means++',
    #                           k=vocabularySize,
    #                           n_init=10,
    #                           max_no_improvement=10,
    #                           verbose=0)
    self.ward = Ward(n_clusters=vocabularySize)
    self.ward.fit(arr_descriptor)
def compute_clusters(dataset, features_vector):
    """ Apply clustering method """
    labels = dataset.target
    true_k = np.unique(labels).shape[0]
    # Run clustering method
    print "Performing clustering with method ", cmd_options.clust_method.upper()
    print
    if (cmd_options.clust_method == "hclust"):
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result)
        return ward
    if (cmd_options.clust_method == "kmeans"):
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000,
                    verbose=1)
        km.fit(features_vector)
        return km
def hierarchicalClustering(x, k):
    model = Ward(n_clusters=k)
    labels = model.fit_predict(np.asarray(x))
    # Centroids is a list of lists
    centroids = []
    for c in range(k):
        base = []
        for d in range(len(x[0])):
            base.append(0)
        centroids.append(base)
    # Stores number of examples per cluster
    ctrs = np.zeros(k)
    # Sum up all vectors for each cluster
    for c in range(len(x)):
        centDex = labels[c]
        for d in range(len(centroids[centDex])):
            centroids[centDex][d] += x[c][d]
        ctrs[centDex] += 1
    # Average the vectors in each cluster to get the centroids
    for c in range(len(centroids)):
        for d in range(len(centroids[c])):
            centroids[c][d] = centroids[c][d] / ctrs[c]
    return (centroids, labels)
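# Editor's note: the centroid accumulation in the two functions above can be
# expressed with NumPy reductions instead of nested Python loops. The sketch
# below is an illustrative alternative, not part of the original snippets; it
# assumes x is convertible to a 2-D float array and that every cluster
# receives at least one point.
def hierarchicalClusteringVectorized(x, k):
    x = np.asarray(x, dtype=float)
    labels = Ward(n_clusters=k).fit_predict(x)
    # Mean of the members of each cluster, selected with a boolean mask
    centroids = np.array([x[labels == c].mean(axis=0) for c in range(k)])
    return centroids, labels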
def __hieclu(self):
    # use Hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=self.k)
    ac.fit(self.data_matrix)
    # Note: fit_predict refits the model, so the fit() call above is redundant
    result = ac.fit_predict(self.data_matrix)
    return result
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array([
        (.014, .120), (.014, .099), (.014, .097),
        (.017, .153), (.017, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .152), (.018, .149), (.018, .144),
    ])
    nn = NearestNeighbors(n_neighbors=10, warn_on_equidistant=False).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
def __hieclu(self):
    # use Hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=self.k)
    ac.fit(self.data_matrix)
    # Note: fit_predict refits the model, so the fit() call above is redundant
    result = ac.fit_predict(self.data_matrix)
    return result
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import kneighbors_graph

    X = np.array([
        (.014, .120), (.014, .099), (.014, .097),
        (.017, .153), (.017, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .152), (.018, .149), (.018, .144),
    ])
    connectivity = kneighbors_graph(X, 10)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    np.random.seed(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = np.random.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
def test_connectivity_fixing_non_lil():
    """
    Check non-regression of a bug if a non item assignable connectivity is
    provided with more than one component.
    """
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = Ward(connectivity=c)
    w.fit(x)
def cluster_ward(classif_data, vect_data):
    ward = Ward(n_clusters=10)
    np_arr_train = np.array(vect_data["train_vect"])
    np_arr_label = np.array(classif_data["topics"])
    np_arr_test = np.array(vect_data["test_vect"])
    labels = ward.fit_predict(np_arr_train)
    print "Ward"
    sil_score = metrics.silhouette_score(np_arr_train, labels,
                                         metric='euclidean')
    print sil_score
    return labels
def get_km_segments(x, image, sps, n_segments=25):
    if len(x) == 2:
        feats, edges = x
    else:
        feats, edges, _ = x
    colors_ = get_colors(image, sps)
    centers = get_centers(sps)
    n_spixel = len(feats)
    graph = sparse.coo_matrix((np.ones(edges.shape[0]), edges.T),
                              shape=(n_spixel, n_spixel))
    ward = Ward(n_clusters=n_segments, connectivity=graph + graph.T)
    # km = KMeans(n_clusters=n_segments)
    color_feats = np.hstack([colors_, centers * 0.5])
    # return km.fit_predict(color_feats)
    return ward.fit_predict(color_feats)
def spectral_cluster(data, n_clusters, method='sl'):
    # Get the Laplacian matrix
    if method == 'NJW':
        lap_matrix = get_lap_matrix_njw(data, 0.1)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
    elif method == 'self-tuning':
        lap_matrix = get_lap_matrix_self_tuning(data)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
    else:
        lap_matrix = get_lap_matrix_sl(data, 0.1)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
    #print(eigenvalues)

    # Take the first n_clusters eigenvectors
    x_matrix = eigenvectors[:, 0:n_clusters]
    # Normalize the eigenvector matrix
    y_matrix = normal_eigen(x_matrix)

    # Call the hand-written k_means function
    """
    k_dist_dic, k_centers_dic, cluster_group = kmeans.k_means(y_matrix, n_clusters)
    mat_plot_cluster_sample(data, cluster_group, method)
    """
    # Call the hand-written bi_k_means function
    """
    center_list, cluster_assign = bikmeans.exe_bi_k_means(y_matrix, n_clusters)
    labels = cluster_assign[:, 0]
    mat_plot_cluster_sample(data, labels, method)

    # Call sklearn's KMeans; it performs much better than the hand-written one
    k_means = KMeans(n_clusters)
    k_means.fit(y_matrix)
    #k_centers = k_means.cluster_centers_
    #mat_plot_cluster_sample(data, k_means.labels_, method)
    """

    # Use sklearn's hierarchical clustering (Ward) for the final clustering
    hie_cluster = Ward(n_clusters)
    hie_cluster.fit(y_matrix)
    mat_plot_cluster_sample(data, hie_cluster.labels_, method)
def get_km_segments(x, image, sps, n_segments=25):
    if len(x) == 2:
        feats, edges = x
    else:
        feats, edges, _ = x
    colors_ = get_colors(image, sps)
    centers = get_centers(sps)
    n_spixel = len(feats)
    graph = sparse.coo_matrix((np.ones(edges.shape[0]), edges.T),
                              shape=(n_spixel, n_spixel))
    ward = Ward(n_clusters=n_segments, connectivity=graph + graph.T)
    #km = KMeans(n_clusters=n_segments)
    color_feats = np.hstack([colors_, centers * .5])
    #return km.fit_predict(color_feats)
    return ward.fit_predict(color_feats)
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    ##########################################################################
    # Extract Y true
    labels_true = y_true

    ##########################################################################
    # transform distance matrix into a similarity matrix
    S = 1 - D

    ##########################################################################
    # compute clustering
    #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S)
    db = Ward(n_clusters=n_clusters).fit(S)
    #core_samples = db.core_sample_indices_
    labels = db.labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print 'Number of clusters: %d' % n_clusters_
    print 'Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels)
    print 'Completeness: %0.3f' % metrics.completeness_score(labels_true, labels)
    print 'V-measure: %0.3f' % metrics.v_measure_score(labels_true, labels)
    print 'Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels)
    print 'Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels)
    print 'Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed')
def spect_clust_segmentation(lena, regions=20):
    X = np.reshape(lena, (-1, 1))
    connectivity = grid_to_graph(*lena.shape)
    print("Compute structured hierarchical clustering...")
    st = time.time()
    n_clusters = regions
    ward = Ward(n_clusters=n_clusters, connectivity=connectivity).fit(X)
    label = np.reshape(ward.labels_, lena.shape)
    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)
    plt.imshow(lena, cmap=plt.cm.gray)
    for l in range(n_clusters):
        plt.contour(label == l, contours=1,
                    colors=[plt.cm.spectral(l / float(n_clusters)), ])
    plt.show()
def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError,
                  AgglomerativeClustering(linkage='foobar').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        ward_tree(X, copy=True)
    # We should be getting 1 warning: for using the copy argument
    assert_equal(len(warning_list), 1)

    # Let's test a hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
def test_linkage_misc():
    # Misc tests on linkage
    rnd = np.random.RandomState(42)
    X = rnd.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Deprecation of Ward class
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(warning_list), 1)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering on a precomputed distances matrix
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
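# Editor's note: the two tests above exercise the deprecation path of the old
# Ward estimator. As a hedged illustration (not taken from any snippet in this
# collection), the usual migration in later scikit-learn versions is to replace
# Ward with AgglomerativeClustering using Ward linkage; the helper name below
# is hypothetical, and the constructor arguments are believed to carry over
# directly.
from sklearn.cluster import AgglomerativeClustering

def ward_equivalent(X, n_clusters, connectivity=None):
    # Ward(n_clusters=k, connectivity=c) roughly corresponds to:
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    linkage='ward',
                                    connectivity=connectivity)
    return model.fit_predict(X)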
def cluster_tiestrength_kmeans(self, vertices=None, nclusters=2,
                               cluster_prop='tsk'):
    if vertices is None:
        vertices = self.gs
    ts = self.similarity_dice(vertices)  # list of lists of similarity (float)
    ward = Ward(nclusters).fit(ts)
    for i, v in enumerate(vertices):
        v[cluster_prop] = ward.labels_[i]
def _run_interface(self, runtime):
    # load data
    data = nb.load(self.inputs.in_File).get_data()
    corrmatrix = np.squeeze(data)
    if self.inputs.cluster_type == 'spectral':
        # threshold at 0 (spectral uses non-negative values)
        positivecorrs = np.where(corrmatrix > 0, corrmatrix, 0)
        # spectral expects dtype=double values
        newmatrix = np.asarray(positivecorrs, dtype=np.double)
        labels = spectral(newmatrix, n_clusters=self.inputs.n_clusters,
                          eigen_solver='arpack', assign_labels='discretize')
    if self.inputs.cluster_type == 'hiercluster':
        labels = Ward(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
    if self.inputs.cluster_type == 'kmeans':
        labels = km(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
    if self.inputs.cluster_type == 'dbscan':
        labels = DBSCAN(eps=self.inputs.epsilon).fit_predict(corrmatrix)
    # +1 because cluster labels start at 0
    new_img = nb.Nifti1Image(labels + 1, None)
    _, base, _ = split_filename(self.inputs.in_File)
    nb.save(new_img,
            os.path.abspath(base + '_' + str(self.inputs.n_clusters) + '_' +
                            self.inputs.cluster_type + '_' +
                            self.inputs.hemi + '.nii'))
    return runtime
def doCoClustering(self, leftClustCount, rightClustCount,
                   clustPropName='coclust'):
    vsleft = self.left()
    simleft = np.matrix(self.similarity_dice(vsleft))
    clustleft = Ward(n_clusters=leftClustCount).fit(simleft).labels_

    vsright = self.right()
    # tuple of (isOnRightSide, index in left/right list)
    full2bipart = [(None, -1)] * self.vcount()
    for i, v in enumerate(vsleft):
        full2bipart[v.index] = (False, i)
    for i, v in enumerate(vsright):
        full2bipart[v.index] = (True, i)

    sizeright = len(vsright)
    m_rclust = np.zeros(shape=(sizeright, leftClustCount))
    for e in self.es:
        (srcOnRight, src) = full2bipart[e.source]
        (_, dst) = full2bipart[e.target]
        if srcOnRight:
            vright = src
            clust = clustleft[dst]
        else:
            vright = dst
            clust = clustleft[src]
        m_rclust[vright, clust] += 1

    clustSizes = [0] * leftClustCount
    for c in clustleft:
        clustSizes[c] += 1
    # Iterate over non-zero cells directly so each cell is normalised by its
    # own value
    for (row, col), val in np.ndenumerate(m_rclust):
        if val:
            #m_rclust[row, col] = float(val) / clustSizes[col]
            m_rclust[row, col] = float(val) / vsright[row].degree()

    simRight = cdist(m_rclust, m_rclust, 'cosine')
    clustright = Ward(n_clusters=rightClustCount).fit(simRight).labels_
    for i, c in enumerate(clustright):
        vsright[i][clustPropName] = c
def max_diff_dist_idx(dist_mat, min_dist, max_dist):
    num_nodes = dist_mat.shape[0]
    dist_diff = []
    max_diff = -1
    max_diff_row = 0
    max_diff_label = []
    max_cluster_idx = []
    for i, dist_vals in enumerate(dist_mat):
        # exclude its own distance
        idx_set = np.r_[np.r_[0:i:1], np.r_[i + 1:num_nodes:1]]
        #print i, 'th row k-mean cluster'
        temp = dist_vals[idx_set]
        if np.min(temp) > max_dist:
            exemplar_idx = i
            max_cluster_idx = i
            #import pdb; pdb.set_trace()
            return exemplar_idx, max_cluster_idx

        ########################################
        # K-means
        #_, label, _ = cluster.k_means(temp[:, None], 2)
        # Hierarchical binary clustering
        ward = Ward(n_clusters=2).fit(temp[:, None])
        label = ward.labels_
        #kmean = KMeans(n_clusters=2).fit(temp[:, None])
        #label = kmean.labels_
        # max is default
        centroid = np.zeros(2)
        #import pdb; pdb.set_trace()
        centroid[0] = np.max(temp[label == 0])
        centroid[1] = np.max(temp[label == 1])
        #idx0 = idx_set[np.nonzero(label == 0)]
        #idx1 = idx_set[np.nonzero(label == 1)]
        #dist01 = np.round([dist_mat[v0, v1] for v0 in idx0 for v1 in idx1], 2)
        #num_min_dist_violation = len(np.nonzero(dist01 < min_dist)[0])
        ########################################
        temp_1 = abs(centroid[0] - centroid[1])
        cent_diff = centroid[0] - centroid[1]
        dist_diff.append(abs(cent_diff))
        if max_diff < temp_1:
        #if (max_diff < temp_1) and (num_min_dist_violation == 0):
            max_idx_set = idx_set
            max_diff_row = i
            max_diff = temp_1
            max_diff_label = label
            max_cent_diff = cent_diff

    #import pdb; pdb.set_trace()
    cur_cent_idx = set([])
    if max_cent_diff > 0:
        cur_cent_idx = cur_cent_idx | set(np.nonzero(max_diff_label == 1)[0])
    else:
        cur_cent_idx = cur_cent_idx | set(np.nonzero(max_diff_label == 0)[0])
    max_cluster_idx = list(set(max_idx_set[list(cur_cent_idx)]) | set([max_diff_row]))
    exemplar_idx = max_diff_row
    return exemplar_idx, max_cluster_idx
def ward(self, X, n_clusters, plot=True):
    # (despite the variable name, this is Ward hierarchical clustering,
    # not k-means)
    k_means = Ward(n_clusters=n_clusters, copy=False,
                   compute_full_tree=True, memory="cache")
    k_means.fit(X)
    labels = k_means.labels_
    pl.close('all')
    pl.figure(1)
    pl.clf()
    if plot:
        colors = "rbgcmybgrcmybgrcmybgrcm" * 10
        X2d = RandomizedPCA(n_components=2).fit_transform(X)
        for i in xrange(len(X2d)):
            x = X2d[i]
            pl.plot(x[0], x[1], "o",
                    markerfacecolor=colors[labels[i]],
                    markeredgecolor=colors[labels[i]],
                    alpha=0.035)
        pl.show()
    return k_means.labels_
def cluster_ward(self, calpha=True):
    '''
    Cluster the positively predicted residues using the Ward method.
    Returns a dict mapping cluster index (largest cluster first) to the
    residue numbers in that cluster.
    '''
    if calpha:
        data_atoms = self.positive_surface_residues.ca
    #else:
    #    data_atoms = self.positive_surface_residues.select('ca or sidechain').copy()
    if data_atoms.getCoords().shape[0] < 4:
        print self.pdbid, data_atoms.getCoords().shape
        return {}
    connectivity = kneighbors_graph(data_atoms.getCoords(), 5)
    ward = Ward(n_clusters=self.WARD_N_CLUSTERS, connectivity=connectivity)
    ward.fit(data_atoms.getCoords())
    resnums = data_atoms.getResnums()
    reslabels = ward.labels_
    clusters = sorted([resnums[reslabels == i] for i in set(reslabels)],
                      key=len, reverse=True)
    return dict(enumerate(clusters))
def hac_derived_ordering(bags_file, num_clusters_multiplier=0.4):
    # uses HAC analysis to output hierarchies and evaluate results with
    # ground truth
    print '*HAC DERIVED ORDERING*', num_clusters_multiplier
    print 'Starting Hierarchical Agglomerative Clustering analysis...'
    data, words, transcripts = doc_term_mat_from_bags(bags_file)
    model = Ward(n_clusters=int(num_clusters_multiplier * len(transcripts))).fit(data)
    clust = model.fit_predict(data)
    hier_sets = []
    for i in range(len(transcripts)):
        s = [i + 1]
        #print transcripts[i]
        for j in range(0, i):
            if (clust[i] == clust[j]):
                #print '>>', transcripts[j]
                s.append(j + 1)
        hier_sets.append(set(s))
    return compare_hierarchies(hier_sets)
def cluster_w_else(network, similarity_matrix, number_of_communities=20):
    raw_communities = Ward(n_clusters=number_of_communities).fit(similarity_matrix).labels_
    #raw_communities = KMeans(k=number_of_communities).fit(similarity_matrix).labels_
    #raw_communities = DBSCAN().fit(similarity_matrix, eps=eps, min_samples=min_samples).labels_
    communities = OrderedDict([(x, []) for x in range(number_of_communities)])
    for i in range(len(network)):
        community_idx = raw_communities[i]
        if community_idx != -1:
            communities[community_idx].append(network.keys()[i])
    return communities
def cluster_hierarchically(self, raw_data, num_clusters, cmtrx=None):
    """
    Cluster raw_data into num_clusters groups with Ward hierarchical
    clustering, constrained by the connectivity matrix cmtrx.
    """
    if cmtrx is None:
        cmtrx = self.generate_connectivity_matrix(raw_data.shape[0])
    try:
        ward_clusters = Ward(n_clusters=num_clusters,
                             connectivity=cmtrx).fit(raw_data)
    except NameError:
        print 'WARNING: sklearn Ward clustering disabled.'
        return None
    return ward_clusters.labels_
def identify_communities(number_of_communities, similarity_matrix, node_ids):
    raw_communities = Ward(n_clusters=number_of_communities).fit(similarity_matrix).labels_
    #raw_communities = KMeans(k=number_of_communities).fit(similarity_matrix).labels_
    #raw_communities = DBSCAN().fit(similarity_matrix, eps=eps, min_samples=min_samples).labels_
    num_communities = len(set(raw_communities)) - (1 if -1 in raw_communities else 0)
    communities = OrderedDict([(x, []) for x in range(num_communities)])
    for i in range(len(node_ids)):
        community_idx = raw_communities[i]
        if community_idx != -1:
            communities[community_idx].append(node_ids[i])
    return communities
def main():
    print "## Welcome to the clustering tutorial ##"
    args = parse_args()
    x, tc = generate_data(args.n)
    ks = numpy.arange(1, args.k + 1)
    crs = numpy.zeros(args.k)
    col = 'k'
    print "Computing %s clustering quality criterion" % args.criterion
    for j in xrange(args.k):
        ward = Ward(n_clusters=ks[j]).fit(x)
        labels = ward.labels_
        if args.criterion == 'squared':
            crs[j] = squared_criterion(x, labels)
            col = 'r'
        elif args.criterion == 'diameter':
            crs[j] = diameter_criterion(x, labels)
            col = 'g'
        elif args.criterion == 'silhouette':
            crs[j] = silhouette_criterion(x, labels)
            col = 'b'
        else:
            raise ValueError("Wrong criterion" + args.criterion)
    pylab.figure(figsize=(12, 6))
    ward = Ward(n_clusters=args.n).fit(x)
    labels = ward.labels_
    pylab.subplot(1, 2, 1)
    plot_data(x, labels)
    pylab.subplot(1, 2, 2)
    plot_criterion(ks, crs, col)
    pylab.show()
def compute_clusters(dataset, features_vector):
    """ Apply clustering method """
    labels = dataset.target
    true_k = np.unique(labels).shape[0]
    # Run clustering method
    print "Performing clustering with method ", cmd_options.clust_method.upper()
    print
    if (cmd_options.clust_method == "hclust"):
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result)
        return ward
    if (cmd_options.clust_method == "kmeans"):
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000,
                    verbose=1)
        km.fit(features_vector)
        return km
def do_experiments(dataset):
    X, y = dataset.data, dataset.target
    dataset_name = dataset.DESCR.split('\n')[0]
    if dataset_name.startswith("Iris"):
        # iris has duplicate data points. That messes up our
        # MeanNN implementation.
        from scipy.spatial.distance import pdist, squareform
        dist = squareform(pdist(X))
        doubles = np.unique(np.where(np.tril(dist - 1, -1) == -1)[0])
        mask = np.ones(X.shape[0], dtype=np.bool)
        mask[doubles] = False
        X = X[mask]
        y = y[mask]

    n_clusters = len(np.unique(y))
    print("\n\nDataset %s samples: %d, features: %d, clusters: %d"
          % (dataset_name, X.shape[0], X.shape[1], n_clusters))
    print("=" * 70)

    classes = [
        ITM(n_clusters=n_clusters),
        ITM(n_clusters=n_clusters, infer_dimensionality=True),
        Ward(n_clusters=n_clusters),
        KMeans(n_clusters=n_clusters)
    ]
    names = ["ITM", "ITM ID", "Ward", "KMeans"]

    for clusterer, method in zip(classes, names):
        start = time()
        clusterer.fit(X)
        y_pred = clusterer.labels_
        ari = adjusted_rand_score(y, y_pred)
        ami = adjusted_mutual_info_score(y, y_pred)
        nmi = normalized_mutual_info_score(y, y_pred)
        objective = tree_information(X, y_pred)
        runtime = time() - start
        print("%-15s ARI: %.3f, AMI: %.3f, NMI: %.3f objective: %.3f time:"
              "%.2f" % (method, ari, ami, nmi, objective, runtime))
    i_gt = tree_information(X, y)
    print("GT objective: %.3f" % i_gt)
def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError,
                  AgglomerativeClustering(linkage='foobar').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)
def cluster(dump_path, file_name, n_clusters=200):
    # Obtain data from file.
    #feature_file = 'feature.list'
    data = np.loadtxt(file_name, unpack=True)
    m1 = data[1]
    X = np.transpose(data)
    X = scale(X)
    labels_true = np.zeros(len(m1))

    ###########################################################################
    # Compute clustering
    print("Compute unstructured hierarchical clustering...")
    st = time.time()
    ward = Ward(n_clusters=n_clusters).fit(X)
    label = ward.labels_
    print("Elapsed time: ", time.time() - st)
    print("Number of points: ", label.size)

    label_file = dump_path + "ward_labels.list"
    fp = open(label_file, 'w')
    for i in label:
        fp.write("%d\n" % i)
    fp.close()

    num_cluster_file = dump_path + "_num_clusters_ward.info"
    fp = open(num_cluster_file, 'w')
    fp.write("%d" % n_clusters)
    fp.close()

    # Ward has no cluster_centers_ attribute; compute the per-cluster means
    # explicitly instead.
    cluster_centers = np.array([X[label == i].mean(axis=0)
                                for i in range(n_clusters)])

    score = 0.0
    # print "evaluating performance..."
    # score = metrics.silhouette_score(X, label, metric='euclidean', sample_size=20000)
    # print "evaluation done."
    # score = metrics.silhouette_samples(X, k_means_labels, metric='euclidean', sample_size=1000)
    # score = np.sum(score)/len(score)
    return score
def clusterRT_ward(values):
    if len(values) == 0:
        return []
    v = sorted([[val] for val in values])
    #connectivity = kneighbors_graph(np.asarray(v), n_neighbors=3)
    ward = Ward(n_clusters=2).fit(np.asarray(v))
    labels = ward.labels_
    curr_l = -2
    cl_output = []
    curr_cluster = []
    for i, l in enumerate(labels):
        if l != curr_l:
            if len(curr_cluster) > 0:
                cl_output.append(curr_cluster)
            curr_l = l
            curr_cluster = []
        curr_cluster.append(values[i])
    cl_output.append(curr_cluster)
    return cl_output
# Generate data
lena = misc.imread('dyfoc.png')
# Downsample the image by a factor of 4
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
X = np.reshape(lena, (-1, 1))

###############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*lena.shape)

###############################################################################
# Compute clustering
print("Compute structured hierarchical clustering...")
st = time.time()
n_clusters = 15  # number of regions
ward = Ward(n_clusters=n_clusters, connectivity=connectivity).fit(X)
label = np.reshape(ward.labels_, lena.shape)
print("Elapsed time: ", time.time() - st)
print("Number of pixels: ", label.size)
print("Number of clusters: ", np.unique(label).size)

###############################################################################
# Plot the results on an image
pl.figure(figsize=(5, 5))
pl.imshow(lena, cmap=pl.cm.gray)
for l in range(n_clusters):
    pl.contour(label == l, contours=1,
               colors=[pl.cm.spectral(l / float(n_clusters)), ])
pl.xticks(())
pl.yticks(())
pl.show()
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)

    # test caching
    clustering = Ward(n_clusters=10, connectivity=connectivity,
                      memory=mkdtemp())
    clustering.fit(X)
    labels = clustering.labels_
    assert_true(np.size(np.unique(labels)) == 10)

    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)

    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)

    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10, connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
def encode(self, interm_rep, neighborhood_size=26, clust_ratio=10,
           encoding='geometrical', similarity_measure='pearson',
           threshold=0.3, n_jobs=1, **kwds):
    """
    Parameters
    ----------
    interm_rep: IntermRep
        IntermRep object containing the arr_xyz and arr_voxel matrixes.
    neighborhood_size: int
        Number of neighbors each voxel will be connected to.
    clust_ratio: int
        The number of clusters will be equal to n/clust_ratio, where n is
        the number of voxels.
    encoding: string
        Type of encoding. 'geometrical' and 'functional' are allowed.
    similarity_measure: string
        Similarity measure used to compare the representative value of each
        parcel (cluster). 'pearson' or the measures available in
        scikit-learn are allowed.
    threshold: float
        Threshold applied to the similarity values in order to define the
        edges in the graph.

    Returns
    -------
    g: Graph
        Networkx graph representing the graph encoding of the data.
    """
    # Computing the connectivity matrix: each voxel is connected to
    # "neighborhood_size" neighbors.
    conn = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
    # conn_n = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
    # conn_r = radius_neighbors_graph(interm_rep.arr_xyz, radius=10)
    # conn = conn_n * conn_r

    # Hierarchical clustering algorithm. The number of clusters is defined
    # according to the parameter "clust_ratio".
    ward = Ward(n_clusters=len(interm_rep.arr_xyz) / clust_ratio,
                connectivity=conn)
    #ward = Ward(n_clusters=60, connectivity=conn)

    # Type of encoding: geometrical (only xyz data is used) or
    # functional (voxel time series is used).
    if encoding == 'geometrical':
        ward.fit(interm_rep.arr_xyz)
    elif encoding == 'functional':
        ward.fit(interm_rep.arr_voxels)
    labels = ward.labels_

    # Plotting the voxels with the cluster labels.
    #pp.plot_clustering_intermediate_representation(interm_rep, labels*10)

    # Computing the unique cluster identifiers
    l_unique = np.unique(labels)

    mean_voxels = np.zeros((len(l_unique), interm_rep.arr_voxels.shape[1]))
    mean_xyz = np.zeros((len(l_unique), interm_rep.arr_xyz.shape[1]))
    cont = 0
    for i in l_unique:
        # Taking the positions corresponding to the same cluster.
        pos = np.where(labels == i)[0]
        # Taking data from these positions and computing the mean time series
        m_voxel = interm_rep.arr_voxels[pos].mean(0)
        # Taking the xyz from these positions and computing the mean value
        m_xyz = interm_rep.arr_xyz[pos].mean(0)
        mean_voxels[cont] = m_voxel
        mean_xyz[cont] = m_xyz
        cont += 1

    # Plotting the voxel time series for each cluster
    #pp.plot_interm_representation_time_series(ir.IntermRep(mean_voxels, mean_xyz))

    # The new intermediate representation is given by mean_voxels and
    # mean_xyz.
    # Computing similarity matrix and applying the threshold
    adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)), dtype=np.byte)
    for j in range(len(mean_voxels) - 1):
        for k in range(j + 1, len(mean_voxels)):
            if similarity_measure == 'pearson':
                aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
            else:
                aux = skpw.pairwise_kernels(mean_voxels[j], mean_voxels[k],
                                            metric=similarity_measure,
                                            n_jobs=n_jobs)
            if aux >= threshold:
                adj_mat[j, k] = 1
                adj_mat[k, j] = 1

    # Weighted encoding (for graph kernels that work with weighted graphs)
    # ------------------------------------
    # adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)),
    #                    dtype=np.float)
    # for j in range(len(mean_voxels) - 1):
    #     for k in range(j + 1, len(mean_voxels)):
    #         if similarity_measure == 'pearson':
    #             aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
    #         else:
    #             aux = skpw.pairwise_kernels(mean_voxels[j], mean_voxels[k],
    #                                         metric=similarity_measure,
    #                                         n_jobs=n_jobs)
    #         # if aux >= threshold:
    #         #     adj_mat[j, k] = aux
    #         #     adj_mat[k, j] = aux
    #         adj_mat[j, k] = adj_mat[k, j] = aux
    # adj_mat = (adj_mat - np.mean(adj_mat)) / np.std(adj_mat)
    # adj_mat = (adj_mat - np.min(adj_mat)) / (np.max(adj_mat) - np.min(adj_mat))
    # adj_mat = np.where(adj_mat >= threshold, 1, 0)
    # ------------------------------------

    # Building the graph from the adjacency matrix
    g = nx.from_numpy_matrix(adj_mat)

    # Splitting the node degrees into some categories and using them as
    # node labels.
    # num_lab = 5
    deg = g.degree()
    # for k in deg:
    #     deg[k] /= num_lab
    nx.set_node_attributes(g, 'node_label', deg)

    ############
    # Storing the mean time-series of each parcel as a node attribute
    ts_att = {}
    mv = mean_voxels.tolist()
    for pos in range(len(mv)):
        ts_att[pos] = mv[pos]
    nx.set_node_attributes(g, 'time_series', ts_att)

    # Saving the graphs for the CLFR subject (the one for which I have the
    # structural data)
    # if interm_rep.subj_name == 'CLFR':
    #     nx.write_gexf(g, 'graph_gephi_format.gexf')
    #     np.savetxt('CLFR_clusters_xyz.txt', mean_xyz, fmt='%1d', delimiter=' ')
    #     edges = np.array(np.where(adj_mat == 1)).T
    #     np.savetxt('CLFR_clusters_timeseries_cond%s.txt' % (interm_rep.cls), edges, fmt='%1d', delimiter=' ')

    # Plot Graphs
    #pp.plot_graph(mean_xyz, g)

    return g
print("Homogeneity k-means: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness k-means: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure k-means: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print("Silhouette Coefficient k-means: %0.3f" % metrics.silhouette_score(clustering, km.labels_, sample_size = 8000)) # DBSCAN # Structured hierarchical clustering db = DBSCAN() db.fit(clustering) print 'DBSCAN clusters created..' print("Homogeneity DBSCAN: %0.3f" % metrics.homogeneity_score(labels, db.labels_)) print("Completeness DBSCAN: %0.3f" % metrics.completeness_score(labels, db.labels_)) print("V-measure DBSCAN: %0.3f" % metrics.v_measure_score(labels, db.labels_)) print("Silhouette Coefficient DBSCAN: %0.3f" % metrics.silhouette_score(clustering, db.labels_, sample_size = 5000)) # Structured hierarchical clustering ward = Ward(n_clusters = 9) ward.fit(clustering) print 'Hierarchical clusters created..' print("Homogeneity hierarchical: %0.3f" % metrics.homogeneity_score(labels, ward.labels_)) print("Completeness hierarchical: %0.3f" % metrics.completeness_score(labels, ward.labels_)) print("V-measure hierarchical: %0.3f" % metrics.v_measure_score(labels, ward.labels_)) print("Silhouette Coefficient hierarchical: %0.3f" % metrics.silhouette_score(clustering, ward.labels_, sample_size = 5000))
def hierarchical(self, n_clusters):
    ward = Ward(n_clusters=n_clusters)
    return ward.fit_predict(sp.array(self.A))
def ward(X, n_clust):
    "Hierarchical Ward clustering of X into n_clust clusters."
    ward = Ward(n_clusters=n_clust)
    ward.fit(X)
    return ward
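# Editor's note: a minimal, hypothetical usage sketch for the ward() helper
# above; not part of the original snippet. It only assumes numpy is available
# and that X is a 2-D array of samples.
import numpy as np

X = np.random.rand(30, 4)   # 30 samples, 4 features
model = ward(X, 3)          # fit the (deprecated) Ward estimator
print(model.labels_)        # one cluster label per sample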
""" Benchmark scikit-learn's Ward implement compared to SciPy's """ import time import numpy as np from scipy.cluster import hierarchy import pylab as pl from sklearn.cluster import Ward ward = Ward(n_clusters=3) n_samples = np.logspace(.5, 3, 9) n_features = np.logspace(1, 3.5, 7) N_samples, N_features = np.meshgrid(n_samples, n_features) scikits_time = np.zeros(N_samples.shape) scipy_time = np.zeros(N_samples.shape) for i, n in enumerate(n_samples): for j, p in enumerate(n_features): X = np.random.normal(size=(n, p)) t0 = time.time() ward.fit(X) scikits_time[j, i] = time.time() - t0 t0 = time.time() hierarchy.ward(X) scipy_time[j, i] = time.time() - t0
if i != 'Combined Queries' and i != 'Report ID' and i != 'Object Name' \
        and i != 'Report Name' and i != 'Operands':
    print i
    train = pd.concat([train, pd.get_dummies(raw_train[i])], axis=1)

freq = train.groupby('Report ID').sum()
freq = freq.drop('Has Combined Queries', 1)

# Train Model #############################
num_cluster = 12
kmean = KMeans(n_clusters=num_cluster, max_iter=400, verbose=0, n_jobs=2,
               n_init=20, tol=1e-6)
model_kmean = kmean.fit(freq)

ward = Ward(n_clusters=num_cluster)
model_ward = ward.fit(freq)

from sklearn.neighbors import kneighbors_graph
connectivity = kneighbors_graph(freq, n_neighbors=4)
#ward = Ward(n_clusters=num_cluster, connectivity=connectivity)
#model_ward = ward.fit(freq)

# Visualization #####################################################
import mpl_toolkits.mplot3d.axes3d as p3
import pylab as pl
from sklearn.datasets.samples_generator import make_friedman3