def find_cluster_centroids(self, cluster_index):
    """Return the centroids of ``self.restimes`` for a cluster assignment.

    ``self.restimes`` is wrapped in a single-element list and converted
    with ``array`` before being handed to ``clustercentroids``.

    Args:
        cluster_index: cluster assignment, forwarded to
            ``clustercentroids`` as its ``clusterid`` argument.

    Returns:
        The first of the two values returned by ``clustercentroids``
        (the centroid data). The second value is discarded.
    """
    observations = array([self.restimes])
    # Attribute reads are kept in the same order as the keyword arguments
    # of the call they feed.
    options = {
        'mask': self.masks,
        'transpose': self.centroids_transpose,
        'clusterid': cluster_index,
        'method': self.method,
    }
    centers, _centroid_mask = clustercentroids(observations, **options)
    return centers
def cluster_sentences(cls, sentences, n):
    """Cluster the sentences into n clusters and pick one sentence per cluster.

    Each sentence is expanded into a dense vector over the union of all
    terms found in the sentences' TF-IDF mappings (0.0 for absent terms).
    The vectors are grouped with Pycluster's k-means, and for every cluster
    the member closest to the cluster centroid is selected.

    Args:
        sentences: [IRSentence]; each must provide ``get_tfidf()`` returning
            a mapping of term -> value.
        n: int, number of clusters

    Returns:
        (labels, sentence_ids):
            labels: cluster id of each sentence, in the order of
                ``sentences`` (as returned by ``kcluster``).
            sentence_ids: [int], one entry per centroid; the index into
                ``sentences`` of the member nearest to that centroid.
                A cluster without members yields 0.
    """
    # Fetch every TF-IDF mapping once; it is needed both for building the
    # vocabulary and for building the vectors.
    tfidfs = [sentence.get_tfidf() for sentence in sentences]

    vocabulary = set()
    for tfidf in tfidfs:
        vocabulary.update(tfidf)
    vocabulary = list(vocabulary)

    vecs = [
        [tfidf[term] if term in tfidf else 0.0 for term in vocabulary]
        for tfidf in tfidfs
    ]

    # call pycluster k-means
    from Pycluster import kcluster, clustercentroids, distancematrix
    labels, _error, _nfound = kcluster(vecs, nclusters=n, method='a', dist='u')
    centroids, _cmask = clustercentroids(vecs, clusterid=labels, method='a')

    sentence_ids = []
    for centroid_index, centroid in enumerate(centroids):
        members = [
            index for index, label in enumerate(labels)
            if label == centroid_index
        ]
        # The centroid is placed first so that matrix[k][0] is the distance
        # between the k-th entry and the centroid.
        matrix = distancematrix(
            [centroid] + [vecs[index] for index in members], dist='u')

        # Start from infinity rather than a magic constant so the nearest
        # member is found whatever the scale of the distances.
        smallest = float('inf')
        closest = 0  # kept as the fallback for a cluster with no members
        for position, member in enumerate(members, 1):
            dist = matrix[position][0]
            if dist < smallest:
                smallest = dist
                closest = member
        sentence_ids.append(closest)

    return labels, sentence_ids
def create_clustered_samples(points, nclusters, transpose):
    """Cluster ``points[1:4]`` with k-means and group the data by cluster.

    Only the slice ``points[1:4]`` is passed to ``kcluster`` and
    ``clustercentroids``; the labels they produce are then used to
    partition the data.

    Args:
        points: indexable sequence of rows (sliced as ``points[1:4]`` and
            indexed as ``points[row][column]``).
        nclusters: number of clusters requested from ``kcluster``; also the
            length of the returned ``clusteredpoints`` list.
        transpose: forwarded to ``kcluster`` and ``clustercentroids``.
            0 groups rows; any other value groups columns.

    Returns:
        (clusteredpoints, cdata):
            clusteredpoints: when ``transpose == 0``,
                ``clusteredpoints[c]`` lists the rows of ``points[1:4]``
                labelled ``c``. Otherwise ``clusteredpoints[c][r]`` lists
                ``points[r][j]`` for every column ``j`` labelled ``c``
                (all rows of ``points`` are included, not just the
                clustered slice).
            cdata: centroid data as returned by ``clustercentroids``.
    """
    # Debug output. A parenthesised single argument prints the same on
    # Python 2 and is also valid on Python 3.
    print(points[1:6])

    subset = points[1:4]
    labels, _error, _nfound = kcluster(
        subset, nclusters, None, None, transpose,
        npass=1, method='a', dist='e', initialid=None)
    cdata, _cmask = clustercentroids(subset, None, labels, 'a', transpose)
    print(cdata)

    if transpose == 0:
        clusteredpoints = [[] for _ in range(nclusters)]
        # labels[i] describes subset[i]. Indexing labels with positions in
        # the full ``points`` sequence ran past the end of labels and was
        # shifted by one row, so pair each label with its own row instead.
        for row, label in zip(subset, labels):
            clusteredpoints[label].append(row)
        return clusteredpoints, cdata

    # One bucket per (cluster, row of points).
    clusteredpoints = [
        [[] for _ in range(len(points))] for _ in range(nclusters)
    ]
    for column, label in enumerate(labels):
        for row_index, row in enumerate(points):
            clusteredpoints[label][row_index].append(row[column])
    return clusteredpoints, cdata