예제 #1
0
	def find_cluster_centroids(self, cluster_index):
		"""Return the centroids of the clusters described by ``cluster_index``.

		``self.restimes`` is wrapped in a single-element list and converted
		with ``array`` before being handed to ``clustercentroids`` together
		with the instance's ``masks``, ``centroids_transpose`` and ``method``
		settings.

		Args:
			cluster_index: cluster assignment, forwarded as ``clusterid``.

		Returns:
			The first element of the pair returned by ``clustercentroids``
			(the centroid data); the second element (the mask) is discarded.
		"""
		wrapped_restimes = array([self.restimes])
		centroid_data, _centroid_mask = clustercentroids(
			wrapped_restimes,
			mask=self.masks,
			transpose=self.centroids_transpose,
			clusterid=cluster_index,
			method=self.method,
		)
		return centroid_data
예제 #2
0
    def cluster_sentences(cls, sentences, n):
        """Cluster the sentences into n clusters and pick a representative each.

        Every sentence is converted into a dense vector of tf-idf weights over
        the union of all terms seen in ``sentences`` (0.0 for absent terms).
        The vectors are clustered with Pycluster's ``kcluster`` and, for each
        cluster, the member closest to the cluster centroid is selected.

        Args:
            sentences: [IRSentence]; each must provide ``get_tfidf()``
                returning a mapping from term to weight.
            n: int, number of clusters

        Returns:
            (labels, sentence_ids)
            labels: cluster id of each sentence, as returned by ``kcluster``.
            sentence_ids: [int], one entry per centroid: the index into
                ``sentences`` of the member nearest to that centroid. A
                cluster without members yields 0 (kept from the original
                behaviour).
        """
        # Fetch each tf-idf mapping once; it is needed both to build the
        # vocabulary and to build the vectors.
        tfidfs = [sentence.get_tfidf() for sentence in sentences]

        vocabulary = set()
        for tfidf in tfidfs:
            vocabulary.update(tfidf)
        # Freeze an ordering so every vector uses the same column layout.
        vocabulary = list(vocabulary)

        vecs = [
            [tfidf[term] if term in tfidf else 0.0 for term in vocabulary]
            for tfidf in tfidfs
        ]

        # call pycluster k-means
        from Pycluster import kcluster, clustercentroids, distancematrix
        labels, _error, _nfound = kcluster(vecs, nclusters=n, method='a',
                                           dist='u')
        centroids, _cmask = clustercentroids(vecs, clusterid=labels,
                                             method='a')

        sentence_ids = []
        for centroid_index, centroid in enumerate(centroids):
            # Indexes (into vecs / sentences) of the members of this cluster.
            member_indexes = [
                label_index
                for label_index, label in enumerate(labels)
                if label == centroid_index
            ]
            # Row 0 is the centroid, rows 1.. are the members, so column 0 of
            # the distance matrix holds each member's distance to the centroid.
            subvecs = [centroid] + [vecs[i] for i in member_indexes]
            matrix = distancematrix(subvecs, dist='u')

            nearest_dist = float('inf')
            nearest_index = 0
            for row, member_index in enumerate(member_indexes, 1):
                dist = matrix[row][0]
                if dist < nearest_dist:
                    nearest_dist = dist
                    nearest_index = member_index
            sentence_ids.append(nearest_index)

        return labels, sentence_ids
예제 #3
0
def create_clustered_samples(points, nclusters, transpose):
    """Run k-means on ``points[1:4]`` and group the data by cluster label.

    Only the slice ``points[1:4]`` is passed to ``kcluster`` and
    ``clustercentroids``; ``transpose`` is forwarded unchanged to both.
    (Per the Pycluster documentation a zero ``transpose`` clusters rows and a
    non-zero one clusters columns -- confirm for the library version in use.)

    Args:
        points: sequence of rows, indexable as ``points[row][column]``.
        nclusters: number of clusters to request from ``kcluster``.
        transpose: 0 to group rows, any other value to group columns.

    Returns:
        A ``(clusteredpoints, cdata)`` tuple, where ``cdata`` is the centroid
        data returned by ``clustercentroids`` and ``clusteredpoints`` has one
        entry per cluster:

        * ``transpose == 0``: a list of the clustered rows (taken from
          ``points[1:4]``) assigned to that cluster.
        * otherwise: a list with one sub-list per row of ``points``, holding
          that row's values for every column assigned to the cluster.
    """
    # Rows that actually take part in the clustering.
    clustered_rows = points[1:4]

    # Single-argument call form prints identically on Python 2 and 3.
    print(points[1:6])
    labels, error, nfound = kcluster(clustered_rows, nclusters, None, None,
                                     transpose, npass=1, method='a',
                                     dist='e', initialid=None)

    cdata, cmask = clustercentroids(clustered_rows, None, labels, 'a',
                                    transpose)

    print(cdata)

    if transpose == 0:
        clusteredpoints = [[] for _ in range(nclusters)]
        # labels has one entry per clustered row, so pair each label with the
        # row it was computed for. Indexing labels by position in the full
        # ``points`` sequence (as was done before) ran past the end of labels
        # and was off by one relative to the slice.
        for label, row in zip(labels, clustered_rows):
            clusteredpoints[label].append(row)
        return clusteredpoints, cdata

    # Column clustering: each cluster gets one bucket per row of ``points``,
    # filled with that row's values at the columns assigned to the cluster.
    clusteredpoints = [[[] for _ in points] for _ in range(nclusters)]
    for column, label in enumerate(labels):
        buckets = clusteredpoints[label]
        for row_index, row in enumerate(points):
            buckets[row_index].append(row[column])
    return clusteredpoints, cdata