def cluster_kmedoids(self, k=2, npass=50):
    # Uses the distance matrix to produce a partition into k clusters.
    # npass is the number of passes of the k-medoids algorithm.
    c, err, nfound = pc.kmedoids(self.zd, k, npass=npass)
    return partition(c, self.mat)
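# A minimal, self-contained sketch (toy data, not from the original code) of
# what the pc.kmedoids call above consumes and returns:
import numpy as np
import Pycluster as pc

# Toy symmetric distance matrix over four items: two tight pairs.
toy_dist = np.array([[0.0, 0.1, 0.9, 0.8],
                     [0.1, 0.0, 0.8, 0.9],
                     [0.9, 0.8, 0.0, 0.2],
                     [0.8, 0.9, 0.2, 0.0]])
toy_labels, toy_err, toy_nfound = pc.kmedoids(toy_dist, nclusters=2, npass=50)
# Each entry of toy_labels is the index of that item's medoid, e.g.
# array([0, 0, 2, 2]) or array([1, 1, 3, 3]) depending on the pass.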
def clusterSessionsPre(catQueryDist, featMan, weightMatrix):
    tclusters = {}
    print len(catQueryDist)
    for termCount in range(4, 5):
        tclusters[termCount] = []
        for cat, qSet in catQueryDist.items():
            if len(qSet) > 1:  # and cat in pairs:
                k = len(qSet) / termCount
                if k == 0:
                    k = 1
                #print cat, len(qSet), k
                qList = list(qSet)
                catDist = getWeightMatrixForKMed(qList, weightMatrix)
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
                #print 'Queries', qList
                clusters = {}
                for c in range(len(clusArray)):
                    clusId = clusArray[c]
                    if clusId not in clusters:
                        clusters[clusId] = []
                    qc = featMan.returnQuery(qList[c])
                    if len(qc) > 1:
                        clusters[clusId].append(qc)
                #print cat, len(clusters)
                for entry in clusters.values():
                    tclusters[termCount].append(entry)
    print len(tclusters[4])
    return tclusters[4]
def kmedoids_cluster(self, similarities=None):
    # https://jpcomputing.wordpress.com/2014/05/18/pycluster-kmedoids-example/
    import Pycluster
    from scipy.spatial.distance import squareform
    from sklearn.metrics import silhouette_score

    distances = []
    if similarities is None:
        for page_id, sim_vector in self._pages_similarities.iteritems():
            distances.append([1 - x[1][USED_DISTANCE] for x in sim_vector])
    else:
        for x in similarities:
            distances.append([1 - a for a in x])
    np_distances = np.asarray(distances)
    # Pycluster accepts the condensed (left-lower) form of the square matrix.
    squareform_distances = squareform(np_distances)
    nb_clusters = 2  # the number of clusters the dataset is supposed to be partitioned into
    clusterid, error, nfound = Pycluster.kmedoids(squareform_distances,
                                                  nclusters=nb_clusters,
                                                  npass=50)
    print 'clusterid: ', len(set(clusterid)), clusterid
    res = silhouette_score(np_distances, clusterid, metric='precomputed')
    print 'Res: ', res
    # group item indices into clusters, keyed by their medoid index
    # (the stray `return` that made this block unreachable has been removed)
    clusters_indexes = {}
    for i, medoid in enumerate(clusterid):
        if medoid not in clusters_indexes:
            clusters_indexes[medoid] = [i]
        else:
            clusters_indexes[medoid].append(i)
    return clusters_indexes
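# The snippet above computes a silhouette score for a fixed nb_clusters = 2.
# A hedged sketch (not from the original code) of how that score could drive
# the choice of k: rerun kmedoids over a range and keep the best partition.
import Pycluster
from sklearn.metrics import silhouette_score

def pick_k_by_silhouette(square_dist, k_range=range(2, 8), npass=50):
    # Pycluster reads the left-lower triangle of a full square matrix,
    # so square_dist can be passed to kmedoids directly.
    best = None
    for k in k_range:
        labels, _, _ = Pycluster.kmedoids(square_dist, nclusters=k, npass=npass)
        score = silhouette_score(square_dist, labels, metric='precomputed')
        if best is None or score > best[0]:
            best = (score, k, labels)
    return best  # (best_score, best_k, labels)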
def clusterSessionsKmed(featMan, weightFile):
    data = featMan.returnKeys()
    weightList = getWeightMatrixForKMedFromFile(featMan.returnLastId(),
                                                weightFile, data)
    cnt = 0
    kclusters = {}
    for k in range(4, 5, 2):
        i = (len(weightList) + 1) / k
        if i == 0:
            i = 1
        clusArray, error, opt = clust.kmedoids(weightList, i, 10, None)
        print error, len(clusArray)
        clusters = {}
        for c in range(len(clusArray)):
            clusId = clusArray[c]
            q = featMan.returnQuery(c)
            if len(q) > 1:
                if clusId not in clusters:
                    clusters[clusId] = set()
                clusters[clusId].add(q)
                cnt += 1
        kclusters[k] = clusters.values()
    print 'Cluster with kmed ', len(clusters), cnt, ' queries'
    return kclusters[4]
def cluster_kmedoids(sessions, clusters, distance_fn=string_similarity.jaccard_distance):
    """k-medoids clustering; requires a full distance matrix, therefore slow."""
    distances = compute_distances(sessions, distance_fn)
    clusterids, error, nfound = Pycluster.kmedoids(distances, nclusters=clusters)
    return clusterids, error, nfound
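# `compute_distances` is defined elsewhere in the module the function above
# comes from; a minimal sketch of what such a helper typically looks like
# (an assumption, not the original implementation):
import numpy as np

def compute_distances(items, distance_fn):
    # Full symmetric pairwise distance matrix from an arbitrary metric.
    n = len(items)
    d = np.zeros((n, n))
    for i in range(n):
        for j in range(i):
            d[i, j] = d[j, i] = distance_fn(items[i], items[j])
    return d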
def clusterCatWithMediods(lowerLimit, upperLimit, featMan, weightMatrix,
                          samePairsSet, differentPairsSet, catQueryDist,
                          outFile='cat-clusters-with-med.txt'):
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit):
        #fclusters = []
        cluster_list = []
        i = 0
        oFile = open(outFile + str(noTerms) + '.txt', 'w')
        for cat, qSet in catQueryDist.items():
            if len(qSet) > 1:  # and cat in pairs:
                k = len(qSet) / noTerms
                if k == 0:
                    k = 1
                qList = sorted(list(qSet), reverse=True)
                catDist = getWeightMatrixForKMed(qList, weightMatrix, 'cat_kmediods')
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
                clusters = {}
                # clusArray[c] is the medoid index for qList[c]; iterate from 0
                # so queries stay aligned with their cluster assignments.
                for c in range(len(clusArray)):
                    clusId = clusArray[c]
                    if clusId not in clusters:
                        clusters[clusId] = set()
                    clusters[clusId].add(qList[c])
                for entry in clusters.values():
                    cluster_list.append(list(entry))
                    qStr = toString(entry, featMan)
                    #fclusters.append(qStr)
                    oFile.write(cat + '\t' + qStr + '\n')
                print 'Clust category', cat, 'length', len(clusters), \
                    'Queries', len(qSet), 'k', k, 'error', error, opt
                if i % 5 == 0:
                    print i
                i += 1
        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        #metrics[noTerms] = getRecallPrecision(samePairsSet, differentPairsSet,
        #                                      predictedSamePairsSet,
        #                                      predictedDifferentPairsSet)
        metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(samePairsSet,
                                                                  predictedSamePairsSet)
        oFile.close()
    for tcount, met in metrics.items():
        print tcount, met
    return metrics
def clusterCatWithMediodsAndNetwork(threshold, lowerLimit, upperLimit, featMan,
                                    weightMatrix, samePairsSet,
                                    differentPairsSet, catQueryDist,
                                    catNetwork,
                                    outFile='cat-clusters-with-med.txt'):
    # cluster each cat, find the outliers and move them to parents
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit, 2):
        cluster_list = []
        #fclusters = []
        i = 0
        oFile = open(outFile + str(noTerms) + '.txt', 'w')
        for cat, qSet in catQueryDist.items():
            if len(qSet) > 1:  # and cat in pairs:
                k = len(qSet) / noTerms
                if k == 0:
                    k = 1
                #print cat, len(qSet), k
                qList = list(qSet)
                catDist = getWeightMatrixForKMed(qList, weightMatrix)
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
                #print 'Queries', qList
                clusters = {}
                for c in range(len(clusArray)):
                    clusId = clusArray[c]
                    if clusId not in clusters:
                        clusters[clusId] = set()
                    clusters[clusId].add(qList[c])
                #outliers = getOutliers(qList, catDist)
                for entry in clusters.values():
                    cluster_list.append(list(entry))
                    qStr = toString(entry, featMan)
                    oFile.write(cat + '\t' + qStr + '\n')
                    #fclusters.append(qStr)
                print 'Clust ', cat, len(clusters), error, opt
                if i % 50 == 0:
                    print i
                i += 1
        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        key = str(threshold) + '_' + str(noTerms)
        metrics[key] = getRecallPrecision(samePairsSet, differentPairsSet,
                                          predictedSamePairsSet,
                                          predictedDifferentPairsSet)
        oFile.close()
    for tcount, met in metrics.items():
        print tcount, met
    return metrics
def cluster(D, k):
    import Pycluster as pcl
    labels, _, _ = pcl.kmedoids(D, nclusters=k, npass=10, initialid=None)
    # labels[i] is the index of the medoid item for point i, so each point's
    # distance to its medoid can be read straight out of D.
    errors = np.array([D[labels[i], i] for i in range(len(labels))])
    # Remap medoid item-indices to dense labels 0..k-1.
    centroidids = np.unique(labels)
    cmap = np.zeros(labels.max() + 1)
    for c in centroidids:
        cmap[c] = np.nonzero(centroidids == c)[0][0]
    labels = cmap[labels]
    logger.debug('k-medoids (k=%i): %.2f.' % (k, errors.sum()))
    return labels, {'method': 'kmedoids',
                    'init': 'random',
                    'k': k,
                    'centroidids': centroidids,
                    'errors': errors,
                    'error': errors.sum(),
                    'error-label': 'sum of distances'}
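# A small, self-contained illustration (values are made up) of the dense
# relabelling trick used above:
import numpy as np

labels = np.array([7, 7, 2, 2, 7])   # medoid item-indices from kmedoids
centroidids = np.unique(labels)      # -> array([2, 7])
cmap = np.zeros(labels.max() + 1)
for c in centroidids:
    cmap[c] = np.nonzero(centroidids == c)[0][0]
print(cmap[labels])                  # -> [1. 1. 0. 0. 1.]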
def kmedoids(m):
    labels, error, nfound = Pycluster.kmedoids(m, 16, 5)
    # Collect the distinct medoid ids in order of first appearance, then
    # rename them 0..n-1 to match the affinity-propagation naming convention.
    # Building the full mapping before rewriting avoids clashes when a medoid
    # id happens to equal an already-assigned small label.
    clusters = []
    for label in labels:
        if label not in clusters:
            clusters.append(label)
    mapping = {cluster: i for i, cluster in enumerate(clusters)}
    for i in range(len(labels)):
        labels[i] = mapping[labels[i]]
    return labels, clusters
def matchs_ia():
    # if not current_user.admin:
    #     return jsonify({'message': 'Cannot perform that function!'})
    tests = TestR.query.all()
    dataSet = []
    # Strip combining accents (acute, diaeresis) before comparing strings.
    trans_tab = dict.fromkeys(map(ord, u"\u0301\u0308"), None)
    for test in tests:
        resident = Residents.query.filter_by(public_id=test.public_id).first()
        test_data = ("-" + str(resident.id) + "-" + test.gender + str(test.age)
                     + test.musicGender + test.sport + test.hobbie
                     + test.movieSeries + test.filmGender + test.tabaco
                     + test.alcohol + test.party + str(test.ordenConvivencia)
                     + str(test.ordenPersonal) + test.personalidad)
        test_data = normalize(
            "NFKC", normalize("NFKD", test_data).translate(trans_tab))
        dataSet.append(test_data)
    # Left-lower triangle of the pairwise edit-distance matrix, flattened.
    distans = [
        distance.edit_distance(dataSet[i], dataSet[j])
        for i in range(1, len(dataSet))
        for j in range(0, i)
    ]
    labels, error, nfound = PC.kmedoids(distans, nclusters=5, npass=10)
    cluster = dict()
    output = []
    for roommate, label in zip(dataSet, labels):
        cluster.setdefault(label, []).append(roommate)
    for label, grp in cluster.items():
        cluster_data = {}
        cluster_data["Roommate"] = grp
        cluster_data["Count"] = len(grp)
        cluster_data["label"] = str(label)
        output.append(cluster_data)
    return jsonify({"testsALL": output}, {"error": error}, {"nfound": nfound})
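# Note on the `distans` layout above: Pycluster accepts the left-lower
# triangle of the distance matrix as a flat sequence ordered
# d(1,0), d(2,0), d(2,1), d(3,0), ... A tiny illustrative example (values
# made up, not from the original code):
import Pycluster as PC

flat = [4.0,            # d(1,0)
        7.0, 3.0,       # d(2,0), d(2,1)
        8.0, 6.0, 1.0]  # d(3,0), d(3,1), d(3,2)
labels, err, nfound = PC.kmedoids(flat, nclusters=2, npass=10)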
def cluster(self, num_cluster):
    category_tfidf = self.category_tfidf
    categories = list(category_tfidf)
    random.shuffle(categories)
    tfidf_norms = {category: sum(value ** 2 for value in tfidf.values())
                   for category, tfidf in category_tfidf.items()}
    for category, norm in tfidf_norms.items():
        if not norm:
            raise Exception((category, category_tfidf[category]))
    # Build the distance matrix as a ragged list of rows (the left-lower
    # triangle), which Pycluster.kmedoids accepts directly.
    distances = []
    for i, category1 in enumerate(categories):
        cat1_tfidf = category_tfidf[category1]
        row_array = array([0.0] * i)
        for j, category2 in enumerate(categories):
            if j >= i:
                break
            row_array[j] = self.compute_distance(cat1_tfidf,
                                                 category_tfidf[category2],
                                                 tfidf_norms[category1],
                                                 tfidf_norms[category2])
        distances.append(row_array)
    clusterids, error, nfound = Pycluster.kmedoids(distances, num_cluster)
    print error
    category_clusters = [[] for _ in range(num_cluster)]
    print len(clusterids)
    print len(categories)
    print clusterids
    # Map each distinct medoid id to a dense id 0..num_cluster-1.
    clusterid_map = {}
    for i, medoid in enumerate(clusterids):
        category_id = clusterid_map.setdefault(medoid, len(clusterid_map))
        category_clusters[category_id].append(categories[i])
    return category_clusters
def Kmedoids(num_patches, samples, progress=None):
    """Estimate patches as centroids of samples using k-Medoids.

    This requires the `Pycluster` library to be installed.

    :param int num_patches: number of patches to create
    :type samples: 2D array
    :param samples: example patches
    :param progress: ignored
    :rtype: 2D array with `num_patches` rows and N columns, where N is the
       number of columns in `samples`.
    :return: created patches
    """
    logging.info("Learning %d prototypes per size by k-Medoids clustering" %
                 num_patches)
    import Pycluster
    dist = Pycluster.distancematrix(samples)
    cluster_ids, _, _ = Pycluster.kmedoids(dist, nclusters=num_patches)
    # `cluster_ids` contains `num_patches` unique values, each of which is
    # the index of the medoid for a different cluster.
    return samples[np.unique(cluster_ids)].astype(ACTIVATION_DTYPE)
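# Hypothetical driver for the function above; the random data is illustrative
# only, and ACTIVATION_DTYPE is assumed from the surrounding module.
import numpy as np

samples = np.random.rand(100, 64)   # 100 flattened 8x8 example patches
patches = Kmedoids(num_patches=5, samples=samples)
# `patches` is a (5, 64) array whose rows are actual rows of `samples`,
# i.e. the learned medoids.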
def clusterAllWithKMediods(lowerLimit, upperLimit, featMan, weightMatrix,
                           allTaskDict, samePairsSet, differentPairsSet,
                           outDir):
    # Note: allTaskDict and samePairsSet were accidentally commented out of
    # the parameter list along with the retired weightFile parameter, although
    # both are used below; they have been restored.
    data = featMan.returnKeys()
    #weightList = getWeightMatrixForKMedFromFile(featMan.returnLastId(),
    #                                            weightFile, data)
    weightList = getWeightMatrixForKMed(data, weightMatrix, 'kmediods')
    print len(weightList)
    metrics = {}
    for k in range(lowerLimit, upperLimit, 3):
        print 'Clustering with terms ', k
        cluster_list = []
        i = k  # (len(weightList)+1)/k
        if i == 0:
            i = 1
        clusArray, error, opt = clust.kmedoids(weightList, i, 10, None)
        clusters = {}
        for c in range(len(clusArray)):
            clusId = clusArray[c]
            if clusId not in clusters:
                clusters[clusId] = set()
            try:
                clusters[clusId].add(c)
            except:
                print c  # len(data)
        print 'Error and cluster length ', error, len(clusters)
        '''
        for clid, ind in clusters.items():
            print clid, ind
            for qind in sorted(ind):
                print 'query', featMan.returnQuery(qind),
            print
            for i1 in sorted(ind):
                for i2 in sorted(ind):
                    print 'i1 and i2', i1, i2
                    if i1 in weightMatrix and i2 in weightMatrix[i1]:
                        print i1, i2, 'matrix', weightMatrix[i1][i2], (weightList[i2])[i1]
        '''
        fname = outDir + '_' + str(i) + '.txt'
        oFile = open(fname, 'w')
        for entry in clusters.values():
            cluster_list.append(list(entry))
            qStr = toString(entry, featMan)
            oFile.write(qStr + '\n')
        oFile.close()
        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        #metrics[k] = getRecallPrecision(samePairsSet, differentPairsSet,
        #                                predictedSamePairsSet,
        #                                predictedDifferentPairsSet)
        #metrics[k] = getSamePairPrecisionRecallF1Calculator(samePairsSet,
        #                                                    predictedSamePairsSet)
        metrics[k] = getSetBasedLabelsAndMetric(cluster_list, allTaskDict,
                                                featMan)
    for tcount, met in metrics.items():
        print tcount, met
    return metrics
def cluster_domains(GDA, missing_da, p, starting_time):
    """Clusters the domain graph and makes a picture of the whole matrix map.

    Depending on `p.daonly`, each connected component is clustered with OPTICS
    (DBSCAN-style cluster extraction); otherwise the whole graph is clustered
    with k-medoids.

    Parameters
    ----------
    GDA : Graph
        undirected graph of domain similarities
    missing_da : list
        DA not in the graph: no edge (no similarity) to any other DA
    p : argument parser object
        parameter object
    starting_time : int
        program starting time

    Returns
    -------
    clusters : list
        a list of lists containing the clustered DA
    """
    # add each missing da as its own cluster
    clusters = [[da] for da in missing_da]
    if p.daonly:
        # to save memory, clustering is run separately on each connected component
        clusters_comp = nx.connected_components(GDA)
        all_unclustered = []
        for comp in clusters_comp:
            if len(comp) > p.minpts:
                H = GDA.subgraph(comp)
                # networkx returns a numpy.matrixlib.defmatrix.matrix;
                # convert to a plain ndarray
                mat = 1.0 - np.array(nx.to_numpy_matrix(H, nodelist=comp))
                mat.flat[::mat.shape[0] + 1] = 0  # set the diagonal to 0
                # run OPTICS on the distance matrix
                optics = Optics(p.minpts, epsilon=p.epsilon)
                ordered, reachability, core_dist = optics.run(mat)
                # DBSCAN-style cluster extraction from the OPTICS ordering
                labels = optics.cluster(p.epsilon_p)
                slabels = set(labels)
                for k in slabels:
                    ind = np.where(labels == k)[0]
                    if k == -1:
                        # noise points become singleton clusters
                        for i in ind:
                            clusters.append([comp[i]])
                    else:
                        clusters.append([comp[i] for i in ind])
            else:
                # if the component has fewer members than the minpts cutoff,
                # all members of the component go into the same cluster
                clusters.append(comp)
    else:
        nodes = GDA.nodes()
        # networkx returns a numpy.matrixlib.defmatrix.matrix instead of a
        # numpy.ndarray, which is not really convenient ...
        bigmat = 1.0 - np.array(nx.to_numpy_matrix(GDA, nodelist=nodes))
        bigmat.flat[::bigmat.shape[0] + 1] = 0  # diagonal to 0
        clusterid, error, nfound = Pycluster.kmedoids(bigmat,
                                                      nclusters=p.kcluster,
                                                      npass=10)
        for l in np.unique(clusterid):
            tmp_clust = [nodes[i] for i in range(clusterid.shape[0])
                         if clusterid[i] == l]
            clusters.append(tmp_clust)
    return clusters
#!/usr/bin/python -tt
import Pycluster as pc
import numpy as np
import sys
import re

filename, n = sys.argv[1], int(sys.argv[2])
dist = np.loadtxt(filename, usecols=range(0, 799))
clustermap, error, nfound = pc.kmedoids(dist, n, npass=1000)

# Count how many points each medoid owns.
medoids = {}
for i in clustermap:
    medoids[i] = medoids.get(i, 0) + 1

# Write the cluster map next to the input file.
output_filename = re.sub(r'/[^/]+$', '/kmedoids_result', filename)
np.savetxt(output_filename, clustermap, delimiter=" ", fmt="%d")
print clustermap
print "\n"
print medoids
print "nfound: " + str(nfound)
filename, n = sys.argv[1], int(sys.argv[2])
data = np.loadtxt(filename)
k = len(data)

# Calculate the distance matrix
m = np.zeros(k * k)
m.shape = (k, k)
for i in range(0, k):
    for j in range(i, k):
        d = dist(data[i], data[j])
        m[i][j] = d
        m[j][i] = d

# Perform the actual clustering
clustermap, _, _ = pc.kmedoids(m, n, npass=20)

# Find the indices of the points used as medoids, and the cluster masses
medoids = {}
for i in clustermap:
    medoids[i] = medoids.get(i, 0) + 1

# Print points, grouped by cluster
for i in medoids.keys():
    print "Cluster=", i, " Mass=", medoids[i], " Centroid: ", data[i]
    for j in range(0, len(data)):
        if clustermap[j] == i:
            print "\t", data[j]
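# `dist` is not defined in the fragment above; a plausible stand-in (an
# assumption, not the original helper) is plain Euclidean distance:
import numpy as np

def dist(a, b):
    # Euclidean distance between two feature vectors.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))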
    '19HombreHip hopVolleyballEstar con mi familia AmbasRom-comsIntrovertid@',
    '19MujerFlamenco Futbol DibujarAmbasThrillerExtrovertid@']

# td0/td1 and tc* record how long each part of the code takes to run
td0 = time.process_time_ns()

# pairwise edit distances (left-lower triangle, flattened)
dist2 = [distance.edit_distance(DataSetTFG[i], DataSetTFG[j])
         for i in range(1, len(DataSetTFG))
         for j in range(0, i)]
# time spent computing distances
td1 = time.process_time_ns() - td0

# timing for clustering with k=1
tc10 = time.process_time_ns()
# k-medoids clustering, k=1, npass=10
labels1, error1, nfound1 = PC.kmedoids(dist2, nclusters=1, npass=10)
# store the result in the dictionary cluster1
cluster1 = dict()
for roommate, label in zip(DataSetTFG, labels1):
    cluster1.setdefault(label, []).append(roommate)
for label, grp in cluster1.items():
    print(grp)
tc11 = time.process_time_ns() - tc10

# timing for clustering with k=2
tc20 = time.process_time_ns()
# k-medoids clustering, k=2, npass=10
labels2, error2, nfound2 = PC.kmedoids(dist2, nclusters=2, npass=10)
cluster2 = dict()
for roommate, label in zip(DataSetTFG, labels2):
# > Rather than using the mean (which is usually not a real datapoint),
# K-medoids uses a particular datapoint as the centroid. This makes K-medoids
# less sensitive to outliers, and in the case of text it can give an actual
# "representative" document of a topic.
#
# Valid values for `metric` are:
#
# from scikit-learn: ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock']
#
# from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
# 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
# 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
# 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
#
# See the documentation for scipy.spatial.distance for details on these
# metrics.

# <codecell>

# K-Medoid: other distances can be passed in via the metric argument.
import Pycluster
from sklearn.metrics import pairwise_distances

distances = pairwise_distances(X, metric='euclidean', n_jobs=1)
nb_clusters = 20
clusterid, error, nfound = Pycluster.kmedoids(distances,
                                              nclusters=nb_clusters,
                                              npass=100)

# <codecell>

# Look at the actual comments that are central to each cluster:
medoids = list(set(clusterid))
for m in medoids:
    print "------------------------"
    print "medoid ", m, ":", grouped_questions[m]
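# <codecell>

# A hedged variation (not in the original notebook): the same pipeline with
# cosine distance swapped in via the `metric` argument discussed above.
cos_distances = pairwise_distances(X, metric='cosine', n_jobs=1)
clusterid_cos, error_cos, nfound_cos = Pycluster.kmedoids(cos_distances,
                                                          nclusters=nb_clusters,
                                                          npass=100)

# <markdowncell>

# ##Non-Negative Matrix Factorization

# <codecell>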
        temp = []
        for i in range(0, len(points)):
            p2 = points[i]
            temp.append(distance_function(p1, p2))
        distances.append(temp)
    return distances

# def timespan(list):

nb_clusters = 15  # the number of clusters the dataset is supposed to be partitioned into
distances = get_distance_matrix(vectors, euclidean)
clusterid, error, nfound = Pycluster.kmedoids(distances,
                                              nclusters=nb_clusters,
                                              npass=100)
# remap medoid item-indices to dense cluster ids 0..n-1
uniq_ids = list(set(clusterid))
new_ids = [uniq_ids.index(val) for val in clusterid]
# print uniq_ids
# print new_ids

#############################################
# new_ids -> index:clusterid                #
# vectors -> index:location                 #
def matchsTestM(current_user):
    if not current_user.admin:
        return jsonify({'message': 'Cannot perform that function!'})
    tiempoTI = time.process_time_ns()
    tests = TestR.query.filter_by(gender="Mujer").all()
    dataSet = []
    # strip accents, diaereses, etc. before comparing strings
    trans_tab = dict.fromkeys(map(ord, u"\u0301\u0308"), None)
    for test in tests:
        resident = Residents.query.filter_by(public_id=test.public_id).first()
        test_data = ("-" + str(resident.id) + "-"
                     #+ test.gender
                     + " " + str(test.age)
                     + " " + test.musicGender
                     + " " + test.sport
                     + " " + test.hobbie
                     + " " + test.movieSeries
                     + " " + test.filmGender
                     + " " + test.tabaco
                     + " " + test.alcohol
                     + " " + test.party
                     + " " + str(test.ordenConvivencia)
                     + " " + str(test.ordenPersonal)
                     + test.personalidad)
        test_data = normalize(
            "NFKC", normalize("NFKD", test_data).translate(trans_tab))
        dataSet.append(test_data)
    tiempoPI = time.process_time_ns()
    tiempoII = time.process_time_ns()
    distans = [
        distance.edit_distance(dataSet[i], dataSet[j])
        for i in range(1, len(dataSet))
        for j in range(0, i)
    ]
    tiempoPF = (time.process_time_ns() - tiempoPI)
    tiempoPF = tiempoPF / 60000000000  # ns -> minutes
    tiempoCI = time.process_time_ns()
    labels, error, nfound = PC.kmedoids(distans, nclusters=10, npass=10)
    cluster = dict()
    datosCluster = []
    outputId = []
    for roommate, label in zip(dataSet, labels):
        cluster.setdefault(label, []).append(roommate)
    for label, grp in cluster.items():
        cluster_dataID = {}
        cluster_dataID["resultados"] = grp
        outputId.append(cluster_dataID)
    tiempoCF = (time.process_time_ns() - tiempoCI)
    tiempoIF = (time.process_time_ns() - tiempoII)
    tiempoCF = tiempoCF / 60000000000
    tiempoIF = tiempoIF / 60000000000
    returnObj = []
    sizeRooms = 4
    # build rooms of `sizeRooms` residents from each cluster
    for cluster in outputId:
        room = []
        for row in cluster.values():
            num = len(row)  # number of tests in the cluster
            residual = num % sizeRooms
            offset = 1
            for test in row:
                # extract and format data
                x = test.split("-")
                index = row.index(test)
                realI = index + 1
                room.append(x[1])
                if realI % sizeRooms == 0:
                    # room is full: store it and start a new one
                    returnObj.append(room[:])
                    room.clear()
                if (residual > 0
                        and realI > sizeRooms * (num // sizeRooms)):
                    # if the residual is 1, 2 or 3, the leftover residents go
                    # into the last, second-to-last or third-to-last room
                    counter = (len(returnObj)) - offset
                    #print(counter, x[1], "ok")
                    returnObj[counter].append(x[1])
                    offset += 1
    # write the rooms to the DB
    for rooms in returnObj:
        new_room = Rooms(state="Completo")
        db.session.add(new_room)
        db.session.commit()
        for x in range(0, len(rooms)):
            new_match = Matches(user_id=rooms[x], room_id=new_room.id)
            db.session.add(new_match)
            db.session.commit()
    tiempoTF = (time.process_time_ns() - tiempoTI)
    tiempoTF = tiempoTF / 60000000000
    datos_data = {}
    datos_data["tiempoT"] = tiempoTF
    datos_data["TimepoIA"] = tiempoIF
    datos_data["tiempoP"] = tiempoPF
    datos_data["tiempoC"] = tiempoCF
    datos_data["error"] = error
    datos_data["nS"] = nfound
    datosCluster.append(datos_data)
    return jsonify({"datosIA": datos_data})
def generate_kmedoid(self, locationid):
    trend_cross_trend_matrix, trends_list = self.get_matrix(locationid)
    clusterid, error, nfound = Pycluster.kmedoids(trend_cross_trend_matrix,
                                                  nclusters=4, npass=100)
    return clusterid, trends_list
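# A hedged helper (not from the original code) for consuming the two values
# returned by generate_kmedoid above: group trends by their medoid label,
# mirroring the grouping pattern used in the other snippets in this file.
def group_trends(clusterid, trends_list):
    groups = {}
    for medoid, trend in zip(clusterid, trends_list):
        groups.setdefault(medoid, []).append(trend)
    return groups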
words = readwordlist.read("rt_words.csv")
sysnet = {}  # assumed initialisation; the fragment uses sysnet without defining it
vectWords = []
for i in range(50):
    sysnet.setdefault(i, {'word': words[i], 'sysnet': wn.synsets(words[i])})
    vectWords.append(i)

totalElement = len(vectWords)
totalClusters = 10
distMatrix = numpy.ones((totalElement, totalElement), dtype=float)
for i in range(totalElement):
    for j in range(totalElement):
        distMatrix[i, j] = getDistance(i, j)

clusters = Pycluster.kmedoids(distMatrix, nclusters=totalClusters, npass=100)
print distMatrix
print clusters

# clusters[0] holds each word's medoid *item index* (not a 0..k-1 label), so
# group by medoid directly; the original filtered on `< totalClusters`, which
# silently dropped most clusters.
groups = {}
for i in range(len(clusters[0])):
    groups.setdefault(clusters[0][i], []).append(sysnet[i]['word'])
for key, value in groups.items():
    print "\n***********************************\n"
    for v in value:
        print v

#print findclusters(vectWords,100)
#plt.show()
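# The `getDistance` helper used above is not shown in this fragment; a
# plausible stand-in (an assumption, not the original) based on WordNet path
# similarity between the first synsets of the two words:
def getDistance(i, j):
    if i == j:
        return 0.0
    si, sj = sysnet[i]['sysnet'], sysnet[j]['sysnet']
    if not si or not sj:
        return 1.0  # no synsets available: treat as maximally distant
    sim = si[0].path_similarity(sj[0])
    return 1.0 - sim if sim is not None else 1.0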