def findcenters(x, n=1000, k=6):
    # get dimensions
    m = x.shape[1]
    # create centers as empty
    centers = DataFrame(np.zeros(shape=(k, m)))
    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters=k, transpose=0,
                                          method='a', dist='e', npass=1)
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # sort centers by their distance to the origin so the cluster
        # ordering is comparable across runs
        center = sorted(center,
                        key=lambda t: np.linalg.norm(np.array(t)),
                        reverse=True)
        # accumulate; the average over the n runs is taken below
        for j in range(k):
            centers.iloc[j, :] = centers.iloc[j, :] + center[j]
    centers = centers / n
    return centers

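# A minimal usage sketch for findcenters (not from the original source):
# averaging the sorted centroids over repeated kcluster runs smooths out the
# randomness of k-means initialization. The data here is random and purely
# illustrative; the imports match the names the function assumes.
import numpy as np
import Pycluster
from pandas import DataFrame

x = np.random.rand(200, 4)            # 200 hypothetical points in 4 dimensions
centers = findcenters(x, n=50, k=6)
print(centers)
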
import scipy as sp
import Pycluster as pcl

def pyclustertest():
    data = sp.rand(100, 4)
    cid, e, n = pcl.kcluster(data)
    # the original passed an undefined name `D` here; it should be `data`
    centroids, cmask = pcl.clustercentroids(data, clusterid=cid)
    print data
    print centroids

def myCKDemo(filename, n):
    # load the columns used for the cluster analysis (indices count from 0)
    data = np.loadtxt(filename, delimiter=",", usecols=(2, 4, 14, 8))
    # these columns hold each city's longitude/latitude coordinates,
    # used for the final scatter plot
    xy = np.loadtxt(filename, delimiter=",", usecols=(2, 4))
    # clustermap records the cluster id assigned to each row
    clustermap = pc.kcluster(data, n)[0]
    # centroids holds the center coordinates of the clusters
    centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
    # m is the distance matrix
    m = pc.distancematrix(data)
    # mass counts how many points fall in each cluster
    mass = np.zeros(n)
    for c in clustermap:
        mass[c] += 1
    # sil is the silhouette matrix: one row per point, one column per cluster
    sil = np.zeros(n * len(data))
    sil.shape = (len(data), n)
    for i in range(0, len(data)):
        for j in range(i + 1, len(data)):
            d = m[j][i]
            sil[i, clustermap[j]] += d
            sil[j, clustermap[i]] += d
    for i in range(0, len(data)):
        sil[i, :] /= mass
    # the silhouette coefficient evaluates clustering quality; it lies in
    # [-1, 1] and larger is better. Below 0 means a point is on average
    # closer to another cluster than to its own; values near 1 indicate
    # a good clustering.
    s = 0
    for i in range(0, len(data)):
        c = clustermap[i]
        a = sil[i, c]
        b = min(sil[i, range(0, c) + range(c + 1, n)])
        si = (b - a) / max(b, a)
        s += si
    print n, s / len(data)
    # scatter plot with matplotlib
    fig, ax = pl.subplots()
    # cmap distinguishes the clusters by color
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    # xy holds longitude/latitude, so the plot shows each city's
    # geographic position
    x = [list(d)[0] for d in xy]
    y = [list(d)[1] for d in xy]
    cax = ax.scatter(x, y, c=clustermap, s=30, cmap=cmap, vmin=0, vmax=n)
    pl.show()

def _G(self, data, K):
    # negated Euclidean distance from every sample to each of K k-means
    # cluster centers (samples are columns of `data`)
    labels, _, _ = Pycluster.kcluster(data.T, K)
    centers, _ = Pycluster.clustercentroids(data.T, clusterid=labels)
    centers = centers.T
    G = zeros((K, data.shape[1]))
    for k in range(K):
        D = data - expand_dims(centers[:, k], axis=1)
        G[k, :] = -sqrt(sum(multiply(D, D), axis=0))
    return G

def clustering(file_path, k, dist_measure, PLOT):
    """
    Do the K-means clustering for input data.

    @param file_path: Input data file.
    @param k: Number of centers in K-means algorithm.
    @param dist_measure: Distance measure (in this case, we use Manhattan
                         distance).
    @param PLOT: Bool variable, check if plot the result (set it as True
                 only in testing).
    @return: Cluster ids for all data points in the input data file.
    """
    data = numpy.genfromtxt(file_path, delimiter=',')
    if len(data.shape) == 1:
        return [-1]

    print "-- Processing file: " + file_path + " -- Data points: " + str(len(data))
    print "-- Start clustering"

    k = set_k(len(data), k)
    ite_num = method_name(len(data))

    # Do the K-means clustering
    cluster_id, _, _ = Pycluster.kcluster(data, nclusters=k, mask=None,
                                          weight=None, transpose=0,
                                          npass=ite_num, method='a',
                                          dist=dist_measure, initialid=None)
    if PLOT is False:
        return cluster_id

    # Draw the clustering result plot.
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)
    if PLOT:
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)
    else:
        data_2d = data
        centroids_2d = centroids

    color = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600',
             '#0099CC', '#8900CC', '#FF0000', '#FF9900', '#FFFF00',
             '#00CC01', '#0055CC']
    for i in range(k):
        scatter(data_2d[cluster_id == i, 0], data_2d[cluster_id == i, 1],
                color=color[i % 12])
    plot(centroids_2d[:, 0], centroids_2d[:, 1], 'sg', markersize=8)
    show()
    return cluster_id

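# A self-contained sketch of the core call above (not the original set_k or
# method_name helpers, which are app-specific): in Pycluster.kcluster,
# dist='b' selects the city-block (Manhattan) metric, which is what
# dist_measure is expected to carry here. Data is random and illustrative.
import numpy
import Pycluster

pts = numpy.random.rand(50, 3)
ids, err, nfound = Pycluster.kcluster(pts, nclusters=4, npass=10,
                                      method='a', dist='b')
print(ids)
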
def clusterAndPlot(df, k, height=10, engine='PyCluster', cmap='spectral'):
    '''calculate and plot kmeans clustering'''
    fig, axes = plt.subplots(k + 1, figsize=(18, height), sharex='all', sharey='all')
    if engine == 'scipy':
        # the original bound this result to `label`, then read `labels`
        # below; the names are unified here
        centroids, labels = kmeans2(df, k, iter=100, thresh=1e-05)
    else:
        labels, error, nfound = Pycluster.kcluster(df, k)
    df['label'] = labels
    colors = nColors(k=k, cmap=cmap)
    # plot each cluster on its own axis, and every cluster mean on the last one
    for l, g in df.groupby('label'):
        g.T.plot(ax=axes[l], legend=0, c=colors[l], alpha=.2)
        axes[l].set_title('cluster %d, %d zipcodes' % (l, len(g)))
        pd.Series(g.mean(0)).plot(ax=axes[-1], label='cluster %d' % (l), c=colors[l])
    # plt.legend()
    return df

def cluster_kmedoids(self, k=2, npass=50):
    # Uses the distance matrix to produce a partition into k classes;
    # npass is the number of iterations
    c, err, nfound = pc.kmedoids(self.zd, k, npass=npass)
    return partition(c, self.mat)

def suggest(self, word):
    v = self.analyze(word)
    # pick first x: score every indexed word by similarity, best first
    res = []
    for nword, nv in self.ndx.items():
        wsim = self.compute_similarity([v, nv])
        res.append((wsim, nword, self.as_vector(nv)))
    res.sort()
    res = res[::-1]
    # from the first y, pick the most distant ones: cluster the vectors
    # and keep one word per cluster
    res2 = [v for (sim, word, v) in res]
    resw = [word for (sim, word, v) in res]
    lab, err, nfound = Pycluster.kcluster(res2, 40)
    resg = defaultdict(lambda: [])
    for i, l in enumerate(lab):
        resg[l] += [res[i]]
    res_sug = []
    used_groups = set()
    for l, w in zip(lab, resw):
        if not l in used_groups:
            res_sug += [w]
            used_groups.add(l)
    return res_sug

def kmedoids_cluster(self, similarities=None):
    # https://jpcomputing.wordpress.com/2014/05/18/pycluster-kmedoids-example/
    import numpy as np
    import Pycluster
    from scipy.spatial.distance import squareform
    from sklearn.metrics import silhouette_score

    distances = []
    if similarities is None:
        for page_id, sim_vector in self._pages_similarities.iteritems():
            distances.append([1 - x[1][USED_DISTANCE] for x in sim_vector])
    else:
        for x in similarities:
            distances.append([1 - a for a in x])
    np_distances = np.asarray(distances)
    squareform_distances = squareform(np_distances)

    # the number of clusters the dataset is supposed to be partitioned into
    nb_clusters = 2
    clusterid, error, nfound = Pycluster.kmedoids(squareform_distances,
                                                  nclusters=nb_clusters,
                                                  npass=50)
    print 'clusterid: ', len(set(clusterid)), clusterid
    res = silhouette_score(np_distances, clusterid, metric='precomputed')
    print 'Res: ', res
    return

    # note: unreachable after the return above — grouping to clusters
    clusters_indexes = {}
    for i, medoid in enumerate(clusterid):
        if medoid not in clusters_indexes:
            clusters_indexes[medoid] = [i]
        else:
            clusters_indexes[medoid].append(i)

def grapeCluster(vectors, iterationCountPerBurst, maximumPixelDiameter, minimumPixelDiameter):
    # If we have no vectors, return an empty array
    if not vectors:
        return []
    # Assign all vectors to a single cluster
    globalClusters = [numpy.array(vectors)]
    globalCount = len(vectors)
    globalClusterMeans = []
    # While there are globalClusters,
    while globalClusters:
        # Pop the last cluster
        globalCluster = globalClusters.pop()
        # Measure its size
        sizeCategory = measureClusterSize(globalCluster, maximumPixelDiameter, minimumPixelDiameter)
        # If it is too big, burst it in two
        if sizeCategory > 0:
            # assignments = scipy.cluster.vq.kmeans2(globalCluster, k=2, iter=iterationCountPerBurst)[1]
            assignments = Pycluster.kcluster(globalCluster, npass=iterationCountPerBurst)[0]
            # Extract localClusters
            booleanAssignments = numpy.array(assignments) > 0
            localClusters = globalCluster[booleanAssignments], globalCluster[~booleanAssignments]
            # Push localClusters to the end of the stack
            globalClusters.extend(localClusters)
        # If it is the right size, append the weighted mean
        elif sizeCategory == 0:
            globalClusterMeans.append(computeWeightedMean(globalCluster))
        # Show feedback
        view.printPercentUpdate(globalCount - len(globalClusters), globalCount)
    view.printPercentFinal(globalCount)
    return globalClusterMeans

def clusterSessionsKmed(featMan, weightFile):
    data = featMan.returnKeys()
    weightList = getWeightMatrixForKMedFromFile(featMan.returnLastId(),
                                                weightFile, data)
    cnt = 0
    kclusters = {}
    for k in range(4, 5, 2):
        i = (len(weightList) + 1) / k
        if i == 0:
            i = 1
        clusArray, error, opt = clust.kmedoids(weightList, i, 10, None)
        print error, len(clusArray)
        clusters = {}
        for c in range(len(clusArray)):
            clusId = clusArray[c]
            q = featMan.returnQuery(c)
            if len(q) > 1:
                if clusId not in clusters:
                    clusters[clusId] = set()
                clusters[clusId].add(q)
                cnt += 1
        kclusters[k] = clusters.values()
    print 'Cluster with kmed ', len(clusters), cnt, ' queries'
    return kclusters[4]

def testPricesDiffsVecsKmeansClustering(self):
    """Testing whether k-means clustering with price-differences
    vectors works."""
    prices_diffs_vecs = utils.make_prices_diffs_vecs(self.data1)

    labels, wcss, n = Pycluster.kcluster(prices_diffs_vecs, 3, npass=100)
    clusters = utils.make_groups_from_labels(labels, self.data1)

    # The result should be something like this, modulo group numbers.
    # With npass=100 the probability of a different grouping is (I think)
    # very low, but it can happen.
    suggested_clusters = {0: ['E'], 1: ['A', 'D'], 2: ['B', 'C']}

    # Let's check this.
    num_matches = 0
    for cluster in clusters.values():
        cluster.sort()
        for suggested_cluster in suggested_clusters.values():
            suggested_cluster.sort()
            if cluster == suggested_cluster:
                num_matches = num_matches + 1

    # Each suggested cluster exists in the output of our kcluster call,
    # and because the length of the clusters dict is 3 we can be sure
    # these dictionaries are equal.
    self.assertEqual(num_matches, 3)
    self.assertEqual(len(clusters), 3)

def cluster(parser, k):
    """general method for clustering data"""
    # get an index number for every page
    code_book = parser.get_data_encoding(page_min_occurance=5)
    # use only the sequence of pages visited
    simple_session = [session for session in parser.get_simple_sessions()
                      if config.session_filter_fn(session)]
    # vector representation (v1, v2, ...) where v1 means page v1 was visited
    # models = session_modeling.convert_sessions_to_vector(simple_session, code_book, binary=True)
    # construct Markov chains, estimate transition probabilities
    models = session_modeling.convert_sessions_to_markov(simple_session,
                                                         code_book, bayes=False)
    idx, sse, _ = Pycluster.kcluster(models, k, method='a', dist='e')
    # idx, sse, _ = cluster_kmedoids(models, k, string_similarity.jaccard_distance)
    clusters = {}
    for name, clusterid in zip(simple_session, idx):
        clusters.setdefault(clusterid, []).append(name)
    return clusters, sse

def cluster_kmedoids(sessions, clusters, distance_fn=string_similarity.jaccard_distance):
    """k-medoids clustering; requires a distance matrix, therefore slow"""
    distances = compute_distances(sessions, distance_fn)
    clusterids, error, nfound = Pycluster.kmedoids(distances, nclusters=clusters)
    return clusterids, error, nfound

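# A small, self-contained sketch of the Pycluster.kmedoids convention used
# above (toy distances, not the session data): the input is a distance
# matrix — here a list of lower-triangular rows — and each returned id is
# the index of the cluster's medoid, not a dense 0..k-1 label.
import Pycluster

dist = [
    [],                  # lower-triangular rows: dist[i][j] for j < i
    [1.0],
    [6.0, 5.0],
    [7.0, 6.0, 1.5],
]
ids, err, nfound = Pycluster.kmedoids(dist, nclusters=2, npass=5)
print(ids)               # e.g. [0 0 3 3]: items 0,1 and 2,3 share medoids 0 and 3
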
def clusterSessionsPre(catQueryDist, featMan, weightMatrix):
    tclusters = {}
    print len(catQueryDist)
    for termCount in range(4, 5):
        tclusters[termCount] = []
        for cat, qSet in catQueryDist.items():
            if len(qSet) > 1:  # and cat in pairs:
                k = len(qSet) / termCount
                if k == 0:
                    k = 1
                qList = list(qSet)
                catDist = getWeightMatrixForKMed(qList, weightMatrix)
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
                clusters = {}
                for c in range(len(clusArray)):
                    clusId = clusArray[c]
                    if clusId not in clusters:
                        clusters[clusId] = []
                    qc = featMan.returnQuery(qList[c])
                    if len(qc) > 1:
                        clusters[clusId].append(qc)
                for entry in clusters.values():
                    tclusters[termCount].append(entry)
    print len(tclusters[4])
    return tclusters[4]

def multikmeans(self, krange=None):
    # The magic recipe
    if krange == None:
        kr = np.arange(2, len(self.mat) - 1)
    else:
        kr = krange
    lmat = len(self.mat)
    # counts how many times each pair of documents is clustered together
    accords = np.zeros((lmat, lmat), dtype=int)
    # a deque is used instead of a list to save time and memory
    t = deque()
    t0 = time()
    k2s = lambda x: x * 0.85
    tunits = k2s(np.array(kr)).sum()
    # the loop itself
    for k in kr:
        t1 = time()
        # k-means
        c, err, nfound = pc.kcluster(self.mat, k)
        # update the agreement counts
        for i in np.unique(c):
            accords[c == i] += c == i
        # predict the remaining time
        t2 = time()
        tunits -= k2s(k)
        t.append((t2 - t1) / k2s(k))
        prediction = tunits * np.mean(tuple(t)[-20:])
        print "k={0}: \t{1} ({2} since start) \t{3} to go".format(
            k, human_time(t2 - t1), human_time(t2 - t0), human_time(prediction))
    return accords / float(k)

def DoClustering(self, nclusters=30):
    '''Main clustering function'''
    gx = self._gx
    func = self._scale_function
    nid, jm, am, fg = zip(*[(x,
                             gx.node[x]['JuvenileMass'],
                             gx.node[x]['AdultMass'],
                             gx.node[x]['FunctionalGroup'])
                            for x in gx.node.keys()])
    data = np.c_[func(jm), func(am)]
    if self._normalize_data == True:
        data = whiten(data)
    data = np.c_[data, 1000 * np.array(fg)]
    if self._algorithm == Aggregation._HIERARCHICAL_CLUSTERING:
        if not self._tree_done:
            if self._distance_matrix:
                self._tree = pc.treecluster(distancematrix=self._distance_matrix)
            else:
                self._tree = pc.treecluster(data)
            self._tree_done = True
            self._data = data
            self._nodes_ids = nid
        clusters_ids = self._tree.cut(nclusters)
    self._clusters_ids = clusters_ids
    self._nclusters = len(np.unique(self._clusters_ids))
    cluster_attrib = dict(zip(nid, clusters_ids))
    nx.set_node_attributes(gx, 'cluster', cluster_attrib)
    self._gx = gx
    # sanity check: every cluster must contain a single functional group
    for cid in clusters_ids:
        fg = [gx.node[x]['FunctionalGroup'] for x in gx.node.keys()
              if gx.node[x]['cluster'] == cid]
        if len(np.unique(fg)) != 1:
            raise Exception('Multiple functional groups inside the same cluster!')

def clusters(labels, data, k):
    # co-assignment matrix: nx[i][j] == 1 if items i and j land in the
    # same k-means cluster in this run
    kclus = Pycluster.kcluster(data, k, npass=1)[0]
    nx = numpy.zeros((len(labels), len(labels)), dtype=numpy.float32)
    for ind1 in range(len(labels)):
        for ind2 in range(len(labels)):
            if kclus[ind1] == kclus[ind2]:
                nx[ind1][ind2] = 1
    print k, " of ", len(labels)
    return nx

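# Hedged sketch, not from the original code: co-assignment matrices like the
# one built above are typically averaged over many runs and thresholded to
# get a consensus clustering. The data, run count, and 0.5 threshold here
# are all made up for illustration.
import numpy
import Pycluster

data = numpy.random.rand(30, 5)
labels = range(30)
consensus = numpy.zeros((30, 30), dtype=numpy.float32)
for run in range(10):
    consensus += clusters(labels, data, 4)
consensus /= 10.0
print((consensus > 0.5).sum())   # pairs clustered together in most runs
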
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0,
                                      method='a', dist='e', npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    return x

def findk(x, n=1000, minK=2, maxK=20):
    errors = []
    # fit a k-means clustering for each candidate k
    for i in range(minK, maxK + 1, 1):
        _, error, nfound = Pycluster.kcluster(x, nclusters=i, transpose=0,
                                              method='a', dist='e', npass=n)
        # collect the within-cluster error for this k
        errors.append(error)
        print i
    print errors

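# Hedged follow-up, an assumption rather than part of the original: findk
# prints the within-cluster error for each k, and a common way to pick k is
# the "elbow" where that curve flattens. One crude way to locate it, on a
# hypothetical error curve:
import numpy as np

minK = 2
errors = np.array([120.0, 60.0, 35.0, 30.0, 28.0, 27.5])  # hypothetical findk output
gains = errors[:-1] - errors[1:]
# first k whose improvement falls below 25% of the initial improvement
elbow = minK + int(np.argmax(gains < 0.25 * gains[0]))
print(elbow)   # -> 4 for this toy curve
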
def cluster_spw_rpw(list_of_recs):
    number_of_clusters = 8
    only_serve_return = []
    if list_of_recs == []:
        print "ERROR"
    for rec in list_of_recs:
        only_serve_return.append([float(rec[0]), float(rec[1])])
    k = get_k_value(only_serve_return)
    labels, error, nfound = Pycluster.kcluster(scipy.array(only_serve_return), k)
    return labels

def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0,
                                      method='a', dist='e', npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    # count how many items are in each group
    labels = list(labels)
    for i in range(k):
        print labels.count(i)
    return x

import scipy
import Pycluster

def cluster():
    x = [[76.0, 32.0], [63.0, 40.0], [70.0, 30.0], [64.0, 45.0]]
    k = 2
    labels, error, nfound = Pycluster.kcluster(scipy.array(x), k)
    print "Input data:"
    print " spw " + " rpw"
    j = 1
    for i in x:
        print str(j) + ") " + str(i[0]) + " " + str(i[1])
        j += 1
    print " "
    print "clusters: " + str(labels)

def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.GetoptError, msg:
            raise Usage(msg)
        try:
            nodesFile = argv[1]
        except IndexError:
            raise Error("Not enough arguments provided to script.")
        nodes = readNodesFromTxtForKmeans(nodesFile)
        nodes = numpy.array(nodes)
        # k-means via Pycluster; method='m' uses the median as the
        # cluster center
        results = Pycluster.kcluster(array(nodes), nclusters=30, npass=50,
                                     method='m')
        assignments = results[0]
        xs = [node[0] for node in array(nodes)]
        ys = [node[1] for node in array(nodes)]
        clusterByClusterID = collections.defaultdict(list)
        for x, y, clusterID in itertools.izip(xs, ys, assignments):
            clusterByClusterID[clusterID].append((x, y))
        pylab.figure()
        pylab.hold(True)
        colors = ['r', 'b', 'g', 'c', 'm', 'y', 'k', 'w', '#ff6c01', '#00cd00']
        for clusterID, color in itertools.izip(clusterByClusterID.keys(),
                                               itertools.cycle(colors)):
            plotCluster(clusterByClusterID[clusterID], color)
        print results[1], results[2]
    except Usage, err:
        # (assumed) standard handler for the Usage boilerplate; the original
        # snippet was truncated before the outer except clause
        print >> sys.stderr, err
        return 2

def clusterCatWithMediods(lowerLimit, upperLimit, featMan, weightMatrix,
                          samePairsSet, differentPairsSet, catQueryDist,
                          outFile='cat-clusters-with-med.txt'):
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit):
        cluster_list = []
        i = 0
        oFile = open(outFile + str(noTerms) + '.txt', 'w')
        for cat, qSet in catQueryDist.items():
            if len(qSet) > 1:  # and cat in pairs:
                k = len(qSet) / noTerms
                if k == 0:
                    k = 1
                qList = sorted(list(qSet), reverse=True)
                catDist = getWeightMatrixForKMed(qList, weightMatrix,
                                                 'cat_kmediods')
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
                clusters = {}
                for c in range(1, len(clusArray)):
                    clusId = clusArray[c]
                    if clusId not in clusters:
                        clusters[clusId] = set()
                    clusters[clusId].add(qList[c - 1])
                for entry in clusters.values():
                    cluster_list.append(list(entry))
                    qStr = toString(entry, featMan)
                    oFile.write(cat + '\t' + qStr + '\n')
                print 'Clust category', cat, 'length', len(clusters), \
                    'Queries', len(qSet), 'k', k, 'error', error, opt
                if i % 5 == 0:
                    print i
                i += 1
        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)
        oFile.close()
    for tcount, met in metrics.items():
        print tcount, met
    return metrics

def kmeans(data, **kwargs):
    """
    Perform k-means clustering on unstructured N-dimensional data.

    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (returned number of
          clusters may be less than k).
        - npasses: The number of times the k-means clustering algorithm is
          performed, each time with a different (random) initial condition.
        - method: describes how the center of a cluster is found:
            - method=='a': arithmetic mean.
            - method=='m': median.
        - initialCenters: a set of points that should be used as the
          initial cluster centers

    @rtype: tuple
    @return: A list where each element indicates the cluster membership of
        the corresponding index in the original data, and a message string
    """
    k = 1
    npasses = 1
    method = 'a'
    initialCenters = None
    smartCenters = False
    msg = ''

    if 'numClusters' in kwargs:
        k = int(kwargs['numClusters'])
    if 'npasses' in kwargs:
        npasses = int(kwargs['npasses'])
    if 'method' in kwargs:
        method = kwargs['method']
    if 'initialCenters' in kwargs:
        initialCenters = kwargs['initialCenters']
    if 'smartCenters' in kwargs:
        smartCenters = kwargs['smartCenters']

    logData = tm.getMethod('log')(data)
    if initialCenters is None:
        # the original tested `is not None` here, which sent the
        # no-initial-centers case into the branch that indexes
        # initialCenters; the branches are swapped back here
        (clusterIDs, err, nOpt) = pc.kcluster(logData, k, npass=npasses,
                                              method=method)
        msg = "Number of rounds optimal solution was found: %i" % nOpt
    else:
        logCenters = tm.getMethod('log')(np.array(initialCenters[:k]))
        (centroids, clusterIDs) = kmeans2(logData, logCenters, minit='matrix')
        if len(np.unique(clusterIDs)) < k:
            wx.MessageBox('Warning: One or more of the returned clusters are '
                          'empty. Please choose different initial cluster '
                          'centers and re-run k-means for better results.',
                          'Insufficiently varied cluster centers',
                          wx.OK | wx.ICON_WARNING)
    return clusterIDs, msg

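# Hedged usage sketch for the wrapper above. It assumes the module's own
# imports are in place (pc as Pycluster, np as numpy) and that `tm` is the
# host app's transform registry whose 'log' method behaves like np.log; the
# data is made up and kept positive so the log transform is safe.
import numpy as np

data = np.abs(np.random.randn(100, 3)) + 1.0
ids, msg = kmeans(data, numClusters=4, npasses=20, method='a')
print(msg)   # e.g. "Number of rounds optimal solution was found: 7"
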
def Kmedoids(num_patches, samples, progress=None):
    """Estimate patches as centroids of samples using k-medoids.

    This requires the `Pycluster` library to be installed.

    :param int num_patches: number of patches to create
    :type samples: 2D array
    :param samples: example patches
    :param progress: ignored
    :rtype: 2D array with `num_patches` rows and N columns, where N is the
       number of columns in `samples`.
    :return: created patches
    """
    logging.info("Learning %d prototypes per size by k-medoids clustering" %
                 num_patches)
    import Pycluster
    dist = Pycluster.distancematrix(samples)
    cluster_ids, _, _ = Pycluster.kmedoids(dist, nclusters=num_patches)
    # `cluster_ids` contains `num_patches` unique values, each of which is
    # the index of the medoid for a different cluster.
    return samples[np.unique(cluster_ids)].astype(ACTIVATION_DTYPE)

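# Self-contained sketch of the distancematrix -> kmedoids pattern above
# (random data, Pycluster's default Euclidean distance; float32 stands in
# for ACTIVATION_DTYPE here):
import numpy as np
import Pycluster

samples = np.random.rand(20, 6).astype(np.float32)
dist = Pycluster.distancematrix(samples)            # lower-triangular rows
ids, err, nfound = Pycluster.kmedoids(dist, nclusters=3, npass=10)
patches = samples[np.unique(ids)]                   # one medoid row per cluster
print(patches.shape)                                # (3, 6)
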
def reassignClusterIDs(src, dst):
    """
    Given the cluster centers for two clusterings, determine the centers
    most similar to each other and reassign the cluster ids to match.
    """
    srcFCS = DataStore.getData()[src[0]]
    dstFCS = DataStore.getData()[dst[0]]

    srcdata = srcFCS.data
    if srcFCS.selDims:
        srcdata = dh.filterData(srcFCS.data, srcFCS.selDims)
    srcids = srcFCS.clustering[src[1]]
    srccenters = pc.clustercentroids(srcdata, clusterid=srcids)[0]

    dstdata = dstFCS.data
    if dstFCS.selDims:
        dstdata = dh.filterData(dstFCS.data, dstFCS.selDims)
    dstids = dstFCS.clustering[dst[1]]
    dstcenters = pc.clustercentroids(dstdata, clusterid=dstids)[0]

    srcsep = separate(srcdata, srcids)
    dstsep = separate(dstdata, dstids)

    centerEQ = {}
    taken = []
    # Fill the map with the closest source center for each destination center
    for i, dc in enumerate(dstcenters):
        bestDist = -1
        for j, sc in enumerate(srccenters):
            if (j not in taken):
                dist = nonSymmetricClusterDistance(dstsep[i], srcsep[j])
                if (bestDist < 0) or (dist < bestDist):
                    bestDist = dist
                    centerEQ[i] = j
        taken.append(centerEQ[i])
    # Renumber the cluster IDs in the destination to match the IDs of the
    # closest src center
    tmp = [centerEQ[id] for id in dstids]
    DataStore.getData()[dst[0]].clustering[dst[1]] = tmp

def main():
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    logging.debug('Reading %s', args.input.name)
    src_nodes = np.loadtxt(args.input, dtype=int)

    lattice_width, lattice_height = src_nodes.shape
    lattice = nx.grid_2d_graph(lattice_width, lattice_height)

    # find and remove wall nodes
    wall_nodes = map(lambda e: tuple(e), np.transpose(np.nonzero(src_nodes)))
    lattice.remove_nodes_from(wall_nodes)
    assert len(lattice.nodes()) == (lattice_width * lattice_height - len(wall_nodes))

    nodelist = list(lattice.nodes())
    node_ids = {n: i for i, n in enumerate(nodelist)}
    assert len(nodelist) == len(node_ids)

    # compute the normalized laplacian
    norm_lapl = normalized_laplacian(lattice, nodelist, node_ids)
    # compute eigenvalues and eigenvectors
    eigen_val, eigen_vec = np.linalg.eig(norm_lapl)

    # k-means on the first kappa+1 eigenvectors (spectral clustering)
    labels, _, _ = Pycluster.kcluster(eigen_vec[:, :args.kappa + 1],
                                      args.kappa, dist='e', npass=100,
                                      initialid=None)

    # assign colors
    colors = [COLORS[i] for i in labels]
    assert len(colors) == len(labels)

    # build a lattice_height x lattice_width grid of colors
    grid = []
    colored, non_colored = 0, 0
    for i in xrange(lattice_height):
        grid.append([])
        for j in xrange(lattice_width):
            node_id = node_ids.get((i, j))
            color = colors[node_id] if node_id is not None else BLACK
            grid[i].append(color)
            if color == BLACK:
                non_colored += 1
            else:
                colored += 1
    assert non_colored == len(wall_nodes)
    display(grid)

def DoClustering(self, nclusters=30, distance_matrix=None):
    # avoid doing the work twice
    if not self._tree_done:
        df_nc = self._df_nodes[self._df_nodes['ID'] >= 0].copy()
        data = df_nc[['JuvenileMass', 'AdultMass']]
        data = data.as_matrix()
        data = self._scale_function(data)
        if self._normalize_data == True:
            data = whiten(data)
        data = np.c_[data, 100. * df_nc.FunctionalGroup.values]
        if distance_matrix:
            self._tree = pc.treecluster(distancematrix=distance_matrix)
        else:
            self._tree = pc.treecluster(data)
        self._data = data
        self._tree_done = True
    self.FillClusterIndividualData(self._tree.cut(nclusters))

def clusterCatWithMediodsAndNetwork(threshold, lowerLimit, upperLimit,
                                    featMan, weightMatrix, samePairsSet,
                                    differentPairsSet, catQueryDist,
                                    catNetwork,
                                    outFile='cat-clusters-with-med.txt'):
    # cluster each category, find the outliers and move them to parents
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit, 2):
        cluster_list = []
        i = 0
        oFile = open(outFile + str(noTerms) + '.txt', 'w')
        for cat, qSet in catQueryDist.items():
            if len(qSet) > 1:  # and cat in pairs:
                k = len(qSet) / noTerms
                if k == 0:
                    k = 1
                qList = list(qSet)
                catDist = getWeightMatrixForKMed(qList, weightMatrix)
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
                clusters = {}
                for c in range(len(clusArray)):
                    clusId = clusArray[c]
                    if clusId not in clusters:
                        clusters[clusId] = set()
                    clusters[clusId].add(qList[c])
                # outliers = getOutliers(qList, catDist)
                for entry in clusters.values():
                    cluster_list.append(list(entry))
                    qStr = toString(entry, featMan)
                    oFile.write(cat + '\t' + qStr + '\n')
                print 'Clust ', cat, len(clusters), error, opt
                if i % 50 == 0:
                    print i
                i += 1
        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        key = str(threshold) + '_' + str(noTerms)
        metrics[key] = getRecallPrecision(samePairsSet, differentPairsSet,
                                          predictedSamePairsSet,
                                          predictedDifferentPairsSet)
        oFile.close()
    for tcount, met in metrics.items():
        print tcount, met
    return metrics

def cluster(fname, nclust):
    fh = open(fname, 'r')
    lines = fh.readlines()
    fh.close()
    clusters = int(nclust)
    points = []
    points_r = []
    dates = []
    volumes = []
    close_prices = []
    for i in range(len(lines)):
        if i <= 1:
            continue
        line_c = lines[i - 1].strip().split(',')
        close_price = float(line_c[0])
        volume = float(line_c[1])
        points_r.append((close_price, volume))
        volumes.append(volume)
        close_prices.append(close_price)
    volume_z = np.array(volumes)
    close_price_z = np.array(close_prices)
    points = zip(close_price_z, volume_z)
    # deterministic initial assignment: split the points into nclust
    # roughly equal consecutive runs
    init_data = []
    k = len(points) / (nclust)
    for i in range(nclust - 1):
        for j in range(k):
            init_data.append(i)
    while (len(points) != len(init_data)):
        init_data.append(nclust - 1)
    labels, error, nfound = Pycluster.kcluster(points, clusters, None, None,
                                               0, 1, 'a', 'e', init_data)
    labels_sorted = sort_labels(labels)
    print labels_sorted
    return labels_sorted

def generate_network_clusters(G):
    # Creates the cluster partitions using hierarchical clustering on
    # geodesic distances.
    # First check to make sure the given network is a single fully
    # connected component.
    if len(NX.component.connected_component_subgraphs(G)) > 1:
        raise NX.NetworkXError, 'G must be single component! Extract main component...'
    # Now generate clusters
    dist_matrix = get_dist_matrix(G)
    # Default hierarchical clustering algorithm used ('m' is Pycluster's
    # maximum/complete-linkage method)
    hclus = PC.treecluster(data=None, distancematrix=dist_matrix, method='m')
    partitions = {}
    # create a dictionary of the partitioning at each cut in the hierarchy
    for c in range(1, len(hclus) + 1):  # treecluster cuts start at 1
        partitions[c] = hclus.cut(c).tolist()
    return partitions

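# Self-contained sketch of the treecluster/cut pattern above, on a toy
# symmetric distance matrix instead of graph geodesics:
import numpy as np
import Pycluster

dist = np.array([[0.0, 1.0, 6.0, 7.0],
                 [1.0, 0.0, 5.0, 6.0],
                 [6.0, 5.0, 0.0, 1.5],
                 [7.0, 6.0, 1.5, 0.0]])
tree = Pycluster.treecluster(data=None, distancematrix=dist, method='m')
for c in range(1, 4):
    print(c, tree.cut(c).tolist())   # the partition into c clusters
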
def __init__(self, numComps=None, dim=5, data=None, epsilon=math.pow(10, -10),
             wishartScalar=1, wishartScale=None, dirichlet=None,
             normalMu=0, normalSigma=None):
    # defaults that depend on other arguments cannot be evaluated in the
    # signature (the original wrote np.identity(dim) and np.ones(numComps)
    # there, which fails at definition time), so they are built here
    if wishartScale is None:
        wishartScale = np.identity(dim)
    if dirichlet is None:
        dirichlet = np.ones(numComps)
    if normalSigma is None:
        normalSigma = np.identity(dim)

    # INITIALIZE ALL POSTERIOR PARAMETERS
    self.d = dim
    self.k = numComps
    self.n = len(data)

    # INITIALIZE ALL PRIOR PARAMETERS
    self.e = normalSigma
    self.m = normalMu
    self.w = wishartScale
    self.v = wishartScalar
    self.di = dirichlet
    self.epsilon = epsilon

    # INITIALIZE ALL PRIORS USING k-means CLUSTERING
    # INITIALIZE THE MUS
    labels, error, nfound = pc.kcluster(data, self.k)
    centroids, _ = pc.clustercentroids(data, clusterid=labels)
    self.mu = centroids

    self.pointsInComp = [[] for comp in xrange(self.k)]
    for n in xrange(self.n):
        self.pointsInComp[labels[n]].append(data[n])

    # INITIALIZE THE COVARIANCE MATRIX
    self.sigma = [np.cov(np.array(kpoints).T) for kpoints in self.pointsInComp]

    # INITIALIZE THE WEIGHTS (float division: plain `/` would truncate
    # every weight to 0 under Python 2 integer division)
    self.pi = [float(len(l)) / data.shape[0] for l in self.pointsInComp]

def tree_cluster_test(data, real_labels, outputfile=None):
    start = time.time()
    tree = Pycluster.treecluster(data, method='m')
    ks = range(25, 50, 1)
    if outputfile != None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'hierarchical clustering when k=%d' % k
        predicted = tree.cut(k).tolist()
        if outputfile != None:
            f.write(out_result(predicted, k, real_labels))
    if outputfile != None:
        f.close()
    elasped = time.time() - start
    print 'hierarchical clustering time: %.3f' % (elasped / float(len(ks)))

def clustering(x, y, cost, ngroup=2):
    if CLUSTER == "scipy":
        z = whiten(cost)
        # let scipy do its magic (k==3 groups)
        res, labels = kmeans2(array(list(zip(x, y, z))), ngroup)
    if CLUSTER == "Pycluster":
        points = np.zeros((x.shape[0], 2))
        points[:, 0] = x
        points[:, 1] = y
        # labels, error, nfound = Pycluster.kcluster(points, ngroup, weights=cost)
        labels, error, nfound = Pycluster.kcluster(points, ngroup)
    return labels

def kmeans_cluster_test(data, real_labels, outputfile=None):
    start = time.time()
    ks = range(8, 15)
    if outputfile != None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'running kmeans when k=%d' % k
        predicted = Pycluster.kcluster(data, k)[0].tolist()
        if outputfile != None:
            f.write(out_result(predicted, k, real_labels))
    # close the file after the loop; closing inside it would break the
    # second iteration
    if outputfile != None:
        f.close()
    elasped = time.time() - start
    print 'Average time: %.3f' % (elasped / float(len(ks)))

def cluster(D, k):
    import Pycluster as pcl
    labels, _, _ = pcl.kmedoids(D, nclusters=k, npass=10, initialid=None)
    # per-point error: distance to the assigned medoid
    errors = np.array([D[labels[i], i] for i in range(len(labels))])
    # remap medoid indices to consecutive labels 0..k-1
    centroidids = np.unique(labels)
    cmap = np.zeros(labels.max() + 1)
    for c in centroidids:
        cmap[c] = np.nonzero(centroidids == c)[0][0]
    labels = cmap[labels]
    logger.debug('k-medoids (k=%i): %.2f.' % (k, errors.sum()))
    return labels, {
        'method': 'kmedoids',
        'init': 'random',
        'k': k,
        'centroidids': centroidids,
        'errors': errors,
        'error': errors.sum(),
        'error-label': 'sum of distances',
    }

def resolution_clustering(clusters, cluster_ids, sampled, kx=2):
    X = np.array([np.append(np.append(c[0], c[1]), c[2]) for c in clusters])
    n = X.shape[1] / 3
    # normalize each third of the feature vector by its global average
    Xn = X / ([np.average(X[:, :n])] * n +
              [np.average(X[:, n:2 * n])] * n +
              [np.average(X[:, 2 * n:])] * n)
    C, e, nf = Pycluster.kcluster(Xn, len(clusters) / len(sampled) * kx)
    del Xn
    Cidx = defaultdict(list)
    for i, c in enumerate(C):
        Cidx[c].append(i)
    CStable = []
    for k, v in Cidx.items():
        members = set()
        for c in v:
            members.update(clusters[c][3])
        members = sorted(members)
        s = stability(members, cluster_ids)
        CStable.append((s,
                        np.average(X[v], axis=0).reshape((3, X.shape[1] / 3)),
                        members))
    return CStable

def _guide_tree(self, dist_matrix):
    """
    @summary: Build a guide tree from the distance matrix

    @param dist_matrix: The distance matrix
    @type dist_matrix: numpy.ndarray
    @return: Pycluster similarity tree
    @rtype: Pycluster.cluster.Tree

    @author: Woon Wai Keen
    @author: Vladimir Likic
    """
    n = len(dist_matrix)
    print " -> Clustering %d pairwise alignments." % (n * (n - 1)),
    tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a')
    print "Done"
    return tree

def matchs_ia():
    # if not current_user.admin:
    #     return jsonify({'message': 'Cannot perform that function!'})
    tests = TestR.query.all()
    dataSet = []
    # strip combining accents so the edit distance compares plain characters
    trans_tab = dict.fromkeys(map(ord, u"\u0301\u0308"), None)
    for test in tests:
        resident = Residents.query.filter_by(public_id=test.public_id).first()
        test_data = ("-" + str(resident.id) + "-" + test.gender +
                     str(test.age) + test.musicGender + test.sport +
                     test.hobbie + test.movieSeries + test.filmGender +
                     test.tabaco + test.alcohol + test.party +
                     str(test.ordenConvivencia) + str(test.ordenPersonal) +
                     test.personalidad)
        test_data = normalize(
            "NFKC", normalize("NFKD", test_data).translate(trans_tab))
        dataSet.append(test_data)
    # pairwise edit distances in lower-triangular order, as kmedoids expects
    distans = [distance.edit_distance(dataSet[i], dataSet[j])
               for i in range(1, len(dataSet))
               for j in range(0, i)]
    labels, error, nfound = PC.kmedoids(distans, nclusters=5, npass=10)
    cluster = dict()
    output = []
    for roommate, label in zip(dataSet, labels):
        cluster.setdefault(label, []).append(roommate)
    for label, grp in cluster.items():
        cluster_data = {}
        cluster_data["Roommate"] = grp
        cluster_data["Count"] = len(grp)
        cluster_data["label"] = str(label)
        output.append(cluster_data)
    return jsonify({"testsALL": output}, {"error": error}, {"nfound": nfound})

def cluster(self, num_cluster):
    category_tfidf = self.category_tfidf
    categories = list(category_tfidf)
    random.shuffle(categories)
    tfidf_norms = {category: sum(value ** 2 for value in tfidf.values())
                   for category, tfidf in category_tfidf.items()}
    for category, norm in tfidf_norms.items():
        if not norm:
            raise Exception((category, category_tfidf[category]))
    # build the lower-triangular distance matrix, one row per category
    distances = []
    for i, category1 in enumerate(categories):
        cat1_tfidf = category_tfidf[category1]
        row_array = array([0.0] * i)
        for j, category2 in enumerate(categories):
            if j >= i:
                break
            row_array[j] = self.compute_distance(cat1_tfidf,
                                                 category_tfidf[category2],
                                                 tfidf_norms[category1],
                                                 tfidf_norms[category2])
        distances.append(row_array)
    clusterids, error, nfound = Pycluster.kmedoids(distances, num_cluster)
    print error
    category_clusters = [[] for _ in range(num_cluster)]
    print len(clusterids)
    print len(categories)
    print clusterids
    # map each medoid id to a dense 0..k-1 cluster index
    clusterid_map = {}
    for i, category in enumerate(clusterids):
        category_id = clusterid_map.setdefault(category, len(clusterid_map))
        category_clusters[category_id].append(categories[i])
    return category_clusters

def __kmeans_initialization(self):
    """
    Given the data points, cluster them by applying the k-means
    clustering algorithm.
    """
    # apply k-means clustering to get the centroids and a label for each
    # vector in the data
    labels, error, nfound = Pycluster.kcluster(self._data, self._nClusters)

    # get the dimensions of the input data
    rows, cols = self._data.shape

    # assign vectors to clusters
    clusterData = [[] for i in xrange(self._nClusters)]
    for data, label in zip(self._data, labels):
        clusterData[label].append(data)

    models = [GaussianCluster(*muAndSigma(clusterData[i], cols))
              for i in xrange(self._nClusters)]
    apriori = np.ones(self._nClusters, dtype=np.float32) / \
        np.array([len(elem) for elem in clusterData])

    return models, apriori

def kmeans(k, table):
    # k = 50
    # positional args: data, nclusters, mask, weight, transpose, npass,
    # method ('a' = arithmetic mean), dist ('b' = city-block)
    (labels, error, nfound) = pc.kcluster(table, k, None, None, 0, 20, 'a', 'b')
    # plot.plot_scatter(table, labels, k)
    # centers = get_centers(table, labels)
    return labels

def som_cluster_test(data, real_labels, outputfile=None):
    if outputfile != None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    start = time.time()
    ks = range(6, 40)
    for k in ks:
        print 'som clustering when k=%d' % k
        # a k x 1 self-organizing map grid yields k clusters
        predicted = Pycluster.somcluster(data, nxgrid=k, nygrid=1, niter=5,
                                         dist='u')[0]
        predicted = [xy[0] for xy in predicted.tolist()]
        # renumber the grid coordinates into consecutive cluster ids
        cata = tuple(set(predicted))
        for i in range(0, len(predicted)):
            predicted[i] = cata.index(predicted[i])
        if outputfile != None:
            f.write(out_result(predicted, k, real_labels))
    if outputfile != None:
        f.close()
    elasped = time.time() - start
    print 'som clustering time: %.3f' % (elasped / float(len(ks)))

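# Self-contained sketch of the somcluster call above (random data):
# somcluster returns, for each row, the (x, y) coordinates of its winning
# cell on the nxgrid x nygrid map, so with nygrid=1 the x coordinate alone
# serves as a cluster id.
import numpy as np
import Pycluster

data = np.random.rand(40, 5)
clusterid, celldata = Pycluster.somcluster(data, nxgrid=4, nygrid=1, niter=5)
print(clusterid[:5])   # e.g. [[2 0] [0 0] [3 0] [2 0] [1 0]]
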