def suggest(self, word):
    v = self.analyze(word)
    # rank all indexed words by similarity to the query word
    res = []
    for nword, nv in self.ndx.items():
        wsim = self.compute_similarity([v, nv])
        res.append((wsim, nword, self.as_vector(nv)))
    res.sort()
    res = res[::-1]
    # cluster the candidates, then pick one word per cluster so the
    # suggestions are mutually distant
    res2 = [v for (sim, word, v) in res]
    resw = [word for (sim, word, v) in res]
    lab, err, nfound = Pycluster.kcluster(res2, 40)
    resg = defaultdict(list)
    for i, l in enumerate(lab):
        resg[l].append(res[i])
    res_sug = []
    used_groups = set()
    for l, w in zip(lab, resw):
        if l not in used_groups:
            res_sug.append(w)
            used_groups.add(l)
    return res_sug
def multikmeans(self, krange=None):
    # The magic recipe
    if krange is None:
        kr = np.arange(2, len(self.mat) - 1)
    else:
        kr = krange
    lmat = len(self.mat)
    # Count how many times each pair of documents is clustered together
    accords = np.zeros((lmat, lmat), dtype=int)
    t = deque()  # to save time and memory, use a deque instead of a list
    t0 = time()
    k2s = lambda x: x * 0.85
    tunits = k2s(np.array(kr)).sum()
    # The loop itself
    for k in kr:
        t1 = time()
        # K-means
        c, err, nfound = pc.kcluster(self.mat, k)
        # Update the agreement counts
        for i in np.unique(c):
            accords[c == i] += c == i
        # Predict the remaining time
        t2 = time()
        tunits -= k2s(k)
        t.append((t2 - t1) / k2s(k))
        prediction = tunits * np.mean(tuple(t)[-20:])
        print "k={0}: \t{1} ({2} since start) \t{3} to go".format(
            k, human_time(t2 - t1), human_time(t2 - t0), human_time(prediction))
    # Normalize by the number of runs so each entry is an agreement fraction
    return accords / float(len(kr))
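A minimal, self-contained sketch of the same pair-agreement counting that multikmeans performs, using only numpy (the labels array stands in for one kcluster run):

import numpy as np

labels = np.array([0, 0, 1, 1, 0])      # one clustering of five documents
agree = np.zeros((5, 5), dtype=int)
for i in np.unique(labels):
    agree[labels == i] += labels == i   # every pair sharing cluster i gets +1
print agree                             # 1 wherever two documents co-cluster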
def findcenters(x, n=1000, k=6):
    # get dimensions
    m = x.shape[1]
    # create centers as empty
    centers = DataFrame(np.zeros(shape=(k, m)))
    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters=k, transpose=0, method="a", dist="e", npass=1)
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # sort centers by their distance to the origin so that centers from
        # different runs line up before averaging
        center = sorted(center, key=lambda t: np.linalg.norm(np.array(t)), reverse=True)
        # accumulate the running sum
        for j in range(k):
            centers.iloc[j, :] = centers.iloc[j, :] + center[j]
    # take the average over all runs
    centers = centers / n
    return centers
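A hypothetical call, on random data, just to show the shape of the result (the data and parameters are made up for illustration; the imports used by findcenters above are assumed to be in scope):

import numpy as np

data = np.random.rand(100, 4)                  # 100 points in 4 dimensions
stable_centers = findcenters(data, n=50, k=3)  # average of 3 sorted centers over 50 runs
print stable_centers                           # a 3x4 DataFrame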
def grapeCluster(vectors, iterationCountPerBurst, maximumPixelDiameter, minimumPixelDiameter):
    # If we have no vectors, return an empty array
    if not vectors:
        return []
    # Assign all vectors to a single cluster
    globalClusters = [numpy.array(vectors)]
    globalCount = len(vectors)
    globalClusterMeans = []
    # While there are globalClusters,
    while globalClusters:
        # Pop the last cluster
        globalCluster = globalClusters.pop()
        # Measure size
        sizeCategory = measureClusterSize(globalCluster, maximumPixelDiameter, minimumPixelDiameter)
        # If it is too big,
        if sizeCategory > 0:
            # Burst it in two
            # assignments = scipy.cluster.vq.kmeans2(globalCluster, k=2, iter=iterationCountPerBurst)[1]
            assignments = Pycluster.kcluster(globalCluster, npass=iterationCountPerBurst)[0]
            # Extract localClusters
            booleanAssignments = numpy.array(assignments) > 0
            localClusters = globalCluster[booleanAssignments], globalCluster[~booleanAssignments]
            # Push localClusters to the end of the stack
            globalClusters.extend(localClusters)
        # If it is the right size, append the weighted mean
        elif sizeCategory == 0:
            globalClusterMeans.append(computeWeightedMean(globalCluster))
        # Show feedback
        view.printPercentUpdate(globalCount - len(globalClusters), globalCount)
    # Return
    view.printPercentFinal(globalCount)
    return globalClusterMeans
def cluster(parser, k):
    """ general method for clustering data """
    # get index number for every page
    code_book = parser.get_data_encoding(page_min_occurance=5)
    # use only sequences of pages visited
    simple_session = [session for session in parser.get_simple_sessions()
                      if config.session_filter_fn(session)]
    # use vector representation (v1, v2, v3) where v1 means page v1 was visited
    # models = session_modeling.convert_sessions_to_vector(simple_session, code_book, binary=True)
    # construct Markov chains, estimate transition probabilities
    models = session_modeling.convert_sessions_to_markov(simple_session, code_book, bayes=False)
    idx, sse, _ = Pycluster.kcluster(models, k, method='a', dist='e')
    # idx, sse, _ = cluster_kmedoids(models, k, string_similarity.jaccard_distance)
    clusters = {}
    for name, clusterid in zip(simple_session, idx):
        clusters.setdefault(clusterid, []).append(name)
    return clusters, sse
def clusterAndPlot(df, k, height=10, engine='PyCluster', cmap='spectral'):
    '''calculate and plot k-means clustering'''
    fig, axes = plt.subplots(k + 1, figsize=(18, height), sharex='all', sharey='all')
    if engine == 'scipy':
        centroids, labels = kmeans2(df, k, iter=100, thresh=1e-05)
    else:
        labels, error, nfound = Pycluster.kcluster(df, k)
    df['label'] = labels
    colors = nColors(k=k, cmap=cmap)
    # plot each cluster in its own axis, and each cluster mean in the last axis
    for l, g in df.groupby('label'):
        g.T.plot(ax=axes[l], legend=0, c=colors[l], alpha=.2)
        axes[l].set_title('cluster %d, %d zipcodes' % (l, len(g)))
        pd.Series(g.mean(0)).plot(ax=axes[-1], label='cluster %d' % (l), c=colors[l])
    # plt.legend()
    return df
def testPricesDiffsVecsKmeansClustering(self):
    """Testing whether k-means clustering with price-difference vectors works."""
    prices_diffs_vecs = utils.make_prices_diffs_vecs(self.data1)
    labels, wcss, n = Pycluster.kcluster(prices_diffs_vecs, 3, npass=100)
    clusters = utils.make_groups_from_labels(labels, self.data1)
    # The result should look like this, modulo group numbers. The probability
    # that it comes out differently with npass=100 is very low, but a
    # different grouping can happen.
    suggested_clusters = {0: ['E'], 1: ['A', 'D'], 2: ['B', 'C']}
    # Let's check this.
    num_matches = 0
    for cluster in clusters.values():
        cluster.sort()
        for suggested_cluster in suggested_clusters.values():
            suggested_cluster.sort()
            if cluster == suggested_cluster:
                num_matches = num_matches + 1
    # Each suggested cluster exists in the output of kcluster, and because
    # the clusters dict has length 3 the two groupings must be equal.
    self.assertEqual(num_matches, 3)
    self.assertEqual(len(clusters), 3)
def pyclustertest():
    data = sp.rand(100, 4)
    cid, e, n = pcl.kcluster(data)
    centroids, cmask = pcl.clustercentroids(data, clusterid=cid)
    print data
    print centroids
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0, method='a', dist='e', npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    return x
def findk(x, n=1000, minK=2, maxK=20):
    errors = []
    # fit k-means clusters for each candidate k
    for i in range(minK, maxK + 1):
        _, error, nfound = Pycluster.kcluster(x, nclusters=i, transpose=0, method="a", dist="e", npass=n)
        # collect the within-cluster error
        errors.append(error)
        print i
        print errors
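One common way to use the error list that findk prints is the elbow heuristic: pick the k after which the within-cluster error stops dropping sharply. A sketch with made-up error values for k = 2..7:

errors = [120.0, 61.0, 40.5, 36.0, 34.8, 34.1]  # hypothetical errors for k = 2..7
drops = [errors[i] - errors[i + 1] for i in range(len(errors) - 1)]
k_elbow = 2 + drops.index(max(drops)) + 1       # the largest drop ends at k = 3 here
print k_elbow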
def clusters(labels, data, k):
    kclus = Pycluster.kcluster(data, k, npass=1)[0]
    # co-membership matrix: 1 where two points land in the same cluster
    nx = numpy.zeros((len(labels), len(labels)), dtype=numpy.float32)
    for ind1 in range(len(labels)):
        for ind2 in range(len(labels)):
            if kclus[ind1] == kclus[ind2]:
                nx[ind1][ind2] = 1
    print k, " of ", len(labels)
    return nx
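The double loop above can be collapsed into a single numpy broadcast; an equivalent sketch, not the original code:

import numpy
import Pycluster

def clusters_fast(labels, data, k):
    kclus = numpy.asarray(Pycluster.kcluster(data, k, npass=1)[0])
    # outer comparison yields 1.0 exactly where two points share a cluster
    return (kclus[:, None] == kclus[None, :]).astype(numpy.float32)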
def myCKDemo(filename, n):
    # Load the data used for clustering (columns 2, 4, 14 and 8, counting from 0)
    data = np.loadtxt(filename, delimiter=",", usecols=(2, 4, 14, 8))
    # These columns hold the cities' longitude/latitude, used for the final scatter plot
    xy = np.loadtxt(filename, delimiter=",", usecols=(2, 4))
    # clustermap records the cluster id assigned to each data row
    clustermap = pc.kcluster(data, n)[0]
    # centroids holds the coordinates of each cluster's center
    centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
    # m is the distance matrix
    m = pc.distancematrix(data)
    # mass records the number of points in each cluster
    mass = np.zeros(n)
    for c in clustermap:
        mass[c] += 1
    # sil is the silhouette matrix: one row per point, one column per cluster
    sil = np.zeros(n * len(data))
    sil.shape = (len(data), n)
    for i in range(0, len(data)):
        for j in range(i + 1, len(data)):
            d = m[j][i]
            sil[i, clustermap[j]] += d
            sil[j, clustermap[i]] += d
    for i in range(0, len(data)):
        sil[i, :] /= mass
    # The silhouette coefficient evaluates clustering quality. It lies in [-1, 1]:
    # the larger the value, the better the clustering. A value below 0 means the
    # average distance within the point's own cluster exceeds the distance to the
    # nearest other cluster, i.e. a poor clustering; values near 1 indicate a good one.
    s = 0
    for i in range(0, len(data)):
        c = clustermap[i]
        a = sil[i, c]
        b = min(sil[i, range(0, c) + range(c + 1, n)])
        si = (b - a) / max(b, a)
        s += si
    print n, s / len(data)
    # Draw the scatter plot with matplotlib.
    fig, ax = pl.subplots()
    # cmap assigns a distinct color to each cluster
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    # xy holds longitude/latitude, so cities are plotted at their geographic positions
    x = [list(d)[0] for d in xy]
    y = [list(d)[1] for d in xy]
    cax = ax.scatter(x, y, c=clustermap, s=30, cmap=cmap, vmin=0, vmax=n)
    pl.show()
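For reference, the per-point quantity accumulated in the loop above is the standard silhouette coefficient

    s_i = (b_i - a_i) / max(a_i, b_i)

where a_i is the mean distance from point i to the other members of its own cluster and b_i is the smallest mean distance from i to the members of any other cluster; averaging s_i over all points gives the score that myCKDemo prints.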
def cluster_spw_rpw(list_of_recs):
    number_of_clusters = 8
    only_serve_return = []
    if list_of_recs == []:
        print "ERROR"
    for rec in list_of_recs:
        only_serve_return.append([float(rec[0]), float(rec[1])])
    k = get_k_value(only_serve_return)
    labels, error, nfound = Pycluster.kcluster(scipy.array(only_serve_return), k)
    return labels
def _G(self, data, K):
    labels, _, _ = Pycluster.kcluster(data.T, K)
    centers, _ = Pycluster.clustercentroids(data.T, clusterid=labels)
    centers = centers.T
    G = zeros((K, data.shape[1]))
    for k in range(K):
        D = data - expand_dims(centers[:, k], axis=1)
        G[k, :] = -sqrt(sum(multiply(D, D), axis=0))
    return G
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0, method="a", dist="e", npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    # count how many items are in each group
    labels = list(labels)
    for i in range(k):
        print labels.count(i)
    return x
def cluster():
    x = [[76.0, 32.0], [63.0, 40.0], [70.0, 30.0], [64.0, 45.0]]
    k = 2
    labels, error, nfound = Pycluster.kcluster(scipy.array(x), k)
    print "Input data:"
    print " spw " + " rpw"
    j = 1
    for i in x:
        print str(j) + ") " + str(i[0]) + " " + str(i[1])
        j += 1
    print " "
    print "clusters: " + str(labels)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "h", ["help"])
    except getopt.GetoptError, msg:
        raise Usage(msg)
    try:
        nodesFile = argv[1]
    except IndexError:
        raise Error("Not enough arguments provided to script.")
    nodes = readNodesFromTxtForKmeans(nodesFile)
    nodes = numpy.array(nodes)
    # k-means with 30 clusters, 50 passes, median-based centers
    results = Pycluster.kcluster(array(nodes), nclusters=30, npass=50, method='m')
    assignments = results[0]
    xs = [node[0] for node in array(nodes)]
    ys = [node[1] for node in array(nodes)]
    clusterByClusterID = collections.defaultdict(list)
    for x, y, clusterID in itertools.izip(xs, ys, assignments):
        clusterByClusterID[clusterID].append((x, y))
    pylab.figure()
    pylab.hold(True)
    colors = ['r', 'b', 'g', 'c', 'm', 'y', 'k', 'w', '#ff6c01', '#00cd00']
    for clusterID, color in itertools.izip(clusterByClusterID.keys(), itertools.cycle(colors)):
        plotCluster(clusterByClusterID[clusterID], color)
    print results[1], results[2]
def kmeans(data, **kwargs):
    """
    Perform k-means clustering on unstructured N-dimensional data.

    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (returned number of
          clusters may be less than k).
        - npasses: The number of times the k-means clustering algorithm is
          performed, each time with a different (random) initial condition.
        - method: describes how the center of a cluster is found:
            - method=='a': arithmetic mean.
            - method=='m': median.
        - initialCenters: a set of points that should be used as the initial
          cluster centers

    @rtype: tuple
    @return: A list where each element indicates the cluster membership of the
        corresponding index in the original data and a message string
    """
    k = 1
    npasses = 1
    method = 'a'
    initialCenters = None
    smartCenters = False
    msg = ''

    if 'numClusters' in kwargs:
        k = int(kwargs['numClusters'])
    if 'npasses' in kwargs:
        npasses = int(kwargs['npasses'])
    if 'method' in kwargs:
        method = kwargs['method']
    if 'initialCenters' in kwargs:
        initialCenters = kwargs['initialCenters']
    if 'smartCenters' in kwargs:
        smartCenters = kwargs['smartCenters']

    logData = tm.getMethod('log')(data)
    if initialCenters is None:
        # no seeds given: let Pycluster pick random initial conditions
        (clusterIDs, err, nOpt) = pc.kcluster(logData, k, npass=npasses, method=method)
        msg = "Number of rounds optimal solution was found: %i" % nOpt
    else:
        # seeds given: hand them to scipy's kmeans2 as the initial matrix
        logCenters = tm.getMethod('log')(np.array(initialCenters[:k]))
        (centroids, clusterIDs) = kmeans2(logData, logCenters, minit='matrix')
        if len(np.unique(clusterIDs)) < k:
            wx.MessageBox('Warning: One or more of the returned clusters are empty. Please choose different initial cluster centers and re-run k-means for better results.',
                          'Insufficiently varied cluster centers', wx.OK | wx.ICON_WARNING)

    return clusterIDs, msg
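A hypothetical call (data is a placeholder array; tm.getMethod('log') is whatever log-transform the surrounding module provides):

ids, message = kmeans(data, numClusters=4, npasses=10, method='a')
print message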
def clustering(file_path, k, dist_measure, PLOT):
    """
    Do the K-means clustering for input data.

    @param file_path: Input data file.
    @param k: Number of centers in K-means algorithm.
    @param dist_measure: Distance measure (in this case, we use Manhattan distance).
    @param PLOT: Bool variable, check if plot the result (set it as True only in testing).
    @return: Clusters id for all data points in the input data file.
    """
    data = numpy.genfromtxt(file_path, delimiter=',')
    if len(data.shape) == 1:
        return [-1]
    print "-- Processing file: " + file_path + " -- Data points: " + str(len(data))
    print "-- Start clustering"
    k = set_k(len(data), k)
    ite_num = method_name(len(data))
    # Do the K-means clustering
    cluster_id, _, _ = Pycluster.kcluster(data, nclusters=k, mask=None, weight=None,
                                          transpose=0, npass=ite_num, method='a',
                                          dist=dist_measure, initialid=None)
    if PLOT is False:
        return cluster_id
    # Draw the clustering result plot.
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)
    if PLOT:
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)
    else:
        data_2d = data
        centroids_2d = centroids
    color = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
             '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC']
    for i in range(k):
        scatter(data_2d[cluster_id == i, 0], data_2d[cluster_id == i, 1], color=color[i % 12])
    plot(centroids_2d[:, 0], centroids_2d[:, 1], 'sg', markersize=8)
    show()
    return cluster_id
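A hypothetical call ('points.csv' is a made-up file of comma-separated rows; 'b' is Pycluster's code for the city-block/Manhattan distance):

cluster_id = clustering('points.csv', k=8, dist_measure='b', PLOT=False)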
def main():
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    logging.debug('Reading %s', args.input.name)
    src_nodes = np.loadtxt(args.input, dtype=int)
    lattice_width, lattice_height = src_nodes.shape
    lattice = nx.grid_2d_graph(lattice_width, lattice_height)
    # find and remove wall
    wall_nodes = map(lambda e: tuple(e), np.transpose(np.nonzero(src_nodes)))
    lattice.remove_nodes_from(wall_nodes)
    assert len(lattice.nodes()) == (lattice_width * lattice_height - len(wall_nodes))
    nodelist = list(lattice.nodes())
    node_ids = {n: i for i, n in enumerate(nodelist)}
    assert len(nodelist) == len(node_ids)
    # compute normalized laplacian
    norm_lapl = normalized_laplacian(lattice, nodelist, node_ids)
    # compute eigenvalues and eigenvectors
    eigen_val, eigen_vec = np.linalg.eig(norm_lapl)
    # kmeans
    labels, _, _ = Pycluster.kcluster(eigen_vec[:, :args.kappa + 1], args.kappa,
                                      dist='e', npass=100, initialid=None)
    # assign colors
    colors = [COLORS[i] for i in labels]
    assert len(colors) == len(labels)
    # compute grid lattice_height x lattice_width containing colors
    grid = []
    colored, non_colored = 0, 0
    for i in xrange(lattice_height):
        grid.append([])
        for j in xrange(lattice_width):
            node_id = node_ids.get((i, j))
            color = colors[node_id] if node_id is not None else BLACK
            grid[i].append(color)
            if color == BLACK:
                non_colored += 1
            else:
                colored += 1
    assert non_colored == len(wall_nodes)
    display(grid)
def cluster(fname, nclust):
    fh = open(fname, 'r')
    lines = fh.readlines()
    fh.close()
    clusters = int(nclust)
    points = []
    points_r = []
    dates = []
    volumes = []
    close_prices = []
    for i in range(len(lines)):
        if i <= 1:
            continue
        line_c = lines[i - 1].strip().split(',')
        close_price = float(line_c[0])
        volume = float(line_c[1])
        points_r.append((close_price, volume))
        volumes.append(volume)
        close_prices.append(close_price)
        # dates.append(line_c[0])
    volume_z = np.array(volumes)
    # volume_z = stats.zscore(a)
    close_price_z = np.array(close_prices)
    # close_price_z = stats.zscore(a)
    points = zip(close_price_z, volume_z)
    # build an initial assignment that splits the points into equal-sized groups
    init_data = []
    k = len(points) / clusters
    for i in range(clusters - 1):
        for j in range(k):
            init_data.append(i)
    while len(points) != len(init_data):
        init_data.append(clusters - 1)
    labels, error, nfound = Pycluster.kcluster(points, clusters, None, None, 0, 1,
                                               'a', 'e', init_data)
    labels_sorted = sort_labels(labels)
    print labels_sorted
    return labels_sorted
def silhouette(data, k=5, shuffle=True, shufflecount=100):
    # assume that data is a matrix with variables in rows and dimensions in columns
    coefficients = {}
    data = data.transpose()
    for nclus in range(2, k):
        clustermap = pc.kcluster(data, nclusters=nclus, npass=50)[0]
        centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
        m = pc.distancematrix(data)
        res = [silhouette_coefficient(m, clustermap, nclus, data.shape)]
        for _ in range(shufflecount):
            # shuffle a copy, not the original data
            dat = data.copy()
            map(np.random.shuffle, dat)
            clustermap = pc.kcluster(dat, nclusters=nclus, npass=50)[0]
            centroids = pc.clustercentroids(dat, clusterid=clustermap)[0]
            # distance matrix -- well, it's a list actually
            m = pc.distancematrix(dat)
            res.append(silhouette_coefficient(m, clustermap, nclus, dat.shape))
        coefficients[nclus] = {'data': res[0], 'distribution': res[1:]}
    return coefficients
def clustering(x, y, cost, ngroup=2):
    if CLUSTER == "scipy":
        z = whiten(cost)
        # let scipy do its magic (k==3 groups)
        res, labels = kmeans2(array(list(zip(x, y, z))), ngroup)
    if CLUSTER == "Pycluster":
        points = np.zeros((x.shape[0], 2))
        points[:, 0] = x
        points[:, 1] = y
        # labels, error, nfound = Pycluster.kcluster(points, ngroup, weights=cost)
        labels, error, nfound = Pycluster.kcluster(points, ngroup)
    return labels
def kmeans_cluster_test(data, real_labels, outputfile=None):
    start = time.time()
    ks = range(8, 15)
    if outputfile is not None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'running kmeans when k=%d' % k
        predicted = Pycluster.kcluster(data, k)[0].tolist()
        if outputfile is not None:
            f.write(out_result(predicted, k, real_labels))
    if outputfile is not None:
        f.close()
    elapsed = time.time() - start
    print 'Average time: %.3f' % (elapsed / float(len(ks)))
def main():
    usage = 'usage: %prog [options] infname1 [infname2 ...]'
    parser = OptionParser(usage=usage)
    parser.add_option('-o', '--output', dest='output_fname', default='cluster.out',
                      help='output fname')
    parser.add_option('-c', '--no-cache', dest='cache', action='store_false',
                      default=True, help='discard cache')
    parser.add_option('-k', '--resulting-dimensionality', dest='k', default=150,
                      help='the number of resulting dimensions')
    options, args = parser.parse_args(argv[1:])
    if len(args) < 1:
        parser.error('not enough args')

    parser = parserclass('models_cache')
    infnames = args
    outfname = options.output_fname

    print 'parsing scores'
    scores = [parser.parse(infname, cache=options.cache) for infname in infnames]

    nclusterss = [2]  # range(2,5)
    npasss = [5, 10, 15, 20, 25]
    methods = ['a', 'm']
    dists = ['e', 'b', 'c', 'a', 'u', 'x', 's', 'k']
    configs = list(combine(nclusterss, npasss, methods, dists))
    results = {}
    for k in range(2, 10, 2):
        print 'k=', k
        concept_vectors = apply_lsa(scores, k)
        step = len(configs) / 10
        for i, (nclusters, npass, method, dist) in enumerate(configs):
            if (i + 1) % step == 0:
                print '\t', ((i + 1) * 100) / len(configs)
            r = Pycluster.kcluster(concept_vectors, nclusters=nclusters,
                                   method=method, dist=dist)
            results[(k, nclusters, npass, method, dist)] = r

    f = open('clusters_results.pickle', 'w')
    pickle.dump(results, f, 2)
    f.close()
def get_clusterid(vanadium_dbName, Cobalt_dbName):
    '''Given the database of sensor observations, calculate a cluster id for each sensor.'''
    con = lite.connect(vanadium_dbName)
    con.row_factory = lite.Row
    with con:
        cur = con.cursor()
        cur.execute("SELECT * FROM sensor")
        columnNames = cur.fetchone()
        columnNames = columnNames.keys()
    con = lite.connect(vanadium_dbName)
    with con:
        cur = con.cursor()
        cur.execute("SELECT * FROM sensor")
        rows = cur.fetchall()
        ROW = [row[3:] for row in rows]
        Data = numpy.matrix(ROW).astype(float)
        Data = numpy.transpose(Data)
        numRows, numCols = Data.shape
        mask = numpy.ones((numRows, numCols)).astype(numpy.uint8)
        counter = 0
        for i in range(numRows):
            for j in range(numCols):
                if Data[i, j] < 0:
                    counter += 1  # counting the missing observations
                    mask[i, j] = 0
        clusterid, error, nfound = cluster.kcluster(data=Data, nclusters=7, mask=mask,
                                                    weight=None, transpose=0, npass=1,
                                                    method='a', dist='c', initialid=None)
        # Create the clusterid-to-SPND dictionary. Each cluster id needs its
        # own list; assigning one shared list to every key would make all
        # clusters appear to contain every column.
        clusterIdtoSPND = {}
        for cid in clusterid:
            clusterIdtoSPND[cid] = []
        for cid, col in zip(clusterid, columnNames[3:]):
            clusterIdtoSPND[cid].append(col)
def cluster_colors(points):
    num_clusters = min(len(points), 3)
    labels, error, nfound = Pycluster.kcluster(points, num_clusters)
    totals = []
    for i in range(num_clusters):
        totals.append([[0, 0, 0], 0])
    for i in range(len(labels)):
        tmp = totals[labels[i]]
        tmp[0][0] += points[i][0]
        tmp[0][1] += points[i][1]
        tmp[0][2] += points[i][2]
        tmp[1] += 1
    averages = [[1.0 * a[0] / n, 1.0 * a[1] / n, 1.0 * a[2] / n] for a, n in totals]
    return averages
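The same averaging can be written more compactly with numpy fancy indexing; an equivalent sketch, not the original code:

import numpy as np
import Pycluster

def cluster_colors_np(points):
    pts = np.asarray(points, dtype=float)
    k = min(len(pts), 3)
    labels = np.asarray(Pycluster.kcluster(pts, k)[0])
    # mean color of each cluster
    return [pts[labels == i].mean(axis=0).tolist() for i in range(k)]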
def get_clusters(job_id, t, debug=False):
    dbc = MySQLdb.connect(host='localhost', user='******', passwd='1qaz2wsx', db='affivir')
    cur = dbc.cursor()
    assets = get_assets(cur, job_id, t)
    # number of clusters
    nc = len(assets) / 2
    if nc == 0:
        return None
    if debug is True:
        print t, nc, 'clusters'
    (b, a) = get_asset_range(assets)
    sz = b - a + 1
    # build the symmetric similarity matrix
    W = np.zeros((sz, sz))
    for i in range(a, b + 1):
        for j in range(i + 1, b + 1):
            if i != j:
                sim = get_sim(dbc.cursor(), i, j)
                W[i - a, j - a] = sim
                W[j - a, i - a] = sim
    # graph Laplacian and its smallest-real eigenvectors
    D = np.diag(np.sum(W, 0))
    L = D - W
    lam, u = eigs(L, k=nc, which='SR')
    u = real(u)
    labels, error, nfound = Pycluster.kcluster(u, nc)
    for k in range(nc):
        out = ''
        for i in range(len(labels)):
            if labels[i] == k:
                out += str(a + i) + ' '
        if debug is False:
            q = 'INSERT INTO clusters(job_id, type, data) VALUES ("' + str(job_id) + '", "' + t + '", "' + out + '")'
            cur.execute(q)
        else:
            print out
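For reference, L = D - W built above is the unnormalized graph Laplacian of the similarity graph W (D is the diagonal degree matrix); the eigenvectors belonging to its smallest real eigenvalues (which='SR' in eigs) form the spectral embedding that kcluster then partitions into nc groups.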
def k_means(flat_data, data, nclusters, method, distance):
    """ K-Means Clustering """
    clusterid, error, nfound = pc.kcluster(flat_data.values(),
                                           nclusters=nclusters,
                                           mask=None,
                                           weight=None,
                                           transpose=0,
                                           npass=100,
                                           method=method,
                                           dist=distance,
                                           initialid=None)
    # load clusters into a dictionary
    clusters = defaultdict(list)
    for i, j in zip(clusterid, data):
        clusters[i].append(j)
    return clusters
def resolution_clustering(clusters, cluster_ids, sampled, kx=2):
    X = np.array([np.append(np.append(c[0], c[1]), c[2]) for c in clusters])
    n = X.shape[1] / 3
    # normalize each third of the feature vector by its overall average
    Xn = X / ([np.average(X[:, :n])] * n +
              [np.average(X[:, n:2 * n])] * n +
              [np.average(X[:, 2 * n:])] * n)
    C, e, nf = Pycluster.kcluster(Xn, len(clusters) / len(sampled) * kx)
    del Xn
    Cidx = defaultdict(list)
    for i, c in enumerate(C):
        Cidx[c].append(i)
    CStable = []
    for k, v in Cidx.items():
        members = set()
        for c in v:
            members.update(clusters[c][3])
        members = sorted(members)
        s = stability(members, cluster_ids)
        CStable.append((s, np.average(X[v], axis=0).reshape((3, X.shape[1] / 3)), members))
    return CStable
def kmeans(k, table):
    # positional args: mask=None, weight=None, transpose=0, npass=20,
    # method='a' (mean), dist='b' (city-block/Manhattan distance)
    (labels, error, nfound) = pc.kcluster(table, k, None, None, 0, 20, 'a', 'b')
    # plot.plot_scatter(table, labels, k)
    # centers = get_centers(table, labels)
    return labels
def k_means():
    dataList = list(database.find_all())
    vectors = []
    uuids = []
    for data in dataList:
        counter = Counter()
        uuids.append(data['uuid'])
        for event in data['events']:
            counter[event['name']] += 1
        vector = []
        for typ in counter:
            vector.append(counter[typ])
        vectors.append(vector)
    labels, error, nfound = Pycluster.kcluster(vectors, 3)
    classes = []
    for label in labels:
        classes.append(numpy.asscalar(label))
    result = dict(zip(uuids, classes))
    return result
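Note that iterating over each record's own Counter yields vectors of differing length and order, so the rows passed to kcluster are not directly comparable; a sketch that builds fixed-order vectors over the union of event names (dataList as above):

names = sorted({e['name'] for data in dataList for e in data['events']})
vectors = [[sum(1 for e in data['events'] if e['name'] == nm) for nm in names]
           for data in dataList]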
def __kmeans_initialization(self):
    """ given the data points, cluster them by applying kmeans clustering
        algorithm.
    """
    # apply kmeans clustering to get the centroids and labels for each vector in data
    labels, error, nfound = Pycluster.kcluster(self._data, self._nClusters)
    # get the dimension of the input data
    rows, cols = self._data.shape
    clusterData = [[] for i in xrange(self._nClusters)]
    # assign vectors to clusters
    for data, label in zip(self._data, labels):
        clusterData[label].append(data)
    models = [GaussianCluster(*muAndSigma(clusterData[i], cols)) for i in xrange(self._nClusters)]
    apriori = np.ones(self._nClusters, dtype=np.float32) / np.array([len(elem) for elem in clusterData])
    return models, apriori
def cl(Data_v_co):
    starttime = time.time()
    clusterid, error, nfound = cluster.kcluster(data=Data_v_co, nclusters=10,
                                                mask=mask_v_co, weight=None,
                                                transpose=0, npass=98, method='a',
                                                dist='c', initialid=None)
    count = Counter(clusterid)
    # map each cluster id to the SPND columns assigned to it; every id gets
    # its own list so that appends do not leak across clusters
    clusterIdtoSPND = {}
    for cid in clusterid:
        clusterIdtoSPND[cid] = []
    for cid, col in zip(clusterid, columnNames_v_co[3:]):
        clusterIdtoSPND[cid].append(col)
    print clusterIdtoSPND
    return clusterIdtoSPND
def pt(x, y):
    """Take the mean of a sample neighborhood, discarding any invalid
    (depth == 2047) pixels
    """
    t = sample_side
    global d, ds, samples, mask
    d = depth[y - t:y + t, x - t:x + t]
    # This is where I choose which point in the sample to use. I take
    # the minimum, which is the nearest pixel. Other possibilities
    # are median, mean, etc.
    if method == 'median':
        meand = np.median(d[d < 2047])
    if method == 'mean':
        meand = np.mean(d[d < 2047])
    if method == 'min':
        meand = d[d < 2047].min()
    if method == 'kmeans':
        import Pycluster
        labels, error, nfound = Pycluster.kcluster(d.reshape(-1, 1), 4)
        labels = labels.reshape(d.shape)
        means = np.array([d[labels == i].mean() for i in range(labels.max() + 1)])
        nearest = np.argmin(means)
        mask = labels == nearest
        samples = d[mask]

        def radius(target):
            x, y = np.nonzero(d == target)
            return np.sqrt((x[0] - sample_side / 2) ** 2 + (y[0] - sample_side / 2) ** 2)

        cands = (samples.min(), samples.max())
        rads = [radius(i) for i in cands]
        meand = means.min()
        # meand = cands[np.argmax(rads)]
        # meand = np.median(samples)
        # meand = samples.min() if np.median(samples) > np.mean(samples) else samples.max()
    return x, y, meand, 1
# Distance metrics
dDict = dict([("corr", "c"), ("abscorr", "a"), ("uncentcorr", "u"),
              ("absunccorr", "x"), ("spearman", "s"), ("kendall", "k"),
              ("euc", "e"), ("cityblock", "b")])
# Unsupervised validation metrics list
silhouetteList = []
# K-means
if algorithm == "k":
    # Method
    mDict = dict([("mean", "a"), ("median", "m")])
    # All
    clusterListAll, error, nfound = pc.kcluster(np.array(rawData), nclusters=noClust,
                                                transpose=0, method=mDict[method],
                                                dist=dDict[distance])
    silScore = metrics.silhouette_score(rawData, clusterListAll, metric='euclidean')
    silhouetteList.append(silScore)
    # Single
    clusterListSingle = []
    for i in range(0, len(labelList)):
        clusterListTemp, error, nfound = pc.kcluster(
            np.array(rawData)[:, (rawVph * i):(rawVph * i) + rawVph],
            nclusters=noClust, transpose=0, method=mDict[method],
            dist=dDict[distance])
import Pycluster as pc
import numpy as np
import sys

# Read data filename and desired number of clusters from command line
filename, n = sys.argv[1], int(sys.argv[2])
data = np.loadtxt(filename)

# Perform clustering and find centroids
clustermap, _, _ = pc.kcluster(data, nclusters=n, npass=50)
centroids, _ = pc.clustercentroids(data, clusterid=clustermap)

# Obtain distance matrix
m = pc.distancematrix(data)

# Find the masses of all clusters
mass = np.zeros(n)
for c in clustermap:
    mass[c] += 1

# Create a matrix for individual silhouette coefficients
sil = np.zeros(n * len(data))
sil.shape = (len(data), n)

# Evaluate the distance for all pairs of points
for i in range(0, len(data)):
    for j in range(i + 1, len(data)):
        d = m[j][i]
        sil[i, clustermap[j]] += d
        sil[j, clustermap[i]] += d
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print "ERROR: Data file '{0}' does not exist".format(x)
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x)
            pysam.index(x)

    # Options parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) / args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction[0].lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print "ERROR: -m and -g option CANNOT be used together"
            sys.exit(1)
        if distancefunction == 'e':
            print 'Dynamics can only be identified using Pearson correlation as metric.'
            print 'Assigning metric to Pearson correlation'
            distancefunction = 'p'

    # Warning about too many files
    if len(tracks) > 4:
        print "Warning: Running fluff with too many files might make your system use an enormous amount of memory!"

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write("You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = range(len(datafiles))

    if not cluster_type in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if not distancefunction in ["e", "p"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        if distancefunction == "e":
            METRIC = cfg.DEFAULT_METRIC
            print "Euclidean distance method"
        else:
            METRIC = "c"
            print "Pearson distance method"

    # Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down,
                  rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print "Loading data"
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(pool.apply_async(load_heatmap_data,
                                             args=(featurefile, datafile, amount_bins,
                                                   extend_dyn_up, extend_dyn_down,
                                                   rmdup, rpkm, rmrepeats,
                                                   fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : option to try and get dynamics
    # Extend features 1kb up/down stream and cluster them in one bin
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)
    clus = hstack([norm_data[t] for i, t in enumerate(tracks)
                   if (not pick or i in pick)])

    # Clustering
    if cluster_type == "k":
        print "K-means clustering"
        # K-means clustering (PyCluster)
        labels, _, nfound = Pycluster.kcluster(clus, args.numclusters, dist=METRIC)
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in data.keys():
                    data[track][labels == j] = [row[::-1] for row in data[track][labels == j]]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)
        ind = labels.argsort()
    # Hierarchical clustering
    elif cluster_type == "h":
        print "Hierarchical clustering"
        tree = Pycluster.treecluster(clus, method="m", dist=METRIC)
        labels = tree.cut(args.numclusters)
        ind = sort_tree(tree, arange(len(regions)))
    else:
        ind = arange(len(regions))
        labels = zeros(len(regions))

    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up, extend_down,
                                         rmdup, rpkm, rmrepeats, fragmentsize,
                                         dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(array(regions, dtype="object")[ind],
                                                          array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chrom, start, end, gene, cluster + 1, strand))
    f.close()

    # Save read counts
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for idx, row in enumerate(data[track]):
            bins = ''
            for b in row:
                if not bins:
                    bins = '{0}'.format(b)
                else:
                    bins = '{0};{1}'.format(bins, b)
            readcounts[track]['bins'].append(bins)

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    for i, track in enumerate(tracks):
        for idx in ind:
            input_fileBins.write('{0}:{1}-{2}\t'.format(regions[idx][0], regions[idx][1], regions[idx][2]))
            for i, track in enumerate(tracks):
                input_fileBins.write('{0}\t'.format(readcounts[track]['bins'][idx]))
            input_fileBins.write('\n')
        break
    input_fileBins.close()

    if not cluster_type == "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize)
import numpy as np
import Pycluster

def cargaDatos(nombreArchivo):
    # load the feature matrix to be clustered
    return np.load(nombreArchivo)

M = cargaDatos("DataSet.npy")

# clustering process: run it several times and keep each labeling
c = []
for i in range(5):
    labels, error, nfound = Pycluster.kcluster(M, 5)
    c.append(labels)

z = np.array(c)
np.save("resultados.npy", z)
print labels
errorOld = 5000
index = 175
'''for i in range(max(nodes)+1):
    clusterid, error, nfound = pc.kcluster(adjMat, nclusters=i+1, transpose=0,
                                           npass=1, method='a', dist='e')
    if (i == 0):
        errorOld = error
    elif (error == 0):
        index = i
        break
    elif (error < errorOld):
        index = i
        errorOld = error'''
clusterid, error, nfound = pc.kcluster(adjMat, nclusters=index + 1, transpose=0,
                                       npass=1, method='a', dist='e')
print clusterid
print nfound
print error

neglect = set([])
for i in range(max(nodes) + 1):
    if sum(adjMat[i]) == 0.0:
        neglect.add(clusterid[i])
print neglect
with open('d2_out', 'r') as file1:
    d2 = pickle.load(file1)
with open('np_out', 'r') as file1:
    # note: np here is a pickled dict of band counts, not the numpy alias
    np = pickle.load(file1)

goodBbands = [b for b in np.keys() if np[b] > 500 and b != "Ram"]
d3 = {b: d2[b] for b in goodBbands}
data = numpy.array([d2[b] for b in goodBbands]).squeeze()
ave = numpy.mean(d3.values(), axis=0)
stds = numpy.std(d3.values(), axis=0)
sims = scipy.zeros((len(goodBbands), len(goodBbands)))
dists = scipy.zeros((len(goodBbands), len(goodBbands)))

idx = Pycluster.kcluster(data, 3, npass=1)[0]
keys = [goodBbands[i] for i in numpy.argsort(idx)]
for (ia, a) in enumerate(keys):
    for (ib, b) in enumerate(keys):
        v1 = (d3[a] - ave) / stds
        v2 = (d3[b] - ave) / stds
        sims[ia, ib] = (v1.T.dot(v2)) / (scipy.linalg.norm(v1) * scipy.linalg.norm(v2))
        dists[ia, ib] = scipy.linalg.norm(v1 - v2)

labels = zip([str(t) for t in idx[numpy.argsort(idx)]], keys)
fig = plt.figure()
ax = fig.add_subplot(111)
imgplot = ax.imshow(sims, interpolation='none')
ax.set_yticks(xrange(len(keys)))
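For reference, each sims entry computed above is the cosine similarity of the two z-scored band vectors, v1·v2 / (||v1|| ||v2||), and dists holds the corresponding Euclidean distances ||v1 - v2||.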
for t in tokenList:
    if t in tokens:
        tokenV[j] = 1.0
    j += 1
vectorList.append(tokenV)

# Logging intermediate results
print vectorList
print profileVector
print len(tokenList)
print len(vectorList)

features = array(vectorList)
labels, error, nfound = Pycluster.kcluster(features, kc)
centroids = vstack([features[labels == i].mean(0) for i in range(labels.max() + 1)])

s1Vector = defaultdict(list)
s2Vector = defaultdict(list)
Result1 = []
Result2 = []
for l in range(0, len(labels)):
    s2Vector[labels[l]].append(profileVector[l][1])
    company = profileVector[l][0]
    for z in range(0, len(labels)):
        if profileVector[z][0] in profileVector[l][0]:
            if len(profileVector[z][0]) < len(company):
                company = profileVector[z][0]
    s1Vector[labels[l]].append(company)
import networkx as net
import networkx.algorithms as algo
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as la
import Pycluster

g = net.Graph()
g.add_edges_from([(1, 2), (1, 3), (1, 4), (2, 3), (3, 4), (4, 5), (4, 6),
                  (5, 6), (5, 7), (5, 8), (6, 7), (6, 8), (7, 8), (7, 9)])
adj_m = net.adjacency_matrix(g)
w, v = la.eig(adj_m)
# embed each node by its components in the top 2 eigenvectors of adj_m
S = np.mat([[0.0 for i in range(1, 3)] for k in range(1, 10)])
S[:, 0] += v[:, 0]
S[:, 1] += v[:, 1]
B = np.diag((w[0], w[1]))  # diagonal matrix built with the top 2 eigenvalues of adj_m
labels, error, nfound = Pycluster.kcluster(S, 2)