from math import log

from Pycluster import kcluster


def cluster_by_edges(edges):
    # Start at k=2 and grow k while the 1/log(k)-weighted error keeps
    # improving on the previous k (an elbow-style stopping heuristic).
    k_edges = 2
    clustered_images_by_edges, error, _ = kcluster(edges, k_edges, npass=5)
    previous_error = error * 10
    while 1/log(k_edges+0.0000001)*error > 1/log(k_edges-0.999999)*previous_error and k_edges < len(edges):
        #while error < 0.85 * previous_error:
        k_edges += 1
        previous_error = error
        clustered_images_by_edges, error, _ = kcluster(edges, k_edges, npass=5)
    # Step back to the last k that still improved, then recluster with more passes.
    k_edges -= 1
    clustered_images_by_edges, error, nfound = kcluster(edges, k_edges, npass=10)
    return clustered_images_by_edges, k_edges
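# Hypothetical usage sketch for cluster_by_edges (not from the original
# source): `edge_features` and its shape are made-up stand-ins; assumes
# the imports above and an installed Pycluster.
from numpy.random import rand

edge_features = rand(50, 8)   # e.g. 50 images x 8 edge-histogram bins
labels, k = cluster_by_edges(edge_features)
# labels[i] is the cluster id of image i; k is the number of clusters
# chosen by the 1/log(k)-weighted error heuristic above.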
from numpy import array, dot, identity, mean, transpose
from numpy.linalg import LinAlgError, inv
from Pycluster import kcluster


def get_mvn_pars(x, nclusters):
    # Alternative random initialisation, kept commented out:
    # n, p = x.shape
    # pis = ones(nclusters)/nclusters
    # mus = normal(0, 1, (nclusters, p))
    # omegas = [identity(p) for i in range(nclusters)]
    z, error, found = kcluster(x, nclusters=nclusters)
    z = array(z)
    n = len(z)
    pis = []
    mus = []
    omegas = []
    for i in range(nclusters):
        cluster = x[z == i]
        # Mixing weight, mean, and (biased) covariance of each k-means cluster.
        pi = len(cluster)/float(n)
        mu = mean(cluster, 0)
        sigma = (1.0/len(cluster))*dot(transpose(cluster - mu), (cluster - mu))
        pis.append(pi)
        mus.append(mu)
        try:
            omegas.append(inv(sigma))
        except LinAlgError:
            # Fall back to the identity precision when sigma is singular.
            omegas.append(identity(x.shape[1]))
    return pis, mus, omegas
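# Hypothetical usage sketch (not from the original source): seeds
# Gaussian-mixture parameters from a k-means partition of synthetic
# data. Assumes the imports above.
from numpy.random import normal

x = normal(0, 1, (100, 3))            # 100 observations, 3 features
pis, mus, omegas = get_mvn_pars(x, 2)
# pis: cluster weights summing to 1; mus: per-cluster means;
# omegas: precision matrices (identity where sigma was singular).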
def extend(self, k):
    '''try to improve likelihood: likelihood of obs in our models
    times likelihood of our models in parent, vs. likelihood of obs
    in parent'''
    clusterid, err, nfound = kcluster([(p, ) for p in self.data], k, npass=5)
    l = [[] for i in range(k)]
    j = 0
    for i in clusterid: # CONSTRUCT MODEL LISTS OF DATA
        l[i].append(self.data[j])
        j += 1
    newmodel = []
    #means=[]
    for j in range(k):
        model = Model(self, l[j], float(len(l[j])) / len(self.data), self.delta)
        newmodel.append(model)
        #means.append(model.args[0])
    logP = self.computeP(self.data, newmodel)[0] # COMPUTE NEW LOG-P
    #logP+=self.parent.computeP(means) # COMPUTE LOG-P FOR newmodels
    print 'logP:', logP
    if logP > self.lastP + self.logConfidence: # ACCEPT THE NEW MODEL
        self.models = newmodel
        self.lastP = logP
        self.k = k
        return True
    return False
def cluster_sentences(cls, sentences, n):
    """Cluster the sentences into n clusters.

    Args:
        sentences: [IRSentence]
        n: int, number of clusters
    Returns:
        [int], group id of each sentence in sentences
    """
    # Build the vocabulary as the union of all tf-idf terms.
    vol = set()
    for sentence in sentences:
        tfidf = sentence.get_tfidf()
        for term in tfidf:
            vol.add(term)
    vol = list(vol)
    # Turn each sentence into a dense tf-idf vector over the vocabulary.
    vecs = []
    for sentence in sentences:
        tfidf = sentence.get_tfidf()
        vec = []
        for term in vol:
            if term in tfidf:
                vec.append(tfidf[term])
            else:
                vec.append(0.0)
        vecs.append(vec)
    # call pycluster k-means
    from Pycluster import kcluster, clustercentroids, distancematrix
    labels, error, nfound = kcluster(vecs, nclusters=n, method='a', dist='u')
    centroids, cmask = clustercentroids(vecs, clusterid=labels, method='a')
    sentence_ids = []
    for centroid_index, centroid in enumerate(centroids):
        # find vecs in the cluster
        subvecs = [centroid]
        subvecindexs = [-1]
        for label_index, label in enumerate(labels):
            if label == centroid_index:
                subvecs.append(vecs[label_index])
                subvecindexs.append(label_index)
        # find the min dist vec (the sentence closest to the centroid)
        matrix = distancematrix(subvecs, dist='u')
        minimum = 100000   # sentinel larger than any expected distance
        minimum_index = 0
        for i in xrange(1, len(subvecs)):
            dist = matrix[i][0]
            if dist < minimum:
                minimum = dist
                minimum_index = subvecindexs[i]
        sentence_ids.append(minimum_index)
    return labels, sentence_ids
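# Hypothetical usage sketch (not from the original source): IRSentence
# is defined elsewhere, so a minimal stand-in exposing get_tfidf() is
# used here purely to show the call shape; `cls` is unused by the body,
# so None is passed for it.
class FakeSentence(object):
    def __init__(self, tfidf):
        self.tfidf = tfidf
    def get_tfidf(self):
        return self.tfidf

sents = [FakeSentence({'cat': 0.5, 'mat': 0.2}),
         FakeSentence({'dog': 0.7, 'log': 0.1}),
         FakeSentence({'cat': 0.4, 'hat': 0.3})]
labels, medoid_ids = cluster_sentences(None, sents, 2)
# labels[i]: cluster of sentence i; medoid_ids[c]: index of the
# sentence nearest the centroid of cluster c.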
def find_clusters(self):
    #print "Finding clusters.."
    #print "Response times: " + str(self.restimes)
    data_array = array([self.restimes])
    cluster_index, error, nfound = kcluster(data_array,
                                            nclusters=self.nclusters,
                                            mask=self.masks,
                                            weight=self.weights,
                                            transpose=self.cluster_transpose,
                                            npass=self.npasses,
                                            method=self.method,
                                            dist=self.dist)
    return cluster_index
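# Hypothetical sketch (not from the original source) spelling out the
# keyword arguments find_clusters forwards to kcluster: mask/weight of
# None use every value with equal weight, transpose=0 clusters rows,
# dist='e' is Euclidean distance, and method='a' takes the arithmetic
# mean as each cluster's center. `restimes` is a made-up stand-in.
import numpy as np

restimes = np.array([[0.12], [0.15], [0.91], [0.88]])
ids, err, nfound = kcluster(restimes, nclusters=2, mask=None, weight=None,
                            transpose=0, npass=10, method='a', dist='e')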
def Main(self, model):
    """calculate kmeans"""
    self.model = model
    data = self.model.GetCurrentData()[:]
    nclusters = wx.GetNumberFromUser("Kmeans Dialog",
                                     "Enter number of clusters",
                                     "Kmeans nclusters", 1)
    z, error, found = kcluster(data, nclusters=nclusters) #IGNORE:W0612
    self.model.NewGroup('Kmeans%02d' % nclusters)
    self.model.hdf5.createArray(self.model.current_group, 'z', array(z))
    self.model.update()
def random_forest_cluster(X, k=2, dissimilarity=True, **kwargs):
    """
    Random Forest Cluster

    :param X: feature matrix to cluster
    :param k: number of clusters
    :param dissimilarity: if True, cluster on 1 - proximity instead of proximity
    :param kwargs: passed through to unsupervised_random_forest
    :return: (fitted forest, proximity/dissimilarity matrix, cluster ids)
    """
    clf, prox_mat = unsupervised_random_forest(X, **kwargs)
    if dissimilarity:
        prox_mat = 1 - prox_mat
    # method="m" makes kcluster use the median as each cluster's center.
    cluster_ids, error, n_found = kcluster(prox_mat, nclusters=k, method="m")
    return clf, prox_mat, cluster_ids
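# Hypothetical sketch of just the clustering step (not from the
# original source, since unsupervised_random_forest is defined
# elsewhere): given any symmetric proximity matrix with entries in
# [0, 1], convert it to dissimilarity and partition its rows exactly
# as random_forest_cluster does.
import numpy as np

prox = np.array([[1.0, 0.9, 0.1],
                 [0.9, 1.0, 0.2],
                 [0.1, 0.2, 1.0]])
ids, err, nfound = kcluster(1 - prox, nclusters=2, method="m")
# Rows 0 and 1 (mutually similar) should land in one cluster, row 2
# in the other.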
def main():
    schooldata = csv.reader(open(filename), delimiter=',')
    # locales = csv.reader(open('tn_locale_codes.csv'), delimiter=",")
    schools = []
    for row in schooldata:
        schools.append(SC.School(row))
    # locale_code = {}
    # for row in locales:
    #     locale_code[row[1]] = row[0]
    school_data = SC.build_data_array(schools)
    # (clusterid, error, nfound) = kcluster(school_data, nclusters=3, npass=3, dist='b')
    (clusterid, error, nfound) = kcluster(school_data, nclusters=5, npass=10, dist='b')
    print clusterid, nfound
def kmeans(cm, k=2, i=10):
    """
    Perform K-means clustering on a cross-comparison matrix as returned
    by cross_compare().

    Note that clustering of two perfectly segregated groups (i.e.,
    groups consisting of identical items) sometimes fails, so that all
    items are clustered into the same group. Not sure why, but this
    appears to affect many implementations of K-means clustering, not
    just Pycluster.

    This function uses Pycluster. Alternative implementations can be
    found in scipy and python-cluster:

    <http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm#pycluster>
    <http://docs.scipy.org/doc/scipy/reference/cluster.vq.html>
    <http://python-cluster.sourceforge.net/>

    Arguments:
    cm -- the cross-comparison matrix

    Keyword arguments:
    k -- the number of clusters (default=2)
    i -- the number of iterations (default=10)

    Returns:
    A key-cluster dictionary
    """
    from Pycluster import kcluster

    # First convert the dictionary into a list of tuples so it can be
    # handled by python-cluster
    data = []
    for row in cm:
        data.append(tuple(cm[row].values()))

    # Perform the clustering
    clusterid, error, nfound = kcluster(data, nclusters=k, npass=i)

    # Parse the results into a dictionary and return
    d = {}
    for j in range(len(clusterid)):
        d[cm.keys()[j]] = clusterid[j]
    return d
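# Hypothetical usage sketch (not from the original source): a tiny
# dict-of-dicts cross-comparison matrix standing in for the output of
# cross_compare(); like the function body itself, it relies on
# Python 2's consistent dict ordering within a single run.
cm = {'a': {'a': 0.0, 'b': 0.1, 'c': 0.9},
      'b': {'a': 0.1, 'b': 0.0, 'c': 0.8},
      'c': {'a': 0.9, 'b': 0.8, 'c': 0.0}}
groups = kmeans(cm, k=2, i=10)
# groups maps each key ('a', 'b', 'c') to a cluster id, e.g.
# {'a': 0, 'b': 0, 'c': 1} up to label permutation.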
def create_clustered_samples(points, nclusters, transpose):
    print points[1:6]
    labels, error, nfound = kcluster(points[1:4], nclusters, None, None,
                                     transpose, npass=1, method='a',
                                     dist='e', initialid=None)
    cdata, cmask = clustercentroids(points[1:4], None, labels, 'a', transpose)
    print cdata
    clusteredpoints = list()
    for i in range(nclusters):
        clusteredpoints.append(list())
    if transpose == 0:
        for index in range(len(points)):
            clusteredpoints[labels[index]].append(points[index])
        return clusteredpoints, cdata
    else:
        for i in range(len(clusteredpoints)):
            for types in range(len(points)):
                clusteredpoints[i].append(list())
        for index in range(len(labels)):
            for item in range(len(points)):
                clusteredpoints[labels[index]][item].append(points[item][index])
        #print clusters and some element
        x = cdata[1]
        y = cdata[2]
        # fig = figure()
        # ax1 = fig.add_subplot(1,1,1)
        # ax1.scatter(x, y, c='r')
        # ax1.axis([0,max(x)+1,0,max(y)+1])
        # ax1.set_xlabel('number of bodies')
        # ax1.set_ylabel('number of steps')
        # x0 = clusteredpoints[0][2]
        # y0 = clusteredpoints[0][3]
        #
        # x1 = clusteredpoints[1][1]
        # y1 = clusteredpoints[1][2]
        #
        # x2 = clusteredpoints[2][1]
        # y2 = clusteredpoints[2][2]
        #
        # x3 = clusteredpoints[3][1]
        # y3 = clusteredpoints[3][2]
        #
        # x4 = clusteredpoints[4][1]
        # y4 = clusteredpoints[4][2]
        #
        # x5 = clusteredpoints[5][1]
        # y5 = clusteredpoints[5][2]
        #
        # ax1.scatter(x0[1:20],y0[1:20], marker='s')
        # ax1.scatter(x1[1:20],y1[1:20], marker='^')
        # ax1.scatter(x2[1:15],y2[1:15], marker='<')
        # ax1.scatter(x3[1:15],y3[1:15], marker='>')
        # ax1.scatter(x4[1:15],y4[1:15], marker='p')
        # ax1.scatter(x5[1:15],y5[1:15], marker='8')
        # show()
        return clusteredpoints, cdata
import math

from Pycluster import kcluster


def cluster_by_single_feature(feature_matrix):
    # Rule-of-thumb choice of k: floor(sqrt(n/2)) for n samples.
    k = int(math.floor(math.sqrt(len(feature_matrix)/2.0)))
    clustered_images_by_color, _, _ = kcluster(feature_matrix, k, npass=5)
    return clustered_images_by_color, k
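# Hypothetical usage sketch (not from the original source): with 50
# made-up feature vectors the rule of thumb gives
# k = floor(sqrt(50/2)) = 5 clusters.
from numpy.random import rand

features = rand(50, 10)
labels, k = cluster_by_single_feature(features)   # k == 5 here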
def cluster(top):
    session.exemplars_val = session.metrics[session.exemplars][:, session.best_metrics]
    list_exemp = []
    for i in session.exemplars:
        list_exemp.append(session.list_img[i])
    # if True or session.best_metrics[0]==-1:
    #     table = TABLE(TR(*[TD(IMG(_src=URL('static','thumbs/'+img),_alt=img)) for img in list_exemp]))
    session.clusterid, error, nfound = kcluster(session.exemplars_val, nclusters=2)
    cluster0_val = None
    cluster1_val = None
    cluster0 = []
    cluster1 = []
    for i, cluster in enumerate(session.clusterid):
        if cluster == 0:
            cluster0.append(list_exemp[i])
            if cluster0_val is None:
                cluster0_val = session.exemplars_val[i]
            else:
                cluster0_val = np.vstack((cluster0_val, session.exemplars_val[i]))
        else:
            cluster1.append(list_exemp[i])
            if cluster1_val is None:
                cluster1_val = session.exemplars_val[i]
            else:
                cluster1_val = np.vstack((cluster1_val, session.exemplars_val[i]))
    cluster0_avg = sum(cluster0_val)/len(cluster0_val)
    cluster1_avg = sum(cluster1_val)/len(cluster1_val)
    mse = (sum((cluster0_avg-cluster1_avg)**2)/6)**(.5)
    table_avg = TABLE(TR(TD("Cluster", _class="average_table_text"),
                         *[TD(session.header[i], _class="average_table_text") for i in session.best_metrics]),
                      TR(TD(1), *[TD(round(cluster0_avg[i], 2)) for i in xrange(6)]),
                      TR(TD(2), *[TD(round(cluster1_avg[i], 2)) for i in xrange(6)]))
    table_avg["_class"] = "average_table"
    cluster0_address = []
    for img in cluster0:
        cluster0_address.append(session.address+'thumbs/'+img)
    while len(cluster0_address) < 15:
        cluster0_address.append(URL('static', 'thumbs/blank.png'))
        cluster0.append("blank.png")
    cluster1_address = []
    for img in cluster1:
        cluster1_address.append(session.address+'thumbs/'+img)
    while len(cluster1_address) < 15:
        cluster1_address.append(URL('static', 'thumbs/blank.png'))
        cluster1.append("blank.png")
    table0 = TABLE(
        TR(*[TD(IMG(_src=cluster0_address[i], _alt=cluster0[i])) for i in xrange(5)]),
        TR(*[TD(IMG(_src=cluster0_address[i], _alt=cluster0[i])) for i in xrange(5, 10)]),
        TR(*[TD(IMG(_src=cluster0_address[i], _alt=cluster0[i])) for i in xrange(10, 15)]))
    table1 = TABLE(
        TR(*[TD(IMG(_src=cluster1_address[i], _alt=cluster1[i])) for i in xrange(5)]),
        TR(*[TD(IMG(_src=cluster1_address[i], _alt=cluster1[i])) for i in xrange(5, 10)]),
        TR(*[TD(IMG(_src=cluster1_address[i], _alt=cluster1[i])) for i in xrange(10, 15)]))
    return (table0, table1, table_avg, round(mse, 3))
def clusterAndDrawUS(unused):
    # automatically delete temp US images older than 14 days
    os.system("find applications/MetricsFinder/static/temp_US/US_clustered_* -mtime +14 -exec rm {} \;")
    clusterid, error, nfound = kcluster(session.metrics[:][:, session.best_metrics], nclusters=2)
    cluster0_val = None
    cluster1_val = None
    cluster0 = []
    cluster1 = []
    min_x = None
    max_x = None
    min_y = None
    max_y = None
    for i, cluster in enumerate(clusterid):
        col, row = session.list_name[i].split('-')
        x = int(col)-22
        y = int(row)-4
        if min_x is None:
            min_x = x
            max_x = x
            min_y = y
            max_y = y
        min_x = min(x, min_x)
        min_y = min(y, min_y)
        max_x = max(x, max_x)
        max_y = max(y, max_y)
        if cluster == 0:
            cluster0.append((x, y))
        else:
            cluster1.append((x, y))
    im = Image.open("applications/MetricsFinder/static/avl_lands_img.png")
    draw = ImageDraw.Draw(im)
    for coord in cluster0:
        draw.ellipse([(coord[0]-1, coord[1]-1), (coord[0]+1, coord[1]+1)], fill="red")
    for coord in cluster1:
        draw.ellipse([(coord[0]-1, coord[1]-1), (coord[0]+1, coord[1]+1)], fill="blue")
    del draw
    img_name = "US_clustered_"+str(int(time.time()*10000))+".png"
    # crop to the clustered points with a half-width margin on each side
    delta_x = max_x-min_x
    delta_y = max_y-min_y
    upper_left_x = max(0, min_x-delta_x/2)
    upper_left_y = max(0, min_y-delta_y/2)
    lower_right_x = min(im.size[0], max_x+delta_x/2)
    lower_right_y = min(im.size[1], max_y+delta_y/2)
    region = im.crop((upper_left_x, upper_left_y, lower_right_x, lower_right_y))
    region.save("applications/MetricsFinder/static/temp_US/"+img_name, "PNG")
    ratio = float(lower_right_x-upper_left_x)/(lower_right_y-upper_left_y)
    return (img_name, ratio)
def kmeans(names, tsstags, tags, outdir, norm=True, mi=-2000, ma=6000, bin=100, k=5):
    Xall = []
    N = names.split(',')
    if len(tsstags.split(',')) == 1:
        TSS = [int(tsstags)] * len(N)
    else:
        TSS = [int(x) for x in tsstags.split(',')]
    if len(tags.split(',')) == 1:
        TAGS = [int(tags)] * len(N)
    else:
        TAGS = [int(x) for x in tags.split(',')]
    INDEX = []
    for n, tss, t in zip(N, TSS, TAGS):
        fi = "../results/tss_profiles/%s/TSStags%i_tags%i_bin%i_avg200_pdist%i_profiles.txt" % (
            n, tss, t, bin, ma)
        X = loadtxt(fi)
        Xall.append(X)
        gfi = "../results/tss_profiles/%s/TSStags%i_tags%i_bin%i_avg200_pdist%i_genes.txt" % (
            n, tss, t, bin, ma)
        for line in open(gfi):
            sym = line.strip('\n')
            INDEX.append((n, sym))
    mi = mi + 1.5 * bin
    ma = ma - bin
    INDEX = array(INDEX)
    Xcat = np.vstack(Xall)
    if norm:
        X_norm = (Xcat.T / Xcat.sum(axis=1).T).T
        print X_norm
    #cond_name = fn[0:fn.find('_')]
    #figure()
    figure()
    #Specify the corresponding gene list for the profile array
    #if genes:
    #    #G=[]
    #    Gfi=genes.split(',')
    #    for name,fi in zip(N,Gfi):
    #        #G.append(line.strip('\n'))
    #        #G=array(G)
    #        #INDEX.append(G)
    #    INDEX=array(INDEX)
    k_clusts = [int(x) for x in k.split(',')]
    for i, nclust in enumerate(k_clusts):
        clusterid, error, nfound = kcluster(X_norm, nclusters=nclust, npass=10)
        print 'nclusters=%d' % nclust
        print clusterid
        mean_clusts = []
        num_in_clusts = []
        #CIDS=set(clusterid)
        CIDS = xrange(nclust)
        # This was a quick/dirty solution to rank the clusters by 5' density
        RANKS, SCORES = {}, []
        for cid in CIDS:
            print cid,
            mean_clust = X_norm[clusterid == cid].mean(axis=0)
            score = mean_clust[:30].sum()
            SCORES.append((score, cid))
        SCORES.sort()
        RANKS = [x[1] for x in SCORES]
        R = {}
        for ind, cid in enumerate(RANKS):
            R[cid] = ind + 1
        # Reorganize clusterid array by cluster ranks
        clusranks = array([R[cid] for cid in clusterid])
        new_cids = xrange(1, nclust + 1)
        for name in N:
            name_dir = '%s/%s' % (outdir, name)
            try:
                os.mkdir(name_dir)
            except OSError:
                pass  # directory already exists
        for cid in new_cids:
            mean_clust = X_norm[clusranks == cid].mean(axis=0)
            mean_clusts.append(mean_clust)
            num_in_clusts.append((clusranks == cid).sum())
            for name in N:
                IN_CLUS = INDEX[clusranks == cid]
                G_out = []
                for n, sym in IN_CLUS:
                    if name == n:
                        G_out.append(sym)
                name_dir = '%s/%s' % (outdir, name)
                savetxt('%s/k%d_%d.txt' % (name_dir, nclust, cid), G_out, fmt='%s')
        save('%s/k%d.npy' % (outdir, nclust), array(mean_clusts))
        save('%s/k%d_ids.npy' % (outdir, nclust), array(clusranks))
        ax = subplot(len(k_clusts), 1, i + 1)
        xax = arange(mi, ma, bin)
        colors = ['blue', 'magenta', 'cyan', 'yellow', 'red']
        for mean_clust, num_in_clust, cid, color in zip(
                mean_clusts, num_in_clusts, new_cids, colors):
            #print xax,mean_clust
            print xax.shape, mean_clust.shape
            ax.plot(xax, mean_clust, label='%d' % (cid), linewidth=2, color=color)
        ax.set_xlabel('Distance to TSS', fontsize='xx-large')
        ax.set_ylabel('Read density', fontsize='xx-large')
        ax.xaxis.set_major_locator(MaxNLocator(4))
        ax.yaxis.set_major_locator(MaxNLocator(4))
        for tick in ax.xaxis.get_major_ticks():
            tick.label.set_fontsize('large')
        for tick in ax.yaxis.get_major_ticks():
            tick.label.set_fontsize('large')
        ax.grid(True)
        legend(loc=1, prop={'size': 18}, markerscale=0.5)
    savefig('%s/kmean_clusts.pdf' % (outdir))
import numpy
from Pycluster import kcluster

data = numpy.array([(1, 1, 0),
                    (1, 0, 0),
                    (0, 0, 0)])
print data

labels, error, nfound = kcluster(data, 2)
print labels