from math import log
from Pycluster import kcluster

def cluster_by_edges(edges):
  k_edges = 2
  clustered_images_by_edges, error, _ = kcluster(edges, k_edges, npass=5)
  previous_error = error * 10
  # Increase k while the log-scaled error keeps improving on the previous pass
  # (the 0.999999 offset compares roughly log(k) against log(k-1) without
  # hitting log(1) = 0).
  #while error < 0.85 * previous_error:
  while 1/log(k_edges+0.0000001)*error > 1/log(k_edges-0.999999)*previous_error and k_edges < len(edges):
    k_edges += 1
    previous_error = error
    clustered_images_by_edges, error, _ = kcluster(edges, k_edges, npass=5)
  # The loop overshoots by one, so step back and recluster with more passes.
  k_edges -= 1
  clustered_images_by_edges, error, nfound = kcluster(edges, k_edges, npass=10)

  return clustered_images_by_edges, k_edges
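A minimal way to drive the function above, assuming each row of edges is a numeric feature vector; the data below is made up for illustration:

# Hypothetical edge-feature vectors, one per image (values invented).
edge_vectors = [[0.0, 0.1, 0.0],
                [0.1, 0.0, 0.1],
                [5.0, 5.1, 4.9],
                [5.2, 4.8, 5.0]]
clusters, k = cluster_by_edges(edge_vectors)
print clusters, k   # one cluster label per image, plus the chosen k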
Example no. 2
from numpy import array, dot, identity, mean, transpose
from numpy.linalg import LinAlgError, inv
from Pycluster import kcluster


def get_mvn_pars(x, nclusters):
#     n, p = x.shape
#     pis = ones(nclusters)/nclusters
#     mus = normal(0, 1, (nclusters, p))
#     omegas = [identity(p) for i in range(nclusters)]

    # Seed the parameter estimates from a hard k-means partition of x.
    z, error, found = kcluster(x, nclusters=nclusters)
    z = array(z)
    n = len(z)

    pis = []
    mus = []
    omegas = []

    for i in range(nclusters):
        cluster = x[z==i]
        pi = len(cluster)/float(n)
        mu = mean(cluster, 0)
        # Maximum-likelihood covariance estimate for this cluster.
        sigma = (1.0/len(cluster))*dot(transpose(cluster - mu), (cluster - mu))

        pis.append(pi)
        mus.append(mu)
        try:
            # Store the precision (inverse covariance) matrix.
            omegas.append(inv(sigma))
        except LinAlgError:
            # Singular covariance: fall back to an identity precision matrix.
            omegas.append(identity(x.shape[1]))

    return pis, mus, omegas
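A hedged usage sketch for the function above; the two synthetic Gaussian blobs are invented for illustration:

from numpy import vstack
from numpy.random import normal

# Two well-separated 2-D blobs, 100 samples each.
x = vstack((normal(0, 1, (100, 2)), normal(5, 1, (100, 2))))
pis, mus, omegas = get_mvn_pars(x, nclusters=2)
print pis   # mixing weights, roughly [0.5, 0.5]
print mus   # cluster means near (0, 0) and (5, 5)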
Example no. 3
 def extend(self, k):
     '''try to improve likelihood:
     likelihood of obs in our models times likelihood of our models
     in parent,
     vs. likelihood of obs in parent'''
     clusterid, err, nfound = kcluster([(p, ) for p in self.data],
                                       k,
                                       npass=5)
     l = [[] for i in range(k)]
     j = 0
     for i in clusterid:  # CONSTRUCT MODEL LISTS OF DATA
         l[i].append(self.data[j])
         j += 1
     newmodel = []
     #means=[]
     for j in range(k):
         model = Model(self, l[j],
                       float(len(l[j])) / len(self.data), self.delta)
         newmodel.append(model)
         #means.append(model.args[0])
     logP = self.computeP(self.data, newmodel)[0]  # COMPUTE NEW LOG-P
     #logP+=self.parent.computeP(means) # COMPUTE LOG-P FOR newmodels
     print 'logP:', logP
     if logP > self.lastP + self.logConfidence:  # ACCEPT THE NEW MODEL
         self.models = newmodel
         self.lastP = logP
         self.k = k
         return True
     return False
Example no. 5
    def cluster_sentences(cls, sentences, n):
        """Cluster the sentences into n clusters.

        Args:
            sentences: [IRSentence]
            n: int, number of clusters

        Returns:
            [int], group id of each sentence in sentences
        """

        vol = set()
        for sentence in sentences:
            tfidf = sentence.get_tfidf()
            for term in tfidf:
                vol.add(term)
        vol = list(vol)
        vecs = []
        for sentence in sentences:
            tfidf = sentence.get_tfidf()
            vec = []
            for term in vol:
                if term in tfidf:
                    vec.append(tfidf[term])
                else:
                    vec.append(0.0)
            vecs.append(vec)
        # call pycluster k-means
        from Pycluster import kcluster, clustercentroids, distancematrix
        labels, error, nfound = kcluster(vecs, nclusters=n, method='a',
                                         dist='u')
        centroids, cmask = clustercentroids(vecs, clusterid=labels, method='a')
        sentence_ids = []
        for centroid_index, centroid in enumerate(centroids):
            # find vecs in the cluster
            subvecs = [centroid]
            subvecindexs = [-1]
            for label_index, label in enumerate(labels):
                if label == centroid_index:
                    subvecs.append(vecs[label_index])
                    subvecindexs.append(label_index)
            # find the vector closest to the centroid
            matrix = distancematrix(subvecs, dist='u')
            minimum = float('inf')
            minimum_index = 0
            for i in xrange(1, len(subvecs)):
                dist = matrix[i][0]
                if dist < minimum:
                    minimum = dist
                    minimum_index = subvecindexs[i]
            sentence_ids.append(minimum_index)

        return labels, sentence_ids
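A sketch of how this classmethod might be driven; the Summarizer class name and the StubSentence stand-in for IRSentence are assumptions made purely for illustration:

class StubSentence(object):
    """Minimal stand-in for IRSentence: returns a fixed tf-idf dict."""
    def __init__(self, tfidf):
        self._tfidf = tfidf
    def get_tfidf(self):
        return self._tfidf

sents = [StubSentence({'cat': 0.9}), StubSentence({'cat': 0.8, 'pet': 0.1}),
         StubSentence({'dog': 0.7}), StubSentence({'dog': 0.6, 'pet': 0.2})]
labels, rep_ids = Summarizer.cluster_sentences(sents, 2)  # hypothetical class
print labels    # cluster id per sentence, e.g. [0, 0, 1, 1] (ids may swap)
print rep_ids   # index of the sentence closest to each cluster centroid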
Example no. 6
	def find_clusters(self):
		#print "Finding clusters.."
		#print "Response times: " + str(self.restimes)
		data_array = array([self.restimes])
		cluster_index, error, nfound = kcluster(data_array,
				nclusters=self.nclusters, mask=self.masks,
				weight=self.weights,
				transpose=self.cluster_transpose,
				npass=self.npasses, method=self.method,
				dist=self.dist)

		return cluster_index
Example no. 7
    def Main(self, model):
        """calculate kmeans"""
        self.model = model
        data = self.model.GetCurrentData()[:]
        
        nclusters = wx.GetNumberFromUser("Kmeans Dialog",
                                         "Enter number of clusters",
                                         "Kmeans nclusters",
                                         1)

        z, error, found = kcluster(data, nclusters=nclusters) #IGNORE:W0612
        self.model.NewGroup('Kmeans%02d' % nclusters)
        self.model.hdf5.createArray(self.model.current_group, 'z', array(z))
        self.model.update()
Example no. 8
def random_forest_cluster(X, k=2, dissimilarity=True, **kwargs):
    """
    Random Forest Cluster: k-means (median method) on a random-forest
    proximity matrix.
    :param X: feature matrix, one sample per row
    :param k: number of clusters
    :param dissimilarity: if True, cluster on 1 - proximity
    :param kwargs: passed through to unsupervised_random_forest
    :return: (fitted forest, proximity or dissimilarity matrix, cluster ids)
    """
    clf, prox_mat = unsupervised_random_forest(X, **kwargs)

    if dissimilarity:
        prox_mat = 1 - prox_mat
    cluster_ids, error, n_found = kcluster(prox_mat, nclusters=k, method="m")

    return clf, prox_mat, cluster_ids
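A usage sketch, assuming unsupervised_random_forest (defined elsewhere in the source module) returns a fitted model plus an (n, n) proximity matrix with entries in [0, 1]:

import numpy as np

X = np.random.rand(50, 4)   # made-up feature matrix: 50 samples, 4 features
clf, prox, ids = random_forest_cluster(X, k=3)
print ids                   # one cluster label per row of X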
Example no. 9
def main():

    schooldata = csv.reader(open(filename),delimiter=',')
#    locales = csv.reader(open('tn_locale_codes.csv'), delimiter=",")
    
    schools = []
    for row in schooldata:
        schools.append(SC.School(row))
    
#    locale_code = {}
#    for row in locales:
#        locale_code[row[1]] = row[0]
            
    school_data = SC.build_data_array(schools)
#    (clusterid, error, nfound) = kcluster(school_data,nclusters=3,npass=3,dist='b')
    (clusterid, error, nfound) = kcluster(school_data,nclusters=5,npass=10,dist='b')
    print clusterid, nfound
Example no. 10
def kmeans(cm, k=2, i=10):

	"""
	Perform K-means clustering on a cross-comparison matrix as returned by
	cross_compare(). Note that clustering of two perfectly segregated groups
	(i.e., groups consisting of identical items) sometimes fails, so that all
	items are clustered into the same group. Not sure why, but this appears to
	affect many implementations of K-Means clustering, not just Pycluster.
	
	This function uses Pycluster. Alternative implementations can be found in
	scipy and python-cluster:
	
	<http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm#pycluster>
	<http://docs.scipy.org/doc/scipy/reference/cluster.vq.html>
	<http://python-cluster.sourceforge.net/>
	
	Arguments:
	cm -- the cross-comparison matrix
	
	Keyword arguments:
	k -- the number of clusters (default=2)
	i -- the number of iterations (default=10)
	
	Returns:
	A key-cluster dictionary
	"""
	
	from Pycluster import kcluster
	
	# First convert the dictionary into a list of tuples so it can be handled
	# by Pycluster
	data = []
	for row in cm:
		data.append(tuple(cm[row].values()))
			
	# Perform the clustering
	clusterid, error, nfound = kcluster(data, nclusters=k, npass=i)
	
	# Parse the results into a dictionary and return
	d = {}
	for j in range(len(clusterid)):
		d[cm.keys()[j]] = clusterid[j]
	return d
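A minimal driver for the function above, with an invented cross-comparison matrix in the nested-dict shape the code expects:

# Hypothetical symmetric similarity scores between four items.
cm = {
    'a': {'a': 1.0, 'b': 0.9, 'c': 0.1, 'd': 0.2},
    'b': {'a': 0.9, 'b': 1.0, 'c': 0.2, 'd': 0.1},
    'c': {'a': 0.1, 'b': 0.2, 'c': 1.0, 'd': 0.8},
    'd': {'a': 0.2, 'b': 0.1, 'c': 0.8, 'd': 1.0},
}
print kmeans(cm)   # e.g. {'a': 0, 'b': 0, 'c': 1, 'd': 1}; labels may swap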
Example no. 11
def create_clustered_samples(points, nclusters, transpose):

    # Label every row (transpose=0) or column (transpose=1) of the point array.
    labels, error, nfound = kcluster(points, nclusters, None, None, transpose,
                                     npass=1, method='a', dist='e', initialid=None)

    cdata, cmask = clustercentroids(points, None, labels, 'a', transpose)
    
    print cdata

    clusteredpoints = list()
    
    for i in range(nclusters):
        clusteredpoints.append(list())
    if transpose == 0:    
        for index in range(len(points)):
            clusteredpoints[labels[index]].append(points[index])
        return clusteredpoints, cdata
    else:
        for i in range(len(clusteredpoints)):
            for types in range(len(points)):
                clusteredpoints[i].append(list())
        for index in range(len(labels)):
            for item in range(len(points)):
                clusteredpoints[labels[index]][item].append(points[item][index])
        #print clusters and some element
        x = cdata[1]
        y = cdata[2]
        
#        fig = figure()
#        ax1 = fig.add_subplot(1,1,1)
#        ax1.scatter(x, y, c='r')
#        ax1.axis([0,max(x)+1,0,max(y)+1])
#        ax1.set_xlabel('number of bodies')
#        ax1.set_ylabel('number of steps')
        
#        x0 = clusteredpoints[0][2]
#        y0 = clusteredpoints[0][3]
#        
#        x1 = clusteredpoints[1][1]
#        y1 = clusteredpoints[1][2]
#        
#        x2 = clusteredpoints[2][1]
#        y2 = clusteredpoints[2][2]
#        
#        x3 = clusteredpoints[3][1]
#        y3 = clusteredpoints[3][2]
#        
#        x4 = clusteredpoints[4][1]
#        y4 = clusteredpoints[4][2]
#        
#        x5 = clusteredpoints[5][1]
#        y5 = clusteredpoints[5][2]
#        
#        ax1.scatter(x0[1:20],y0[1:20], marker='s')
#        ax1.scatter(x1[1:20],y1[1:20], marker='^')
#        ax1.scatter(x2[1:15],y2[1:15], marker='<')
#        ax1.scatter(x3[1:15],y3[1:15], marker='>')
#        ax1.scatter(x4[1:15],y4[1:15], marker='p')
#        ax1.scatter(x5[1:15],y5[1:15], marker='8')
#        show()
        return clusteredpoints, cdata
Example no. 12
import math
from Pycluster import kcluster

def cluster_by_single_feature(feature_matrix):
  # Rule-of-thumb cluster count: k = sqrt(n/2) for n feature vectors.
  k = int(math.floor(math.sqrt(len(feature_matrix)/2.0)))
  clustered_images_by_color, _, _ = kcluster(feature_matrix, k, npass=5)
  return clustered_images_by_color, k
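A quick driver for the rule-of-thumb above (k = sqrt(n/2) for n samples); the single-feature vectors are invented:

# 50 hypothetical one-dimensional colour features, one per image.
feature_matrix = [[float(i % 7)] for i in range(50)]
clusters, k = cluster_by_single_feature(feature_matrix)
print k          # floor(sqrt(50/2)) = 5
print clusters   # 50 cluster labels in range(5)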
Example no. 13
def cluster(top):

    session.exemplars_val = session.metrics[session.exemplars][:,session.best_metrics]

    list_exemp = []
    for i in session.exemplars:
        list_exemp.append(session.list_img[i])

 #   if True or session.best_metrics[0]==-1:
  #      table = TABLE(TR(*[TD(IMG(_src=URL('static','thumbs/'+img),_alt=img)) for img in list_exemp]))

    session.clusterid, error, nfound = kcluster(session.exemplars_val,nclusters=2)

    cluster0_val = None
    cluster1_val = None

    cluster0 = []
    cluster1 = []
    for i,cluster in enumerate(session.clusterid):
        if cluster==0:
            cluster0.append(list_exemp[i])
            if cluster0_val is None:
                cluster0_val = session.exemplars_val[i]
            else:
                cluster0_val = np.vstack((cluster0_val,session.exemplars_val[i]))

        else:
            cluster1.append(list_exemp[i])
            if cluster1_val is None:
                cluster1_val = session.exemplars_val[i]
            else:
                cluster1_val = np.vstack((cluster1_val,session.exemplars_val[i]))

    cluster0_avg = sum(cluster0_val)/len(cluster0_val)
    cluster1_avg = sum(cluster1_val)/len(cluster1_val)
    # Root-mean-square difference between the two cluster averages (6 metrics).
    mse = (sum((cluster0_avg-cluster1_avg)**2)/6)**(.5)
    table_avg  = TABLE(TR(TD("Cluster",_class="average_table_text"),*[TD(session.header[i],_class="average_table_text") for i in session.best_metrics]),
                        TR(TD(1),*[TD(round(cluster0_avg[i],2)) for i in xrange(6)]),
                        TR(TD(2),*[TD(round(cluster1_avg[i],2)) for i in xrange(6)]))

    table_avg["_class"]="average_table"

    cluster0_address = []
    for img in cluster0:
        cluster0_address.append(session.address+'thumbs/'+img)
    while len(cluster0_address)<15:
        cluster0_address.append(URL('static','thumbs/blank.png'))
        cluster0.append("blank.png")
    cluster1_address = []
    for img in cluster1:
        cluster1_address.append(session.address+'thumbs/'+img)
    while len(cluster1_address)<15:
        cluster1_address.append(URL('static','thumbs/blank.png'))
        cluster1.append("blank.png")

    table0 = TABLE(
     TR(*[TD(IMG(_src=cluster0_address[i],_alt=cluster0[i])) for i in xrange(5)]),
     TR(*[TD(IMG(_src=cluster0_address[i],_alt=cluster0[i])) for i in xrange(5,10)]),
     TR(*[TD(IMG(_src=cluster0_address[i],_alt=cluster0[i])) for i in xrange(10,15)]))

    table1 = TABLE(
     TR(*[TD(IMG(_src=cluster1_address[i],_alt=cluster1[i])) for i in xrange(5)]),
     TR(*[TD(IMG(_src=cluster1_address[i],_alt=cluster1[i])) for i in xrange(5,10)]),
     TR(*[TD(IMG(_src=cluster1_address[i],_alt=cluster1[i])) for i in xrange(10,15)]))

    return (table0,table1,table_avg,round(mse,3))
Example no. 14
def clusterAndDrawUS(unused):

    #automatically delete temp US images older than 14 days
    os.system(r"find applications/MetricsFinder/static/temp_US/US_clustered_* -mtime +14 -exec rm {} \;")


    clusterid, error, nfound = kcluster(session.metrics[:][:,session.best_metrics],nclusters=2)

    cluster0_val = None
    cluster1_val = None

    cluster0 = []
    cluster1 = []

    min_x = None
    max_x = None
    min_y = None
    max_y = None

    for i,cluster in enumerate(clusterid):
        col,row = session.list_name[i].split('-')
        x = int(col)-22
        y = int(row)-4

        if min_x is None:
            min_x = x
            max_x = x
            min_y = y
            max_y = y
        min_x = min(x,min_x)
        min_y = min(y,min_y)
        max_x = max(x,max_x)
        max_y = max(y,max_y)

        if cluster==0:
            cluster0.append((x,y))
        else:
            cluster1.append((x,y))

    im = Image.open("applications/MetricsFinder/static/avl_lands_img.png")

    draw = ImageDraw.Draw(im)
    for coord in cluster0:
        draw.ellipse([(coord[0]-1,coord[1]-1),(coord[0]+1,coord[1]+1)], fill="red")

    for coord in cluster1:
        draw.ellipse([(coord[0]-1,coord[1]-1),(coord[0]+1,coord[1]+1)], fill="blue")

    del draw
    img_name = "US_clustered_"+str(int(time.time()*10000))+".png"
    # write to stdout

    delta_x = max_x-min_x
    delta_y = max_y-min_y
    upper_left_x = max(0,min_x-delta_x/2)
    upper_left_y = max(0,min_y-delta_y/2)
    lower_right_x = min(im.size[0],max_x+delta_x/2)
    lower_right_y = min(im.size[1],max_y+delta_y/2)
    region = im.crop((upper_left_x,upper_left_y,lower_right_x,lower_right_y))
    region.save("applications/MetricsFinder/static/temp_US/"+img_name, "PNG")

    ratio = float(lower_right_x-upper_left_x)/(lower_right_y-upper_left_y)

    return (img_name,ratio)
Example no. 15
def kmeans(names,
           tsstags,
           tags,
           outdir,
           norm=True,
           mi=-2000,
           ma=6000,
           bin=100,
           k='5'):  # comma-separated cluster counts; parsed via k.split(',') below
    Xall = []
    N = names.split(',')
    if len(tsstags.split(',')) == 1: TSS = [int(tsstags)] * len(N)
    else: TSS = [int(x) for x in tsstags.split(',')]
    if len(tags.split(',')) == 1: TAGS = [int(tags)] * len(N)
    else: TAGS = [int(x) for x in tags.split(',')]
    INDEX = []
    for n, tss, t in zip(N, TSS, TAGS):
        fi = "../results/tss_profiles/%s/TSStags%i_tags%i_bin%i_avg200_pdist%i_profiles.txt" % (
            n, tss, t, bin, ma)
        X = loadtxt(fi)
        Xall.append(X)
        gfi = "../results/tss_profiles/%s/TSStags%i_tags%i_bin%i_avg200_pdist%i_genes.txt" % (
            n, tss, t, bin, ma)

        for line in open(gfi):
            sym = line.strip('\n')
            INDEX.append((n, sym))
    mi = mi + 1.5 * bin
    ma = ma - bin
    INDEX = array(INDEX)
    Xcat = np.vstack(Xall)
    if norm: X_norm = (Xcat.T / Xcat.sum(axis=1).T).T
    else: X_norm = Xcat
    print X_norm
    #cond_name = fn[0:fn.find('_')]
    #figure()
    figure()
    #Specify the corresponding gene list for the profile array
    #if genes:
    #    #G=[]
    #    Gfi=genes.split(',')
    #    for name,fi in zip(N,Gfi):
    #            #G.append(line.strip('\n'))
    #        #G=array(G)
    #    #INDEX.append(G)
    #    INDEX=array(INDEX)
    k_clusts = [int(x) for x in k.split(',')]
    for i, nclust in enumerate(k_clusts):
        clusterid, error, nfound = kcluster(X_norm, nclusters=nclust, npass=10)
        print 'nclusters=%d' % nclust
        print clusterid
        mean_clusts = []
        num_in_clusts = []

        #CIDS=set(clusterid)
        CIDS = xrange(nclust)

        #This was a quick/dirty solution to rank the clusters by 5' density
        SCORES = []
        for cid in CIDS:
            print cid,
            mean_clust = X_norm[clusterid == cid].mean(axis=0)
            score = mean_clust[:30].sum()
            SCORES.append((score, cid))
        SCORES.sort()
        RANKS = [x[1] for x in SCORES]
        R = {}
        for ind, cid in enumerate(RANKS):
            R[cid] = ind + 1
        #Reorganize clusterid array by cluster ranks
        clusranks = array([R[cid] for cid in clusterid])

        new_cids = xrange(1, nclust + 1)
        for name in N:
            name_dir = '%s/%s' % (outdir, name)
            try:
                os.mkdir(name_dir)
            except OSError:
                pass  # directory already exists
        for cid in new_cids:
            mean_clust = X_norm[clusranks == cid].mean(axis=0)
            mean_clusts.append(mean_clust)
            num_in_clusts.append((clusranks == cid).sum())
            for name in N:
                IN_CLUS = INDEX[clusranks == cid]
                G_out = []
                for n, sym in IN_CLUS:
                    if name == n: G_out.append(sym)
                name_dir = '%s/%s' % (outdir, name)
                savetxt('%s/k%d_%d.txt' % (name_dir, nclust, cid),
                        G_out,
                        fmt='%s')

        save('%s/k%d.npy' % (outdir, nclust), array(mean_clusts))
        save('%s/k%d_ids.npy' % (outdir, nclust), array(clusranks))
        ax = subplot(len(k_clusts), 1, i + 1)
        xax = arange(mi, ma, bin)
        colors = ['blue', 'magenta', 'cyan', 'yellow', 'red']
        for mean_clust, num_in_clust, cid, color in zip(
                mean_clusts, num_in_clusts, new_cids, colors):
            #print xax,mean_clust
            print xax.shape, mean_clust.shape
            ax.plot(xax,
                    mean_clust,
                    label='%d' % (cid),
                    linewidth=2,
                    color=color)
        ax.set_xlabel('Distance to TSS', fontsize='xx-large')
        ax.set_ylabel('Read density', fontsize='xx-large')
        ax.xaxis.set_major_locator(MaxNLocator(4))
        ax.yaxis.set_major_locator(MaxNLocator(4))
        for tick in ax.xaxis.get_major_ticks():
            tick.label.set_fontsize('large')
        for tick in ax.yaxis.get_major_ticks():
            tick.label.set_fontsize('large')
        ax.grid(True)
        legend(loc=1, prop={'size': 18}, markerscale=0.5)

    savefig('%s/kmean_clusts.pdf' % (outdir))
Example no. 16
import numpy

from Pycluster import kcluster

data = numpy.array([
    (1, 1, 0),
    (1, 0, 0),
    (0, 0, 0)
])

print data

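# npass defaults to 1 here, so the labelling below can vary from run to run;
# rows 0 and 1, which differ in a single coordinate, usually share a cluster.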
labels, error, nfound = kcluster(data, 2)
print labels