Example #1
def findcenters(x,n=1000,k=6):
    #get dimensions
    m = x.shape[1]
    #create centers as empty
    centers = DataFrame(np.zeros(shape=(k,m)))

    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters = k, transpose=0,
                                        method='a', dist='e', npass = 1)
        center, _ = Pycluster.clustercentroids(x,clusterid = labels)
        #sort centers by the distance to the origin
        center = sorted(center,key = lambda t: np.linalg.norm(np.array(t)-np.zeros(m)), reverse = True)

        #print np.linalg.norm(np.array(center[0])-np.zeros(m))
        #print np.linalg.norm(np.array(center[1])-np.zeros(m))
        #print np.linalg.norm(np.array(center[2])-np.zeros(m))
        #print np.linalg.norm(np.array(center[3])-np.zeros(m))
        #print np.linalg.norm(np.array(center[4])-np.zeros(m))
        #print np.linalg.norm(np.array(center[5])-np.zeros(m))
        #print np.array(center[0])
        #print np.array(center[1])
        #print np.array(center[2])
        #print np.array(center[3])
        #print np.array(center[4])
        #print np.array(center[5])
        #take the average
        for j in range(k):
            centers.iloc[j, :] = centers.iloc[j, :] + center[j]
    centers = centers/n
    return centers
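A short usage sketch for findcenters (hypothetical data; assumes the imports the snippet relies on — numpy as np, pandas.DataFrame, and Pycluster). Because k-means labels are arbitrary, the centers are sorted by norm before averaging across restarts:

import numpy as np
import Pycluster
from pandas import DataFrame

x = np.random.rand(60, 4)          # illustrative data: 60 points in 4 dimensions
avg_centers = findcenters(x, n=100, k=6)
print avg_centers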
Example #2
    def suggest(self, word):
        v = self.analyze(word)

        # pick first x
        res = []
        for nword, nv in self.ndx.items():
            wsim = self.compute_similarity([v, nv])
            res.append((wsim, nword, self.as_vector(nv)))
        res.sort()
        res = res[::-1]

        # from first y pick the most distant ones
        # fresh names so the comprehensions don't rebind word/v (Python 2 leaks them)
        res2 = [vec for (sim, w, vec) in res]
        resw = [w for (sim, w, vec) in res]
        lab, err, nfound = Pycluster.kcluster(res2, 40)

        resg = defaultdict(lambda: [])
        for i, l in enumerate(lab):
            resg[l] += [res[i]]

        res_sug = []
        used_groups = set()
        for l, w in zip(lab, resw):
            if l not in used_groups:
                res_sug += [w]
                used_groups.add(l)
                
        return res_sug
Example #3
	def multikmeans(self, krange=None):
		# The magic recipe
		
		if krange is None:
			kr = np.arange(2, len(self.mat) - 1)
		else:
			kr = krange
		lmat = len(self.mat)
		
		accords = np.zeros((lmat, lmat), dtype=int)  # counts how many times each pair of documents is clustered together
		t = deque()  # deque instead of list to save time & memory
		t0 = time()
		k2s = lambda x: x * 0.85
		tunits = k2s(np.array(kr)).sum()
		
		# The loop itself
		for k in kr:
			t1 = time()
			
			# K-means
			c, err, nfound = pc.kcluster(self.mat, k)
			
			# Update the agreement counts
			for i in np.unique(c):
				accords[c == i] += c == i
			
			# Predict the remaining time
			t2 = time()
			tunits -= k2s(k)
			t.append((t2 - t1) / k2s(k))
			prediction = tunits * np.mean(tuple(t)[-20:])
			print "k={0}: \t{1} ({2} since start) \t{3} remaining".format(k, human_time(t2 - t1), human_time(t2 - t0), human_time(prediction))
		
		# Normalize by the number of runs, not by the last k
		return accords / float(len(kr))
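A short usage sketch (hypothetical: mk is an instance of the class exposing multikmeans over a document matrix):

import numpy as np
agree = mk.multikmeans(krange=np.arange(2, 12))   # fraction of runs each pair co-clustered
stable_pairs = np.argwhere(agree > 0.9)           # pairs grouped together in >90% of runs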
Example #4
def findcenters(x, n=1000, k=6):
    # get dimensions
    m = x.shape[1]
    # create centers as empty
    centers = DataFrame(np.zeros(shape=(k, m)))

    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters=k, transpose=0, method="a", dist="e", npass=1)
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # sort centers by the distance to the origin
        center = sorted(center, key=lambda t: np.linalg.norm(np.array(t) - np.zeros(m)), reverse=True)

        # print np.linalg.norm(np.array(center[0])-np.zeros(m))
        # print np.linalg.norm(np.array(center[1])-np.zeros(m))
        # print np.linalg.norm(np.array(center[2])-np.zeros(m))
        # print np.linalg.norm(np.array(center[3])-np.zeros(m))
        # print np.linalg.norm(np.array(center[4])-np.zeros(m))
        # print np.linalg.norm(np.array(center[5])-np.zeros(m))
        # print np.array(center[0])
        # print np.array(center[1])
        # print np.array(center[2])
        # print np.array(center[3])
        # print np.array(center[4])
        # print np.array(center[5])
        # take the average
        for j in range(k):
            centers.iloc[j, :] = centers.iloc[j, :] + center[j]
    centers = centers / n
    return centers
Example #5
def grapeCluster(vectors, iterationCountPerBurst, maximumPixelDiameter, minimumPixelDiameter):
    # If we have no vectors, return empty array
    if not vectors:
        return []
    # Assign all vectors to a single cluster
    globalClusters = [numpy.array(vectors)]
    globalCount = len(vectors)
    globalClusterMeans = []
    # While there are globalClusters,
    while globalClusters:
        # Pop the last cluster
        globalCluster = globalClusters.pop()
        # Measure size
        sizeCategory = measureClusterSize(globalCluster, maximumPixelDiameter, minimumPixelDiameter)
        # If it is too big,
        if sizeCategory > 0:
            # Burst it
            # assignments = scipy.cluster.vq.kmeans2(globalCluster, k=2, iter=iterationCountPerBurst)[1]
            assignments = Pycluster.kcluster(globalCluster, npass=iterationCountPerBurst)[0]
            # Extract localClusters
            booleanAssignments = numpy.array(assignments) > 0
            localClusters = globalCluster[booleanAssignments], globalCluster[~booleanAssignments]
            # Push localClusters to the end of the stack
            globalClusters.extend(localClusters)
        # If it is the right size, append the weighted mean
        elif sizeCategory == 0:
            globalClusterMeans.append(computeWeightedMean(globalCluster))
        # Show feedback
        view.printPercentUpdate(globalCount - len(globalClusters), globalCount)
    # Return
    view.printPercentFinal(globalCount)
    return globalClusterMeans
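grapeCluster leans on helpers defined elsewhere in its module (measureClusterSize, computeWeightedMean, view). A minimal sketch of what measureClusterSize might look like, under the convention the calling code implies (positive = too big, 0 = acceptable, negative = too small) and using the bounding-box diagonal as the pixel diameter — an assumption, not the original implementation:

import numpy

def measureClusterSize(cluster, maximumPixelDiameter, minimumPixelDiameter):
    # Approximate the cluster's pixel diameter by its bounding-box diagonal
    span = cluster.max(axis=0) - cluster.min(axis=0)
    diameter = numpy.sqrt((span ** 2).sum())
    if diameter > maximumPixelDiameter:
        return 1    # too big: burst it
    if diameter < minimumPixelDiameter:
        return -1   # too small
    return 0        # acceptable size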
Example #6
def cluster(parser, k):
    """
    general method for clustering data
    """
    
    #get index number for every page
    code_book = parser.get_data_encoding(page_min_occurance=5)
    
    #use only sequence of pages visited
    simple_session = [session for session in parser.get_simple_sessions() if config.session_filter_fn(session)]
    
    #use vector representation (v1,v2,v2) where v1 means page v1 was visited    
    #models = session_modeling.convert_sessions_to_vector(simple_session, code_book, binary=True)
    
    #construct markov chains, estimate transition probabilities
    models = session_modeling.convert_sessions_to_markov(simple_session, code_book, bayes=False)
    idx, sse, _ = Pycluster.kcluster(models, k, method='a', dist='e')
 
    #idx, sse, _ = cluster_kmedoids(models, k, string_similarity.jaccard_distance)
    

    clusters = {}
    for name, clusterid in zip(simple_session, idx):
        clusters.setdefault(clusterid, []).append(name)
    
    return clusters, sse
Example #7
def clusterAndPlot(df, k, height=10, engine='PyCluster', cmap='spectral'):
    '''calculate and plot kmean clustering'''
    fig, axes = plt.subplots(k + 1,
                             figsize=(18, height),
                             sharex='all',
                             sharey='all')

    if engine == 'scipy':
        centroids, labels = kmeans2(df, k, iter=100, thresh=1e-05)
    else:
        labels, error, nfound = Pycluster.kcluster(df, k)
    df['label'] = labels

    colors = nColors(k=k, cmap=cmap)

    # one by one
    for l, g in df.groupby('label'):
        g.T.plot(ax=axes[l], legend=0, c=colors[l], alpha=.2)
        axes[l].set_title('cluster %d, %d zipcodes' % (l, len(g)))

        pd.Series(g.mean(0)).plot(ax=axes[-1],
                                  label='cluster %d' % (l),
                                  c=colors[l])

    #     plt.legend()
    return df
Example #8
	def testPricesDiffsVecsKmeansClustering(self):
		"""Testing whether kmeans clustering with prices differences
		   vectors works."""

		prices_diffs_vecs = utils.make_prices_diffs_vecs(self.data1)		
		labels, wcss, n = Pycluster.kcluster(prices_diffs_vecs, 3, npass=100)
		clusters = utils.make_groups_from_labels(labels, self.data1)

		# The result should look something like this, modulo group numbers.
		# With npass=100 the probability of getting a different grouping is
		# very low, but it can still happen.

		suggested_clusters = {0: ['E'], 1: ['A', 'D'], 2: ['B', 'C']}

		# Let's check this.

		num_matches = 0

		for cluster in clusters.values():
			cluster.sort()
			for suggested_cluster in suggested_clusters.values():
				suggested_cluster.sort()
				if cluster == suggested_cluster:
					num_matches = num_matches + 1

		# We've verified that each suggested cluster appears in the
		# output of kcluster, and since the clusters dict has length 3,
		# the two groupings must be equal.

		self.assertEqual(num_matches, 3)
		self.assertEqual(len(clusters), 3)
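The test relies on two helpers from utils that are not shown. A minimal sketch of what make_groups_from_labels plausibly does, inferred from how the test consumes its output (a dict mapping each cluster id to the keys assigned to it) — an assumption, not the original utils code:

def make_groups_from_labels(labels, data):
    # data is assumed to be a dict keyed by instrument name ('A'..'E');
    # labels[i] is the cluster id kcluster assigned to the i-th key.
    groups = {}
    for key, label in zip(sorted(data.keys()), labels):
        groups.setdefault(label, []).append(key)
    return groups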
Example #9
    def suggest(self, word):
        v = self.analyze(word)

        # pick first x
        res = []
        for nword, nv in self.ndx.items():
            wsim = self.compute_similarity([v, nv])
            res.append((wsim, nword, self.as_vector(nv)))
        res.sort()
        res = res[::-1]

        # from first y pick the most distant ones
        # fresh names so the comprehensions don't rebind word/v (Python 2 leaks them)
        res2 = [vec for (sim, w, vec) in res]
        resw = [w for (sim, w, vec) in res]
        lab, err, nfound = Pycluster.kcluster(res2, 40)

        resg = defaultdict(lambda: [])
        for i, l in enumerate(lab):
            resg[l] += [res[i]]

        res_sug = []
        used_groups = set()
        for l, w in zip(lab, resw):
            if l not in used_groups:
                res_sug += [w]
                used_groups.add(l)

        return res_sug
Example #10
def pyclustertest():
    data = sp.rand(100, 4)
    cid, e, n = pcl.kcluster(data)
    # clustercentroids must be called on the data that was clustered
    centroids, cmask = pcl.clustercentroids(data, clusterid=cid)

    print data
    print centroids
Example #11
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    #fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0,
                                      method='a', dist='e', npass=n)
    #write labels back
    x.loc[:, "group"] = labels
    return x
Example #12
def findk(x, n=1000, minK=2, maxK=20):
    errors = []
    # fit k-means clusters for n times
    for i in range(minK, maxK + 1, 1):
        _, error, nfound = Pycluster.kcluster(x, nclusters=i, transpose=0, method="a", dist="e", npass=n)
        # get errors
        errors.append(error)
        print i
    print errors
Example #13
def clusters(labels, data, k):
	kclus = Pycluster.kcluster(data, k, npass=1)[0]
	nx = numpy.zeros((len(labels), len(labels)), dtype=numpy.float32)
	for ind1 in range(len(labels)):
		for ind2 in range(len(labels)):
			if kclus[ind1] == kclus[ind2]:
				nx[ind1][ind2] = 1
	print k, " of ", len(labels)
	return nx
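clusters() returns a co-membership indicator matrix for one k-means run. A hedged usage sketch averaging it over several runs to build a consensus matrix (same idea as Example #3; labels and data are assumed to come from the caller):

import numpy
runs = 25
consensus = numpy.zeros((len(labels), len(labels)), dtype=numpy.float32)
for _ in range(runs):
    consensus += clusters(labels, data, 8)
consensus /= runs   # fraction of runs in which each pair co-clustered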
Example #14
def findk(x, n = 1000, minK = 2, maxK = 20):
    errors = []
    #fit k-means clusters for n times
    for i in range(minK,maxK+1,1):
        _, error, nfound = Pycluster.kcluster(x, nclusters = i, transpose=0,
                                         method='a', dist='e', npass = n)
        #get errors
        errors.append(error)
        print i
    print errors
Example #15
def cluster_spw_rpw(list_of_recs):
	number_of_clusters = 8
	only_serve_return = []
	if list_of_recs == []:
		print "ERROR"
		return []  # avoid crashing on empty input
	for rec in list_of_recs:
		only_serve_return.append([float(rec[0]),float(rec[1])])
	k = get_k_value(only_serve_return)
	labels, error, nfound = Pycluster.kcluster(scipy.array(only_serve_return), k)
	return labels
Example #16
File: test.py Project: LKF10051/ML
def myCKDemo(filename, n):
    # The next two statements load the data; the columns used for
    # clustering are selected with usecols
    data = np.loadtxt(filename, delimiter=",", usecols=(2, 4, 14, 8))
    # these columns hold the city coordinates, used for the final scatter plot
    xy = np.loadtxt(filename, delimiter=",", usecols=(2, 4))
    # clustermap is the clustering result: the cluster id of each data row
    clustermap = pc.kcluster(data, n)[0]
    # centroids holds the coordinates of the cluster centers
    centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
    # m is the distance matrix
    m = pc.distancematrix(data)

    # mass records the number of points in each cluster
    mass = np.zeros(n)
    for c in clustermap:
        mass[c] += 1

    # sil is the silhouette matrix: one row per point, one column per cluster
    sil = np.zeros(n * len(data))
    sil.shape = (len(data), n)

    for i in range(0, len(data)):
        for j in range(i + 1, len(data)):
            d = m[j][i]
            sil[i, clustermap[j]] += d
            sil[j, clustermap[i]] += d

    for i in range(0, len(data)):
        sil[i, :] /= mass

    # The silhouette coefficient s evaluates clustering quality.
    # It lies between -1 and 1; the larger, the better the clustering.
    # Below 0 means points are on average closer to another cluster
    # than to their own, i.e. a poor clustering.
    # Values close to 1 indicate a good clustering.
    s = 0
    for i in range(0, len(data)):
        c = clustermap[i]
        a = sil[i, c]
        b = min(sil[i, range(0, c) + range(c + 1, n)])
        si = (b - a) / max(b, a)
        s += si

    print n, s / len(data)

    # Draw the scatter plot with matplotlib.
    fig, ax = pl.subplots()
    # cmap assigns a distinct color to each cluster
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    # xy holds the coordinates, used to place each city on the map
    x = [list(d)[0] for d in xy]
    y = [list(d)[1] for d in xy]
    cax = ax.scatter(x, y, c=clustermap, s=30, cmap=cmap, vmin=0, vmax=n)
    pl.show()
Example #17
def cluster_spw_rpw(list_of_recs):
    number_of_clusters = 8
    only_serve_return = []
    if list_of_recs == []:
        print "ERROR"
        return []  # avoid crashing on empty input
    for rec in list_of_recs:
        only_serve_return.append([float(rec[0]), float(rec[1])])
    k = get_k_value(only_serve_return)
    labels, error, nfound = Pycluster.kcluster(scipy.array(only_serve_return),
                                               k)
    return labels
Example #18
    def _G(self, data, K):
        labels, _, _ = Pycluster.kcluster(data.T, K)
        centers, _ = Pycluster.clustercentroids(data.T, clusterid=labels)
        centers = centers.T
        G = zeros((K, data.shape[1]))
        
        for k in range(K):
            D = data - expand_dims(centers[:, k], axis=1)
            G[k, :] = -sqrt(sum(multiply(D, D), axis=0))

        return G
Example #19
    def _G(self, data, K):
        labels, _, _ = Pycluster.kcluster(data.T, K)
        centers, _ = Pycluster.clustercentroids(data.T, clusterid=labels)
        centers = centers.T
        G = zeros((K, data.shape[1]))

        for k in range(K):
            D = data - expand_dims(centers[:, k], axis=1)
            G[k, :] = -sqrt(sum(multiply(D, D), axis=0))

        return G
Example #20
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0, method="a", dist="e", npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    # count how many items in each group
    labels = list(labels)
    for i in range(k):
        print labels.count(i)
    return x
Example #21
def cluster():
	x = [[76.0,32.0],[63.0,40.0],[70.0,30.0],[64.0,45.0]]
	k = 2
	labels, error, nfound = Pycluster.kcluster(scipy.array(x),k)
	print "Input data:"
	print "   spw " + "  rpw"
	j = 1
	for i in x:
		print str(j)+") "+str(i[0]) + "  " + str(i[1])
		j += 1
	print " "
	print "clusters: " + str(labels)
Example #22
def cluster():
    x = [[76.0, 32.0], [63.0, 40.0], [70.0, 30.0], [64.0, 45.0]]
    k = 2
    labels, error, nfound = Pycluster.kcluster(scipy.array(x), k)
    print "Input data:"
    print "   spw " + "  rpw"
    j = 1
    for i in x:
        print str(j) + ") " + str(i[0]) + "  " + str(i[1])
        j += 1
    print " "
    print "clusters: " + str(labels)
Example #23
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.GetoptError, msg:
            raise Usage(msg)
        try:
            
            nodesFile = argv[1]
            #nodesFile="C:\Users\Selin\Desktop\k-means\TibyOutput\TibyNodes.txt"
            
        except IndexError:
            raise Error("Not enough arguments provided to script.")

        nodes=readNodesFromTxtForKmeans(nodesFile)
        nodes = numpy.array(nodes)
        
        #results,assign=kmeans2(whiten(features),2,iter=20,thresh=0.0000000000000000001)
        #results,assignment=kmeans2(features,2,iter=100,thresh=0.0000000000000000000000000000000000000001)
        results = Pycluster.kcluster(array(nodes),nclusters=30,npass=50,method='m')
        assignments=results[0]
        #print results
        # Roy's verison of making cluster dict
        #clusterIDs = set(assignments)

        #clusterByClusterID = dict((clusterID, nodes[assignments == clusterID]) for clusterID in clusterIDs)
       #print clusterByClusterID[0]
        
        xs = [node[0] for node in array(nodes)]
        ys = [node[1] for node in array(nodes)]
        clusterByClusterID=collections.defaultdict(list)
        
        for x, y, clusterID in itertools.izip(xs, ys, assignments):
            #if clusterID not in clusterByClusterID:
            #    clusterByClusterID[clusterID] = []
            clusterByClusterID[clusterID].append((x, y))
        #print clusterDictByID[3]    
        # print data

        #pylab.axis([-10000, 500000, -10000, 500000])
        pylab.figure()
        pylab.hold(True)
        colors=['r','b','g','c','m','y','k','w','#ff6c01','#00cd00']
        #colors=['r','b','g','c','m','y','k','w']
        #colors=['burlywood']
        #colors = 'rbgcmykw'
        for clusterID, color in itertools.izip(clusterByClusterID.keys(), itertools.cycle(colors)):
            #print clusterByClusterID[clusterID]
            plotCluster(clusterByClusterID[clusterID], color)
        print results[1], results[2]
Example #24
def kmeans(data, **kwargs):
    """
    Perform k-means clustering on unstructured N-dimensional data.
    
    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (returned number of clusters may be less than k).
        - npasses: The number of times the k-means clustering algorithm is performed, 
        each time with a different (random) initial condition.
        - method: describes how the center of a cluster is found: 
            - method=='a': arithmetic mean.
            - method=='m': median.
        - initialCenters: a set of points that should be used as the initial
                          cluster centers
            
    @rtype: tuple
    @return: A list where each element indicates the cluster membership of the 
        corresponding index in the original data and a message string
    """
    k = 1
    npasses = 1
    method = 'a'
    initialCenters = None
    smartCenters = False
    msg = ''
    
    if 'numClusters' in kwargs:
        k = int(kwargs['numClusters'])
    if 'npasses' in kwargs:
        npasses = int(kwargs['npasses'])
    if 'method' in kwargs:
        method = kwargs['method']
    if 'initialCenters' in kwargs:
        initialCenters = kwargs['initialCenters']
    if 'smartCenters' in kwargs:
        smartCenters = kwargs['smartCenters']
    
    
    logData = tm.getMethod('log')(data)
    if initialCenters is None:  # no seeds given: let Pycluster pick random initial centers
        (clusterIDs, err, nOpt) = pc.kcluster(logData, k, npass=npasses, method=method)
        msg = "Number of rounds optimal solution was found: %i" % nOpt
    else:
        logCenters = tm.getMethod('log')(np.array(initialCenters[:k]))
        (centroids, clusterIDs) = kmeans2(logData, logCenters, minit='matrix')
        if len(np.unique(clusterIDs)) < k:
            wx.MessageBox('Warning: One or more of the returned clusters are empty. Please choose different initial cluster centers and re-run k-means for better results.', 'Insufficiently varied cluster centers', wx.OK | wx.ICON_WARNING)
            
    
    return clusterIDs, msg
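A short usage sketch (hypothetical data; tm, pc, and kmeans2 as imported by the example's module — note the log transform requires strictly positive values):

import numpy as np
data = np.random.rand(200, 3) + 0.1   # keep values positive for the log transform
labels, msg = kmeans(data, numClusters=4, npasses=10, method='a')
print msg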
Example #25
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    #fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0,
                                      method='a', dist='e', npass=n)
    #write labels back
    x.loc[:, "group"] = labels
    #count how many items in each group
    labels = list(labels)
    for i in range(k):
        print labels.count(i)
    return x
Example #26
def clustering(file_path, k, dist_measure, PLOT):
    """
    Do the K-means clustering for input data.

    @param file_path: Input data file.
    @param k: Number of centers in K-means algorithm.
    @param dist_measure: Distance measure (in this case, we use Manhattan distance).
    @param PLOT: Bool variable, check if plot the result (set it as True only in testing).
    @return: Clusters id for all data points in the input data file.
    """

    data = numpy.genfromtxt(file_path, delimiter=',')

    if len(data.shape) == 1:
        return [-1]

    print "-- Processing file: " + file_path + "  -- Data points: " + str(len(data))
    print "-- Start clustering"

    k = set_k(len(data), k)
    ite_num = method_name(len(data))

    # Do the K-means clustering
    cluster_id, _, _ = Pycluster.kcluster(data, nclusters=k, mask=None, weight=None, transpose=0, npass=ite_num,
                                          method='a', dist=dist_measure, initialid=None)

    if PLOT is False:
        return cluster_id

    # Draw the clustering result plot.
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)

    if PLOT:
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)
    else:
        data_2d = data
        centroids_2d = centroids

    color = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
             '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC']

    for i in range(k):
        scatter(data_2d[cluster_id == i, 0], data_2d[cluster_id == i, 1], color=color[i % 12])

    plot(centroids_2d[:, 0], centroids_2d[:, 1], 'sg', markersize=8)
    show()

    return cluster_id
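A hedged usage sketch (set_k and method_name are helpers from the same module that cap k and choose the pass count; the CSV path is illustrative):

cluster_ids = clustering('points.csv', k=5, dist_measure='b', PLOT=False)   # 'b' = city-block (Manhattan)
print cluster_ids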
Example #27
def main():
    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    logging.debug('Reading %s', args.input.name)
    src_nodes = np.loadtxt(args.input, dtype=int)

    lattice_width, lattice_height = src_nodes.shape
    lattice = nx.grid_2d_graph(lattice_width, lattice_height)

    # find and remove wall
    wall_nodes = map(lambda e: tuple(e), np.transpose(np.nonzero(src_nodes)))
    lattice.remove_nodes_from(wall_nodes)
    assert len(lattice.nodes()) == (lattice_width * lattice_height -
                                    len(wall_nodes))

    nodelist = list(lattice.nodes())
    node_ids = {n: i for i, n in enumerate(nodelist)}
    assert len(nodelist) == len(node_ids)

    # compute normalized laplacian
    norm_lapl = normalized_laplacian(lattice, nodelist, node_ids)

    # compute eigenvalues and eigenvectors
    eigen_val, eigen_vec = np.linalg.eig(norm_lapl)
    # kmeans
    labels, _, _ = Pycluster.kcluster(eigen_vec[:, :args.kappa + 1],
                                      args.kappa,
                                      dist='e',
                                      npass=100,
                                      initialid=None)
    # assign colors
    colors = [COLORS[i] for i in labels]
    assert len(colors) == len(labels)
    # compute grid lattice_height x lattice_width containing colors
    grid = []
    colored, non_colored = 0, 0
    its = 0
    for i in xrange(lattice_height):
        grid.append([])
        for j in xrange(lattice_width):
            node_id = node_ids.get((i, j))
            color = colors[node_id] if node_id is not None else BLACK
            grid[i].append(color)
            if color == BLACK:
                non_colored += 1
            else:
                colored += 1
    assert non_colored == len(wall_nodes)
    display(grid)
Example #28
def main():
    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    logging.debug('Reading %s', args.input.name)
    src_nodes = np.loadtxt(args.input, dtype=int)

    lattice_width, lattice_height = src_nodes.shape
    lattice = nx.grid_2d_graph(lattice_width, lattice_height)

    # find and remove wall
    wall_nodes = map(lambda e: tuple(e),
                     np.transpose(np.nonzero(src_nodes)))
    lattice.remove_nodes_from(wall_nodes)
    assert len(lattice.nodes()) == (lattice_width * lattice_height - len(wall_nodes))

    nodelist = list(lattice.nodes())
    node_ids = {n: i for i, n in enumerate(nodelist)}
    assert len(nodelist) == len(node_ids)

    # compute normalized laplacian
    norm_lapl = normalized_laplacian(lattice, nodelist, node_ids)
    
    # compute eigenvalues and eigenvectors
    eigen_val, eigen_vec = np.linalg.eig(norm_lapl)
    # kmeans
    labels, _, _ = Pycluster.kcluster(eigen_vec[:, :args.kappa+1],
                                      args.kappa,
                                      dist='e', npass=100, initialid=None)
    # assign colors
    colors = [COLORS[i] for i in labels]
    assert len(colors) == len(labels)
    # compute grid lattice_height x lattice_width containing colors
    grid = []
    colored, non_colored = 0, 0
    its = 0
    for i in xrange(lattice_height):
        grid.append([])
        for j in xrange(lattice_width):
            node_id = node_ids.get((i, j))
            color = colors[node_id] if node_id is not None else BLACK
            grid[i].append(color)
            if color == BLACK:
                non_colored += 1
            else:
                colored += 1
    assert non_colored == len(wall_nodes)
    display(grid)
Example #29
def cluster(fname, nclust):
    fh = open(fname, 'r')
    lines = fh.readlines()
    fh.close()

    clusters = int(nclust)

    points = []
    points_r = []
    dates = []
    volumes = []
    close_prices = []

    for i in range(len(lines)):
        if i <= 1:
            continue
        line_c = lines[i - 1].strip().split(',')
        close_price = float(line_c[0])
        volume = float(line_c[1])

        points_r.append((close_price, volume))
        volumes.append(volume)
        close_prices.append(close_price)
        #dates.append(line_c[0])

    volume_z = np.array(volumes)
    #volume_z = stats.zscore(a)
    close_price_z = np.array(close_prices)
    #close_price_z = stats.zscore(a)

    points = zip(close_price_z, volume_z)

    init_data = []
    k = len(points) / (nclust)

    for i in range(nclust - 1):
        for j in range(k):
            init_data.append(i)

    while (len(points) != len(init_data)):
        init_data.append(nclust - 1)
    #print(clusters)

    labels, error, nfound = Pycluster.kcluster(points, clusters, None, None, 0,
                                               1, 'a', 'e', init_data)
    labels_sorted = sort_labels(labels)
    #print('Labels: ')
    print labels_sorted
    return labels_sorted
Example #30
def silhouette(data, k=5, shuffle=True, shufflecount=100):
	#assume that data is a matrix with variables in rows and dimensions in columns
	coefficients = {}
	data = data.transpose()
	for nclus in range(2, k):
		
		clustermap = pc.kcluster(data, nclusters=nclus, npass=50)[0]
		centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
		m = pc.distancematrix(data)
		res = [silhouette_coefficient(m, clustermap, nclus, data.shape)]

		for _ in range(shufflecount):

			# shuffle a copy, not the original data
			dat = data.copy()
			map(np.random.shuffle, dat)
			clustermap = pc.kcluster(dat, nclusters=nclus, npass=50)[0]
			centroids = pc.clustercentroids(dat, clusterid=clustermap)[0]

			#distance matrix-- well it's a list actually
			m = pc.distancematrix(dat)

			res.append(silhouette_coefficient(m, clustermap, nclus, dat.shape))
		coefficients[nclus] = {'data': res[0], 'distribution': res[1:]}
	return coefficients
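A hedged follow-up showing one way to read the output: compare the observed silhouette coefficient against the shuffled null distribution to judge whether a cluster count beats chance (an illustration, not part of the original module):

import numpy as np
coeffs = silhouette(data, k=8)
for nclus, r in coeffs.items():
    null = np.array(r['distribution'])
    # fraction of shuffled runs scoring at least as well as the real data
    p = (null >= r['data']).mean()
    print nclus, r['data'], p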
Example #31
def clustering(x,y,cost,ngroup=2):
    if CLUSTER == "scipy":
        z = whiten(cost)

        # let scipy do its magic (k==3 groups)
        res, labels = kmeans2(array(list(zip(x,y,z))),ngroup)

    if CLUSTER == "Pycluster":
        points = np.zeros((x.shape[0], 2))
        points[:,0] = x
        points[:,1] = y

#        labels, error, nfound = Pycluster.kcluster(points, ngroup, weights=cost)
        labels, error, nfound = Pycluster.kcluster(points, ngroup)

    return labels
Example #32
def kmeans_cluster_test(data, real_labels, outputfile=None):
    start = time.time()

    ks = range(8, 15)
    if outputfile is not None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'running kmeans when k=%d' % k
        predicted = Pycluster.kcluster(data, k)[0].tolist()
        if outputfile is not None:
            f.write(out_result(predicted, k, real_labels))

    if outputfile is not None:
        f.close()
    elapsed = time.time() - start
    print 'Average time: %.3f' % (elapsed / float(len(ks)))
Example #33
def kmeans_cluster_test(data, real_labels, outputfile=None):
    start = time.time()

    ks = range(8, 15)
    if outputfile is not None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'running kmeans when k=%d' % k
        predicted = Pycluster.kcluster(data, k)[0].tolist()
        if outputfile is not None:
            f.write(out_result(predicted, k, real_labels))

    if outputfile is not None:
        f.close()
    elapsed = time.time() - start
    print 'Average time: %.3f' % (elapsed / float(len(ks)))
Example #34
def main():
    usage= 'usage: %prog [options] infname1 [infname2 ...]'
    parser= OptionParser(usage=usage)
    parser.add_option('-o', '--output', 
                        dest='output_fname', default='cluster.out',
                        help='output fname')
    parser.add_option('-c', '--no-cache', 
                        dest='cache', action='store_false', default=True, 
                        help='discard cache')
    parser.add_option('-k', '--resulting-dimensionality', 
                        dest='k', default=150, 
                        help='the number of resulting dimensions')

    options, args= parser.parse_args(argv[1:])
    if len(args) < 1: parser.error('not enough args')

    parser= parserclass('models_cache')

    infnames= args
    outfname= options.output_fname

    print 'parsing scores'
    scores= [parser.parse(infname, cache=options.cache) for infname in infnames]

    nclusterss= [2] # range(2,5)
    npasss= [5,10, 15, 20, 25]
    methods= ['a', 'm']
    dists= ['e', 'b', 'c', 'a', 'u', 'x', 's', 'k']
    configs= list(combine(nclusterss, npasss, methods, dists))

    results= {}
    for k in range(2,10,2):
        print 'k=', k
        concept_vectors= apply_lsa(scores, k)
        step= len(configs)/10
        for i, (nclusters, npass, method, dist) in enumerate(configs):
            if (i+1)%step == 0: print '\t', ((i+1)*100)/len(configs)
            r= Pycluster.kcluster(concept_vectors,
                                  nclusters= nclusters, npass= npass,
                                  method= method, dist= dist)
            results[(k, nclusters, npass, method, dist)]= r

    
    f= open('clusters_results.pickle', 'wb')  # binary mode for pickle protocol 2
    pickle.dump(results, f, 2)
    f.close()
Example #35
def get_clusterid(vanadium_dbName, Cobalt_dbName):
  ''' Given the database of sensor observation it calculates clusterid for each sensor'''
  con = lite.connect(vanadium_dbName)
  con.row_factory = lite.Row
  
  with con:
    cur = con.cursor()
    cur.execute("SELECT * FROM sensor")    
    columnNames = cur.fetchone()
    columnNames = columnNames.keys()
    
  con = lite.connect(vanadium_dbName)
  with con:
    cur = con.cursor()
    cur.execute("SELECT * FROM sensor")    
    rows = cur.fetchall()
    
  ROW = [row[3:] for row in rows]
  
  Data = numpy.matrix(ROW).astype(float)
  Data = numpy.transpose(Data)            
  
  numRows, numCols = Data.shape
  
  mask = numpy.ones((numRows, numCols)).astype(numpy.uint8)     
  counter = 0
  for i in range(numRows):
    for j in range(numCols):
      if Data[i,j] < 0:
        counter += 1                      # Counting the missing observation
        mask[i,j] = 0
  
  
  clusterid, error, nfound = cluster.kcluster(data=Data, nclusters=7, 
                                        mask=mask, weight=None,
                                        transpose=0, npass=1,
                                        method='a', dist='c', initialid=None)      
  
  
  '''Create clusterid to SPND dictionary'''
  clusterIdtoSPND = {}
  for cid in clusterid:
    clusterIdtoSPND[cid] = []   # a fresh list per cluster id, not one shared list
  for cid, col in zip(clusterid, columnNames[3:]):
    clusterIdtoSPND[cid].append(col)
  return clusterIdtoSPND
Example #36
def cluster_colors(points):
    num_clusters = min(len(points), 3)

    labels, error, nfound = Pycluster.kcluster(points, num_clusters)

    totals = []
    for i in range(num_clusters):
        totals.append( [[0, 0, 0], 0] )

    for i in range(len(labels)):
        tmp = totals[labels[i]]
        tmp[0][0] += points[i][0]
        tmp[0][1] += points[i][1]
        tmp[0][2] += points[i][2]
        tmp[1] += 1

    averages = [ [ 1.0 * a[0]/n, 1.0 * a[1]/n, 1.0 * a[2]/n] for a,n in totals]

    return averages
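A short usage sketch (illustrative RGB triples; kcluster accepts a list of equal-length rows):

points = [[250, 10, 10], [240, 20, 15], [10, 240, 30], [20, 250, 25], [12, 13, 245]]
print cluster_colors(points)   # roughly one red, one green and one blue average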
Example #37
File: cl.py Project: zlike/affivir
def get_clusters(job_id, t, debug=False):
	dbc = MySQLdb.connect(host = 'localhost',
			user = '******',
			passwd = '1qaz2wsx',
			db = 'affivir')

	cur = dbc.cursor()
	assets = get_assets(cur, job_id, t)

	# number of clusters
	nc = len(assets) / 2
	if nc == 0:
		return None

	if debug is True:
		print t, nc, 'clusters'

	(b, a) = get_asset_range(assets)

	sz = b - a + 1
	W = np.zeros((sz, sz))
	for i in range(a, b + 1):
		for j in range(i + 1, b + 1):
			if i != j:
				sim = get_sim(dbc.cursor(), i, j)
				W[i - a, j - a] = sim
				W[j - a, i - a] = sim
	D = np.diag(np.sum(W, 0))
	L = D - W
	lam, u = eigs(L, k=nc, which='SR')
	u = real(u)
	labels, error, nfound = Pycluster.kcluster(u, nc)

	for k in range(nc):
		out = ''
		for i in range(len(labels)):
			if labels[i] == k:
				out += str(a + i) + ' '
		if debug is False:
			q = 'INSERT INTO clusters(job_id, type, data) VALUES ("' + str(job_id) + '", "' + t + '", "' + out + '")'
			cur.execute(q)
		else:
			print out
Example #38
def k_means(flat_data, data, nclusters, method, distance):
    """ K-Means Clustering """
    clusterid, error, nfound = pc.kcluster(
                        flat_data.values(),
                        nclusters=nclusters,
                        mask=None,
                        weight=None,
                        transpose=0,
                        npass=100,
                        method=method,
                        dist=distance,
                        initialid=None)
    
    # load clusters into dictionary
    clusters = defaultdict(list)
    for i, j in zip(clusterid, data):
        clusters[i].append(j)
        
    return clusters
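A hedged usage sketch (flat_data is assumed to be an OrderedDict mapping item id to feature vector, with data the matching item ids; method 'a' and distance 'e' follow the Pycluster conventions used throughout these examples):

from collections import OrderedDict
flat_data = OrderedDict([('a', [1.0, 2.0]), ('b', [1.1, 2.1]),
                         ('c', [8.0, 9.0]), ('d', [8.2, 9.1])])
groups = k_means(flat_data, flat_data.keys(), nclusters=2, method='a', distance='e')
print groups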
Example #39
def resolution_clustering(clusters, cluster_ids, sampled, kx=2):
    X = np.array([np.append(np.append(c[0], c[1]), c[2]) for c in clusters])
    n = X.shape[1] / 3
    Xn = X / ([np.average(X[:, :n])] * n + [np.average(X[:, n:2 * n])] * n +
              [np.average(X[:, 2 * n:])] * n)
    C, e, nf = Pycluster.kcluster(Xn, len(clusters) / len(sampled) * kx)
    del Xn
    Cidx = defaultdict(list)
    for i, c in enumerate(C):
        Cidx[c].append(i)
    CStable = []
    for k, v in Cidx.items():
        members = set()
        for c in v:
            members.update(clusters[c][3])
        members = sorted(members)
        s = stability(members, cluster_ids)
        CStable.append((s, np.average(X[v], axis=0).reshape(
            (3, X.shape[1] / 3)), members))
    return CStable
Example #40
def kmeans(k, table):
 # k = 50
  (labels, error, nfound) = pc.kcluster(table, k, None, None, 0, 20, 'a', 'b')
#  plot.plot_scatter(table, labels, k)
  
#  centers = get_centers(table, labels)
#  np.random.shuffle(table)
#  tab = [map(float, x) for x in table[:1000]]

#  mycluster = mc.MyClustering(tab, k)
#  mycluster.init_heap()
#  mycluster.hierarchy_cluster()
#  mycluster.clear_sample_points()

#  for i, row in enumerate(table):
#    if i % 1000 == 0:
#      print 'progress: %d' % i
#    mycluster.add_point(i, map(float, row))

#  mycluster.get_cluster()

  return labels
Example #41
def k_means():
	dataList = list(database.find_all())
	counters = []
	uuids = []
	for data in dataList:
		counter = Counter()
		uuids.append(data['uuid'])
		for event in data['events']:
			counter[event['name']] += 1
		counters.append(counter)

	# build the vectors over a shared vocabulary so every row has the same
	# length and a fixed column order (a ragged list would break kcluster)
	event_names = sorted(set().union(*[c.keys() for c in counters]))
	vectors = [[c[name] for name in event_names] for c in counters]

	labels, error, nfound = Pycluster.kcluster(vectors, 3)

	classes = []
	for label in labels:
		classes.append(numpy.asscalar(label))
	result = dict(zip(uuids, classes))
	return result
Example #42
    def __kmeans_initialization(self):
        """
        given the data points, cluster them by applying kmeans clustering
        algorithm.
       """
        # apply kmeans clustering to get the centroids and labels for each vector in data
        labels, error, nfound = Pycluster.kcluster(self._data, self._nClusters)

        # get the dimension of the input data
        rows, cols = self._data.shape

        clusterData = [[] for i in xrange(self._nClusters)]

        # assign vectors to clusters
        for data, label in zip(self._data, labels):
            clusterData[label].append(data)

        models = [GaussianCluster(*muAndSigma(clusterData[i], cols)) for i in xrange(self._nClusters)]

        apriori = np.ones(self._nClusters, dtype=np.float32) / np.array([len(elem) for elem in clusterData])

        return models, apriori
Example #43
    def __kmeans_initialization(self):
        """
        given the data points, cluster them by applying kmeans clustering
        algorithm.
       """
        # apply kmeans clustering to get the centroids and labels for each vector in data
        labels, error, nfound = Pycluster.kcluster(self._data, self._nClusters)

        # get the dimension of the input data
        rows, cols = self._data.shape

        clusterData = [[] for i in xrange(self._nClusters)]

        # assign vectors to clusters
        for data, label in zip(self._data, labels):
            clusterData[label].append(data)

        models = [GaussianCluster( *muAndSigma(clusterData[i], cols)) for i in xrange(self._nClusters)]

        apriori = np.ones(self._nClusters, dtype = np.float32) / np.array([len(elem) for elem in clusterData])

        return models, apriori
Example #44
def cl(Data_v_co):
    starttime = time.time()
    clusterid, error, nfound = cluster.kcluster(data=Data_v_co, nclusters=10,
                                                mask=mask_v_co, weight=None,
                                                transpose=0, npass=98,
                                                method='a', dist='c', initialid=None)
    #centroids, _ = cluster.clustercentroids(Data_v_co, clusterid=clusterid)
    #cen = map(tuple, centroids)
    #set(cen)
    count = Counter(clusterid)
    #print count
    #print clusterid

    # map each cluster id to the columns assigned to it
    clusterIdtoSPND = {}
    for cid in clusterid:
        clusterIdtoSPND[cid] = []
    for cid, col in zip(clusterid, columnNames_v_co[3:]):
        clusterIdtoSPND[cid].append(col)
    print clusterIdtoSPND
    return clusterIdtoSPND
Example #45
def kmeans(k, table):
    # k = 50
    (labels, error, nfound) = pc.kcluster(table, k, None, None, 0, 20, 'a',
                                          'b')
    #  plot.plot_scatter(table, labels, k)

    #  centers = get_centers(table, labels)
    #  np.random.shuffle(table)
    #  tab = [map(float, x) for x in table[:1000]]

    #  mycluster = mc.MyClustering(tab, k)
    #  mycluster.init_heap()
    #  mycluster.hierarchy_cluster()
    #  mycluster.clear_sample_points()

    #  for i, row in enumerate(table):
    #    if i % 1000 == 0:
    #      print 'progress: %d' % i
    #    mycluster.add_point(i, map(float, row))

    #  mycluster.get_cluster()

    return labels
Example #46
def clusterAndPlot(df, k, height=10, engine='PyCluster', cmap='spectral'):
    '''calculate and plot kmean clustering'''
    fig, axes = plt.subplots(k + 1, figsize=(18, height),
                             sharex='all', sharey='all')

    if engine == 'scipy':
        centroids, labels = kmeans2(df, k, iter=100, thresh=1e-05)
    else:
        labels, error, nfound = Pycluster.kcluster(df, k)
    df['label'] = labels

    colors = nColors(k=k, cmap=cmap)

    # one by one
    for l, g in df.groupby('label'):
        g.T.plot(ax=axes[l], legend=0, c=colors[l], alpha=.2)
        axes[l].set_title('cluster %d, %d zipcodes' % (l, len(g)))

        pd.Series(g.mean(0)).plot(
            ax=axes[-1], label='cluster %d' % (l), c=colors[l])

    #     plt.legend()
    return df
Example #47
    def pt(x,y):
        """Take the mean of a sample neighborhood, discarding
        any invalid (depth == 2047) pixels
        """
        t = sample_side
        global d, ds, samples, mask
        d = depth[y-t:y+t,x-t:x+t]

        # This is where I choose which point in the sample to use. I take
        # the minimum, which is the nearest pixel. Other possibilities
        # are median, mean, etc.
        if method=='median':
            meand = np.median(d[d<2047])
        if method=='mean':
            meand = np.mean(d[d<2047])
        if method=='min':
            meand = d[d<2047].min()
        if method=='kmeans':
            import Pycluster
            labels, error, nfound = Pycluster.kcluster(d.reshape(-1,1),4)
            labels = labels.reshape(d.shape)
            means = np.array([d[labels==i].mean() for i in range(labels.max()+1)])
            nearest = np.argmin(means)
            mask = labels==nearest
            samples = d[mask]

            def radius(target):
                x,y = np.nonzero(d == target)
                return np.sqrt((x[0]-sample_side/2)**2+(y[0]-sample_side/2)**2)
            cands = (samples.min(), samples.max())
            rads = [radius(i) for i in cands]

            meand = means.min()
            #meand = cands[np.argmax(rads)]
            #meand = np.median(samples)
            #meand = samples.min() if np.median(samples) > np.mean(samples) else samples.max()
        return x,y,meand,1
Example #48
# Distance metrics
dDict = dict([("corr", "c"), ("abscorr", "a"), ("uncentcorr", "u"),
              ("absunccorr", "x"), ("spearman", "s"), ("kendall", "k"),
              ("euc", "e"), ("cityblock", "b")])

# Unsupervised validation metrics list
silhouetteList = []

# K-means
if (algorithm == "k"):
    # Method
    mDict = dict([("mean", "a"), ("median", "m")])
    # All
    clusterListAll, error, nfound = pc.kcluster(np.array(rawData),
                                                nclusters=noClust,
                                                transpose=0,
                                                method=mDict[method],
                                                dist=dDict[distance])
    silScore = metrics.silhouette_score(rawData,
                                        clusterListAll,
                                        metric='euclidean')
    silhouetteList.append(silScore)
    # Single
    clusterListSingle = []
    for i in range(0, len(labelList)):
        clusterListTemp, error, nfound = pc.kcluster(
            np.array(rawData)[:, (rawVph * i):(rawVph * i) + rawVph],
            nclusters=noClust,
            transpose=0,
            method=mDict[method],
            dist=dDict[distance])
Example #49
import Pycluster as pc
import numpy as np
import sys

# Read data filename and desired number of clusters from command line
filename, n = sys.argv[1], int(sys.argv[2])

data = np.loadtxt(filename)

# Perform clustering and find centroids
clustermap, _, _ = pc.kcluster(data, nclusters=n, npass=50)
centroids, _ = pc.clustercentroids(data, clusterid=clustermap)

# Obtain distance matrix
m = pc.distancematrix(data)

# Find the masses of all clusters
mass = np.zeros(n)
for c in clustermap:
    mass[c] += 1

# Create a matrix for individual silhouette coefficients
sil = np.zeros(n * len(data))
sil.shape = (len(data), n)

# Evaluate the distance for all pairs of points
for i in range(0, len(data)):
    for j in range(i + 1, len(data)):
        d = m[j][i]

        sil[i, clustermap[j]] += d
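        # NOTE: the example is truncated here; the remainder below is an
        # assumed completion mirroring Example #16's silhouette computation.
        sil[j, clustermap[i]] += d

# Normalize the accumulated distances by the cluster masses
for i in range(0, len(data)):
    sil[i, :] /= mass

# Average silhouette coefficient over all points
s = 0.0
for i in range(0, len(data)):
    c = clustermap[i]
    a = sil[i, c]
    b = min(sil[i, range(0, c) + range(c + 1, n)])
    s += (b - a) / max(b, a)

print n, s / len(data)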
Example #50
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print "ERROR: Data file '{0}' does not exist".format(x)
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(
                x)
            pysam.index(x)

    # Options Parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) / args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction[0].lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print "ERROR: -m and -g option CANNOT be used together"
            sys.exit(1)
        if distancefunction == 'e':
            print 'Dynamics can only be identified using Pearson correlation as metric.'
            print 'Assigning metric to Pearson correlation'
            distancefunction = 'p'

    # Warning about too much files
    if (len(tracks) > 4):
        print "Warning: Running fluff with too many files might make you system use enormous amount of memory!"

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write(
                "You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = range(len(datafiles))

    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["e", "p"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        if distancefunction == "e":
            METRIC = cfg.DEFAULT_METRIC
            print "Euclidean distance method"
        else:
            METRIC = "c"
            print "Pearson distance method"
    ## Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile,
                  amount_bins,
                  extend_dyn_up,
                  extend_dyn_down,
                  rmdup,
                  rpkm,
                  rmrepeats,
                  fragmentsize,
                  dynam,
                  guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print "Loading data"
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(
                    pool.apply_async(load_heatmap_data,
                                     args=(featurefile, datafile, amount_bins,
                                           extend_dyn_up, extend_dyn_down,
                                           rmdup, rpkm, rmrepeats,
                                           fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : Option to try and get dynamics
    # Extend features 1kb up/down stream
    # Cluster them in one bin
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)

    clus = hstack([
        norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)
    ])

    # Clustering
    if cluster_type == "k":
        print "K-means clustering"
        ## K-means clustering
        # PyCluster
        labels, _, nfound = Pycluster.kcluster(clus,
                                               args.numclusters,
                                               dist=METRIC)
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in data.keys():
                    data[track][labels == j] = [
                        row[::-1] for row in data[track][labels == j]
                    ]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

        ind = labels.argsort()

        # Hierarchical clustering
    elif cluster_type == "h":
        print "Hierarchical clustering"
        tree = Pycluster.treecluster(clus, method="m", dist=METRIC)
        labels = tree.cut(args.numclusters)
        ind = sort_tree(tree, arange(len(regions)))
    else:
        ind = arange(len(regions))
        labels = zeros(len(regions))

    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up,
                                         extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(
            array(regions, dtype="object")[ind],
            array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(
                chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                chrom, start, end, gene, cluster + 1, strand))
    f.close()
    # Save read counts
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for idx, row in enumerate(data[track]):
            bins = ''
            for b in row:
                if not bins:
                    bins = '{0}'.format(b)
                else:
                    bins = '{0};{1}'.format(bins, b)
            readcounts[track]['bins'].append(bins)

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    for i, track in enumerate(tracks):
        for idx in ind:
            input_fileBins.write('{0}:{1}-{2}\t'.format(
                regions[idx][0], regions[idx][1], regions[idx][2]))
            for i, track in enumerate(tracks):
                input_fileBins.write('{0}\t'.format(
                    readcounts[track]['bins'][idx]))
            input_fileBins.write('\n')
        break
    input_fileBins.close()

    if not cluster_type == "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize)
Example #51
import numpy as np
import Pycluster


def cargaDatos(nombreArchivo):
    ## load the feature matrix to be classified
    return np.load(nombreArchivo)


M = cargaDatos("DataSet.npy")
## clustering process
## run several passes
c = []
for i in range(5):
    labels, error, nfound = Pycluster.kcluster(M, 5)
    c.append(labels)

z = np.array(c)
np.save("resultados.npy", z)
print labels
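The saved array holds one labeling per run. A hedged follow-up computing how often two runs agree on pair co-membership (a permutation-invariant check, added for illustration):

import numpy as np
z = np.load("resultados.npy")           # shape (5, n_samples), one labeling per run
same0 = z[0][:, None] == z[0][None, :]  # co-membership matrix of run 0
same1 = z[1][:, None] == z[1][None, :]  # co-membership matrix of run 1
print (same0 == same1).mean()           # 1.0 means the two runs group identically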
Example #52
errorOld = 5000
index = 175
'''for i in range(max(nodes)+1):
	clusterid,error,nfound = pc.kcluster(adjMat, nclusters=i+1, transpose=0,npass=1,method='a',dist='e')
	if (i==0):
		errorOld = error
	elif (error == 0):
		index = i
		break
	elif(error < errorOld):
		index = i
		errorOld = error'''

clusterid, error, nfound = pc.kcluster(adjMat,
                                       nclusters=index + 1,
                                       transpose=0,
                                       npass=1,
                                       method='a',
                                       dist='e')

print clusterid
print nfound
print error

neglect = set([])

for i in range(max(nodes) + 1):
    if (sum(adjMat[i]) == 0.0):
        neglect.add(clusterid[i])

print neglect
Example #53
with open('d2_out', 'r') as file1:
    d2 = pickle.load(file1)
with open('np_out', 'r') as file1:
    np = pickle.load(file1)  # NB: 'np' here is a pickled dict of counts, not numpy

goodBbands = [b for b in np.keys() if np[b] > 500 and b != "Ram"]
d3 = {b: d2[b] for b in goodBbands}
data = numpy.array([d2[b] for b in goodBbands]).squeeze()

ave = numpy.mean(d3.values(), axis=0)
stds = numpy.std(d3.values(), axis=0)

sims = scipy.zeros((len(goodBbands), len(goodBbands)))
dists = scipy.zeros((len(goodBbands), len(goodBbands)))

idx = Pycluster.kcluster(data, 3, npass=1)[0]
keys = [goodBbands[i] for i in numpy.argsort(idx)]

for (ia, a) in enumerate(keys):
    for (ib, b) in enumerate(keys):
        v1 = (d3[a] - ave) / stds
        v2 = (d3[b] - ave) / stds
        sims[ia, ib] = (v1.T.dot(v2)) / (scipy.linalg.norm(v1) *
                                         scipy.linalg.norm(v2))
        dists[ia, ib] = scipy.linalg.norm(v1 - v2)

labels = zip([str(t) for t in idx[numpy.argsort(idx)]], keys)
fig = plt.figure()
ax = fig.add_subplot(111)
imgplot = ax.imshow(sims, interpolation='none')
ax.set_yticks(xrange(len(keys)))
Example #54
    for t in tokenList:
        if t in tokens:
            tokenV[j] = 1.0
        j += 1
    vectorList.append(tokenV)
'''
Logging intermediate results
'''
print vectorList
print profileVector
print len(tokenList)
print len(vectorList)

features = array(vectorList)

labels, error, nfound = Pycluster.kcluster(features, kc)
centroids = vstack(
    [features[labels == i].mean(0) for i in range(labels.max() + 1)])
s1Vector = defaultdict(list)
s2Vector = defaultdict(list)
Result1 = []
Result2 = []
for l in range(0, len(labels)):
    s2Vector[labels[l]].append(profileVector[l][1])
    company = profileVector[l][0]
    for z in range(0, len(labels)):
        if profileVector[z][0] in profileVector[l][0]:
            if len(profileVector[z][0]) < len(company):
                company = profileVector[z][0]
    s1Vector[labels[l]].append(company)
Example #55
import networkx as net
import networkx.algorithms as algo
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as la
import Pycluster

g = net.Graph()
g.add_edges_from([(1, 2), (1, 3), (1, 4), (2, 3), (3, 4), (4, 5), (4, 6),
                  (5, 6), (5, 7), (5, 8), (6, 7), (6, 8), (7, 8), (7, 9)])
adj_m = net.adjacency_matrix(g)

w, v = la.eig(adj_m)

S = [[0.0 for i in range(1, 3)] for k in range(1, 10)]
S = np.mat(S)
S[:, 0] += v[:, 0]
S[:, 1] += v[:, 1]
B = np.diag(
    (w[0], w[1]))  # diagonal matrix built with the top 2 eigenvalues of adj_m

labels, error, nfound = Pycluster.kcluster(S, 2)  # kcluster returns (clusterid, error, nfound)