Example #1
def findcenters(x, n=1000, k=6):
    # get dimensions
    m = x.shape[1]
    # create centers as empty
    centers = DataFrame(np.zeros(shape=(k, m)))

    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters=k, transpose=0, method="a", dist="e", npass=1)
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # sort centers by the distance to the origin
        center = sorted(center, key=lambda t: np.linalg.norm(np.array(t) - np.zeros(m)), reverse=True)

        # print np.linalg.norm(np.array(center[0])-np.zeros(m))
        # print np.linalg.norm(np.array(center[1])-np.zeros(m))
        # print np.linalg.norm(np.array(center[2])-np.zeros(m))
        # print np.linalg.norm(np.array(center[3])-np.zeros(m))
        # print np.linalg.norm(np.array(center[4])-np.zeros(m))
        # print np.linalg.norm(np.array(center[5])-np.zeros(m))
        # print np.array(center[0])
        # print np.array(center[1])
        # print np.array(center[2])
        # print np.array(center[3])
        # print np.array(center[4])
        # print np.array(center[5])
        # take the average
        for j in range(k):
            centers.ix[j, :] = centers.ix[j, :] + center[j]
    centers = centers / n
    return centers
Example #2
def findcenters(x,n=1000,k=6):
    #get dimensions
    m = x.shape[1]
    #create centers as empty
    centers = DataFrame(np.zeros(shape=(k,m)))

    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters = k, transpose=0,
                                        method='a', dist='e', npass = 1)
        center, _ = Pycluster.clustercentroids(x,clusterid = labels)
        #sort centers by the distance to the origin
        center = sorted(center,key = lambda t: np.linalg.norm(np.array(t)-np.zeros(m)), reverse = True)

        #print np.linalg.norm(np.array(center[0])-np.zeros(m))
        #print np.linalg.norm(np.array(center[1])-np.zeros(m))
        #print np.linalg.norm(np.array(center[2])-np.zeros(m))
        #print np.linalg.norm(np.array(center[3])-np.zeros(m))
        #print np.linalg.norm(np.array(center[4])-np.zeros(m))
        #print np.linalg.norm(np.array(center[5])-np.zeros(m))
        #print np.array(center[0])
        #print np.array(center[1])
        #print np.array(center[2])
        #print np.array(center[3])
        #print np.array(center[4])
        #print np.array(center[5])
        #take the average
        for j in range(k):
            centers.ix[j,:] = centers.ix[j,:] + center[j]
    centers = centers/n
    return(centers)
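A minimal usage sketch for findcenters (either variant above). The imports are the ones the snippet implicitly relies on, the random data is purely illustrative, and note that DataFrame.ix requires an older pandas release:

import numpy as np
import Pycluster
from pandas import DataFrame

x = np.random.rand(200, 5)          # illustrative: 200 points in 5 dimensions
avg_centers = findcenters(x, n=10, k=3)
print(avg_centers)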
Example #3
def pyclustertest():
    import scipy as sp
    import Pycluster as pcl

    data = sp.rand(100,4)
    cid, e, n = pcl.kcluster(data)
    centroids, cmask = pcl.clustercentroids(data, clusterid=cid)

    print data
    print centroids
Example #4
File: test.py Project: LKF10051/ML
def myCKDemo(filename,n):
    # The next two statements load the data; usecols selects the columns
    # used for the clustering analysis
    data = np.loadtxt(filename, delimiter = "," ,usecols=(2,4,14,8))
    # These columns hold each city's longitude/latitude coordinates,
    # used for the final scatter plot
    xy = np.loadtxt(filename, delimiter = "," ,usecols=(2,4))
    # clustermap is the clustering result: the cluster id of each record
    clustermap = pc.kcluster(data, n)[0]
    # centroids holds the cluster center coordinates after clustering
    centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
    # m is the distance matrix
    m = pc.distancematrix(data)

    # mass records the number of points in each cluster
    mass = np.zeros(n)
    for c in clustermap:
        mass[c] += 1

    # sil is the silhouette matrix, recording per-point distances to each cluster
    sil = np.zeros(n*len(data))
    sil.shape = ( len(data), n )

    for i in range( 0, len(data) ):
        for j in range( i+1, len(data) ):
            d = m[j][i]
            sil[i, clustermap[j] ] += d
            sil[j, clustermap[i] ] += d

    for i in range(0,len(data)):
        sil[i,:] /= mass

    # The silhouette coefficient s evaluates the clustering quality.
    # It lies between -1 and 1; the larger the value, the better the clustering.
    # Below 0 means the mean within-cluster distance exceeds the distance to
    # the nearest other cluster, i.e. the clustering is poor.
    # Values approaching 1 indicate a good clustering.
    s=0
    for i in range( 0, len(data) ):
        c = clustermap[i]
        a = sil[i,c]
        b = min(sil[i,range(0,c)+range(c+1,n)])
        si = (b-a)/max(b,a)
        s+=si

    print n, s/len(data)

    # Draw the scatter plot with matplotlib.
    fig, ax = pl.subplots()
    # cmap distinguishes the different clusters by color
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    # xy is longitude/latitude, so each city is plotted at its geographic position
    x = [list(d)[0] for d in xy]
    y = [list(d)[1] for d in xy]
    cax = ax.scatter(x, y, c=clustermap, s=30, cmap=cmap, vmin=0, vmax=n)
    pl.show()
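The silhouette formula used above, s = (b - a) / max(b, a), can be sanity-checked on a toy case; this fragment is an addition, not part of the original:

a = 1.0   # mean distance to the point's own cluster
b = 3.0   # mean distance to the nearest other cluster
s = (b - a) / max(b, a)
print(s)  # ~0.67: close to 1, so the point is well placed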
Example #5
    def _G(self, data, K):
        labels, _, _ = Pycluster.kcluster(data.T, K)
        centers, _ = Pycluster.clustercentroids(data.T, clusterid=labels)
        centers = centers.T
        G = zeros((K, data.shape[1]))

        for k in range(K):
            D = data - expand_dims(centers[:, k], axis=1)
            G[k, :] = -sqrt(sum(multiply(D, D), axis=0))

        return G
Example #7
def clustering(file_path, k, dist_measure, PLOT):
    """
    Do the K-means clustering for input data.

    @param file_path: Input data file.
    @param k: Number of centers in K-means algorithm.
    @param dist_measure: Distance measure (in this case, we use Manhattan distance).
    @param PLOT: Bool variable, check if plot the result (set it as True only in testing).
    @return: Clusters id for all data points in the input data file.
    """

    data = numpy.genfromtxt(file_path, delimiter=',')

    if len(data.shape) == 1:
        return [-1]

    print "-- Processing file: " + file_path + "  -- Data points: " + str(len(data))
    print "-- Start clustering"

    k = set_k(len(data), k)
    ite_num = method_name(len(data))

    # Do the K-means clustering
    cluster_id, _, _ = Pycluster.kcluster(data, nclusters=k, mask=None, weight=None, transpose=0, npass=ite_num,
                                          method='a', dist=dist_measure, initialid=None)

    if PLOT is False:
        return cluster_id

    # Draw the clustering result plot.
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)

    if PLOT:
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)
    else:
        data_2d = data
        centroids_2d = centroids

    color = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
             '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC']

    for i in range(k):
        scatter(data_2d[cluster_id == i, 0], data_2d[cluster_id == i, 1], color=color[i % 12])

    plot(centroids_2d[:, 0], centroids_2d[:, 1], 'sg', markersize=8)
    show()

    return cluster_id
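set_k and method_name are project-specific helpers, so a call can only be sketched. Something like the following, where 'data.csv' is a placeholder path and dist 'b' (Pycluster's city-block distance) matches the Manhattan distance mentioned in the docstring:

cluster_ids = clustering('data.csv', k=5, dist_measure='b', PLOT=False)  # hypothetical call
print(cluster_ids)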
Example #8
def clusterAndPlot(df, k, height=10, engine='PyCluster', cmap='spectral'):
    '''calculate and plot kmeans clustering'''
    fig, axes = plt.subplots(k + 1,
                             figsize=(18, height),
                             sharex='all',
                             sharey='all')

    if engine == 'scipy':
        centroids, labels = kmeans2(df, k, iter=100, thresh=1e-05)
    else:
        labels, error, nfound = Pycluster.kcluster(df, k)
    df['label'] = labels

    colors = nColors(k=k, cmap=cmap)

    # one by one
    for l, g in df.groupby('label'):
        g.T.plot(ax=axes[l], legend=0, c=colors[l], alpha=.2)
        axes[l].set_title('cluster %d, %d zipcodes' % (l, len(g)))

        pd.Series(g.mean(0)).plot(ax=axes[-1],
                                  label='cluster %d' % (l),
                                  c=colors[l])

    #     plt.legend()
    return df
Example #9
	def cluster_kmedoids(self, k=2, npass=50):
		# Utilise la distance pour produire une partition de k classes
		# n est le nombre d'itérations
		
		c, err, nfound = pc.kmedoids(self.zd, k, npass=npass)
		
		return partition(c, self.mat)
Example #10
    def suggest(self, word):
        v = self.analyze(word)

        # pick first x
        res = []
        for nword, nv in self.ndx.items():
            wsim = self.compute_similarity([v, nv])
            res.append((wsim, nword, self.as_vector(nv)))
        res.sort()
        res = res[::-1]

        # from first y pick the most distant ones
        res2 = [v for (sim, word, v) in res]
        resw = [word for (sim, word, v) in res]
        lab, err, nfound = Pycluster.kcluster(res2, 40)

        resg = defaultdict(lambda: [])
        for i, l in enumerate(lab):
            resg[l] += [res[i]]

        res_sug = []
        used_groups = set()
        for l, w in zip(lab, resw):
            if not l in used_groups:
                res_sug += [w]
                used_groups.add(l)

        return res_sug
Example #11
    def kmedoids_cluster(self, similarities=None):
        # https://jpcomputing.wordpress.com/2014/05/18/pycluster-kmedoids-example/
        from sklearn.metrics import silhouette_score

        distances = []
        if similarities is None:
            for page_id, sim_vector in self._pages_similarities.iteritems():
                distances.append([1 - x[1][USED_DISTANCE] for x in sim_vector])
        else:
            for x in similarities:
                distances.append([1 - a for a in x])

        np_distances = np.asarray(distances)
        from scipy.spatial.distance import squareform
        squareform_distances = squareform(np_distances)

        import Pycluster
        nb_clusters = 2  # this is the number of cluster the dataset is supposed to be partitioned into
        clusterid, error, nfound = Pycluster.kmedoids(squareform_distances,
                                                      nclusters=nb_clusters,
                                                      npass=50)
        print 'clusterid: ', len(set(clusterid)), clusterid
        res = silhouette_score(np_distances, clusterid, metric='precomputed')
        print 'Res: ', res
        return
        # NOTE: the grouping code below is unreachable because of the early return
        # grouping to clusters
        clusters_indexes = {}
        for i, medoid in enumerate(clusterid):
            if medoid not in clusters_indexes:
                clusters_indexes[medoid] = [i]
            else:
                clusters_indexes[medoid].append(i)
Example #12
def grapeCluster(vectors, iterationCountPerBurst, maximumPixelDiameter, minimumPixelDiameter):
    # If we have no vectors, return empty array
    if not vectors:
        return []
    # Assign all vectors to a single cluster
    globalClusters = [numpy.array(vectors)]
    globalCount = len(vectors)
    globalClusterMeans = []
    # While there are globalClusters,
    while globalClusters:
        # Pop the last cluster
        globalCluster = globalClusters.pop()
        # Measure size
        sizeCategory = measureClusterSize(globalCluster, maximumPixelDiameter, minimumPixelDiameter)
        # If it is too big,
        if sizeCategory > 0:
            # Burst it
            # assignments = scipy.cluster.vq.kmeans2(globalCluster, k=2, iter=iterationCountPerBurst)[1]
            assignments = Pycluster.kcluster(globalCluster, npass=iterationCountPerBurst)[0]
            # Extract localClusters
            booleanAssignments = numpy.array(assignments) > 0
            localClusters = globalCluster[booleanAssignments], globalCluster[~booleanAssignments]
            # Push localClusters to the end of the stack
            globalClusters.extend(localClusters)
        # If it is the right size, append the weighted mean
        elif sizeCategory == 0:
            globalClusterMeans.append(computeWeightedMean(globalCluster))
        # Show feedback
        view.printPercentUpdate(globalCount - len(globalClusters), globalCount)
    # Return
    view.printPercentFinal(globalCount)
    return globalClusterMeans
Example #14
    def _guide_tree(self, dist_matrix):
        """
        @summary: Build a guide tree from the distance matrix

        @param dist_matrix: The distance matrix
        @type dist_matrix: numpy.ndarray
        @return: Pycluster similarity tree
        @rtype: Pycluster.cluster.Tree

        @author: Woon Wai Keen
        @author: Vladimir Likic
        """

        n = len(dist_matrix)

        print " -> Clustering %d pairwise alignments." % (n * (n - 1)),
        tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a')
        print '\n'
        print tree
        # x = 1
        # for i in list(tree):
        #     print (i, x)
        #     x += 1
        # return a different form of tree perhaps, a list within a list **
        print "Done"

        return tree
Example #15
def clusterSessionsKmed(featMan, weightFile):

  data = featMan.returnKeys()
  weightList = getWeightMatrixForKMedFromFile(featMan.returnLastId(),
                                              weightFile, data)
  cnt = 0
  kclusters = {}
  for k in range(4, 5, 2):
    i = (len(weightList) + 1) / k
    if i == 0:
      i = 1
    clusArray, error, opt = clust.kmedoids(weightList, i, 10, None)
    print error, len(clusArray)
    clusters = {}
    for c in range(len(clusArray)):
      clusId = clusArray[c]
      q = featMan.returnQuery(c)
      if len(q) > 1:
        if clusId not in clusters:
          clusters[clusId] = set()
        clusters[clusId].add(q)
        cnt += 1

    kclusters[k] = clusters.values()

    print 'Cluster with kmed ', len(clusters), cnt, ' queries'
  return kclusters[4]
Example #16
	def testPricesDiffsVecsKmeansClustering(self):
		"""Testing whether kmeans clustering with prices differences
		   vectors works."""

		prices_diffs_vecs = utils.make_prices_diffs_vecs(self.data1)		
		labels, wcss, n = Pycluster.kcluster(prices_diffs_vecs, 3, npass=100)
		clusters = utils.make_groups_from_labels(labels, self.data1)

		# The result should be something like this, modulo group numbers. The
		# probability that it differs with npass=100 is (I think) very low,
		# but a different grouping can still happen.

		suggested_clusters = {0: ['E'], 1: ['A', 'D'], 2: ['B', 'C']}

		# Let's check this.

		num_matches = 0

		for cluster in clusters.values():
			cluster.sort()
			for suggested_cluster in suggested_clusters.values():
				suggested_cluster.sort()
				if cluster == suggested_cluster:
					num_matches = num_matches + 1

		# Ok, so we've found out that each suggested cluster exists
		# in output of our kcluster algorithm and because length of
		# clusters dict is 3 we can be sure these dictionaries are equal.

		self.assertEqual(num_matches, 3)
		self.assertEqual(len(clusters), 3)
Example #17
def cluster(parser, k):
    """
    general method for clustering data
    """
    
    #get index number for every page
    code_book = parser.get_data_encoding(page_min_occurance=5)
    
    #use only sequence of pages visited
    simple_session = [session for session in parser.get_simple_sessions() if config.session_filter_fn(session)]
    
    #use vector representation (v1,v2,v3,...) where v1 means page v1 was visited
    #models = session_modeling.convert_sessions_to_vector(simple_session, code_book, binary=True)
    
    #construct markov chains, estimate transition probabilities
    models = session_modeling.convert_sessions_to_markov(simple_session, code_book, bayes=False)
    idx, sse, _ = Pycluster.kcluster(models, k, method='a', dist='e')
 
    #idx, sse, _ = cluster_kmedoids(models, k, string_similarity.jaccard_distance)
    

    clusters = {}
    for name, clusterid in zip(simple_session, idx):
        clusters.setdefault(clusterid, []).append(name)
    
    return clusters, sse
Example #18
def cluster_kmedoids(sessions, clusters, distance_fn=string_similarity.jaccard_distance):
    """
    kmedoids clustering, requires distance matrix, therefore slow
    """
    distances = compute_distances(sessions, distance_fn)
    clusterids, error, nfound = Pycluster.kmedoids(distances, nclusters=clusters)
    return clusterids, error, nfound
Example #19
def clusterSessionsPre(catQueryDist, featMan, weightMatrix):

  tclusters = {}
  print len(catQueryDist)
  for termCount in range(4, 5):
    tclusters[termCount] = []
    for cat, qSet in catQueryDist.items():
      if len(qSet) > 1:  # and cat in pairs:
        k = len(qSet) / termCount
        if k == 0:
          k = 1
        #print cat, len(qSet), k
        qList = list(qSet)
        catDist = getWeightMatrixForKMed(qList, weightMatrix)

        clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
        #print 'Queries', qList
        clusters = {}
        for c in range(len(clusArray)):
          clusId = clusArray[c]
          if clusId not in clusters:
            clusters[clusId] = []
          qc = featMan.returnQuery(qList[c])
          if len(qc) > 1:
            clusters[clusId].append(qc)
        #print cat, len(clusters)
        for entry in clusters.values():
          tclusters[termCount].append(entry)

  print len(tclusters[4])
  return tclusters[4]
Example #20
	def multikmeans(self, krange=None):
		# The magic recipe
		
		if krange==None:
			kr=np.arange(2, len(self.mat)-1)
		else: kr=krange
		lmat=len(self.mat)
		
		accords=np.zeros((lmat,lmat), dtype=int) # Counts how many times each pair of documents is clustered together
		t=deque() # to save time & memory, use a deque instead of a list
		t0=time()
		k2s = lambda x: x*0.85
		tunits=k2s(np.array(kr)).sum()
		
		# The loop itself
		for k in kr:
			t1=time()
			
			# K-means
			c,err,nfound=pc.kcluster(self.mat,k)
			
			# Update the counts
			for i in np.unique(c):
				accords[c==i] += c==i
			
			# Predict the remaining time
			t2=time()
			tunits-=k2s(k)
			t.append((t2-t1)/k2s(k))
			prediction = tunits*np.mean(tuple(t)[-20:])
			print "k={0}: \t{1} ({2} since the start) \t{3} to go".format(k,human_time(t2-t1),human_time(t2-t0),human_time(prediction))
		
		# Normalize by the number of runs rather than by the last k
		return accords/float(len(kr))
Example #21
    def DoClustering(self, nclusters=30):
        '''Main clustering function'''

        gx = self._gx
        func = self._scale_function

        nid, jm, am, fg = zip(*[(x, gx.node[x]['JuvenileMass'],
                                 gx.node[x]['AdultMass'],
                                 gx.node[x]['FunctionalGroup'])
                                for x in gx.node.keys()])
        data = np.c_[func(jm), func(am)]

        if (self._normalize_data == True):
            data = whiten(data)
        data = np.c_[data, 1000 * np.array(fg)]

        if self._algorithm == Aggregation._HIERARCHICAL_CLUSTERING:

            if not self._tree_done:
                if self._distance_matrix:
                    self._tree = pc.treecluster(
                        distancematrix=self._distance_matrix)
                else:
                    self._tree = pc.treecluster(data)

                self._tree_done = True

        self._data = data
        self._nodes_ids = nid
        clusters_ids = self._tree.cut(nclusters)
        self._clusters_ids = clusters_ids
        self._nclusters = len(np.unique(self._clusters_ids))

        cluster_attrib = dict(zip(nid, clusters_ids))
        nx.set_node_attributes(gx, 'cluster', cluster_attrib)
        self._gx = gx

        for cid in clusters_ids:

            fg = [
                gx.node[x]['FunctionalGroup'] for x in gx.node.keys()
                if gx.node[x]['cluster'] == cid
            ]
            if len(np.unique(fg)) != 1:
                raise Exception(
                    'Many functional groups inside the same cluster!!!!!! A CRASH JUST HAPPENED, just joking!!!!'
                )
Example #22
def clusters(labels, data, k):
	kclus = Pycluster.kcluster(data, k, npass=1)[0]
	nx = numpy.zeros((len(labels), len(labels)), dtype=numpy.float32)
	for ind1 in range(len(labels)):
		for ind2 in range(len(labels)):
			if kclus[ind1] == kclus[ind2]:
				nx[ind1][ind2] = 1
	print k, " of ", len(labels)
	return nx
Example #23
def getlabels(x, y, n = 1000 , k = 8):
    if y == "none":
        y = x
    #fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters = k, transpose=0,
                                     method='a', dist='e', npass = n)
    #write labels back
    x.loc[:,"group"] = labels
    return(x)
Example #24
def findk(x, n=1000, minK=2, maxK=20):
    errors = []
    # fit k-means clusters for n times
    for i in range(minK, maxK + 1, 1):
        _, error, nfound = Pycluster.kcluster(x, nclusters=i, transpose=0, method="a", dist="e", npass=n)
        # get errors
        errors.append(error)
        print i
    print errors
Example #25
def findk(x, n = 1000, minK = 2, maxK = 20):
    errors = []
    #fit k-means clusters for n times
    for i in range(minK,maxK+1,1):
        _, error, nfound = Pycluster.kcluster(x, nclusters = i, transpose=0,
                                         method='a', dist='e', npass = n)
        #get errors
        errors.append(error)
        print i
    print errors
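findk only prints the per-k errors; a common follow-up is an elbow plot. A sketch with matplotlib (an addition, not part of the original snippets):

import matplotlib.pyplot as plt
import Pycluster

def plot_elbow(x, n=100, minK=2, maxK=20):
    # Re-run k-means over the same range of k and plot error against k;
    # the bend ("elbow") in the curve suggests a reasonable cluster count.
    ks = range(minK, maxK + 1)
    errors = [Pycluster.kcluster(x, nclusters=k, method='a', dist='e', npass=n)[1]
              for k in ks]
    plt.plot(list(ks), errors, 'o-')
    plt.xlabel('k')
    plt.ylabel('within-cluster error')
    plt.show()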
Example #26
def cluster_spw_rpw(list_of_recs):
	number_of_clusters = 8
	only_serve_return = []
	if list_of_recs==[]:
		print "ERRROR"
	for rec in list_of_recs:
		only_serve_return.append([float(rec[0]),float(rec[1])])
	k = get_k_value(only_serve_return)
	labels, error, nfound = Pycluster.kcluster(scipy.array(only_serve_return), k)
	return labels
Example #27
def cluster_spw_rpw(list_of_recs):
    number_of_clusters = 8
    only_serve_return = []
    if list_of_recs == []:
        print "ERRROR"
    for rec in list_of_recs:
        only_serve_return.append([float(rec[0]), float(rec[1])])
    k = get_k_value(only_serve_return)
    labels, error, nfound = Pycluster.kcluster(scipy.array(only_serve_return),
                                               k)
    return labels
Example #28
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0, method="a", dist="e", npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    # count how many items in each group
    labels = list(labels)
    for i in range(k):
        print labels.count(i)
    return x
Example #29
def cluster():
	x = [[76.0,32.0],[63.0,40.0],[70.0,30.0],[64.0,45.0]]
	k = 2
	labels, error, nfound = Pycluster.kcluster(scipy.array(x),k)
	print "Input data:"
	print "   spw " + "  rpw"
	j = 1
	for i in x:
		print str(j)+") "+str(i[0]) + "  " + str(i[1])
		j +=1
	print " "
	print "clusters: " + str(labels)
Example #30
def cluster():
    x = [[76.0, 32.0], [63.0, 40.0], [70.0, 30.0], [64.0, 45.0]]
    k = 2
    labels, error, nfound = Pycluster.kcluster(scipy.array(x), k)
    print "Input data:"
    print "   spw " + "  rpw"
    j = 1
    for i in x:
        print str(j) + ") " + str(i[0]) + "  " + str(i[1])
        j += 1
    print " "
    print "clusters: " + str(labels)
Example #31
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.GetoptError, msg:
            raise Usage(msg)
        try:
            
            nodesFile = argv[1]
            #nodesFile="C:\Users\Selin\Desktop\k-means\TibyOutput\TibyNodes.txt"
            
        except IndexError:
            raise Error("Not enough arguments provided to script.")

        nodes=readNodesFromTxtForKmeans(nodesFile)
        nodes = numpy.array(nodes)
        
        #results,assign=kmeans2(whiten(features),2,iter=20,thresh=0.0000000000000000001)
        #results,assignment=kmeans2(features,2,iter=100,thresh=0.0000000000000000000000000000000000000001)
        results = Pycluster.kcluster(array(nodes),nclusters=30,npass=50,method='m')
        assignments=results[0]
        #print results
        # Roy's verison of making cluster dict
        #clusterIDs = set(assignments)

        #clusterByClusterID = dict((clusterID, nodes[assignments == clusterID]) for clusterID in clusterIDs)
       #print clusterByClusterID[0]
        
        xs = [node[0] for node in array(nodes)]
        ys = [node[1] for node in array(nodes)]
        clusterByClusterID=collections.defaultdict(list)
        
        for x, y, clusterID in itertools.izip(xs,ys,assignments):
            #if clusterID not in clusterByClusterID:
                #clusterByClusterID[clusterID] = []
            clusterByClusterID[clusterID].append((x,y))
        #print clusterDictByID[3]    
        # print data

        #pylab.axis([-10000, 500000, -10000, 500000])
        pylab.figure()
        pylab.hold(True)
        colors=['r','b','g','c','m','y','k','w','#ff6c01','#00cd00']
        #colors=['r','b','g','c','m','y','k','w']
        #colors=['burlywood']
        #colors = 'rbgcmykw'
        for clusterID, color in itertools.izip(clusterByClusterID.keys(), itertools.cycle(colors)):
            #print clusterByClusterID[clusterID]
            plotCluster(clusterByClusterID[clusterID], color)
        print results[1], results[2]
Example #32
def clusterCatWithMediods(lowerLimit, upperLimit,featMan, weightMatrix, \
						 samePairsSet, differentPairsSet, catQueryDist, \
						outFile = 'cat-clusters-with-med.txt'):
	
	oFile = open(outFile,'w')
	metrics = {}
	for noTerms in range(lowerLimit, upperLimit):
		#fclusters = []
		cluster_list = []
		i = 0
		oFile = open(outFile+str(noTerms)+'.txt','w')
		for cat, qSet in catQueryDist.items():
			if len(qSet) > 1: # and cat in pairs:
				k = len(qSet)/noTerms
				if k == 0:
					k = 1
			
				qList = sorted(list(qSet),reverse=True)
				catDist = getWeightMatrixForKMed(qList, weightMatrix,'cat_kmediods')
							
				clusArray, error, opt = clust.kmedoids(catDist,k, 5, None)
				clusters = {}
				for c in range(1, len(clusArray)):
					clusId = clusArray[c]
					if clusId not in clusters:
						clusters[clusId] = set()
					clusters[clusId].add(qList[c-1])

				
				for entry in clusters.values():
					cluster_list.append(list(entry))
					qStr = toString(entry,featMan)
					#fclusters.append(qStr)
					oFile.write(cat+'\t'+qStr+'\n');
				print 'Clust category',cat, 'length', len(clusters),\
                                        'Queries' , len(qSet),'k', k,  'error', error, opt
				if i % 5 == 0:
					print i
				i+=1	
		predictedSamePairsSet, predictedDifferentPairsSet = \
						getPairLabelsFromClusters(cluster_list,featMan)
		#metrics[noTerms] = getRecallPrecision(samePairsSet, \
		#			differentPairsSet,\
		#			predictedSamePairsSet,\
		#			predictedDifferentPairsSet)	
		metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(samePairsSet,\
						predictedSamePairsSet)

		oFile.close()
	for tcount, met in metrics.items():
		print tcount, met
	return metrics
Example #33
def kmeans(data, **kwargs):
    """
    Perform k-means clustering on unstructured N-dimensional data.
    
    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (returned number of clusters may be less than k).
        - npasses: The number of times the k-means clustering algorithm is performed, 
        each time with a different (random) initial condition.
        - method: describes how the center of a cluster is found: 
            - method=='a': arithmetic mean.
            - method=='m': median.
        - initialCenters: a set of points that should be used as the initial
                          cluster centers
            
    @rtype: tuple
    @return: A list where each element indicates the cluster membership of the 
        corresponding index in the original data and a message string
    """
    k = 1
    npasses = 1
    method = 'a'
    initialCenters = None
    smartCenters = False
    msg = ''
    
    if 'numClusters' in kwargs:
        k = int(kwargs['numClusters'])
    if 'npasses' in kwargs:
        npasses = int(kwargs['npasses'])
    if 'method' in kwargs:
        method = kwargs['method']
    if 'initialCenters' in kwargs:
        initialCenters = kwargs['initialCenters']
    if 'smartCenters' in kwargs:
        smartCenters = kwargs['smartCenters']
    
    
    logData = tm.getMethod('log')(data)
    if initialCenters is None:
        (clusterIDs, err, nOpt) = pc.kcluster(logData, k, npass=npasses, method=method)
        msg = "Number of rounds optimal solution was found: %i" % nOpt
    else:
        logCenters = tm.getMethod('log')(np.array(initialCenters[:k]))
        (centroids, clusterIDs) = kmeans2(logData, logCenters, minit='matrix')
        if len(np.unique(clusterIDs)) < k:
            wx.MessageBox('Warning: One or more of the returned clusters are empty. Please choose different initial cluster centers and re-run k-means for better results.', 'Insufficiently varied cluster centers', wx.OK | wx.ICON_WARNING)
            
    
    return clusterIDs, msg
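The Pycluster branch of kmeans above reduces to a single kcluster call; a self-contained equivalent of just that path, with numpy's log standing in for tm.getMethod('log') and random positive data for illustration:

import numpy as np
import Pycluster as pc

data = np.random.rand(100, 3) + 1e-6     # illustrative, strictly positive values
logData = np.log(data)                   # stand-in for tm.getMethod('log')
clusterIDs, err, nOpt = pc.kcluster(logData, 4, npass=5, method='a')
print("Number of rounds optimal solution was found: %i" % nOpt)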
Example #34
def Kmedoids(num_patches, samples, progress=None):
    """Estimate patches as centroids of samples using k-Medoids.

  This requires the `Pycluster` library to be installed.

  :param int num_patches: number of patches to create
  :type samples: 2D array
  :param samples: example patches
  :param progress: ignored
  :rtype: 2D array with `num_patches` rows and N columns, where N is the number
     of columns in `samples`.
  :return: created patches

  """
    logging.info("Learning %d prototypes per size by k-Medoids clustering" %
                 num_patches)
    import Pycluster
    dist = Pycluster.distancematrix(samples)
    cluster_ids, _, _ = Pycluster.kmedoids(dist, nclusters=num_patches)
    # `cluster_ids` contains `num_patches` unique values, each of which is
    # the index of the medoid for a different cluster.
    return samples[np.unique(cluster_ids)].astype(ACTIVATION_DTYPE)
Example #35
def Kmedoids(num_patches, samples, progress=None):
  """Estimate patches as centroids of samples using k-Medoids.

  This requires the `Pycluster` library to be installed.

  :param int num_patches: number of patches to create
  :type samples: 2D array
  :param samples: example patches
  :param progress: ignored
  :rtype: 2D array with `num_patches` rows and N columns, where N is the number
     of columns in `samples`.
  :return: created patches

  """
  logging.info("Learning %d prototypes per size by k-Medoids clustering" %
      num_patches)
  import Pycluster
  dist = Pycluster.distancematrix(samples)
  cluster_ids, _, _ = Pycluster.kmedoids(dist, nclusters=num_patches)
  # `cluster_ids` contains `num_patches` unique values, each of which is
  # the index of the medoid for a different cluster.
  return samples[np.unique(cluster_ids)].astype(ACTIVATION_DTYPE)
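The core of both Kmedoids variants is distancematrix followed by kmedoids; a minimal standalone sketch, with np.float32 standing in for the project's ACTIVATION_DTYPE:

import numpy as np
import Pycluster

samples = np.random.rand(50, 8)          # illustrative example patches
dist = Pycluster.distancematrix(samples)
cluster_ids, _, _ = Pycluster.kmedoids(dist, nclusters=5)
patches = samples[np.unique(cluster_ids)].astype(np.float32)
print(patches.shape)                     # (5, 8): one medoid row per cluster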
Example #36
def getlabels(x, y, n = 1000 , k = 8):
    if y == "none":
        y = x
    #fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters = k, transpose=0,
                                     method='a', dist='e', npass = n)
    #write labels back
    x.loc[:,"group"] = labels
    #count how many items in each group
    labels = list(labels)
    for i in range(k):
        print labels.count(i)
    return(x)
Example #37
def reassignClusterIDs(src, dst):
    """
    Given the cluster centers for two clusterings, determine the centers most 
    similar to each other and reassign the cluster ids to match.
    """
    srcFCS = DataStore.getData()[src[0]]
    dstFCS = DataStore.getData()[dst[0]]
    
    srcdata = srcFCS.data
    if srcFCS.selDims:
        srcdata = dh.filterData(srcFCS.data, srcFCS.selDims)
    srcids = srcFCS.clustering[src[1]]
    srccenters = pc.clustercentroids(srcdata, clusterid=srcids)[0]
    
    dstdata = dstFCS.data
    if dstFCS.selDims:
        dstdata = dh.filterData(dstFCS.data, dstFCS.selDims)
    dstids = dstFCS.clustering[dst[1]]
    dstcenters = pc.clustercentroids(dstdata, clusterid=dstids)[0]
    
    srcsep = separate(srcdata, srcids)
    dstsep = separate(dstdata, dstids)

    centerEQ = {}
    taken = []
    # Fill the map with the closest source center for each destination center
    for i,dc in enumerate(dstcenters):
        bestDist = -1
        for j,sc in enumerate(srccenters):
            if (j not in taken):
                dist = nonSymmetricClusterDistance(dstsep[i], srcsep[j])
                if (bestDist < 0) or (dist < bestDist):
                    bestDist = dist
                    centerEQ[i] = j
        taken.append(centerEQ[i])
    
    # Renumber the cluster IDs in the destination to match the IDs of the closest src center
    tmp = [centerEQ[id] for id in dstids]
    DataStore.getData()[dst[0]].clustering[dst[1]] = tmp
Example #38
def main():
    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    logging.debug('Reading %s', args.input.name)
    src_nodes = np.loadtxt(args.input, dtype=int)

    lattice_width, lattice_height = src_nodes.shape
    lattice = nx.grid_2d_graph(lattice_width, lattice_height)

    # find and remove wall
    wall_nodes = map(lambda e: tuple(e), np.transpose(np.nonzero(src_nodes)))
    lattice.remove_nodes_from(wall_nodes)
    assert len(lattice.nodes()) == (lattice_width * lattice_height -
                                    len(wall_nodes))

    nodelist = list(lattice.nodes())
    node_ids = {n: i for i, n in enumerate(nodelist)}
    assert len(nodelist) == len(node_ids)

    # compute normalized laplacian
    norm_lapl = normalized_laplacian(lattice, nodelist, node_ids)

    # compute eigenvalues and eigenvectors
    eigen_val, eigen_vec = np.linalg.eig(norm_lapl)
    # kmeans
    labels, _, _ = Pycluster.kcluster(eigen_vec[:, :args.kappa + 1],
                                      args.kappa,
                                      dist='e',
                                      npass=100,
                                      initialid=None)
    # assign colors
    colors = [COLORS[i] for i in labels]
    assert len(colors) == len(labels)
    # compute grid lattice_height x lattice_width containing colors
    grid = []
    colored, non_colored = 0, 0
    its = 0
    for i in xrange(lattice_height):
        grid.append([])
        for j in xrange(lattice_width):
            node_id = node_ids.get((i, j))
            color = colors[node_id] if node_id is not None else BLACK
            grid[i].append(color)
            if color == BLACK:
                non_colored += 1
            else:
                colored += 1
    assert non_colored == len(wall_nodes)
    display(grid)
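normalized_laplacian is defined elsewhere in the project. Under the usual definition L = I - D^(-1/2) A D^(-1/2), a sketch could simply delegate to networkx; node_ids is accepted only for signature parity and is unused in this reconstruction:

import numpy as np
import networkx as nx

def normalized_laplacian(graph, nodelist, node_ids):
    # Dense symmetric normalized Laplacian over the given node order.
    return np.asarray(nx.normalized_laplacian_matrix(graph, nodelist=nodelist).todense())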
Example #39
    def DoClustering(self, nclusters=30, distance_matrix=None):
        # Avoid doing the work twice
        if not self._tree_done:
            df_nc = self._df_nodes[self._df_nodes['ID']>=0].copy()

            data = df_nc[['JuvenileMass', 'AdultMass']]
            data = data.as_matrix()
            data = self._scale_function(data)

            if(self._normalize_data==True):
                data = whiten(data)

            data = np.c_[data,100.*df_nc.FunctionalGroup.values]

            if distance_matrix:
                self._tree = pc.treecluster(distancematrix=distance_matrix)
            else:
                self._tree = pc.treecluster(data)

            self._data = data
            self._tree_done = True

        self.FillClusterIndividualData(self._tree.cut(nclusters))
Example #40
def clusterCatWithMediodsAndNetwork(threshold, \
				    lowerLimit, upperLimit, featMan, \
				    weightMatrix, samePairsSet, \
				    differentPairsSet, catQueryDist, \
				    catNetwork, \
				    outFile = 'cat-clusters-with-med.txt'):
	#cluster each cat find the outliers
	#move them to parents
	metrics = {}
	for noTerms in range(lowerLimit, upperLimit, 2):
		cluster_list = []
		#fclusters = []
		i = 0
		oFile = open(outFile+str(noTerms)+'.txt','w')
		for cat, qSet in catQueryDist.items():
			if len(qSet) > 1: # and cat in pairs:
				k = len(qSet)/noTerms
				if k == 0:
					k = 1
				#print cat, len(qSet), k
				qList = list(qSet)
				catDist = getWeightMatrixForKMed(qList, weightMatrix)
				clusArray, error, opt = clust.kmedoids(catDist,k, 5, None)
				#print 'Queries', qList
				clusters = {}
				for c in range(len(clusArray)):
					clusId = clusArray[c]
					if clusId not in clusters:
						clusters[clusId] = set()
					clusters[clusId].add(qList[c])
				#outliers = getOutliers(qList,catDist)
				for entry in clusters.values():
					cluster_list.append(list(entry))
					qStr = toString(entry,featMan)
					oFile.write(cat+'\t'+qStr+'\n');
					#fclusters.append(qStr)
				print 'Clust ',cat, len(clusters), error, opt
				if i % 50 == 0:
					print i
				i+=1
		predictedSamePairsSet, predictedDifferentPairsSet = \
						getPairLabelsFromClusters(cluster_list,featMan)
		key = str(threshold)+'_'+str(noTerms)
		metrics[key] = getRecallPrecision(samePairsSet, differentPairsSet,\
			     		            predictedSamePairsSet,\
			     		            predictedDifferentPairsSet)
		oFile.close()
	for tcount, met in metrics.items():
		print tcount, met
	return metrics
Example #41
    def DoClustering(self,nclusters=30):
        '''Main clustering function'''
        
        gx = self._gx; func = self._scale_function
        
        nid,jm,am,fg=zip(*[(x,gx.node[x]['JuvenileMass'],gx.node[x]['AdultMass'],gx.node[x]['FunctionalGroup']) for x in gx.node.keys()])
        data = np.c_[func(jm),func(am)]
        
        if(self._normalize_data==True):
            data = whiten(data)        
        data = np.c_[data,1000*np.array(fg)]
       
        if self._algorithm == Aggregation._HIERARCHICAL_CLUSTERING:

            if not self._tree_done:
                if self._distance_matrix:
                    self._tree = pc.treecluster(distancematrix=self._distance_matrix)
                else:
                    self._tree = pc.treecluster(data)
            
                self._tree_done = True

        self._data = data        
        self._nodes_ids = nid
        clusters_ids = self._tree.cut(nclusters)
        self._clusters_ids = clusters_ids
        self._nclusters = len(np.unique(self._clusters_ids))
        
        cluster_attrib = dict(zip(nid,clusters_ids))
        nx.set_node_attributes(gx,'cluster',cluster_attrib)
        self._gx = gx
        
        for cid in clusters_ids:

            fg = [gx.node[x]['FunctionalGroup'] for x in gx.node.keys() if gx.node[x]['cluster']==cid]
            if len(np.unique(fg)) != 1:
                raise Exception('Many functional groups inside the same cluster!!!!!! A CRASH JUST HAPPENED, just joking!!!!')
Example #42
def main():
    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    logging.debug('Reading %s', args.input.name)
    src_nodes = np.loadtxt(args.input, dtype=int)

    lattice_width, lattice_height = src_nodes.shape
    lattice = nx.grid_2d_graph(lattice_width, lattice_height)

    # find and remove wall
    wall_nodes = map(lambda e: tuple(e),
                     np.transpose(np.nonzero(src_nodes)))
    lattice.remove_nodes_from(wall_nodes)
    assert len(lattice.nodes()) == (lattice_width * lattice_height - len(wall_nodes))

    nodelist = list(lattice.nodes())
    node_ids = {n: i for i, n in enumerate(nodelist)}
    assert len(nodelist) == len(node_ids)

    # compute normalized laplacian
    norm_lapl = normalized_laplacian(lattice, nodelist, node_ids)
    
    # compute eigenvalues and eigenvectors
    eigen_val, eigen_vec = np.linalg.eig(norm_lapl)
    # kmeans
    labels, _, _ = Pycluster.kcluster(eigen_vec[:, :args.kappa+1],
                                      args.kappa,
                                      dist='e', npass=100, initialid=None)
    # assign colors
    colors = [COLORS[i] for i in labels]
    assert len(colors) == len(labels)
    # compute grid lattice_height x lattice_width containing colors
    grid = []
    colored, non_colored = 0, 0
    its = 0
    for i in xrange(lattice_height):
        grid.append([])
        for j in xrange(lattice_width):
            node_id = node_ids.get((i, j))
            color = colors[node_id] if node_id is not None else BLACK
            grid[i].append(color)
            if color == BLACK:
                non_colored += 1
            else:
                colored += 1
    assert non_colored == len(wall_nodes)
    display(grid)
Example #43
def cluster(fname, nclust):
    fh = open(fname, 'r')
    lines = fh.readlines()
    fh.close()

    clusters = int(nclust)

    points = []
    points_r = []
    dates = []
    volumes = []
    close_prices = []

    for i in range(len(lines)):
        if i <= 1:
            continue
        line_c = lines[i - 1].strip().split(',')
        close_price = float(line_c[0])
        volume = float(line_c[1])

        points_r.append((close_price, volume))
        volumes.append(volume)
        close_prices.append(close_price)
        #dates.append(line_c[0])

    volume_z = np.array(volumes)
    #volume_z = stats.zscore(a)
    close_price_z = np.array(close_prices)
    #close_price_z = stats.zscore(a)

    points = zip(close_price_z, volume_z)

    init_data = []
    k = len(points) / (nclust)

    for i in range(nclust - 1):
        for j in range(k):
            init_data.append(i)

    while (len(points) != len(init_data)):
        init_data.append(nclust - 1)
    #print(clusters)

    labels, error, nfound = Pycluster.kcluster(points, clusters, None, None, 0,
                                               1, 'a', 'e', init_data)
    labels_sorted = sort_labels(labels)
    #print('Labels: ')
    print labels_sorted
    return labels_sorted
Example #44
def generate_network_clusters(G):
    # Creates the cluster partitions using hierarchical clustering
    # on geodesic distances.
    # First check to make sure the given network is a single fully
    # connected component.
    if len(NX.component.connected_component_subgraphs(G)) > 1:
        raise NX.NetworkXError, 'G must be single component! Extract main component...'
    # Now generate clusters
    dist_matrix = get_dist_matrix(G)
    # Default hierarchical clustering algorithm used
    hclus = PC.treecluster(data=None, distancematrix=dist_matrix, method='m')
    partitions = {}   # dictionary of the partitioning at each cut in the hierarchy
    for c in range(1, len(hclus)+1):  # treecluster cuts start at 1
        partitions[c] = hclus.cut(c).tolist()
    return partitions
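get_dist_matrix is not shown. Since the comments ask for geodesic distances, a plausible reconstruction walks all-pairs shortest paths with networkx (written against networkx 2.x, where all_pairs_shortest_path_length yields (node, dict) pairs):

import numpy as np
import networkx as nx

def get_dist_matrix(G):
    # Dense matrix of pairwise geodesic (shortest-path) distances.
    nodes = list(G.nodes())
    index = {node: i for i, node in enumerate(nodes)}
    dist = np.zeros((len(nodes), len(nodes)))
    for src, lengths in nx.all_pairs_shortest_path_length(G):
        for dst, d in lengths.items():
            dist[index[src], index[dst]] = d
    return dist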
Example #45
	def __init__(self, numComps=None, dim=5, data=None, epsilon=math.pow(10, -10), wishartScalar=1, wishartScale=None, dirichlet=None, normalMu=0, normalSigma=None):
		# Defaults that depend on other arguments cannot be evaluated in the
		# signature, so they are filled in here.
		if wishartScale is None:
			wishartScale = np.identity(dim)
		if normalSigma is None:
			normalSigma = np.identity(dim)
		if dirichlet is None:
			dirichlet = np.ones(numComps)

		# INITIALIZE ALL POSTERIOR PARAMETERS

		self.d = dim
		self.k = numComps
		self.n = len(data)

		# INITIALIZE ALL PRIOR PARAMETERS

		self.e = normalSigma
		self.m = normalMu
		self.w = wishartScale
		self.v = wishartScalar
		self.di = dirichlet
		self.epsilon = epsilon

		# INITIALIZE ALL PRIORS USING k-means CLUSTERING

		# INITIALIZE THE MUS

		labels, error, nfound = pc.kcluster(data, self.k)#, iter=300, thresh=1e-05)
	
		centroids, _ = pc.clustercentroids(data, clusterid=labels)

		self.mu = centroids

		self.pointsInComp = [[] for comp in xrange(self.k)]
		for n in xrange(self.n):
			self.pointsInComp[labels[n]].append(data[n])

		# INITIALIZE THE COVARIANCE MATRIX
		self.sigma = [np.cov(np.array(kpoints).T) for kpoints in self.pointsInComp]

		# INITIALIZE THE WEIGHTS
		self.pi = [float(len(l))/data.shape[0] for l in self.pointsInComp]
Example #46
def tree_cluster_test(data, real_labels, outputfile=None):
    start = time.time()
    tree = Pycluster.treecluster(data, method='m')

    ks = range(25, 50, 1)
    if outputfile != None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'hierarchical clustering when k=%d' % k
        predicted = tree.cut(k).tolist()
        if outputfile != None:
            f.write(out_result(predicted, k, real_labels))

    if outputfile != None:
        f.close()

    elasped = time.time() - start
    print 'hierarchical clustering time: %.3f' % (elasped / float(len(ks)))
Example #47
def clustering(x,y,cost,ngroup=2):
    if CLUSTER == "scipy":
        z = whiten(cost)

        # let scipy do its magic (k==3 groups)
        res, labels = kmeans2(array(list(zip(x,y,z))),ngroup)

    if CLUSTER == "Pycluster":
        points = np.zeros((x.shape[0], 2))
        points[:,0] = x
        points[:,1] = y

#        labels, error, nfound = Pycluster.kcluster(points, ngroup, weights=cost)
        labels, error, nfound = Pycluster.kcluster(points, ngroup)

    return labels
Example #48
def kmeans_cluster_test(data, real_labels, outputfile=None):
    start = time.time()

    ks = range(8, 15)
    if outputfile != None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'running kmeans when k=%d' % k
        predicted = Pycluster.kcluster(data, k)[0].tolist()
        if outputfile != None:
            f.write(out_result(predicted, k, real_labels))

    if outputfile != None:
        f.close()
    elasped = time.time() - start
    print 'Average time: %.3f' % (elasped / float(len(ks)))
Example #49
def tree_cluster_test(data,real_labels, outputfile = None):
    start = time.time()
    tree = Pycluster.treecluster(data, method='m')    

    ks = range(25,50,1)
    if outputfile != None:
        f = open(outputfile,'w')
        f.write(out_result_header())
    for k in ks:
        print 'hierarchical clustering when k=%d' % k
        predicted = tree.cut(k).tolist()
        if outputfile != None:
            f.write(out_result(predicted,k, real_labels))

    if outputfile != None:
        f.close()

    elasped = time.time() - start
    print 'hierarchical clustering time: %.3f' % (elasped/float(len(ks)))
Example #50
def cluster(D, k):
    import Pycluster as pcl
    labels, _, _ = pcl.kmedoids(D, nclusters=k, npass=10, initialid=None)
    errors = np.array([D[labels[i], i] for i in range(len(labels))])
    centroidids = np.unique(labels)
    cmap = np.zeros(labels.max() + 1)
    for c in centroidids:
        cmap[c] = np.nonzero(centroidids == c)[0][0]
    labels = cmap[labels]
    logger.debug('k-medoids (k=%i): %.2f.' % (k, errors.sum()))
    return labels, {
        'method': 'kmedoids',
        'init': 'random',
        'k': k,
        'centroidids': centroidids,
        'errors': errors,
        'error': errors.sum(),
        'error-label': 'sum of distances'
    }
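cluster above expects a full square distance matrix D and a module-level logger; an illustrative call, where the logger line mimics what the original module presumably sets up:

import logging
import numpy as np
from scipy.spatial.distance import pdist, squareform

logger = logging.getLogger(__name__)   # the snippet logs through a module-level logger

X = np.random.rand(30, 4)              # illustrative points
D = squareform(pdist(X))               # full square Euclidean distance matrix
labels, info = cluster(D, k=3)
print(info['error'])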
Example #51
def resolution_clustering(clusters, cluster_ids, sampled, kx=2):
    X = np.array([np.append(np.append(c[0], c[1]), c[2]) for c in clusters])
    n = X.shape[1] / 3
    Xn = X / ([np.average(X[:, :n])] * n + [np.average(X[:, n:2 * n])] * n +
              [np.average(X[:, 2 * n:])] * n)
    C, e, nf = Pycluster.kcluster(Xn, len(clusters) / len(sampled) * kx)
    del Xn
    Cidx = defaultdict(list)
    for i, c in enumerate(C):
        Cidx[c].append(i)
    CStable = []
    for k, v in Cidx.items():
        members = set()
        for c in v:
            members.update(clusters[c][3])
        members = sorted(members)
        s = stability(members, cluster_ids)
        CStable.append((s, np.average(X[v], axis=0).reshape(
            (3, X.shape[1] / 3)), members))
    return CStable
Example #52
    def _guide_tree(self, dist_matrix):
        """
        @summary: Build a guide tree from the distance matrix

        @param dist_matrix: The distance matrix
        @type dist_matrix: numpy.ndarray
        @return: Pycluster similarity tree
        @rtype: Pycluster.cluster.Tree

        @author: Woon Wai Keen
        @author: Vladimir Likic
        """

        n = len(dist_matrix)

        print " -> Clustering %d pairwise alignments." % (n * (n - 1)),
        tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a')
        print "Done"

        return tree
Example #53
def matchs_ia():

    # if not current_user.admin:
    #     return jsonify({'message' : 'Cannot perform that function!'})

    tests = TestR.query.all()

    dataSet = []
    trans_tab = dict.fromkeys(map(ord, u"\u0301\u0308"), None)
    for test in tests:
        # test_data = {}
        resident = Residents.query.filter_by(public_id=test.public_id).first()
        test_data = ("-" + str(resident.id) + "-" + test.gender +
                     str(test.age) + test.musicGender + test.sport +
                     test.hobbie + test.movieSeries + test.filmGender +
                     test.tabaco + test.alcohol + test.party +
                     str(test.ordenConvivencia) + str(test.ordenPersonal) +
                     test.personalidad)
        test_data = normalize(
            "NFKC",
            normalize("NFKD", test_data).translate(trans_tab))
        dataSet.append(test_data)

    distans = [
        distance.edit_distance(dataSet[i], dataSet[j])
        for i in range(1, len(dataSet)) for j in range(0, i)
    ]

    labels, error, nfound = PC.kmedoids(distans, nclusters=5, npass=10)
    cluster = dict()
    output = []
    for roommate, label in zip(dataSet, labels):
        cluster.setdefault(label, []).append(roommate)
    for label, grp in cluster.items():
        cluster_data = {}
        cluster_data["Roommate"] = grp
        cluster_data["Count"] = len(grp)
        cluster_data["label"] = str(label)
        output.append(cluster_data)

    return jsonify({"testsALL": output}, {"error": error}, {"nfound": nfound})
Example #54
    def cluster(self, num_cluster):
        category_tfidf = self.category_tfidf
        categories = list(category_tfidf)
        random.shuffle(categories)

        tfidf_norms = {category: sum(value**2 for value in tfidf.values())
                       for category, tfidf in category_tfidf.items()}

        for category, norm in tfidf_norms.items():
            if not norm:
                raise Exception((category, category_tfidf[category]))

        distances = []
        for i, category1 in enumerate(categories):
            cat1_tfidf = category_tfidf[category1]
            row_array = array([0.0] * i)
            for j, category2 in enumerate(categories):
                if j >= i:
                    break
                row_array[j] = self.compute_distance(cat1_tfidf, category_tfidf[category2], tfidf_norms[category1], tfidf_norms[category2])

            distances.append(row_array)

        clusterids, error, nfound = Pycluster.kmedoids(distances, num_cluster)
        print error

        category_clusters = [[] for _ in range(num_cluster)]

        print len(clusterids)
        print len(categories)
        print clusterids

        clusterid_map = {}


        for i, category in enumerate(clusterids):
            category_id = clusterid_map.setdefault(category,
                                                   len(clusterid_map))
            category_clusters[category_id].append(categories[i])

        return category_clusters
Example #55
    def __kmeans_initialization(self):
        """
        given the data points, cluster them by applying kmeans clustering
        algorithm.
       """
        # apply kmeans clustering to get the centroids and labels for each vector in data
        labels, error, nfound = Pycluster.kcluster(self._data, self._nClusters)

        # get the dimension of the input data
        rows, cols = self._data.shape

        clusterData = [[] for i in xrange(self._nClusters)]

        # assign vectors to clusters
        for data, label in zip(self._data, labels):
            clusterData[label].append(data)

        models = [GaussianCluster( *muAndSigma(clusterData[i], cols)) for i in xrange(self._nClusters)]

        apriori = np.ones(self._nClusters, dtype = np.float32) / np.array([len(elem) for elem in clusterData])

        return models, apriori
Example #56
def kmeans(k, table):
    # k = 50
    (labels, error, nfound) = pc.kcluster(table, k, None, None, 0, 20, 'a',
                                          'b')
    #  plot.plot_scatter(table, labels, k)

    #  centers = get_centers(table, labels)
    #  np.random.shuffle(table)
    #  tab = [map(float, x) for x in table[:1000]]

    #  mycluster = mc.MyClustering(tab, k)
    #  mycluster.init_heap()
    #  mycluster.hierarchy_cluster()
    #  mycluster.clear_sample_points()

    #  for i, row in enumerate(table):
    #    if i % 1000 == 0:
    #      print 'progress: %d' % i
    #    mycluster.add_point(i, map(float, row))

    #  mycluster.get_cluster()

    return labels
Example #57
def som_cluster_test(data, real_labels, outputfile=None):
    if outputfile != None:
        f = open(outputfile, 'w')
        f.write(out_result_header())

    start = time.time()
    ks = range(6, 40)
    for k in ks:
        print 'som clustering when k=%d' % k
        predicted = Pycluster.somcluster(data,
                                         nxgrid=k,
                                         nygrid=1,
                                         niter=5,
                                         dist='u')[0]
        predicted = [xy[0] for xy in predicted.tolist()]
        cata = tuple(set(predicted))
        for i in range(0, len(predicted)):
            predicted[i] = cata.index(predicted[i])
        if outputfile != None:
            f.write(out_result(predicted, k, real_labels))

    if outputfile != None:
        f.close()

    elasped = time.time() - start
    print 'som clustering time: %.3f' % (elasped / float(len(ks)))
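Pycluster.somcluster returns a (clusterid, celldata) pair, where each row of clusterid is the (x, y) grid cell a point was mapped to; the test above keeps only the x index. A minimal standalone call on random data:

import numpy as np
import Pycluster

data = np.random.rand(100, 5)          # illustrative data
clusterid, celldata = Pycluster.somcluster(data, nxgrid=4, nygrid=1, niter=5)
labels = [xy[0] for xy in clusterid.tolist()]   # x-grid position as cluster label
print(labels[:10])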