Example #1
0
 def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
     """Pick observation indexes round-robin across clusters.

     The candidate observations are passed through ``normalize(..., axis=0)``
     and clustered (``KMeans`` when ``kmeansFlag`` is true, spectral
     clustering otherwise).  Indexes are then taken one per cluster per pass,
     in the order they appear in ``indexesToPick``, until
     ``max(2, stopCount)`` indexes have been picked or every candidate has
     been used.

     Parameters
     ----------
     indexesToPick : sequence of int
         Candidate positions into ``self.observations``.
     stopCount : int
         Requested number of picks; at least two are always attempted.
     kmeansFlag : bool, optional
         Select KMeans (default) or spectral clustering.

     Returns
     -------
     list
         The picked entries of ``indexesToPick``; empty when there are no
         candidates.
     """
     print("ClusterBalancing...")
     indexesPicked = []
     # Guard before touching the data: there is nothing to normalise or
     # cluster when no candidates were supplied.
     if len(indexesToPick) == 0:
         return indexesPicked

     obs = normalize(self.observations[indexesToPick], axis=0)
     # Never request more clusters than there are observations.
     n_clusters = min(len(obs), self.numClusters)
     if kmeansFlag:
         cluster = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
     else:
         # ``spectral_clustering`` is a plain function that needs an affinity
         # matrix; the estimator class is the one exposing fit()/labels_.
         from sklearn.cluster import SpectralClustering
         cluster = SpectralClustering(n_clusters=n_clusters, n_init=10)
     cluster.fit(obs)

     # Work on a private copy so the estimator's labels_ stay intact.
     labels = list(cluster.labels_)
     # Cap the target so the loop ends once every candidate has been used.
     target = min(max(2, stopCount), len(labels))
     while len(indexesPicked) < target:
         open_clusters = set(range(self.numClusters))
         picked_before = len(indexesPicked)
         for position, label in enumerate(labels):
             if label not in open_clusters:
                 continue
             indexesPicked.append(indexesToPick[position])
             open_clusters.discard(label)
             labels[position] = -1  # mark as consumed for later passes
             if len(indexesPicked) == target or not open_clusters:
                 break
         if len(indexesPicked) == picked_before:
             # A full pass found nothing new: stop instead of spinning.
             break
     return indexesPicked
def test_spectral_clustering_with_arpack_amg_solvers():
    """Check that the arpack and amg eigen solvers yield the same partition.

    Builds a small 40x40 two-disc image (based on the toy example from
    plot_segmentation_toy.py), clusters its pixel graph with arpack and, when
    ``amg_loaded`` is true, with amg as well.  Without amg the 'amg' request
    must raise ValueError.
    """
    rows, cols = np.indices((40, 40))

    # (centre, radius) of the two discs that make up the image
    discs = [((14, 12), 8), ((20, 25), 7)]
    inside = [
        (rows - centre[0]) ** 2 + (cols - centre[1]) ** 2 < radius ** 2
        for centre, radius in discs
    ]

    circles = inside[0] | inside[1]
    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    labels_arpack = spectral_clustering(
        graph, n_clusters=2, eigen_solver='arpack', random_state=0)
    assert len(np.unique(labels_arpack)) == 2

    if not amg_loaded:
        # Without pyamg the solver request itself has to fail.
        assert_raises(
            ValueError, spectral_clustering,
            graph, n_clusters=2, eigen_solver='amg', random_state=0)
        return

    labels_amg = spectral_clustering(
        graph, n_clusters=2, eigen_solver='amg', random_state=0)
    assert adjusted_rand_score(labels_arpack, labels_amg) == 1
Example #3
0
	def __init__(self, laplacian,ncluster,classesnames):
		self.laplacian = laplacian
		self.ncluster = ncluster
		m,n=laplacian.shape
		print 'size Laplacian_matrix: ',m, n
		labels = spectral_clustering(laplacian, n_clusters=ncluster)

		x=range(n+1)
		wordsall=zip(x, classesnames)
		lc= zip(labels,x)
		print "labels", lc
		allwordsclustered=[]
		for m in range(ncluster):
			sort=[item[1] for item in lc if item[0] == m]
			wordsclustered=[]

			for y in sort:

				for item in wordsall:
				 if item[0] == y:
				  wordsclustered.append(item[1])
			if len(wordsclustered) >1:	
				allwordsclustered.append(wordsclustered)

		print'clusteredwords'
		print allwordsclustered
		
		self.cluster=  len(allwordsclustered),allwordsclustered
Example #4
0
def test_spectral_amg_mode():
    """Exercise spectral clustering with ``mode="amg"``.

    When pyamg can be imported, clustering three blobs must run and agree
    with the true labels on more than 30% of the samples.  Otherwise asking
    ``spectral_embedding`` for the "amg" mode must raise ValueError.
    """
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    n_centers = len(centers)
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    distances = pairwise_distances(X)
    # Turn distances into similarities and store them sparsely.
    S = sparse.coo_matrix(np.max(distances) - distances)

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        amg_loaded = False
    else:
        amg_loaded = True

    if not amg_loaded:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=n_centers, random_state=0, mode="amg")
        return

    labels = spectral_clustering(S, n_clusters=n_centers,
                                 random_state=0, mode="amg")
    # Quality is not the point here, only that it ran; still require a
    # minimal agreement with the ground truth.
    assert_greater(np.mean(labels == true_labels), .3)
def spectral(tweetfile,npmifile,dictfile,k,noc):
	"""Cluster words by spectral clustering of a k-NN graph built from NPMI.

	Reads the triples in ``npmifile`` and the dictionary in ``dictfile`` via
	``textscan``, builds a sparse matrix from them, reduces it with
	``knnmatrix(PMI, k)``, keeps the connected component selected by
	``mstats.mode`` and clusters it into ``noc`` groups.  For every cluster
	the output of ``centralityn`` is written to ``cl.<tweetfile>-<noc>`` as
	``word cluster score`` lines.
	"""
	Ptmp=textscan(npmifile,'([^ ]*) ([^ ]*) ([^ ]*)')
	PP=textscan(dictfile,'(.*) (.*)',(int,str))
	# ids in the files are 1-based; shift to 0-based matrix positions
	PP[0] -= 1
	PMI=ssp.coo_matrix(
		(Ptmp[2],(Ptmp[0]-1,Ptmp[1]-1)),
		(PP[0].shape[0],PP[0].shape[0])
	).tocsr()

	W=knnmatrix(PMI,k)
	# This is hideous and wrong and it must be fixed
	# (densifies both matrices just to take an element-wise minimum).
	W=ssp.csr_matrix(minimum(W.todense(),W.T.todense()))

	s,comp = ssp.csgraph.connected_components(W,directed=False)
	comp_mode = mstats.mode(comp)[0]
	in_component = comp==comp_mode
	inds = [x for x in range(W.shape[0]) if in_component[x]]
	X = W[inds,:][:,inds]
	ids = PP[1][inds]

	c = spectral_clustering(X,n_clusters=noc, eigen_solver='arpack')
	# ``with`` guarantees the result file is flushed and closed.
	with open("".join(['cl.',tweetfile,'-',str(noc)]),'w') as fid:
		for i in range(max(c)+1):
			cl=[x for x in range(len(c)) if c[x] == i]
			b,wordsix = centralityn(cl,X,ids)
			for j in range(len(b)):
				word=wordsix[j]
				fid.write('%s %d %.5f\n'%(word,i,b[j]))
Example #6
0
  def getPairwiseDistanceMatrix(self):
    """
      Build a sparse 5-nearest-neighbour affinity matrix and cluster it.

      For every row of ``self.data_points`` the five smallest Euclidean
      distances are stored symmetrically in ``self.PDistMat``.  The stored
      values d are then replaced by exp(-d / mean), the matrix is pickled to
      'pdist.bnbb', clustered into 20 groups, and the labels are pickled to
      'labs.bnbb'.

      It is slightly slower but memory efficient; a fast implementation is
      not tractable in terms of memory for such a scale.
    """
    self.clusters = []
    dataSize = self.data_points.shape
    self.PDistMat = sp.sparse.csr_matrix((dataSize[0],dataSize[0]))
    for k in range(dataSize[0]):
      CurrentPoint = self.data_points[k,:]
      Dist = sp.spatial.distance.cdist(np.reshape(CurrentPoint,(1,dataSize[1])),self.data_points,'euclidean')
      kMins = []
      kDists = []
      # Sentinel larger than every distance: overwriting a picked entry with
      # it keeps argmin from returning the same neighbour again.
      maxD = np.max(Dist)+1
      while len(kMins)<5:
        cMins = np.argmin(Dist)
        kMins.append(cMins)
        kDists.append(Dist[0,cMins])
        Dist[0,cMins]=maxD
      for pt in range(len(kMins)):
        # store both directions so the matrix stays symmetric
        self.PDistMat[k,kMins[pt]]=kDists[pt]
        self.PDistMat[kMins[pt],k]=kDists[pt]

    SM=self.PDistMat.data.mean()
    # Low-level: transform only the stored entries through their data array.
    self.PDistMat.data[:] = np.exp(((-1)*self.PDistMat.data)/SM)

    # Context managers make sure both pickle files are flushed and closed.
    with open('pdist.bnbb','wb') as out:
      pickle.dump(self.PDistMat,out)
    labs = spectral_clustering(self.PDistMat,n_clusters=20)
    with open('labs.bnbb','wb') as out:
      pickle.dump(labs,out)
Example #7
0
    def classifySpeCluLsa(self, class_num):
        """Group titles into ``class_num`` clusters from LSA similarities.

        Loads the saved LSA model and similarity index, builds a square
        similarity matrix over the tf-idf vectors, clusters it with
        ``spectral_clustering`` and stores the title ids of each cluster in
        ``self.vector_table`` before printing and drawing them.
        """
        from draw_data import draw_data
        draw_title = draw_data()
        lsa = models.LsiModel.load('model.lsa', mmap='r')
        logging.info("load lsa model!!")
        index = similarities.MatrixSimilarity.load('model_lsa.index')
        self.get_data(num=3000)
        (tfidf, dictionary) = self.get_tfidf(True, num=3000)

        # title id -> position, used to address rows/columns of similar_matrix
        hash_id2list = dict()
        for position, title in enumerate(self.title_id):
            hash_id2list[title] = position

        logging.info('开始创建相似矩阵...')
        n_docs = len(tfidf)
        similar_matrix = np.zeros((n_docs, n_docs))  # holds the similarities
        for row in range(n_docs):
            for col, score in enumerate(index[lsa[tfidf[row]]]):
                # written in both directions; later rows overwrite earlier ones
                similar_matrix[row][col] = score
                similar_matrix[col][row] = score
        logging.info('done,相似矩阵建立完成,使用普聚类进行分类...')
        labels = spectral_clustering(similar_matrix, n_clusters=class_num, eigen_solver='arpack')

        self.vector_table = [[] for _ in range(class_num)]
        for position, label in enumerate(labels):
            self.vector_table[label].append(self.title_id[position])
        logging.info("print set... "+str(len(self.vector_table)))
        self.printTitleTOfile(hash_id2list)
        draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-12-25')
Example #8
0
def community_clustering():
    """Merge per-year communities into 200 clusters by member overlap.

    Every file found under ``settings.COMMUNITY_PATH`` is named after a year
    and holds ``author community_id`` pairs, one per line.  A ``Community``
    is created for each id, the pairwise ``intersect`` values form the
    affinity matrix for ``spectral_clustering``, and the members of each
    resulting cluster are written per year to
    ``settings.DATA_PATH + "\\clusters\\" + year`` as a comma-terminated list.
    """
    path = settings.COMMUNITY_PATH
    index = 0
    communities = []
    merged_communities = {}
    for root, dirs, files in os.walk(path):
        for year in files:
            merged_communities[int(year)] = [[] for i in range(200)]
            comm_dict = {}
            # NOTE(review): files are opened relative to ``path`` rather than
            # ``root``; this only works for a flat directory - confirm.
            with open(os.path.join(path, year)) as year_file:
                for line in year_file:
                    fields = line.strip().split(' ')
                    author = int(fields[0])
                    comm_id = int(fields[1])
                    if comm_id not in comm_dict:
                        comm_dict[comm_id] = Community(int(year), comm_id, index)
                        index += 1
                    comm_dict[comm_id].append_member(author)
            for comm_id in comm_dict.keys():
                communities.append(comm_dict[comm_id])
    verbose.debug("num of communities: "+str(len(communities)))
    # zeros, not np.ndarray(): the diagonal is never assigned below and must
    # not be left as uninitialised memory.
    adjacency = np.zeros(shape=(len(communities), len(communities)), dtype=int)
    for i in range(len(communities)):
        for j in range(i+1, len(communities)):
            affinity = communities[i].intersect(communities[j])
            adjacency[i, j] = affinity
            adjacency[j, i] = affinity
    labels = spectral_clustering(adjacency, n_clusters = 200)
    verbose.debug("clustering finished")
    for i in range(len(labels)):
        merged_communities[communities[i].year][labels[i]].extend(communities[i].members)
    for year in merged_communities.keys():
        with open(settings.DATA_PATH+"\\clusters\\"+str(year), 'w') as cluster_file:
            for members in merged_communities[year]:
                for member in members:
                    cluster_file.write(str(member)+',')
def speclu(data_matrix, k):
	#use spectral clustering
	print 'using spectral clustering......'
	E_matrix = getEMatrix(data_matrix)
	result_total = spectral_clustering(E_matrix, n_clusters = k)
	result = result_total[ : len(data_matrix)]
	return result
Example #10
0
def compute(n):
	"""Split the graph built for ``n`` into circles and keep the dense ones.

	The graph from ``build_graph(n)`` is turned into an adjacency matrix,
	``connectedness`` of it is clustered (10 clusters for 350 rows or more,
	otherwise 6) and each node is placed in the circle of its label.  Circles
	whose ``compute_density`` is below 0.25 (with a 1e-9 tolerance) are
	dropped.  Returns ``(ego, kept_circles)``.
	"""
	G , nodes , ego = build_graph(n)
	A = nx.to_numpy_matrix(G)
	C = connectedness(A)
	n_rows , n_cols = A.shape
	clus = 10 if n_rows >= 350 else 6
	L = spectral_clustering(C , n_clusters = clus)

	circles = [[] for _ in range(0 , clus)]
	for position , node in enumerate(nodes):
		circles[L[position]].append(node)

	# NOTE(review): this tests the number of circles, which is always
	# ``clus`` (6 or 10), so it is never true here; possibly
	# ``len(circle) == 1`` was intended - confirm before changing.
	only_one_circle = len(circles) == 1
	final_circle = []
	for circle in circles:
		if only_one_circle or not (compute_density(circle , nodes , A) + 1e-9 < .250):
			final_circle.append(circle)

	return ego , final_circle
Example #11
0
def image_features_labels(img,n_clusters,maxPixel):
    """Return the flattened spectral-clustering label map of one image.

    The image is resized to ``maxPixel`` x ``maxPixel``; its truthy pixels
    form the mask that is segmented into ``n_clusters`` regions.

    Returns a 1-D float array of length ``maxPixel * maxPixel`` holding the
    region label of every masked pixel and -1 elsewhere.
    """
    imageSize = maxPixel * maxPixel
    img = resize(img, (maxPixel, maxPixel))
    mask = img.astype(bool)
    # Convert the image into a graph with the value of the gradient on the
    # edges.
    graph = s_im.img_to_graph(img, mask=mask)

    # Take a decreasing function of the gradient: we take it weakly
    # dependent on the gradient, so the segmentation is close to a voronoi.
    graph.data = np.exp(-graph.data / graph.data.std())

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example.  ``n_clusters`` goes by keyword: newer
    # scikit-learn releases accept it as keyword-only.
    labels = spectral_clustering(graph, n_clusters=n_clusters,
                                 eigen_solver='arpack')
    label_im = -np.ones(mask.shape)
    label_im[mask] = labels

    # Flatten the label image into a single feature vector.
    X = np.zeros(imageSize, dtype=float)
    X[0:imageSize] = np.reshape(label_im, (1, imageSize))
    return X
Example #12
0
def cluster_nodes(dist_laplacian, clusters=3, show=False):
    """Cluster nodes from a normalised, sign-flipped, zero-diagonal Laplacian.

    ``dist_laplacian`` goes through ``Lapl_normalize``, has its diagonal
    zeroed and is negated before being handed to ``spectral_clustering``.
    With ``show`` the matrix is displayed first.  Returns the labels as a
    column of shape ``(n_nodes, 1)``.
    """
    affinity = Lapl_normalize(dist_laplacian)
    affinity.setdiag(0)
    affinity = -affinity

    if show:
        plt.imshow(affinity.toarray(), cmap='jet', interpolation="nearest")
        plt.colorbar()
        plt.show()

    flat_labels = spectral_clustering(affinity, n_clusters=clusters, eigen_solver='arpack')
    n_nodes = dist_laplacian.shape[0]
    return np.reshape(flat_labels, (n_nodes, 1))
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None,
                         method='ward', verbose=False):
    """Cluster spatially-organised data with Ward, spectral or k-means

    Parameters
    ==========
    X: array of shape(n_voxels, n_subjects)
       the functional data, across subjects
    n_parcels: int, the desired number of parcels
    xyz: array of shape (n_voxels, 3), optional
         positions of the voxels in grid coordinates
    shape: tuple: the domain shape (assuming a grid structure), optional
          alternative specification of positions
    mask: arbitrary array of arbitrary dimension,optional
          alternative specification of positions; when given without
          `shape`, the grid shape is taken from the mask itself
    method: string, one of ['ward', 'spectral', 'kmeans'], optional
            clustering method
    verbose: bool, optional
             currently unused

    Returns
    =======
    label: array of shape(n_voxels): the resulting cluster assignment

    Raises
    ======
    ValueError: if none of xyz, shape, mask is given, or method is unknown

    Note
    ====
    One of xyz, shape or mask needs to be provided
    """
    from sklearn.cluster import spectral_clustering, k_means
    if mask is not None:
        # A mask alone must be enough (see Note): fall back to its own shape
        # instead of unpacking a missing ``shape``.
        grid_shape = shape if shape is not None else np.shape(mask)
        connectivity = grid_to_graph(*grid_shape, mask=mask)
    elif shape is not None:
        connectivity = grid_to_graph(*shape)
    elif xyz is not None:
        from sklearn.neighbors import kneighbors_graph
        n_neighbors = 2 * xyz.shape[1]
        connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors)
    else:
        raise ValueError('One of mask, shape or xyz has to be provided')

    if n_parcels == 1:
        return np.zeros(X.shape[0])
    if method == 'ward':
        connectivity = connectivity.tocsr()
        ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X)
        label = ward.labels_
    elif method == 'spectral':
        i, j = connectivity.nonzero()
        # squared feature distance along every edge, computed once
        sq_dist = np.sum((X[i] - X[j]) ** 2, 1)
        sigma = sq_dist.mean()
        connectivity.data = np.exp(- sq_dist / (2 * sigma))
        label = spectral_clustering(connectivity, n_clusters=n_parcels)
    elif method == 'kmeans':
        _, label, _ = k_means(X, n_parcels)
    else:
        raise ValueError('Unknown method for parcellation')
    return label
def spectralClusteringTest01():
	import numpy as np
	import matplotlib.pyplot as plt

	from sklearn.feature_extraction import image
	from sklearn.cluster import spectral_clustering

	l = 100
	x,y = np.indices((l, l)) #x,y 都是二维矩阵, 表示了某点的x 和 y的坐标


	center1 = (28, 24)
	center2 = (40, 50)
	center3 = (67, 58)
	center4 = (24, 70)

	radius1, radius2, radius3, radius4 = 16, 14, 15, 14

	circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1 ** 2
	circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2 ** 2
	circle3 = (x - center3[0]) ** 2 + (y - center3[1]) ** 2 < radius3 ** 2
	circle4 = (x - center4[0]) ** 2 + (y - center4[1]) ** 2 < radius4 ** 2


	img = circle1 + circle2 + circle3 + circle4
	mask = img.astype(bool)
	img = img.astype(float)

	img += 1 + 0.2 * np.random.randn(*img.shape)

	#Convert the image into a graph with the value of the gradient on the edges

	#img就是一个100 * 100的图片
	#mask是一个bool型的100 * 100模板
	#graph是一个稀疏矩阵 -- 不过为什么是2678 * 2678 ?
	#估计这一步里面计算了梯度
	graph = image.img_to_graph(img, mask = mask)

	print graph.shape
	graph.data = np.exp(-graph.data / graph.data.std())

	#这里还是指定了聚类的中心数目
	#这里是只对mask内的点进行聚类
	labels = spectral_clustering(graph, n_clusters = 4, eigen_solver = "arpack")


	print labels

	label_im = -np.ones(mask.shape)
	label_im[mask] = labels

	plt.matshow(img)
	plt.matshow(label_im)

	plt.show()
def cluster_and_rank_demos(sm, n_clusters, eigen_solver='arpack', assign_labels='discretize'):
    """
    Cluster demos from the similarity matrix ``sm`` and rank them.

    Every demo index is filed under its spectral-clustering label; the
    mapping (one entry per cluster id, possibly empty) is passed with ``sm``
    to ``rank_demos_in_cluster``, whose result is returned.
    """
    labels = spectral_clustering(sm, n_clusters = n_clusters, eigen_solver=eigen_solver,assign_labels=assign_labels)
    members = dict((cluster_id, []) for cluster_id in xrange(n_clusters))
    for demo_index, cluster_id in enumerate(labels):
        members[cluster_id].append(demo_index)

    # Maybe re-cluster large demos
    return rank_demos_in_cluster(members, sm)
Example #16
0
def spectralcluster(correlations,n_clusters,names):
    labels=cluster.spectral_clustering(correlations,n_clusters=n_clusters, eigen_solver=None, random_state=0, n_init=10,  k=None, eigen_tol=0.0, 
    assign_labels='kmeans', mode=None)
    #print labels
    clusdict=[]
    print ""
    print "Spectral Clustering - shape: " + str(correlations.shape)
    for i in range(labels.max()+1):
        print 'Cluster %i: %s' % ((i+1),', '.join(names[labels==i]))
        clusdict.append(names[labels==i])
    #print clusdict                     
    return clusdict
def test_spectral_lobpcg_mode():
    """Check that spectral clustering runs with the lobpcg eigen solver.

    A fairly big data matrix is used because lobpcg does not work with small
    ones.  The result only has to agree with the true labels on more than
    30% of the samples.
    """
    blob_centers = np.array([[0.0, 0.0], [10.0, 10.0]])
    X, true_labels = make_blobs(n_samples=100, centers=blob_centers, cluster_std=0.1, random_state=42)
    distances = pairwise_distances(X)
    similarity = np.max(distances) - distances
    labels = spectral_clustering(similarity, n_clusters=len(blob_centers), random_state=0, eigen_solver="lobpcg")
    # It only has to have *worked*, but with some lower bound on performance.
    agreement = np.mean(labels == true_labels)
    assert_greater(agreement, 0.3)
Example #18
0
def affin_sclustering(X,n_clust, distance='euclid', gamma=0.1, std=1):
    """Spectral clustering of ``X`` through an explicit affinity matrix.

    Parameters
    ----------
    X : samples to cluster.
    n_clust : number of clusters.
    distance : 'euclid' (default) or 'cosine'.  'euclid' turns Euclidean
        distances into affinities with ``exp(-gamma * d)``; 'cosine' uses
        the result of ``cos(X)`` directly.
    gamma : scale of the exponential kernel ('euclid' only).
    std : when truthy, distances are divided by their standard deviation
        before the kernel is applied ('euclid' only).

    Returns
    -------
    The label array produced by ``cluster.spectral_clustering``.

    Raises
    ------
    ValueError
        If ``distance`` is not one of the supported names.
    """
    print('Basic spectral clustering using affinity matrix')
    if distance=='cosine':
        similarity=cos(X)#pairwise_distances(X, metric='cosine')
    elif distance=='euclid':
        dist=euclidean_distances(X)
        if std:
            similarity = np.exp(-gamma * dist/dist.std())
        else:
            similarity = np.exp(-gamma * dist)
    else:
        # Without this branch ``similarity`` stayed unbound and the call
        # below failed with an unrelated UnboundLocalError.
        raise ValueError(
            "Unknown distance %r; expected 'cosine' or 'euclid'" % (distance,))
    labels = cluster.spectral_clustering(similarity,n_clusters=n_clust, eigen_solver='arpack')
    return labels
	def __speclu(self):
		#use spectral clustering
		print 'using spectral clustering......'
		data_matrix = self.data_matrix
		if len(data_matrix) == len(data_matrix[0]):
			print "Donot need to use E_matrix"
			E_matrix = data_matrix
		else:
			E_matrix = self.__getEMatrix()
		result_total = spectral_clustering(E_matrix, n_clusters = self.k)
		result = result_total[ : len(data_matrix)]
		return result
Example #20
0
    def test_spectral_clustering(self):
        """ModelFrame's spectral_clustering accessor must match the plain call.

        A random symmetric 50x50 integer-valued matrix is clustered through
        ``df.cluster.spectral_clustering`` and through
        ``cluster.spectral_clustering``; type, index and values are compared.
        """
        size = 50
        raw = np.random.random_integers(1, 200, size=(size, size))
        # averaging with the transpose makes the matrix symmetric
        m = (raw + raw.T) / 2

        df = pdml.ModelFrame(m)
        seed = self.random_state
        result = df.cluster.spectral_clustering(random_state=seed)
        expected = cluster.spectral_clustering(m, random_state=seed)

        self.assertTrue(isinstance(result, pdml.ModelSeries))
        self.assert_index_equal(result.index, df.index)
        self.assert_numpy_array_equal(result.values, expected)
 def global_clustering_by_spectral(self):
     """Assign every local cluster of every time slide to a global cluster.

     The feature matrix from
     ``build_global_feature_vectors_by_jaccard_with_weight`` is clustered
     into ``self.num_global_clusters`` groups.  For each time slide the
     results go into ``self.global_clusters`` (local cluster positions per
     global cluster) and ``self.global_cluster_labels`` (global label per
     local cluster position).
     """
     n_global = self.num_global_clusters
     X = self.build_global_feature_vectors_by_jaccard_with_weight()
     logging.info("Global spectral clustering...")
     labels = spectral_clustering(X, n_clusters=n_global, eigen_solver='arpack')
     logging.info("Global spectral finished")

     self.global_clusters = [
         [[] for _ in range(n_global)]
         for _ in range(self.num_time_slides)
     ]
     self.global_cluster_labels = [
         [None for _ in range(self.num_local_clusters)]
         for _ in range(self.num_time_slides)
     ]

     for slide in range(self.num_time_slides):
         for local_pos, _ in enumerate(self.local_clusters[slide]):
             # row of X (and entry of labels) belonging to this local cluster
             global_label = labels[self.gloabl_feature_vectors_index[slide][local_pos]]
             self.global_clusters[slide][global_label].append(local_pos)
             self.global_cluster_labels[slide][local_pos] = global_label
Example #22
0
    def _spectral_clustering(self,samples):
        """Segment the image ``samples`` into four regions and plot them.

        The segmentation is run twice, once per label-assignment strategy
        ('kmeans', then 'discretize').  Each run writes its labels to
        ``spectral_result.csv`` under ``OUTPUT_DIR`` (the second run
        overwrites the first), prints them, and draws the region contours
        over the image in a new figure.  All figures are shown at the end.

        Raises SkipTest when ``sp_version`` is older than (0, 12).
        """
        if sp_version < (0, 12):
            raise SkipTest("Skipping because SciPy version earlier than 0.12.0 and "
                   "thus does not include the scipy.misc.face() image.")

        # Convert the image into a graph with the value of the gradient on the
        # edges.
        graph = image.img_to_graph(samples)


        # Take a decreasing function of the gradient: an exponential
        # The smaller beta is, the more independent the segmentation is of the
        # actual image. For beta=1, the segmentation is close to a voronoi
        beta = 5
        eps = 1e-6
        graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps

        # Apply spectral clustering (this step goes much faster if you have pyamg
        # installed)
        N_REGIONS = 4

        #############################################################################
        # Visualize the resulting regions

        for assign_labels in ('kmeans', 'discretize'):
            t0 = time.time()
            labels = spectral_clustering(graph, n_clusters=N_REGIONS,
                                         assign_labels=assign_labels, random_state=1)
            # One label per pixel, dumped as a single-column table.
            sample=pd.DataFrame(labels)
            sample.to_csv(os.path.join(OUTPUT_DIR, "spectral_result.csv"),sep=",")
            # Taken after the CSV write, so the reported time includes it.
            t1 = time.time()
            #classif=labels.fit(samples)
            #print classif
            print labels
            print sample
            
            # Back from the flat label vector to the image layout.
            labels = labels.reshape(samples.shape)

            plt.figure(figsize=(5, 5))
            plt.imshow(samples, cmap=plt.cm.gray)
            # NOTE(review): ``contours=1`` and ``plt.cm.spectral`` presumably
            # target an old matplotlib release - confirm they are still accepted.
            for l in range(N_REGIONS):
                plt.contour(labels == l, contours=1,
                            colors=[plt.cm.spectral(l / float(N_REGIONS))])
            # Hide the axis ticks.
            plt.xticks(())
            plt.yticks(())
            title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0))
            print(title)
            plt.title(title)
        plt.show() 
Example #23
0
def clust(vectorfile,matrixfile,clusted):
    """Cluster items from a sparse similarity file and dump the assignment.

    Parameters
    ----------
    vectorfile : path of a tab-separated file, ``id<TAB>field...`` per line.
    matrixfile : path of a tab-separated file, ``id1<TAB>id2<TAB>similarity``
        per line; shorter lines are skipped.
    clusted : unused.

    A unit self-similarity is added for every id, the matrix is clustered
    into 550 groups and two files are written: ``normal-data.txt`` (fields
    plus cluster number) and ``spectal_easy-data-550.txt`` (fields, id and
    cluster number).
    """
    fid2fname = {}
    with open(vectorfile) as vectors:
        for line in vectors:
            fields = line.strip().split('\t')
            fid2fname.setdefault(int(fields[0]), fields[1:])

    N = len(fid2fname)
    rowlist = []
    collist = []
    datalist = []
    with open(matrixfile) as matrix:
        for line in matrix:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            f1, f2, sim = fields[:3]
            rowlist.append(int(f1))
            collist.append(int(f2))
            datalist.append(float(sim))

    # every item is fully similar to itself
    for fid in fid2fname:
        rowlist.append(int(fid))
        collist.append(int(fid))
        datalist.append(1.0)

    row = np.array(rowlist)
    col = np.array(collist)
    data = np.array(datalist)
    graph = coo_matrix((data, (row, col)), shape=(N, N))

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters=550, eigen_solver='arpack')

    cluster2fid = {}
    for index, lab in enumerate(labels):
        cluster2fid.setdefault(lab, []).append(index)

    # Context managers guarantee that both result files are flushed/closed.
    with open("normal-data.txt", 'w') as normal_data:
        with open("spectal_easy-data-550.txt", 'w') as easy_data:
            # clusters are renumbered in dictionary iteration order
            for index, lab in enumerate(cluster2fid):
                for fid in cluster2fid[lab]:
                    strx = "".join(str(name) + "\t" for name in fid2fname[fid])
                    normal_data.write(strx+'\t'+str(index)+'\n')
                    easy_data.write(strx+'\t'+str(fid)+'\t'+str(index)+'\n')
Example #24
0
def clustering():
    cosMatrix_mat = sio.loadmat(
        '../data/result/cosMatrix.mat', struct_as_record=False,
        squeeze_me=True)['cosMatrix']
    userMatrix_mat = getFriendsMatrix()
    combinedMatrix_mat = userMatrix_mat + cosMatrix_mat
    clusterNumber = range(50,60)
    sims = []
    for c in clusterNumber:
        labels = spectral_clustering(
            combinedMatrix_mat, n_clusters=c, eigen_solver='arpack')
        sim = clusterSimilarity(combinedMatrix_mat, labels, c)
        sims.append(sim)
        print "{} cluster: average simi={}".format(c, sim)
    print sims
def main():
	"""Cluster users on discretised predicted ratings and draw the result.

	Reads 'preRatings.csv' and 'ratings.csv' from the current working
	directory, keeps the movie columns from position 8500 onwards of the
	sorted per-movie rating counts, maps the ratings to three levels,
	clusters the matrix from ``cs.getEMatrix`` into 20 groups and passes the
	original ratings of the same users/movies to ``dp.drawPicture``.
	"""
	src_path = os.path.join(os.getcwd(), 'ratings.csv')
	res_path = os.path.join(os.getcwd(), 'preRatings.csv')
	predicted_data = pd.read_csv(res_path, header = 0, index_col = 0)
	# Column labels are read as strings; convert them to integers.
	int_col = []
	for col in predicted_data.columns:
		icol = int(col)
		int_col.append(icol)
	predicted_data.columns = int_col
	# Number of non-missing ratings per movie column.
	movie_rated_num = pd.Series(index = predicted_data.columns)
	for i in predicted_data.columns.values:
		movie_rated_num[i] = predicted_data[i].dropna().count()
	# NOTE(review): presumably relies on the legacy in-place Series.sort();
	# confirm against the pandas version in use.
	movie_rated_num.sort()
	cuted_data = predicted_data.loc[ : , movie_rated_num[8500: ].index]
	print cuted_data.shape

	data_matrix = cuted_data.fillna(0).values

	# Discretise in place: > 3.5 -> 2, < 2.5 -> 0 (this includes the zeros
	# that replaced missing values), everything else -> 1.
	for i in range(0, len(data_matrix)):
		for j in range(0, len(data_matrix[i])):
			if data_matrix[i][j]>3.5:
				data_matrix[i][j] = 2
			elif data_matrix[i][j]<2.5:
				data_matrix[i][j] = 0
			else:
				data_matrix[i][j] = 1
	print data_matrix

	E_matrix = cs.getEMatrix(data_matrix)

	labels = spectral_clustering(E_matrix, n_clusters = 20)
	print labels
	# The string literal below is disabled code kept as a bare expression.
	'''
	init_data = pd.read_csv(src_path, header = 0, index_col = 0)
	# Cause the type of columns that read in csv is str, we need to convert it into int
	int_col = [int(col) for col in init_data.columns]
	init_data.columns = int_col
	init_data = init_data.loc[predicted_data.index, predicted_data.columns]
	init_data_matrix = init_data.fillna(0).values
	'''
	# Rebuild the user x movie table from the raw ratings, restricted to the
	# users and movies that were clustered above.
	columns = ['userID', 'movieID', 'rating', 'timestamp']
	ratings = pd.read_csv(src_path, header = 1, names = columns)
	data = ratings.pivot(index = 'userID', columns = 'movieID', values = 'rating')
	init_data = data.loc[cuted_data.index, cuted_data.columns]
	init_data_matrix = init_data.fillna(0).values
	dp.drawPicture(init_data_matrix, labels)

	'''
Example #26
0
def dist_matrix(axis, clusters):
    # if axis:
    #     re_shards = shards[:, non_nilled]
    # else:
    #     re_shards = shards
    # print re_shards.shape
    # c_list = np.split(re_shards, re_shards.shape[axis], axis)
    # accumulator_matrix = np.zeros((re_shards.shape[axis], re_shards.shape[axis]))
    # est_len = re_shards.shape[axis]*(re_shards.shape[axis]-1)/2
    # for i, (i_a, i_b) in enumerate(combinations(range(0, re_shards.shape[axis]), 2)):
    #     if not i%100:
    #         pl = "{0:0.2f}".format(i/float(est_len)*100.0)
    #         print pl, '%'
    #     a = c_list[i_a]
    #     b = c_list[i_b]
    #     dist = distance(a, b)
    #     accumulator_matrix[i_a, i_b] = dist
    #     accumulator_matrix[i_b, i_a] = dist
    #
    # dump(accumulator_matrix,open('loc_dump.dmp','w'))

    ##########################################################################################################

    pre_accumulator_matrix = load(open('loc_dump.dmp','r'))

    accumulator_matrix = np.exp( - pre_accumulator_matrix*pre_accumulator_matrix / pre_accumulator_matrix.std() )
    plt.imshow(accumulator_matrix, interpolation='nearest')
    plt.show()




    vals, vects =  eigh(accumulator_matrix)
    plt.hist(vals, 1000, log=True)
    vals[vals**2 < 0.3] = 0
    print vals
    # accumulator_matrix = np.dot(vects, np.dot(np.diag(vals), vects.T))
    plt.show()

    labels = spectral_clustering(accumulator_matrix, n_clusters=clusters, eigen_solver='arpack')
    print labels
    stable_mappings = crible(10, labels, non_nilled)
    print 'stable mappings redundancy:', len(stable_mappings), len(set(stable_mappings))
    srt_idx = hierchical_clustering(accumulator_matrix, labels)

    dump((stable_mappings, accumulator_matrix, srt_idx, non_nilled), open('loc_dump2.dmp','w'))
Example #27
0
def SpectralClusterImage(input_image, beta=5, eps=1e-6, n_regions=11, assign_labels='discretize',downsample_factor=np.NaN, order=3):
    """ Spectral Cluster an image
        Inputs:
            input_image: ndarray of image
            beta: Take a decreasing function of the gradient: an exponential
                The smaller beta is, the more independent the segmentation is of
                the actual image. For beta=1, the segmentation is close to a
                voronoi. Default is 5.
            eps: error term. Default is 1E-6
            n_regions: number of regions to decompose into. Default is 11.
            assign_labels: ways of decomposition. Selecting from 'discretize' and
                'kmeans'. Default is 'discretize'.
            downsample_factor: downsampling before spectral decomposition. Default
                is to keep the original sampling. Enter a single number to apply
                the kernel for both dimensions of the image, or enter as a sequence
                to apply different kernel for each dimension
            order: downsampling method, order of B-spline interpolation
        Returns nothing; the regions are drawn as contours on a new figure.
    """
    # Downsample the image.  The factor may be a scalar or a sequence, so the
    # NaN test is reduced with np.any, and the zoomed image replaces the input
    # (its result used to be thrown away).
    if not np.any(np.isnan(np.asarray(downsample_factor, dtype=float))):
        input_image = zoom(input_image, zoom=downsample_factor, order=order)
    # Convert the image into a graph with the value of the gradient on the edges
    graph = image.img_to_graph(input_image)
    # Take a decreasing function of the gradient: an exponential
    # The smaller beta is, the more independent the segmentation is of the
    # actual image. For beta=1, the segmentation is close to a voronoi
    graph.data = np.exp(-beta * graph.data / input_image.std()) + eps
    # Apply spectral clustering (this step goes much faster if you have pyamg
    # installed), honouring the caller's label-assignment strategy.
    labels = spectral_clustering(graph, n_clusters=n_regions,
                                 assign_labels=assign_labels)
    labels = labels.reshape(input_image.shape)
    # Visualizing the resulting regions
    pl.figure(figsize=(5,5))
    pl.imshow(input_image, cmap=pl.cm.gray)
    # NOTE(review): ``contour=`` / ``color=`` are kept as written; contour()
    # presumably expects ``colors=`` - confirm against the matplotlib in use.
    for lb in range(n_regions):
        pl.contour(labels == lb, contour=1,
                   color=[pl.cm.spectral(lb / float(n_regions)), ])
    # Get rid of x, y tick marks
    pl.xticks(())
    pl.yticks(())



                            
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    clustering the tag/terms and return the cluster ids for each tag
    :param alpha: parameter to combine visual and textual similarity matrix
        (weight of the visual matrix; the textual one gets ``1 - alpha``)
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag

    Side effects: saves the ids to "Corel5k/concepts_ids.pkl" and the words
    of every cluster to "Corel5k/cluster_contents.pkl", and prints each
    cluster.
    """
    vis_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_textual_similarity_matrix.pkl")

    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # obtain cluster IDs for each word
    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat, n_clusters=num_clusters, eigen_solver='arpack')
    print("Done...")
    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # For every cluster
    for cluster in range(0, num_clusters):
        # print the cluster number
        print("\nCluster %d" % cluster)
        # Collect the words of that cluster in one pass over the mapping;
        # indexing .values()/.keys() copied both lists on every step and
        # does not work on Python 3 dictionary views.
        r_words = [word for word, cluster_id in word_centroid_map.items()
                   if cluster_id == cluster]

        print (r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents, "Corel5k/cluster_contents.pkl")

    return cluster_ids
Example #29
0
def spectral_cluster(G, node_list):
    """Pick a cluster count for ``G`` by watching the summed cut/volume ratio.

    ``G`` is a similarity graph; ``node_list`` fixes the node order of its
    sparse matrix.  The graph is clustered with 2, 3, ... clusters; for each
    partition the sum over clusters of (weight leaving the cluster) /
    (column sums of its nodes) is computed.  As soon as that sum rises above
    the previous, non-zero one (or at 99 clusters) the partition of the
    PREVIOUS round is returned as
    ``{"result_by_node": node -> label, "result_by_cluster": label -> nodes}``.
    """
    S = nx.to_scipy_sparse_matrix(G, nodelist=node_list)
    all_nodes = set(node_list)
    # First position of every node, i.e. what node_list.index() would return.
    position = {}
    for pos, node in enumerate(node_list):
        position.setdefault(node, pos)

    previous_sum_cut = 0
    previous_cluster_node = {}
    previous_cluster_label = {}
    for i in range(2, 100):
        labels = spectral_clustering(S, n_clusters=i).tolist()
        result_cluster_node = dict(zip(node_list, labels))
        result_cluster_label = {}
        for node in result_cluster_node:
            result_cluster_label.setdefault(result_cluster_node[node], set()).add(node)

        sum_cut = 0
        for k in result_cluster_label:
            members = result_cluster_label[k]
            # complement of the cluster; the same for all of its nodes
            outside = all_nodes.difference(members)
            cut_k = 0
            vol_k = 0
            for nk in members:
                vol_k += csr_matrix.sum(S.getcol(position[nk]))
                for notk in outside:
                    cut_k += G.get_edge_data(nk, notk, default={"weight": 0})["weight"]
            sum_cut += (cut_k/vol_k)

        # chained comparison: the sum grew AND the previous sum was non-zero
        if sum_cut > previous_sum_cut != 0 or i == 99:
            print(i, sum_cut, result_cluster_label)
            return {"result_by_node": previous_cluster_node, "result_by_cluster": previous_cluster_label}

        previous_cluster_node = result_cluster_node
        previous_cluster_label = result_cluster_label
        previous_sum_cut = sum_cut
Example #30
0
def defficient_spectral_clustring(name_dict, z_depth, shape_dict):
    # requires to re-implement the distance definition on sparse images.

    z_stack = next(name_dict.itervalues())

    _3D_chan1 = np.zeros((shape_dict[1][0], shape_dict[1][1], z_depth))
    _3D_chan2 = np.zeros((shape_dict[2][0], shape_dict[2][1], z_depth))

    for depth, bi_image in z_stack.iteritems():
        img1 = bi_image[1]
        img2 = bi_image[2]

        _3D_chan1[:, :, depth-1] = img1
        _3D_chan2[:, :, depth-1] = img2

    # mlab.pipeline.volume(mlab.pipeline.scalar_field(_3D_chan1), vmin=0.1)
    # mlab.show()

    _3D_chan2[_3D_chan2<0.04] = 0

    mlab.pipeline.volume(mlab.pipeline.scalar_field(_3D_chan2))
    mlab.show()

    mask = _3D_chan2.astype(bool)
    img = _3D_chan2.astype(float)

    graph = image.img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    print graph.shape
    print len(graph.nonzero()[0])

    clusters = 4
    labels = spectral_clustering(graph, n_clusters = clusters, eigen_solver='arpack')
    label_im = -np.ones(mask.shape)
    label_im[mask] = labels

    for i in range(0, clusters):
        re_img = copy(_3D_chan2)
        re_img[label_im!=i] = 0
        mlab.pipeline.volume(mlab.pipeline.scalar_field(re_img))
        mlab.show()