def myCKDemo(filename, n):
    """Cluster rows of a CSV file into ``n`` groups with k-means (Pycluster),
    print the mean silhouette coefficient, and scatter-plot the points
    colored by cluster id.

    :param filename: path to a comma-separated data file
    :param n: number of clusters
    """
    # Columns used as clustering features (0-based column indices).
    data = np.loadtxt(filename, delimiter=",", usecols=(2, 4, 14, 8))
    # Columns holding the coordinates used for the final scatter plot.
    xy = np.loadtxt(filename, delimiter=",", usecols=(2, 4))
    # clustermap[i] is the cluster id assigned to row i.
    clustermap = pc.kcluster(data, n)[0]
    # Centroid coordinates of each cluster (kept for inspection; unused below).
    centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
    # Lower-triangular pairwise distance matrix (list of rows; m[j][i], j > i).
    m = pc.distancematrix(data)

    # mass[c] = number of points assigned to cluster c.
    mass = np.zeros(n)
    for c in clustermap:
        mass[c] += 1

    # sil[i, c] accumulates the total distance from point i to the members of
    # cluster c; after dividing by mass it becomes the average distance.
    sil = np.zeros((len(data), n))
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            d = m[j][i]
            sil[i, clustermap[j]] += d
            sil[j, clustermap[i]] += d
    for i in range(len(data)):
        sil[i, :] /= mass

    # Silhouette coefficient: lies in [-1, 1], larger is better.  A negative
    # value means the point is on average closer to some other cluster than
    # to its own, i.e. the clustering is poor for that point.
    s = 0
    for i in range(len(data)):
        c = clustermap[i]
        a = sil[i, c]  # mean distance to own cluster
        # All clusters except the point's own (Python 3 ranges cannot be
        # concatenated with `+`, hence the explicit index list).
        others = [j for j in range(n) if j != c]
        b = min(sil[i, others])  # mean distance to the nearest other cluster
        s += (b - a) / max(b, a)
    print(n, s / len(data))

    # Scatter plot of the points, colored by cluster id.
    fig, ax = pl.subplots()
    # cmap distinguishes the different clusters by color.
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    # xy holds the plotting coordinates for each row.
    x = [list(d)[0] for d in xy]
    y = [list(d)[1] for d in xy]
    cax = ax.scatter(x, y, c=clustermap, s=30, cmap=cmap, vmin=0, vmax=n)
    pl.show()
def silhouette(data, k=5, shuffle=True, shufflecount=100):
    """Compute silhouette coefficients for k-means clusterings of ``data``
    for every cluster count in ``range(2, k)``, together with a null
    distribution obtained by clustering shuffled copies of the data.

    Assumes ``data`` is a matrix with variables in rows and dimensions in
    columns; it is transposed before clustering.

    :param data: 2D array-like with a ``transpose``/``copy`` interface
    :param k: scan cluster counts 2 .. k-1
    :param shuffle: currently ignored — TODO confirm intended use
    :param shufflecount: number of shuffled replicates per cluster count
    :returns: dict mapping cluster count -> {'data': coefficient of the real
              data, 'distribution': list of coefficients from shuffled data}
    """
    coefficients = {}
    data = data.transpose()
    for nclus in range(2, k):
        clustermap = pc.kcluster(data, nclusters=nclus, npass=50)[0]
        # Distance matrix -- well, it's a list actually.
        m = pc.distancematrix(data)
        res = [silhouette_coefficient(m, clustermap, nclus, data.shape)]
        for _ in range(shufflecount):
            # Shuffle a *copy*: aliasing the original here would mutate the
            # real data and corrupt every later iteration.  A plain
            # `map(np.random.shuffle, dat)` is also a lazy no-op on
            # Python 3, so shuffle each row explicitly.
            dat = data.copy()
            for row in dat:
                np.random.shuffle(row)
            clustermap = pc.kcluster(dat, nclusters=nclus, npass=50)[0]
            m = pc.distancematrix(dat)
            # Append the coefficient itself (not a one-element list) so the
            # distribution entries have the same shape as res[0].
            res.append(silhouette_coefficient(m, clustermap, nclus, dat.shape))
        coefficients[nclus] = {'data': res[0], 'distribution': res[1:]}
    return coefficients
def Kmedoids(num_patches, samples, progress=None):
    """Estimate patches as centroids of samples using k-Medoids.

    This requires the `Pycluster` library to be installed.

    :param int num_patches: number of patches to create
    :type samples: 2D array
    :param samples: example patches
    :param progress: ignored
    :rtype: 2D array with `num_patches` rows and N columns, where N is the
       number of columns in `samples`.
    :return: created patches
    """
    logging.info("Learning %d prototypes per size by k-Medoids clustering" % num_patches)
    import Pycluster
    # Pairwise distances between all sample rows.
    pairwise = Pycluster.distancematrix(samples)
    labels = Pycluster.kmedoids(pairwise, nclusters=num_patches)[0]
    # Each cluster is labelled by the index of its medoid row, so the set of
    # unique labels identifies the `num_patches` medoid samples themselves.
    medoid_rows = np.unique(labels)
    return samples[medoid_rows].astype(ACTIVATION_DTYPE)
import Pycluster as pc
import numpy as np
import sys

# Read data filename and desired number of clusters from command line
filename, n = sys.argv[1], int( sys.argv[2] )
data = np.loadtxt( filename )

# Perform clustering and find centroids
clustermap, _, _ = pc.kcluster( data, nclusters=n, npass=50 )
centroids, _ = pc.clustercentroids( data, clusterid=clustermap )

# Obtain distance matrix (Pycluster returns a lower-triangular structure,
# indexed as m[j][i] with j > i)
m = pc.distancematrix( data )

# Find the masses of all clusters: mass[c] = number of points in cluster c
mass = np.zeros( n )
for c in clustermap:
    mass[c] += 1

# Create a matrix for individual silhouette coefficients:
# sil[i, c] will hold distances from point i to cluster c
sil = np.zeros( n*len(data) )
sil.shape = ( len(data), n )

# Evaluate the distance for all pairs of points
# NOTE(review): this chunk appears truncated — the accumulation into `sil`
# that should follow `d = m[j][i]` is missing from this view; confirm
# against the full script.
for i in range( 0, len(data) ):
    for j in range( i+1, len(data) ):
        d = m[j][i]
def cluster(data, threshold=0.5, method='sk', preprocess=True):
    """Scan cluster counts from 2 up to 14 and score each clustering by its
    silhouette coefficient.

    :param data: 2D array of observations (rows) by features (columns)
    :param threshold: silhouette threshold; the 'pyclus' scan stops once the
        coefficient drops to or below it
    :param method: 'sk' -> scikit-learn KMeans; 'pyclus' -> C Clustering
        library (Pycluster)
    :param preprocess: if True, scale the data by the per-column range first
    :returns: for 'sk', a tuple (list of fitted KMeans models, silhouette
        scores); for 'pyclus', a list of per-iteration dicts with the
        distance matrix, silhouettes and cluster ids
    """
    length = len(data)
    print(data.shape)
    nclus = 2
    nclusmax = 15
    sil = [-1]
    models = []
    if preprocess == True:
        print('Preprocessing by scaling each row by its range')
        # NOTE(review): despite the message, this scales each *column* by its
        # range (amax/amin over axis=0) — confirm which is intended.
        data /= (amax(data, axis=0) - amin(data, axis=0))[newaxis, :]
    print('Now to cluster')
    if method == 'sk':
        print('Clustering using Scikits K-means implementation')
        print("This option returns a tuple of")
        print("\t\t (kmeans object, silhouette coefficients)")
        while nclus < nclusmax:  # average(sil[-1]) < threshold and
            model = KMeans(init='k-means++', n_clusters=nclus)
            # Assume data is propery preprocessed
            model.fit(data)
            labels = model.labels_  # <-- can only sample this in chunks of 100
            print(data.shape)
            print('Calculating silhouette_score ')
            sil.append(silhouette_score(data, labels, metric='euclidean'))
            models.append(model)
            print('For %d clusters, the silhouette coefficient is %.03f' % (nclus, sil[-1]))
            nclus += 1
        return (models, sil)
    elif method == 'pyclus':
        import Pycluster as pc
        print('Clustering using the C Clustering library')
        print('This option returns a dictionary with the distance matrix, silhouettes, and clusterids for each iteration.')
        res = []
        sil_co_one = 1
        sil_co = [1]  # Assume
        while sil_co_one > threshold and nclus < nclusmax:
            print('No. of clus: %d' % nclus)
            print('Before kcluster')
            clustermap, _, _ = pc.kcluster(data, nclusters=nclus, npass=50)
            print('After kcluster')
            centroids, _ = pc.clustercentroids(data, clusterid=clustermap)
            print('After centroids')
            m = pc.distancematrix(data)
            print('Finding mass')
            # Find the masses of all clusters
            mass = zeros(nclus)
            for c in clustermap:
                mass[c] += 1
            # Create a matrix for individual silhouette coefficients
            sil = zeros((len(data), nclus))
            print('Evaluating pairwise distance')
            # Evaluate the distance for all pairs of points
            for i in range(0, length):
                for j in range(i + 1, length):
                    d = m[j][i]
                    sil[i, clustermap[j]] += d
                    sil[j, clustermap[i]] += d
            # Average over cluster
            for i in range(0, len(data)):
                sil[i, :] /= mass
            print('Sil co')
            # Evaluate the silhouette coefficient
            s = 0
            for i in range(0, length):
                c = clustermap[i]
                a = sil[i, c]
                # All clusters other than the point's own (Python 3 ranges
                # cannot be concatenated with `+`, hence the list() calls).
                b = min(sil[i, list(range(0, c)) + list(range(c + 1, nclus))])
                si = (b - a) / max(b, a)  # silhouette coefficient of point i
                s += si
            nclus += 1
            sil_co.append(s / length)
            sil_co_one = s / length
            print('Sil co %.02f' % sil_co_one)
            res.append({'clustermap': clustermap,
                        'centroids': centroids,
                        'distances': m,
                        'mass': mass,
                        'silhouettes': sil_co})
        return res
import Pycluster as pc
import numpy as np
import sys

# Read data filename and desired number of clusters from command line
filename, n = sys.argv[1], int(sys.argv[2])
data = np.loadtxt(filename)

# Perform clustering and find centroids
clustermap, _, _ = pc.kcluster(data, nclusters=n, npass=50)
centroids, _ = pc.clustercentroids(data, clusterid=clustermap)

# Obtain distance matrix (Pycluster returns a lower-triangular structure,
# indexed as m[j][i] with j > i)
m = pc.distancematrix(data)

# Find the masses of all clusters: mass[c] = number of points in cluster c
mass = np.zeros(n)
for c in clustermap:
    mass[c] += 1

# Create a matrix for individual silhouette coefficients:
# sil[i, c] will hold distances from point i to cluster c
sil = np.zeros(n * len(data))
sil.shape = (len(data), n)

# Evaluate the distance for all pairs of points
# NOTE(review): this chunk appears truncated — only the symmetric update for
# point i is present; the matching `sil[j, clustermap[i]] += d` that should
# follow is missing from this view. Confirm against the full script.
for i in range(0, len(data)):
    for j in range(i + 1, len(data)):
        d = m[j][i]
        sil[i, clustermap[j]] += d