Example #1
0
File: test.py  Project: LKF10051/ML
def myCKDemo(filename, n):
    """Cluster the rows of a CSV file into ``n`` groups with k-means,
    report the mean silhouette coefficient, and draw a scatter plot of
    the points colored by cluster id.

    :param filename: path to a comma-separated data file
    :param n: number of clusters
    """
    # Feature columns used for clustering (0-based column indices).
    data = np.loadtxt(filename, delimiter=",", usecols=(2, 4, 14, 8))
    # Columns 2 and 4 hold the city coordinates, used only for plotting.
    xy = np.loadtxt(filename, delimiter=",", usecols=(2, 4))
    # clustermap assigns a cluster id to every row of `data`.
    clustermap = pc.kcluster(data, n)[0]
    # Centroid coordinates of each cluster.
    centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
    # Pairwise distance matrix; indexed as m[j][i] with i < j below.
    m = pc.distancematrix(data)

    # mass[c] = number of points assigned to cluster c.
    mass = np.zeros(n)
    for c in clustermap:
        mass[c] += 1

    # sil[i, c] accumulates the summed distance from point i to cluster c.
    sil = np.zeros(n * len(data))
    sil.shape = (len(data), n)

    for i in range(0, len(data)):
        for j in range(i + 1, len(data)):
            d = m[j][i]
            sil[i, clustermap[j]] += d
            sil[j, clustermap[i]] += d

    # Convert summed distances into per-cluster average distances.
    for i in range(0, len(data)):
        sil[i, :] /= mass

    # Silhouette coefficient: a value in [-1, 1]; larger means better
    # separated clusters. Below 0 means a point is on average closer to
    # another cluster than to its own.
    s = 0
    for i in range(0, len(data)):
        c = clustermap[i]
        a = sil[i, c]
        # Nearest other cluster: minimum average distance over c' != c.
        # (The original Python 2 code concatenated two range() lists,
        # which is a TypeError on Python 3.)
        b = min(sil[i, j] for j in range(n) if j != c)
        si = (b - a) / max(b, a)
        s += si

    print(n, s / len(data))

    # Scatter plot of the cities at their coordinates, colored by cluster.
    fig, ax = pl.subplots()
    # cmap distinguishes the different cluster ids by color.
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    x = [list(d)[0] for d in xy]
    y = [list(d)[1] for d in xy]
    cax = ax.scatter(x, y, c=clustermap, s=30, cmap=cmap, vmin=0, vmax=n)
    pl.show()
Example #2
0
File: utils.py  Project: mac389/clinic
def silhouette(data, k=5, shuffle = True, shufflecount = 100):
	"""Compute silhouette coefficients for 2..k-1 clusters, plus a null
	distribution obtained by re-clustering shuffled copies of the data.

	Assumes ``data`` is a matrix with variables in rows and dimensions in
	columns; it is transposed before clustering.

	:returns: {nclus: {'data': coefficient, 'distribution': [coefficients]}}
	"""
	coefficients = {}
	data = data.transpose()
	for nclus in range(2, k):
		clustermap = pc.kcluster(data, nclusters=nclus, npass=50)[0]
		centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
		m = pc.distancematrix(data)
		res = [silhouette_coefficient(m, clustermap, nclus, data.shape)]

		for _ in range(shufflecount):
			# Shuffle a copy so the caller's array stays intact. The
			# original aliased `data` (mutating the input) and relied on
			# map() for side effects, which is a no-op on Python 3.
			dat = data.copy()
			for row in dat:
				np.random.shuffle(row)
			clustermap = pc.kcluster(dat, nclusters=nclus, npass=50)[0]
			centroids = pc.clustercentroids(dat, clusterid=clustermap)[0]

			# Distance matrix -- well, it's a list of rows actually.
			m = pc.distancematrix(dat)

			# Append the scalar itself so 'distribution' is a flat list
			# (the original wrapped each value in a one-element list,
			# inconsistent with res[0]).
			res.append(silhouette_coefficient(m, clustermap, nclus, dat.shape))
		coefficients[nclus] = {'data': res[0], 'distribution': res[1:]}
	return coefficients
Example #3
0
def Kmedoids(num_patches, samples, progress=None):
  """Derive prototype patches from example data via k-Medoids clustering.

  Requires the `Pycluster` package to be installed.

  :param int num_patches: how many patches (clusters) to produce
  :type samples: 2D array
  :param samples: example patches, one per row
  :param progress: unused; accepted for interface compatibility
  :rtype: 2D array with `num_patches` rows and as many columns as `samples`
  :return: the medoid rows of `samples`

  """
  logging.info("Learning %d prototypes per size by k-Medoids clustering" %
      num_patches)
  import Pycluster
  pairwise = Pycluster.distancematrix(samples)
  ids, _, _ = Pycluster.kmedoids(pairwise, nclusters=num_patches)
  # Each entry of `ids` is the row index of its cluster's medoid, so the
  # unique values pick out exactly `num_patches` rows of `samples`.
  medoid_rows = np.unique(ids)
  return samples[medoid_rows].astype(ACTIVATION_DTYPE)
Example #4
0
def Kmedoids(num_patches, samples, progress=None):
    """Build prototype patches by k-Medoids clustering of the samples.

  The `Pycluster` library must be installed for this to work.

  :param int num_patches: number of patches (clusters) to create
  :type samples: 2D array
  :param samples: example patches, one per row
  :param progress: unused; kept for interface compatibility
  :rtype: 2D array with `num_patches` rows and as many columns as `samples`
  :return: the medoid rows selected from `samples`

  """
    logging.info("Learning %d prototypes per size by k-Medoids clustering" %
                 num_patches)
    import Pycluster
    distances = Pycluster.distancematrix(samples)
    medoid_ids, _, _ = Pycluster.kmedoids(distances, nclusters=num_patches)
    # `medoid_ids` holds, for every sample, the row index of its cluster's
    # medoid; its unique values are the `num_patches` chosen prototypes.
    chosen = np.unique(medoid_ids)
    return samples[chosen].astype(ACTIVATION_DTYPE)
Example #5
0
File: ch13_lst1.py  Project: CeasarSS/books
import Pycluster as pc
import numpy as np
import sys

# Read data filename and desired number of clusters from command line
filename, n = sys.argv[1], int( sys.argv[2] )

data = np.loadtxt( filename )

# Perform clustering and find centroids
clustermap, _, _ = pc.kcluster( data, nclusters=n, npass=50 )
centroids, _ = pc.clustercentroids( data, clusterid=clustermap )

# Obtain distance matrix (indexed as m[j][i] with i < j below)
m = pc.distancematrix( data )

# Find the masses of all clusters: mass[c] = number of points in cluster c
mass = np.zeros( n )
for c in clustermap:
    mass[c] += 1

# Create a matrix for individual silhouette coefficients:
# sil[i, c] will accumulate the summed distance from point i to cluster c
sil = np.zeros( n*len(data) )
sil.shape = ( len(data), n )

# Evaluate the distance for all pairs of points
# NOTE(review): this example appears truncated here -- the loop body
# continues in the original source file.
for i in range( 0, len(data) ):
    for j in range( i+1, len(data) ):
        d = m[j][i]
Example #6
0
File: sortUtils.py  Project: mac389/brainpy
def cluster(data, threshold = 0.5,method='sk', preprocess=True):
	"""Cluster `data` row-wise, sweeping the number of clusters upward.

	:param data: 2D array, one observation per row
	:param threshold: silhouette threshold that stops the 'pyclus' sweep
	:param method: 'sk' (scikit-learn K-means) or 'pyclus' (C Clustering
		library via Pycluster)
	:param preprocess: if True, scale each column by its range -- NOTE:
		this modifies the caller's array in place
	:returns: for 'sk', a tuple (models, silhouette scores); for 'pyclus',
		a list of per-iteration dicts with the distance matrix,
		silhouettes, and cluster ids
	"""
	length = len(data)
	print(data.shape)
	nclus = 2
	nclusmax = 15
	sil = [-1]
	models = []
	if preprocess == True:
		print('Preprocessing by scaling each row by its range')
		# In-place scaling: each column divided by its (max - min).
		data /= (amax(data, axis=0) - amin(data, axis=0))[newaxis, :]
		print('Now to cluster')
	if method == 'sk':
		print('Clustering using Scikits K-means implementation')
		print("This option returns a tuple of")
		print("\t\t (kmeans object, silhouette coefficients)")
		while nclus < nclusmax:  # average(sil[-1]) < threshold and
			model = KMeans(init='k-means++', n_clusters=nclus)
			# Assume data is properly preprocessed
			model.fit(data)
			labels = model.labels_
			# <-- can only sample this in chunks of 100
			print(data.shape)
			print('Calculating silhouette_score ')
			sil.append(silhouette_score(data, labels, metric='euclidean'))
			models.append(model)
			print('For %d clusters, the silhouette coefficient is %.03f' % (nclus, sil[-1]))
			nclus += 1
		return (models, sil)
	elif method == 'pyclus':
		import Pycluster as pc
		print('Clustering using the C Clustering library')
		print('This option returns a dictionary with the distance matrix, silhouettes, and clusterids for each iteration.')
		res = []
		sil_co_one = 1
		sil_co = [1]
		# Keep adding clusters until the silhouette drops below threshold.
		while sil_co_one > threshold and nclus < nclusmax:
			print('No. of clus: %d' % nclus)
			print('Before kcluster')
			clustermap, _, _ = pc.kcluster(data, nclusters=nclus, npass=50)
			print('After kcluster')
			centroids, _ = pc.clustercentroids(data, clusterid=clustermap)
			print('After centroids')

			# Pairwise distance matrix; indexed m[j][i] with i < j below.
			m = pc.distancematrix(data)

			print('Finding mass')
			# Find the masses of all clusters: mass[c] = points in cluster c
			mass = zeros(nclus)
			for c in clustermap:
				mass[c] += 1

			# sil[i, c] accumulates summed distance from point i to cluster c
			sil = zeros((len(data), nclus))

			print('Evaluating pairwise distance')
			# Evaluate the distance for all pairs of points
			# (xrange in the original is Python 2 only)
			for i in range(0, length):
				for j in range(i + 1, length):
					d = m[j][i]

					sil[i, clustermap[j]] += d
					sil[j, clustermap[i]] += d

			# Average over cluster sizes
			for i in range(0, len(data)):
				sil[i, :] /= mass

			print('Sil co')
			# Evaluate the silhouette coefficient of the whole clustering
			s = 0
			for i in range(0, length):
				c = clustermap[i]
				a = sil[i, c]
				# Closest other cluster; the original concatenated two
				# range() lists, which is a TypeError on Python 3.
				b = min(sil[i, j] for j in range(nclus) if j != c)
				si = (b - a) / max(b, a)  # silhouette coefficient of point i
				s += si

			nclus += 1
			sil_co.append(s / length)
			sil_co_one = s / length
			print('Sil co %.02f' % sil_co_one)
			res.append({'clustermap': clustermap,
						'centroids': centroids,
						'distances': m,
						'mass': mass,
						'silhouettes': sil_co})
		return res
Example #7
0
import Pycluster as pc
import numpy as np
import sys

# Read data filename and desired number of clusters from command line
filename, n = sys.argv[1], int(sys.argv[2])

data = np.loadtxt(filename)

# Perform clustering and find centroids
clustermap, _, _ = pc.kcluster(data, nclusters=n, npass=50)
centroids, _ = pc.clustercentroids(data, clusterid=clustermap)

# Obtain distance matrix (indexed as m[j][i] with i < j below)
m = pc.distancematrix(data)

# Find the masses of all clusters: mass[c] = number of points in cluster c
mass = np.zeros(n)
for c in clustermap:
    mass[c] += 1

# Create a matrix for individual silhouette coefficients:
# sil[i, c] will accumulate the summed distance from point i to cluster c
sil = np.zeros(n * len(data))
sil.shape = (len(data), n)

# Evaluate the distance for all pairs of points
# NOTE(review): this example appears truncated here -- the loop body
# continues in the original source file.
for i in range(0, len(data)):
    for j in range(i + 1, len(data)):
        d = m[j][i]

        sil[i, clustermap[j]] += d