Example #1
import numpy as np
import Pycluster
from pandas import DataFrame


def findcenters(x, n=1000, k=6):
    # get dimensions
    m = x.shape[1]
    # create centers as empty
    centers = DataFrame(np.zeros(shape=(k, m)))

    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters=k, transpose=0, method="a", dist="e", npass=1)
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # sort centers by the distance to the origin
        center = sorted(center, key=lambda t: np.linalg.norm(np.array(t) - np.zeros(m)), reverse=True)

        # debug: distance to the origin and coordinates of each sorted center
        # for c in center:
        #     print np.linalg.norm(np.array(c)), np.array(c)
        # take the average
        for j in range(k):
            centers.iloc[j, :] = centers.iloc[j, :] + center[j]  # .iloc replaces the long-removed DataFrame.ix
    centers = centers / n
    return centers
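A minimal usage sketch (hedged: the random input below is a hypothetical stand-in for real data; findcenters only needs a 2-D float array):

import numpy as np

x = np.random.rand(200, 3)  # 200 samples, 3 features
avg_centers = findcenters(x, n=100, k=6)
print(avg_centers)

Because kcluster runs with npass=1 and a fresh random initialization on every iteration, sorting the centers by their norm before averaging keeps corresponding centers aligned across runs.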
Example #3
import scipy as sp
import Pycluster as pcl

def pyclustertest():

    data = sp.rand(100,4)
    cid,e,n = pcl.kcluster(data)
    # the original passed an undefined name D here; it should be data
    centroids,cmask = pcl.clustercentroids(data,clusterid=cid)

    print data
    print centroids
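Called with no keyword arguments, kcluster falls back to its defaults (to the best of my knowledge nclusters=2, npass=1, method='a', dist='e'), so this test splits the random data into two clusters; an explicit equivalent would be:

cid, e, n = pcl.kcluster(data, nclusters=2, npass=1, method='a', dist='e')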
Example #4
File: test.py Project: LKF10051/ML
def myCKDemo(filename,n):
    #The next two statements load the data; the columns used for the
    #cluster analysis are selected via usecols (0-indexed)
    data = np.loadtxt(filename, delimiter = "," ,usecols=(2,4,14,8))
    #These columns hold the coordinates used for the final scatter plot
    xy = np.loadtxt(filename, delimiter = "," ,usecols=(2,4))
    #clustermap is the clustering result: the cluster id of each data row
    clustermap = pc.kcluster(data, n)[0]
    #centroids holds the coordinates of the cluster centers
    centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
    #m is the distance matrix
    m = pc.distancematrix(data)
 
    #mass records the number of points in each cluster
    mass = np.zeros(n)
    for c in clustermap: 
        mass[c] += 1 
   
   
    #sil is the silhouette matrix; sil[i, c] accumulates the distance
    #from point i to the points of cluster c
    sil = np.zeros(n*len(data)) 
    sil.shape = ( len(data), n ) 
   
    for i in range( 0, len(data) ): 
        for j in range( i+1, len(data) ): 
            d = m[j][i] 
            sil[i, clustermap[j] ] += d 
            sil[j, clustermap[i] ] += d 
 
    for i in range(0,len(data)): 
        sil[i,:] /= mass 
   
    #The silhouette coefficient s evaluates clustering quality.
    #It ranges from -1 to 1; the larger the value, the better.
    #Below 0, a point's mean intra-cluster distance exceeds the distance
    #to the nearest other cluster, i.e. the clustering is poor.
    #Values close to 1 indicate a good clustering.
    s=0 
    for i in range( 0, len(data) ): 
        c = clustermap[i] 
        a = sil[i,c] 
        b = min(sil[i,range(0,c)+range(c+1,n)]) 
        si = (b-a)/max(b,a)
        s+=si 
   
    print n, s/len(data) 
   
    #Draw the scatter plot with matplotlib.
    fig, ax = pl.subplots()
    #cmap assigns a distinguishable color to each cluster
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    #xy holds the coordinates, used to draw each city at its geographic position
    x = [list(d)[0] for d in xy]   
    y = [list(d)[1] for d in xy] 
    cax = ax.scatter(x, y, c=clustermap, s=30, cmap=cmap, vmin=0, vmax=n)
    pl.show() 
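The loop above normalizes the accumulated distances by each cluster's full mass, which only approximates the textbook silhouette; a hedged cross-check with scikit-learn (which other examples on this page also use), following this example's pc alias for Pycluster and a random stand-in for the CSV data:

import numpy as np
from sklearn.metrics import silhouette_score

data = np.random.rand(100, 4)
labels = pc.kcluster(data, nclusters=3, npass=10)[0]
print(silhouette_score(data, labels, metric='euclidean'))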
Example #5
    def _G(self, data, K):
        labels, _, _ = Pycluster.kcluster(data.T, K)
        centers, _ = Pycluster.clustercentroids(data.T, clusterid=labels)
        centers = centers.T
        G = zeros((K, data.shape[1]))
        
        for k in range(K):
            D = data - expand_dims(centers[:, k], axis=1)
            G[k, :] = -sqrt(sum(multiply(D, D), axis=0))

        return G
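The loop computes, for every cluster k, the negative Euclidean distance from each sample (a column of data) to center k; a hedged vectorized equivalent using SciPy (a sketch, not from the original class):

from scipy.spatial.distance import cdist

G = -cdist(centers.T, data.T)  # shape (K, n_samples)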
Example #7
def reassignClusterIDs(src, dst):
    """
    Given the cluster centers for two clusterings, determine the centers most 
    similar to each other and reassign the cluster ids to match.
    """
    srcFCS = DataStore.getData()[src[0]]
    dstFCS = DataStore.getData()[dst[0]]
    
    srcdata = srcFCS.data
    if srcFCS.selDims:
        srcdata = dh.filterData(srcFCS.data, srcFCS.selDims)
    srcids = srcFCS.clustering[src[1]]
    srccenters = pc.clustercentroids(srcdata, clusterid=srcids)[0]
    
    dstdata = dstFCS.data
    if dstFCS.selDims:
        dstdata = dh.filterData(dstFCS.data, dstFCS.selDims)
    dstids = dstFCS.clustering[dst[1]]
    dstcenters = pc.clustercentroids(dstdata, clusterid=dstids)[0]
    
    srcsep = separate(srcdata, srcids)
    dstsep = separate(dstdata, dstids)

    centerEQ = {}
    taken = []
    # Fill the map with the closest source center for each destination center
    for i,dc in enumerate(dstcenters):
        bestDist = -1
        for j,sc in enumerate(srccenters):
            if (j not in taken):
                dist = nonSymmetricClusterDistance(dstsep[i], srcsep[j])
                if (bestDist < 0) or (dist < bestDist):
                    bestDist = dist
                    centerEQ[i] = j
        taken.append(centerEQ[i])
    
    # Renumber the cluster IDs in the destination to match the IDs of the closest src center
    tmp = [centerEQ[id] for id in dstids]
    DataStore.getData()[dst[0]].clustering[dst[1]] = tmp
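The greedy pass above can settle for a suboptimal global matching; a hedged alternative using the Hungarian algorithm (a sketch assuming the same nonSymmetricClusterDistance helper and separated clusters):

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[nonSymmetricClusterDistance(dstsep[i], srcsep[j])
                  for j in range(len(srccenters))]
                 for i in range(len(dstcenters))])
rows, cols = linear_sum_assignment(cost)  # minimizes the total distance
centerEQ = dict(zip(rows, cols))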
Example #8
def clustering(file_path, k, dist_measure, PLOT):
    """
    Do the K-means clustering for input data.

    @param file_path: Input data file.
    @param k: Number of centers in K-means algorithm.
    @param dist_measure: Distance measure (in this case, we use Manhattan distance).
    @param PLOT: Bool flag; whether to plot the result (set it to True only when testing).
    @return: Clusters id for all data points in the input data file.
    """

    data = numpy.genfromtxt(file_path, delimiter=',')

    if len(data.shape) == 1:
        return [-1]

    print "-- Processing file: " + file_path + "  -- Data points: " + str(len(data))
    print "-- Start clustering"

    k = set_k(len(data), k)
    ite_num = method_name(len(data))

    # Do the K-means clustering
    cluster_id, _, _ = Pycluster.kcluster(data, nclusters=k, mask=None, weight=None, transpose=0, npass=ite_num,
                                          method='a', dist=dist_measure, initialid=None)

    if PLOT is False:
        return cluster_id

    # Draw the clustering result plot.
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)

    if PLOT:
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)
    else:
        data_2d = data
        centroids_2d = centroids

    color = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
             '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC']

    for i in range(k):
        scatter(data_2d[cluster_id == i, 0], data_2d[cluster_id == i, 1], color=color[i % 12])

    plot(centroids_2d[:, 0], centroids_2d[:, 1], 'sg', markersize=8)
    show()

    return cluster_id
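A hedged usage sketch (set_k and method_name are project helpers not shown here; example 12 below inlines equivalent logic; the file name is hypothetical and 'b' is Pycluster's city-block/Manhattan distance code):

ids = clustering("2003_11.csv", k=-1, dist_measure="b", PLOT=False)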
Example #9
def silhouette(data, k=5, shuffle = True, shufflecount = 100):
	#assume that data is a matrix with variables in rows and dimensions in columns
	coefficients = {}
	data = data.transpose()
	for nclus in range(2,k):
		
		clustermap = pc.kcluster(data,nclusters=nclus,npass=50)[0]
		centroids = pc.clustercentroids(data,clusterid=clustermap)[0]
		m = pc.distancematrix(data)
		res = [silhouette_coefficient(m,clustermap,nclus,data.shape)]

		for _ in range(shufflecount):

			# shuffle a copy, not the original data (map() would also be lazy on Python 3)
			dat = data.copy()
			for row in dat:
				np.random.shuffle(row)
			clustermap = pc.kcluster(dat,nclusters=nclus,npass=50)[0]
			centroids = pc.clustercentroids(dat,clusterid=clustermap)[0]

			#distance matrix-- well it's a list actually
			m = pc.distancematrix(dat)

			res.append(silhouette_coefficient(m,clustermap,nclus,dat.shape))
		coefficients[nclus]={'data':res[0],'distribution':res[1:]}
	return coefficients
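A hedged usage sketch: the returned dict maps each cluster count to the real coefficient plus a null distribution computed from row-shuffled copies (the input below is hypothetical):

import numpy as np

data = np.random.rand(10, 200)  # variables in rows, as the comment above assumes
coeffs = silhouette(data, k=6, shufflecount=20)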
Example #10
	def __init__(self, numComps=None, dim=5, data=None, epsilon=math.pow(10, -10),
				 wishartScalar=1, wishartScale=None, dirichlet=None, normalMu=0,
				 normalSigma=None):
		# Python evaluates default arguments once, at definition time, so the
		# original defaults np.identity(dim) and np.ones(numComps) referenced
		# parameters that are not yet bound; build them here instead.
		if wishartScale is None:
			wishartScale = np.identity(dim)
		if dirichlet is None:
			dirichlet = np.ones(numComps)
		if normalSigma is None:
			normalSigma = np.identity(dim)

		# INITIALIZE ALL POSTERIOR PARAMETERS

		self.d = dim
		self.k = numComps
		self.n = len(data)

		# INITIALIZE ALL PRIOR PARAMETERS

		self.e = normalSigma
		self.m = normalMu
		self.w = wishartScale
		self.v = wishartScalar
		self.di = dirichlet
		self.epsilon = epsilon

		# INITIALIZE ALL PRIORS USING k-means CLUSTERING

		# INITIALIZE THE MUS

		labels, error, nfound = pc.kcluster(data, self.k)#, iter=300, thresh=1e-05)
	
		centroids, _ = pc.clustercentroids(data, clusterid=labels)

		self.mu = centroids

		self.pointsInComp = [[] for comp in xrange(self.k)]
		for n in xrange(self.n):
			self.pointsInComp[labels[n]].append(data[n])

		# INITIALIZE THE COVARIANCE MATRIX
		self.sigma = [np.cov(np.array(kpoints).T) for kpoints in self.pointsInComp]

		# INITIALIZE THE WEIGHTS
		# float() avoids Python 2 integer division yielding all-zero weights
		self.pi = [float(len(l)) / data.shape[0] for l in self.pointsInComp]
"""
Activity vector
"""
import numpy as np
import Pycluster

# 'point' and 'activity' are arrays defined earlier in the original script
# (the commented-out plot below suggests activity has at least 3 columns)
for i in range(0, len(point) - 1):
    activity[i] = point[i + 1] - point[i]
activity[len(activity) - 1] = activity[0] - activity[len(activity) - 1]
# print activity
np.savetxt("test.out", activity, delimiter=",")


"""
Cluster activity into k cluster
"""
number_of_cluster = 5
label, errors, nfound = Pycluster.kcluster(activity, number_of_cluster)
centroid, a = Pycluster.clustercentroids(activity, clusterid=label)

np.savetxt("label.out", label, delimiter=",")


# """
# plot clustered data
# """
# zl = m.ceil(min(activity[0])) - 10
# zh = m.ceil(max(activity[0])) + 10
# for i in range(0,len(label)):
# 	if(label[i]==0):
# 		ax.scatter(activity[i,0], activity[i,1], activity[i,2], s=200, marker='.', c='r')
# 	if(label[i]==1):
# 		ax.scatter(activity[i,0], activity[i,1], activity[i,2], s=200, marker='.', c='g')
# 	if(label[i]==2):
Example #12
def clustering(filepath, k, dist_measure, PLOT):

    data = np.genfromtxt(filepath, delimiter=",")

    ##    print data

    if len(data.shape) == 1:
        return [-1]

    print "-- Processing file: " + filepath + "  -- Data points: " + str(len(data))
    print "-- Start clustering"
    if k == -1:
        k = int(0.5 * len(data))

    if k > 5000:
        k = 5000

    if len(data) < 1000:
        ite_num = 10
    else:
        ite_num = 1

    clusterid, error, nfound = pc.kcluster(
        data,
        nclusters=k,
        mask=None,
        weight=None,
        transpose=0,
        npass=ite_num,
        method="a",
        dist=dist_measure,
        initialid=None,
    )

    if PLOT is False:
        return clusterid

    centroids, _ = pc.clustercentroids(data, clusterid=clusterid)

    ##    # make a plot
    ##    colors = ['red', 'green', 'blue']
    ##    plt.figure()
    ##    plt.xlim([data[:,0].min() - .5, data[:,0].max() + .5])
    ##    plt.ylim([data[:,1].min() - .5, data[:,1].max() + .5])
    ##    plt.xticks([], []); plt.yticks([], []) # numbers aren't meaningful
    ##
    ##    # show the centroids
    ##    plt.scatter(centroids[:,0], centroids[:,1], marker='o', c=colors, s=100)
    ##
    ##    # show user numbers, colored by their cluster id
    ##    for i, ((x,y), kls) in enumerate(zip(data, clusterid)):
    ##        #
    ##        plt.annotate('o', xy=(x,y), xytext=(0,0), textcoords='offset points', color=colors[kls])

    if PLOT:
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)
    else:
        data_2d = data
        centroids_2d = centroids

    color = [
        "#2200CC",
        "#D9007E",
        "#FF6600",
        "#FFCC00",
        "#ACE600",
        "#0099CC",
        "#8900CC",
        "#FF0000",
        "#FF9900",
        "#FFFF00",
        "#00CC01",
        "#0055CC",
    ]

    for i in range(k):
        scatter(data_2d[clusterid == i, 0], data_2d[clusterid == i, 1], color=color[i % 12])

    plot(centroids_2d[:, 0], centroids_2d[:, 1], "sg", markersize=8)
    show()

    return clusterid
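Note that matplotlib.mlab.PCA (used here and in example 8) was deprecated in matplotlib 2.2 and removed in 3.1; a hedged equivalent of the projection step with scikit-learn would be:

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
data_2d = pca.fit_transform(data)
centroids_2d = pca.transform(centroids)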
Example #13
def get_volume_sources(volume, space=5, remains=None):
    """get sources in volume

    Parameters
    ----------
    volume : Volume object
    space : float
        The distance between sources
    remains : None | int
        The number of sources that we want to keep

    Returns
    -------
    src : SourceSpaces object
    -------
    Author : Alexandre Fabre
    """

    if remains is None:
        remains, removes = get_number_sources(volume, space=space,
                                              surface=False)

    else:
        # avoid having an incorrect number of sources
        remains = max(0, min(volume.pos_length, remains))
        removes = volume.pos_length - remains

    if remains == 0:
        raise ValueError('Error, 0 source created')

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # create clusters
        km = MiniBatchKMeans(n_clusters=remains, n_init=10)
        
    # get cluster labels
    cluster_id = km.fit(volume.pos).labels_

    # get centroids of clusters
    centroids, _ = Pycluster.clustercentroids(volume.pos, clusterid=cluster_id)

    dist = euclidean_distances(centroids, volume.pos)

    # get indices of closest points of centroids
    arg_min = np.argmin(dist, axis=1)

    inuse = np.zeros(volume.pos_length)
    inuse[arg_min] = 1
    inuse = inuse.astype(int) # Need to be int

    # must be converted to meters
    # Pos is in voxels coords not mm
    rr = volume.pos * 1e-3

    if volume.hemi=='lh':
        Id = 101
    elif volume.hemi=='rh':
        Id = 102
    src = [{'rr': rr, 'coord_frame': np.array((FIFF.FIFFV_COORD_MRI,), np.int32), 'type': 'surf', 'id': Id,
            'np': volume.pos_length, 'nn': volume.normals, 'inuse': inuse, 'nuse': remains, 'dist': None,
            'nearest': None, 'use_tris': None, 'nuse_tris': 0, 'vertno': arg_min, 'patch_inds': None,
            'tris': None, 'dist_limit': None, 'pinfo': None, 'ntri': 0, 'nearest_dist': None, 'removes': removes}]

    src = SourceSpaces(src)

    return src
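Side note: scikit-learn's MiniBatchKMeans already exposes its (approximate) cluster centers, so the Pycluster.clustercentroids call could arguably be dropped; a hedged sketch:

km = MiniBatchKMeans(n_clusters=remains, n_init=10).fit(volume.pos)
cluster_id = km.labels_
centroids = km.cluster_centers_  # approximate per-cluster means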
Example #14
    # (the snippet begins mid-call; the opening presumably read:)
    kids, _, _ = Pycluster.kcluster(
                                np.c_[xx.ravel(), yy.ravel()], 
                                nclusters=args.n_clusters,
                                method='m', # cluster median (Pycluster's 'm'; not a true PAM medoid)
                                dist='e') # euclidean dist
    # Put the result into a color plot
    Z = kids.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    medoids, _ = Pycluster.clustercentroids(np.c_[xx.ravel(), yy.ravel()], clusterid=kids, method='m')
    plt.scatter(medoids[:, 0], medoids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title('PAM clustering (PCA-reduced data)\n'
              'Medoids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig(args.scatter_file)
    plt.close()

    if args.sil_vs_cluster :
        plt.figure(2)
        s = []
Example #15
from numpy import amax, amin, newaxis, zeros
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def cluster(data, threshold=0.5, method='sk', preprocess=True):
	length = len(data)
	print data.shape
	nclus = 2
	nclusmax=15
	sil = [-1]
	models=[]
	if preprocess==True:
		print 'Preprocessing by scaling each column by its range'
		data /= (amax(data,axis=0)-amin(data,axis=0))[newaxis,:]
		print 'Now to cluster'	
	if method == 'sk':
		print 'Clustering using Scikits K-means implementation'
		print "This option returns a tuple of"
		print "\t\t (kmeans object, silhouette coefficients)"
		while nclus < nclusmax: #average(sil[-1]) < threshold and
			model = KMeans(init='k-means++',n_clusters=nclus) 
			#Assume data is properly preprocessed
			model.fit(data)
			labels = model.labels_
			#<-- can only sample this in chunks of 100
			print data.shape
			print 'Calculating silhouette_score '
			sil.append(silhouette_score(data,labels,metric='euclidean')) 
			models.append(model)
			print 'For %d clusters, the silhouette coefficient is %.03f'%(nclus,sil[-1])
			nclus += 1
		return (models,sil)
	elif method == 'pyclus':
		import Pycluster as pc
		print 'Clustering using the C Clustering library'
		print 'This option returns a dictionary with the distance matrix, silhouettes, and clusterids for each iteration.'
		res = []
		sil_co_one = 1
		sil_co = [1]
		#Assume 
		while sil_co_one > threshold and nclus < nclusmax:
			print 'No. of clus: %d'%nclus
			print 'Before kcluster'
			clustermap,_,_ = pc.kcluster(data,nclusters=nclus,npass=50)
			print 'After kcluster'
			centroids,_ = pc.clustercentroids(data,clusterid=clustermap)
			print 'After centroids'
	
			m = pc.distancematrix(data)
			
			print 'Finding mass'
			#Find the masses of all clusters
			mass = zeros(nclus)
			for c in clustermap:
				mass[c] += 1
		
			#Create a matrix for individual silhouette coefficients
			sil = zeros((len(data),nclus))
			
			print 'Evaluating pairwise distance'
			#Evaluate the distance for all pairs of points		
			for i in xrange(0,length):
				for j in range(i+1,length):
					d = m[j][i]
					
					sil[i, clustermap[j] ] += d
					sil[j, clustermap[i] ] += d
			
			#Average over cluster
			for i in range(0,len(data)):
				sil[i,:] /= mass
			
			print 'Sil co'	
			#Evaluate the silhouette coefficient
			s = 0
			for i in xrange(0,length):
				c = clustermap[i]
				a = sil[i,c] 
				b = min( sil[i, range(0,c) + range(c+1,nclus)])
				si = (b-a)/max(b,a) #silhouette coefficient of point i
				s+=si
						
			nclus += 1
			sil_co.append( s/length)
			sil_co_one = s/length
			print 'Sil co %.02f'%sil_co_one
			res.append({'clustermap':clustermap,
						'centroids':centroids,
						 'distances':m,
						 'mass':mass,
						 'silhouettes':sil_co})
		return res
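A hedged usage sketch (the random matrix is a hypothetical stand-in; method='pyclus' requires Pycluster to be installed):

from numpy import random

data = random.rand(500, 8)
models, sils = cluster(data.copy(), threshold=0.5, method='sk')
results = cluster(data.copy(), threshold=0.5, method='pyclus')  # preprocess rescales in place, hence the copies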
Example #16
import Pycluster as pc
import numpy as np
import sys

# Read data filename and desired number of clusters from command line
filename, n = sys.argv[1], int(sys.argv[2])

data = np.loadtxt(filename)

# Perform clustering and find centroids
clustermap, _, _ = pc.kcluster(data, nclusters=n, npass=50)
centroids, _ = pc.clustercentroids(data, clusterid=clustermap)

# Obtain distance matrix
m = pc.distancematrix(data)

# Find the masses of all clusters
mass = np.zeros(n)
for c in clustermap:
    mass[c] += 1

# Create a matrix for individual silhouette coefficients
sil = np.zeros(n * len(data))
sil.shape = (len(data), n)

# Evaluate the distance for all pairs of points
for i in range(0, len(data)):
    for j in range(i + 1, len(data)):
        d = m[j][i]

        sil[i, clustermap[j]] += d
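The snippet is cut off here; judging from the identical logic in examples 4 and 15, the remainder would presumably continue roughly as follows (a hedged reconstruction, not the original code):

        sil[j, clustermap[i]] += d

# Average over cluster masses, then accumulate the coefficient
for i in range(len(data)):
    sil[i, :] /= mass

s = 0
for i in range(len(data)):
    c = clustermap[i]
    a = sil[i, c]
    b = min(sil[i, list(range(0, c)) + list(range(c + 1, n))])
    s += (b - a) / max(b, a)

print(s / len(data))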
Example #17
def recomendador():			
	#MATRIZ_USER_VS_ITEM=[[ 0 for i in range(10) ] for j in range(5)] # 5x10 matrix filled with zeros
	MATRIZ_USER_VS_ITEM=[[ random.randint(0,5) for i in range(42) ] for j in range(21)]
	UserReco=MATRIZ_USER_VS_ITEM[1]

	##-------------------- DB CONNECTION ----------------

	try:
		con = psycopg2.connect("dbname='planapp_db' user='******' host='127.0.0.1' password='******'")

	except:
		print "I am unable to connect to the database"

	cur = con.cursor()

	#cur.execute("SELECT nombre from usuario where lugar=%s ", (lugar,))
	'''
	cur.execute("SELECT nombre from usuario")

	rows = cur.fetchall()

##	for i in range(len(rows)):
#			print rows[i]
	'''
	#imprimirMatriz(MATRIZ_USER_VS_ITEM, 42, 21)

	id_acompanante=1
	#---------------------- FILL THE VOTES MATRIX -----

	'''
	cur.execute("SELECT count(*) from usuario")
	row0 = cur.fetchone()
	numerousuarios=int(row0[0])

	cur.execute("SELECT count(*) from lugar")
	row1 = cur.fetchone()
	numerolugares=int(row1[0])

#	print MATRIZ_USER_VS_ITEM
#	print numerousuarios
#	print numerolugares

	for i in range(numerousuarios+1):	
		for j in range(numerolugares+1):
			cur.execute("SELECT voto_lugar from votos where id_acompanante=%s and id_usuario=%s and id_lugar=%s", (id_acompanante,i,j,))
			rows = cur.fetchone()
			if rows:
				MATRIZ_USER_VS_ITEM[i][j]=int(rows[0])

#	for i in range(numerolugares):
#		MATRIZ_USER_VS_ITEM[0][i]=random.randint(0,5)

#	imprimirMatriz(MATRIZ_USER_VS_ITEM, 42,21)
	'''
	##--------------------- TO BE REMOVED ---------------------

	lugares=[]

	for i in range(42):
		lugares.append(i)


	#-------------------------------------------RECOMMENDER-------------------------------------------

	#print "MATRIZ_USER_VS_ITEMEGLO", MATRIZ_USER_VS_ITEM		

	clusterid, error, nfound = pc.kcluster(MATRIZ_USER_VS_ITEM, nclusters=2, transpose=0, npass=1, method='a', dist='e')
	centroids, algo = pc.clustercentroids(MATRIZ_USER_VS_ITEM, clusterid=clusterid)

	#print "clusterid", clusterid #A que cluster pertence cada vector

	buscar=clusterid[MATRIZ_USER_VS_ITEM.index(UserReco)] # buscar = id of the cluster that the target user belongs to

	# Build a list holding only the users useful for the recommendation (those in the same cluster)
	var=[]

	for i in range(len(clusterid)):
		if clusterid[i]==buscar:
			var.append(i) ### users that are in the cluster used for recommending

#	print "Elementos del cluster:", var

	POS_ITEMR=[] # position/number of the place to recommend (I1)
	promedios=[]

	# List of the candidate places to recommend (i.e. places not yet rated)

	for i in range(len(UserReco)):
		if UserReco[i]==0:
			POS_ITEMR.append(i) # positions of the places to recommend

#	print "Pos items a recomendar" , POS_ITEMR

	# Build an auxiliary matrix that only contains the useful users (those in the same cluster)

	MATRIZ_AUX=[] 

	for i in range(len(var)):
		if MATRIZ_USER_VS_ITEM[var[i]]==UserReco:
			continue	
		else :
			MATRIZ_AUX.append(MATRIZ_USER_VS_ITEM[var[i]]) # matrix containing only the users that serve as reference for the recommendation

	#Take the average rating of each place
#	print MATRIZ_AUX, "MATRIZ AUX"
#	print "var: ", (len(var)-1)
#	print "user: "******"- item", POS_ITEMR[notas.index(notas1[i])], "nota:", notas1[i]
		recomendados.append(POS_ITEMR[notas.index(notas1[i])])
		notas[notas.index(notas1[i])]=1000  #to avoid repetitions

	TopN=9

	# List of the recommended places

	idrecomendados=[]

#	print "lugares ", len(lugares)
#	print "recomendados ", len(recomendados)

	for i in range(len(recomendados)):
#		print recomendados[i]
		idrecomendados.append(lugares[recomendados[i]])

	print idrecomendados

	panorama1=[]
	actpanorama1=[]

	panorama2=[]
	actpanorama2=[]

	panorama3=[]
	actpanorama3=[]

	r=0
	s=0
	t=0

	for i in range(len(idrecomendados)):
		cur.execute("SELECT id_categoria from tipo_categoria where tipo_categoria.id_lugar=%s", (idrecomendados[i],))
		rows2 = cur.fetchall()

		if (actpanorama1.count(rows2[0])==0) and (r<3):
			panorama1.append(idrecomendados[i])
			actpanorama1.append(rows2[0])
			r=r+1	
		else:
			if (actpanorama2.count(rows2[0])==0 and s<3):	
				panorama2.append(idrecomendados[i])
				actpanorama2.append(rows2[0])
				s=s+1
			else:
				if (actpanorama3.count(rows2[0])==0) and t<3:
					panorama3.append(idrecomendados[i])
					actpanorama3.append(rows2[0])
					t=t+1	


	print "PANORAMA1: "
	for i in range(len(panorama1)):
		print "id: ", panorama1[i]
		print "act: ", actpanorama1[i]
				   
				  
	print "PANORAMA2: "
	for i in range(len(panorama2)):
		print "id: ", panorama2[i]
		print "act: ", actpanorama2[i]              


	print "PANORAMA3: "
	for i in range(len(panorama3)):
		print "id: ", panorama3[i]
		print "act: ", actpanorama3[i]

                      
	return panorama1,panorama2,panorama3
Example #18
def kMeansByPycluster(arr, k, numberToTry=20):
  clusterid, error, nfound = pc.kcluster(arr, nclusters=k, transpose=0,
                                       npass=numberToTry, method='a', dist='e')
  centroids, _ = pc.clustercentroids(arr, clusterid=clusterid)
  return [centroids, clusterid]
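A hedged usage sketch for this small wrapper (the random array is a hypothetical stand-in):

import numpy as np

arr = np.random.rand(300, 2)
centroids, clusterid = kMeansByPycluster(arr, k=4)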
Example #20
    root = r"c:\Users\rony\python_workspace\CompVision\DataSets\cahana2_vid"
    lsth = [getHistogram(file) for file in files(root)]
    lstf = [getImage(file) for file in files(root)]
    height = 81
    width = 180
    lstt = MakeTilesForList (lstf, height,width)

    c = CompareTiledImageToAllList(lstt,0)

    t = [ CompareTiledImageToAllList(lstt,i) for i in range(0,len(lstt))]

#min and average for each image
    m = [min (i) for i in t]
    avg = [np.mean(i) for i in t]
    close_center = {}

    arr = np.array(lsth)
    
    labels = Pycluster.kcluster(arr,nclusters=10,method='m')[0]
    centroids,_ = Pycluster.clustercentroids(arr,clusterid=labels)
    for iter in range(len(lsth)):
        curr_label = labels[iter]
        if(not (curr_label in close_center.keys())):
            close_center[curr_label] = iter
        if(chiDiff (lsth[iter], centroids[curr_label]) < chiDiff(lsth[close_center[curr_label]] ,centroids[curr_label])):
            close_center[curr_label] = iter
                                                                 
    print ([files(root)[x] for x in close_center.values()])


Example #21
			args = (points, coefs,max_sub_deg)
			new_angle_test = np.zeros((nSig,2))
			mpoints = np.zeros_like(points)
			
			for ii in range(nSig):
				x0 = cart2sphere(points[ii,0],points[ii,1],points[ii,2])[1:3]
				xopt = fmin(even_pODF_opt,x0,args=args,xtol = 0.00001, ftol = 0.00001, disp=0)
				new_angle_test[ii,:] = xopt
				mpoints[ii,:] = np.array([np.sin(new_angle_test[ii,0])*np.cos(new_angle_test[ii,1]), np.sin(new_angle_test[ii,0])*np.sin(new_angle_test[ii,1]), np.cos(new_angle_test[ii,0])])
			
			#---------------------------------------------------------------------------
			#--Start clustering -- maybe need to use a different set of nodes to evaluate pODF
			#
			nclusters = 4
			(indx,error,nf) = pyc.kcluster(mpoints,nclusters=nclusters,npass=500,dist='u')
			(cmdata,cmmask) = pyc.clustercentroids(mpoints,np.ones_like(mpoints),indx)

			angles_clust[kk,0] = np.arccos(np.dot(cmdata[0,:],cmdata[1,:]))*180/np.pi
			angles_clust[kk,1] = np.arccos(np.dot(cmdata[0,:],cmdata[2,:]))*180/np.pi
			angles_clust[kk,2] = np.arccos(np.dot(cmdata[0,:],cmdata[3,:]))*180/np.pi
			angles_clust[kk,3] = np.arccos(np.dot(cmdata[1,:],cmdata[2,:]))*180/np.pi
			angles_clust[kk,4] = np.arccos(np.dot(cmdata[1,:],cmdata[3,:]))*180/np.pi
			angles_clust[kk,5] = np.arccos(np.dot(cmdata[2,:],cmdata[3,:]))*180/np.pi
 
			#end MC simulation

		angles_clust.sort()
		angles_mean.append(angles_clust[:,0].mean())
		angles_std.append(angles_clust[:,0].std())
	
	error += (np.abs(angles - np.array(angles_mean))).reshape((13,1))
Example #22
def clusteringPlot(dataFile, clusterFile, outputPath):
    time = dataFile.split("\\")
    time = time[len(time) - 1]
    time = time.split(".")
    time = time[0]

    data = np.genfromtxt(dataFile, delimiter=',')
    clusterid = np.genfromtxt(clusterFile, delimiter=',')

    if len(data.shape) == 1:
        return [-1]

    clusterid = clusterid.astype(np.int64)  # clustercentroids expects integer cluster ids

    counter = collections.Counter(clusterid)
    sortedCounter = sorted(counter.items(), key=itemgetter(1), reverse = True)

    usedClusters = [np.asscalar(np.int16(i[0])) for i in sortedCounter]
    freq = [np.asscalar(np.int16(i[1])) for i in sortedCounter]

    fig1 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')
    #rects1 = pylab.bar(ind + 0.05, freq, 0.1, color='#2200CC')
    #print freq
    pylab.xlim(-len(freq)/50, len(freq)+len(freq)/50)
    pylab.ylim(0, max(freq) + 1)
    pylab.plot(freq, marker = '.', markersize = 4)
    pylab.xlabel("Ranked clusters")
    pylab.ylabel("Number of points in the cluster")
    pylab.title("Figure of clusters points distribution in " + time)
    #pylab.show()
    fig1.savefig(outputPath + "\\" + time + '_cluster_stat.png',dpi=80)
    pylab.close()

    k = 10
    if len(sortedCounter) < k:
        k = len(sortedCounter)

    usedClusters = usedClusters[0:k]

    centroids, _ = pc.clustercentroids(data, clusterid=clusterid)

    if len(data) < len(data[0]):
        print len(data)
        print len(data[0])
        return
    data_pca = mlab.PCA(data)
    cutoff = data_pca.fracs[1]
    data_2d = data_pca.project(data, minfrac=cutoff)
    centroids_2d = data_pca.project(centroids, minfrac=cutoff)

    color = ['#2200CC', '#D9007E', '#660066', '#FFFF00', '#FF6600', '#0099CC',
    '#8900CC', '#140A00', '#6B6B47', '#66FF66', '#FF99CC', '#0055CC']

    fig2 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')

    Legend = []
    num=0
    for i in usedClusters:
        temp = pylab.scatter(data_2d[clusterid==i,0],data_2d[clusterid==i,1], color=color[num%12])
        Legend.append(temp)
        num += 1

    pylab.plot(centroids_2d[usedClusters,0], centroids_2d[usedClusters,1], 'sg', markersize=8)
    pylab.legend(Legend, range(1, k + 1))
    pylab.xlabel("Feature 1")
    pylab.ylabel("Feature 2")
    pylab.title("Top " + str(k) + " clusters in " + time)

    #pylab.show()

    fig2.savefig(outputPath + "\\" + time + '_cluster_plot.png',dpi=80)
    #fig2.savefig('test222png.png',dpi=80)
    pylab.close()
Example #23
def cluster_plot(data_file, cluster_file, output_path):
    """
    Draw the figure of the clusters (top 10 clusters in 2 dimensions).

    @param data_file: Data file in a month (CSV binary file).
    @param cluster_file: Clustering result file.
    @param output_path: Output path to save the figure.
    @return: none
    """

    # The time of the month, for example, 200311
    time = data_file.split("\\")
    time = time[len(time) - 1]
    time = time.split(".")
    time = time[0]

    # Read data.
    data = numpy.genfromtxt(data_file, delimiter=',')
    cluster_id = numpy.genfromtxt(cluster_file, delimiter=',')

    # If the number of data points is 1, return.
    if len(data.shape) == 1:
        return [-1]

    # Sort the cluster by the number of points in this cluster.
    counter = collections.Counter(cluster_id)
    sorted_counter = sorted(counter.items(), key=itemgetter(1), reverse=True)

    # Extract the clusters id and the freq in different lists
    used_clusters = [numpy.asscalar(numpy.int16(i[0])) for i in sorted_counter]
    freq = [numpy.asscalar(numpy.int16(i[1])) for i in sorted_counter]

    # fig1, (the size of clusters)'s distribution
    fig1 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')
    pylab.xlim(-len(freq) / 50, len(freq) + len(freq) / 50)
    pylab.ylim(0, max(freq) + 1)
    pylab.plot(freq, marker='.', markersize=4)
    pylab.xlabel("Ranked clusters")
    pylab.ylabel("Number of points in the cluster")
    pylab.title("Figure of clusters points distribution in " + time)
    #pylab.show()
    fig1.savefig(output_path + "\\" + time + '_cluster_stat.png', dpi=80)
    pylab.close()

    # k is the top clusters to be showed
    k = 10
    if len(sorted_counter) < k:
        k = len(sorted_counter)

    used_clusters = used_clusters[0:k]

    # Get the centroids for each cluster
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)

    # If there are fewer data points than dimensions, return: the PCA below
    # would fail on such data.
    if len(data) < len(data[0]):
        print len(data)
        print len(data[0])
        return

    # PCA
    data_pca = mlab.PCA(data)
    cutoff = data_pca.fracs[1]
    data_2d = data_pca.project(data, minfrac=cutoff)
    centroids_2d = data_pca.project(centroids, minfrac=cutoff)

    color = ['#2200CC', '#D9007E', '#660066', '#FFFF00', '#FF6600', '#0099CC',
             '#8900CC', '#140A00', '#6B6B47', '#66FF66', '#FF99CC', '#0055CC']

    # fig2, figure of top k clusters in 2-D
    fig2 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')
    legend = []
    num = 0
    for i in used_clusters:
        temp = pylab.scatter(data_2d[cluster_id == i, 0], data_2d[cluster_id == i, 1], color=color[num % 12])
        legend.append(temp)
        num += 1

    pylab.plot(centroids_2d[used_clusters, 0], centroids_2d[used_clusters, 1], 'sg', markersize=8)
    pylab.legend(legend, range(1, k + 1))
    pylab.xlabel("Feature 1")
    pylab.ylabel("Feature 2")
    pylab.title("Top " + str(k) + " clusters in " + time)
    #pylab.show()
    fig2.savefig(output_path + "\\" + time + '_cluster_plot.png', dpi=80)
    pylab.close()
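A hedged usage sketch (the Windows-style paths are hypothetical; the month label in the plot titles is derived from the data file name):

cluster_plot("C:\\data\\200311.csv", "C:\\data\\200311_clusters.csv", "C:\\output")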
Example #24
#imprimirMatriz(MATRIZ_USER_VS_ITEM, 42,21)

##--------------------- TO BE REMOVED ---------------------

lugares=[]

for i in range(42):
	lugares.append(i)


#-------------------------------------------RECOMMENDER-------------------------------------------

#print "MATRIZ_USER_VS_ITEMEGLO", MATRIZ_USER_VS_ITEM		

clusterid, error, nfound = pc.kcluster(MATRIZ_USER_VS_ITEM, nclusters=2, transpose=0, npass=1, method='a', dist='e')
centroids, algo = pc.clustercentroids(MATRIZ_USER_VS_ITEM, clusterid=clusterid)

#print "clusterid", clusterid #A que cluster pertence cada vector

buscar=clusterid[MATRIZ_USER_VS_ITEM.index(UserReco)] # buscar = id of the cluster that the target user belongs to

# Build a list holding only the users useful for the recommendation (those in the same cluster)
var=[]

for i in range(len(clusterid)):
	if clusterid[i]==buscar:
		var.append(i) ### users that are in the cluster used for recommending

print "Elementos del cluster:", var

POS_ITEMR=[] # position/number of the place to recommend (I1)