def findcenters(x, n=1000, k=6):
    """Average the k-means centroids of ``x`` over ``n`` independent runs.

    Every run clusters ``x`` into ``k`` groups with a single k-means pass.
    Cluster labels are arbitrary between runs, so the centroids of each run
    are ordered by decreasing Euclidean norm (distance to the origin) before
    being accumulated; centroids of equal rank are then averaged.

    Parameters
    ----------
    x : 2-D data exposing ``shape``; passed unchanged to Pycluster.
    n : int
        Number of independent clustering runs.
    k : int
        Number of clusters requested in every run.

    Returns
    -------
    DataFrame
        ``k`` rows by ``x.shape[1]`` columns of averaged centroids, the
        centroid farthest from the origin first.
    """
    m = x.shape[1]
    # Accumulate in a plain array: ``DataFrame.ix`` no longer exists in
    # current pandas and per-row DataFrame updates are needlessly slow.
    totals = np.zeros((k, m))
    for _run in range(n):
        labels, _, _ = Pycluster.kcluster(
            x, nclusters=k, transpose=0, method="a", dist="e", npass=1
        )
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # The norm of a centroid is its distance to the origin.
        center = sorted(center, key=np.linalg.norm, reverse=True)
        totals += np.asarray(center)
    return DataFrame(totals / n)
def findcenters(x, n=1000, k=6):
    """Average the k-means centroids of ``x`` over ``n`` independent runs.

    Every run clusters ``x`` into ``k`` groups with a single k-means pass.
    Cluster labels are arbitrary between runs, so the centroids of each run
    are ordered by decreasing Euclidean norm (distance to the origin) before
    being accumulated; centroids of equal rank are then averaged.

    Parameters
    ----------
    x : 2-D data exposing ``shape``; passed unchanged to Pycluster.
    n : int
        Number of independent clustering runs.
    k : int
        Number of clusters requested in every run.

    Returns
    -------
    DataFrame
        ``k`` rows by ``x.shape[1]`` columns of averaged centroids, the
        centroid farthest from the origin first.
    """
    m = x.shape[1]
    # Accumulate in a plain array: ``DataFrame.ix`` no longer exists in
    # current pandas and per-row DataFrame updates are needlessly slow.
    totals = np.zeros((k, m))
    for _run in range(n):
        labels, _, _ = Pycluster.kcluster(
            x, nclusters=k, transpose=0, method='a', dist='e', npass=1
        )
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # The norm of a centroid is its distance to the origin.
        center = sorted(center, key=np.linalg.norm, reverse=True)
        totals += np.asarray(center)
    return DataFrame(totals / n)
def pyclustertest():
    """Smoke-test Pycluster on random data.

    Clusters a random 100 x 4 matrix with the default ``kcluster`` settings,
    computes the cluster centroids and prints both the data and the
    centroids.
    """
    data = sp.rand(100, 4)
    cid, _, _ = pcl.kcluster(data)
    # The centroids must come from the clustered matrix itself; the original
    # passed an undefined name ``D`` here.
    centroids, _ = pcl.clustercentroids(data, clusterid=cid)
    print(data)
    print(centroids)
def myCKDemo(filename, n):
    """Cluster a CSV file into ``n`` groups, print the silhouette and plot.

    Columns 2, 4, 14 and 8 (0-based) of ``filename`` are the features that
    are clustered.  Columns 2 and 4 are used as the x/y position of each
    point in the final scatter plot (the original author's notes describe
    them as city coordinates -- NOTE(review): confirm against the data file).

    Prints ``n`` followed by the mean silhouette coefficient, then shows a
    scatter plot coloured by cluster id.

    Parameters
    ----------
    filename : str
        Comma-separated numeric file.
    n : int
        Number of clusters.
    """
    data = np.loadtxt(filename, delimiter=",", usecols=(2, 4, 14, 8))
    xy = np.loadtxt(filename, delimiter=",", usecols=(2, 4))

    # Cluster id of every row.
    clustermap = pc.kcluster(data, n)[0]
    # Pairwise distances; indexed below as m[j][i] with j > i.
    m = pc.distancematrix(data)
    npoints = len(data)

    # mass[c] = number of points assigned to cluster c.
    mass = np.zeros(n)
    for c in clustermap:
        mass[c] += 1

    # sil[i, c] = summed (then averaged) distance from point i to cluster c.
    sil = np.zeros((npoints, n))
    for i in range(npoints):
        for j in range(i + 1, npoints):
            d = m[j][i]
            sil[i, clustermap[j]] += d
            sil[j, clustermap[i]] += d
    sil /= mass  # broadcasts the per-cluster mass over every row

    # Silhouette coefficient: values lie in [-1, 1], larger is better.  A
    # negative value means a point is on average closer to another cluster
    # than to its own.
    s = 0
    for i in range(npoints):
        c = clustermap[i]
        a = sil[i, c]
        # Index list instead of ``range + range`` so this also runs on
        # Python 3.
        b = min(sil[i, [other for other in range(n) if other != c]])
        s += (b - a) / max(b, a)
    print("%s %s" % (n, s / npoints))

    # Scatter plot, one colour per cluster.
    fig, ax = pl.subplots()
    cmap = pl.get_cmap('jet', n)
    cmap.set_under('gray')
    ax.scatter(xy[:, 0], xy[:, 1], c=clustermap, s=30, cmap=cmap,
               vmin=0, vmax=n)
    pl.show()
def _G(self, data, K):
    """Return the negated Euclidean distance of every column to K centroids.

    ``data`` is clustered column-wise (its transpose is handed to Pycluster)
    into ``K`` groups.  Row ``k`` of the result holds, for each column of
    ``data``, minus the Euclidean distance to centroid ``k``.
    """
    samples = data.T
    assignment = Pycluster.kcluster(samples, K)[0]
    # Transpose back so that every centroid is a column, like the data.
    centroids = Pycluster.clustercentroids(samples, clusterid=assignment)[0].T
    neg_dist = zeros((K, data.shape[1]))
    for idx in range(K):
        offset = data - expand_dims(centroids[:, idx], axis=1)
        squared = multiply(offset, offset)
        neg_dist[idx, :] = -sqrt(sum(squared, axis=0))
    return neg_dist
def reassignClusterIDs(src, dst):
    """
    Given the cluster centers for two clusterings, determine the centers
    most similar to each other and reassign the cluster ids to match.

    ``src`` and ``dst`` are ``(data set key, clustering key)`` pairs looked
    up in ``DataStore``.  Destination clusters are processed in order; each
    one greedily takes the not-yet-used source cluster with the smallest
    ``nonSymmetricClusterDistance``.  The destination clustering stored in
    ``DataStore`` is then replaced by the renumbered id list.

    Raises ``KeyError`` when a destination cluster finds no unused source
    cluster (more destination than source clusters).
    """
    srcFCS = DataStore.getData()[src[0]]
    dstFCS = DataStore.getData()[dst[0]]

    srcdata = srcFCS.data
    if srcFCS.selDims:
        srcdata = dh.filterData(srcFCS.data, srcFCS.selDims)
    srcids = srcFCS.clustering[src[1]]
    srccenters = pc.clustercentroids(srcdata, clusterid=srcids)[0]

    dstdata = dstFCS.data
    if dstFCS.selDims:
        dstdata = dh.filterData(dstFCS.data, dstFCS.selDims)
    dstids = dstFCS.clustering[dst[1]]
    dstcenters = pc.clustercentroids(dstdata, clusterid=dstids)[0]

    srcsep = separate(srcdata, srcids)
    dstsep = separate(dstdata, dstids)

    centerEQ = {}
    taken = set()  # source ids already matched; set gives O(1) membership
    # The centroids are only used to count the clusters on each side.
    for i in range(len(dstcenters)):
        bestDist = -1  # negative means "nothing found yet"
        for j in range(len(srccenters)):
            if j in taken:
                continue
            dist = nonSymmetricClusterDistance(dstsep[i], srcsep[j])
            if bestDist < 0 or dist < bestDist:
                bestDist = dist
                centerEQ[i] = j
        taken.add(centerEQ[i])

    # Renumber the destination ids to the id of the matched source cluster.
    renumbered = [centerEQ[cid] for cid in dstids]
    DataStore.getData()[dst[0]].clustering[dst[1]] = renumbered
def clustering(file_path, k, dist_measure, PLOT):
    """
    Run K-means on the rows of a CSV file and optionally plot the result.

    @param file_path: Input data file (comma separated).
    @param k: Requested number of centers; adjusted by set_k().
    @param dist_measure: Pycluster distance code handed to kcluster().
    @param PLOT: When exactly False only the labels are returned; otherwise
                 a scatter plot of the clusters is shown as well.
    @return: Cluster id of every data point, or [-1] when the file does not
             yield a 2-D table.
    """
    data = numpy.genfromtxt(file_path, delimiter=',')
    if data.ndim == 1:
        return [-1]

    n_points = len(data)
    print("-- Processing file: " + file_path + " -- Data points: " + str(n_points))
    print("-- Start clustering")

    k = set_k(n_points, k)
    ite_num = method_name(n_points)

    cluster_id = Pycluster.kcluster(data, nclusters=k, mask=None, weight=None,
                                    transpose=0, npass=ite_num, method='a',
                                    dist=dist_measure, initialid=None)[0]
    if PLOT is False:
        return cluster_id

    centroids = Pycluster.clustercentroids(data, clusterid=cluster_id)[0]

    if not PLOT:
        # Falsy values other than False: plot the first two raw columns.
        data_2d, centroids_2d = data, centroids
    else:
        # Project data and centroids onto the leading principal components.
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)

    palette = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
               '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC']
    for label in range(k):
        members = cluster_id == label
        scatter(data_2d[members, 0], data_2d[members, 1],
                color=palette[label % 12])
    plot(centroids_2d[:, 0], centroids_2d[:, 1], 'sg', markersize=8)
    show()
    return cluster_id
def silhouette(data, k=5, shuffle=True, shufflecount=100):
    """Silhouette coefficients for 2..k-1 clusters plus a shuffled baseline.

    ``data`` is transposed before clustering.  For every cluster count
    ``nclus`` in ``range(2, k)`` the data is clustered and scored with
    ``silhouette_coefficient``.  When ``shuffle`` is true, the same is done
    ``shufflecount`` times on copies whose rows have been shuffled in place,
    giving a reference distribution.

    The input array is never modified: the original implementation shuffled
    through an alias of the data, which corrupted both the later "real"
    results and the caller's array, and relied on ``map`` being eager.

    Returns
    -------
    dict
        ``{nclus: {'data': coefficient, 'distribution': [[coef], ...]}}``.
        Each distribution entry is a one-element list, as before.
    """
    coefficients = {}
    data = data.transpose()
    # The distance matrix of the real data does not depend on nclus.
    distances = pc.distancematrix(data)
    for nclus in range(2, k):
        clustermap = pc.kcluster(data, nclusters=nclus, npass=50)[0]
        observed = silhouette_coefficient(distances, clustermap, nclus, data.shape)

        distribution = []
        if shuffle:
            for _ in range(shufflecount):
                dat = data.copy()
                for row in dat:
                    np.random.shuffle(row)  # in place on the private copy
                clustermap = pc.kcluster(dat, nclusters=nclus, npass=50)[0]
                m = pc.distancematrix(dat)
                distribution.append(
                    [silhouette_coefficient(m, clustermap, nclus, dat.shape)]
                )
        coefficients[nclus] = {'data': observed, 'distribution': distribution}
    return coefficients
def __init__(self, numComps=None, dim=5, data=None, epsilon=math.pow(10, -10),
             wishartScalar=1, wishartScale=None, dirichlet=None, normalMu=0,
             normalSigma=None):
    """Store the priors and initialise the mixture from a k-means run.

    Parameters
    ----------
    numComps : int
        Number of mixture components (required).
    dim : int
        Dimensionality used for the default prior matrices.
    data : array with ``shape``; one observation per row (required).
    epsilon : float
        Stored as ``self.epsilon``.
    wishartScalar, wishartScale :
        Stored as ``self.v`` / ``self.w``; the scale defaults to the
        ``dim`` x ``dim`` identity.
    dirichlet :
        Stored as ``self.di``; defaults to ``numComps`` ones.
    normalMu, normalSigma :
        Stored as ``self.m`` / ``self.e``; sigma defaults to the identity.

    The array defaults are built here rather than in the signature: a
    default expression cannot see the ``dim`` / ``numComps`` arguments, and
    array defaults would be shared between instances.
    """
    if numComps is None or data is None:
        raise TypeError("numComps and data are required")

    # INITIALIZE ALL POSTERIOR PARAMETERS
    self.d = dim
    self.k = numComps
    self.n = len(data)

    # INITIALIZE ALL PRIOR PARAMETERS
    self.e = np.identity(dim) if normalSigma is None else normalSigma
    self.m = normalMu
    self.w = np.identity(dim) if wishartScale is None else wishartScale
    self.v = wishartScalar
    self.di = np.ones(numComps) if dirichlet is None else dirichlet
    self.epsilon = epsilon

    # INITIALIZE THE MEANS WITH k-means CENTROIDS
    labels, error, nfound = pc.kcluster(data, self.k)
    centroids, _ = pc.clustercentroids(data, clusterid=labels)
    self.mu = centroids

    # Group the observations by their k-means label.
    self.pointsInComp = [[] for comp in range(self.k)]
    for label, point in zip(labels, data):
        self.pointsInComp[label].append(point)

    # INITIALIZE THE COVARIANCE MATRICES (one per component)
    self.sigma = [np.cov(np.array(kpoints).T) for kpoints in self.pointsInComp]

    # INITIALIZE THE WEIGHTS; float() keeps Python 2 from truncating to 0.
    self.pi = [float(len(l)) / data.shape[0] for l in self.pointsInComp]
""" Activity vector """ for i in range(0, len(point) - 1): activity[i] = point[i + 1] - point[i] activity[len(activity) - 1] = activity[0] - activity[len(activity) - 1] # print activity np.savetxt("test.out", activity, delimiter=",") """ Cluster activity into k cluster """ number_of_cluster = 5 label, errors, nfound = Pycluster.kcluster(activity, number_of_cluster) centroid, a = Pycluster.clustercentroids(activity, clusterid=label) np.savetxt("label.out", label, delimiter=",") # """ # plot clustered data # """ # zl = m.ceil(min(activity[0])) - 10 # zh = m.ceil(max(activity[0])) + 10 # # for i in range(0,len(label)): # if(label[i]==0): # ax.scatter(activity[i,0], activity[i,1], activity[i,2], s=200, marker='.', c='r') # if(label[i]==1): # ax.scatter(activity[i,0], activity[i,1], activity[i,2], s=200, marker='.', c='g') # if(label[i]==2):
def clustering(filepath, k, dist_measure, PLOT):
    """Run K-means on the rows of a CSV file and optionally plot the result.

    ``k == -1`` selects half the number of data points, capped at 5000.
    Files with fewer than 1000 rows get 10 k-means passes, larger ones 1.
    When ``PLOT`` is exactly False only the labels are returned; otherwise a
    scatter plot of the clusters is shown as well.

    Returns the cluster id of every data point, or ``[-1]`` when the file
    does not yield a 2-D table.
    """
    data = np.genfromtxt(filepath, delimiter=",")
    if data.ndim == 1:
        return [-1]

    n_points = len(data)
    print("-- Processing file: " + filepath + " -- Data points: " + str(n_points))
    print("-- Start clustering")

    # NOTE(review): the source formatting was lost; the 5000 cap is assumed
    # to apply only to the automatically chosen k -- confirm.
    if k == -1:
        k = int(0.5 * n_points)
        if k > 5000:
            k = 5000

    ite_num = 10 if n_points < 1000 else 1

    clusterid, error, nfound = pc.kcluster(
        data,
        nclusters=k,
        mask=None,
        weight=None,
        transpose=0,
        npass=ite_num,
        method="a",
        dist=dist_measure,
        initialid=None,
    )
    if PLOT is False:
        return clusterid

    centroids = pc.clustercentroids(data, clusterid=clusterid)[0]

    if not PLOT:
        # Falsy values other than False: plot the first two raw columns.
        data_2d, centroids_2d = data, centroids
    else:
        # Project data and centroids onto the leading principal components.
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)

    palette = [
        "#2200CC", "#D9007E", "#FF6600", "#FFCC00", "#ACE600", "#0099CC",
        "#8900CC", "#FF0000", "#FF9900", "#FFFF00", "#00CC01", "#0055CC",
    ]
    for label in range(k):
        members = clusterid == label
        scatter(data_2d[members, 0], data_2d[members, 1],
                color=palette[label % 12])
    plot(centroids_2d[:, 0], centroids_2d[:, 1], "sg", markersize=8)
    show()
    return clusterid
def get_volume_sources(volume, space=5, remains=None):
    """get sources in volume

    The positions of ``volume`` are grouped into ``remains`` k-means
    clusters; for every cluster the position closest to its centroid is
    marked as in use.

    Parameters
    ----------
    volume : Volume object
    space : float
        The distance between sources
    remains : None | int
        The number of sources that we want to keep.  When None it is
        obtained from ``get_number_sources``; otherwise it is clipped to
        ``[0, volume.pos_length]``.

    Returns
    -------
    src : SourceSpaces object

    Raises
    ------
    ValueError
        If no source would be kept, or if ``volume.hemi`` is neither
        ``'lh'`` nor ``'rh'`` (previously an unbound-variable error).
    -------
    Author : Alexandre Fabre
    """
    if remains is None:
        remains, removes = get_number_sources(volume, space=space,
                                              surface=False)
    else:
        # avoid to have an incorrect number of sources
        remains = max(0, min(volume.pos_length, remains))
        removes = volume.pos_length - remains

    if remains == 0:
        raise ValueError('Error, 0 source created')

    hemi_ids = {'lh': 101, 'rh': 102}
    if volume.hemi not in hemi_ids:
        raise ValueError("volume.hemi must be 'lh' or 'rh', got %r"
                         % (volume.hemi,))
    Id = hemi_ids[volume.hemi]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # create clusters and get the label of every position
        km = MiniBatchKMeans(n_clusters=remains, n_init=10)
        cluster_id = km.fit(volume.pos).labels_

    # get centroids of clusters
    centroids, _ = Pycluster.clustercentroids(volume.pos,
                                              clusterid=cluster_id)

    # index of the position closest to each centroid
    dist = euclidean_distances(centroids, volume.pos)
    arg_min = np.argmin(dist, axis=1)
    # NOTE(review): two centroids may share a closest position, in which
    # case fewer than ``remains`` entries of ``inuse`` are set -- confirm
    # whether downstream code needs unique, sorted ``vertno``.

    inuse = np.zeros(volume.pos_length, dtype=int)  # must be int
    inuse[arg_min] = 1

    # positions scaled by 1e-3 (original note: "must be converted to meters")
    rr = volume.pos * 1e-3

    src = [{'rr': rr,
            'coord_frame': np.array((FIFF.FIFFV_COORD_MRI,), np.int32),
            'type': 'surf',
            'id': Id,
            'np': volume.pos_length,
            'nn': volume.normals,
            'inuse': inuse,
            'nuse': remains,
            'dist': None,
            'nearest': None,
            'use_tris': None,
            'nuse_tris': 0,
            'vertno': arg_min,
            'patch_inds': None,
            'tris': None,
            'dist_limit': None,
            'pinfo': None,
            'ntri': 0,
            'nearest_dist': None,
            'removes': removes}]

    src = SourceSpaces(src)
    return src
np.c_[xx.ravel(), yy.ravel()], nclusters=args.n_clusters, method='m', # use median (aka medoid) dist='e') # euclidean dist # Put the result into a color plot Z = kids.reshape(xx.shape) plt.figure(1) plt.clf() plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()), cmap=plt.cm.Paired, aspect='auto', origin='lower') plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2) # Plot the centroids as a white X medoids, _ = Pycluster.clustercentroids(np.c_[xx.ravel(), yy.ravel()], clusterid=kids, method='m') plt.scatter(medoids[:, 0], medoids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10) plt.title('PAM clustering (PCA-reduced data)\n' 'Medoids are marked with white cross') plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) plt.xticks(()) plt.yticks(()) plt.savefig(args.scatter_file) plt.close() if args.sil_vs_cluster : plt.figure(2) s = []
def cluster(data, threshold=0.5, method='sk', preprocess=True):
    """Cluster ``data`` for an increasing number of clusters (2 to 14).

    With ``preprocess`` equal to True the columns of ``data`` are divided,
    in place, by their range first.

    method == 'sk'
        Fit scikit-learn KMeans for every cluster count and return
        ``(models, sil)``; ``sil`` starts with a -1 placeholder followed by
        one silhouette score per model.
    method == 'pyclus'
        Use the C Clustering Library and keep adding clusters while the mean
        silhouette coefficient stays above ``threshold``.  Returns a list of
        dicts with keys 'clustermap', 'centroids', 'distances', 'mass' and
        'silhouettes'.

    Any other ``method`` falls through and returns None.
    """
    length = len(data)
    print(data.shape)
    nclus = 2
    nclusmax = 15
    sil = [-1]
    models = []

    if preprocess == True:
        print('Preprocessing by scaling each row by its range')
        span = amax(data, axis=0) - amin(data, axis=0)
        data /= span[newaxis, :]
    print('Now to cluster')

    if method == 'sk':
        print('Clustering using Scikits K-means implementation')
        print("This option returns a tuple of")
        print("\t\t (kmeans object, silhouette coefficients)")
        for nclus in range(nclus, nclusmax):
            # Assumes the data is properly preprocessed.
            model = KMeans(init='k-means++', n_clusters=nclus)
            model.fit(data)
            labels = model.labels_
            print(data.shape)
            print('Calculating silhouette_score ')
            sil.append(silhouette_score(data, labels, metric='euclidean'))
            models.append(model)
            print('For %d clusters, the silhouette coefficient is %.03f' % (nclus, sil[-1]))
        return (models, sil)

    elif method == 'pyclus':
        import Pycluster as pc
        print('Clustering using the C Clustering library')
        print('This option returns a dictionary with the distance matrix, silhouettes, and clusterids for each iteration.')
        res = []
        sil_co_one = 1
        sil_co = [1]  # shared by every result dict
        while sil_co_one > threshold and nclus < nclusmax:
            print('No. of clus: %d' % nclus)
            print('Before kcluster')
            clustermap = pc.kcluster(data, nclusters=nclus, npass=50)[0]
            print('After kcluster')
            centroids = pc.clustercentroids(data, clusterid=clustermap)[0]
            print('After centroids')
            m = pc.distancematrix(data)

            print('Finding mass')
            # Number of points per cluster.
            mass = zeros(nclus)
            for label in clustermap:
                mass[label] += 1

            # Summed distance from every point to every cluster.
            sil = zeros((len(data), nclus))
            print('Evaluating pairwise distance')
            for i in range(length):
                for j in range(i + 1, length):
                    d = m[j][i]
                    sil[i, clustermap[j]] += d
                    sil[j, clustermap[i]] += d
            # Average over the cluster sizes (broadcast over the rows).
            sil /= mass

            print('Sil co')
            s = 0
            for i in range(length):
                own = clustermap[i]
                a = sil[i, own]
                others = [col for col in range(nclus) if col != own]
                b = min(sil[i, others])
                s += (b - a) / max(b, a)  # silhouette coefficient of point i

            nclus += 1
            sil_co.append(s / length)
            sil_co_one = s / length
            print('Sil co %.02f' % sil_co_one)
            res.append({'clustermap': clustermap,
                        'centroids': centroids,
                        'distances': m,
                        'mass': mass,
                        'silhouettes': sil_co})
        return res
import Pycluster as pc import numpy as np import sys # Read data filename and desired number of clusters from command line filename, n = sys.argv[1], int(sys.argv[2]) data = np.loadtxt(filename) # Perform clustering and find centroids clustermap, _, _ = pc.kcluster(data, nclusters=n, npass=50) centroids, _ = pc.clustercentroids(data, clusterid=clustermap) # Obtain distance matrix m = pc.distancematrix(data) # Find the masses of all clusters mass = np.zeros(n) for c in clustermap: mass[c] += 1 # Create a matrix for individual silhouette coefficients sil = np.zeros(n * len(data)) sil.shape = (len(data), n) # Evaluate the distance for all pairs of points for i in range(0, len(data)): for j in range(i + 1, len(data)): d = m[j][i] sil[i, clustermap[j]] += d
def recomendador(): #MATRIZ_USER_VS_ITEM=[[ 0 for i in range(10) ] for j in range(5)] #MATRIZ LLENA DE 0 DE 5X10 MATRIZ_USER_VS_ITEM=[[ random.randint(0,5) for i in range(42) ] for j in range(21)] UserReco=MATRIZ_USER_VS_ITEM[1] ##--------------------CONEXION BDD---------------- try: con = psycopg2.connect("dbname='planapp_db' user='******' host='127.0.0.1' password='******'") except: print "I am unable to connect to the database" cur = con.cursor() #cur.execute("SELECT nombre from usuario where lugar=%s ", (lugar,)) ''' cur.execute("SELECT nombre from usuario") rows = cur.fetchall() ## for i in range(len(rows)): # print rows[i] ''' #imprimirMatriz(MATRIZ_USER_VS_ITEM, 42, 21) id_acompanante=1 #----------------------LLENAR MATRIZ DE VOTOS----- ''' cur.execute("SELECT count(*) from usuario") row0 = cur.fetchone() numerousuarios=int(row0[0]) cur.execute("SELECT count(*) from lugar") row1 = cur.fetchone() numerolugares=int(row1[0]) # print MATRIZ_USER_VS_ITEM # print numerousuarios # print numerolugares for i in range(numerousuarios+1): for j in range(numerolugares+1): cur.execute("SELECT voto_lugar from votos where id_acompanante=%s and id_usuario=%s and id_lugar=%s", (id_acompanante,i,j,)) rows = cur.fetchone() if rows: MATRIZ_USER_VS_ITEM[i][j]=int(rows[0]) # for i in range(numerolugares): # MATRIZ_USER_VS_ITEM[0][i]=random.randint(0,5) # imprimirMatriz(MATRIZ_USER_VS_ITEM, 42,21) ''' ##---------------------BORRAR--------------------- lugares=[] for i in range(42): lugares.append(i) #-------------------------------------------RECOMENDADOR------------------------------------------- #print "MATRIZ_USER_VS_ITEMEGLO", MATRIZ_USER_VS_ITEM clusterid, error, nfound = pc.kcluster(MATRIZ_USER_VS_ITEM, nclusters=2, transpose=0, npass=1, method='a', dist='e') centroids, algo = pc.clustercentroids(MATRIZ_USER_VS_ITEM, clusterid=clusterid) #print "clusterid", clusterid #A que cluster pertence cada vector buscar=clusterid[MATRIZ_USER_VS_ITEM.index(UserReco)] #BUSCAR = EL ID DEL 
CLUSTER AL QUE PERTENECE EL USER A RECOMENDAR # Crear una lista para almacenar solo los usuarios que sirvan para la recomendacion (los que estan en el cluster) var=[] for i in range(len(clusterid)): if clusterid[i]==buscar: var.append(i) ### USUARIOS QUE ESTAN EN EL CLUSTER PARA RECOMENDAR # print "Elementos del cluster:", var POS_ITEMR=[] #POSICION/NUMERO DEL LUGAR A RECOMENDAR (I1) promedios=[] # Lista para sacar los lugares posibles a recomendar (o sea no evaluados) for i in range(len(UserReco)): if UserReco[i]==0: POS_ITEMR.append(i) #POSICION DE LOS LUGARES A RECOMENDAR # print "Pos items a recomendar" , POS_ITEMR # Crear matriz auxiliar que solo contiene los usuarios que sirven (los del mismo cluster) MATRIZ_AUX=[] for i in range(len(var)): if MATRIZ_USER_VS_ITEM[var[i]]==UserReco: continue else : MATRIZ_AUX.append(MATRIZ_USER_VS_ITEM[var[i]]) #MATRIZ QUE CONTIENE SOLO LOS USUARIOS QUE SON REFERENCIA PARA LA RECOMENDACION #Sacas promedio de notas del lugar # print MATRIZ_AUX, "MATRIZ AUX" # print "var: ", (len(var)-1) # print "user: "******"- item", POS_ITEMR[notas.index(notas1[i])], "nota:", notas1[i] recomendados.append(POS_ITEMR[notas.index(notas1[i])]) notas[notas.index(notas1[i])]=1000 #Para evitar las repeteciones TopN=9 # Lista de los lugares recomendados idrecomendados=[] # print "lugares ", len(lugares) # print "recomendados ", len(recomendados) for i in range(len(recomendados)): # print recomendados[i] idrecomendados.append(lugares[recomendados[i]]) print idrecomendados panorama1=[] actpanorama1=[] panorama2=[] actpanorama2=[] panorama3=[] actpanorama3=[] r=0 s=0 t=0 for i in range(len(idrecomendados)): cur.execute("SELECT id_categoria from tipo_categoria where tipo_categoria.id_lugar=%s", (idrecomendados[i],)) rows2 = cur.fetchall() if (actpanorama1.count(rows2[0])==0) and (r<3): panorama1.append(idrecomendados[i]) actpanorama1.append(rows2[0]) r=r+1 else: if (actpanorama2.count(rows2[0])==0 and s<3): panorama2.append(idrecomendados[i]) 
actpanorama2.append(rows2[0]) s=s+1 else: if (actpanorama3.count(rows2[0])==0) and t<3: panorama3.append(idrecomendados[i]) actpanorama3.append(rows2[0]) t=t+1 print "PANORAMA1: " for i in range(len(panorama1)): print "id: ", panorama1[i] print "act: ", actpanorama1[i] print "PANORAMA2: " for i in range(len(panorama2)): print "id: ", panorama2[i] print "act: ", actpanorama2[i] print "PANORAMA3: " for i in range(len(panorama3)): print "id: ", panorama3[i] print "act: ", actpanorama3[i] return panorama1,panorama2,panorama3
def kMeansByPycluster(arr, k, numberToTry=20):
    """Cluster the rows of ``arr`` into ``k`` groups with Pycluster k-means.

    ``numberToTry`` is passed to ``kcluster`` as ``npass``.  Returns the
    two-element list ``[centroids, clusterid]``.
    """
    assignment = pc.kcluster(arr, nclusters=k, transpose=0,
                             npass=numberToTry, method='a', dist='e')[0]
    centres = pc.clustercentroids(arr, clusterid=assignment)[0]
    return [centres, assignment]
import Pycluster as pc import numpy as np import sys # Read data filename and desired number of clusters from command line filename, n = sys.argv[1], int( sys.argv[2] ) data = np.loadtxt( filename ) # Perform clustering and find centroids clustermap, _, _ = pc.kcluster( data, nclusters=n, npass=50 ) centroids, _ = pc.clustercentroids( data, clusterid=clustermap ) # Obtain distance matrix m = pc.distancematrix( data ) # Find the masses of all clusters mass = np.zeros( n ) for c in clustermap: mass[c] += 1 # Create a matrix for individual silhouette coefficients sil = np.zeros( n*len(data) ) sil.shape = ( len(data), n ) # Evaluate the distance for all pairs of points for i in range( 0, len(data) ): for j in range( i+1, len(data) ): d = m[j][i]
root = r"c:\Users\rony\python_workspace\CompVision\DataSets\cahana2_vid"

# Histogram and image of every file under ``root``.
lsth = [getHistogram(path) for path in files(root)]
lstf = [getImage(path) for path in files(root)]

# Tile size used to cut every image.
height = 81
width = 180
lstt = MakeTilesForList(lstf, height, width)

c = CompareTiledImageToAllList(lstt, 0)
t = [CompareTiledImageToAllList(lstt, i) for i in range(len(lstt))]

# min and average of the comparison results for each image
m = [min(i) for i in t]
avg = [np.mean(i) for i in t]

# Cluster the histograms into 10 groups.
close_center = {}
arr = np.array(lsth)
labels = Pycluster.kcluster(arr, nclusters=10, method='m')[0]
centroids, _ = Pycluster.clustercentroids(arr, clusterid=labels)

# For every cluster keep the index of the histogram with the smallest
# chiDiff to the cluster centroid.  ``idx`` replaces a loop variable that
# shadowed the builtin ``iter``.
for idx in range(len(lsth)):
    curr_label = labels[idx]
    if curr_label not in close_center:
        close_center[curr_label] = idx
    if (chiDiff(lsth[idx], centroids[curr_label]) <
            chiDiff(lsth[close_center[curr_label]], centroids[curr_label])):
        close_center[curr_label] = idx

print([files(root)[x] for x in close_center.values()])
args = (points, coefs,max_sub_deg) new_angle_test = np.zeros((nSig,2)) mpoints = np.zeros_like(points) for ii in range(nSig): x0 = cart2sphere(points[ii,0],points[ii,1],points[ii,2])[1:3] xopt = fmin(even_pODF_opt,x0,args=args,xtol = 0.00001, ftol = 0.00001, disp=0) new_angle_test[ii,:] = xopt mpoints[ii,:] = np.array([np.sin(new_angle_test[ii,0])*np.cos(new_angle_test[ii,1]), np.sin(new_angle_test[ii,0])*np.sin(new_angle_test[ii,1]), np.cos(new_angle_test[ii,0])]) #--------------------------------------------------------------------------- #--Start clustering -- maybe need to use a different set of nodes to evaluate pODF # nclusters = 4 (indx,error,nf) = pyc.kcluster(mpoints,nclusters=nclusters,npass=500,dist='u') (cmdata,cmmask) = pyc.clustercentroids(mpoints,np.ones_like(mpoints),indx) angles_clust[kk,0] = np.arccos(np.dot(cmdata[0,:],cmdata[1,:]))*180/np.pi angles_clust[kk,1] = np.arccos(np.dot(cmdata[0,:],cmdata[2,:]))*180/np.pi angles_clust[kk,2] = np.arccos(np.dot(cmdata[0,:],cmdata[3,:]))*180/np.pi angles_clust[kk,3] = np.arccos(np.dot(cmdata[1,:],cmdata[2,:]))*180/np.pi angles_clust[kk,4] = np.arccos(np.dot(cmdata[1,:],cmdata[3,:]))*180/np.pi angles_clust[kk,5] = np.arccos(np.dot(cmdata[2,:],cmdata[3,:]))*180/np.pi #end MC simulation angles_clust.sort() angles_mean.append(angles_clust[:,0].mean()) angles_std.append(angles_clust[:,0].std()) error += (np.abs(angles - np.array(angles_mean))).reshape((13,1))
def clusteringPlot(dataFile, clusterFile, outputPath):
    """Save two figures describing a clustering result.

    ``<time>_cluster_stat.png`` shows the cluster sizes in decreasing order;
    ``<time>_cluster_plot.png`` shows the (at most) 10 largest clusters and
    their centroids after a PCA projection.  ``<time>`` is the name of
    ``dataFile`` without directory (backslash separated) and extension.

    Returns ``[-1]`` when the data file does not yield a 2-D table, and
    None otherwise (also when there are fewer rows than columns, in which
    case only the first figure is written).
    """
    time = dataFile.split("\\")[-1].split(".")[0]

    data = np.genfromtxt(dataFile, delimiter=',')
    clusterid = np.genfromtxt(clusterFile, delimiter=',')

    if len(data.shape) == 1:
        return [-1]

    # Clusters ordered by decreasing number of points.
    counter = collections.Counter(clusterid)
    sortedCounter = sorted(counter.items(), key=itemgetter(1), reverse=True)
    # Plain int(): np.asscalar no longer exists in NumPy, and the former
    # int16 cast overflowed for clusters with more than 32767 points.
    usedClusters = [int(entry[0]) for entry in sortedCounter]
    freq = [int(entry[1]) for entry in sortedCounter]

    # Figure 1: distribution of the cluster sizes.
    fig1 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w',
                        edgecolor='k')
    pylab.xlim(-len(freq)/50, len(freq)+len(freq)/50)
    pylab.ylim(0, max(freq) + 1)
    pylab.plot(freq, marker='.', markersize=4)
    pylab.xlabel("Ranked clusters")
    pylab.ylabel("Number of points in the cluster")
    pylab.title("Figure of clusters points distribution in " + time)
    fig1.savefig(outputPath + "\\" + time + '_cluster_stat.png', dpi=80)
    pylab.close()

    # Keep the k largest clusters.
    k = min(10, len(sortedCounter))
    usedClusters = usedClusters[0:k]

    centroids, _ = pc.clustercentroids(data, clusterid=clusterid)

    # Fewer rows than columns: report the sizes and skip the PCA figure.
    if len(data) < len(data[0]):
        print(len(data))
        print(len(data[0]))
        return

    data_pca = mlab.PCA(data)
    cutoff = data_pca.fracs[1]
    data_2d = data_pca.project(data, minfrac=cutoff)
    centroids_2d = data_pca.project(centroids, minfrac=cutoff)

    color = ['#2200CC', '#D9007E', '#660066', '#FFFF00', '#FF6600', '#0099CC',
             '#8900CC', '#140A00', '#6B6B47', '#66FF66', '#FF99CC', '#0055CC']

    # Figure 2: the top k clusters in two dimensions.
    fig2 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w',
                        edgecolor='k')
    Legend = []
    for num, i in enumerate(usedClusters):
        members = clusterid == i
        Legend.append(pylab.scatter(data_2d[members, 0], data_2d[members, 1],
                                    color=color[num % 12]))
    pylab.plot(centroids_2d[usedClusters, 0], centroids_2d[usedClusters, 1],
               'sg', markersize=8)
    pylab.legend(Legend, list(range(1, k + 1)))
    pylab.xlabel("Feature 1")
    pylab.ylabel("Feature 2")
    pylab.title("Top " + str(k) + " clusters in " + time)
    fig2.savefig(outputPath + "\\" + time + '_cluster_plot.png', dpi=80)
    pylab.close()
def cluster_plot(data_file, cluster_file, output_path):
    """
    Draw the figure of the clusters (top 10 clusters in 2 dimensions).

    Two files are written to output_path: "<time>_cluster_stat.png" (cluster
    sizes in decreasing order) and "<time>_cluster_plot.png" (largest
    clusters and their centroids after PCA).  "<time>" is the name of
    data_file without directory (backslash separated) and extension.

    @param data_file: Data file in a month (CSV binary file).
    @param cluster_file: Clustering result file.
    @param output_path: Output path to save the figure.
    @return: [-1] when the data file does not yield a 2-D table, otherwise
             none.
    """
    # The time of the month, for example, 200311
    time = data_file.split("\\")[-1].split(".")[0]

    # Read data.
    data = numpy.genfromtxt(data_file, delimiter=',')
    cluster_id = numpy.genfromtxt(cluster_file, delimiter=',')

    # A 1-D result means the file held a single data point: nothing to plot.
    if len(data.shape) == 1:
        return [-1]

    # Sort the clusters by the number of points they contain.
    counter = collections.Counter(cluster_id)
    sorted_counter = sorted(counter.items(), key=itemgetter(1), reverse=True)
    # Plain int(): numpy.asscalar no longer exists in NumPy, and the former
    # int16 cast overflowed for clusters with more than 32767 points.
    used_clusters = [int(entry[0]) for entry in sorted_counter]
    freq = [int(entry[1]) for entry in sorted_counter]

    # fig1, distribution of the cluster sizes
    fig1 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w',
                        edgecolor='k')
    pylab.xlim(-len(freq) / 50, len(freq) + len(freq) / 50)
    pylab.ylim(0, max(freq) + 1)
    pylab.plot(freq, marker='.', markersize=4)
    pylab.xlabel("Ranked clusters")
    pylab.ylabel("Number of points in the cluster")
    pylab.title("Figure of clusters points distribution in " + time)
    fig1.savefig(output_path + "\\" + time + '_cluster_stat.png', dpi=80)
    pylab.close()

    # k is the number of top clusters to be shown
    k = min(10, len(sorted_counter))
    used_clusters = used_clusters[0:k]

    # Get the centroids for each cluster
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)

    # With fewer data points than dimensions the PCA below would fail, so
    # report the sizes and stop.
    if len(data) < len(data[0]):
        print(len(data))
        print(len(data[0]))
        return

    # PCA
    data_pca = mlab.PCA(data)
    cutoff = data_pca.fracs[1]
    data_2d = data_pca.project(data, minfrac=cutoff)
    centroids_2d = data_pca.project(centroids, minfrac=cutoff)

    color = ['#2200CC', '#D9007E', '#660066', '#FFFF00', '#FF6600', '#0099CC',
             '#8900CC', '#140A00', '#6B6B47', '#66FF66', '#FF99CC', '#0055CC']

    # fig2, figure of top k clusters in 2-D
    fig2 = pylab.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w',
                        edgecolor='k')
    legend = []
    for num, i in enumerate(used_clusters):
        members = cluster_id == i
        legend.append(pylab.scatter(data_2d[members, 0], data_2d[members, 1],
                                    color=color[num % 12]))
    pylab.plot(centroids_2d[used_clusters, 0], centroids_2d[used_clusters, 1],
               'sg', markersize=8)
    pylab.legend(legend, list(range(1, k + 1)))
    pylab.xlabel("Feature 1")
    pylab.ylabel("Feature 2")
    pylab.title("Top " + str(k) + " clusters in " + time)
    fig2.savefig(output_path + "\\" + time + '_cluster_plot.png', dpi=80)
    pylab.close()
#imprimirMatriz(MATRIZ_USER_VS_ITEM, 42,21) ##---------------------BORRAR--------------------- lugares=[] for i in range(42): lugares.append(i) #-------------------------------------------RECOMENDADOR------------------------------------------- #print "MATRIZ_USER_VS_ITEMEGLO", MATRIZ_USER_VS_ITEM clusterid, error, nfound = pc.kcluster(MATRIZ_USER_VS_ITEM, nclusters=2, transpose=0, npass=1, method='a', dist='e') centroids, algo = pc.clustercentroids(MATRIZ_USER_VS_ITEM, clusterid=clusterid) #print "clusterid", clusterid #A que cluster pertence cada vector buscar=clusterid[MATRIZ_USER_VS_ITEM.index(UserReco)] #BUSCAR = EL ID DEL CLUSTER AL QUE PERTENECE EL USER A RECOMENDAR # Crear una lista para almacenar solo los usuarios que sirvan para la recomendacion (los que estan en el cluster) var=[] for i in range(len(clusterid)): if clusterid[i]==buscar: var.append(i) ### USUARIOS QUE ESTAN EN EL CLUSTER PARA RECOMENDAR print "Elementos del cluster:", var POS_ITEMR=[] #POSICION/NUMERO DEL LUGAR A RECOMENDAR (I1)