def spectral(R, k, n):
    """Spectral clustering of the samples in R into k clusters.

    :param R: list of sample feature vectors
    :param k: number of clusters / eigenvectors kept
    :param n: unused in this body -- NOTE(review): possibly meant to replace
        the hard-coded 3 passed to minN below; confirm with callers.
    :return: result of kMeans.kMeans on the spectral embedding
    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    # Build an all-zero len(R) x len(R) matrix.
    W = []
    for i in range(0, len(R)):
        dist = []
        for j in range(0, len(R)):
            dist.append(0)
        W.append(dist)
    # Fill W symmetrically with pairwise distances.
    for i in range(0, len(R)):
        for j in range(i + 1, len(R)):
            W[j][i] = W[i][j] = distEnclud(R[i], R[j])
    # In-place transform of W by an external helper; semantics not visible here.
    minN(W, 3)
    # Eigendecomposition; keep eigenvectors of the k smallest eigenvalues.
    a, dist = numpy.linalg.eig(W)
    e = numpy.array(dist.T)
    idx = numpy.argsort(a)
    eigVec = []
    for i in range(0, k):
        eigVec.append(e[idx[i]])
    e = numpy.array(eigVec)
    e = e.T
    # Embed each sample: real parts rounded to 3 decimals, plus a trailing
    # 0 slot per row.
    # NOTE(review): placement of the trailing append(0) (per-row vs per-entry)
    # is inferred from the collapsed source -- confirm.
    final = []
    for i in range(0, len(e)):
        tmp = []
        for j in range(0, k):
            a = e[i][j].real
            tmp.append(float('%0.3f' % a))
        tmp.append(0)
        final.append(tmp)
    return kMeans.kMeans(final, k, 1)
def executeKMeans(dataTraining, dataTest):
    """Run k-means for k = 1..5 on min-max-scaled training data, collecting
    centroids, assignments and elbow values per k.

    :param dataTraining: raw training samples (2-D array-like)
    :param dataTest: unused here -- NOTE(review): confirm whether it was
        meant to feed an evaluation step.
    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    data = min_max_scaler.fit_transform(dataTraining)
    pontosTreino, dimensoes = data.shape
    centroidesPorK = []
    clusteredPorK = []
    clusteredNumbersPorK = []
    k_clusters = [1, 2, 3, 4, 5]
    elbow_values_plot = []
    for k in k_clusters:
        clustered, centroides = km.kMeans(data, k, dimensoes)
        # Cluster labels live in the extra column appended by km.kMeans.
        clusterNumbers = np.unique(clustered[:, dimensoes])
        centroidesPorK.append(centroides)
        clusteredPorK.append(clustered)
        clusteredNumbersPorK.append(clusterNumbers)
        # for ci in clusterNumbers:
        #     ci = clustered[clustered[:, 2] == ci]
        #     ci = ci[:, :2]
        #     cix = ci[:, 0]
        #     ciy = ci[:, 1]
        #     plt.plot(cix, ciy, color=np.random.random(3), marker='x', linestyle='')
        # plt.show()
        value = em.elbow_value(clustered, centroides, dimensoes)
        elbow_values_plot.append(value)
    print(clusteredPorK)
def biKmeans(dataSet, k):
    """Bisecting k-means: repeatedly 2-means-split the cluster whose split
    yields the lowest total SSE until k clusters exist.

    :param dataSet: data matrix (one sample per row)
    :param k: desired number of clusters
    :return: (np.mat of k centroids, m x 2 assignment matrix of
             [cluster index, squared error] per sample)
    """
    m = np.shape(dataSet)[0]  # number of samples
    clusterAssment = np.mat(np.zeros((m, 2)))
    # Start from a single centroid: the mean of all points.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        # BUG FIX: the original assigned to clusterAssment[:, 1] (the whole
        # column) on every pass, so every point ended up with the distance
        # of the *last* point; the per-point slot is [j, 1].
        clusterAssment[j, 1] = tools.distEclud(np.mat(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = np.inf
        # Trial-split each existing cluster and keep the best split.
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2)
            sseSplit = np.sum(splitClustAss[:, 1])          # SSE of the split part
            sseNotSplit = np.sum(                           # SSE of the untouched part
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel the 2-means output (0/1) to the split index and a new index.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # Replace the split centroid and append the new one.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Re-assign the split cluster's points and their SSE.
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return np.mat(centList), clusterAssment
def biKmeans(dataSet, k, distMeas=support.distEclud):
    """Bisecting k-means with a pluggable distance measure.

    :param dataSet: data matrix (one sample per row)
    :param k: desired number of clusters
    :param distMeas: distance function (defaults to Euclidean from support)
    :return: (list of centroids, m x 2 assignment matrix)
    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    m = np.shape(dataSet)[0]
    # Per sample: [assigned cluster index, squared error].
    clusterAssment = np.mat(np.zeros((m, 2)))
    # Single starting centroid: mean of all points.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.mat(centroid0), dataSet[j, :])**2
    while (len(centList) < k):
        lowestSSE = np.inf
        # Trial-split each cluster; keep the split with lowest combined SSE.
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(
                clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = np.sum(splitClustAss[:, 1])
            sseNotSplit = np.sum(
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel 0/1 split output to the split index and the next free index.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # NOTE(review): unlike sibling implementations, the new centroids are
        # stored as matrix rows (no .tolist()[0]); confirm downstream callers
        # expect that.
        centList[bestCentToSplit] = bestNewCents[0, :]
        centList.append(bestNewCents[1, :])
        clusterAssment[np.nonzero(
            clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return (centList), clusterAssment
def mainBayes():
    """Cluster the word arrays with 6-means, then score the test set against
    per-cluster Gaussian statistics (mean/variance) and predict a label.

    :return: label chosen by predict() over per-cluster probabilities
    NOTE(review): indentation reconstructed from a collapsed source line;
    variables `a` and `types` are computed but unused.
    """
    testset = getTestset()
    group,labels = word.getArrays()
    dateset = np.array(group)
    a =dateset.shape[0]
    k = 6
    countAll,count = kMeans.kMeans(dateset, k)
    types = kMeans.mainKMeans2()
    # Copy cluster counts and coerce them to int.
    Count =[]
    CountAll = []
    for i in range(len(count)):
        Count.append(count[i])
    for i in range(len(Count)):
        Count[i] = int(Count[i])
    for i in range(len(countAll)):
        CountAll.append(countAll[i])
    for i in range(len(CountAll)):
        CountAll[i] = int(CountAll[i])
    k = 6
    datesetC = kMeans.classifyDateset(dateset,countAll,count,k)
    # Per-cluster Gaussian statistics and test-set probabilities.
    Average = []
    Var = []
    Pro = []
    for i in range(k):
        Average.append(calAverage(datesetC[i]))
        Var.append(calVar(datesetC[i],Average[i]))
        Pro.append(calPro(testset, Average[i], Var[i]))
    a = len(count)
    rs =predict(Pro)
    label = getLabel(rs,labels,countAll)
    return label
def Recursive_kMeans(centroids, dataMat, K, loop):
    """Run kMeans.kMeans repeatedly, feeding each result's centroids back in.

    :param centroids: dict holding the current centroids under "centroids"
    :param dataMat: data matrix to cluster
    :param K: number of clusters
    :param loop: remaining iterations; recursion stops at 0
    :return: the final centroids structure returned by kMeans.kMeans
    """
    global nLOOP
    # Guard clause: no iterations left, hand back what we have.
    if loop <= 0:
        print("Recursive kMeans Loop: Finished.")
        return centroids
    loop -= 1
    print("Recursive kMeans Loop: %d times" % (nLOOP - loop))
    refined = kMeans.kMeans(centroids["centroids"], dataMat, K)
    return Recursive_kMeans(refined, dataMat, K, loop)
def testkMeans():
    """Smoke-test the km.kMeans implementation on the Mall_Customers dataset
    (income vs. spending score columns) and plot the clusters.

    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    print(
        "Testing kMeans clustering - Clustering the Mall_Customer Dataset by Income and Spending Score"
    )
    df = pd.read_csv("testData\Mall_Customers.csv", delimiter=',')
    # Encode gender as 0/1.
    # NOTE(review): the comparands are upper-case 'FEMALE'/'MALE'; confirm
    # the CSV uses that casing, otherwise these assignments match nothing.
    df.loc[(df.Gender == 'FEMALE'), 'Gender'] = 0
    df.loc[(df.Gender == 'MALE'), 'Gender'] = 1
    X = df.values
    # Cluster on columns 3:5.
    # NOTE(review): the meaning of (200, 5) depends on km.kMeans' signature
    # (iterations vs. k) -- confirm against the implementation.
    clusters = km.kMeans(X[:, 3:5], 200, 5)
    print("<-------------------->")
    plotKMeansPoints(X, clusters, 5)
def knn(self, filename, k, GenVector):
    """k-means-cluster the feature vectors extracted from `filename` and
    write the overall and per-cluster results plus a log under ../Data/.

    :param filename: path of the raw data file fed to GenVector
    :param k: number of clusters
    :param GenVector: callable that turns the file into a list of records
    NOTE(review): Python 2 code; indentation reconstructed from a collapsed
    source line.
    """
    #callinfo = self.GenVector_1(filename)
    dataSource = GenVector(filename)
    tmp_list = []
    for call in dataSource:
        # The leading columns are the call's id/code reference info; excluded.
        tmp_list.append(call[3:len(call)])
    dataArray = array(tmp_list)
    import kMeans
    centroids, clusterAssment = kMeans.kMeans(dataArray, k)
    clusterlist = clusterAssment.tolist()
    # Prepend each record's cluster assignment to the original record.
    list_knn = []
    for i in range(0, len(dataSource)):
        #f.write(callinfo[i][0])
        list_knn.append(clusterlist[i] + dataSource[i])
    # Timestamped result directory.
    resultdir = "../Data/knn_%s/" % time.strftime('%Y%m%d%H%M%S')
    os.mkdir(resultdir)
    f = open(resultdir + "knn_result.txt", 'w')
    for list_item in list_knn:
        f.write(str(list_item))
        f.write("\n")
    f.close()
    # One output file per cluster, counting members as we go.
    num = [0] * k
    for i in range(0, k):
        f_name = resultdir + "knn_result_%s.txt" % i
        f_k = open(f_name, 'w')
        for list_item in list_knn:
            if (list_item[0] == i):
                num[i] = num[i] + 1
                f_k.write(str(list_item))
                f_k.write("\n")
        f_k.close()
    # Human-readable summary (also written to log.txt).
    s_out = ""
    s_out += "k-均值聚类结束!\n"
    s_out += "数据源文件:\n\t%s\n" % filename
    s_out += "特征向量提取方法:\n\t%s\n" % GenVector
    s_out += "样本量:\n\t%d\n" % len(list_knn)
    s_out += "类别数量:\n\tk = %d \n" % k
    s_out += "结果文件为(位置:%s):\n\tlog.txt (执行日志)\n\tknn_result.txt (总的聚类结果)\n" % resultdir
    for i in range(0, k):
        s_out += "\tknn_result_%d.txt " % i
        s_out += " (l = %d)\n" % num[i]
    print s_out
    f = open(resultdir + "log.txt", 'w')
    f.write(s_out)
    f.close()
    return
def main():
    """Fit a Gaussian mixture on GMM_dataset.txt, seeded with k-means output.

    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    data = numpy.loadtxt('GMM_dataset.txt')
    # k-means with k=5; the meaning of r/t comes from the kMeans class'
    # signature (presumably restarts/iterations and tolerance) -- confirm.
    km = kMeans.kMeans(k=5, r=30, t=1e-03)
    km.clusterData(data)
    trueMean, trueCov = computeTrueValues(data)
    log({"true mean": trueMean})
    log({"true cov": trueCov})
    # Seed the GMM with the k-means centroids and cluster assignments.
    gmm = gMM(data, r=50, centroids=km.centroids, clusters=km.clusters, t=1e-03)
    gmm.fit()
    print("Gaussian Mixture Centroids - ", gmm.centroids)
    print("Gaussian Mixture Covariance - ", gmm.covs)
    gmm.plotGaussian(data)
def clustering():
    """Flask view: cluster the module-level dataFrame with a user-supplied k
    and render the analysis page with the resulting tables.

    NOTE(review): relies on module-level globals dataFrame and
    selectedFeatures defined elsewhere in the file; indentation
    reconstructed from a collapsed source line.
    """
    # k arrives as a form string; converted to int for the clustering call.
    k = request.form.get('k')
    clusters = kMeans(dataFrame, selectedFeatures, int(k))
    global dataFrame1
    dataFrame1 = getIdWithClusters(clusters, dataFrame)
    global dataFrame2
    dataFrame2 = describeData(selectedFeatures, dataFrame, clusters)
    return render_template("temp/analysis.html",
                           numberOfClusters=k,
                           data1=dataFrame1.to_html(index=False, table_id='ID1'),
                           data2=dataFrame2.to_html(table_id='ID2'))
def biKmeans(dataSet, k, distMeas=kMeans.distEclud):
    """
    Bisecting k-means: given a data set, a desired cluster count and a
    distance measure, return the clustering result.
    :param dataSet: data set
    :param k: desired number of clusters
    :param distMeas: distance measure
    :return: (matrix of centroids, per-sample assignment matrix)
    """
    m = shape(dataSet)[0]
    # One row per point: [assigned cluster index, squared error].
    clusterAssment = mat(zeros((m,2)))
    # Initialise the single starting centroid as the mean of all points.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    # The centroid list initially holds just that one centroid.
    centList =[centroid0]
    # Squared distance of every point to the initial centroid.
    for j in range(m):
        clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
    # Keep splitting until the requested number of clusters is reached.
    while (len(centList) < k):
        # Reset the best (lowest) SSE seen this round.
        lowestSSE = inf
        # Trial-split every existing cluster.
        for i in range(len(centList)):
            # Points currently assigned to cluster i.
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]
            # 2-means split of cluster i.
            centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2, distMeas)
            # Total squared error of the split part...
            sseSplit = sum(splitClustAss[:,1])
            # ...and of everything that did not take part in the split.
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            # The split minimising combined error wins; smaller is better.
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel the winning split's output: kMeans with 2 clusters yields
        # labels 0 and 1, which must become the split cluster's index and the
        # next free index.
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        print('the bestCentToSplit is: ',bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # Update the centroid list: replace centroid i with the split's first
        # centroid and append the second as a new cluster.
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
        centList.append(bestNewCents[1,:].tolist()[0])
        # Re-assign the split cluster's points (labels and SSE).
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss
    return mat(centList), clusterAssment
def optimalClustering():
    """Flask view: cluster with the automatically chosen optimal k and render
    the analysis page.

    NOTE(review): relies on module-level globals data, maxK, dataFrame and
    selectedFeatures defined elsewhere in the file; indentation
    reconstructed from a collapsed source line.
    """
    optimalK = getOptimalK(data, maxK)
    clusters = kMeans(dataFrame, selectedFeatures, optimalK)
    global dataFrame1
    dataFrame1 = getIdWithClusters(clusters, dataFrame)
    global dataFrame2
    dataFrame2 = describeData(selectedFeatures, dataFrame, clusters)
    return render_template("temp/analysis.html",
                           numberOfClusters=str(optimalK),
                           data1=dataFrame1.to_html(index=False, table_id='ID1'),
                           data2=dataFrame2.to_html(table_id='ID2'))
def main():
    """Cluster pre-trained word embeddings into 2700 groups.

    NOTE(review): Python 2 code (print statements); indentation
    reconstructed from a collapsed source line.
    """
    word_embedding_df = pd.read_hdf('rem_word_embedding.h5', 'df')
    word_embedding = mat(word_embedding_df.values)  # .values is an array
    print 'word_embedding is loaded'
    m = word_embedding.shape[0]
    n = word_embedding.shape[1]
    print m
    print n
    # Column 0 holds the word label; the remaining columns are the vector.
    word_embedding_label = word_embedding[:, 0]
    word_embedding_vector = word_embedding[:, 1:]
    # print word_embedding_label
    # print word_embedding_vector
    myCentriods, clustAssing = kMeans.kMeans(word_embedding_vector, 2700)
    print 'clustAssing:'
    print clustAssing
def startupRecognition():
    """UI callback: validate the entry fields, build a kMeans work object
    sized to the canvas, and run the recognition on a daemon thread.

    NOTE(review): relies on module-level UI globals (imageAmountEntry,
    classAmountEntry, btnProcess, canvasMain); indentation reconstructed
    from a collapsed source line.
    """
    imageAmount = int(imageAmountEntry.get())
    classAmount = int(classAmountEntry.get())
    # Guard: silently bail out when the requested workload is out of bounds.
    if not (1000 <= imageAmount <= 100000) or not (2 <= classAmount <= 20):
        return
    labelProcessing = startupUI(btnProcess)
    workObject = kMeans(imageAmount, classAmount,
                        canvasMain.winfo_width(), canvasMain.winfo_height())
    workObject.generateData()
    workObject.recognize()
    # Daemon thread so the UI can exit without waiting for the recount.
    workerThread = Thread(target=coresRecount,
                          args=(workObject, canvasMain, labelProcessing, btnProcess))
    workerThread.daemon = True
    workerThread.start()
    return
def biKmeans(dataSet, k, distMeas=distEclud): m = shape(dataSet)[0] # 这里第一列为类别,第二列为SSE clusterAssment = mat(zeros((m,2))) # 看成一个簇是的质心 centroid0 = mean(dataSet, axis=0).tolist()[0] centList =[centroid0] #create a list with one centroid for j in range(m): #计算只有一个簇是的误差 clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2 # 核心代码 while (len(centList) < k): lowestSSE = inf # 对于每一个质心,尝试的进行划分 for i in range(len(centList)): # 得到属于该质心的数据,其中clusterAssment[:,0].A==i相当于是去判断和i相等的数,并且按照下标给出等于则为true。nonzero(clusterAssment[:,0].A==i)返回需要的下标。 ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:] # 对该质心划分成两类 centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas) # 二分K-均值首次都是采用的随机给质心方式。 # 计算该簇划分后的SSE sseSplit = sum(splitClustAss[:,1]) # 没有参与划分的簇的SSE sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1]) print("sseSplit, and notSplit: ",sseSplit,sseNotSplit) # 寻找最小的SSE进行划分 # 即对哪一个簇进行划分后SSE最小 if (sseSplit + sseNotSplit) < lowestSSE: bestCentToSplit = i bestNewCents = centroidMat bestClustAss = splitClustAss.copy() lowestSSE = sseSplit + sseNotSplit # 较难理解的部分 bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #change 1 to 3,4, or whatever bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit print('the bestCentToSplit is: ',bestCentToSplit) print('the len of bestClustAss is: ', len(bestClustAss)) centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0] #replace a centroid with two best centroids' print(bestNewCents[0, :].tolist()[0]) centList.append(bestNewCents[1,:].tolist()[0]) clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss #reassign new clusters, and SSE,这里是把正在拆分的簇全部替换。 return mat(centList), clusterAssment
def Spectral_Clustering(D, k, ratio=True, sig=1):
    """Spectral clustering of the n x d data matrix D into k clusters.

    :param D: n x d data matrix (one sample per row)
    :param k: number of clusters / eigenvectors kept
    :param ratio: True for ratio cut (use L directly), False for normalized
        cut (use deg^-1 L)
    :param sig: bandwidth passed to the external similarity() kernel
    :return: result of kMeans on the normalized spectral embedding
    """
    # Get dimensions of nxd D matrix
    n, d = D.shape
    # Compute nxn adjacency (similarity) matrix; diagonal stays 0.
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j:
                A[i][j] = similarity(D[i, :], D[j, :], sig)
    # Degree matrix: row sums of A on the diagonal.
    deg = np.identity(n) * np.sum(A, axis=1)
    # Laplacian.
    L = deg - A
    # Set B according to ratio cut / normalized cut.
    if ratio:
        B = L
    else:
        B = np.linalg.inv(deg) @ L
    # Compute eigenvalues and eigenvectors of B.
    w, v = np.linalg.eig(B)
    # BUG FIX: np.linalg.eig returns eigenpairs in no particular order, so
    # taking v[:, :k] picked k arbitrary eigenvectors. Spectral clustering
    # needs the eigenvectors of the k *smallest* eigenvalues; sort first.
    # .real discards numerically-spurious imaginary parts (B may be
    # non-symmetric in the normalized-cut branch).
    order = np.argsort(w.real)
    v = v[:, order[:k]].real
    # Row-normalize the reduced basis to obtain the embedded dataset.
    Y = np.zeros((n, k))
    for i in range(n):
        norm = np.sqrt(np.sum(v[i, :] ** 2))
        # Guard against a zero row (would otherwise divide by zero).
        Y[i, :] = v[i, :] / norm if norm > 0 else v[i, :]
    # Run kMeans on the new dataset.
    return kMeans(Y, k)
#!/usr/bin/env python __coding__ = "utf-8" __author__ = "Ng WaiMing" from kMeans import kMeans from kMeans import loadDataSet from kMeans import randCent from kMeans import distEclud from kMeans import biKmeans from numpy import * if __name__ == '__main__': dataMat = mat(loadDataSet('testSet.txt')) print('min(dataMat[:, 0])', min(dataMat[:, 0]), '\n') print('min(dataMat[:, 1])', min(dataMat[:, 1]), '\n') print('max(dataMat[:, 0])', max(dataMat[:, 0]), '\n') print('max(dataMat[:, 1])', max(dataMat[:, 1]), '\n') print(randCent(dataMat, 2), '\n') print(distEclud(dataMat[0], dataMat[1])) centroids, clusterAssment = kMeans(dataMat, 4) print('centroids:\n', centroids, '\n') print('clusterAssment:\n', clusterAssment, '\n') dataMat3 = mat(loadDataSet('testSet2.txt')) centList, myNewAssments = biKmeans(dataMat3, 3) print('centList: \n', centList, '\n') # fileName = '../../../../data/k-means/places.txt' # imgName = '../../../../data/k-means/Portland.png' # kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
import kMeans as km # reload(kMeans) # Call if any changes have been made to kMeans.py import sklearn.cluster as sklearn # Create data dataSet = km.createDataSet(10,50,5,10,1) # Cluster numbers to test Nclusters = 12 # Slightly larger than true cluster number Num_iters = 50 # Test my implementation start_time = time.clock() for i in range(Num_iters): final_cluster_pos, cost = km.kMeans(dataSet, Nclusters) avg_run_time = (time.clock() - start_time)/Num_iters print "Average run time per my kMeans iteration is", avg_run_time, "s" # Test scikit-learn kMeans start_time_scikit = time.clock() skm = sklearn.KMeans(init='random', n_clusters=Nclusters, n_init=Num_iters) skm.fit(dataSet) avg_run_time_scikit = (time.clock() - start_time_scikit)/Num_iters print "Average run time per sklearn kMeans iteration is", avg_run_time_scikit, "s"
# Inline bisecting k-means over dataMat (k=3).
# NOTE(review): this chunk appears truncated -- the while-loop body below
# never updates centList (the replace/append/re-assign lines are missing),
# so as written the loop cannot terminate. Indentation reconstructed.
dataSet = mat(dataMat)
k = 3
m = shape(dataSet)[0]
clusterAssment = mat(zeros((m, 2)))
centroid0 = mean(dataSet, axis=0).tolist()[0]
centList = [centroid0]  #create a list with one centroid
for j in range(m):  #calc initial Error
    clusterAssment[j, 1] = distEclud(mat(centroid0), dataSet[j, :])**2
while (len(centList) < k):
    lowestSSE = inf
    for i in range(len(centList)):
        ptsInCurrCluster = dataSet[nonzero(
            clusterAssment[:, 0].A == i)[0], :]  #get the data points currently in cluster i
        centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distEclud)
        sseSplit = sum(
            splitClustAss[:, 1])  #compare the SSE to the currrent minimum
        sseNotSplit = sum(
            clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
        print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
        if (sseSplit + sseNotSplit) < lowestSSE:
            bestCentToSplit = i
            bestNewCents = centroidMat
            bestClustAss = splitClustAss.copy()
            lowestSSE = sseSplit + sseNotSplit
    bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0],
                 0] = len(centList)  #change 1 to 3,4, or whatever
    bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
    print('the bestCentToSplit is: ', bestCentToSplit)
    print('the len of bestClustAss is: ', len(bestClustAss))
# Driver: exercise randCent/distEclud/kMeans/biKmeans on the test sets.
# NOTE(review): indentation reconstructed from a collapsed source line.
if __name__ == "__main__":
    import kMeans
    dataMat = mat(kMeans.loadDataSet('testSet.txt'))
    #print("\ndataMat:\n", dataMat)
    #print("\n(dataMat[,:0]):\n", dataMat[:, 0])
    print("\nmin(dataMat[,:0]):", min(dataMat[:, 0]))
    #print("\n(dataMat[,:1]):\n", dataMat[:, 1])
    print("\nmin(dataMat[,:1]):", min(dataMat[:, 1]))
    print("\nrandCent(dataMat,2):\n", randCent(dataMat, 2))
    print("\ndistEclud(dataMat[0],dataMat[1]:\n",
          distEclud(dataMat[0], dataMat[1]))
    myCentroids, clusterAssing = kMeans.kMeans(dataMat, 6)
    print("\nmyCentroids:\n", myCentroids, "\nclusterAssing:\n", clusterAssing)
    #################################
    print("\n#################biMeans:#########################\n")
    dataMat3 = mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(dataMat3, 3)
    print("\ncentList:\n", centList)
    #################################
    print("\n#################Yahoo:#########################\n")
    # geoResults = kMeans.geoGrab('1 VA Center', 'Augusta, ME')
    # print("geoResults:\n",geoResults)
    #print(massPlaceFind('portlandClubs.txt'))
    print("\n#################Clubs:#########################\n")
# NOTE(review): this chunk starts mid-file -- the first three statements
# belong to a loop over a 'gFr' file handle that is not visible here.
# Indentation reconstructed.
categoryG.append(1)
gDataSet.append(b)
gFr.close()
# Load the MNIST-style data: last field of each row is the class label.
with open('mnist.txt', 'r') as mFr:
    for line in mFr:
        a = line.split(',')
        b = []
        for item in a:
            b.append(float(item))
        categoryM.append(b[-1])
        mDataSet.append(b)
mFr.close()
# Evaluate k-means, NMF and spectral clustering against the true labels.
calculate.calculate(kMeans.kMeans(gDataSet, 2), categoryG, 2)
calculate.calculate(kMeans.kMeans(mDataSet, 10), categoryM, 10)
calculate.calculate(nmf.NMF(gDataSet, 2), categoryG, 2)
# NOTE(review): categoryG is paired with mDataSet here -- likely a
# copy-paste slip for categoryM; confirm before relying on this score.
calculate.calculate(nmf.NMF(mDataSet, 10), categoryG, 10)
calculate.calculate(spectral.spectral(gDataSet, 2, 3), categoryG, 2)
calculate.calculate(spectral.spectral(gDataSet, 2, 6), categoryG, 2)
calculate.calculate(spectral.spectral(gDataSet, 2, 9), categoryG, 2)
calculate.calculate(spectral.spectral(mDataSet, 10, 3), categoryM, 10)
#!/usr/bin/env python #-*- coding: UTF-8 -*- import kMeans from numpy import * dataMat=mat(kMeans.loadDataSet('testSet.txt')) kMeansRandCenter=kMeans.randCent(dataMat,2) # 两个中心 print(kMeansRandCenter) centroids,clusterAssment=kMeans.kMeans(dataMat,5) import matplotlib.pyplot as plt fig=plt.figure(1) plt.plot(centroids[:,0],centroids[:,1],'ro') plt.plot(dataMat[:,0],dataMat[:,1],'bo') plt.axis([-8,8,-8,8]) # plt.show() kMeans.binaryKeans(dataMat,3) dataMat3=mat(kMeans.loadDataSet('testSet2.txt')) centList,Assments=kMeans.binaryKeans(dataMat3,3) print("centList:",centList) print("Assments:",Assments) fig=plt.figure(2) plt.plot(dataMat3[:,0],dataMat3[:,1],'bo') plt.plot(centList[:,0],centList[:,1],'ro') plt.axis([-10,10,-10,10]) # plt.show()
plt.ion()


def show(X, C, centroids, keep = False):
    """Redraw the scatter of points coloured by cluster (0/1/2) plus the
    current centroids; on the final call (keep=True) block on plt.show()."""
    import time
    time.sleep(0.5)  # slow the animation down so each step is visible
    plt.cla()
    plt.plot(X[C == 0, 0], X[C == 0, 1], '*b',
             X[C == 1, 0], X[C == 1, 1], '*r',
             X[C == 2, 0], X[C == 2, 1], '*g')
    plt.plot(centroids[:,0],centroids[:,1],'*m',markersize=20)
    plt.draw()
    if keep :
        plt.ioff()
        plt.show()


# generate 3 cluster data
# data = np.genfromtxt('data1.csv', delimiter=',')
# NOTE(review): cov1 and cov3 below are not symmetric matrices;
# numpy.random.multivariate_normal expects a symmetric PSD covariance and
# will warn -- confirm the intended covariances.
m1, cov1 = [9, 8], [[1.5, 2], [1, 2]]
m2, cov2 = [5, 13], [[2.5, -1.5], [-1.5, 1.5]]
m3, cov3 = [3, 7], [[0.25, 0.5], [-0.1, 0.5]]
data1 = np.random.multivariate_normal(m1, cov1, 250)
data2 = np.random.multivariate_normal(m2, cov2, 180)
data3 = np.random.multivariate_normal(m3, cov3, 100)
X = np.vstack((data1,np.vstack((data2,data3))))
np.random.shuffle(X)
from kMeans import kMeans
centroids, C = kMeans(X, K = 3, plot_progress = show)
show(X, C, centroids, True)
# Python 2 driver: data-range checks, 4-means, then bisecting k-means.
import kMeans
from numpy import *
datMat = mat(kMeans.loadDataSet('testSet.txt'))
print min(datMat[:,0])
print min(datMat[:,1])
print max(datMat[:,0])
print max(datMat[:,1])
print kMeans.randCent(datMat, 2)
print kMeans.distEclud(datMat[0], datMat[1])
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
#print myCentroids, clustAssing
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print centList
# Demo: cluster make_blobs data with kMeans (k-means++ seeding) and scatter
# plot the result coloured by cluster.
x = np.random.randn(10)
y = np.random.randn(10)
Cluster = np.array([0, 1, 1, 1, 3, 2, 2, 3, 0, 2])  # Labels of cluster 0 to 3
centers = np.random.randn(3, 2)
# NOTE(review): the triple-quote tokens below are unbalanced in the source
# as extracted; the dataMat block was presumably commented out in the
# original file. Preserved verbatim.
''' '''
dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
x = dataMat[:,0]
y = dataMat[:,1]
'''
from sklearn.datasets.samples_generator import make_blobs
dataSet, _ = make_blobs(n_samples=100, centers=3, n_features=2, random_state=0)
dataSet = np.mat(dataSet)
centers, clusterAssgn = kMeans.kMeans(dataSet=dataSet, k=4, createCent=kmeanspp.createCent)
#centers,clusterAssgn = kMeans.kMeans(dataSet=dataMat,k=4)
x = dataSet[:, 0]
y = dataSet[:, 1]
Cluster = np.array(clusterAssgn[:, 0])
print centers
print 'cluster:', Cluster
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(x, y, c=Cluster, s=50)  #s parameter shows how big will be the plus symbol
centers = np.mat(centers)
# NOTE(review): chunk is truncated -- the loop body continues beyond the
# visible source.
for ele in centers:
    i = ele[0, 0]
# -*- coding: UTF-8 -*- # kMeans算法测试 # 运行环境: python3 from numpy import * import kMeans print("loading data...") dataSet = mat(kMeans.loadDataSet('testSetForKMeans.txt')) k = 4 centroids, clusterAssment = kMeans.kMeans(dataSet, k) print("show the result...") kMeans.showCluster(dataSet, k, centroids, clusterAssment)
import csv
import numpy as np
import kMeans as kMeans

# Load the medical data CSV (dropping the header row and the first column)
# and run the kMeans classifier over it.
with open('../Medical_data.csv', 'r') as csv_file:
    csv_reader = list(csv.reader(csv_file, delimiter=","))
    # NOTE(review): the file handle is already at EOF here, so this
    # DictReader yields nothing; it is also never used.
    csv_dicReader = csv.DictReader(csv_file)
    my_data = np.array(csv_reader)
##following the data for Medical_data.csv for kMeans
# NOTE(review): np.float is removed in NumPy >= 1.24; plain `float` is the
# equivalent replacement.
new_data = np.array(my_data[1:, 1:], dtype=np.float)
kMeansClassfier = kMeans.kMeans(3, new_data.shape[0], new_data.shape[1])  ##k,n,d 3000,3
kMeansClassfier.showValues()
kMeansClassfier.classify(new_data)
# Driver: random centroids, 4-means, then bisecting k-means (results mostly
# discarded; the commented prints were debugging aids).
import kMeans
from numpy import *
dataMat = mat(kMeans.loadDataSet('testSet.txt'))
# print dataMat
randMat = kMeans.randCent(dataMat, 2)
# print dataMat[:, 0]
# print randMat
res = kMeans.kMeans(dataMat, 4)
# print res
dataMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
kMeans.biKmeans(dataMat3, 3)
# centList, myNewAssments =
# Cluster taxi pickup coordinates (Python 2) into 20 groups and write the
# centroids plus per-cluster pickup files.
import kMeans
from numpy import *
pickup_file_dir = '/home/donghao/ITS_project/taxi_finder/data/data_pickup/kMeans/'
pickup_filename = 'pickup_8-0_8-30.txt'
datMat = mat(kMeans.loadDataSetFile(pickup_file_dir + pickup_filename))
print 'begin k-means clustering'
Centroids, clustAssing = kMeans.kMeans(datMat, 20)
print 'finish k-means clustering'
datMat_list = datMat.tolist()
Centroids_list = Centroids.tolist()
clustAssing_list = clustAssing.tolist()
# Output suffix derived from the input file name, e.g. '8-0_8-30'.
filename_suffix = pickup_filename.split('_')[1] + '_' + pickup_filename.split(
    '_')[2].split('.')[0]
centroid_f = open(pickup_file_dir + 'centroids_' + filename_suffix + '.txt',
                  'w')
for centroid in Centroids_list:
    centroid_f.write(str(centroid[0]) + ',' + str(centroid[1]) + '\n')
cluster_f = open(
    pickup_file_dir + 'pickup_cluster_' + filename_suffix + '.txt', 'w')
centroids_number = len(Centroids_list)
centroid_number = 0
while centroid_number < centroids_number:
    print 'centroid_number:', centroid_number
    for i in range(len(clustAssing_list)):
        if int(clustAssing_list[i][0]) == centroid_number:
            # NOTE(review): chunk is truncated here -- the write() call's
            # argument continues beyond the visible source.
            cluster_f.write(
                str(centroid_number) + ',' + str(datMat_list[i][0]) + ',' +
# NOTE(review): this chunk begins mid-definition -- the statements below are
# the body of Recursive_kMeans, whose `def` line is not visible here.
# Indentation reconstructed.
global nLOOP
if loop > 0:
    loop -= 1
    print("Recursive kMeans Loop: %d times" %(nLOOP - loop))
    new_centroids = kMeans.kMeans(centroids["centroids"], dataMat, K)
    return Recursive_kMeans(new_centroids, dataMat, K, loop)
else:
    print("Recursive kMeans Loop: Finished.")
    return centroids

#print (Recursive_kMeans(firstcent, Indexed_DataMat, K, nLOOP))
print (firstcent)
print (kMeans.kMeans(firstcent, Indexed_DataMat, K))
# Plot the data and the initial centroids, then the refined centroids.
setPlot((Indexed_DataMat), hotelcolor)
setPlot(firstcent, 'b+')
plt.show()
plt.axis([min_X, max_X, min_Y, max_Y])
plt.ylabel('hotels')
setPlot((Indexed_DataMat), hotelcolor)
loadedfirstcent = {"centroids": firstcent}
setPlot(Recursive_kMeans(loadedfirstcent, Indexed_DataMat, K, nLOOP)["centroids"], 'r+')
plt.show()
import matplotlib.pyplot as plt


def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot 2-D samples coloured per cluster and overlay the
    centroids. Returns 1 (without plotting) when the data is not 2-D or k
    exceeds the available marker styles."""
    m, dim = shape(dataSet)
    if dim != 2:
        print("Sorry! i can not draw because the dimension of data is not 2!")
        return 1
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large!")
        return 1
    # draw all samples
    for i in range(m):
        # Pick the colour/marker matching this sample's assigned cluster.
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids
    for i in range(k):
        # marker sets the centroid symbol; color/markersize its appearance.
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], marker='+',
                 color='red', markersize=18)
    plt.show()


datMat=mat(kMeans.loadDataSet('../data/kMeans_testSet.txt'))
clusterCenters,clusterAssment = kMeans.kMeans(datMat,4)
showCluster(datMat,4,clusterCenters,clusterAssment)
def clusteringAct(dataArray):
    """Interactively cluster samples by a user-chosen metadatum: partition by
    metadatum value, run K-means with two different distances, compare the
    result to the metadatum partition, and optionally save/graph it.

    NOTE(review): Python 2 code (print statements, raw_input); indentation
    reconstructed from collapsed source chunks.
    """
    print dataArray[1]
    metadatum = sanitize(
        raw_input(
            "Select the metadatum among those above to cluster the set of samples. [e.g. "
            + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    valueSet, clusters1 = partitionSampleByMetadatumValue(
        metadatum, dataArray[1], dataArray[0])
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(
            clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(
        trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    # Sanity check: every sample must be in exactly one cluster.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(
            dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass,
                                                     kClusters, startSet,
                                                     dataArray[9], dataArray)
    #,meanSamples)
    print "-- End of second clustering --"
    # Second sanity check: cleaning may only have removed samples.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(
            dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[
            i - 1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(
            kClustersCopy), len(clustersCopy), "."
        raise ValueError
    # Pairwise comparison of K-means clusters vs. metadatum clusters; a
    # falsy comparison aborts the score (reported as "None").
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        compareClusterScore = compareClusterScore / numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input(
            "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n"
        )
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        # Assemble the report text and write it out.
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(
            valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(
                i + 1) + " associated to " + metadatum + " = " + str(
                    valueSet[i])
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        answer2 = raw_input(
            "Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict,
                                             len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
# Python 2 driver: run 4-means over testSet.txt and print the centroids.
import kMeans
from numpy import *
dataMat = mat(kMeans.loadDataSet('testSet.txt'))
# print min(dataMat[:,0])
#
# print(kMeans.randCent(dataMat,2))
#
# print(kMeans.distEclud(dataMat[0],dataMat[1]))
myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
print myCentroids
# NOTE(review): this chunk starts mid-function -- the first lines are the
# tail of a kMeans implementation whose `def` is not visible here.
    if plot_progress != None:
        plot_progress(X, C, np.array(centroids))
    return np.array(centroids), C


def show(X, C, centroids, keep=False):
    """Redraw the cluster scatter plot (clusters 0/1/2) plus centroids; on
    the final call (keep=True) block on plt.show()."""
    import time
    time.sleep(0.5)  # slow the animation down so each step is visible
    plt.cla()
    plt.plot(X[C == 0, 0], X[C == 0, 1], '*b',
             X[C == 1, 0], X[C == 1, 1], '*r',
             X[C == 2, 0], X[C == 2, 1], '*g')
    plt.plot(centroids[:, 0], centroids[:, 1], '*m', markersize=20)
    plt.draw()
    if keep:
        plt.ioff()
        plt.show()


# generate 3 cluster data
# data = np.genfromtxt('data1.csv', delimiter=',')
# NOTE(review): cov1 and cov3 are not symmetric; multivariate_normal expects
# a symmetric PSD covariance and will warn -- confirm intent.
m1, cov1 = [9, 8], [[1.5, 2], [1, 2]]
m2, cov2 = [5, 13], [[2.5, -1.5], [-1.5, 1.5]]
m3, cov3 = [3, 7], [[0.25, 0.5], [-0.1, 0.5]]
data1 = np.random.multivariate_normal(m1, cov1, 250)
data2 = np.random.multivariate_normal(m2, cov2, 180)
data3 = np.random.multivariate_normal(m3, cov3, 100)
X = np.vstack((data1, np.vstack((data2, data3))))
np.random.shuffle(X)
from kMeans import kMeans
centroids, C = kMeans(X, K=3, plot_progress=show)
show(X, C, centroids, True)
def showFigure(dataMat, k, clusterAssment):
    """Plot each cluster as a horizontal band and dump assignments to Excel.

    dataMat        -- data matrix, one sample per row
    k              -- number of clusters; must not exceed len(tag) (6)
    clusterAssment -- matrix whose column 0 holds each sample's cluster id

    NOTE(review): writes into the module-level `sheet1` worksheet created in
    the __main__ block below — confirm call order before reusing elsewhere.
    """
    # One marker/colour style per cluster index.
    tag = ['go', 'or', 'yo', 'ko', 'bo', 'mo']
    for i in range(k):
        # Rows of dataMat assigned to cluster i.
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        # y-value is the cluster index itself, so each cluster plots as a band.
        c = mat(i * ones((len(datalist), 1)))
        pylab.plot(datalist[:, 0], c, tag[i])
    pylab.show()
    # Write each sample's value (col 0) and its cluster tag (col 1), one row each.
    row = 0
    for i in range(k):
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        for j in range(len(datalist)):
            sheet1.write(row, 0, datalist[j, 0])
            #sheet1.write(row, 1, datalist[j,1])
            sheet1.write(row, 1, tag[i])
            row += 1


if __name__ == '__main__':
    outputfilename = 'D:\\code\\KM\\res.xls'
    outputfile = xlwt.Workbook()
    # Module-level worksheet consumed by showFigure() above.
    sheet1 = outputfile.add_sheet('sheet1', cell_overwrite_ok=True)
    k = 6
    dataMat = mat(kMeans.loadDataSet('D:\\code\\KM\\site.txt'))
    myCentroids, clusterAssment = kMeans.kMeans(dataMat, k)
    showFigure(dataMat, k, clusterAssment)
    outputfile.save(outputfilename)
def clusteringAct(dataArray):
    """Interactively run a two-pass K-means clustering over the sample set.

    dataArray is the project-wide data bundle; indices used here (per the
    #@ notes below): [0] sample data, [1] metadata, [3] filenames,
    [8] distMatchedDict, [9] distConsensusDict.

    Prompts the user for a metadatum, partitions samples by its values to
    obtain k and the starting samples, clusters twice with two different
    distance dictionaries, scores the K-means result against the
    metadatum-value partition, and optionally saves a report and a DOT graph.
    Raises ValueError on any internal consistency failure.
    """
    print dataArray[1]
    # Ask which metadatum to cluster on; keep only the part before ';'.
    metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    # Partition samples by the distinct values of the chosen metadatum.
    valueSet, clusters1 = partitionSampleByMetadatumValue(metadatum, dataArray[1], dataArray[0])
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]  #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    # Sanity check: every sample must have been assigned to exactly one cluster.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass, kClusters, startSet, dataArray[9], dataArray)  #,meanSamples)
    print "-- End of second clustering --"
    # Second pass may legitimately keep fewer samples (some were cleaned out),
    # hence <= rather than == here.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[i-1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(kClustersCopy), len(clustersCopy), "."
        raise ValueError
    # Pair off K-means clusters with metadatum clusters and accumulate scores.
    # A single non-truthy comparison aborts the whole score (set to None).
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        # Average the per-cluster scores over the number of classes.
        compareClusterScore = compareClusterScore/numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n")
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        # Build the textual report: header, then one section per cluster.
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i])
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                # Common-node sets exist only when the user asked for them above.
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        # Optionally emit a DOT graph of the clusters.
        answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict, len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
# NOTE(review): this chunk opens mid-list — the literal below is the tail of a
# dataset-building function whose `def` lies above this view.
        "D1k3",
        "K2k4",
        "D1k5",
        "D1k6",
    ]
    print(dataset)
    return dataset


def testResults(subband, output):
    """Return the fraction of samples the saved MLP model matches to `output`.

    subband -- collection of samples, run through pre_process() first
    output  -- '1' to report the model's ratio directly, anything else for
               the complementary ratio
    """
    newDataSet = pre_process(subband)
    # NOTE(review): presumably a count of matching predictions from the
    # persisted model file — confirm against mlpnn.predictModel.
    newModel = mlpnn.predictModel("my_model.h5", newDataSet)
    if output == '1':
        return newModel / len(subband)
    else:
        return (len(subband) - newModel) / len(subband)


# Instantiate the project helpers (note: these rebind, and thus shadow, the
# imported module names DWT/kMeans/mlpnn).
DWT = DWT.DWT()
kMeans = kMeans.kMeans()
mlpnn = mlpnn.MLPNN()
# Load one recording; the file handle is never closed — consider `with`.
testing = open(
    r"C:\Users\Nathan Joseph\Desktop\CPEG498\SortedData\eyes closed\O001.txt")
testing = testing.read().split('\n')
# Drop the last entry (typically empty after a trailing newline — verify).
testing.pop()
testing = np.array(testing)
# Split the recording into 17 equal sub-bands.
testing = np.split(testing, 17)
print(len(testing))
print(testResults(testing, '1'))
import kMeans as km

# Build the data set and the matching labels from the CSV file.
dataset = km.genDataSet("dataset1.csv")
labels = km.genLabels("dataset1.csv", 2000)

# Min-max scale the features so every dimension contributes equally.
scalers = km.findminmax(dataset)
scaleddata = km.scale(dataset, scalers)

# Elbow search over k: run k-means 10 times per k to average out the
# randomness of the initial centroid choice.
# (Fix: removed the dead `results = []` / `tmp = 0` initialisations that
# were immediately overwritten inside the loop.)
for k in range(2, 50):
    results = []
    for _ in range(10):
        clusters, _, _ = km.kMeans(scaleddata, k, labels)
        # Aggregate intra-cluster distance over all clusters and members.
        total = sum(member[0] for cluster in clusters for member in cluster)
        results.append(total)
    # Keep the run with the lowest aggregate intra-cluster distance and
    # report it normalised by k.
    best = min(results)
    print(k, best / k)