def test1():
    """Smoke-test the project's kMeans module end to end.

    Loads ``testSet.txt``, exercises ``randCent`` and ``distEclud``,
    runs bisecting k-means with k=4 and plots the resulting centroids.
    Requires the project-local ``kMeans`` module and the data file on disk.
    """
    # Load the sample data set into a numpy matrix.
    datMat = mat(kMeans.loadDataSet('testSet.txt'))
    # Column minimum — a sanity check that the data loaded correctly.
    print(min(datMat[:, 0]))
    # Random centroids should fall inside the data's bounding box.
    print(kMeans.randCent(datMat, 2))
    print(kMeans.distEclud(datMat[0], datMat[1]))
    # myCentroids, clustAssing = kMeans.kMeans(datMat, 3)
    myCentroids, clustAssing = kMeans.biKmeans(datMat, 4)
    print(myCentroids)
    kMeans.plot1(datMat, myCentroids)
#!/usr/bin/env python
__coding__ = "utf-8"
__author__ = "Ng WaiMing"

from kMeans import kMeans
from kMeans import loadDataSet
from kMeans import randCent
from kMeans import distEclud
from kMeans import biKmeans
from numpy import *

if __name__ == '__main__':
    # Load the first sample data set and report its coordinate bounds;
    # randCent's output should lie inside this bounding box.
    dataMat = mat(loadDataSet('testSet.txt'))
    print('min(dataMat[:, 0])', min(dataMat[:, 0]), '\n')
    print('min(dataMat[:, 1])', min(dataMat[:, 1]), '\n')
    print('max(dataMat[:, 0])', max(dataMat[:, 0]), '\n')
    print('max(dataMat[:, 1])', max(dataMat[:, 1]), '\n')
    print(randCent(dataMat, 2), '\n')
    print(distEclud(dataMat[0], dataMat[1]))

    # Plain k-means with k=4 on the first data set.
    centroids, clusterAssment = kMeans(dataMat, 4)
    print('centroids:\n', centroids, '\n')
    print('clusterAssment:\n', clusterAssment, '\n')

    # Bisecting k-means with k=3 on the second data set.
    dataMat3 = mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(dataMat3, 3)
    print('centList: \n', centList, '\n')

    # fileName = '../../../../data/k-means/places.txt'
    # imgName = '../../../../data/k-means/Portland.png'
    # kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
"""Driver script: exercise kMeans / biKmeans on the book's sample data sets."""
import kMeans
from numpy import *

# Fix: the original used Python 2 `print x` statements, which are a
# SyntaxError under Python 3.  `print(x)` with a single argument behaves
# identically on both major versions.
datMat = mat(kMeans.loadDataSet('testSet.txt'))
# Coordinate bounds of the data — random centroids must fall inside them.
print(min(datMat[:, 0]))
print(min(datMat[:, 1]))
print(max(datMat[:, 0]))
print(max(datMat[:, 1]))
print(kMeans.randCent(datMat, 2))
print(kMeans.distEclud(datMat[0], datMat[1]))

# Plain k-means with k=4 (result not guaranteed to be the global optimum).
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
# print(myCentroids, clustAssing)

# Bisecting k-means with k=3 on the second data set.
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print(centList)
"""Hand-rolled k-means on testSet2.txt, then scatter-plot the centroids.

Fixes over the original: Python 2 `print` statements converted to the
version-neutral `print(...)` call, and the convergence test rewritten as a
plain `!=` comparison instead of `.__eq__(...) != True`.
"""
import time

st = time.time()
k = 3  # number of clusters
dataMat = mat(kMeans.loadDataSet('testSet2.txt'))
# Start the two label arrays different so the loop body runs at least once.
oldClassLabel = zeros(len(dataMat), int)
newClassLabel = ones(len(dataMat), int)
center = kMeans.randCent(dataMat, k)
dist = []
m = 0  # iteration counter
# Keep reassigning points until the labels stop changing between passes.
while newClassLabel.tolist() != oldClassLabel.tolist():
    m += 1
    for di in range(len(dataMat)):
        # Distance from point di to every current centroid.
        dist = [kMeans.distEclud(dataMat[di], center[ci])
                for ci in range(len(center))]
        distsort = array(dist).argsort()  # indices sorted by distance, ascending
        # copy() is required: plain assignment would alias the two arrays
        # and make the while condition trivially false.
        # NOTE(review): copying inside the per-point loop means convergence is
        # judged against the state one point earlier, not a full previous pass
        # — kept as in the original; verify against the intended algorithm.
        oldClassLabel = newClassLabel.copy()
        newClassLabel[di] = distsort[0]  # nearest centroid wins
    # Recompute each centroid as the mean of its assigned points.
    for j in range(k):
        x = mean(array(dataMat)[kMeans.find_all_index(newClassLabel, j), 0])
        y = mean(array(dataMat)[kMeans.find_all_index(newClassLabel, j), 1])
        center[j] = [x, y]

fig = plt.figure()  # NOTE(review): assumes matplotlib.pyplot imported as plt elsewhere
ax = fig.add_subplot(111)
center = array(center)
ax.scatter(center[:, 0], center[:, 1], marker='+', c='r')
print(center)
print(m)
dataMat = array(dataMat)
# Load tab-delimited numeric records from fileName into dataMat.
# NOTE(review): fileName, num and dataMat are defined before this fragment —
# confirm against the surrounding file.
fr = open(fileName)
for line in fr.readlines():
    lineArr = []
    curLine = line.strip().split('\t')
    for i in range(num):
        lineArr.append(float(curLine[i]))
    dataMat.append(lineArr)
dataSet = mat(dataMat)

k = 3
m = shape(dataSet)[0]
clusterAssment = mat(zeros((m, 2)))  # col 0: cluster index, col 1: squared error

# Start with a single centroid: the mean of the whole data set.
centroid0 = mean(dataSet, axis=0).tolist()[0]
centList = [centroid0]  # create a list with one centroid
for j in range(m):  # calc initial error
    clusterAssment[j, 1] = distEclud(mat(centroid0), dataSet[j, :]) ** 2

# Bisecting k-means: repeatedly 2-way-split the cluster whose split yields
# the lowest total SSE.  (Fragment: the loop body is truncated in this view.)
while len(centList) < k:
    lowestSSE = inf
    for i in range(len(centList)):
        # Data points currently assigned to cluster i.
        ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
        centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distEclud)
        # Compare the post-split SSE to the current minimum.
        sseSplit = sum(splitClustAss[:, 1])
        sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
        print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
        if (sseSplit + sseNotSplit) < lowestSSE:
            bestCentToSplit = i
            bestNewCents = centroidMat
"""
@author: Kaifeng (凯风)
"""
import kMeans
import numpy as np
from imp import reload

reload(kMeans)
datMat = np.mat(kMeans.loadDataSet('testSet.txt'))
# Coordinate bounds of the data (REPL-style: values inspected, not printed).
min(datMat[:, 0])
max(datMat[:, 0])
min(datMat[:, 1])
max(datMat[:, 1])
kMeans.randCent(datMat, 2)  # check the random centroids fall inside the data's range
kMeans.distEclud(datMat[0], datMat[1])

# Run k-means on the real data.
reload(kMeans)
datMat = np.mat(kMeans.loadDataSet('testSet.txt'))
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)  # not necessarily the global optimum

# Bisecting k-means.
reload(kMeans)
datMat3 = np.mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)  # still only a local, not global, optimum
centList
myNewAssments
# Use bisecting k-means to draw the clusters on a figure.
"""Driver script: exercise kMeans / biKmeans on the book's sample data sets."""
import kMeans
from numpy import *

# Fix: the original used Python 2 `print x` statements (SyntaxError on
# Python 3); single-argument `print(x)` behaves identically on both versions.
datMat = mat(kMeans.loadDataSet('testSet.txt'))
# Coordinate bounds of the data — random centroids must fall inside them.
print(min(datMat[:, 0]))
print(min(datMat[:, 1]))
print(max(datMat[:, 0]))
print(max(datMat[:, 1]))
print(kMeans.randCent(datMat, 2))
print(kMeans.distEclud(datMat[0], datMat[1]))

# Plain k-means with k=4 (result not guaranteed to be the global optimum).
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
# print(myCentroids, clustAssing)

# Bisecting k-means with k=3 on the second data set.
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print(centList)
"""Chapter 10 demo: k-means and bisecting k-means, plus the (commented-out)
Yahoo! PlaceFinder geocoding example.

Fix: the original used Python 2 multi-argument `print` statements, which are
a SyntaxError under Python 3; converted to `print(...)` calls.  The two
trailing-comma prints (`print ":",`) become `print(":", end=" ")` to keep the
no-newline behavior.
"""
import urllib
import json

# homedir = os.getcwd() + '/machinelearninginaction/ch10/'  # absolute path
homedir = ''  # relative path

# 10.1 the k-means clustering algorithm
# NOTE(review): assumes `mat` (numpy) and the project `kMeans` module are
# imported earlier in the file — confirm against the full source.
datMat = mat(kMeans.loadDataSet(homedir + 'testSet.txt'))
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
print("datMat:", datMat)
print("min(datMat[:,0]):", min(datMat[:, 0]))
print("min(datMat[:,1]):", min(datMat[:, 1]))
print("max(datMat[:,0]):", max(datMat[:, 0]))
print("max(datMat[:,1]):", max(datMat[:, 1]))
print("randCent(datMat,2):", kMeans.randCent(datMat, 2))
print("distEclud( datMat[ 0], datMat[ 1]):",
      kMeans.distEclud(datMat[0], datMat[1]))
print("myCentroids:", myCentroids)
print("clustAssing:", clustAssing)
print(":", end=" ")
print(":", end=" ")

# 10.3 the bisecting k-means algorithm
datMat3 = mat(kMeans.loadDataSet(homedir + 'testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print("datMat3:", datMat3)
print("centList:", centList)
print("myNewAssments:", myNewAssments)

# 10.4.1 Yahoo! PlaceFinder API
# geoResults = kMeans.geoGrab('1 VA Center', 'Augusta, ME')
# print("geoResults:", geoResults)
def test1():
    """Smoke-test randCent and distEclud from the project's kMeans module.

    Loads ``testSet.txt`` from the working directory; prints two random
    centroids and the Euclidean distance between the first two points.
    Fix: Python 2 `print` statements converted to `print(...)` calls so the
    function is valid under Python 3.
    """
    dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
    print(kMeans.randCent(dataMat, 2))
    print(kMeans.distEclud(dataMat[0], dataMat[1]))