def do_biKmeans(datMat, result, n=3):
    """Run bisecting k-means on datMat and score the clustering.

    Args:
        datMat: 2-D array-like of samples; converted to an np.matrix.
        result: ground-truth labels aligned with the rows of datMat.
        n: number of clusters to produce (default 3).

    Returns:
        (y_pred, [RI, J]) where y_pred is the per-sample cluster label and
        RI / J are the Rand index and Jaccard coefficient computed from the
        pair-counting stats of count_f() (defined elsewhere in this file).
    """
    from kMeans import biKmeans
    import numpy as np

    datMat = np.mat(datMat)
    centList, clustAssing = biKmeans(datMat, n)
    # Column 0 of clustAssing holds the cluster index assigned to each row.
    # Plain int replaces pd.np.int: pandas.np was removed in pandas 1.0 and
    # np.int in numpy 1.24, so the old spelling crashes on current versions.
    y_pred = clustAssing.A[:, 0].astype(int)
    f = count_f(y_pred, result)
    RI = ri(f)
    J = j(f)
    return y_pred, [RI, J]
def test1():
    """Smoke-test the kMeans helpers on testSet.txt and plot the result."""
    samples = mat(kMeans.loadDataSet('testSet.txt'))
    print(min(samples[:, 0]))
    print(kMeans.randCent(samples, 2))
    print(kMeans.distEclud(samples[0], samples[1]))
    # Plain k-means variant kept disabled:
    # centers, assignments = kMeans.kMeans(samples, 3)
    centers, assignments = kMeans.biKmeans(samples, 4)
    print(centers)
    kMeans.plot1(samples, centers)
def main(): dataSet = kMeans.loadfromcsv('./data/8.csv') dataMat = np.mat(dataSet) # normalize dataMat norMat = kMeans.normalize(dataMat) # centroids is the center of clusters # clusterAssment[cluster_index,deviation],in which deviation represents the dist # from current point to centroids. # 使用Bisceting Kmeans算法对游客进行聚类,预期聚类簇数目为4 centroids, clusterAssment = kMeans.biKmeans(norMat,4) cluster_label = clusterAssment[:,0] clusters = [[],[],[],[]] for i in range(0,len(cluster_label)): clusters[(int)(cluster_label[i])].append(np.asarray(norMat)[i]) clusters = np.asarray(clusters) for i in range(0,len(clusters)): clusters[i] = np.asarray(clusters[i]) clusters = np.asarray(clusters) #找出含有元素最多的簇,以及最少的簇 minCount = 10000 maxCount = 0 max_cluster = 0 min_cluster = 0 for i in range(0,len(clusters)): if minCount > len(clusters[i]): minCount = len(clusters[i]) if maxCount < len(clusters[i]): maxCount = len(clusters[i]) if len(clusters[max_cluster]) < len(clusters[i]): max_cluster = i if len(clusters[min_cluster]) > len(clusters[i]): min_cluster = i print "%d cluster has %d elements " % (i, len(clusters[i])), print "the centroids is", print centroids[i] number_weight = float(len(clusters[max_cluster]))/(len(clusters[min_cluster])) print centroids[max_cluster] # 计算Dunn指标 di = base.dunn(clusters) # 计算N-Dunn指标 NDunnIndex = di*(maxCount/minCount) print di print NDunnIndex print "original dunn is %f" % di print "weighted dunn is %f" % (number_weight*di)
def biKmeans_func(data_set, k, cent_file="", clus_file=""): print "kMeans : " + cent_file cent, clus = kMeans.biKmeans(data_set, k) # print cent # print clus kmean_res_cent_file = open(cent_file, 'w') for item in cent.A: item_str = "" for column in item: item_str = item_str + str(column) + " " item_str = item_str + '\n' kmean_res_cent_file.write(item_str) kmean_res_cent_file.close kmean_res_clus_file = open(clus_file, 'w') for item in clus.A: item_str = "" for column in item: item_str = item_str + str(column) + " " item_str = item_str + '\n' kmean_res_clus_file.write(item_str) kmean_res_clus_file.close
import kMeans
from numpy import *

# Load orbital data: columns 2-4 hold semi-major axis, eccentricity
# and inclination.
datMat = mat(kMeans.loadDataSet('data2.txt'))
datMat[0, 2:5]  # no-op peek at the first row, kept from the original

# Delta-v style distance between the first two bodies.
delta_v = kMeans.distdeltaV(datMat[0, 2:5], datMat[1, 2:5])

# Work on the orbital-element columns only.
orbit = datMat[:, 2:5]

# Draw 4 random centroids inside the feature ranges.
centroids = kMeans.randCent(orbit, 4)

# Plain k-means, then bisecting k-means, both with the delta-v metric.
myCentroids, clustAssing = kMeans.kMeans(orbit, 5, kMeans.distdeltaV)
centList, myNewAssments = kMeans.biKmeans(orbit, 5, kMeans.distdeltaV)

# Visualise the bisecting k-means result.
kMeans.showCluster_SRQ(orbit, myNewAssments)
import kMeans from numpy import* import matplotlib import matplotlib.pyplot as plt k = 4 datmat = array(kMeans.loadDataSet('testSet.txt')) centerList, clusterAssment = kMeans.biKmeans(datmat,k) print 'The cendroids is:',centerList fig = plt.figure() fig.add_subplot(111) colorList = ['b','c','g','k','r','y'] makerList = ['.','^','*','o','+'] for i in range(k): ax = plt.scatter(datmat[nonzero(clusterAssment[:,0].A == i)[0],0],datmat[nonzero(clusterAssment[:,0].A == i)[0],1], c = colorList[i],marker=makerList[i]) ax = plt.scatter(array(centerList[:,0]),array(centerList[:,1]),c = colorList[4],marker=makerList[3]) plt.title('Graph of k_Means ',) plt.xlabel('x') plt.ylabel('y') plt.show()
# -*- coding:utf-8 -*-
import kMeans
from numpy import *

# First demo data set; the plain k-means run is kept disabled below.
points = mat(kMeans.loadDataSet("testSet.txt"))
'''
myCentroids,clusterAssing = kMeans.kMeans(datMat, 4)
print("myCentroids is %s " % myCentroids)
print("clusterAssing is %s " % clusterAssing)
'''
# Bisecting k-means on the second demo data set, k=3.
points2 = mat(kMeans.loadDataSet('testSet2.txt'))
centers, assignments = kMeans.biKmeans(points2, 3)
print(centers)
# geoResult = kMeans.geoGrab('1 VA Center', 'Augusta,ME')
import kMeans
from numpy import *

# Exercise the helpers on the first demo data set.
points = mat(kMeans.loadDataSet('testSet.txt'))
initial_centroids = kMeans.randCent(points, 2)
clustering = kMeans.kMeans(points, 4)

# Bisecting k-means on the second demo data set.
points3 = mat(kMeans.loadDataSet('testSet2.txt'))
kMeans.biKmeans(points3, 3)
import kMeans
import numpy as np
import matplotlib.pyplot as plt

# Plain k-means on the first demo data set.
dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
datalist = dataMat.tolist()

# Bisecting k-means on the second demo data set.  Wrapped in np.mat for
# consistency with the first data set (and with every other biKmeans call
# site); previously the raw list of lists was passed straight through.
dataMat = np.mat(kMeans.loadDataSet("testSet2.txt"))
centList, clusteAssment = kMeans.biKmeans(dataMat, 3)
print(centList)
import kMeans
import ProbIN
from numpy import *
import subprocess
import numpy as np

# Cluster the motion training data into 12 groups with bisecting k-means.
motionData = mat(kMeans.loadDataSet('motionData_Training.txt'))
kMeans.biKmeans(motionData, 12)

# GPS-data variant, kept disabled:
# gpsData = mat(kMeans.loadDataSet('GPS_1Hz_training.txt'))
# kMeans.biKmeans(gpsData, 7)
# coding:utf-8 import kMeans from numpy import * datMat=mat(kMeans.loadDataSet('testSet.txt')) print datMat[1:5,:] myCentroids,clustAssing=kMeans.kMeans(datMat,4) print myCentroids print ' ' print clustAssing datMat3=mat(kMeans.loadDataSet('testSet2.txt')) centList,myNewAssments=kMeans.biKmeans(datMat3,3) print centList,myNewAssments
def test4(): dataMat = np.mat(kMeans.loadDataSet('testSet2.txt')) centList, myNewAssments = kMeans.biKmeans(dataMat, 3) kMeans.plotScatter(dataMat, centList, myNewAssments) print centList
#coding=utf-8
import kMeans
from numpy import *

# Plain k-means experiment, kept disabled:
# points = mat(kMeans.loadDataSet('testSet.txt'))
# print(points)
# myCentroids, clustAssing = kMeans.kMeans(points, 4)
# print(clustAssing)

# Bisecting k-means on the second demo data set, k=3.
points = mat(kMeans.loadDataSet('testSet2.txt'))
centers, assignments = kMeans.biKmeans(points, 3)
print(centers)
# kMeans.clusterClubs(5)
import kMeans from numpy import * datMat = mat(kMeans.loadDataSet('testSet.txt')) print min(datMat[:,0]) print min(datMat[:,1]) print max(datMat[:,0]) print max(datMat[:,1]) print kMeans.randCent(datMat, 2) print kMeans.distEclud(datMat[0], datMat[1]) myCentroids, clustAssing = kMeans.kMeans(datMat, 4) #print myCentroids, clustAssing datMat3 = mat(kMeans.loadDataSet('testSet2.txt')) centList, myNewAssments = kMeans.biKmeans(datMat3, 3) print centList
#!/usr/bin/env python
__coding__ = "utf-8"
__author__ = "Ng WaiMing"

from kMeans import kMeans, loadDataSet, randCent, distEclud, biKmeans
from numpy import *

if __name__ == '__main__':
    # Coordinate ranges plus randCent / distEclud sanity checks.
    dataMat = mat(loadDataSet('testSet.txt'))
    for label, value in (('min(dataMat[:, 0])', min(dataMat[:, 0])),
                         ('min(dataMat[:, 1])', min(dataMat[:, 1])),
                         ('max(dataMat[:, 0])', max(dataMat[:, 0])),
                         ('max(dataMat[:, 1])', max(dataMat[:, 1]))):
        print(label, value, '\n')
    print(randCent(dataMat, 2), '\n')
    print(distEclud(dataMat[0], dataMat[1]))
    # Plain k-means, k=4.
    centroids, clusterAssment = kMeans(dataMat, 4)
    print('centroids:\n', centroids, '\n')
    print('clusterAssment:\n', clusterAssment, '\n')
    # Bisecting k-means on the second demo data set, k=3.
    dataMat3 = mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(dataMat3, 3)
    print('centList: \n', centList, '\n')
    # fileName = '../../../../data/k-means/places.txt'
    # imgName = '../../../../data/k-means/Portland.png'
    # kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
#-*- coding:utf-8 -*- import kMeans from numpy import * datamat = mat(kMeans.loadDataSet('testSet.txt')) centList, myNewAssments = kMeans.biKmeans(datamat, 3) #每次给出四个质心,三次迭代后收敛 print centList
#!/usr/bin/python2.7 # _*_ coding: utf-8 _*_ """ @Author: MarkLiu """ import numpy as np import kMeans import matplotlib.pyplot as plt dataArr = kMeans.loadDataSet('datasets/testSet2.txt') dataMat = np.matrix(dataArr) k = 3 centroids, clusterAssment = kMeans.biKmeans(dataMat, k) # centroids, clusterAssment = kMeans.kMeans(dataMat, k) # 计算原始数据加上中心数据,将数据分离 m = np.shape(dataMat)[0] # 分离出不同簇的x,y坐标 xPoint_0 = [] yPoint_0 = [] xPoint_1 = [] yPoint_1 = [] xPoint_2 = [] yPoint_2 = [] xPoint_3 = [] yPoint_3 = [] for i in range(m): if int(clusterAssment[i, 0]) == 0:
def plotCluster(): dataSetShangHai = kMeans.loadfromcsv('./data/8.csv') dataSetXiAn = kMeans.loadfromcsv('./data/10195.csv') dataSetQingDao = kMeans.loadfromcsv('./data/10444.csv') dataSetSanYa = kMeans.loadfromcsv('./data/10030.csv') dataSetJiuZhaigou = kMeans.loadfromcsv('./data/10136.csv') dataSetTaiShan = kMeans.loadfromcsv('./data/10284.csv') dataMatXiAn = np.mat(dataSetXiAn) dataMatQingDao = np.mat(dataSetQingDao) dataMatShangHai = np.mat(dataSetShangHai) dataMatSanYa = np.mat(dataSetSanYa) dataMatTaiShan = np.mat(dataSetTaiShan) dataMatJiuZhaigou = np.mat(dataSetJiuZhaigou) norMatXiAn = kMeans.normalize(dataMatXiAn) norMatQingDao = kMeans.normalize(dataMatQingDao) norMatShangHai = kMeans.normalize(dataMatShangHai) norMatSanYa = kMeans.normalize(dataMatSanYa) norMatTaiShan = kMeans.normalize(dataMatTaiShan) norMatJiuZhaigou = kMeans.normalize(dataMatJiuZhaigou) centroidsShangHai, clusterAssmentShangHai = kMeans.biKmeans(norMatShangHai,4) centroidsXiAn, clusterAssmentXiAn = kMeans.biKmeans(norMatXiAn,4) centroidsQingDao, clusterAssmentQingDao = kMeans.biKmeans(norMatQingDao,4) centroidsSanYa, clusterAssmentSanYa = kMeans.biKmeans(norMatSanYa,4) centroidsJiuZhaigou, clusterAssmentJiuZhaigou = kMeans.biKmeans(norMatJiuZhaigou,4) centroidsTaiShan, clusterAssmentTaiShan = kMeans.biKmeans(norMatTaiShan,4) print "==================上海聚类结果==========" printBasicInfo(centroidsShangHai,clusterAssmentShangHai,norMatShangHai) print "==================西安聚类结果==========" printBasicInfo(centroidsXiAn, clusterAssmentXiAn, norMatXiAn) print "==================青岛聚类结果==========" printBasicInfo(centroidsQingDao, clusterAssmentQingDao, norMatQingDao) print "==================三亚聚类结果==========" printBasicInfo(centroidsSanYa, clusterAssmentSanYa, norMatSanYa) print "==================九寨沟聚类结果========" printBasicInfo(centroidsJiuZhaigou, clusterAssmentJiuZhaigou, norMatJiuZhaigou) print "==================泰山聚类结果==========" printBasicInfo(centroidsTaiShan, clusterAssmentTaiShan, 
norMatTaiShan) # shanghai plt.subplot(321) pointClusNumShangHai = clusterAssmentShangHai[:,0].A.T n = np.shape(pointClusNumShangHai)[1] plt.title(u'上海') for i in range(n): if 0.0 == pointClusNumShangHai.item(i): plt.plot(norMatShangHai[i,0],norMatShangHai[i,1],'g^') elif 1.0 == pointClusNumShangHai.item(i): plt.plot(norMatShangHai[i,0],norMatShangHai[i,1],'b*') elif 2.0 == pointClusNumShangHai.item(i): plt.plot(norMatShangHai[i,0],norMatShangHai[i,1],'k<') elif 3.0 == pointClusNumShangHai.item(i): plt.plot(norMatShangHai[i,0],norMatShangHai[i,1],'ms') plt.plot(centroidsShangHai[:,0],centroidsShangHai[:,1],'ro') plt.axis([0,4.0,0,12]) plt.xticks([0,1,2,3]) # plt.xlabel(u'Distance Index') # plt.ylabel(u'Activity Degree Index') # XiAn plt.subplot(322) pointClusNumXiAn = clusterAssmentXiAn[:,0].A.T n = np.shape(pointClusNumXiAn)[1] plt.title(u'西安') for i in range(n): if 0.0 == pointClusNumXiAn.item(i): plt.plot(norMatXiAn[i,0],norMatXiAn[i,1],'g^') elif 1.0 == pointClusNumXiAn.item(i): plt.plot(norMatXiAn[i,0],norMatXiAn[i,1],'b*') elif 2.0 == pointClusNumXiAn.item(i): plt.plot(norMatXiAn[i,0],norMatXiAn[i,1],'k<') elif 3.0 == pointClusNumXiAn.item(i): plt.plot(norMatXiAn[i,0],norMatXiAn[i,1],'ms') plt.plot(centroidsXiAn[:,0],centroidsXiAn[:,1],'ro') plt.axis([0,4.0,0,12]) plt.xticks([0,1,2,3]) # plt.xlabel(u'Distance Index') # plt.ylabel(u'Activity Degree Index') # QingDao plt.subplot(323) pointClusNumQingDao = clusterAssmentQingDao[:,0].A.T n = np.shape(pointClusNumQingDao)[1] plt.title(u'青岛') for i in range(n): if 0.0 == pointClusNumQingDao.item(i): plt.plot(norMatQingDao[i,0],norMatQingDao[i,1],'g^') elif 1.0 == pointClusNumQingDao.item(i): plt.plot(norMatQingDao[i,0],norMatQingDao[i,1],'b*') elif 2.0 == pointClusNumQingDao.item(i): plt.plot(norMatQingDao[i,0],norMatQingDao[i,1],'k<') elif 3.0 == pointClusNumQingDao.item(i): plt.plot(norMatQingDao[i,0],norMatQingDao[i,1],'ms') plt.plot(centroidsQingDao[:,0],centroidsQingDao[:,1],'ro') plt.axis([0,4.0,0,12]) 
plt.xticks([0,1,2,3]) # plt.xlabel(u'Distance Index') # plt.ylabel(u'Activity Degree Index') # SanYa plt.subplot(324) pointClusNumSanYa = clusterAssmentSanYa[:,0].A.T n = np.shape(pointClusNumSanYa)[1] plt.title(u'三亚') for i in range(n): if 0.0 == pointClusNumSanYa.item(i): plt.plot(norMatSanYa[i,0],norMatSanYa[i,1],'g^') elif 1.0 == pointClusNumSanYa.item(i): plt.plot(norMatSanYa[i,0],norMatSanYa[i,1],'b*') elif 2.0 == pointClusNumSanYa.item(i): plt.plot(norMatSanYa[i,0],norMatSanYa[i,1],'k<') elif 3.0 == pointClusNumSanYa.item(i): plt.plot(norMatSanYa[i,0],norMatSanYa[i,1],'ms') plt.plot(centroidsSanYa[:,0],centroidsSanYa[:,1],'ro') plt.axis([0,4.0,0,12]) plt.xticks([0,1,2,3]) # plt.xlabel(u'Distance Index') # plt.ylabel(u'Activity Index') # JiuZhaigou plt.subplot(325) pointClusNumJiuZhaigou = clusterAssmentJiuZhaigou[:,0].A.T n = np.shape(pointClusNumJiuZhaigou)[1] plt.title(u'九寨沟') for i in range(n): if 0.0 == pointClusNumJiuZhaigou.item(i): plt.plot(norMatJiuZhaigou[i,0],norMatJiuZhaigou[i,1],'g^') elif 1.0 == pointClusNumJiuZhaigou.item(i): plt.plot(norMatJiuZhaigou[i,0],norMatJiuZhaigou[i,1],'b*') elif 2.0 == pointClusNumJiuZhaigou.item(i): plt.plot(norMatJiuZhaigou[i,0],norMatJiuZhaigou[i,1],'k<') elif 3.0 == pointClusNumJiuZhaigou.item(i): plt.plot(norMatJiuZhaigou[i,0],norMatJiuZhaigou[i,1],'ms') plt.plot(centroidsJiuZhaigou[:,0],centroidsJiuZhaigou[:,1],'ro') plt.axis([0,4.0,0,12]) plt.xticks([0,1,2,3]) plt.xlabel(u'Distance Index') plt.ylabel(u'Activity Index') # TaiShan plt.subplot(326) pointClusNumTaiShan = clusterAssmentTaiShan[:,0].A.T n = np.shape(pointClusNumTaiShan)[1] plt.title(u'泰山') for i in range(n): if 0.0 == pointClusNumTaiShan.item(i): plt.plot(norMatTaiShan[i,0],norMatTaiShan[i,1],'g^') elif 1.0 == pointClusNumTaiShan.item(i): plt.plot(norMatTaiShan[i,0],norMatTaiShan[i,1],'b*') elif 2.0 == pointClusNumTaiShan.item(i): plt.plot(norMatTaiShan[i,0],norMatTaiShan[i,1],'k<') elif 3.0 == pointClusNumTaiShan.item(i): 
plt.plot(norMatTaiShan[i,0],norMatTaiShan[i,1],'ms') plt.plot(centroidsTaiShan[:,0],centroidsTaiShan[:,1],'ro') plt.axis([0,4.0,0,12]) plt.xticks([0,1,2,3]) plt.xlabel(u'Distance Index') plt.ylabel(u'Activity Index') plt.show()
# Image color-segmentation demo: cluster the chroma channels of a photo
# with bisecting k-means and scatter-plot the resulting pixel groups.
st = time.time()  # NOTE(review): start timestamp; elapsed time presumably reported later — not shown in this chunk
my_ima = imread('city.jpg')
fig0 = plt.figure()
ax0 = fig0.add_subplot(111)
imshow(my_ima)
# Convert RGB to CIELAB and keep only the two chroma channels (a*, b*),
# so clustering groups pixels by color independent of lightness.
lab = color.rgb2lab(my_ima)
ab = double(lab[:, :, 1:3])
nrows = ab.shape[0]
ncols = ab.shape[1]
# Flatten the (rows, cols, 2) chroma image into an (n_pixels, 2) matrix.
X = ab.reshape(nrows * ncols, 2)
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
ax1.scatter(X[:, 0], X[:, 1])
# Bisecting k-means into 10 color clusters.
k = 10
centList, clusterAssment = kMeans.biKmeans(X, k)
fig2 = plt.figure()
ax2 = fig2.add_subplot(111)
centroids = array(centList)
datMat = array(X)
colorList = ['b', 'c', 'g', 'k', 'r', 'y', 'm', 'w']
makerList = ['.', '^', '*', '+', 'o']
for i in range(k):
    # Plot each cluster's pixels; colors wrap around via i % 8 since
    # there are more clusters (10) than colors (8).
    ax2.scatter(datMat[kMeans.find_all_index(clusterAssment[:, 0], i), 0],
                datMat[kMeans.find_all_index(clusterAssment[:, 0], i), 1],
                c=colorList[i % 8], marker=makerList[3])
# Cluster centers drawn as red circles on top.
ax2.scatter(centroids[:, 0], centroids[:, 1], marker=makerList[4], c=colorList[4])
# kmeansTest.py
import kMeans
from numpy import *

# Earlier range/centroid/distance experiments, kept disabled:
# dataMat = mat(kMeans.loadDataSet('testSet.txt'))
'''
print('min(dataMat[:, 0]) = ', min(dataMat[:, 0]))
print('min(dataMat[:, 1]) = ', min(dataMat[:, 1]))
print('max(dataMat[:, 0]) = ', max(dataMat[:, 0]))
print('max(dataMat[:, 1]) = ', max(dataMat[:, 1]))
print('randCent of dataset : ', kMeans.randCent(dataMat, 2))
print('distance of eclud : ', kMeans.distEclud(dataMat[0], dataMat[1]))
'''
# myCentroids, clusterAssing = kMeans.kMeans(dataMat, 4)
# print('myCentroids : ', myCentroids)
# print('clusterAssing : ', clusterAssing)

# Bisecting k-means on the second demo data set, k=3.
points = mat(kMeans.loadDataSet('testSet2.txt'))
centers, assignments = kMeans.biKmeans(points, 3)
print('centList = ', centers)
# Interactive exploration transcript (Python 2: bare `reload`).
# NOTE(review): relies on kMeans, np and an earlier datMat already being
# in the session — the initial datMat is defined outside this chunk.
min(datMat[:,0])
max(datMat[:,0])
min(datMat[:,1])
max(datMat[:,1])
kMeans.randCent(datMat,2)  # check the random centroids fall inside the data range
kMeans.distEclud(datMat[0],datMat[1])
# Try k-means on real data.
reload(kMeans)
datMat = np.mat(kMeans.loadDataSet('testSet.txt'))
myCentroids,clustAssing = kMeans.kMeans(datMat,4)  # not necessarily the global optimum
# Bisecting k-means.
reload(kMeans)
datMat3 = np.mat(kMeans.loadDataSet('testSet2.txt'))
centList,myNewAssments = kMeans.biKmeans(datMat3,3)  # still only a local optimum, not guaranteed global
centList
myNewAssments
# Plot the clusters found by bisecting k-means on the map data.
reload(kMeans)
kMeans.clusterClubs(4)
def test3():
    """Run bisecting k-means (k=4) on the first demo data set."""
    samples = np.mat(kMeans.loadDataSet('testSet.txt'))
    kMeans.biKmeans(samples, 4)