Example #1
0
def test1():
    """Exercise the kMeans helpers on the data in 'testSet.txt'.

    Prints, in order: the minimum of column 0, the result of
    ``kMeans.randCent(data, 2)`` and the ``kMeans.distEclud`` value for the
    first two rows.  It then runs ``kMeans.biKmeans`` with 4 clusters, prints
    the first element of its result and hands it to ``kMeans.plot1``.
    """
    points = mat(kMeans.loadDataSet('testSet.txt'))

    first_column_min = min(points[:, 0])
    print(first_column_min)

    random_centroids = kMeans.randCent(points, 2)
    print(random_centroids)

    row_distance = kMeans.distEclud(points[0], points[1])
    print(row_distance)

    # Alternative kept from the original: kMeans.kMeans(points, 3)
    # Only the first element of the result is used; the second is discarded.
    centroids, _ = kMeans.biKmeans(points, 4)
    print(centroids)

    kMeans.plot1(points, centroids)
Example #2
0
#!/usr/bin/env python
__coding__ = "utf-8"
__author__ = "Ng WaiMing"

from kMeans import kMeans
from kMeans import loadDataSet
from kMeans import randCent
from kMeans import distEclud
from kMeans import biKmeans
from numpy import *

if __name__ == '__main__':
    dataMat = mat(loadDataSet('testSet.txt'))

    # Extremes of both columns: both minima first, then both maxima.
    column_reports = (
        ('min(dataMat[:, 0])', min, 0),
        ('min(dataMat[:, 1])', min, 1),
        ('max(dataMat[:, 0])', max, 0),
        ('max(dataMat[:, 1])', max, 1),
    )
    for label, extreme, column in column_reports:
        print(label, extreme(dataMat[:, column]), '\n')

    # Helper checks: randCent with k=2 and distEclud on the first two rows.
    print(randCent(dataMat, 2), '\n')
    print(distEclud(dataMat[0], dataMat[1]))

    # kMeans with 4 clusters on the first data set.
    centroids, clusterAssment = kMeans(dataMat, 4)
    print('centroids:\n', centroids, '\n')
    print('clusterAssment:\n', clusterAssment, '\n')

    # biKmeans with 3 clusters on the second data set.
    dataMat3 = mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(dataMat3, 3)
    print('centList: \n', centList, '\n')

    # Disabled clusterClubs demo (left commented out):
    # fileName = '../../../../data/k-means/places.txt'
    # imgName = '../../../../data/k-means/Portland.png'
    # kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
import kMeans
from numpy import *

# Exploratory run of the kMeans helpers on 'testSet.txt' and 'testSet2.txt'.
# Every print uses the call form with a single argument, which parses on
# Python 3 and produces the same output on Python 2.
datMat = mat(kMeans.loadDataSet('testSet.txt'))

# Extremes of column 0 and column 1.
print(min(datMat[:, 0]))
print(min(datMat[:, 1]))
print(max(datMat[:, 0]))
print(max(datMat[:, 1]))

# Helper checks: randCent with k=2 and distEclud on the first two rows.
print(kMeans.randCent(datMat, 2))
print(kMeans.distEclud(datMat[0], datMat[1]))

# kMeans with 4 clusters; the results are kept but not printed.
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)

# biKmeans with 3 clusters on the second data set.
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print(centList)
Example #4
0
import time
# Hand-rolled k-means on 'testSet2.txt', followed by a scatter plot of the
# final centres.
# NOTE(review): `plt` is not bound in the visible lines -- presumably
# matplotlib.pyplot is imported elsewhere; confirm.
st = time.time()  # start timestamp; not read again in the visible lines
k = 3
dataMat = mat(kMeans.loadDataSet('testSet2.txt'))
dataArr = array(dataMat)  # plain-array copy, built once for the mean updates

# Start with two label vectors that differ so the loop is entered.  The
# "new" labels start at -1, a value no point can be assigned, so the first
# pass can never be mistaken for convergence.
oldClassLabel = zeros(len(dataMat), int)
newClassLabel = -ones(len(dataMat), int)
center = kMeans.randCent(dataMat, k)
dist = []
m = 0  # number of passes performed

# Repeat until a full pass leaves every label unchanged.
while (newClassLabel != oldClassLabel).any():
    m += 1
    # Snapshot once per pass (a copy, not an alias).  Taking the snapshot
    # inside the per-point loop would make the convergence test see only the
    # change of the last point.
    oldClassLabel = newClassLabel.copy()

    for di in range(len(dataMat)):
        # Distance from point di to every current centre.
        dist = [kMeans.distEclud(dataMat[di], center[ci])
                for ci in range(len(center))]
        distsort = array(dist).argsort()  # indices ordered by distance
        newClassLabel[di] = distsort[0]   # label = index of the nearest centre

    for j in range(k):
        # Recompute centre j as the mean of the rows selected by
        # find_all_index for label j.
        members = kMeans.find_all_index(newClassLabel, j)
        x = mean(dataArr[members, 0])
        y = mean(dataArr[members, 1])
        center[j] = [x, y]

fig = plt.figure()
ax = fig.add_subplot(111)
center = array(center)
ax.scatter(center[:, 0], center[:, 1], marker='+', c='r')
print(center)
print(m)
dataMat = array(dataMat)
# Parse a tab-separated text file: the first `num` fields of every line are
# converted to float and appended to `dataMat` as one row.
# NOTE(review): `fileName`, `num` and a list-valued `dataMat` are not defined
# in the visible lines -- presumably set up by the code this fragment was
# taken from; confirm.
with open(fileName) as fr:  # the context manager closes the file, even on error
    for line in fr:
        curLine = line.strip().split('\t')
        # Index explicitly so a line with fewer than `num` fields still raises.
        lineArr = [float(curLine[i]) for i in range(num)]
        dataMat.append(lineArr)

# Fragment of a bisecting k-means routine: start from a single centroid for
# the whole data set and evaluate a 2-way split of each current cluster.
# NOTE(review): `dataMat`, `distEclud` and `kMeans` are not defined in the
# visible lines -- presumably provided by the surrounding code; confirm.
dataSet = mat(dataMat)
k = 3
m = shape(dataSet)[0]  # number of rows in dataSet
# One row per data row; column 0 is compared against cluster indices below,
# column 1 receives a squared distEclud value.
clusterAssment = mat(zeros((m, 2)))
centroid0 = mean(dataSet, axis=0).tolist()[0]  # mean along axis 0, as a flat list
centList = [centroid0]  # create a list with one centroid
for j in range(m):  # calculate the initial error of every row w.r.t. centroid0
    clusterAssment[j, 1] = distEclud(mat(centroid0), dataSet[j, :])**2
# NOTE(review): nothing in the visible lines appends to centList or lowers
# lowestSSE, so as shown this loop would not terminate -- the remainder of
# the loop body appears to be missing from this fragment.
while (len(centList) < k):
    lowestSSE = inf
    for i in range(len(centList)):
        ptsInCurrCluster = dataSet[nonzero(
            clusterAssment[:, 0].A ==
            i)[0], :]  # get the data points currently in cluster i
        # Split those points in two with kMeans(..., 2, distEclud).
        centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distEclud)
        sseSplit = sum(
            splitClustAss[:, 1])  # sum of column 1 of the split result
        sseNotSplit = sum(
            clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])  # column-1 sum over rows not in cluster i
        print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
        # Compare the combined SSE to the current minimum.
        if (sseSplit + sseNotSplit) < lowestSSE:
            bestCentToSplit = i
            bestNewCents = centroidMat
Example #6
0
"""
@author: 凯风
"""

import kMeans
import numpy as np
from imp import reload

# --- Helper checks on 'testSet.txt' ---
# The bare expressions below only display a value in an interactive session.
reload(kMeans)
datMat = np.mat(kMeans.loadDataSet('testSet.txt'))
for column in (0, 1):
    min(datMat[:, column])
    max(datMat[:, column])
# Check whether the initial centroids fall inside the value range.
kMeans.randCent(datMat, 2)
kMeans.distEclud(datMat[0], datMat[1])

# --- k-means on actual data ---
reload(kMeans)
datMat = np.mat(kMeans.loadDataSet('testSet.txt'))
# The result is not necessarily the global optimum.
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)


# --- Bisecting k-means ---
reload(kMeans)
datMat3 = np.mat(kMeans.loadDataSet('testSet2.txt'))
# This still cannot guarantee the global optimum, only a local one.
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
centList
myNewAssments

# Use bisecting k-means to draw the clusters on a plot.
import kMeans
from numpy import *

# Exploratory run of the kMeans helpers on 'testSet.txt' and 'testSet2.txt'.
# Every print uses the call form with a single argument, which parses on
# Python 3 and produces the same output on Python 2.
datMat = mat(kMeans.loadDataSet('testSet.txt'))

# Extremes of column 0 and column 1.
print(min(datMat[:, 0]))
print(min(datMat[:, 1]))
print(max(datMat[:, 0]))
print(max(datMat[:, 1]))

# Helper checks: randCent with k=2 and distEclud on the first two rows.
print(kMeans.randCent(datMat, 2))
print(kMeans.distEclud(datMat[0], datMat[1]))

# kMeans with 4 clusters; the results are kept but not printed.
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)

# biKmeans with 3 clusters on the second data set.
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print(centList)
Example #8
0
import urllib
import json
# Chapter-10 walkthrough script.  All output goes through the print()
# function (Python 3 syntax); the emitted text matches what the former
# Python 2 print statements produced.
# homedir = os.getcwd() + '/machinelearninginaction/ch10/'  # absolute path
homedir = ''  # relative path

# 10.1 k-means clustering
datMat = mat(kMeans.loadDataSet(homedir + 'testSet.txt'))
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)

print("datMat:", datMat)
print("min(datMat[:,0]):", min(datMat[:, 0]))
print("min(datMat[:,1]):", min(datMat[:, 1]))
print("max(datMat[:,0]):", max(datMat[:, 0]))
print("max(datMat[:,1]):", max(datMat[:, 1]))
print("randCent(datMat,2):", kMeans.randCent(datMat, 2))
print("distEclud( datMat[ 0], datMat[ 1]):",
      kMeans.distEclud(datMat[0], datMat[1]))
print("myCentroids:", myCentroids)
print("clustAssing:", clustAssing)
# A trailing comma on a Python 2 print suppressed the newline and separated
# the next item with a space; end=" " reproduces that.
print(":", end=" ")
print(":", end=" ")

# 10.3 bisecting k-means
datMat3 = mat(kMeans.loadDataSet(homedir + 'testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print("datMat3:", datMat3)
print("centList:", centList)
print("myNewAssments:", myNewAssments)

# 10.4.1 Yahoo! PlaceFinder API (disabled)
# geoResults = kMeans.geoGrab('1 VA Center', 'Augusta, ME')
# print("geoResults:", geoResults)
def test1():
    """Load 'testSet.txt' and print two kMeans helper results.

    Prints the result of ``kMeans.randCent(dataMat, 2)`` followed by the
    ``kMeans.distEclud`` value for the first two rows of the data.  The
    prints use the single-argument call form, which parses on Python 3 and
    produces the same output on Python 2.
    """
    dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
    print(kMeans.randCent(dataMat, 2))
    print(kMeans.distEclud(dataMat[0], dataMat[1]))