Example #1
import kMeans
from numpy import *


def test1():
    datMat = mat(kMeans.loadDataSet('testSet.txt'))
    print(min(datMat[:, 0]))
    print(kMeans.randCent(datMat, 2))
    print(kMeans.distEclud(datMat[0], datMat[1]))

    #myCentroids, clustAssing = kMeans.kMeans(datMat,3)
    myCentroids, clustAssing = kMeans.biKmeans(datMat, 4)
    print(myCentroids)

    kMeans.plot1(datMat, myCentroids)
Example #2
import kMeans
import pandas as pd


def main():

    dataDirectory = 'D:/PycharmProjects/ZhiHuKanShan/zhihukanshan/data'

    list_DataMat_wordVectorSet = kMeans.loadDataSet(dataDirectory +
                                                    '/rem_word_embedding.txt')
    # matDataMat_Label = mat(listDataMat_label)
    # matDataMat_wordVectorSet = mat(list_DataMat_wordVectorSet)
    matDataMat_wordVectorSet = pd.DataFrame(list_DataMat_wordVectorSet)

    # store matDataMat_wordVectorSet into word_wordVectorSet.pkl
    # output_1 = open('word_wordVectorSet.pkl', 'wb')
    # Pickle dictionary using protocol 0.
    # pickle.dump(matDataMat_wordVectorSet, output_1)
    # output_1.close

    matDataMat_wordVectorSet.to_hdf('rem_word_embedding.h5', 'df')
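
For later use, the stored frame can be read back with pandas; a minimal sketch, assuming the rem_word_embedding.h5 file written above and the same 'df' key:

import pandas as pd

# reload the word-embedding matrix saved by main() above
matDataMat_wordVectorSet = pd.read_hdf('rem_word_embedding.h5', 'df')
print(matDataMat_wordVectorSet.shape)
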
import kMeans
from numpy import *
import matplotlib.pyplot as plt


def showCluster(dataSet, k, centroids, clusterAssment):
    m, dim = shape(dataSet)
    if dim != 2:
        print("Sorry! i can not draw because the dimension of data is not 2!")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large!")
        return 1
    # draw all samples
    for i in range(m):
        markIndex = int(clusterAssment[i, 0])  # pick the marker and color for this sample's cluster
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], marker='+', color='red', markersize=18)
        # marker sets the centroid symbol; color and markersize set its color and size

    plt.show()


datMat = mat(kMeans.loadDataSet('../data/kMeans_testSet.txt'))

clusterCenters, clusterAssment = kMeans.kMeans(datMat, 4)
showCluster(datMat, 4, clusterCenters, clusterAssment)
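
A more compact way to draw the same picture is to mask the assignment matrix once per cluster and call scatter, instead of plotting point by point; a minimal sketch (not part of the kMeans module), assuming the same NumPy-matrix inputs as showCluster above:

import matplotlib.pyplot as plt
from numpy import nonzero


def showClusterCompact(dataSet, k, centroids, clusterAssment):
    # one scatter call per cluster instead of one plot call per sample
    for i in range(k):
        pts = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
        plt.scatter(pts[:, 0].A.flatten(), pts[:, 1].A.flatten(), s=20, label='cluster %d' % i)
    # centroids drawn as large red crosses
    plt.scatter(centroids[:, 0].A.flatten(), centroids[:, 1].A.flatten(),
                marker='+', c='red', s=200, label='centroids')
    plt.legend()
    plt.show()

# showClusterCompact(datMat, 4, clusterCenters, clusterAssment)
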
import kMeans
from numpy import *

# dat_set = mat(kMeans.loadDataSet('ds_hash.txt'))
dat_dropship = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_DROPSHIP.txt'))
dat_other_0 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_0.txt'))
dat_other_1 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_1.txt'))
dat_other_2 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_2.txt'))
dat_other_3 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_3.txt'))
dat_other_4 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_4.txt'))
dat_other_5 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_5.txt'))
dat_other_6 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_6.txt'))

print "origin date"
print len(dat_dropship)
print len(dat_other_0)

for i in dat_dropship:
	dat_other_0.append(i)
	dat_other_1.append(i)
	dat_other_2.append(i)
	dat_other_3.append(i)
	dat_other_4.append(i)
	dat_other_5.append(i)
	dat_other_6.append(i)

print(len(dat_other_0))
print(len(dat_other_1))
print(len(dat_other_2))
print(len(dat_other_3))
print(len(dat_other_4))
Example #5
'''
Created on 2016. 2. 9.

@author: TaijinKim
'''

import kMeans
from numpy import *

dataMat = mat(kMeans.loadDataSet('../data/testSet.txt'))
# print(min(dataMat[:, 0]))
# print(min(dataMat[:, 1]))
# print(max(dataMat[:, 1]))
# print(max(dataMat[:, 0]))
# 
print(kMeans.randCent(dataMat, 2))
# 
# print(kMeans.distEclud(dataMat[0], dataMat[1]))
# myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
Example #6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'10.3'

__author__ = 'lxp'

import kMeans
import numpy as np

datMat3 = np.mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssment = kMeans.biKMeans(datMat3, 3)
print(centList)
#print (myNewAssment)
Example #7
import kMeans
import numpy as np
import matplotlib.pyplot as plt
dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
#print(kMeans.randCent(dataMat,2))
myCentroids, clustAssing = kMeans.kMeans(dataMat,4)
#print(myCentroids,clustAssing)
datalist = dataMat.tolist()
#print([x[0] for x in datalist])
'''plt.figure()
plt.scatter([x[0] for x in datalist],[x[1] for x in datalist])
plt.scatter([x[0] for x in myCentroids.tolist()],[x[1] for x in myCentroids.tolist()])
plt.title('kmeans')
plt.show()'''
dataMat = kMeans.loadDataSet("testSet2.txt")
centList, clusteAssment = kMeans.biKmeans(dataMat, 3)
print(centList)

import kMeans
import ProbIN
from numpy import *
import subprocess
import numpy as np

datMat = mat(kMeans.loadDataSet('motionData_Training.txt'))
kMeans.biKmeans(datMat,12)

# datMat2 = mat(kMeans.loadDataSet('GPS_1Hz_training.txt'))
# kMeans.biKmeans(datMat2,7)
import kMeans
import os
import sys
from numpy import *

project_path = os.path.abspath(os.path.dirname(__file__))
text_path = os.path.join(project_path, "../chapter10/testSet.txt")
datMat = mat(kMeans.loadDataSet(text_path))
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
import kMeans
from numpy import *
import time

# dat_set = mat(kMeans.loadDataSet('ds_hash.txt'))
dat_dropship = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_DROPSHIP.txt'))
dat_other_0 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_0.txt'))

print "origin date"
print len(dat_dropship)
print len(dat_other_0)

for i in dat_dropship:
	dat_other_0.append(i)

print(len(dat_other_0))

dat_other_0 = mat(dat_other_0)
# dat_dropship_0 = mat(dat_dropship)



def biKmeans_func(data_set, k, cent_file="", clus_file=""):
    print("kMeans : " + cent_file)
    cent, clus = kMeans.biKmeans(data_set, k)
    # print(cent)
    # print(clus)

    kmean_res_cent_file = open(cent_file, 'w')
    for item in cent.A:
        item_str = ""
#!/usr/bin/python2.7
# _*_ coding: utf-8 _*_

"""
@Author: MarkLiu
"""

import numpy as np
import kMeans
import matplotlib.pyplot as plt

dataArr = kMeans.loadDataSet('datasets/testSet2.txt')
dataMat = np.matrix(dataArr)
k = 3
centroids, clusterAssment = kMeans.biKmeans(dataMat, k)
# centroids, clusterAssment = kMeans.kMeans(dataMat, k)

# get the number of samples so the original data (and the centroids) can be separated by cluster
m = np.shape(dataMat)[0]

# separate out the x, y coordinates of each cluster
xPoint_0 = []
yPoint_0 = []
xPoint_1 = []
yPoint_1 = []
xPoint_2 = []
yPoint_2 = []
xPoint_3 = []
yPoint_3 = []
for i in range(m):
    if int(clusterAssment[i, 0]) == 0:
import kMeans
import numpy as np 
import matplotlib.pyplot as plt

dataMat = np.mat(kMeans.loadDataSet('kMeans/testSet2.txt'))
myCentroid, clustAssing = kMeans.kMeans(dataMat, 4)
#plt.plot(dataMat[:,0],dataMat[:,1], 'ro')
#plt.plot(myCentroid[:,0], myCentroid[:,1], 'gs')

dataMat2 = np.mat(kMeans.loadDataSet('kMeans/testSet2.txt'))
centList, myNewAssments = kMeans.biKMeans(dataMat2,3)
centList = np.mat(centList)
plt.plot(dataMat2[:,0],dataMat2[:,1], 'ro')
plt.plot(centList[:,0], centList[:,1], 'gs')

plt.show()
Example #13
# -*- coding:utf-8 -*-

import kMeans
from numpy import  *

datMat = mat(kMeans.loadDataSet("testSet.txt"))

'''
myCentroids,clusterAssing = kMeans.kMeans(datMat, 4)
print("myCentroids is %s " % myCentroids)
print("clusterAssing is %s " % clusterAssing)
'''


#kMeans test example two
dataMat2 = mat(kMeans.loadDataSet('testSet2.txt'))
centList,myNewAssment = kMeans.biKmeans(dataMat2, 3)
print(centList)

#geoResult = kMeans.geoGrab('1 VA Center', 'Augusta,ME')


def test1():
    dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
    print(kMeans.randCent(dataMat, 2))
    print(kMeans.distEclud(dataMat[0], dataMat[1]))
def test4():
    dataMat = np.mat(kMeans.loadDataSet('testSet2.txt'))
    centList, myNewAssments = kMeans.biKmeans(dataMat, 3)
    kMeans.plotScatter(dataMat, centList, myNewAssments)
    print(centList)
def test3():
    dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
    kMeans.biKmeans(dataMat, 4)
Example #17
            # run 2-means on the current cluster i
            centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2, distMeas)
            # sum the squared error (SSE) of the two clusters produced by the split
            sseSplit = sum(splitClustAss[:,1])
            # sum the squared error of the points that did not take part in the split
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            # compare the error with and without the split; the smaller the total SSE, the better the partition
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # commit the best cluster assignment found
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)   # kMeans() with k=2 numbers its clusters 0 and 1; remap 1 to the index of the newly added cluster
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit # and remap 0 to the index of the cluster that was split
        print('the bestCentToSplit is: ',bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # update the centroid list
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]    # replace the split centroid with the first of the two new centroids
        centList.append(bestNewCents[1,:].tolist()[0])      # append the second new centroid
        # reassign the points (and their SSE) of the cluster that was split
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss
    return mat(centList), clusterAssment

if __name__ == '__main__':
    # test the bisecting k-means clustering algorithm
    myDat = kMeans.loadDataSet(r'C:\Users\v_wangdehong\PycharmProjects\MachineLearning_V\9.K-Means\data\testSet2.txt')
    myMat = mat(myDat)
    centList,myNewAssments = biKmeans(myMat,3)
    print(centList)
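
The fragment above is the core of the book's bisecting k-means: try a 2-means split of every cluster and keep the split whose combined error (sseSplit + sseNotSplit) is lowest. For comparison only (this is not the book's code), a small self-contained sketch of the same idea built on scikit-learn's KMeans, which reports the split's SSE directly as inertia_:

import numpy as np
from sklearn.cluster import KMeans


def bisecting_kmeans(X, k):
    """Repeatedly 2-means-split the cluster whose split lowers the total SSE the most."""
    labels = np.zeros(len(X), dtype=int)      # start with every point in cluster 0
    centers = [X.mean(axis=0)]
    while len(centers) < k:
        best = None
        for i in range(len(centers)):
            pts = X[labels == i]
            if len(pts) < 2:
                continue                      # nothing to split here
            km = KMeans(n_clusters=2, n_init=10).fit(pts)
            sse_split = km.inertia_           # SSE of the two new sub-clusters
            sse_not_split = sum(np.sum((X[labels == j] - centers[j]) ** 2)
                                for j in range(len(centers)) if j != i)
            if best is None or sse_split + sse_not_split < best[0]:
                best = (sse_split + sse_not_split, i, km)
        _, i, km = best
        idx = np.where(labels == i)[0]
        labels[idx[km.labels_ == 1]] = len(centers)   # sub-cluster 1 becomes a new cluster
        labels[idx[km.labels_ == 0]] = i              # sub-cluster 0 keeps the split cluster's index
        centers[i] = km.cluster_centers_[0]
        centers.append(km.cluster_centers_[1])
    return np.array(centers), labels

# e.g. centers, labels = bisecting_kmeans(np.array(kMeans.loadDataSet('testSet2.txt')), 3)
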
    "M11": [0.839471456333, 0.385856421, 0.983790752333],
    "M12": [0.957817317, 0.3012502055, 0.7800295435],
}

dispLabelDic = {
    "D1": [0.47949965, 0],
    "D2": [0.24391939, 0.65005331],
    "D3": [0.65160991, 1],
    "D4": [0.66235972, 0.59802129],
    "D5": [0.61647991, 0.53295326],
    "D6": [0.64551821, 0.67475389],
    "D7": [0, 0.3630064],
}


motionDataMat = mat(kMeans.loadDataSet("motionData_Training.txt"))
dispDataMat = mat(kMeans.loadDataSet("GPS_1Hz_training.txt"))

print(motionDataMat[0])

# print ProbIN.classifyMotionLabel(motionDataMat[0]) == 'M4'

print(dispDataMat[0])

print(ProbIN.classifyDispLabel(dispDataMat[0]))

f = open("MD_pair_1Hz_for_Training.txt", "w")

for i in range(len(dispDataMat)):
    print(ProbIN.classifyMotionLabel(motionDataMat[i]), "\t", ProbIN.classifyDispLabel(dispDataMat[i]), file=f)
Example #19
import kMeans
from numpy import *
from imp import reload

# dataMat = mat(kMeans.loadDataSet('./testSet.txt'))
# kMeans.randCent(dataMat, 2)
# kMeans.distEclud(dataMat[0], dataMat[1])
# myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)

datMat3 = mat(kMeans.loadDataSet('./testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
Example #20
# coding:utf-8

import kMeans

from  numpy import *

datMat=mat(kMeans.loadDataSet('testSet.txt'))
print(datMat[1:5, :])


myCentroids,clustAssing=kMeans.kMeans(datMat,4)
print(myCentroids)
print(' ')
print(clustAssing)

datMat3=mat(kMeans.loadDataSet('testSet2.txt'))
centList,myNewAssments=kMeans.biKmeans(datMat3,3)
print(centList, myNewAssments)
Example #21
        """用matplotlib展示划分过程及结果"""
        for t in range(len(centList)):  #遍历当前的每一个簇
            ptsInCurrCluster = dataSet[nonzero(
                clusterAssment[:, 0].A == t)[0], :]  # filter out the points belonging to this cluster
            x = flatten(ptsInCurrCluster[:, 0].tolist())
            y = flatten(ptsInCurrCluster[:, 1].tolist())
            plt.scatter(x, y)  # plot the points of this cluster
        xx = []
        yy = []
        for cent in centList:  # mark the centroids
            xx.append(cent[0, 0])
            yy.append(cent[0, 1])
        plt.scatter(xx, yy, marker='*')
        plt.show()

    return centList, clusterAssment


if __name__ == '__main__':
    test = loadDataSet('./testSet2.txt')
    x = []
    y = []
    for t in test:
        x.append(t[0])
        y.append(t[1])
    plt.scatter(x, y)
    plt.show()  # show a scatter plot of the raw data
    dataSet = mat(test)
    #print dataSet,dataSet[0,:]
    biKmeans(dataSet, 3)
Example #22

import kMeans
import pylab
import xlwt
from numpy import *


def showFigure(dataMat, k, clusterAssment):

    tag = ['go', 'or', 'yo', 'ko', 'bo', 'mo']
    for i in range(k):
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        c = mat(i * ones((len(datalist), 1)))
        pylab.plot(datalist[:, 0], c, tag[i])
    pylab.show()

    row = 0
    for i in range(k):
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        for j in range(len(datalist)):
            sheet1.write(row, 0, datalist[j, 0])
            #sheet1.write(row, 1, datalist[j,1])
            sheet1.write(row, 1, tag[i])
            row += 1


if __name__ == '__main__':
    outputfilename = 'D:\\code\\KM\\res.xls'
    outputfile = xlwt.Workbook()
    sheet1 = outputfile.add_sheet('sheet1', cell_overwrite_ok=True)
    k = 6
    dataMat = mat(kMeans.loadDataSet('D:\\code\\KM\\site.txt'))
    myCentroids, clusterAssment = kMeans.kMeans(dataMat, k)
    showFigure(dataMat, k, clusterAssment)
    outputfile.save(outputfilename)
Example #23
from sklearn.cluster import KMeans
from numpy import *
import kMeans

X = kMeans.loadDataSet('testSet.txt')
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
print("sklearn实现质心列表为:", kmeans.cluster_centers_)
centroids, _ = kMeans.kMeans(mat(X), 4)
print("python实现质心列表为:", centroids)
Example #24
File: 10.py Project: niumeng07/ML
#!/usr/bin/env python
#-*- coding: UTF-8 -*-

import kMeans
from numpy import *

dataMat=mat(kMeans.loadDataSet('testSet.txt'))
kMeansRandCenter = kMeans.randCent(dataMat, 2)  # two random centers
print(kMeansRandCenter)

centroids,clusterAssment=kMeans.kMeans(dataMat,5)

import matplotlib.pyplot as plt
fig=plt.figure(1)
plt.plot(centroids[:,0],centroids[:,1],'ro')
plt.plot(dataMat[:,0],dataMat[:,1],'bo')
plt.axis([-8,8,-8,8])
# plt.show()

kMeans.binaryKeans(dataMat,3)

dataMat3=mat(kMeans.loadDataSet('testSet2.txt'))
centList,Assments=kMeans.binaryKeans(dataMat3,3)
print("centList:",centList)
print("Assments:",Assments)
fig=plt.figure(2)
plt.plot(dataMat3[:,0],dataMat3[:,1],'bo')
plt.plot(centList[:,0],centList[:,1],'ro')
plt.axis([-10,10,-10,10])
# plt.show()
Example #25
import kMeans
import pylab
import xlwt
from numpy import *


def showFigure(dataMat, k, clusterAssment):

    tag = ['go', 'or', 'yo', 'ko', 'bo', 'mo']
    for i in range(k):
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        pylab.plot(datalist[:, 0], datalist[:, 1], tag[i])
    pylab.show()

    row = 0
    for i in range(k):
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        for j in range(len(datalist)):
            sheet1.write(row, 0, datalist[j, 0])
            sheet1.write(row, 1, datalist[j, 1])
            sheet1.write(row, 2, tag[i])
            row += 1


if __name__ == '__main__':
    outputfilename = 'D:\\code\\team\\res.xls'
    outputfile = xlwt.Workbook()
    sheet1 = outputfile.add_sheet('sheet1', cell_overwrite_ok=True)
    k = 6
    dataMat = mat(kMeans.loadDataSet('D:\\code\\team\\data.txt'))
    myCentroids, clusterAssment = kMeans.kMeans(dataMat, k)
    showFigure(dataMat, k, clusterAssment)
    outputfile.save(outputfilename)
Example #26
import kMeans
from numpy import *
import matplotlib
import matplotlib.pyplot as plt

k = 4
datmat = array(kMeans.loadDataSet('testSet.txt'))
centerList, clusterAssment = kMeans.biKmeans(datmat,k)
print('The centroids are:', centerList)
fig = plt.figure()
fig.add_subplot(111)
colorList = ['b','c','g','k','r','y']
makerList = ['.','^','*','o','+']
for i in range(k):
    ax = plt.scatter(datmat[nonzero(clusterAssment[:,0].A == i)[0],0],datmat[nonzero(clusterAssment[:,0].A == i)[0],1],
                     c = colorList[i],marker=makerList[i])
    ax = plt.scatter(array(centerList[:,0]),array(centerList[:,1]),c = colorList[4],marker=makerList[3])
plt.title('Graph of k-Means')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
Example #27
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:yiluzhang

import kMeans
import numpy as np

if __name__ == '__main__':
    data_set = np.mat(kMeans.loadDataSet('testSet.txt'))
    cent, clus = kMeans.kMeans(data_set, 4)
    print(cent)
    #print(clus)
Example #28
#!/usr/bin/env python
__coding__ = "utf-8"
__author__ = "Ng WaiMing"

from kMeans import kMeans
from kMeans import loadDataSet
from kMeans import randCent
from kMeans import distEclud
from kMeans import biKmeans
from numpy import *

if __name__ == '__main__':
    dataMat = mat(loadDataSet('testSet.txt'))
    print('min(dataMat[:, 0])', min(dataMat[:, 0]), '\n')
    print('min(dataMat[:, 1])', min(dataMat[:, 1]), '\n')
    print('max(dataMat[:, 0])', max(dataMat[:, 0]), '\n')
    print('max(dataMat[:, 1])', max(dataMat[:, 1]), '\n')
    print(randCent(dataMat, 2), '\n')
    print(distEclud(dataMat[0], dataMat[1]))
    centroids, clusterAssment = kMeans(dataMat, 4)
    print('centroids:\n', centroids, '\n')
    print('clusterAssment:\n', clusterAssment, '\n')
    dataMat3 = mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(dataMat3, 3)
    print('centList: \n', centList, '\n')
    # fileName = '../../../../data/k-means/places.txt'
    # imgName = '../../../../data/k-means/Portland.png'
    # kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
Example #29
import kMeans
from numpy import *

# load the data from the txt file
datMat = mat(kMeans.loadDataSet('data2.txt'))

# columns 2-4 of datMat hold the semi-major axis, eccentricity and orbital inclination
datMat[0, 2:5]

# compute the distance between two orbits
delta_v = kMeans.distdeltaV(datMat[0, 2:5], datMat[1, 2:5])

# randomly generate k centroids
centroids = kMeans.randCent(datMat[:, 2:5], 4)

# k-means clustering
myCentroids, clustAssing = kMeans.kMeans(datMat[:, 2:5], 5, kMeans.distdeltaV)

# bisecting k-means clustering
centList, myNewAssments = kMeans.biKmeans(datMat[:, 2:5], 5, kMeans.distdeltaV)

# plot the result
kMeans.showCluster_SRQ(datMat[:, 2:5], myNewAssments)
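
distdeltaV and showCluster_SRQ are this project's own additions to kMeans.py and are not shown here; the point is that the book's kMeans and biKmeans accept any distMeas callable. A hypothetical weighted Euclidean distance over the three orbital elements, purely to illustrate the plug-in signature (the weights are made up, not taken from the project):

from numpy import sqrt


def distWeighted(vecA, vecB, weights=(1.0, 10.0, 0.1)):
    # vecA and vecB are 1x3 row matrices holding (semi-major axis, eccentricity, inclination)
    diff = (vecA - vecB).A.flatten()
    return sqrt(sum(w * d * d for w, d in zip(weights, diff)))

# it plugs in wherever a distMeas argument is accepted, e.g.:
# myCentroids, clustAssing = kMeans.kMeans(datMat[:, 2:5], 5, distWeighted)
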
Example #30
# -*- coding: UTF-8 -*-
# kMeans algorithm test
# runtime environment: Python 3

from numpy import *
import kMeans

print("loading data...")
dataSet = mat(kMeans.loadDataSet('testSetForKMeans.txt'))

k = 4
centroids, clusterAssment = kMeans.kMeans(dataSet, k)

print("show the result...")
kMeans.showCluster(dataSet, k, centroids, clusterAssment)
Example #31
def plotResult():
    datMat=mat(kMeans.loadDataSet('testSet.txt'))
    myCentroids, clustAssing = kMeans.kMeans(datMat,4)
Example #32
import kMeans
from numpy import *

dataMat = mat(kMeans.loadDataSet('testSet.txt'))
# print min(dataMat[:,0])
#
# print(kMeans.randCent(dataMat,2))
#
# print(kMeans.distEclud(dataMat[0],dataMat[1]))

myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
print(myCentroids)
def test2():
    dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
    myCentroids, clusterAssing = kMeans.kMeans(dataMat, 4)
    print(clusterAssing)