def spectral(R, k, n):
    """Spectral clustering of the samples in R into k clusters.

    :param R: list of sample feature vectors
    :param k: number of clusters / eigenvectors kept
    :param n: unused in this body -- NOTE(review): possibly meant to replace
        the hard-coded 3 passed to minN below; confirm with callers.
    :return: result of kMeans.kMeans on the spectral embedding
    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    # Build an all-zero len(R) x len(R) matrix.
    W = []
    for i in range(0, len(R)):
        dist = []
        for j in range(0, len(R)):
            dist.append(0)
        W.append(dist)
    # Fill W symmetrically with pairwise distances.
    for i in range(0, len(R)):
        for j in range(i + 1, len(R)):
            W[j][i] = W[i][j] = distEnclud(R[i], R[j])
    # In-place transform of W by an external helper; semantics not visible here.
    minN(W, 3)
    # Eigendecomposition; keep eigenvectors of the k smallest eigenvalues.
    a, dist = numpy.linalg.eig(W)
    e = numpy.array(dist.T)
    idx = numpy.argsort(a)
    eigVec = []
    for i in range(0, k):
        eigVec.append(e[idx[i]])
    e = numpy.array(eigVec)
    e = e.T
    # Embed each sample: real parts rounded to 3 decimals, plus a trailing
    # 0 slot per row.
    # NOTE(review): placement of the trailing append(0) (per-row vs per-entry)
    # is inferred from the collapsed source -- confirm.
    final = []
    for i in range(0, len(e)):
        tmp = []
        for j in range(0, k):
            a = e[i][j].real
            tmp.append(float('%0.3f' % a))
        tmp.append(0)
        final.append(tmp)
    return kMeans.kMeans(final, k, 1)
def executeKMeans(dataTraining, dataTest):
    """Run k-means for k = 1..5 on min-max-scaled training data, collecting
    centroids, assignments and elbow values per k.

    :param dataTraining: raw training samples (2-D array-like)
    :param dataTest: unused here -- NOTE(review): confirm whether it was
        meant to feed an evaluation step.
    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    data = min_max_scaler.fit_transform(dataTraining)
    pontosTreino, dimensoes = data.shape
    centroidesPorK = []
    clusteredPorK = []
    clusteredNumbersPorK = []
    k_clusters = [1, 2, 3, 4, 5]
    elbow_values_plot = []
    for k in k_clusters:
        clustered, centroides = km.kMeans(data, k, dimensoes)
        # Cluster labels live in the extra column appended by km.kMeans.
        clusterNumbers = np.unique(clustered[:, dimensoes])
        centroidesPorK.append(centroides)
        clusteredPorK.append(clustered)
        clusteredNumbersPorK.append(clusterNumbers)
        # for ci in clusterNumbers:
        #     ci = clustered[clustered[:, 2] == ci]
        #     ci = ci[:, :2]
        #     cix = ci[:, 0]
        #     ciy = ci[:, 1]
        #     plt.plot(cix, ciy, color=np.random.random(3), marker='x', linestyle='')
        # plt.show()
        value = em.elbow_value(clustered, centroides, dimensoes)
        elbow_values_plot.append(value)
    print(clusteredPorK)
def biKmeans(dataSet, k):
    """Bisecting k-means: repeatedly 2-means-split the cluster whose split
    yields the lowest total SSE until k clusters exist.

    :param dataSet: data matrix (one sample per row)
    :param k: desired number of clusters
    :return: (np.mat of k centroids, m x 2 assignment matrix of
             [cluster index, squared error] per sample)
    """
    m = np.shape(dataSet)[0]  # number of samples
    clusterAssment = np.mat(np.zeros((m, 2)))
    # Start from a single centroid: the mean of all points.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        # BUG FIX: the original assigned to clusterAssment[:, 1] (the whole
        # column) on every pass, so every point ended up with the distance
        # of the *last* point; the per-point slot is [j, 1].
        clusterAssment[j, 1] = tools.distEclud(np.mat(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = np.inf
        # Trial-split each existing cluster and keep the best split.
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2)
            sseSplit = np.sum(splitClustAss[:, 1])          # SSE of the split part
            sseNotSplit = np.sum(                           # SSE of the untouched part
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel the 2-means output (0/1) to the split index and a new index.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # Replace the split centroid and append the new one.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Re-assign the split cluster's points and their SSE.
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return np.mat(centList), clusterAssment
def biKmeans(dataSet, k, distMeas=support.distEclud):
    """Bisecting k-means with a pluggable distance measure.

    :param dataSet: data matrix (one sample per row)
    :param k: desired number of clusters
    :param distMeas: distance function (defaults to Euclidean from support)
    :return: (list of centroids, m x 2 assignment matrix)
    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    m = np.shape(dataSet)[0]
    # Per sample: [assigned cluster index, squared error].
    clusterAssment = np.mat(np.zeros((m, 2)))
    # Single starting centroid: mean of all points.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.mat(centroid0), dataSet[j, :])**2
    while (len(centList) < k):
        lowestSSE = np.inf
        # Trial-split each cluster; keep the split with lowest combined SSE.
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(
                clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = np.sum(splitClustAss[:, 1])
            sseNotSplit = np.sum(
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel 0/1 split output to the split index and the next free index.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # NOTE(review): unlike sibling implementations, the new centroids are
        # stored as matrix rows (no .tolist()[0]); confirm downstream callers
        # expect that.
        centList[bestCentToSplit] = bestNewCents[0, :]
        centList.append(bestNewCents[1, :])
        clusterAssment[np.nonzero(
            clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return (centList), clusterAssment
def mainBayes():
    """Cluster the word arrays with 6-means, then score the test set against
    per-cluster Gaussian statistics (mean/variance) and predict a label.

    :return: label chosen by predict() over per-cluster probabilities
    NOTE(review): indentation reconstructed from a collapsed source line;
    variables `a` and `types` are computed but unused.
    """
    testset = getTestset()
    group,labels = word.getArrays()
    dateset = np.array(group)
    a =dateset.shape[0]
    k = 6
    countAll,count = kMeans.kMeans(dateset, k)
    types = kMeans.mainKMeans2()
    # Copy cluster counts and coerce them to int.
    Count =[]
    CountAll = []
    for i in range(len(count)):
        Count.append(count[i])
    for i in range(len(Count)):
        Count[i] = int(Count[i])
    for i in range(len(countAll)):
        CountAll.append(countAll[i])
    for i in range(len(CountAll)):
        CountAll[i] = int(CountAll[i])
    k = 6
    datesetC = kMeans.classifyDateset(dateset,countAll,count,k)
    # Per-cluster Gaussian statistics and test-set probabilities.
    Average = []
    Var = []
    Pro = []
    for i in range(k):
        Average.append(calAverage(datesetC[i]))
        Var.append(calVar(datesetC[i],Average[i]))
        Pro.append(calPro(testset, Average[i], Var[i]))
    a = len(count)
    rs =predict(Pro)
    label = getLabel(rs,labels,countAll)
    return label
def Recursive_kMeans(centroids, dataMat, K, loop):
    """Run kMeans.kMeans repeatedly, feeding each result's centroids back in.

    :param centroids: dict holding the current centroids under "centroids"
    :param dataMat: data matrix to cluster
    :param K: number of clusters
    :param loop: remaining iterations; recursion stops at 0
    :return: the final centroids structure returned by kMeans.kMeans
    """
    global nLOOP
    # Guard clause: no iterations left, hand back what we have.
    if loop <= 0:
        print("Recursive kMeans Loop: Finished.")
        return centroids
    loop -= 1
    print("Recursive kMeans Loop: %d times" % (nLOOP - loop))
    refined = kMeans.kMeans(centroids["centroids"], dataMat, K)
    return Recursive_kMeans(refined, dataMat, K, loop)
def testkMeans():
    """Smoke-test the km.kMeans implementation on the Mall_Customers dataset
    (income vs. spending score columns) and plot the clusters.

    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    print(
        "Testing kMeans clustering - Clustering the Mall_Customer Dataset by Income and Spending Score"
    )
    df = pd.read_csv("testData\Mall_Customers.csv", delimiter=',')
    # Encode gender as 0/1.
    # NOTE(review): the comparands are upper-case 'FEMALE'/'MALE'; confirm
    # the CSV uses that casing, otherwise these assignments match nothing.
    df.loc[(df.Gender == 'FEMALE'), 'Gender'] = 0
    df.loc[(df.Gender == 'MALE'), 'Gender'] = 1
    X = df.values
    # Cluster on columns 3:5.
    # NOTE(review): the meaning of (200, 5) depends on km.kMeans' signature
    # (iterations vs. k) -- confirm against the implementation.
    clusters = km.kMeans(X[:, 3:5], 200, 5)
    print("<-------------------->")
    plotKMeansPoints(X, clusters, 5)
def knn(self, filename, k, GenVector):
    """k-means-cluster the feature vectors extracted from `filename` and
    write the overall and per-cluster results plus a log under ../Data/.

    :param filename: path of the raw data file fed to GenVector
    :param k: number of clusters
    :param GenVector: callable that turns the file into a list of records
    NOTE(review): Python 2 code; indentation reconstructed from a collapsed
    source line.
    """
    #callinfo = self.GenVector_1(filename)
    dataSource = GenVector(filename)
    tmp_list = []
    for call in dataSource:
        # The leading columns are the call's id/code reference info; excluded.
        tmp_list.append(call[3:len(call)])
    dataArray = array(tmp_list)
    import kMeans
    centroids, clusterAssment = kMeans.kMeans(dataArray, k)
    clusterlist = clusterAssment.tolist()
    # Prepend each record's cluster assignment to the original record.
    list_knn = []
    for i in range(0, len(dataSource)):
        #f.write(callinfo[i][0])
        list_knn.append(clusterlist[i] + dataSource[i])
    # Timestamped result directory.
    resultdir = "../Data/knn_%s/" % time.strftime('%Y%m%d%H%M%S')
    os.mkdir(resultdir)
    f = open(resultdir + "knn_result.txt", 'w')
    for list_item in list_knn:
        f.write(str(list_item))
        f.write("\n")
    f.close()
    # One output file per cluster, counting members as we go.
    num = [0] * k
    for i in range(0, k):
        f_name = resultdir + "knn_result_%s.txt" % i
        f_k = open(f_name, 'w')
        for list_item in list_knn:
            if (list_item[0] == i):
                num[i] = num[i] + 1
                f_k.write(str(list_item))
                f_k.write("\n")
        f_k.close()
    # Human-readable summary (also written to log.txt).
    s_out = ""
    s_out += "k-均值聚类结束!\n"
    s_out += "数据源文件:\n\t%s\n" % filename
    s_out += "特征向量提取方法:\n\t%s\n" % GenVector
    s_out += "样本量:\n\t%d\n" % len(list_knn)
    s_out += "类别数量:\n\tk = %d \n" % k
    s_out += "结果文件为(位置:%s):\n\tlog.txt (执行日志)\n\tknn_result.txt (总的聚类结果)\n" % resultdir
    for i in range(0, k):
        s_out += "\tknn_result_%d.txt " % i
        s_out += " (l = %d)\n" % num[i]
    print s_out
    f = open(resultdir + "log.txt", 'w')
    f.write(s_out)
    f.close()
    return
def main():
    """Fit a Gaussian mixture on GMM_dataset.txt, seeded with k-means output.

    NOTE(review): indentation reconstructed from a collapsed source line.
    """
    data = numpy.loadtxt('GMM_dataset.txt')
    # k-means with k=5; the meaning of r/t comes from the kMeans class'
    # signature (presumably restarts/iterations and tolerance) -- confirm.
    km = kMeans.kMeans(k=5, r=30, t=1e-03)
    km.clusterData(data)
    trueMean, trueCov = computeTrueValues(data)
    log({"true mean": trueMean})
    log({"true cov": trueCov})
    # Seed the GMM with the k-means centroids and cluster assignments.
    gmm = gMM(data, r=50, centroids=km.centroids, clusters=km.clusters, t=1e-03)
    gmm.fit()
    print("Gaussian Mixture Centroids - ", gmm.centroids)
    print("Gaussian Mixture Covariance - ", gmm.covs)
    gmm.plotGaussian(data)
def clustering():
    """Flask view: cluster the module-level dataFrame with a user-supplied k
    and render the analysis page with the resulting tables.

    NOTE(review): relies on module-level globals dataFrame and
    selectedFeatures defined elsewhere in the file; indentation
    reconstructed from a collapsed source line.
    """
    # k arrives as a form string; converted to int for the clustering call.
    k = request.form.get('k')
    clusters = kMeans(dataFrame, selectedFeatures, int(k))
    global dataFrame1
    dataFrame1 = getIdWithClusters(clusters, dataFrame)
    global dataFrame2
    dataFrame2 = describeData(selectedFeatures, dataFrame, clusters)
    return render_template("temp/analysis.html",
                           numberOfClusters=k,
                           data1=dataFrame1.to_html(index=False, table_id='ID1'),
                           data2=dataFrame2.to_html(table_id='ID2'))
def biKmeans(dataSet, k, distMeas=kMeans.distEclud):
    """
    Bisecting k-means: given a data set, a desired cluster count and a
    distance measure, return the clustering result.
    :param dataSet: data set
    :param k: desired number of clusters
    :param distMeas: distance measure
    :return: (matrix of centroids, per-sample assignment matrix)
    """
    m = shape(dataSet)[0]
    # One row per point: [assigned cluster index, squared error].
    clusterAssment = mat(zeros((m,2)))
    # Initialise the single starting centroid as the mean of all points.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    # The centroid list initially holds just that one centroid.
    centList =[centroid0]
    # Squared distance of every point to the initial centroid.
    for j in range(m):
        clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
    # Keep splitting until the requested number of clusters is reached.
    while (len(centList) < k):
        # Reset the best (lowest) SSE seen this round.
        lowestSSE = inf
        # Trial-split every existing cluster.
        for i in range(len(centList)):
            # Points currently assigned to cluster i.
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]
            # 2-means split of cluster i.
            centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2, distMeas)
            # Total squared error of the split part...
            sseSplit = sum(splitClustAss[:,1])
            # ...and of everything that did not take part in the split.
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            # The split minimising combined error wins; smaller is better.
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel the winning split's output: kMeans with 2 clusters yields
        # labels 0 and 1, which must become the split cluster's index and the
        # next free index.
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        print('the bestCentToSplit is: ',bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # Update the centroid list: replace centroid i with the split's first
        # centroid and append the second as a new cluster.
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
        centList.append(bestNewCents[1,:].tolist()[0])
        # Re-assign the split cluster's points (labels and SSE).
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss
    return mat(centList), clusterAssment
def optimalClustering():
    """Flask view: cluster with the automatically chosen optimal k and render
    the analysis page.

    NOTE(review): relies on module-level globals data, maxK, dataFrame and
    selectedFeatures defined elsewhere in the file; indentation
    reconstructed from a collapsed source line.
    """
    optimalK = getOptimalK(data, maxK)
    clusters = kMeans(dataFrame, selectedFeatures, optimalK)
    global dataFrame1
    dataFrame1 = getIdWithClusters(clusters, dataFrame)
    global dataFrame2
    dataFrame2 = describeData(selectedFeatures, dataFrame, clusters)
    return render_template("temp/analysis.html",
                           numberOfClusters=str(optimalK),
                           data1=dataFrame1.to_html(index=False, table_id='ID1'),
                           data2=dataFrame2.to_html(table_id='ID2'))
def main():
    """Cluster pre-trained word embeddings into 2700 groups.

    NOTE(review): Python 2 code (print statements); indentation
    reconstructed from a collapsed source line.
    """
    word_embedding_df = pd.read_hdf('rem_word_embedding.h5', 'df')
    word_embedding = mat(word_embedding_df.values)  # .values is an array
    print 'word_embedding is loaded'
    m = word_embedding.shape[0]
    n = word_embedding.shape[1]
    print m
    print n
    # Column 0 holds the word label; the remaining columns are the vector.
    word_embedding_label = word_embedding[:, 0]
    word_embedding_vector = word_embedding[:, 1:]
    # print word_embedding_label
    # print word_embedding_vector
    myCentriods, clustAssing = kMeans.kMeans(word_embedding_vector, 2700)
    print 'clustAssing:'
    print clustAssing
def startupRecognition():
    """UI callback: validate the entry fields, build a kMeans work object
    sized to the canvas, and run the recognition on a daemon thread.

    NOTE(review): relies on module-level UI globals (imageAmountEntry,
    classAmountEntry, btnProcess, canvasMain); indentation reconstructed
    from a collapsed source line.
    """
    imageAmount = int(imageAmountEntry.get())
    classAmount = int(classAmountEntry.get())
    # Guard: silently bail out when the requested workload is out of bounds.
    if not (1000 <= imageAmount <= 100000) or not (2 <= classAmount <= 20):
        return
    labelProcessing = startupUI(btnProcess)
    workObject = kMeans(imageAmount, classAmount,
                        canvasMain.winfo_width(), canvasMain.winfo_height())
    workObject.generateData()
    workObject.recognize()
    # Daemon thread so the UI can exit without waiting for the recount.
    workerThread = Thread(target=coresRecount,
                          args=(workObject, canvasMain, labelProcessing, btnProcess))
    workerThread.daemon = True
    workerThread.start()
    return
def biKmeans(dataSet, k, distMeas=distEclud): m = shape(dataSet)[0] # 这里第一列为类别,第二列为SSE clusterAssment = mat(zeros((m,2))) # 看成一个簇是的质心 centroid0 = mean(dataSet, axis=0).tolist()[0] centList =[centroid0] #create a list with one centroid for j in range(m): #计算只有一个簇是的误差 clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2 # 核心代码 while (len(centList) < k): lowestSSE = inf # 对于每一个质心,尝试的进行划分 for i in range(len(centList)): # 得到属于该质心的数据,其中clusterAssment[:,0].A==i相当于是去判断和i相等的数,并且按照下标给出等于则为true。nonzero(clusterAssment[:,0].A==i)返回需要的下标。 ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:] # 对该质心划分成两类 centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas) # 二分K-均值首次都是采用的随机给质心方式。 # 计算该簇划分后的SSE sseSplit = sum(splitClustAss[:,1]) # 没有参与划分的簇的SSE sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1]) print("sseSplit, and notSplit: ",sseSplit,sseNotSplit) # 寻找最小的SSE进行划分 # 即对哪一个簇进行划分后SSE最小 if (sseSplit + sseNotSplit) < lowestSSE: bestCentToSplit = i bestNewCents = centroidMat bestClustAss = splitClustAss.copy() lowestSSE = sseSplit + sseNotSplit # 较难理解的部分 bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #change 1 to 3,4, or whatever bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit print('the bestCentToSplit is: ',bestCentToSplit) print('the len of bestClustAss is: ', len(bestClustAss)) centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0] #replace a centroid with two best centroids' print(bestNewCents[0, :].tolist()[0]) centList.append(bestNewCents[1,:].tolist()[0]) clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss #reassign new clusters, and SSE,这里是把正在拆分的簇全部替换。 return mat(centList), clusterAssment
def Spectral_Clustering(D, k, ratio=True, sig=1):
    """Spectral clustering of the n x d data matrix D into k clusters.

    :param D: n x d data matrix (one sample per row)
    :param k: number of clusters / eigenvectors kept
    :param ratio: True for ratio cut (use L directly), False for normalized
        cut (use deg^-1 L)
    :param sig: bandwidth passed to the external similarity() kernel
    :return: result of kMeans on the normalized spectral embedding
    """
    # Get dimensions of nxd D matrix
    n, d = D.shape
    # Compute nxn adjacency (similarity) matrix; diagonal stays 0.
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j:
                A[i][j] = similarity(D[i, :], D[j, :], sig)
    # Degree matrix: row sums of A on the diagonal.
    deg = np.identity(n) * np.sum(A, axis=1)
    # Laplacian.
    L = deg - A
    # Set B according to ratio cut / normalized cut.
    if ratio:
        B = L
    else:
        B = np.linalg.inv(deg) @ L
    # Compute eigenvalues and eigenvectors of B.
    w, v = np.linalg.eig(B)
    # BUG FIX: np.linalg.eig returns eigenpairs in no particular order, so
    # taking v[:, :k] picked k arbitrary eigenvectors. Spectral clustering
    # needs the eigenvectors of the k *smallest* eigenvalues; sort first.
    # .real discards numerically-spurious imaginary parts (B may be
    # non-symmetric in the normalized-cut branch).
    order = np.argsort(w.real)
    v = v[:, order[:k]].real
    # Row-normalize the reduced basis to obtain the embedded dataset.
    Y = np.zeros((n, k))
    for i in range(n):
        norm = np.sqrt(np.sum(v[i, :] ** 2))
        # Guard against a zero row (would otherwise divide by zero).
        Y[i, :] = v[i, :] / norm if norm > 0 else v[i, :]
    # Run kMeans on the new dataset.
    return kMeans(Y, k)
#!/usr/bin/env python __coding__ = "utf-8" __author__ = "Ng WaiMing" from kMeans import kMeans from kMeans import loadDataSet from kMeans import randCent from kMeans import distEclud from kMeans import biKmeans from numpy import * if __name__ == '__main__': dataMat = mat(loadDataSet('testSet.txt')) print('min(dataMat[:, 0])', min(dataMat[:, 0]), '\n') print('min(dataMat[:, 1])', min(dataMat[:, 1]), '\n') print('max(dataMat[:, 0])', max(dataMat[:, 0]), '\n') print('max(dataMat[:, 1])', max(dataMat[:, 1]), '\n') print(randCent(dataMat, 2), '\n') print(distEclud(dataMat[0], dataMat[1])) centroids, clusterAssment = kMeans(dataMat, 4) print('centroids:\n', centroids, '\n') print('clusterAssment:\n', clusterAssment, '\n') dataMat3 = mat(loadDataSet('testSet2.txt')) centList, myNewAssments = biKmeans(dataMat3, 3) print('centList: \n', centList, '\n') # fileName = '../../../../data/k-means/places.txt' # imgName = '../../../../data/k-means/Portland.png' # kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
import kMeans as km # reload(kMeans) # Call if any changes have been made to kMeans.py import sklearn.cluster as sklearn # Create data dataSet = km.createDataSet(10,50,5,10,1) # Cluster numbers to test Nclusters = 12 # Slightly larger than true cluster number Num_iters = 50 # Test my implementation start_time = time.clock() for i in range(Num_iters): final_cluster_pos, cost = km.kMeans(dataSet, Nclusters) avg_run_time = (time.clock() - start_time)/Num_iters print "Average run time per my kMeans iteration is", avg_run_time, "s" # Test scikit-learn kMeans start_time_scikit = time.clock() skm = sklearn.KMeans(init='random', n_clusters=Nclusters, n_init=Num_iters) skm.fit(dataSet) avg_run_time_scikit = (time.clock() - start_time_scikit)/Num_iters print "Average run time per sklearn kMeans iteration is", avg_run_time_scikit, "s"
# Inline bisecting k-means over dataMat (k=3).
# NOTE(review): this chunk appears truncated -- the while-loop body below
# never updates centList (the replace/append/re-assign lines are missing),
# so as written the loop cannot terminate. Indentation reconstructed.
dataSet = mat(dataMat)
k = 3
m = shape(dataSet)[0]
clusterAssment = mat(zeros((m, 2)))
centroid0 = mean(dataSet, axis=0).tolist()[0]
centList = [centroid0]  #create a list with one centroid
for j in range(m):  #calc initial Error
    clusterAssment[j, 1] = distEclud(mat(centroid0), dataSet[j, :])**2
while (len(centList) < k):
    lowestSSE = inf
    for i in range(len(centList)):
        ptsInCurrCluster = dataSet[nonzero(
            clusterAssment[:, 0].A == i)[0], :]  #get the data points currently in cluster i
        centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distEclud)
        sseSplit = sum(
            splitClustAss[:, 1])  #compare the SSE to the currrent minimum
        sseNotSplit = sum(
            clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
        print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
        if (sseSplit + sseNotSplit) < lowestSSE:
            bestCentToSplit = i
            bestNewCents = centroidMat
            bestClustAss = splitClustAss.copy()
            lowestSSE = sseSplit + sseNotSplit
    bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0],
                 0] = len(centList)  #change 1 to 3,4, or whatever
    bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
    print('the bestCentToSplit is: ', bestCentToSplit)
    print('the len of bestClustAss is: ', len(bestClustAss))
# Driver: exercise randCent/distEclud/kMeans/biKmeans on the test sets.
# NOTE(review): indentation reconstructed from a collapsed source line.
if __name__ == "__main__":
    import kMeans
    dataMat = mat(kMeans.loadDataSet('testSet.txt'))
    #print("\ndataMat:\n", dataMat)
    #print("\n(dataMat[,:0]):\n", dataMat[:, 0])
    print("\nmin(dataMat[,:0]):", min(dataMat[:, 0]))
    #print("\n(dataMat[,:1]):\n", dataMat[:, 1])
    print("\nmin(dataMat[,:1]):", min(dataMat[:, 1]))
    print("\nrandCent(dataMat,2):\n", randCent(dataMat, 2))
    print("\ndistEclud(dataMat[0],dataMat[1]:\n",
          distEclud(dataMat[0], dataMat[1]))
    myCentroids, clusterAssing = kMeans.kMeans(dataMat, 6)
    print("\nmyCentroids:\n", myCentroids, "\nclusterAssing:\n", clusterAssing)
    #################################
    print("\n#################biMeans:#########################\n")
    dataMat3 = mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(dataMat3, 3)
    print("\ncentList:\n", centList)
    #################################
    print("\n#################Yahoo:#########################\n")
    # geoResults = kMeans.geoGrab('1 VA Center', 'Augusta, ME')
    # print("geoResults:\n",geoResults)
    #print(massPlaceFind('portlandClubs.txt'))
    print("\n#################Clubs:#########################\n")
# NOTE(review): this chunk starts mid-file -- the first three statements
# belong to a loop over a 'gFr' file handle that is not visible here.
# Indentation reconstructed.
categoryG.append(1)
gDataSet.append(b)
gFr.close()
# Load the MNIST-style data: last field of each row is the class label.
with open('mnist.txt', 'r') as mFr:
    for line in mFr:
        a = line.split(',')
        b = []
        for item in a:
            b.append(float(item))
        categoryM.append(b[-1])
        mDataSet.append(b)
mFr.close()
# Evaluate k-means, NMF and spectral clustering against the true labels.
calculate.calculate(kMeans.kMeans(gDataSet, 2), categoryG, 2)
calculate.calculate(kMeans.kMeans(mDataSet, 10), categoryM, 10)
calculate.calculate(nmf.NMF(gDataSet, 2), categoryG, 2)
# NOTE(review): categoryG is paired with mDataSet here -- likely a
# copy-paste slip for categoryM; confirm before relying on this score.
calculate.calculate(nmf.NMF(mDataSet, 10), categoryG, 10)
calculate.calculate(spectral.spectral(gDataSet, 2, 3), categoryG, 2)
calculate.calculate(spectral.spectral(gDataSet, 2, 6), categoryG, 2)
calculate.calculate(spectral.spectral(gDataSet, 2, 9), categoryG, 2)
calculate.calculate(spectral.spectral(mDataSet, 10, 3), categoryM, 10)
#!/usr/bin/env python #-*- coding: UTF-8 -*- import kMeans from numpy import * dataMat=mat(kMeans.loadDataSet('testSet.txt')) kMeansRandCenter=kMeans.randCent(dataMat,2) # 两个中心 print(kMeansRandCenter) centroids,clusterAssment=kMeans.kMeans(dataMat,5) import matplotlib.pyplot as plt fig=plt.figure(1) plt.plot(centroids[:,0],centroids[:,1],'ro') plt.plot(dataMat[:,0],dataMat[:,1],'bo') plt.axis([-8,8,-8,8]) # plt.show() kMeans.binaryKeans(dataMat,3) dataMat3=mat(kMeans.loadDataSet('testSet2.txt')) centList,Assments=kMeans.binaryKeans(dataMat3,3) print("centList:",centList) print("Assments:",Assments) fig=plt.figure(2) plt.plot(dataMat3[:,0],dataMat3[:,1],'bo') plt.plot(centList[:,0],centList[:,1],'ro') plt.axis([-10,10,-10,10]) # plt.show()
plt.ion()


def show(X, C, centroids, keep = False):
    """Redraw the scatter of points coloured by cluster (0/1/2) plus the
    current centroids; on the final call (keep=True) block on plt.show()."""
    import time
    time.sleep(0.5)  # slow the animation down so each step is visible
    plt.cla()
    plt.plot(X[C == 0, 0], X[C == 0, 1], '*b',
             X[C == 1, 0], X[C == 1, 1], '*r',
             X[C == 2, 0], X[C == 2, 1], '*g')
    plt.plot(centroids[:,0],centroids[:,1],'*m',markersize=20)
    plt.draw()
    if keep :
        plt.ioff()
        plt.show()


# generate 3 cluster data
# data = np.genfromtxt('data1.csv', delimiter=',')
# NOTE(review): cov1 and cov3 below are not symmetric matrices;
# numpy.random.multivariate_normal expects a symmetric PSD covariance and
# will warn -- confirm the intended covariances.
m1, cov1 = [9, 8], [[1.5, 2], [1, 2]]
m2, cov2 = [5, 13], [[2.5, -1.5], [-1.5, 1.5]]
m3, cov3 = [3, 7], [[0.25, 0.5], [-0.1, 0.5]]
data1 = np.random.multivariate_normal(m1, cov1, 250)
data2 = np.random.multivariate_normal(m2, cov2, 180)
data3 = np.random.multivariate_normal(m3, cov3, 100)
X = np.vstack((data1,np.vstack((data2,data3))))
np.random.shuffle(X)
from kMeans import kMeans
centroids, C = kMeans(X, K = 3, plot_progress = show)
show(X, C, centroids, True)
# Python 2 driver: data-range checks, 4-means, then bisecting k-means.
import kMeans
from numpy import *
datMat = mat(kMeans.loadDataSet('testSet.txt'))
print min(datMat[:,0])
print min(datMat[:,1])
print max(datMat[:,0])
print max(datMat[:,1])
print kMeans.randCent(datMat, 2)
print kMeans.distEclud(datMat[0], datMat[1])
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
#print myCentroids, clustAssing
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
print centList
# Demo: cluster make_blobs data with kMeans (k-means++ seeding) and scatter
# plot the result coloured by cluster.
x = np.random.randn(10)
y = np.random.randn(10)
Cluster = np.array([0, 1, 1, 1, 3, 2, 2, 3, 0, 2])  # Labels of cluster 0 to 3
centers = np.random.randn(3, 2)
# NOTE(review): the triple-quote tokens below are unbalanced in the source
# as extracted; the dataMat block was presumably commented out in the
# original file. Preserved verbatim.
''' '''
dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
x = dataMat[:,0]
y = dataMat[:,1]
'''
from sklearn.datasets.samples_generator import make_blobs
dataSet, _ = make_blobs(n_samples=100, centers=3, n_features=2, random_state=0)
dataSet = np.mat(dataSet)
centers, clusterAssgn = kMeans.kMeans(dataSet=dataSet, k=4, createCent=kmeanspp.createCent)
#centers,clusterAssgn = kMeans.kMeans(dataSet=dataMat,k=4)
x = dataSet[:, 0]
y = dataSet[:, 1]
Cluster = np.array(clusterAssgn[:, 0])
print centers
print 'cluster:', Cluster
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(x, y, c=Cluster, s=50)  #s parameter shows how big will be the plus symbol
centers = np.mat(centers)
# NOTE(review): chunk is truncated -- the loop body continues beyond the
# visible source.
for ele in centers:
    i = ele[0, 0]
# -*- coding: UTF-8 -*- # kMeans算法测试 # 运行环境: python3 from numpy import * import kMeans print("loading data...") dataSet = mat(kMeans.loadDataSet('testSetForKMeans.txt')) k = 4 centroids, clusterAssment = kMeans.kMeans(dataSet, k) print("show the result...") kMeans.showCluster(dataSet, k, centroids, clusterAssment)
import csv
import numpy as np
import kMeans as kMeans

# Load the medical data CSV (dropping the header row and the first column)
# and run the kMeans classifier over it.
with open('../Medical_data.csv', 'r') as csv_file:
    csv_reader = list(csv.reader(csv_file, delimiter=","))
    # NOTE(review): the file handle is already at EOF here, so this
    # DictReader yields nothing; it is also never used.
    csv_dicReader = csv.DictReader(csv_file)
    my_data = np.array(csv_reader)
##following the data for Medical_data.csv for kMeans
# NOTE(review): np.float is removed in NumPy >= 1.24; plain `float` is the
# equivalent replacement.
new_data = np.array(my_data[1:, 1:], dtype=np.float)
kMeansClassfier = kMeans.kMeans(3, new_data.shape[0], new_data.shape[1])  ##k,n,d 3000,3
kMeansClassfier.showValues()
kMeansClassfier.classify(new_data)
# Driver: random centroids, 4-means, then bisecting k-means (results mostly
# discarded; the commented prints were debugging aids).
import kMeans
from numpy import *
dataMat = mat(kMeans.loadDataSet('testSet.txt'))
# print dataMat
randMat = kMeans.randCent(dataMat, 2)
# print dataMat[:, 0]
# print randMat
res = kMeans.kMeans(dataMat, 4)
# print res
dataMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
kMeans.biKmeans(dataMat3, 3)
# centList, myNewAssments =
# Cluster taxi pickup coordinates (Python 2) into 20 groups and write the
# centroids plus per-cluster pickup files.
import kMeans
from numpy import *
pickup_file_dir = '/home/donghao/ITS_project/taxi_finder/data/data_pickup/kMeans/'
pickup_filename = 'pickup_8-0_8-30.txt'
datMat = mat(kMeans.loadDataSetFile(pickup_file_dir + pickup_filename))
print 'begin k-means clustering'
Centroids, clustAssing = kMeans.kMeans(datMat, 20)
print 'finish k-means clustering'
datMat_list = datMat.tolist()
Centroids_list = Centroids.tolist()
clustAssing_list = clustAssing.tolist()
# Output suffix derived from the input file name, e.g. '8-0_8-30'.
filename_suffix = pickup_filename.split('_')[1] + '_' + pickup_filename.split(
    '_')[2].split('.')[0]
centroid_f = open(pickup_file_dir + 'centroids_' + filename_suffix + '.txt',
                  'w')
for centroid in Centroids_list:
    centroid_f.write(str(centroid[0]) + ',' + str(centroid[1]) + '\n')
cluster_f = open(
    pickup_file_dir + 'pickup_cluster_' + filename_suffix + '.txt', 'w')
centroids_number = len(Centroids_list)
centroid_number = 0
while centroid_number < centroids_number:
    print 'centroid_number:', centroid_number
    for i in range(len(clustAssing_list)):
        if int(clustAssing_list[i][0]) == centroid_number:
            # NOTE(review): chunk is truncated here -- the write() call's
            # argument continues beyond the visible source.
            cluster_f.write(
                str(centroid_number) + ',' + str(datMat_list[i][0]) + ',' +
# NOTE(review): this chunk begins mid-definition -- the statements below are
# the body of Recursive_kMeans, whose `def` line is not visible here.
# Indentation reconstructed.
global nLOOP
if loop > 0:
    loop -= 1
    print("Recursive kMeans Loop: %d times" %(nLOOP - loop))
    new_centroids = kMeans.kMeans(centroids["centroids"], dataMat, K)
    return Recursive_kMeans(new_centroids, dataMat, K, loop)
else:
    print("Recursive kMeans Loop: Finished.")
    return centroids

#print (Recursive_kMeans(firstcent, Indexed_DataMat, K, nLOOP))
print (firstcent)
print (kMeans.kMeans(firstcent, Indexed_DataMat, K))
# Plot the data and the initial centroids, then the refined centroids.
setPlot((Indexed_DataMat), hotelcolor)
setPlot(firstcent, 'b+')
plt.show()
plt.axis([min_X, max_X, min_Y, max_Y])
plt.ylabel('hotels')
setPlot((Indexed_DataMat), hotelcolor)
loadedfirstcent = {"centroids": firstcent}
setPlot(Recursive_kMeans(loadedfirstcent, Indexed_DataMat, K, nLOOP)["centroids"], 'r+')
plt.show()
import matplotlib.pyplot as plt


def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot 2-D samples coloured per cluster and overlay the
    centroids. Returns 1 (without plotting) when the data is not 2-D or k
    exceeds the available marker styles."""
    m, dim = shape(dataSet)
    if dim != 2:
        print("Sorry! i can not draw because the dimension of data is not 2!")
        return 1
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large!")
        return 1
    # draw all samples
    for i in range(m):
        # Pick the colour/marker matching this sample's assigned cluster.
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids
    for i in range(k):
        # marker sets the centroid symbol; color/markersize its appearance.
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], marker='+',
                 color='red', markersize=18)
    plt.show()


datMat=mat(kMeans.loadDataSet('../data/kMeans_testSet.txt'))
clusterCenters,clusterAssment = kMeans.kMeans(datMat,4)
showCluster(datMat,4,clusterCenters,clusterAssment)
def clusteringAct(dataArray):
    """Interactively cluster samples by a user-chosen metadatum: partition by
    metadatum value, run K-means with two different distances, compare the
    result to the metadatum partition, and optionally save/graph it.

    NOTE(review): Python 2 code (print statements, raw_input); indentation
    reconstructed from collapsed source chunks.
    """
    print dataArray[1]
    metadatum = sanitize(
        raw_input(
            "Select the metadatum among those above to cluster the set of samples. [e.g. "
            + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    valueSet, clusters1 = partitionSampleByMetadatumValue(
        metadatum, dataArray[1], dataArray[0])
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(
            clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(
        trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    # Sanity check: every sample must be in exactly one cluster.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(
            dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass,
                                                     kClusters, startSet,
                                                     dataArray[9], dataArray)
    #,meanSamples)
    print "-- End of second clustering --"
    # Second sanity check: cleaning may only have removed samples.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(
            dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[
            i - 1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(
            kClustersCopy), len(clustersCopy), "."
        raise ValueError
    # Pairwise comparison of K-means clusters vs. metadatum clusters; a
    # falsy comparison aborts the score (reported as "None").
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        compareClusterScore = compareClusterScore / numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input(
            "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n"
        )
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        # Assemble the report text and write it out.
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(
            valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(
                i + 1) + " associated to " + metadatum + " = " + str(
                    valueSet[i])
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        answer2 = raw_input(
            "Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict,
                                             len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
# Python 2 driver: run 4-means over testSet.txt and print the centroids.
import kMeans
from numpy import *
dataMat = mat(kMeans.loadDataSet('testSet.txt'))
# print min(dataMat[:,0])
#
# print(kMeans.randCent(dataMat,2))
#
# print(kMeans.distEclud(dataMat[0],dataMat[1]))
myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
print myCentroids
# NOTE(review): this chunk starts mid-function -- the first lines are the
# tail of a kMeans implementation whose `def` is not visible here.
    if plot_progress != None:
        plot_progress(X, C, np.array(centroids))
    return np.array(centroids), C


def show(X, C, centroids, keep=False):
    """Redraw the cluster scatter plot (clusters 0/1/2) plus centroids; on
    the final call (keep=True) block on plt.show()."""
    import time
    time.sleep(0.5)  # slow the animation down so each step is visible
    plt.cla()
    plt.plot(X[C == 0, 0], X[C == 0, 1], '*b',
             X[C == 1, 0], X[C == 1, 1], '*r',
             X[C == 2, 0], X[C == 2, 1], '*g')
    plt.plot(centroids[:, 0], centroids[:, 1], '*m', markersize=20)
    plt.draw()
    if keep:
        plt.ioff()
        plt.show()


# generate 3 cluster data
# data = np.genfromtxt('data1.csv', delimiter=',')
# NOTE(review): cov1 and cov3 are not symmetric; multivariate_normal expects
# a symmetric PSD covariance and will warn -- confirm intent.
m1, cov1 = [9, 8], [[1.5, 2], [1, 2]]
m2, cov2 = [5, 13], [[2.5, -1.5], [-1.5, 1.5]]
m3, cov3 = [3, 7], [[0.25, 0.5], [-0.1, 0.5]]
data1 = np.random.multivariate_normal(m1, cov1, 250)
data2 = np.random.multivariate_normal(m2, cov2, 180)
data3 = np.random.multivariate_normal(m3, cov3, 100)
X = np.vstack((data1, np.vstack((data2, data3))))
np.random.shuffle(X)
from kMeans import kMeans
centroids, C = kMeans(X, K=3, plot_progress=show)
show(X, C, centroids, True)
def showFigure(dataMat, k, clusterAssment):
    """Plot each cluster as a horizontal band and dump assignments to Excel.

    dataMat        -- data matrix, one sample per row
    k              -- number of clusters; must not exceed len(tag) (6)
    clusterAssment -- matrix whose column 0 holds each sample's cluster id

    NOTE(review): writes into the module-level `sheet1` worksheet created in
    the __main__ block below — confirm call order before reusing elsewhere.
    """
    # One marker/colour style per cluster index.
    tag = ['go', 'or', 'yo', 'ko', 'bo', 'mo']
    for i in range(k):
        # Rows of dataMat assigned to cluster i.
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        # y-value is the cluster index itself, so each cluster plots as a band.
        c = mat(i * ones((len(datalist), 1)))
        pylab.plot(datalist[:, 0], c, tag[i])
    pylab.show()
    # Write each sample's value (col 0) and its cluster tag (col 1), one row each.
    row = 0
    for i in range(k):
        datalist = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        for j in range(len(datalist)):
            sheet1.write(row, 0, datalist[j, 0])
            #sheet1.write(row, 1, datalist[j,1])
            sheet1.write(row, 1, tag[i])
            row += 1


if __name__ == '__main__':
    outputfilename = 'D:\\code\\KM\\res.xls'
    outputfile = xlwt.Workbook()
    # Module-level worksheet consumed by showFigure() above.
    sheet1 = outputfile.add_sheet('sheet1', cell_overwrite_ok=True)
    k = 6
    dataMat = mat(kMeans.loadDataSet('D:\\code\\KM\\site.txt'))
    myCentroids, clusterAssment = kMeans.kMeans(dataMat, k)
    showFigure(dataMat, k, clusterAssment)
    outputfile.save(outputfilename)
def clusteringAct(dataArray):
    """Interactively run a two-pass K-means clustering over the sample set.

    dataArray is the project-wide data bundle; indices used here (per the
    #@ notes below): [0] sample data, [1] metadata, [3] filenames,
    [8] distMatchedDict, [9] distConsensusDict.

    Prompts the user for a metadatum, partitions samples by its values to
    obtain k and the starting samples, clusters twice with two different
    distance dictionaries, scores the K-means result against the
    metadatum-value partition, and optionally saves a report and a DOT graph.
    Raises ValueError on any internal consistency failure.
    """
    print dataArray[1]
    # Ask which metadatum to cluster on; keep only the part before ';'.
    metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    # Partition samples by the distinct values of the chosen metadatum.
    valueSet, clusters1 = partitionSampleByMetadatumValue(metadatum, dataArray[1], dataArray[0])
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]  #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    # Sanity check: every sample must have been assigned to exactly one cluster.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass, kClusters, startSet, dataArray[9], dataArray)  #,meanSamples)
    print "-- End of second clustering --"
    # Second pass may legitimately keep fewer samples (some were cleaned out),
    # hence <= rather than == here.
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[i-1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(kClustersCopy), len(clustersCopy), "."
        raise ValueError
    # Pair off K-means clusters with metadatum clusters and accumulate scores.
    # A single non-truthy comparison aborts the whole score (set to None).
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        # Average the per-cluster scores over the number of classes.
        compareClusterScore = compareClusterScore/numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n")
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        # Build the textual report: header, then one section per cluster.
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i])
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                # Common-node sets exist only when the user asked for them above.
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        # Optionally emit a DOT graph of the clusters.
        answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict, len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
# NOTE(review): this chunk opens mid-list — the literal below is the tail of a
# dataset-building function whose `def` lies above this view.
        "D1k3",
        "K2k4",
        "D1k5",
        "D1k6",
    ]
    print(dataset)
    return dataset


def testResults(subband, output):
    """Return the fraction of samples the saved MLP model matches to `output`.

    subband -- collection of samples, run through pre_process() first
    output  -- '1' to report the model's ratio directly, anything else for
               the complementary ratio
    """
    newDataSet = pre_process(subband)
    # NOTE(review): presumably a count of matching predictions from the
    # persisted model file — confirm against mlpnn.predictModel.
    newModel = mlpnn.predictModel("my_model.h5", newDataSet)
    if output == '1':
        return newModel / len(subband)
    else:
        return (len(subband) - newModel) / len(subband)


# Instantiate the project helpers (note: these rebind, and thus shadow, the
# imported module names DWT/kMeans/mlpnn).
DWT = DWT.DWT()
kMeans = kMeans.kMeans()
mlpnn = mlpnn.MLPNN()
# Load one recording; the file handle is never closed — consider `with`.
testing = open(
    r"C:\Users\Nathan Joseph\Desktop\CPEG498\SortedData\eyes closed\O001.txt")
testing = testing.read().split('\n')
# Drop the last entry (typically empty after a trailing newline — verify).
testing.pop()
testing = np.array(testing)
# Split the recording into 17 equal sub-bands.
testing = np.split(testing, 17)
print(len(testing))
print(testResults(testing, '1'))
import kMeans as km

# Build the data set and the matching labels from the CSV file.
dataset = km.genDataSet("dataset1.csv")
labels = km.genLabels("dataset1.csv", 2000)

# Min-max scale the features so every dimension contributes equally.
scalers = km.findminmax(dataset)
scaleddata = km.scale(dataset, scalers)

# Elbow search over k: run k-means 10 times per k to average out the
# randomness of the initial centroid choice.
# (Fix: removed the dead `results = []` / `tmp = 0` initialisations that
# were immediately overwritten inside the loop.)
for k in range(2, 50):
    results = []
    for _ in range(10):
        clusters, _, _ = km.kMeans(scaleddata, k, labels)
        # Aggregate intra-cluster distance over all clusters and members.
        total = sum(member[0] for cluster in clusters for member in cluster)
        results.append(total)
    # Keep the run with the lowest aggregate intra-cluster distance and
    # report it normalised by k.
    best = min(results)
    print(k, best / k)