def globalKmeans(samples, k, tolerance):
    """Global k-means: grow the solution one centroid at a time.

    For each cluster count i in 2..k, every sample is tried as the position
    of the newly inserted centroid; the configuration with the lowest
    clustering error is kept and used to seed the next iteration.

    Parameters:
        samples   -- sequence of numeric vectors to cluster
        k         -- target number of clusters
        tolerance -- convergence threshold forwarded to randomKmeans
    Returns:
        (clustering error rounded to 4 decimals, list of centroid arrays)
    """
    # Solve k = 1 first, seeded with the first sample.
    tmpClusterList = [np.array(samples[0])]
    cur_clustError, centroid, clusterList = randomKmeans.randomKmeans(
        samples, 1, tolerance, tmpClusterList)
    centroidList = [np.array(centroid[0])]
    # Initialize here so the return value is well-defined even when k == 1
    # (the original left prev_clustError unset in that case -> NameError).
    prev_clustError = cur_clustError
    newCentroid = centroid
    # Incrementally add one cluster at a time.
    for i in range(2, k + 1):
        # Try every sample as the candidate location of the new centroid.
        for j in range(0, len(samples)):
            tmpClusterList = []
            tmpClusterList.extend(centroidList)
            tmpClusterList.append(np.array(samples[j]))
            cur_clustError, centroid, clusterList = randomKmeans.randomKmeans(
                samples, i, tolerance, tmpClusterList)
            # j == 0 resets the running minimum for this value of i,
            # matching the original per-i comparison behavior.
            if j == 0 or cur_clustError <= prev_clustError:
                prev_clustError = cur_clustError
                newCentroid = centroid
        # Adopt the best centroid set as the starting point for i + 1.
        centroidList = [np.array(newCentroid[index]) for index in range(0, i)]
    return round(prev_clustError, 4), centroidList
def globalThread(start, end, samples, kVal, tolerance, centroidList):
    """Worker: evaluate samples[start:end] as candidate new centroids.

    Each candidate seeds a k-means run together with the current centroids;
    the resulting centroid set is stored in the shared globalDictionary,
    keyed by the clustering error it achieved.
    """
    global globalDictionary
    for candidateIdx in range(start, end):
        # Seed list = current centroids plus this candidate sample.
        seeds = list(centroidList)
        seeds.append(np.array(samples[candidateIdx]))
        clustError, centroid, clusterList = randomKmeans.randomKmeans(
            samples, kVal, tolerance, seeds)
        globalDictionary[clustError] = centroid
def globalKmeansThread(samples, k, tolerance):
    """Multi-threaded global k-means.

    Same algorithm as globalKmeans, but the per-sample candidate evaluation
    for each cluster count is fanned out to ~10 worker threads that record
    their results in the shared globalDictionary (error -> centroid set).

    Parameters:
        samples   -- sequence of numeric vectors to cluster
        k         -- target number of clusters
        tolerance -- convergence threshold forwarded to randomKmeans
    Returns:
        (clustering error rounded to 4 decimals, list of centroid arrays)
    """
    global globalDictionary
    tempCLusterList = [np.array(samples[0])]
    curClustError, centroid, clusterList = randomKmeans.randomKmeans(
        samples, 1, tolerance, tempCLusterList)
    centroidList = [np.array(centroid[0])]
    prevClustError = curClustError
    # Chunk size for ~10 threads. Floor division keeps it an int on
    # Python 3, and max(..., 1) prevents the original infinite loop when
    # len(samples) < 10 (chunk size of 0 never advanced `count`).
    chunkSize = max(len(samples) // 10, 1)
    # run k means for each cluster count
    for kVal in range(2, k + 1):
        globalDictionary = {}
        threads = []
        count = 0
        # use multi threading to speed up the process as the computational
        # complexity is really high
        while count < len(samples):
            start = count
            # Clamp so workers never index past the end of samples
            # (the original could hand a thread an out-of-range slice).
            end = min(count + chunkSize, len(samples))
            t = Thread(target=globalThread,
                       args=(start, end, samples, kVal, tolerance,
                             centroidList))
            t.start()
            threads.append(t)
            count = end
        for t in threads:
            t.join()
        # Adopt the candidate with the least clustering error for this kVal.
        # Taking the round's minimum unconditionally fixes the original's
        # unbound `newCentroid` when no candidate beat the previous error.
        bestError = min(globalDictionary.keys())
        newCentroid = globalDictionary[bestError]
        prevClustError = min(prevClustError, bestError)
        centroidList = [np.array(newCentroid[index])
                        for index in range(0, kVal)]
    return round(prevClustError, 4), centroidList
def fastGlobal(self, samples, k, tolerance):
    """Fast global k-means using a kd-tree bucketing of the samples.

    Instead of trying every sample as the new centroid (as global k-means
    does), candidate insertion points are the centers of kd-tree buckets,
    and for each cluster count the candidate with the largest guaranteed
    error reduction (the b_n bound) is inserted.

    Parameters:
        samples   -- sequence of numeric vectors; assumes all vectors share
                     the dimensionality of samples[0] -- TODO confirm
        k         -- target number of clusters
        tolerance -- convergence threshold forwarded to randomKmeans
    Returns:
        (clustering error rounded to 4 decimals, list of centroid arrays)
    Raises:
        notInList -- if a computed bucket index key is absent from
                     self.bucketList
    """
    # Dimensionality of the data, used by the kd-tree helpers.
    self.kd = len(samples[0])
    bucketKey = []
    # Baseline: solve k = 1, seeded with the first sample.
    tempClusterList = []
    tempClusterList.append(np.array(samples[0]))
    curClustError, centroid, clusterList = randomKmeans.randomKmeans(
        samples, 1, tolerance, tempClusterList)
    centroidList = []
    centroidList.append(np.array(centroid[0]))
    # generate the kd Tree for the given set of samples
    tree = self.buildTree(samples)
    # code to choose the tree branches using index key generated. Not using
    # tree traversal to reduce computational cost.
    # Keys appear to follow a "<depth>.<position>" naming scheme produced by
    # buildTree -- NOTE(review): verify against the tree-building code.
    if self.is_power2(self.numBuckets):
        # Bucket count is a power of two: all buckets sit at one tree depth.
        count = 1
        curBucketIndex = math.ceil(math.log(self.numBuckets, 2))
        while (count <= self.numBuckets):
            bucketIndex = str(int(curBucketIndex)) + "." + str(count)
            count = count + 1
            bucketKey.append(bucketIndex)
    else:
        # Not a power of two: take the buckets at the largest complete
        # depth, then replace the surplus with their children one level down.
        count = 1
        curBucketIndex = math.floor(math.log(self.numBuckets, 2))
        numBucketIndex = pow(2, curBucketIndex)
        while (count <= numBucketIndex):
            bucketIndex = str(int(curBucketIndex)) + "." + str(count)
            count = count + 1
            bucketKey.append(bucketIndex)
        # Drop the leading buckets that will be represented by their
        # children instead. NOTE(review): range(0, indexToRemove + 1) pops
        # indexToRemove + 1 entries -- possible off-by-one, confirm intent.
        indexToRemove = self.numBuckets - numBucketIndex
        for count in range(0, int(indexToRemove + 1)):
            bucketKey.pop(0)
        nextBucketIndex = curBucketIndex + 1
        count = 0
        indexCount = 1
        # Each removed bucket contributes two children at the next depth.
        while (count < int(indexToRemove * 2)):
            newBucketIndex = str(int(nextBucketIndex)) + "." + str(indexCount)
            indexCount = indexCount + 1
            if newBucketIndex not in self.bucketList:
                raise notInList('newBucketIndex not in bucketList')
            bucketKey.append(newBucketIndex)
            count = count + 1
    # from the buckets, find the possible insertion location by finding the
    # center of the bucket (coordinate-wise mean of the bucket's samples;
    # math.fsum for an accurate float sum)
    newCentroidList = []
    for index in bucketKey:
        zipped = zip(*self.bucketList[index])
        num = len(self.bucketList[index])
        newCentroidList.append(
            [math.fsum(dList) / num for dList in zipped])
    # run k-means for the potential insertion points calculated from above
    for kVal in range(2, k + 1):
        # b[xn] = guaranteed error reduction if candidate xn is inserted:
        # sum over all points of max(d(point, its centroid) -
        # d(point, candidate), 0).
        b = []
        for xn in range(0, len(newCentroidList)):
            bSum = 0
            for clusterIndex in range(0, len(clusterList)):
                for xj in range(len(clusterList[clusterIndex])):
                    # Distance from the point to its current centroid...
                    dj = abs(
                        randomKmeans.sqEucliDist(
                            clusterList[clusterIndex][xj],
                            centroidList[clusterIndex],
                            len(centroidList[clusterIndex])))
                    # ...and to the candidate insertion point.
                    dn = abs(
                        randomKmeans.sqEucliDist(
                            clusterList[clusterIndex][xj],
                            newCentroidList[xn],
                            len(centroidList[clusterIndex])))
                    bSum = bSum + max(dj - dn, 0)
            b.append(bSum)
        # Insert the candidate with the largest guaranteed reduction.
        bIndex = b.index(max(b))
        tempClusterList = []
        tempClusterList.extend(centroidList)
        tempClusterList.append(np.array(newCentroidList[bIndex]))
        curClustError, centroid, clusterList = randomKmeans.randomKmeans(
            samples, kVal, tolerance, tempClusterList)
        # Refined centroids become the starting point for kVal + 1.
        centroidList = []
        for index in range(0, kVal):
            centroidList.append(np.array(centroid[index]))
    return round(curClustError, 4), centroidList
def main():
    """Compare random, global, and fast-global k-means on the MoG data set.

    Loads samples from 'MixOfGau.txt', runs the three algorithms for each
    k in numK, prints their clustering errors, and plots error vs. k.
    """
    print('\nLoading data: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))
    samples = loadData.loadMOG('MixOfGau.txt')
    print('Loading complete: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))
    numK = [5, 10]
    # Floor division keeps the bucket count an integer on Python 3 too.
    numBuckets = len(samples) // 2
    kmeansClustError = [0] * len(numK)
    globalClustError = [0] * len(numK)
    fastClustError = [0] * len(numK)
    tolerance = 0.001
    centroidList = []
    fg = fastGlobalKmeans.fastGlobal(numBuckets)
    for index in range(len(numK)):
        # Random k-means starts from a random sample of initial centroids.
        centroidList = random.sample(samples, numK[index])
        print('\nStart of random k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        kmeansClustError[
            index], kmeansClustList, P = randomKmeans.randomKmeans(
                samples, numK[index], tolerance, centroidList)
        print('End of random k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        print('\nStart of global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        globalClustError[index], globalClustList = globalKmeans.globalKmeans(
            samples, numK[index], tolerance)
        print('End of global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        print('\nStart of fast global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        fastClustError[index], fastClustList = fg.fastGlobal(
            samples, numK[index], tolerance)
        print('End of fast global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
    # print() calls for consistency with the rest of this function
    # (the original mixed Python 2 print statements with print() calls).
    print("\n\nRandom k-means: clustering error = " + str(kmeansClustError))
    print("Global k-means: clustering error = " + str(globalClustError))
    print("Fast global k-means: clustering error = " + str(fastClustError))
    # Plain copies for plotting (the manual append loops were equivalent).
    x = list(numK)
    y1 = list(kmeansClustError)
    y2 = list(globalClustError)
    y3 = list(fastClustError)
    plt.plot(x, y1, 'r+')
    plt.plot(x, y2, 'bo')
    plt.plot(x, y3, 'g*')
    plt.axis()
    plt.xlabel('Number Of Clusters k')
    plt.ylabel('Clustering Error')
    text = "+ is random\no is global\n* is fast"
    plt.text(0.02, 0.7, text, fontsize=14,
             transform=plt.gcf().transFigure)
    plt.subplots_adjust(left=0.28)
    plt.show()