def globalKmeans(samples, k, tolerance):
    """Global k-means: grow the solution one cluster at a time.

    For every cluster count from 2 up to k, each sample is tried as the
    location of the newly inserted centroid; the candidate yielding the
    lowest clustering error (ties broken in favour of later samples,
    matching the <= comparison) is kept as the new solution.

    Returns (clustering error rounded to 4 decimals, list of centroids).
    """
    # Solve the 1-cluster problem, seeded with the first sample.
    seed = [np.array(samples[0])]
    cur_clustError, centroid, clusterList = randomKmeans.randomKmeans(
        samples, 1, tolerance, seed)
    centroidList = [np.array(centroid[0])]

    for numClusters in range(2, k + 1):
        best_error = None
        best_centroid = None
        # Evaluate every sample as the insertion point of the new centroid.
        for sample in samples:
            candidates = list(centroidList)
            candidates.append(np.array(sample))
            cur_clustError, centroid, clusterList = randomKmeans.randomKmeans(
                samples, numClusters, tolerance, candidates)
            if best_error is None or cur_clustError <= best_error:
                best_error = cur_clustError
                best_centroid = centroid
        prev_clustError = best_error
        # Adopt the best candidate's centroids for the next round.
        centroidList = [np.array(best_centroid[idx])
                        for idx in range(numClusters)]

    return round(prev_clustError, 4), centroidList
def globalThread(start, end, samples, kVal, tolerance, centroidList):
    """Worker for globalKmeansThread: try each sample in [start, end)
    as the location of the new centroid.

    Results are written into the shared globalDictionary keyed by the
    clustering error (equal errors overwrite each other; the caller
    only looks for the minimum, so that is harmless).
    """
    global globalDictionary
    # Clamp the chunk end: the caller computes fixed-size chunks and the
    # last one may overshoot, which would raise IndexError on samples[i].
    end = min(end, len(samples))
    for i in range(start, end):
        candidates = list(centroidList)
        candidates.append(np.array(samples[i]))
        curClustError, centroid, clusterList = randomKmeans.randomKmeans(
            samples, kVal, tolerance, candidates)
        globalDictionary[curClustError] = centroid
def globalKmeansThread(samples, k, tolerance):
    """Multi-threaded global k-means.

    Same algorithm as globalKmeans, but for each cluster count the
    candidate evaluations are fanned out over ~10 worker threads that
    record their results in the shared globalDictionary.

    Returns (clustering error rounded to 4 decimals, list of centroids).
    """
    global globalDictionary
    # Solve the 1-cluster problem, seeded with the first sample.
    tempClusterList = [np.array(samples[0])]
    curClustError, centroid, clusterList = randomKmeans.randomKmeans(
        samples, 1, tolerance, tempClusterList)
    centroidList = [np.array(centroid[0])]
    prevClustError = curClustError

    # Floor division keeps the chunk size an int on Python 3 (plain "/"
    # would make range() fail); max(1, ...) avoids a zero-sized chunk —
    # and hence an endless while loop — when there are fewer than 10 samples.
    chunkSize = max(1, len(samples) // 10)

    # run k means for each cluster count
    for kVal in range(2, k + 1):
        globalDictionary = {}
        threads = []
        count = 0

        # Use multi-threading to speed up the process, as the computational
        # complexity is really high.
        while count < len(samples):
            start = count
            # Clamp the last chunk so workers never index past the data.
            end = min(count + chunkSize, len(samples))
            t = Thread(target=globalThread,
                       args=(start, end, samples, kVal, tolerance,
                             centroidList))
            t.start()
            threads.append(t)
            count = end

        for t in threads:
            t.join()

        # Find the centroid set with the least clustering error.
        for key in globalDictionary.keys():
            if key < prevClustError:
                prevClustError = key
                newCentroid = globalDictionary[key]

        # Adopt the winning centroids for the next round.
        centroidList = [np.array(newCentroid[index]) for index in range(kVal)]

    return round(prevClustError, 4), centroidList
# Exemple #4
# 0
    def fastGlobal(self, samples, k, tolerance):
        """Fast global k-means.

        Instead of trying every sample as the new centroid (as plain
        global k-means does), only the centers of the kd-tree buckets
        are considered; the candidate with the largest guaranteed error
        reduction b is inserted at each step.
        NOTE(review): this appears to follow the fast global k-means
        b-criterion of Likas et al. — confirm against the paper.

        Returns (clustering error rounded to 4 decimals, list of centroids).
        """
        # Dimensionality of a sample; presumably consumed by buildTree.
        self.kd = len(samples[0])
        bucketKey = []
        # Solve the 1-cluster problem, seeded with the first sample.
        tempClusterList = []
        tempClusterList.append(np.array(samples[0]))
        curClustError, centroid, clusterList = randomKmeans.randomKmeans(
            samples, 1, tolerance, tempClusterList)
        centroidList = []
        centroidList.append(np.array(centroid[0]))

        # generate the kd Tree for the given set of samples
        tree = self.buildTree(samples)

        # Choose the tree branches using index keys of the form
        # "<level>.<position>" instead of a tree traversal, to reduce
        # computational cost.
        if self.is_power2(self.numBuckets):
            # Power-of-two bucket count: all buckets come from a single
            # tree level, log2(numBuckets).
            count = 1
            curBucketIndex = math.ceil(math.log(self.numBuckets, 2))
            while (count <= self.numBuckets):
                bucketIndex = str(int(curBucketIndex)) + "." + str(count)
                count = count + 1
                bucketKey.append(bucketIndex)
        else:
            # Non-power-of-two: start from the nearest lower level, then
            # swap some of its buckets for their children one level down
            # until the requested bucket count is reached.
            count = 1
            curBucketIndex = math.floor(math.log(self.numBuckets, 2))
            numBucketIndex = pow(2, curBucketIndex)

            while (count <= numBucketIndex):
                bucketIndex = str(int(curBucketIndex)) + "." + str(count)
                count = count + 1
                bucketKey.append(bucketIndex)

            indexToRemove = self.numBuckets - numBucketIndex

            # Drop the leading coarse buckets that will be replaced by
            # finer ones below.
            for count in range(0, int(indexToRemove + 1)):
                bucketKey.pop(0)

            nextBucketIndex = curBucketIndex + 1
            count = 0
            indexCount = 1

            # Each removed coarse bucket contributes two child buckets.
            while (count < int(indexToRemove * 2)):
                newBucketIndex = str(
                    int(nextBucketIndex)) + "." + str(indexCount)
                indexCount = indexCount + 1
                if newBucketIndex not in self.bucketList:
                    raise notInList('newBucketIndex not in bucketList')
                bucketKey.append(newBucketIndex)
                count = count + 1

        # From the buckets, find the possible insertion locations by taking
        # the coordinate-wise mean (center) of each bucket's samples.
        newCentroidList = []
        for index in bucketKey:
            zipped = zip(*self.bucketList[index])
            num = len(self.bucketList[index])
            newCentroidList.append(
                [math.fsum(dList) / num for dList in zipped])

        # run k-means for the potential insertion points calculated from above
        for kVal in range(2, k + 1):
            # b[xn] accumulates the guaranteed reduction in clustering error
            # obtained by inserting candidate xn.
            b = []
            for xn in range(0, len(newCentroidList)):
                bSum = 0
                for clusterIndex in range(0, len(clusterList)):
                    for xj in range(len(clusterList[clusterIndex])):
                        # dj: squared distance of sample xj to its current
                        # centroid; dn: distance to the candidate point.
                        dj = abs(
                            randomKmeans.sqEucliDist(
                                clusterList[clusterIndex][xj],
                                centroidList[clusterIndex],
                                len(centroidList[clusterIndex])))
                        dn = abs(
                            randomKmeans.sqEucliDist(
                                clusterList[clusterIndex][xj],
                                newCentroidList[xn],
                                len(centroidList[clusterIndex])))
                        bSum = bSum + max(dj - dn, 0)
                b.append(bSum)
            # Insert the candidate with the largest guaranteed reduction
            # and re-run k-means from the augmented centroid set.
            bIndex = b.index(max(b))
            tempClusterList = []
            tempClusterList.extend(centroidList)
            tempClusterList.append(np.array(newCentroidList[bIndex]))
            curClustError, centroid, clusterList = randomKmeans.randomKmeans(
                samples, kVal, tolerance, tempClusterList)
            centroidList = []
            for index in range(0, kVal):
                centroidList.append(np.array(centroid[index]))

        return round(curClustError, 4), centroidList
# Exemple #5
# 0
def main():
    """Load the mixture-of-Gaussians data, run random, global, and fast
    global k-means for each cluster count in numK, print the clustering
    errors, and plot error vs. k for all three variants."""
    print('\nLoading data: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))
    samples = loadData.loadMOG('MixOfGau.txt')
    print('Loading complete: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))

    numK = [5, 10]
    # Floor division keeps the bucket count an int on Python 3 as well;
    # plain "/" would hand fastGlobal a float.
    numBuckets = len(samples) // 2
    kmeansClustError = [0] * len(numK)
    globalClustError = [0] * len(numK)
    fastClustError = [0] * len(numK)
    tolerance = 0.001
    fg = fastGlobalKmeans.fastGlobal(numBuckets)

    for index in range(len(numK)):
        # Random k-means needs an initial centroid guess; sample one.
        centroidList = random.sample(samples, numK[index])

        print('\nStart of random k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        kmeansClustError[
            index], kmeansClustList, P = randomKmeans.randomKmeans(
                samples, numK[index], tolerance, centroidList)
        print('End of random k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))

        print('\nStart of global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        globalClustError[index], globalClustList = globalKmeans.globalKmeans(
            samples, numK[index], tolerance)
        print('End of global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))

        print('\nStart of fast global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        fastClustError[index], fastClustList = fg.fastGlobal(
            samples, numK[index], tolerance)
        print('End of fast global k-means: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))

    # Single-argument print(...) works identically on Python 2 and 3,
    # keeping the output style consistent with the calls above.
    print("\n\nRandom k-means: clustering error = " + str(kmeansClustError))
    print("Global k-means: clustering error = " + str(globalClustError))
    print("Fast global k-means: clustering error = " + str(fastClustError))

    # Plot error vs. k for all three variants.
    x = list(numK)
    y1 = list(kmeansClustError)
    y2 = list(globalClustError)
    y3 = list(fastClustError)

    plt.plot(x, y1, 'r+')
    plt.plot(x, y2, 'bo')
    plt.plot(x, y3, 'g*')
    plt.axis()
    plt.xlabel('Number Of Clusters k')
    plt.ylabel('Clustering Error')
    text = "+ is random\no is global\n* is fast"
    plt.text(0.02, 0.7, text, fontsize=14, transform=plt.gcf().transFigure)
    plt.subplots_adjust(left=0.28)
    plt.show()