Example #1
def submission02(createOutput):
    numTrials = 25
    X = common.readIrisFile()
    X_normalized = normalize(X, norm='l2', axis=0)
    distanceFunc = common.irisDataDistanceFunction
    centerFunc = common.irisDataCenterFunction
    clusterLabels = np.array(list(range(1, 4)), dtype=np.int8)
    bestAssignments = None
    bestErrorTotal = None
    for z in range(numTrials):
        model = kmclustering.BasicKMeansClusteringModel(distanceFunc, centerFunc)
        #initFunc = lambda q, z: kminit.initKMeansSampling(q, z)
        #assignments, centers = kminit.initKMeansSampling(X_normalized, clusterLabels, distanceFunc)
        model.runBasicKMeansClustering(X_normalized, clusterLabels, kminit.initKMeansSampling)
        
        print("====== Done with Trial # " + str(z + 1) + " / " + str(numTrials))
        #print("Assignments = " + str(model.finalClusterAssignments))
        #print("Centers = " + str(model.finalClusterCenters))
        #print("Error Map = " + str(model.finalClusterErrorMap))
        print("Error Total = " + str(model.finalClusterErrorTotal))
        
        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
            print("Improved total SSE! New Value = " + str(bestErrorTotal))
        
    print("Finished K-Means. Best Error Total = " + str(bestErrorTotal))
    print("Best Assignments = " + str(bestAssignments))
    if createOutput:
        common.writeResultsFile(bestAssignments)
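
The distance and center helpers come from the common module, which is not shown on this page. A minimal sketch of what they could look like for the iris data, assuming plain Euclidean distance and mean centroids (the names below are hypothetical):

import numpy as np

def irisDataDistanceFunction(a, b):
    # Euclidean (L2) distance between two feature vectors.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))

def irisDataCenterFunction(points):
    # Cluster center as the per-feature mean of its member points.
    return np.mean(np.asarray(points), axis=0)
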
Example #2
def compareBasicAndBisecting():
    X = common.readIrisFile()
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)

    
    tsne = TSNE(n_components=2, perplexity=20, learning_rate=200, n_iter=1000, init='random')
    X_new = tsne.fit_transform(X_normalized)
    
    distanceFunc = common.irisDataDistanceFunction
    centerFunc = common.irisDataCenterFunction
    clusterLabels = np.array(list(range(1, 4)), dtype=np.int8)
    numTrials = 5
    
    ##############################################################
    # Basic K-means
    ##############################################################
    bestAssignments = None
    bestErrorTotal = None
    errorSum = 0
    for z in range(numTrials):
        model = kmclustering.BasicKMeansClusteringModel(distanceFunc, centerFunc, 100)
        model.runBasicKMeansClustering(X_new, clusterLabels, kminit.initKMeansSampling)
        errorSum += model.finalClusterErrorTotal
        print("Done with Basic K-Means Trial # " + str(z + 1) + " / " + str(numTrials))
        print("Error Total = " + str(model.finalClusterErrorTotal))
        
        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
    
    avgError = errorSum / numTrials
    print("Finished Basic K-Means. Avg Error = " + str(avgError) + " and Best Error = " + str(bestErrorTotal))
    print("Creating output now for Basic K-Means.")
    common.writeResultsFile(bestAssignments)    
    
    ##############################################################
    # Bisecting K-means
    ##############################################################
    bestAssignments = None
    bestErrorTotal = None
    errorSum = 0
    for z in range(numTrials):
        model = kmclustering.BisectingKMeansClusteringModel(distanceFunc, centerFunc, 5)
        model.runBisectingKMeansClustering(X_new, clusterLabels, kminit.initKMeansSampling)
        errorSum += model.finalClusterErrorTotal
        print("Done with Bisecting K-Means Trial # " + str(z + 1) + " / " + str(numTrials))
        print("Error Total = " + str(model.finalClusterErrorTotal))
        
        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
    
    avgError = errorSum / numTrials
    print("Finished Bisecting K-Means. Avg Error = " + str(avgError) + " and Best Error = " + str(bestErrorTotal))
    print("Creating output now for Bisecting K-Means.")
    common.writeResultsFile(bestAssignments)
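
The project's BisectingKMeansClusteringModel is not shown, but the bisecting strategy it is compared against usually works like the sketch below: keep 2-means-splitting the cluster with the largest SSE until the target number of clusters is reached. This sketch leans on scikit-learn's KMeans rather than the project's own model, so treat it as an illustration, not the actual implementation.

import numpy as np
from sklearn.cluster import KMeans

def bisectingKMeansSketch(X, k, trialsPerSplit=5, seed=0):
    # Start with a single cluster holding every row index.
    clusters = [np.arange(len(X))]
    rng = np.random.RandomState(seed)
    while len(clusters) < k:
        # Pick the cluster with the largest SSE around its own mean.
        sse = [np.sum((X[idx] - X[idx].mean(axis=0)) ** 2) for idx in clusters]
        worst = clusters.pop(int(np.argmax(sse)))
        # Split it with 2-means, keeping the best of several attempts.
        best = None
        for _ in range(trialsPerSplit):
            km = KMeans(n_clusters=2, n_init=1,
                        random_state=rng.randint(1 << 30)).fit(X[worst])
            if best is None or km.inertia_ < best.inertia_:
                best = km
        clusters.append(worst[best.labels_ == 0])
        clusters.append(worst[best.labels_ == 1])
    return clusters  # list of row-index arrays, one per cluster

Note this sketch does not guard against trying to split a one-point cluster; the real model presumably handles such edge cases.
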
    
Example #3
def tuneTsneBasic(createOutput):
    numTrials = 5
    X = common.readDigitsFile()
    print("Shape of Digits file  = " + str(X.shape))
    print("Starting t-SNE.")
    #pca = PCA(n_components=50)
    #X_temp = pca.fit_transform(X)
    tsne = TSNE(n_components=2,
                init='random',
                random_state=0,
                perplexity=35,
                learning_rate=400,
                n_iter=2000)
    X_new = tsne.fit_transform(X)
    #X_new = digitutil.preprocessDownsampling(X)
    print("Finished t-SNE, and got X_new. Shape = " + str(X_new.shape))
    #print("First 10 rows are: ")
    #print(X_new[0:10, :])

    #numTrials = 10
    #X_normalized = normalize(X, norm='l2', axis=0)
    distanceFunc = common.euclideanDistanceFunction
    centerFunc = common.digitsDataCenterFunction
    clusterLabels = np.array(list(range(1, 11)), dtype=np.int8)
    bestAssignments = None
    bestErrorTotal = None
    for z in range(numTrials):
        model = kmclustering.BasicKMeansClusteringModel(
            distanceFunc, centerFunc, 100)
        #initFunc = lambda q, z: kminit.initKMeansSampling(q, z)
        #assignments, centers = kminit.initKMeansSampling(X_normalized, clusterLabels, distanceFunc)
        model.runBasicKMeansClustering(X_new, clusterLabels,
                                       kminit.initKMeansSampling)

        print("====== Done with Trial # " + str(z + 1) + " / " +
              str(numTrials) + ", Error Total = " +
              str(model.finalClusterErrorTotal) + " ======")
        #print("Assignments = " + str(model.finalClusterAssignments))
        #print("Centers = " + str(model.finalClusterCenters))
        #print("Error Map = " + str(model.finalClusterErrorMap))
        #print()

        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
            #print("Improved total SSE! New Value = " + str(bestErrorTotal))

    print("Finished K-Means with t-SNE, P=. Best Error Total = " +
          str(bestErrorTotal))
    #print("Best Assignments = " + str(bestAssignments))
    if createOutput:
        common.writeResultsFile(bestAssignments)
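
The finalClusterErrorTotal that drives the best-of-N selection is a total sum of squared errors. The kmclustering module itself is not shown, so the following is only an assumed sketch of how that total (and the per-cluster error map) could be computed from assignments and centers:

import numpy as np

def computeClusterSSE(X, assignments, centers):
    # assignments: one cluster label per row of X
    # centers: dict mapping label -> center vector (hypothetical layout)
    errorMap = {}
    for label, center in centers.items():
        members = X[assignments == label]
        errorMap[label] = float(np.sum((members - center) ** 2))
    return errorMap, sum(errorMap.values())
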
Example #4
def chartClusterErrorVsClusterCount():
    X = common.readDigitsFile()
    X = X / 255.0
    print("Shape of Digits file  = " + str(X.shape))
    print("Starting t-SNE.")
    tsne = TSNE(n_components=2,
                init='random',
                random_state=0,
                perplexity=100,
                learning_rate=400,
                n_iter=2000)
    X_new = tsne.fit_transform(X)
    print("Finished t-SNE, and got X_new. Shape = " + str(X_new.shape))

    distanceFunc = common.euclideanDistanceFunction
    centerFunc = common.digitsDataCenterFunction
    numTrials = 5
    errorMapForClusterSize = {}
    clusterSizeList = list(range(2, 21, 2))

    for clusterSize in clusterSizeList:
        errorSum = 0
        clusterLabels = np.array(list(range(1, clusterSize + 1)),
                                 dtype=np.int8)

        for z in range(numTrials):
            model = kmclustering.BasicKMeansClusteringModel(
                distanceFunc, centerFunc, 100)
            model.runBasicKMeansClustering(X_new, clusterLabels,
                                           kminit.initKMeansSampling)

            print("====== Done with Trial # " + str(z + 1) + " / " + str(numTrials) + " for K = " + str(clusterSize) + \
                  ", Error Total = " + str(model.finalClusterErrorTotal) + " ======")

            errorSum += model.finalClusterErrorTotal

        avgError = errorSum / numTrials
        errorMapForClusterSize[clusterSize] = avgError

    print("Done with K-Means.")

    for clusterSize in clusterSizeList:
        print("For K = " + str(clusterSize) + "     Error is:      " +
              str(errorMapForClusterSize[clusterSize]))
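
The function above only prints the averaged errors per K. A small matplotlib sketch (not part of the original code) that turns errorMapForClusterSize into the usual elbow chart:

import matplotlib.pyplot as plt

def plotElbow(errorMapForClusterSize):
    # Plot average SSE against the number of clusters K.
    ks = sorted(errorMapForClusterSize)
    plt.plot(ks, [errorMapForClusterSize[k] for k in ks], marker='o')
    plt.xlabel("Number of clusters K")
    plt.ylabel("Average total SSE over trials")
    plt.title("Cluster error vs. cluster count")
    plt.show()
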
Example #5
def submission01(createOutput, numTrials):
    X = common.readDigitsFile()
    print("Shape of Digits file  = " + str(X.shape))
    X_new = digitutil.preprocessDownsampling(X)
    print("Got X_new. Shape = " + str(X_new.shape))
    #print("First 10 rows are: ")
    #print(X_new[0:10, :])

    #numTrials = 10
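    # Note: the downsampled X_new above is not used; the clustering below runs
    # on the L2-normalized original X.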
    X_normalized = normalize(X, norm='l2', axis=0)
    distanceFunc = common.euclideanDistanceFunction
    centerFunc = common.digitsDataCenterFunction
    clusterLabels = np.array(list(range(1, 11)), dtype=np.int8)
    bestAssignments = None
    bestErrorTotal = None
    for z in range(numTrials):
        model = kmclustering.BasicKMeansClusteringModel(
            distanceFunc, centerFunc, 55)
        #initFunc = lambda q, z: kminit.initKMeansSampling(q, z)
        #assignments, centers = kminit.initKMeansSampling(X_normalized, clusterLabels, distanceFunc)
        model.runBasicKMeansClustering(X_normalized, clusterLabels,
                                       kminit.initKMeansSampling)

        print("====== Done with Trial # " + str(z + 1) + " / " +
              str(numTrials) + ", Error Total = " +
              str(model.finalClusterErrorTotal) + " ======")
        #print("Assignments = " + str(model.finalClusterAssignments))
        #print("Centers = " + str(model.finalClusterCenters))
        #print("Error Map = " + str(model.finalClusterErrorMap))
        #print()

        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
            #print("Improved total SSE! New Value = " + str(bestErrorTotal))

    print("Finished K-Means. Best Error Total = " + str(bestErrorTotal))
    #print("Best Assignments = " + str(bestAssignments))
    if createOutput:
        common.writeResultsFile(bestAssignments)
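
digitutil.preprocessDownsampling is referenced but not shown. A minimal, hypothetical sketch of that kind of preprocessing, assuming flat 28x28 digit images and 2x2 block averaging:

import numpy as np

def preprocessDownsampling(X, side=28, factor=2):
    # Reshape flat rows into images, average factor x factor blocks,
    # then flatten back; for factor=2 each image dimension is halved.
    imgs = X.reshape(-1, side, side)
    small = side // factor
    pooled = imgs.reshape(-1, small, factor, small, factor).mean(axis=(2, 4))
    return pooled.reshape(-1, small * small)
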
Example #6
def compareInitialization():
    X = common.readIrisFile()
    scaler = StandardScaler()
    X_new = scaler.fit_transform(X)
    
    distanceFunc = common.irisDataDistanceFunction
    centerFunc = common.irisDataCenterFunction
    clusterLabels = np.array(list(range(1, 4)), dtype=np.int8)
    numTrials = 5
    
    kmSampling = kminit.initKMeansSampling
    kmRandom = lambda x, c: kminit.initKMeansRandomly(x, c, centerFunc, distanceFunc)
    kmPlusPlus = lambda x, c: kminit.initKMeansKMeansPlusPlus(x, c, distanceFunc)
    
    initFunctions = [(kmSampling, "Sampling"), (kmRandom, "Random"), (kmPlusPlus, "K-Means++")]
    
    ##############################################################
    # Basic K-means
    ##############################################################

    for initFunction, initFunctionDesc in initFunctions:
        bestErrorTotal = None
        errorSum = 0
        
        for z in range(numTrials):
            model = kmclustering.BasicKMeansClusteringModel(distanceFunc, centerFunc, 100)
            model.runBasicKMeansClustering(X_new, clusterLabels, initFunction)
            errorSum += model.finalClusterErrorTotal
            print("Done with Basic K-Means with initialization type = " + initFunctionDesc + ", Trial # " + str(z + 1) + " / " + str(numTrials))
            print("Error Total = " + str(model.finalClusterErrorTotal))
            
            if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
                bestErrorTotal = model.finalClusterErrorTotal
        
        avgError = errorSum / numTrials
        print("Finished Basic K-Means with initialization type = " + initFunctionDesc + ". Avg Error = " + str(avgError) + " and Best Error = " + str(bestErrorTotal))