Example #1
def submission02a(createOutput):
    numTrials = 25
    X = common.readIrisFile()
    X_normalized = normalize(X, norm='l2', axis=0)
    distanceFunc = common.irisDataDistanceFunction
    centerFunc = common.irisDataCenterFunction
    clusterLabels = np.array(list(range(1, 4)), dtype=np.int8)
    bestAssignments = None
    bestErrorTotal = None
    for z in range(numTrials):
        model = kmclustering.BisectingKMeansClusteringModel(distanceFunc, centerFunc, 10)
        model.runBisectingKMeansClustering(X_normalized, clusterLabels, kminit.initKMeansSampling)
        
        #model = kmclustering.BasicKMeansClusteringModel(distanceFunc, centerFunc)
        #initFunc = lambda q, z: kminit.initKMeansSampling(q, z)
        #assignments, centers = kminit.initKMeansSampling(X_normalized, clusterLabels, distanceFunc)
        #model.runBasicKMeansClustering(X_normalized, clusterLabels, kminit.initKMeansSampling)
        
        print("====== Done with Trial # " + str(z + 1) + " / " + str(numTrials))
        #print("Assignments = " + str(model.finalClusterAssignments))
        #print("Centers = " + str(model.finalClusterCenters))
        #print("Error Map = " + str(model.finalClusterErrorMap))
        print("Error Total = " + str(model.finalClusterErrorTotal))
        
        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
            print("Improved total SSE! New Value = " + str(bestErrorTotal))
        
    print("Finished K-Means. Best Error Total = " + str(bestErrorTotal))
    print("Best Assignments = " + str(bestAssignments))
    if createOutput:
        common.writeResultsFile(bestAssignments)
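A quick note on the initializer: kminit.initKMeansSampling is project code that is not shown in these examples. Judging from the commented-out call that unpacks (assignments, centers), a hypothetical stand-in with the same signature could look like the sketch below; this is an assumption about its contract (sample k rows as starting centers, then assign each point to its nearest center), not the project's actual implementation.

import numpy as np

def init_kmeans_sampling(X, clusterLabels, distanceFunc):
    # Hypothetical stand-in for kminit.initKMeansSampling (assumed contract).
    rng = np.random.default_rng()
    k = len(clusterLabels)
    # Sample k distinct rows of X as the starting centers.
    centerIdx = rng.choice(X.shape[0], size=k, replace=False)
    centers = {label: X[i] for label, i in zip(clusterLabels, centerIdx)}
    # Label each point with the cluster of its nearest center.
    assignments = np.array(
        [min(clusterLabels, key=lambda lbl: distanceFunc(x, centers[lbl]))
         for x in X],
        dtype=clusterLabels.dtype)
    return assignments, centers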
Example #2
def compareBasicAndBisecting():
    X = common.readIrisFile()
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)

    
    tsne = TSNE(n_components=2, perplexity=20, learning_rate=200, n_iter=1000, init='random')
    X_new = tsne.fit_transform(X_normalized)
    
    distanceFunc = common.irisDataDistanceFunction
    centerFunc = common.irisDataCenterFunction
    clusterLabels = np.array(list(range(1, 4)), dtype=np.int8)
    numTrials = 5
    
    ##############################################################
    # Basic K-means
    ##############################################################
    bestAssignments = None
    bestErrorTotal = None
    errorSum = 0
    for z in range(numTrials):
        model = kmclustering.BasicKMeansClusteringModel(distanceFunc, centerFunc, 100)
        model.runBasicKMeansClustering(X_new, clusterLabels, kminit.initKMeansSampling)
        errorSum += model.finalClusterErrorTotal
        print("Done with Basic K-Means Trial # " + str(z + 1) + " / " + str(numTrials))
        print("Error Total = " + str(model.finalClusterErrorTotal))
        
        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
    
    avgError = errorSum / numTrials
    print("Finished Basic K-Means. Avg Error = " + str(avgError) + " and Best Error = " + str(bestErrorTotal))
    print("Creating output now for Basic K-Means.")
    common.writeResultsFile(bestAssignments)    
    
    ##############################################################
    # Bisecting K-means
    ##############################################################
    bestAssignments = None
    bestErrorTotal = None
    errorSum = 0
    for z in range(numTrials):
        model = kmclustering.BisectingKMeansClusteringModel(distanceFunc, centerFunc, 5)
        model.runBisectingKMeansClustering(X_new, clusterLabels, kminit.initKMeansSampling)
        errorSum += model.finalClusterErrorTotal
        print("Done with Bisecting K-Means Trial # " + str(z + 1) + " / " + str(numTrials))
        print("Error Total = " + str(model.finalClusterErrorTotal))
        
        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
    
    avgError = errorSum / numTrials
    print("Finished Bisecting K-Means. Avg Error = " + str(avgError) + " and Best Error = " + str(bestErrorTotal))
    print("Creating output now for Bisecting K-Means.")
    common.writeResultsFile(bestAssignments)
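As a rough cross-check of the same basic-vs-bisecting comparison, scikit-learn's built-in estimators can be run on the same t-SNE embedding (BisectingKMeans requires scikit-learn >= 1.1). This is only a sketch for sanity-checking the SSE numbers via inertia_, not a replacement for the project's clustering classes.

from sklearn.cluster import KMeans, BisectingKMeans

def compareWithSklearn(X_new, numTrials=5):
    # Run both estimators a few times and report average/best SSE (inertia_).
    for name, Estimator in (("Basic", KMeans), ("Bisecting", BisectingKMeans)):
        errors = []
        for z in range(numTrials):
            est = Estimator(n_clusters=3, n_init=1, random_state=z)
            est.fit(X_new)
            errors.append(est.inertia_)
        print(name + " K-Means (sklearn): avg SSE = " + str(sum(errors) / numTrials) +
              ", best SSE = " + str(min(errors)))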
    
Example #3
def tuneTsneBisecting(createOutput):
    numTrials = 5
    X = common.readDigitsFile()
    print("Shape of Digits file  = " + str(X.shape))
    print("Starting t-SNE.")
    #pca = PCA(n_components=50)
    #X_temp = pca.fit_transform(X)
    tsne = TSNE(n_components=2,
                init='random',
                random_state=0,
                perplexity=35,
                learning_rate=400,
                n_iter=2000)
    X_new = tsne.fit_transform(X)
    #X_new = digitutil.preprocessDownsampling(X)
    print("Finished t-SNE, and got X_new. Shape = " + str(X_new.shape))
    #print("First 10 rows are: ")
    #print(X_new[0:10, :])

    #numTrials = 10
    #X_normalized = normalize(X, norm='l2', axis=0)
    distanceFunc = common.euclideanDistanceFunction
    centerFunc = common.digitsDataCenterFunction
    clusterLabels = np.array(list(range(1, 11)), dtype=np.int8)
    bestAssignments = None
    bestErrorTotal = None
    for z in range(numTrials):

        model = kmclustering.BisectingKMeansClusteringModel(
            distanceFunc, centerFunc, 3)
        model.runBisectingKMeansClustering(X_new, clusterLabels,
                                           kminit.initKMeansSampling)
        #model = kmclustering.BasicKMeansClusteringModel(distanceFunc, centerFunc, 100)
        #initFunc = lambda q, z: kminit.initKMeansSampling(q, z)
        #assignments, centers = kminit.initKMeansSampling(X_normalized, clusterLabels, distanceFunc)
        #model.runBasicKMeansClustering(X_new, clusterLabels, kminit.initKMeansSampling)

        print("====== Done with Trial # " + str(z + 1) + " / " +
              str(numTrials) + ", Error Total = " +
              str(model.finalClusterErrorTotal) + " ======")
        #print("Assignments = " + str(model.finalClusterAssignments))
        #print("Centers = " + str(model.finalClusterCenters))
        #print("Error Map = " + str(model.finalClusterErrorMap))
        #print()

        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
            #print("Improved total SSE! New Value = " + str(bestErrorTotal))

    print("Finished K-Means with t-SNE, P=. Best Error Total = " +
          str(bestErrorTotal))
    #print("Best Assignments = " + str(bestAssignments))
    if createOutput:
        common.writeResultsFile(bestAssignments)
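Despite its name, tuneTsneBisecting runs a single t-SNE configuration (perplexity=35, learning_rate=400). A tuning loop consistent with it could look like the sketch below; the grid values are illustrative assumptions, and the project's own classes are reused unchanged.

def tuneTsneGrid(X, clusterLabels, distanceFunc, centerFunc):
    # Try a small grid of t-SNE settings and keep the one with the lowest SSE.
    results = {}
    for perplexity in (20, 35, 50):
        for learning_rate in (200, 400):
            tsne = TSNE(n_components=2, init='random', random_state=0,
                        perplexity=perplexity, learning_rate=learning_rate,
                        n_iter=2000)
            X_new = tsne.fit_transform(X)
            model = kmclustering.BisectingKMeansClusteringModel(
                distanceFunc, centerFunc, 3)
            model.runBisectingKMeansClustering(X_new, clusterLabels,
                                               kminit.initKMeansSampling)
            results[(perplexity, learning_rate)] = model.finalClusterErrorTotal
            print("perplexity=" + str(perplexity) + ", learning_rate=" +
                  str(learning_rate) + " -> SSE = " +
                  str(model.finalClusterErrorTotal))
    return min(results, key=results.get)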
Example #4
def runSubmission01():
    #print("This is a sample file.")
    X = common.readIrisFile()

    featureCount = X.shape[1]

    n = normalize(X, norm='l2', axis=0)

    #print(n)
    result = kmeans(n, 3)
    common.writeResultsFile(result)
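The kmeans() function called above is resolved elsewhere in the project. As a hypothetical stand-in consistent with how the result is passed straight to common.writeResultsFile, here is a minimal Lloyd's-algorithm sketch that returns one 1-based label per row; the return format is an assumption, not the real function's documented behavior.

import numpy as np

def kmeans(X, k, numIters=100, seed=0):
    # Minimal Lloyd's algorithm: returns a 1-based cluster label per row of X.
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(X.shape[0], size=k, replace=False)]
    labels = np.zeros(X.shape[0], dtype=np.int8)
    for _ in range(numIters):
        # Assignment step: nearest center by Euclidean distance.
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        newLabels = dists.argmin(axis=1).astype(np.int8)
        if np.array_equal(newLabels, labels):
            break
        labels = newLabels
        # Update step: each center becomes the mean of its assigned points.
        for j in range(k):
            if np.any(labels == j):
                centers[j] = X[labels == j].mean(axis=0)
    return labels + 1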
Example #5
def submission02(createOutput, numTrials):
    X = common.readDigitsFile()
    print("Shape of Digits file  = " + str(X.shape))
    pca = PCA(n_components=50)
    X_temp = pca.fit_transform(X)
    tsne = TSNE(n_components=2, init='pca')
    X_new = tsne.fit_transform(X_temp)
    #X_new = digitutil.preprocessDownsampling(X)
    print("Got X_new. Shape = " + str(X_new.shape))
    #print("First 10 rows are: ")
    #print(X_new[0:10, :])

    #numTrials = 10
    # Note: the clustering below runs on the L2-normalized raw features,
    # not on the t-SNE embedding X_new computed above.
    X_normalized = normalize(X, norm='l2', axis=0)
    distanceFunc = common.euclideanDistanceFunction
    centerFunc = common.digitsDataCenterFunction
    clusterLabels = np.array(list(range(1, 11)), dtype=np.int8)
    bestAssignments = None
    bestErrorTotal = None
    for z in range(numTrials):
        model = kmclustering.BasicKMeansClusteringModel(
            distanceFunc, centerFunc, 55)
        #initFunc = lambda q, z: kminit.initKMeansSampling(q, z)
        #assignments, centers = kminit.initKMeansSampling(X_normalized, clusterLabels, distanceFunc)
        model.runBasicKMeansClustering(X_normalized, clusterLabels,
                                       kminit.initKMeansSampling)

        print("====== Done with Trial # " + str(z + 1) + " / " +
              str(numTrials) + ", Error Total = " +
              str(model.finalClusterErrorTotal) + " ======")
        #print("Assignments = " + str(model.finalClusterAssignments))
        #print("Centers = " + str(model.finalClusterCenters))
        #print("Error Map = " + str(model.finalClusterErrorMap))
        #print()

        if bestErrorTotal is None or model.finalClusterErrorTotal < bestErrorTotal:
            bestErrorTotal = model.finalClusterErrorTotal
            bestAssignments = model.finalClusterAssignments
            #print("Improved total SSE! New Value = " + str(bestErrorTotal))

    print("Finished K-Means. Best Error Total = " + str(bestErrorTotal))
    #print("Best Assignments = " + str(bestAssignments))
    if createOutput:
        common.writeResultsFile(bestAssignments)
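The distance and center helpers imported from common above are not shown in these examples. Hypothetical stand-ins matching their apparent contracts (a pairwise Euclidean distance and a feature-wise mean centroid) are sketched below; the real project code may differ.

import numpy as np

def euclideanDistanceFunction(a, b):
    # Euclidean (L2) distance between two feature vectors.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))

def digitsDataCenterFunction(points):
    # Centroid of a cluster: the feature-wise mean of its points.
    return np.asarray(points).mean(axis=0)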