Beispiel #1
0
    new_Y_train = []
    for i in range(len(Y_train)):
        # if int(Y_train[i]) == 0 or int(Y_train[i]) == 1:
        if int(Y_train[i]) in selected_digits:
            new_X_train.append(X_train[i])
            new_Y_train.append(Y_train[i])

    # Use the entire dataset instead (no filtering by selected digits):
    # new_X_train = X_train
    # new_Y_train = Y_train

    # Used for visualization only
    pca = PCA(n_components=2, whiten=False)
    reduced_X_train = pca.fit_transform(new_X_train)

    unique_labels = get_unique_list(new_Y_train)
    n_clusters = len(unique_labels)
    print(" >> Labels = ", n_clusters)
    print(" >> TOTAL Cluster (n) = ", n_clusters)
    # kmeans = KMeans(n_clusters=n_clusters)
    kmeans = MyKMeans(n_clusters=n_clusters)
    '''
    Do dimension Reduction first, and analyze the result
    Default: DISABLED; you may enable this. 
    RESULT: In clustering, PCA does not affect the accuracy!
    '''
    # pca_x_train = PCA(n_components=64, whiten=False)
    # new_X_train = pca_x_train.fit_transform(new_X_train)

    kmeans.fit(new_X_train)
Beispiel #2
0
    # simulate different number of clusters
    n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]  # from n=2 ~ n=10 (max)
    # n_clusters = [2, 3]  # shorter run: n=2 and n=3 only

    # Start simulation ...
    kmeans, reduced_X_train, y_kmeans = None, None, None
    for i in range(len(n_clusters)):
        selected_digits = get_selected_digits(n_clusters[i])
        X_train, Y_train = filter_dataset(selected_digits, init_X_train,
                                          init_Y_train)

        # Used for visualization only
        pca = PCA(n_components=2, whiten=False)
        reduced_X_train = pca.fit_transform(X_train)

        unique_labels = get_unique_list(Y_train)
        kmeans = MyKMeans(
            n_clusters=len(unique_labels)
        )  # n_clusters = total number of unique digits (labels)

        # Run MyKMeans K times and keep the highest accuracy seen
        highest_acc = 0.0
        for j in range(K):
            kmeans.fit(X_train)
            y_kmeans = kmeans.predict(X_train)
            accuracy = kmeans.eval_acc(y_kmeans, Y_train) * 100
            # acc_scores.append(accuracy)
            highest_acc = accuracy if accuracy > highest_acc else highest_acc

        str_hacc = str(round(highest_acc, 2))
        print(" >>> highest_acc of n_clusters[%s] = %s " %