from sklearn.decomposition import PCA

new_X_train = []  # initialized here so the filtering loop below can append
new_Y_train = []
for i in range(len(Y_train)):
    # if int(Y_train[i]) == 0 or int(Y_train[i]) == 1:
    if int(Y_train[i]) in selected_digits:
        new_X_train.append(X_train[i])
        new_Y_train.append(Y_train[i])

# Use all dataset:
# new_X_train = X_train
# new_Y_train = Y_train

# Used for visualization only
pca = PCA(n_components=2, whiten=False)
reduced_X_train = pca.fit_transform(new_X_train)

unique_labels = get_unique_list(new_Y_train)
n_clusters = len(unique_labels)
print(" >> Labels = ", unique_labels)
print(" >> TOTAL Cluster (n) = ", n_clusters)

# kmeans = KMeans(n_clusters=n_clusters)
kmeans = MyKMeans(n_clusters=n_clusters)

'''
Do dimension reduction first, and analyze the result.
Default: DISABLED; you may enable this.
RESULT: In clustering, PCA does not affect the accuracy!
'''
# pca_x_train = PCA(n_components=64, whiten=False)
# new_X_train = pca_x_train.fit_transform(new_X_train)

kmeans.fit(new_X_train)
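# MyKMeans and get_unique_list are used above but not defined in this snippet.
# A minimal sketch, assuming MyKMeans wraps sklearn's KMeans and scores
# clustering accuracy by matching each cluster to its best-fitting true label
# via the Hungarian algorithm (a common evaluation for clustering on labeled
# data); the original implementation may differ.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans


def get_unique_list(labels):
    # Distinct labels, e.g. [0, 1] when selected_digits = [0, 1]
    return sorted(set(int(y) for y in labels))


class MyKMeans(KMeans):
    # sklearn KMeans plus an accuracy metric for labeled data (assumed interface)

    def eval_acc(self, y_pred, y_true):
        y_pred = np.asarray(y_pred)
        y_true = np.asarray(y_true, dtype=int)
        labels = np.unique(y_true)
        # Contingency matrix: rows = predicted clusters, cols = true labels
        counts = np.zeros((self.n_clusters, len(labels)), dtype=int)
        for c in range(self.n_clusters):
            for j, lab in enumerate(labels):
                counts[c, j] = np.sum((y_pred == c) & (y_true == lab))
        # Hungarian assignment maximizes the number of correctly matched samples
        row, col = linear_sum_assignment(-counts)
        return counts[row, col].sum() / len(y_true)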
# Simulate different numbers of clusters
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]  # from n=2 ~ n=10 (max)
# n_clusters = [2, 3]  # from n=2 ~ n=10 (max)

# Start simulation ...
kmeans, reduced_X_train, y_kmeans = None, None, None
for i in range(len(n_clusters)):
    selected_digits = get_selected_digits(n_clusters[i])
    X_train, Y_train = filter_dataset(selected_digits, init_X_train, init_Y_train)

    # Used for visualization only
    pca = PCA(n_components=2, whiten=False)
    reduced_X_train = pca.fit_transform(X_train)

    unique_labels = get_unique_list(Y_train)
    # n_clusters = total number of unique digits (labels)
    kmeans = MyKMeans(n_clusters=len(unique_labels))

    # Start KMeans: Sklearn
    # Repeat K times and keep the best run (K = number of restarts, defined elsewhere)
    highest_acc = 0.0
    for j in range(K):
        kmeans.fit(X_train)
        y_kmeans = kmeans.predict(X_train)
        accuracy = kmeans.eval_acc(y_kmeans, Y_train) * 100
        # acc_scores.append(accuracy)
        highest_acc = accuracy if accuracy > highest_acc else highest_acc

    str_hacc = str(round(highest_acc, 2))
    print(" >>> highest_acc of n_clusters[%s] = %s " % (i, str_hacc))
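# get_selected_digits and filter_dataset are called above but not shown.
# A minimal sketch, assuming the first n digit classes are selected and that
# X/Y are array-like; the original helpers may differ.
import numpy as np


def get_selected_digits(n):
    # e.g. n=3 -> [0, 1, 2]
    return list(range(n))


def filter_dataset(selected_digits, X, Y):
    # Keep only the samples whose label is in selected_digits
    mask = np.isin(np.asarray(Y, dtype=int), np.asarray(selected_digits))
    return np.asarray(X)[mask], np.asarray(Y)[mask]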