from hw5.libs.common.dataset import Dataset
from hw5.libs.common.util import plot_digit_data
from sklearn.decomposition import PCA  # import added: PCA is used below
import matplotlib.pyplot as plt


def index_digit_ones(Y_train):
    """Return the indices of all training samples labelled as digit 1."""
    idx_ones = []
    for i in range(len(Y_train)):
        if int(Y_train[i]) == 1:
            idx_ones.append(i)
    return idx_ones


if __name__ == '__main__':
    # dataset = Dataset(train_data=1000, test_data=100)
    # dataset = Dataset(train_data=40, test_data=10)
    dataset = Dataset(train_data=80, test_data=20)
    # dataset = Dataset()
    X_train, Y_train, X_test, Y_test = dataset.get_dataset()
    print("before PCA = ", X_train.shape)

    # Dimensionality reduction
    # pca = PCA(n_components=2, whiten=False)
    # pca = PCA(n_components=64, whiten=False)
    pca = PCA(n_components=80, whiten=False)
    X_train = pca.fit_transform(X_train)
    print("after PCA = ", X_train.shape)
    # print(X_train[1].shape)

    print(Y_train)
    idx_ones = index_digit_ones(Y_train)
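    # Hedged sketch (not part of the original listing): visualize the
    # PCA-reduced training set in its first two components and highlight the
    # digit-1 samples indexed above. Plain matplotlib is used here because
    # plot_digit_data's signature is not shown in this excerpt; X_train is
    # the array returned by pca.fit_transform.
    plt.scatter(X_train[:, 0], X_train[:, 1], c='lightgray', label='other digits')
    plt.scatter(X_train[idx_ones, 0], X_train[idx_ones, 1], c='red', label='digit 1')
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend()
    plt.show()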
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
# Code: https://github.com/lazyprogrammer/machine_learning_examples/blob/master/supervised_class/knn.py
import matplotlib.pyplot as plt
from hw5.libs.algo.knn import KNN
from hw5.libs.common.dataset import Dataset
from hw5.libs.common.util import int_to_tuple, save_to_csv
from datetime import datetime


if __name__ == '__main__':
    # dataset = Dataset(train_data=80, test_data=20)
    # dataset = Dataset(train_data=800, test_data=200)
    dataset = Dataset()
    X_train, Y_train, X_test, Y_test = dataset.get_dataset()
    train_scores = []
    test_scores = []
    exec_times = []
    best_test_score = 0
    stored_accuracy = [0, 0]  # [<acc>, <k-th>]

    # Define the number of iterations (K)
    K = 50
    ks = int_to_tuple(K)  # used to plot the results

    # Start KNN: Scratch
    print("Evaluating Scratch KNN")
    for i in range(K):
        k = i + 1
        print("\nk =", k)
        knn = KNN(k)
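        # Hedged sketch (not part of the original listing) of how the loop
        # typically continues, assuming the local KNN wrapper keeps the
        # fit()/score() interface of the referenced lazyprogrammer knn.py;
        # adjust the calls if the hw5 class exposes different methods.
        t0 = datetime.now()
        knn.fit(X_train, Y_train)
        train_scores.append(knn.score(X_train, Y_train))

        test_score = knn.score(X_test, Y_test)
        test_scores.append(test_score)
        exec_times.append((datetime.now() - t0).total_seconds())

        # Keep track of the best accuracy and the k that produced it
        if test_score > best_test_score:
            best_test_score = test_score
            stored_accuracy = [test_score, k]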
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
from hw6.libs.algo.kmeans import MyKMeans
# Imports added: Dataset and int_to_tuple are used below; adjust the module
# paths if the hw6 package lays them out differently.
from hw6.libs.common.dataset import Dataset
from hw6.libs.common.util import int_to_tuple


if __name__ == '__main__':
    # dataset = Dataset(train_data=1000, test_data=100)
    # dataset = Dataset(train_data=20, test_data=10)
    # dataset = Dataset(train_data=40, test_data=10)
    # dataset = Dataset(train_data=80, test_data=20)
    dataset = Dataset(train_data=800, test_data=200)
    # dataset = Dataset()
    # X_train, Y_train, X_test, Y_test = dataset.get_dataset()
    init_X_train, init_Y_train, _, _ = dataset.get_dataset()
    acc_scores = []  # ONLY collect the highest accuracy!

    # Simulate clustering K times.
    K = 50
    # K = 5
    ks = int_to_tuple(K)  # used to plot the results

    # Simulate different numbers of clusters
    n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]  # from n=2 ~ n=10 (max)
    # n_clusters = [2, 3]  # from n=2 ~ n=10 (max)

    # Start simulation ...
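    # Hedged sketch (not part of the original listing) of one way the
    # simulation loop could look. sklearn's KMeans is substituted because
    # MyKMeans's interface is not shown here, and each run is scored by
    # majority-voting a digit label per cluster (an assumed accuracy metric).
    from sklearn.cluster import KMeans

    X = np.asarray(init_X_train)
    y = np.asarray(init_Y_train, dtype=int)

    for n in n_clusters:
        best_acc = 0.0
        for _ in range(K):
            clusters = KMeans(n_clusters=n, n_init=10).fit_predict(X)

            # Map each cluster to its most frequent true digit, then score.
            predicted = np.zeros_like(y)
            for c in range(n):
                mask = clusters == c
                if mask.any():
                    predicted[mask] = np.bincount(y[mask]).argmax()
            best_acc = max(best_acc, float(np.mean(predicted == y)))

        acc_scores.append(best_acc)  # keep only the highest accuracy per n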
# Define values for the clustering simulation
# K = 9
# K = 1  # total number of simulations
K = 4  # total number of simulations
dataset_size = 200  # initial dataset size
# dataset_step_size = 100  # e.g. datasets = {100, 200, 300, ..., 1000}
dataset_step_size = 200  # e.g. datasets = {200, 400, 600, 800}
ratio_train, ratio_test = 0.8, 0.2  # must sum to 1! e.g. splits = {(160, 40), (320, 80), ..., (640, 160)}
datasets = []
ds = []  # dataset size used in each iteration
for i in range(K):
    # print(" >> dataset_size = ", dataset_size)
    train_data = int(dataset_size * ratio_train)
    test_data = dataset_size - train_data
    dataset = Dataset(train_data=train_data, test_data=test_data)
    X_train, Y_train, X_test, Y_test = dataset.get_dataset()
    datasets.append({
        "X_train": X_train,
        "Y_train": Y_train,
        "X_test": X_test,
        "Y_test": Y_test,  # fixed: was mistakenly set to X_train
    })
    ds.append(dataset_size)
    dataset_size += dataset_step_size
    # print(" >> train_data, test_data = ", train_data, test_data)
# print(" >> ds = ", ds)

# Start KMeans: sklearn
print("Evaluating sklearn KMeans")
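# Hedged sketch (not part of the original listing) of the evaluation loop
# over the growing datasets built above. It fits sklearn's KMeans with a
# fixed 10 clusters (one per digit) and times each fit; the cluster count
# and the timing approach are assumptions, since the rest of this script is
# not shown in the excerpt.
from datetime import datetime
from sklearn.cluster import KMeans
import numpy as np

fit_times = []
for size, data in zip(ds, datasets):
    t0 = datetime.now()
    kmeans = KMeans(n_clusters=10, n_init=10).fit(np.asarray(data["X_train"]))
    elapsed = (datetime.now() - t0).total_seconds()
    fit_times.append(elapsed)
    print(" >> dataset_size =", size,
          "| inertia =", round(float(kmeans.inertia_), 2),
          "| fit time = %.3f s" % elapsed)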
Based on the method used in "Learning To Remember Rare Events"
by Lukasz Kaiser, Ofir Nachum, Aurko Roy, and Samy Bengio.
Paper: https://openreview.net/pdf?id=SJTQLdqlg
'''
import numpy as np
import theano
import theano.tensor as T
import time
import os
from hw5.libs.common.dataset import Dataset
from hw5.libs.common.util import int_to_tuple


if __name__ == '__main__':
    dataset = Dataset(train_data=10, test_data=3)
    # dataset = Dataset()
    X_train, Y_train, X_test, Y_test = dataset.get_dataset()
    train_scores = []
    test_scores = []
    print(" > Total X_train = ", len(X_train))
    print(" > Total X_test = ", len(X_test))
    print(">>> Y_test = ", Y_test)

    # Define the number of iterations (K)
    K = 20
    ks = int_to_tuple(K)  # used to plot the results


def l2_normalize(x, dim, epsilon=1e-12):
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
from datetime import datetime
from hw6.libs.algo.gmm import MyGMM
# Imports added: Dataset, int_to_tuple, and filter_dataset are used below;
# adjust the module paths if the hw6 package lays them out differently.
from hw6.libs.common.dataset import Dataset
from hw6.libs.common.util import int_to_tuple, filter_dataset


if __name__ == '__main__':
    # dataset = Dataset(train_data=1000, test_data=100)
    # dataset = Dataset(train_data=20, test_data=10)
    # dataset = Dataset(train_data=40, test_data=10)
    # dataset = Dataset(train_data=80, test_data=20)
    dataset = Dataset(train_data=800, test_data=200)
    # dataset = Dataset()
    X_train, Y_train, _, _ = dataset.get_dataset()
    acc_scores = []

    # Simulate clustering K times.
    K = 50
    # K = 5
    ks = int_to_tuple(K)  # used to plot the results

    # To make the analysis easier, keep only a subset of digits
    selected_digits = [1, 2, 3]
    # selected_digits = [1, 2]
    X_train, Y_train = filter_dataset(selected_digits, X_train, Y_train)

    # Used for visualization only