# Script: fit the logistic-regression model and plot the two in-sample error
# curves returned by ``fit`` against the step index t.
# The star import is kept as-is: it supplies Logistic_Regression, and other
# code in this file may rely on additional names it brings into scope.
from Logistic_Regression import *  # noqa: F401,F403
import numpy as np
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # N is handed to model.fit and is also the length of the plotted t axis,
    # so each returned curve is expected to hold N values.
    N = 2000
    model = Logistic_Regression()
    train_x, train_y = model.load_data('./hw3_train.dat')
    test_x, test_y = model.load_data('./hw3_test.dat')

    # fit returns four sequences; only the two E_in curves are plotted below
    # (Eout / Eout_S are returned but not used by this script).
    Ein, Eout, Ein_S, Eout_S = model.fit(train_x, train_y, test_x, test_y, N)

    t = range(N)
    plt.style.use('ggplot')
    plt.xlabel('$t$')
    plt.ylabel('$E_{in}$')
    plt.plot(t, Ein, t, Ein_S)
    # Raw strings: "\e" and "\m" are not valid Python escape sequences, so the
    # non-raw form triggers a SyntaxWarning on recent interpreters. The raw
    # literals produce exactly the same text for matplotlib's mathtext.
    plt.legend([r'GD ($\eta=0.01$)', r'SGD ($\eta=0.001$)'])
    plt.title(r'$E_{in}(\mathbf{w}_t)$ as a function of $t$')
    plt.savefig('./Ein.pdf', format="pdf")
    plt.show()
class Evaluation():
    """Compare logistic regression and naive Bayes with k-fold cross-validation.

    Accuracies (and runtimes) are stored for four datasets, indexed as:
        0: Adult (continuous and binary)
        1: Banknote (all continuous)
        2: Ionosphere (all continuous)
        3: Breast cancer (all binary)

    In every result array, row 0 holds the logistic-regression value and
    row 1 holds the naive-Bayes value.
    """

    def __init__(self,
                 accuracies_3_1=None,
                 accuracies_3_3=None,
                 train_acc_3_3=None,
                 runtimes=None,
                 binary_threshold=0):
        """Store optional precomputed results and create default models.

        Args:
            accuracies_3_1: stored as-is; overwritten by ``main``.
            accuracies_3_3: stored as-is; overwritten by ``main``.
            train_acc_3_3: stored as-is; overwritten by ``main``.
            runtimes: stored as-is; not written by any method in this class.
            binary_threshold: stored as-is. 0 by default (i.e. defaults to
                handling continuous features in naive Bayes).
        """
        self.accuracies_3_1 = accuracies_3_1
        self.accuracies_3_3 = accuracies_3_3
        self.train_acc_3_3 = train_acc_3_3
        self.runtimes = runtimes
        self.binary_threshold = binary_threshold
        self.NB = Naive_Bayes()
        self.LR = Logistic_Regression()

    def fit_and_predict(self, xTrain, yTrain, xTest, yTest, test_type):
        """Fit one model on the training split and score it on the test split.

        Args:
            xTrain, yTrain: training features / labels.
            xTest, yTest: held-out features / labels used for scoring.
            test_type: ``"lr"`` selects logistic regression; any other value
                selects naive Bayes.

        Returns:
            Whatever the selected model's ``score`` method returns for the
            held-out split.
        """
        if test_type == "lr":
            self.LR.fit(xTrain, yTrain)
            # Score on the held-out split, as the naive-Bayes branch does.
            # Scoring on (xTrain, yTrain) would report training accuracy and
            # make the two models' cross-validation numbers incomparable.
            return self.LR.score(xTest, yTest)

        self.NB.get_p_features(xTrain, yTrain)
        self.NB.get_priors(yTrain)
        return self.NB.score(xTest, yTest)

    def K_fold_CV(self, X, y, k):
        """Run k-fold cross-validation for both models on shuffled data.

        The rows are shuffled with ``np.random.permutation``; seed NumPy's
        global RNG before calling if reproducible folds are needed.

        Args:
            X: 2-D feature array (rows are samples).
            y: label array indexable by the same row permutation.
            k: number of folds.

        Returns:
            Array of shape ``(2,)``: mean fold score for logistic regression
            (index 0) and naive Bayes (index 1).
        """
        n_samples = X.shape[0]
        order = np.random.permutation(n_samples)
        X_shuffled = X[order, :]
        y_shuffled = y[order]

        # Every fold has the same size; when n_samples is not divisible by k
        # the trailing remainder rows are only ever used for training.
        fold_size = n_samples // k
        results = np.zeros((2, k))

        for fold in range(k):
            start = fold * fold_size
            stop = start + fold_size

            X_test = X_shuffled[start:stop, :]
            y_test = y_shuffled[start:stop]
            X_train = np.concatenate(
                [X_shuffled[:start, :], X_shuffled[stop:, :]])
            y_train = np.concatenate([y_shuffled[:start], y_shuffled[stop:]])

            results[0, fold] = self.fit_and_predict(X_train, y_train, X_test,
                                                    y_test, "lr")
            results[1, fold] = self.fit_and_predict(X_train, y_train, X_test,
                                                    y_test, "nb")

        # returns the average accuracy over k folds
        return np.average(results, axis=1)

    # e.g. a CSV file name; this attribute is not read by any method here.
    file_name = "Adult_Scaled.csv"

    def separate_X_Y(self, file_name):
        """Load a CSV file and split it into features and labels.

        Args:
            file_name: path handed to ``pd.read_csv``. The special-case check
                below is an exact string comparison on this argument.

        Returns:
            ``(X, Y)`` as NumPy arrays: ``Y`` is the last column; ``X`` is
            every column between the skipped leading column(s) and the last.
        """
        df = pd.read_csv(file_name)
        # For these four files the feature slice starts at column 2 rather
        # than 1 (presumably an extra leading column -- confirm against the
        # CSVs).
        skip_two_leading = (
            "ionosphere90.csv",
            "ionosphere10.csv",
            "adult_unscaled90.csv",
            "adult_unscaled10.csv",
        )
        first_feature_col = 2 if file_name in skip_two_leading else 1
        return (np.array(df.iloc[:, first_feature_col:-1]),
                np.array(df.iloc[:, -1]))

    def _configure_models(self, dataset_index, n_features):
        """Install the per-dataset LR hyperparameters and NB binary threshold."""
        if dataset_index == 0:
            self.LR = Logistic_Regression(lr=0.006,
                                          penalty='none',
                                          max_iter=1750,
                                          random_state=0,
                                          lambdaa=0.0,
                                          beta=0.9)
            self.NB.set_bin_thresh(79)
        elif dataset_index == 1:
            # all features are continuous
            self.LR = Logistic_Regression(lr=3e-5,
                                          penalty='l1',
                                          max_iter=8000,
                                          random_state=0,
                                          lambdaa=0.5,
                                          beta=0.9)
            self.NB.set_bin_thresh(0)
        elif dataset_index == 2:
            # all features are continuous
            self.LR = Logistic_Regression(lr=3e-5,
                                          penalty='l1',
                                          max_iter=10000,
                                          random_state=0,
                                          lambdaa=0.5,
                                          beta=0.9)
            self.NB.set_bin_thresh(0)
        else:
            # all features are binary
            self.LR = Logistic_Regression(lr=3e-5,
                                          penalty='l1',
                                          max_iter=600,
                                          random_state=0,
                                          lambdaa=0.75,
                                          beta=0.9)
            self.NB.set_bin_thresh(n_features)

    def main(self):
        """Run the experiments on all four datasets and record the results.

        For each dataset, the "90" file is used for cross-validation and the
        matching "10" file is used as a held-out set.

        Returns:
            ``(acc_3_1, acc_3_3)`` where ``acc_3_1`` has shape ``(2, 4)``
            (5-fold CV score per model and dataset) and ``acc_3_3`` has shape
            ``(2, 4, 4)`` (held-out score per model, dataset and training
            fraction). Both, plus ``train_acc_3_3``, are also stored on
            ``self``.
        """
        files90 = [
            "adult_unscaled90.csv", "banknote90.csv", "ionosphere90.csv",
            "Breast90.csv"
        ]
        files10 = [
            "adult_unscaled10.csv", "banknote10.csv", "ionosphere10.csv",
            "Breast10.csv"
        ]
        acc_3_1 = np.zeros((2, 4))
        acc_3_3 = np.zeros((2, 4, 4))
        train_acc_3_3 = np.zeros((2, 4, 4))

        # train model and do CV on 100%, 85%, 70%, 65% of the other data points
        percent_train = [1, 0.85, 0.7, 0.65]

        for i, (train_file, holdout_file) in enumerate(zip(files90, files10)):
            print(f"Starting to process file {train_file}")
            X, Y = self.separate_X_Y(train_file)
            X_10, Y_10 = self.separate_X_Y(holdout_file)

            # set binary thresholds and hyperparameters for this dataset
            self._configure_models(i, X.shape[1])

            # task 3-1
            acc_3_1[:, i] = self.K_fold_CV(X, Y, 5)

            n_samples = X.shape[0]
            order = np.random.permutation(n_samples)
            X_shuffled = X[order, :]
            Y_shuffled = Y[order]

            for j, fraction in enumerate(percent_train):
                print(f"Running {fraction} {train_file}")
                n_used = int(n_samples * fraction)
                X_train = X_shuffled[:n_used, :]
                Y_train = Y_shuffled[:n_used]

                train_acc_3_3[:, i, j] = self.K_fold_CV(X_train, Y_train, 5)
                # NOTE(review): the models scored here carry whatever state
                # the last CV fold left behind (fit on a subset of X_train);
                # confirm whether a refit on all of X_train was intended.
                acc_3_3[0, i, j] = self.LR.score(X_10, Y_10)
                acc_3_3[1, i, j] = self.NB.score(X_10, Y_10)

        self.accuracies_3_1 = acc_3_1
        self.accuracies_3_3 = acc_3_3
        self.train_acc_3_3 = train_acc_3_3
        return acc_3_1, acc_3_3