def trainQuadratic(cvals, kernel):
    """Sweep the SVM penalty ``C`` for one kernel and return the best value.

    For every candidate in ``cvals`` an ``SVM(kernel, C)`` is fitted on
    ``X_train``/``y_train`` (names taken from the enclosing module), its
    training and test errors are recorded, and its cross-validation error is
    obtained from ``lambdaError`` over ``folds_1a``.  The three error curves
    are then plotted against ``C`` on a logarithmic x-axis.

    Args:
        cvals: Sequence of candidate ``C`` values.
        kernel: Kernel handed unchanged to ``SVM`` and ``lambdaError``.

    Returns:
        The ``C`` with the lowest cross-validation error (the first one on
        ties), or ``-1`` when ``cvals`` is empty.
    """
    cv_errors = []
    test_errors = []
    train_errors = []
    # Start from +inf instead of 100: with a strict "<" comparison a candidate
    # whose error is exactly 100 could never be selected and the function
    # would return the -1 placeholder for a non-empty sweep.
    best_cv_error = float("inf")
    c_choice = -1
    for index, c_value in enumerate(cvals):
        print(index)
        svm = SVM(kernel, c_value)
        svm.fit(X_train, y_train)
        test_pred = svm.predict(X_test)
        train_pred = svm.predict(X_train)
        test_error = utils.classification_error(test_pred, y_test)
        train_error = utils.classification_error(train_pred, y_train)
        test_errors.append(test_error)
        train_errors.append(train_error)

        cv_error = lambdaError(kernel, c_value, folds_1a)
        if cv_error < best_cv_error:
            best_cv_error = cv_error
            c_choice = c_value
        cv_errors.append(cv_error)

    plt.plot(cvals, cv_errors, label='Cross Validation Error')
    plt.plot(cvals, test_errors, label='Test Error')
    plt.plot(cvals, train_errors, label='Training Error')
    plt.xlabel("C Used")
    plt.ylabel("Percent Error")
    plt.xscale("log")
    plt.title("Error vs C Value Used")
    plt.legend()
    plt.show()
    return c_choice
def linearSVMPlot():
    """Plot linear-SVM error curves for every ``C`` in ``cval_1a``.

    A kernel-less ``SVM(None, C)`` is trained per candidate; its test,
    training and cross-validation errors (the latter via ``lambdaError`` on
    ``folds_1a``) are collected, printed and drawn against ``C`` on a
    logarithmic x-axis.  Nothing is returned.
    """
    cv_curve = []
    test_curve = []
    train_curve = []

    for step, c_value in enumerate(cval_1a):
        print(step)
        classifier = SVM(None, c_value)
        classifier.fit(X_train, y_train)
        predicted_test = classifier.predict(X_test)
        predicted_train = classifier.predict(X_train)
        err_test = utils.classification_error(predicted_test, y_test)
        err_train = utils.classification_error(predicted_train, y_train)
        test_curve.append(err_test)
        train_curve.append(err_train)
        cv_curve.append(lambdaError(None, c_value, folds_1a))

    curves = (
        (cv_curve, 'Cross Validation Error'),
        (test_curve, 'Test Error'),
        (train_curve, 'Training Error'),
    )
    for values, caption in curves:
        plt.plot(cval_1a, values, label=caption)

    # Raw numbers are echoed in the order training, test, cross-validation.
    for values in (train_curve, test_curve, cv_curve):
        print(values)

    plt.xlabel("C Used")
    plt.ylabel("Percent Error")
    plt.xscale("log")
    plt.title("Error vs C Value Used")
    plt.legend()
    plt.show()
def plotPolyOptimal(cvals, qvals):
    """Plot SVM errors for polynomial kernels of varying degree.

    ``qvals[k]`` is the polynomial degree and ``cvals[k]`` the penalty used
    together with it, so ``cvals`` must be at least as long as ``qvals``.
    For each pair the training, test and cross-validation errors are
    collected, printed and plotted against the degree.  Nothing is returned.
    """
    cv_curve, test_curve, train_curve = [], [], []

    for position, degree in enumerate(qvals):
        print(position)
        penalty = cvals[position]

        def poly_kernel(x1, x2):
            # (<x1, x2> + 1) ** degree
            return (np.dot(x1, x2) + 1)**degree

        machine = SVM(poly_kernel, penalty)
        machine.fit(X_train, y_train)
        on_test = machine.predict(X_test)
        on_train = machine.predict(X_train)
        err_test = utils.classification_error(on_test, y_test)
        err_train = utils.classification_error(on_train, y_train)
        test_curve.append(err_test)
        train_curve.append(err_train)
        cv_curve.append(lambdaError(poly_kernel, penalty, folds_1a))

    print(train_curve)
    print(test_curve)
    print(cv_curve)

    plt.plot(qvals, cv_curve, label='Cross Validation Error')
    plt.plot(qvals, test_curve, label='Test Error')
    plt.plot(qvals, train_curve, label='Training Error')
    plt.xlabel("Q Used")
    plt.ylabel("Percent Error")
    plt.title("Error vs Q Value Used")
    plt.legend()
    plt.show()
def _rbf_kernel(gamma):
    """Return the kernel ``k(x1, x2) = exp(-gamma * ||x1 - x2||^2)``."""
    def kernel(x1, x2):
        diff = x1 - x2
        return math.exp(diff.dot(diff) * -gamma)
    return kernel


def plotRBF(cvals, yvals):
    """Plot SVM errors for RBF kernels of varying bandwidth.

    ``yvals[k]`` is the kernel coefficient and ``cvals[k]`` the penalty used
    together with it, so ``cvals`` must be at least as long as ``yvals``.
    For each pair the training, test and cross-validation errors are
    collected, printed and plotted against the coefficient on a logarithmic
    x-axis.  Nothing is returned.
    """
    cv_errors = []
    test_errors = []
    train_errors = []
    for index, gamma in enumerate(yvals):
        print(index)
        # Bind the coefficient explicitly.  The kernel used to be a single
        # lambda created before the loop that looked up the loop variable
        # when called, which only worked while the loop was running.
        rbf_eq = _rbf_kernel(gamma)
        c_value = cvals[index]

        svm = SVM(rbf_eq, c_value)
        svm.fit(X_train, y_train)
        test_pred = svm.predict(X_test)
        train_pred = svm.predict(X_train)
        test_error = utils.classification_error(test_pred, y_test)
        train_error = utils.classification_error(train_pred, y_train)
        test_errors.append(test_error)
        train_errors.append(train_error)
        cv_errors.append(lambdaError(rbf_eq, c_value, folds_1a))

    print(train_errors)
    print(test_errors)
    print(cv_errors)
    plt.plot(yvals, cv_errors, label='Cross Validation Error')
    plt.plot(yvals, test_errors, label='Test Error')
    plt.plot(yvals, train_errors, label='Training Error')
    plt.xlabel("y Used")
    plt.xscale("log")
    plt.ylabel("Percent Error")
    plt.title("Error vs y Value Used")
    plt.legend()
    plt.show()
def tune(name):
    """Grid-search the back-propagation network size for one data set.

    For 0, 1 and 2 as the third argument of ``BackProp.build_network`` and
    for every value from 0 to ``max_nodes`` as the fourth (``max_nodes`` is
    the number of columns of the data minus one), a fresh random sample of
    45 rows is split into training and test sets, a network is trained and
    its classification error (as a percentage) is appended as one row
    ``[i, j, error]`` to ``<name>_tuning.csv``.

    A configuration that fails to train or classify is recorded with the
    placeholder error ``'oops'`` instead of aborting the sweep.

    Args:
        name: Data-set name understood by ``u.get_data``; also the prefix of
            the CSV file that receives the results.
    """
    print(name, 'tuning')
    data = u.get_data(name)
    max_nodes = data.shape[1] - 1

    for i in range(3):
        for j in range(max_nodes + 1):
            # data setup
            test_data = data.sample(45)
            sets = u.split_to_train_test_sets(test_data)
            training_set = sets['Training_Set']
            test_set = sets['Test_Set']

            error = 'oops'
            try:
                # training and testing
                # NOTE(review): 0.25 is presumably the learning rate -- confirm
                # against BackProp.build_network.
                model = BackProp.build_network(training_set, i, j, 0.25)
                classified = BackProp.classify(model, test_set)
                error = u.classification_error(classified)
                error = round(error, 4) * 100
            except Exception as exc:
                # Best effort: keep sweeping, but say why the cell failed and
                # let KeyboardInterrupt/SystemExit through (a bare ``except``
                # used to swallow those as well).
                print('configuration (%d, %d) failed: %r' % (i, j, exc))

            # record to file; the ``with`` block closes it
            with open(name + '_tuning.csv', 'a', newline='') as file:
                csv.writer(file).writerow([i, j, error])
def fit(self):
    """Run the optimiser on the stored data and report the training error.

    ``minimizers.findMin`` is started from the current ``self.w`` and
    ``self.alpha``; the first two values it returns replace them and the
    remaining two are ignored.  The classification error of the updated
    model on ``self.X``/``self.y`` is printed.
    """
    new_w, new_alpha, _unused_third, _ = minimizers.findMin(
        self.funObj, self.w, self.alpha, self.maxEvals, self.verbose,
        self.X, self.y)
    self.w = new_w
    self.alpha = new_alpha

    train_err = utils.classification_error(self.predict(self.X), self.y)
    print("Training error: %.3f" % train_err)
def lambdaError(kernel, cval, folds, n_folds=5):
    """Return the mean cross-validation error of an SVM.

    One ``SVM(kernel, cval)`` instance is created and re-fitted for each
    fold: fold ``i`` is held out via
    ``utils.partition_cross_validation_fold(folds, i)``, the model is trained
    on the rest and its classification error on the held-out part is
    accumulated.

    Args:
        kernel: Kernel passed to ``SVM``.
        cval: Penalty parameter passed to ``SVM``.
        folds: Fold structure understood by
            ``utils.partition_cross_validation_fold``.
        n_folds: Number of folds to evaluate (default 5, the value that used
            to be hard-coded).

    Returns:
        The held-out classification error averaged over ``n_folds`` folds.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    svm = SVM(kernel, cval)
    total_error = 0
    for fold in range(n_folds):
        leave_out_data, training_data = utils.partition_cross_validation_fold(
            folds, fold)
        svm.fit(training_data[0], training_data[1])
        fold_pred = svm.predict(leave_out_data[0])
        total_error = total_error + utils.classification_error(
            fold_pred, leave_out_data[1])
    # The old "failed trainings" counter was never incremented, so the
    # divisor was always the fold count.
    return total_error / n_folds
def lambdaError(lam, folds):
    """Return the 5-fold cross-validation error of logistic regression.

    A single ``LogisticRegression(lam)`` instance is re-fitted once per fold
    on the data returned by ``utils.partition_cross_validation_fold`` and
    scored on the held-out part; the five errors are averaged.
    """
    classifier = LogisticRegression(lam)
    fold_errors = []
    for fold_index in range(5):
        held_out, train_part = utils.partition_cross_validation_fold(
            folds, fold_index)
        classifier.fit(train_part[0], train_part[1])
        guesses = classifier.predict(held_out[0])
        fold_errors.append(utils.classification_error(guesses, held_out[1]))
    return sum(fold_errors) / 5
def plotNeuralNetworks(mode, step_sz):
    """Plot neural-network errors for every hidden-layer size in ``dvals``.

    For each size the stored initial weights are loaded from
    ``P3/<folder>/InitParams/sigmoid/<size>``, a
    ``NeuralNetworkClassification`` is trained on ``X_train``/``y_train`` and
    its test, training and cross-validation errors (the latter from
    ``cvError``) are collected, printed and plotted against the size.

    Args:
        mode: Activation passed to the network and to ``cvError``.
        step_sz: Step size passed to ``fit`` and to ``cvError``.
    """
    cv_errors = []
    test_errors = []
    train_errors = []
    for index, hidden_units in enumerate(dvals):
        print("Starting CV ", index)
        # NOTE(review): the weights directory is always ".../sigmoid/..."
        # regardless of ``mode`` -- confirm this is intended.
        path = "P3/" + folder + "/InitParams/sigmoid/" + str(hidden_units)
        initial_params = utils.load_initial_weights(path)
        nn = NeuralNetworkClassification(d,
                                         num_hidden=hidden_units,
                                         activation=mode,
                                         W1=initial_params["W1"],
                                         W2=initial_params["W2"],
                                         b1=initial_params["b1"],
                                         b2=initial_params["b2"])
        nn.fit(X_train, y_train, step_size=step_sz)
        test_pred = nn.predict(X_test)
        train_pred = nn.predict(X_train)
        test_error = utils.classification_error(test_pred, y_test)
        train_error = utils.classification_error(train_pred, y_train)
        print(test_error)
        print(train_error)
        test_errors.append(test_error)
        train_errors.append(train_error)
        cv_errors.append(cvError(hidden_units, folds, mode, step_sz))

    plt.plot(dvals, cv_errors, label='Cross Validation Error')
    plt.plot(dvals, test_errors, label='Test Error')
    plt.plot(dvals, train_errors, label='Training Error')
    print(train_errors)
    print(test_errors)
    print(cv_errors)
    # The x-axis shows dvals; the label used to read "C Used", a leftover
    # from the SVM plots.
    plt.xlabel("D Used")
    plt.ylabel("Percent Error")
    plt.title("Error vs D Value Used")
    plt.legend()
    plt.show()
def five_fold_validation(data_set, data_name, eta=None, demo=False):
    """Cross-validate Logistic Regression or Naive Bayes over the folds.

    ``u.five_fold_split`` partitions ``data_set``; each part serves once as
    the test set while the others are concatenated into the training set.
    Classification adds a ``Guess`` column to the test part, which is removed
    again after scoring so the part can be reused for training.

    Args:
        data_set: DataFrame to split.
        data_name: Name forwarded to the learners (used by their export).
        eta: When truthy, Logistic Regression is trained with this value;
            otherwise Naive Bayes is used.
        demo: When true, print the training sample, the model and the
            classified test set for the first fold and stop after it.

    Returns:
        The mean classification error over the evaluated folds.

    Raises:
        ZeroDivisionError: If the split yields no folds.
    """
    # split the datasets into fifths
    splits = u.five_fold_split(data_set)
    errors = []

    for split in splits:
        # this fifth is the test set, all the others form the training set
        test_set = splits[split]
        train_parts = [splits[s] for s in splits if s != split]
        # DataFrame.append was removed in pandas 2.0; one concat also avoids
        # re-copying the growing frame for every part.
        if train_parts:
            training_set = pd.concat(train_parts, sort=False)
        else:
            training_set = pd.DataFrame(columns=data_set.columns.values)

        # only export and demonstrate one of the folds
        export = split == 1

        if eta:
            # eta supplied: Logistic Regression
            model = Logistic_Regression.learn_models(training_set, eta,
                                                     data_name, export=export)
            Logistic_Regression.classify(test_set, model)
        else:
            # no eta supplied: Naive Bayes
            model = Naive_Bayes.learn(training_set, data_name, export=export)
            Naive_Bayes.classify(test_set, model)

        # find and append the classification error
        err = u.classification_error(test_set)
        errors.append(err)

        # print results of the first split only
        if demo:
            print('Sample Training Data\n', training_set.head())
            print('\nWeight Vectors')
            for m in model:
                print(m, model[m])
            print('\nClassified Test Set\n',test_set)
            break

        # remove Guess column to prevent errors in future fold tests
        test_set.drop(['Guess'], axis=1, inplace=True)

    # return average error
    return sum(errors)/len(errors)
def fit(self, X, y):
    """Fit the model with greedy forward feature selection.

    Column 0 is always part of the model.  In every round each feature that
    is not yet selected is tentatively added, the weights are re-optimised
    with ``findMin.findMin`` on the chosen columns and the resulting
    classification error on ``(X, y)`` is measured.  The feature giving the
    lowest error seen so far is added, and rounds continue until a round
    fails to lower that error.  Finally ``self.w`` (length ``d``, zero for
    unselected features) is re-fitted on the selected columns.

    Args:
        X: 2-D array of features, one row per example.
        y: Targets passed through to the objective and the error measure.
    """
    _, d = X.shape

    def minimize(ind):
        """Optimise the weights of the columns listed in ``ind``."""
        return findMin.findMin(self.funObj, np.zeros(len(ind)),
                               self.maxEvals, X[:, ind], y, verbose=0)

    selected = {0}
    minLoss = np.inf
    oldLoss = 0
    bestFeature = -1

    while minLoss != oldLoss:
        oldLoss = minLoss
        print("Epoch %d " % len(selected))
        print("Selected feature: %d" % (bestFeature))
        print("Min Loss: %.3f\n" % minLoss)

        for i in range(d):
            if i in selected:
                continue

            # tentatively add feature "i" to the selected set and fit
            selected_new = selected | {i}
            self.w = np.zeros(d)
            self.w[list(selected_new)], _ = minimize(list(selected_new))

            # then compute the loss and update minLoss/bestFeature
            loss = utils.classification_error(self.predict(X), y)
            if loss < minLoss:
                minLoss = loss
                bestFeature = i

        # bestFeature is still the -1 placeholder when no candidate was
        # evaluated (e.g. d == 1); adding it would make index -1 alias the
        # last column in the final fit.
        if bestFeature >= 0:
            selected.add(bestFeature)

    self.w = np.zeros(d)
    self.w[list(selected)], _ = minimize(list(selected))
def cvError(dval, folds, mode, step):
    """Return the 5-fold cross-validation error of the neural network.

    The initial weights stored under ``P3/<folder>/InitParams/sigmoid/<dval>``
    are loaded once; a fresh ``NeuralNetworkClassification`` with ``dval``
    hidden units is then built from them for every fold, trained on the
    fold's training part with step size ``step`` and scored on the held-out
    part.  The five errors are averaged.
    """
    weights_dir = "P3/" + folder + "/InitParams/sigmoid/" + str(dval)
    initial_params = utils.load_initial_weights(weights_dir)

    fold_errors = []
    for fold_index in range(5):
        network = NeuralNetworkClassification(
            d,
            num_hidden=dval,
            activation=mode,
            W1=initial_params["W1"],
            W2=initial_params["W2"],
            b1=initial_params["b1"],
            b2=initial_params["b2"],
        )
        held_out, train_part = utils.partition_cross_validation_fold(
            folds, fold_index)
        network.fit(train_part[0], train_part[1], step_size=step)
        guesses = network.predict(held_out[0])
        fold_errors.append(utils.classification_error(guesses, held_out[1]))

    return sum(fold_errors) / 5
def five_fold_validation(data_set, data_name, n_layers, n_neurons, demo=False):
    """Cross-validate the back-propagation network over the folds.

    ``u.five_fold_split`` partitions ``data_set``; each part serves once as
    the test set while the others are concatenated into the training set.
    After scoring, the ``Guess`` column is dropped from the test part so the
    part can be reused for training in later folds.

    Args:
        data_set: DataFrame to split.
        data_name: Unused here; kept for interface compatibility.
        n_layers: Third argument of ``BackProp.build_network``.
        n_neurons: Fourth argument of ``BackProp.build_network``.
        demo: When true, print the training sample, the first element of the
            model and the classified test set for the first fold and stop
            after it.

    Returns:
        The mean classification error over the evaluated folds.

    Raises:
        ZeroDivisionError: If the split yields no folds.
    """
    # split the datasets into fifths
    splits = u.five_fold_split(data_set)
    errors = []

    for split in splits:
        # this fifth is the test set, all the others form the training set
        test_set = splits[split]
        train_parts = [splits[s] for s in splits if s != split]
        # DataFrame.append was removed in pandas 2.0; one concat also avoids
        # re-copying the growing frame for every part.
        if train_parts:
            training_set = pd.concat(train_parts, sort=False)
        else:
            training_set = pd.DataFrame(columns=data_set.columns.values)

        # train network
        # NOTE(review): 0.25 is presumably the learning rate -- confirm
        # against BackProp.build_network.
        model = BackProp.build_network(training_set, n_layers, n_neurons, 0.25)
        # classify test set
        classified = BackProp.classify(model, test_set)

        # find and append the classification error
        err = u.classification_error(classified)
        errors.append(err)

        # print results of the first split only
        if demo:
            print('Sample Training Data\n', training_set.head())
            print('\nNetwork')
            print(model[0])
            print('\nClassified Test Set\n', test_set)
            break

        # remove Guess column to prevent errors in future fold tests
        test_set.drop(['Guess'], axis=1, inplace=True)

    # return average error
    return sum(errors) / len(errors)
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-q', '--question', required=True) io_args = parser.parse_args() question = io_args.question if question == "2": data = utils.load_dataset("logisticData") XBin, yBin = data['X'], data['y'] XBinValid, yBinValid = data['Xvalid'], data['yvalid'] model = linear_model.logReg(maxEvals=400) model.fit(XBin, yBin) print("\nlogReg Training error %.3f" % utils.classification_error(model.predict(XBin), yBin)) print("logReg Validation error %.3f" % utils.classification_error(model.predict(XBinValid), yBinValid)) print("# nonZeros: %d" % (model.w != 0).sum()) elif question == "2.1": data = utils.load_dataset("logisticData") XBin, yBin = data['X'], data['y'] XBinValid, yBinValid = data['Xvalid'], data['yvalid'] model = linear_model.logRegL2(lammy=1.0, maxEvals=400) model.fit(XBin, yBin) print("\nlogRegL2 Training error %.3f" % utils.classification_error(model.predict(XBin), yBin)) print("logRegL2 Validation error %.3f" %
# part 1: implement knn.predict # part 2: print training and test errors for k=1,3,10 (use utils.classification_error) # part 3: plot classification boundaries for k=1 (use utils.plot_2dclassifier) model = None predict = None yhat = None Yhat = None tr_err = 0 te_err = 0 for k in [1, 3, 10]: model = knn.fit(X, y, k) predict = model['predict'] yhat = predict(model, X) Yhat = predict(model, Xtest) tr_err = utils.classification_error(y, yhat) te_err = utils.classification_error(ytest, Yhat) print("Training error for k =", k, "is =", tr_err) print("Testing error for k =", k, "is =", te_err) utils.plot_2dclassifier(knn.fit(X, y, 1), Xtest, ytest) plt.show() if question == '1.2': dataset = utils.load_dataset('citiesBig1') X = dataset['X'] y = dataset['y'] Xtest = dataset['Xtest'] ytest = dataset['ytest'] # part 1: implement cnn.py
def main():
    """Run the Black Friday gender / product-category experiments.

    Steps, in order:

    1. Load ``../data/BlackFriday.csv``, replace missing values by 0 and
       encode gender, age, city category and years-in-city as integers.
    2. Predict gender with multinomial logistic regression and an SGD
       classifier.
    3. Predict ``Product_Category_1`` with KNN, a decision-tree depth sweep
       (figure saved to ``../figs/trainTest_category1.pdf``), a random
       forest, a depth-13 tree and logistic regression.
    4. Regress ``Product_Category_1`` with degree-4 polynomial features
       (first 1000 training rows), a Gaussian process and kernel ridge.

    All results are printed; nothing is returned.
    """
    # Only the regression experiments at the end need these.
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error
    from sklearn.preprocessing import PolynomialFeatures

    # Expected columns: User_ID, Product_ID, Gender, Age, Occupation,
    # City_Category, Stay_In_Current_City_Years, Marital_Status,
    # Product_Category_1, Product_Category_2, Product_Category_3, Purchase
    X = pd.read_csv('../data/BlackFriday.csv')
    N, d = X.shape
    print(N, d)

    # fill missing values with 0
    X = X.fillna(0)
    # gender -> 0/1, age -> 0..6, city category -> 0..2, years -> integer
    X['Gender'] = X['Gender'].apply(change_gender)
    X['Age'] = X['Age'].apply(change_age)
    X['City_Category'] = X['City_Category'].apply(change_city)
    X['Stay_In_Current_City_Years'] = X['Stay_In_Current_City_Years'].apply(
        change_year)

    # ------------------------------------------------------------------
    # predict gender
    # ------------------------------------------------------------------
    y = X['Gender'].values.astype('int')
    X1 = X.drop(['User_ID', 'Product_ID', 'Gender'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X1,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=42)

    model = LogisticRegression(C=1,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    model = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
    model.fit(X_train, y_train)
    print("logLinearClassifier Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("logLinearClassifier Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    # ------------------------------------------------------------------
    # predict Product_Category_1 from the other information
    # ------------------------------------------------------------------
    y2 = X['Product_Category_1'].values.astype('int')
    X2 = X.drop([
        'User_ID', 'Product_ID', 'Product_Category_1', 'Product_Category_2',
        'Product_Category_3'
    ], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X2,
                                                        y2,
                                                        test_size=0.2,
                                                        random_state=42)

    model = KNeighborsClassifier(n_neighbors=5, metric='cosine')
    model.fit(X_train, y_train)
    tr_error = np.mean(model.predict(X_train) != y_train)
    te_error = np.mean(model.predict(X_test) != y_test)
    # These messages used to say "age" although the target here is the
    # product category.
    print("Training error of KNN to predict product category 1: %.3f" %
          tr_error)
    print("Testing error of KNN to predict product category 1: %.3f" %
          te_error)

    # decision-tree depth sweep
    e_depth = 20
    s_depth = 1
    train_errors = np.zeros(e_depth - s_depth)
    test_errors = np.zeros(e_depth - s_depth)
    for i, depth in enumerate(range(s_depth, e_depth)):
        print("\nDepth: %d" % depth)
        model = DecisionTreeClassifier(max_depth=depth,
                                       criterion='entropy',
                                       random_state=1)
        model.fit(X_train, y_train)
        tr_error = np.mean(model.predict(X_train) != y_train)
        te_error = np.mean(model.predict(X_test) != y_test)
        print("Training error: %.3f" % tr_error)
        print("Testing error: %.3f" % te_error)
        train_errors[i] = tr_error
        test_errors[i] = te_error

    x_vals = np.arange(s_depth, e_depth)
    plt.title("The effect of tree depth on testing/training error")
    plt.plot(x_vals, train_errors, label="training error")
    plt.plot(x_vals, test_errors, label="testing error")
    plt.xlabel("Depth")
    plt.ylabel("Error")
    plt.legend()
    fname = os.path.join("..", "figs", "trainTest_category1.pdf")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)

    model = RandomForestClassifier(criterion="entropy",
                                   n_estimators=5,
                                   max_features=5)
    model.fit(X_train, y_train)
    print("RandomForest Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("RandomForest Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    tree = DecisionTreeClassifier(max_depth=13,
                                  criterion='entropy',
                                  random_state=1)
    tree.fit(X_train, y_train)
    tr_error = np.mean(tree.predict(X_train) != y_train)
    te_error = np.mean(tree.predict(X_test) != y_test)
    print("Decision Tree Training error : %.3f" % tr_error)
    print("Decision Tree Validation error: %.3f" % te_error)

    # softmax classifier on the same product-category split
    model = LogisticRegression(C=10000,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    # ------------------------------------------------------------------
    # regression on Product_Category_1
    # ------------------------------------------------------------------
    poly = PolynomialFeatures(degree=4)
    X_train_sub = X_train[:1000]
    y_train_sub = y_train[:1000]
    X_train_ = poly.fit_transform(X_train_sub)
    model = LinearRegression()
    model.fit(X_train_, y_train_sub)
    tr_error = mean_squared_error(y_train_sub, model.predict(X_train_))
    # The test features must go through the same polynomial expansion as the
    # training features, otherwise predict() sees the wrong number of
    # columns.  The test score is an MSE like the training score; comparing
    # real-valued predictions with "!=" is meaningless.
    X_test_ = poly.transform(X_test)
    te_error = mean_squared_error(y_test, model.predict(X_test_))
    print("Training error : %.3f" % tr_error)
    print("Validation error: %.3f" % te_error)

    # Same features and target as above, but only 2% held out.
    X_train, X_test, y_train, y_test = train_test_split(X2,
                                                        y2,
                                                        test_size=0.02,
                                                        random_state=42)
    # NOTE(review): an exact Gaussian process needs an n x n kernel matrix;
    # fitting it on 98% of the data set may not be feasible -- consider a
    # subsample such as X_train_sub.
    gpr = GaussianProcessRegressor(kernel=None,
                                   random_state=0).fit(X_train, y_train)
    tr_error = mean_squared_error(y_train, gpr.predict(X_train))
    te_error = mean_squared_error(y_test, gpr.predict(X_test))
    # These two errors used to be computed and then discarded.
    print("GaussianProcessRegressor Training error %.3f" % tr_error)
    print("GaussianProcessRegressor Validation error %.3f" % te_error)

    clf = KernelRidge(alpha=0.5)
    clf.fit(X_train_sub, y_train_sub)
    # The score used to be computed and then discarded.
    print("KernelRidge training R^2 %.3f" %
          clf.score(X_train_sub, y_train_sub, sample_weight=None))
def main():
    """Run the Black Friday age / occupation experiments and visualisations.

    Steps, in order:

    1. Load ``../data/BlackFriday.csv``, replace missing values by 0 and
       encode gender, age, city category and years-in-city as integers.
    2. Predict age with a decision-tree depth sweep (figure saved to
       ``../figs/trainTest_age.pdf``), a depth-13 tree, a random forest and
       logistic regression.
    3. Predict occupation with KNN, a depth-18 tree and logistic regression.
    4. Embed subsets of the data with Isomap and PCA, cluster the embeddings
       with DBSCAN and k-means, and save the figures under ``../figs``.

    All results are printed or plotted; nothing is returned.
    """
    from sklearn.manifold import Isomap

    # Expected columns: User_ID, Product_ID, Gender, Age, Occupation,
    # City_Category, Stay_In_Current_City_Years, Marital_Status,
    # Product_Category_1, Product_Category_2, Product_Category_3, Purchase
    X = pd.read_csv('../data/BlackFriday.csv')
    X.info()

    # fill missing values with 0
    X = X.fillna(0)
    # gender -> 0/1, age -> 0..6, city category -> 0..2, years -> integer
    X['Gender'] = X['Gender'].apply(change_gender)
    X['Age'] = X['Age'].apply(change_age)
    X['City_Category'] = X['City_Category'].apply(change_city)
    X['Stay_In_Current_City_Years'] = X['Stay_In_Current_City_Years'].apply(
        change_year)

    # ------------------------------------------------------------------
    # predict age
    # ------------------------------------------------------------------
    y = X['Age'].values.astype('int')
    X_no_age = X.drop(['User_ID', 'Product_ID', 'Age'], axis=1)
    # 80/20 split; no random_state, so the split differs between runs
    X_train, X_test, y_train, y_test = train_test_split(X_no_age,
                                                        y,
                                                        test_size=0.2)

    e_depth = 20
    s_depth = 1
    train_errors = np.zeros(e_depth - s_depth)
    test_errors = np.zeros(e_depth - s_depth)
    for i, depth in enumerate(range(s_depth, e_depth)):
        print("\nDepth: %d" % depth)
        model = DecisionTreeClassifier(max_depth=depth,
                                       criterion='entropy',
                                       random_state=1)
        model.fit(X_train, y_train)
        tr_error = np.mean(model.predict(X_train) != y_train)
        te_error = np.mean(model.predict(X_test) != y_test)
        print("Training error: %.3f" % tr_error)
        print("Testing error: %.3f" % te_error)
        train_errors[i] = tr_error
        test_errors[i] = te_error

    x_vals = np.arange(s_depth, e_depth)
    plt.title("The effect of tree depth on testing/training error")
    plt.plot(x_vals, train_errors, label="training error")
    plt.plot(x_vals, test_errors, label="testing error")
    plt.xlabel("Depth")
    plt.ylabel("Error")
    plt.legend()
    fname = os.path.join("..", "figs", "trainTest_age.pdf")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)

    # decision tree at the depth picked from the sweep
    tree = DecisionTreeClassifier(max_depth=13,
                                  criterion='entropy',
                                  random_state=1)
    tree.fit(X_train, y_train)
    tr_error = np.mean(tree.predict(X_train) != y_train)
    te_error = np.mean(tree.predict(X_test) != y_test)
    # This message used to say "occupation" although the target here is age.
    print("Training error of predicting age: %.3f" % tr_error)
    print("Testing error: %.3f" % te_error)

    model = RandomForestClassifier(criterion="entropy",
                                   n_estimators=10,
                                   max_features=None)
    model.fit(X_train, y_train)
    print("RandomForest Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("RandomForest Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    model = LogisticRegression(C=1,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    # ------------------------------------------------------------------
    # predict occupation
    # ------------------------------------------------------------------
    y_occ = X['Occupation'].values.astype('int')
    # Work on a copy.  The features used to be produced with an in-place drop
    # on an alias of X, which removed User_ID/Product_ID/... from X itself
    # and made every later X.drop([... 'User_ID' ...]) raise KeyError.
    X_occ = X.drop([
        'User_ID', 'Product_ID', 'Occupation', 'Product_Category_1',
        'Product_Category_2', 'Product_Category_3'
    ], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_occ,
                                                        y_occ,
                                                        test_size=0.2)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)
    tr_error = np.mean(model.predict(X_train) != y_train)
    te_error = np.mean(model.predict(X_test) != y_test)
    print("Training error of predicting occupation: %.3f" % tr_error)
    print("Testing error of predicting occupation: %.3f" % te_error)

    tree = DecisionTreeClassifier(max_depth=18,
                                  criterion='entropy',
                                  random_state=1)
    tree.fit(X_train, y_train)
    tr_error = np.mean(tree.predict(X_train) != y_train)
    te_error = np.mean(tree.predict(X_test) != y_test)
    print("Training error of predicting occupation: %.3f" % tr_error)
    print("Testing error: %.3f" % te_error)

    model = LogisticRegression(C=10000,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    # ------------------------------------------------------------------
    # Isomap, 2 components
    # ------------------------------------------------------------------
    model = Isomap(n_components=2)
    X_1 = X.drop([
        'User_ID', 'Product_ID', 'Product_Category_2', 'Product_Category_3',
        'Purchase'
    ], axis=1)
    fig, ax = plt.subplots()
    Z = model.fit_transform(X_1[:10000])
    ax.scatter(Z[:, 0], Z[:, 1])
    plt.ylabel('z2')
    plt.xlabel('z1')
    plt.title('ISOMAP with 2components')
    fname = os.path.join("..", "figs", "ISOMAP_with_2_components.png")
    plt.savefig(fname)

    # cluster the 2-D embedding (both scatters are drawn on the figure above)
    model = DBSCAN(eps=1, min_samples=3)
    y = model.fit_predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet", s=5)

    model = KMeans(n_clusters=5, random_state=0)
    model.fit(Z)
    y = model.predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet")
    plt.ylabel('z2')
    plt.xlabel('z1')
    plt.title('ISOMAP with k_means of 5 clusters')
    # Save before showing: once show() returns the figure has been released
    # and savefig() would write an empty image.
    fname = os.path.join("..", "figs", "kmeans.png")
    plt.savefig(fname)
    plt.show()

    # ------------------------------------------------------------------
    # Isomap, 3 components
    # ------------------------------------------------------------------
    model = Isomap(n_components=3)
    Z = model.fit_transform(X_1[:5000])
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(Z[:, 0], Z[:, 1], Z[:, 2], c='b')
    ax.set_zlabel('z3')
    ax.set_ylabel('z2')
    ax.set_xlabel('z1')
    plt.title('ISOMAP with 3')
    fname = os.path.join("..", "figs", "ISOMAP_with_3_components.png")
    plt.savefig(fname)

    # ------------------------------------------------------------------
    # PCA, 3 components
    # ------------------------------------------------------------------
    X_1 = X.drop(
        ['User_ID', 'Product_ID', 'Product_Category_2', 'Product_Category_3'],
        axis=1)
    model = PCA(n_components=3, svd_solver='auto')
    Z = model.fit_transform(X_1[:10000])
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(Z[:, 0], Z[:, 1], Z[:, 2], c='r')
    ax.set_zlabel('z3')
    ax.set_ylabel('z2')
    ax.set_xlabel('z1')
    plt.title('PCA with 3 components')
    fname = os.path.join("..", "figs", "PCA.png")
    plt.savefig(fname)  # before show(), see above
    plt.show()
    print(model.explained_variance_ratio_)

    # ------------------------------------------------------------------
    # PCA, 2 components
    # ------------------------------------------------------------------
    model = PCA(n_components=2, svd_solver='auto')
    Z = model.fit_transform(X_1[:100000])
    fig = plt.figure()
    plt.title('PCA with 2 components')
    plt.scatter(Z[:, 0], Z[:, 1], c='r', s=5)
    plt.ylabel('z2')
    plt.xlabel('z1')
    fname = os.path.join("..", "figs", "PCA_with_2_components.png")
    print(model.explained_variance_ratio_)
    plt.savefig(fname)

    # ------------------------------------------------------------------
    # clustering on a 2-component PCA of the first 1000 rows
    # ------------------------------------------------------------------
    X_1 = X.drop(['User_ID', 'Product_ID'], axis=1)
    model = PCA(n_components=2, svd_solver='auto')
    Z = model.fit_transform(X_1[:1000])
    # New figure, so the clusters are not drawn over the 100000-point PCA
    # scatter above.
    plt.figure()
    model = DBSCAN(eps=1, min_samples=3)
    y = model.fit_predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet", s=5)
    plt.ylabel('z2')
    plt.xlabel('z1')
    fname = os.path.join("..", "figs", "clustering_from_PCA.png")
    plt.savefig(fname)

    model = KMeans(n_clusters=4, random_state=0)
    model.fit(Z)
    y = model.predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet")
    plt.ylabel('z2')
    plt.xlabel('z1')
    # The title used to contain a "%d" placeholder that was never filled in.
    plt.title('PCA with k_means from 2 components')
    plt.show()
# Sweep the regularisation strength of the softmax classifier and plot the
# training and validation error for each value.
y_int = np.int32(y)
lams = [2, 1.75, 1.5, 1.25, 1]
test_error = []
train_error = []

# Prepend a constant column of ones to both design matrices.
train_bias = np.ones((X.shape[0], 1))
test_bias = np.ones((Xtest.shape[0], 1))
X = np.hstack((train_bias, X))
Xtest = np.hstack((test_bias, Xtest))

for lammy in lams:
    model = linear_model.softmaxClassifier(lammy=lammy,
                                           epochs=10,
                                           alpha=1,
                                           batch=5000)
    model.fit(X, y, Y)

    pred = model.predict(Xtest)
    e = utils.classification_error(ytest, pred)
    print("at lambda ", lammy, "validation error is ", e)
    test_error = np.append(test_error, e)

    pred = model.predict(X)
    e = utils.classification_error(y, pred)
    print("at lambda ", lammy, "train error is ", e)
    train_error = np.append(train_error, e)

plt.plot(lams, test_error, label="validation error")
plt.plot(lams, train_error, label="training error")
plt.title("Multi-Class Linear Classifier")
plt.xlabel("Lambda")
plt.ylabel("Error")
# Without a legend the two labelled curves cannot be told apart.
plt.legend()
fname = os.path.join("..", "figs", "linear.pdf")
plt.savefig(fname)
# The closing quote around the file name was missing.
print("\nFigure saved as '%s'" % fname)
parser = argparse.ArgumentParser() parser.add_argument("-q", "--question", required=True) io_args = parser.parse_args() question = io_args.question if question == "2": data = utils.load_dataset("logisticData") XBin, yBin = data["X"], data["y"] XBinValid, yBinValid = data["Xvalid"], data["yvalid"] model = linear_model.logReg(maxEvals=400) model.fit(XBin, yBin) print( "\nlogReg Training error %.3f" % utils.classification_error(model.predict(XBin), yBin) ) print( "logReg Validation error %.3f" % utils.classification_error(model.predict(XBinValid), yBinValid) ) print("# nonZeros: %d" % (model.w != 0).sum()) elif question == "2.1": data = utils.load_dataset("logisticData") XBin, yBin = data["X"], data["y"] XBinValid, yBinValid = data["Xvalid"], data["yvalid"] model = linear_model.logRegL2(lammy=1.0, maxEvals=400) model.fit(XBin, yBin)
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-q', '--question', required=True) io_args = parser.parse_args() question = io_args.question if question == "2": data = utils.load_dataset("logisticData") XBin, yBin = data['X'], data['y'] XBinValid, yBinValid = data['Xvalid'], data['yvalid'] model = linear_model.logReg(maxEvals=400) model.fit(XBin, yBin) print("\nlogReg Training error %.3f" % utils.classification_error(model.predict(XBin), yBin)) print("logReg Validation error %.3f" % utils.classification_error(model.predict(XBinValid), yBinValid)) print("# nonZeros: %d" % (model.w != 0).sum()) elif question == "2.1": data = utils.load_dataset("logisticData") XBin, yBin = data['X'], data['y'] XBinValid, yBinValid = data['Xvalid'], data['yvalid'] model = linear_model.logRegL2(maxEvals=400, l=1.0) model.fit(XBin, yBin) print("\nlogRegL2 Training error %.3f" % utils.classification_error(model.predict(XBin), yBin)) print("logRegL2 Validation error %.3f" %
io_args = parser.parse_args() question = io_args.question if question == '1.1': dataset = utils.load_dataset('citiesSmall') X = dataset['X'] y = dataset['y'] Xtest = dataset['Xtest'] ytest = dataset['ytest'] #model = knn.fit(X,y,3) #model = knn.fit(X,y,1) model = knn.fit(X, y, 10) y_pred_tr = knn.predict(model, X) y_pred_te = knn.predict(model, Xtest) trerror = utils.classification_error(y_pred_tr, y) teerror = utils.classification_error(y_pred_te, ytest) print(trerror) print(teerror) utils.plot_2dclassifier(model, Xtest, ytest) # part 1: implement knn.predict # part 2: print training and test errors for k=1,3,10 (use utils.classification_error) # part 3: plot classification boundaries for k=1 (use utils.plot_2dclassifier) if question == '1.2': dataset = utils.load_dataset('citiesBig1') X = dataset['X'] y = dataset['y']
reg_err = utils.classification_error(reg_pred, leave_out_data[1]) average = average + reg_err average = average / 5 return average lamerrs = [] lamclasserr = [] lamtesterr = [] for i in range(0, len(alllam)): logreg = LogisticRegression(alllam[i]) logreg.fit(X_train, y_train) testpred = logreg.predict(X_test) trainpred = logreg.predict(X_train) class_error = utils.classification_error(testpred, y_test) training_error = utils.classification_error(trainpred, y_train) lamclasserr.append(class_error) lamtesterr.append(training_error) lerr = lambdaError(alllam[i], folds) print("Training Error: ", training_error, "|Test Error: ", class_error, "|Cross Validation Error: ", lerr, "|Lambda: ", alllam[i]) lamerrs.append(lerr) print(lamerrs) training_cumulative = [] class_cumulative = [] datasize = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] for i in range(0, len(x_data)): logistic.fit(x_data[i], y_data[i])
# Command-line entry: -q/--question selects which experiment to run.
parser = argparse.ArgumentParser()
parser.add_argument('-q','--question', required = True)

io_args = parser.parse_args()
question = io_args.question

if question == "2":
    # Question 2: logistic regression via linear_model.logReg.
    data = utils.load_dataset("logisticData")
    XBin, yBin = data['X'], data['y']
    XBinValid, yBinValid = data['Xvalid'], data['yvalid']

    model = linear_model.logReg(maxEvals=400, verbose=1)
    model.fit(XBin,yBin)

    print("\nlogReg Training error %.3f" % utils.classification_error(model.predict(XBin), yBin))
    print("logReg Validation error %.3f" % utils.classification_error(model.predict(XBinValid), yBinValid))
    # Number of weights that are not exactly zero.
    print("# nonZeros: %d" % (model.w != 0).sum())

elif question == "2.1":
    # Question 2.1: same data, linear_model.logRegL2 with lammy=1.0.
    data = utils.load_dataset("logisticData")
    XBin, yBin = data['X'], data['y']
    XBinValid, yBinValid = data['Xvalid'], data['yvalid']

    # Fit logRegL2 model
    model = linear_model.logRegL2(lammy=1.0, maxEvals=400, verbose=1)
    model.fit(XBin,yBin)

    print("\nlogRegL2 Training error %.3f" % utils.classification_error(model.predict(XBin), yBin))
    print("logRegL2 Validation error %.3f" % utils.classification_error(model.predict(XBinValid), yBinValid))
    # Number of weights that are not exactly zero.
    print("# nonZeros: %d" % (model.w != 0).sum())
# Grid search over the first three mini-batch sizes and step sizes, scored by
# the cross-validation error averaged over the folds produced by ``kf``.
best_batch = 0
best_alpha = 0
for batch_size in minibatch[:3]:
    for step_size in alpha[:3]:
        val_error = []
        # cross validation
        for train, validate in kf.split(X, y):
            model.fit(X[train],
                      y[train],
                      epoch=100,
                      minibatch=batch_size,
                      alpha=step_size)
            # record validation error
            v_error = utils.classification_error(model.predict(X[validate]),
                                                 y[validate])
            val_error.append(v_error)
        avg_val_error = np.average(np.asarray(val_error))
        print("batch size: {0}, alpha: {1}, validation error: {2}".format(
            batch_size, step_size, avg_val_error))
        if avg_val_error < min_val_error:
            min_val_error = avg_val_error
            best_batch = batch_size
            best_alpha = step_size

# At this point ``model`` holds the weights of the LAST fold of the LAST grid
# cell, not of the winning configuration, so it is retrained on all training
# data with the selected hyper-parameters before the test error is reported.
# Skipped when no cell beat the initial min_val_error (best_batch is still
# the 0 placeholder).
# NOTE(review): assumes model.fit() starts from fresh weights on every call,
# as the cross-validation loop above already relies on -- confirm.
if best_batch:
    model.fit(X, y, epoch=100, minibatch=best_batch, alpha=best_alpha)
print("When batch size is {0}, alpha is {1}, test error is {2}".format(
    best_batch, best_alpha,
    utils.classification_error(model.predict(Xtest), ytest)))
## LOCAL MODELS model1 = logistic_model.logRegL1(XBin[0:cut1,:], yBin[0:cut1], verbose=0, lammy=1, maxEvals=400) model2 = logistic_model.logRegL1(XBin[cut1+1:cut2,:], yBin[cut1+1:cut2], verbose=0, lammy=1, maxEvals=400) model3 = logistic_model.logRegL1(XBin[cut2+1:cut3,:], yBin[cut2+1:cut3], verbose=0, lammy=1, maxEvals=400) model4 = logistic_model.logRegL1(XBin[cut3+1:cut4,:], yBin[cut3+1:cut4], verbose=0, lammy=1, maxEvals=400) model5 = logistic_model.logRegL1(XBin[cut4+1:cut5,:], yBin[cut4+1:cut5], verbose=0, lammy=1, maxEvals=400) model1.fit() model2.fit() model3.fit() model4.fit() model5.fit() print("model1 Training error %.3f" % utils.classification_error(model1.predict(XBin[0:cut1,:]), yBin[0:cut1])) print("model2 Training error %.3f" % utils.classification_error(model2.predict(XBin[cut1+1:cut2,:]), yBin[cut1+1:cut2])) print("model3 Training error %.3f" % utils.classification_error(model3.predict(XBin[cut2+1:cut3,:]), yBin[cut2+1:cut3])) print("model4 Training error %.3f" % utils.classification_error(model4.predict(XBin[cut3+1:cut4,:]), yBin[cut3+1:cut4])) print("model5 Training error %.3f" % utils.classification_error(model5.predict(XBin[cut4+1:cut5,:]), yBin[cut4+1:cut5])) print("model1 Validation error %.3f" % utils.classification_error(model1.predict(XBinValid), yBinValid)) print("model2 Validation error %.3f" % utils.classification_error(model2.predict(XBinValid), yBinValid)) print("model3 Validation error %.3f" % utils.classification_error(model3.predict(XBinValid), yBinValid))
def predict(self, X):
    """Return a 0/1 label for every row of ``X``.

    The sign of ``X @ self.w`` is evaluated in ``self.session``; entries
    equal to -1 become 0 and every other entry becomes 1.  The result is
    squeezed, so singleton dimensions are removed.
    """
    # NOTE(review): every call builds a new placeholder and matmul op; with
    # TF1 graph semantics this presumably grows the default graph -- confirm.
    features = tf.placeholder(tf.float32, shape=X.shape)
    signs = tf.sign(tf.matmul(features, self.w))
    raw = self.session.run(signs, feed_dict={features: X})
    labels = np.where(raw == -1, 0, 1)
    return np.squeeze(labels)


def compute_cost(self, X, y):
    """Return the logistic loss plus the configured penalty on ``self.w``.

    With ``self.loss == 'l2'`` the penalty is
    ``lammy * sum(w ** 2) / 2``; for any other value it is
    ``lammy * sum(|w|)``.
    """
    data_loss = tf.reduce_sum(tf.log(1 + tf.exp(- y * (X @ self.w))))
    if self.loss != 'l2':
        return data_loss + self.lammy * tf.reduce_sum(tf.abs(self.w))
    return data_loss + self.lammy * tf.reduce_sum(self.w ** 2) / 2


if __name__ == '__main__':
    X_train, X_test, y_train, y_test = utils.preprocess_heart()

    # TensorFlow model defined in this file
    with tf.Session() as sess:
        model = BinaryClassification(sess,
                                     verbose=1,
                                     loss='l2',
                                     learning_rate=0.00001,
                                     num_epochs=500)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        print("The test error is: ",
              utils.classification_error(y_test, pred))

    # scikit-learn SVM on the same split, for comparison
    model = svm.SVC()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print("The test error from sk-learn SVM is: ",
          utils.classification_error(y_test, pred))
model4 = logistic_model.logRegL2(XBin[cut3 + 1:cut4, :], yBin[cut3 + 1:cut4], lammy=0.1, verbose=0, maxEvals=400) model4.fit() model5 = logistic_model.logRegL2(XBin[cut4 + 1:cut5, :], yBin[cut4 + 1:cut5], lammy=0.1, verbose=0, maxEvals=400) model5.fit() print("model1 Validation error %.3f" % utils.classification_error(model1.predict(XBinValid), yBinValid)) print("model2 Validation error %.3f" % utils.classification_error(model2.predict(XBinValid), yBinValid)) print("model3 Validation error %.3f" % utils.classification_error(model3.predict(XBinValid), yBinValid)) print("model4 Validation error %.3f" % utils.classification_error(model4.predict(XBinValid), yBinValid)) print("model5 Validation error %.3f" % utils.classification_error(model5.predict(XBinValid), yBinValid)) clf = SGDClassifier(loss="hinge", penalty="l2") clf.fit(XBin, yBin) print("sklearn sgd validation error %.3f" % utils.classification_error(clf.predict(XBinValid), yBinValid)) svmclf = LinearSVC()