def main(train_path, eval_path, pred_path):
    """Problem 1(b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)

    # *** START CODE HERE ***
    # Train a logistic regression classifier
    # Plot decision boundary on top of validation set
    # Use np.savetxt to save predictions on eval set to pred_path
    initial_theta = np.zeros(x_train.shape[1])
    log_reg = LogisticRegression(step_size=0.2, max_iter=100, eps=1e-5,
                                 theta_0=initial_theta, verbose=True)
    log_reg.fit(x_train, y_train)

    prediction = log_reg.predict(x_eval)
    plot_path = pred_path + ".plot.png"
    util.plot(x_eval, y_eval, log_reg.theta, plot_path, correction=1.0)
    np.savetxt(pred_path, prediction)
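# NOTE: The `LogisticRegression` class used by these drivers is not defined in
# this section. Below is a minimal, illustrative Newton's-method sketch that is
# merely consistent with the constructor arguments seen above (step_size,
# max_iter, eps, theta_0, verbose); the actual implementation may differ.
import numpy as np

class LogisticRegression:
    """Logistic regression fit by Newton's method (illustrative sketch)."""

    def __init__(self, step_size=0.2, max_iter=100, eps=1e-5,
                 theta_0=None, verbose=True):
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.theta = theta_0
        self.verbose = verbose

    def fit(self, x, y):
        m, n = x.shape
        if self.theta is None:
            self.theta = np.zeros(n)
        for i in range(self.max_iter):
            h = 1 / (1 + np.exp(-x.dot(self.theta)))    # sigmoid(x theta)
            grad = x.T.dot(y - h) / m                   # gradient of log-likelihood
            hess = -(x.T * (h * (1 - h))).dot(x) / m    # Hessian of log-likelihood
            step = np.linalg.solve(hess, grad)          # Newton direction
            self.theta = self.theta - step
            if np.linalg.norm(step, 1) < self.eps:
                if self.verbose:
                    print('Converged after {} iterations'.format(i + 1))
                break
        return self.theta

    def predict(self, x):
        # Returns P(y = 1 | x) for intercept-augmented inputs
        return 1 / (1 + np.exp(-x.dot(self.theta)))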
def main(train_path, test_path):
    # Load the data, concatenating a second training file if one is given
    x_train, y_train = util.load_dataset(train_path[0])
    if len(train_path) == 2:
        x_train2, y_train2 = util.load_dataset(train_path[1])
        x_train = np.concatenate((x_train, x_train2), axis=0)
        y_train = np.concatenate((y_train, y_train2), axis=0)
    x_test, y_test = util.load_dataset(test_path)

    # # Drop the BERT entries, keeping only the last 13 feature columns
    # x_train = x_train[:, -13:]
    # x_test = x_test[:, -13:]

    # Define the logistic regression classifier
    clf = LogisticRegression().fit(x_train, y_train)

    # Report performance on the test set, then on the training set
    prediction = clf.predict(x_test)
    print(classification_report(y_test, prediction))
    prediction = clf.predict(x_train)
    print(classification_report(y_train, prediction))
def main(lr, train_path, eval_path, pred_path):
    """Problem 3(d): Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load training set
    # x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    pr = PoissonRegression(max_iter=10000)
    pr.step_size = lr
    pr.fit(x_train, y_train)

    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    y_pred = np.empty_like(y_eval)
    for i in range(len(x_eval)):
        y_pred[i] = pr.predict(x_eval[i])

    # np.savetxt(pred_path, np.column_stack((x_eval, y_pred)), delimiter=',')
    np.savetxt(pred_path, y_pred, delimiter=',')
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    # Fit a LWR model
    model = LocallyWeightedLinearRegression(tau=tau)
    model.fit(x_train, y_train)

    # Get MSE value on the validation set
    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_val)
    print('p5b mse: ', ((y_val - y_pred) ** 2).mean(axis=0))

    # Plot validation predictions on top of training set
    # (no need to save predictions; skip the intercept column when plotting)
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx', linewidth=2)
    plt.plot(x_val[:, 1:], y_pred, 'ro', linewidth=2)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.savefig('output/p05b.png')
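# NOTE: `LocallyWeightedLinearRegression` is not defined in this section either.
# A minimal non-parametric sketch, assuming intercept-augmented inputs and
# Gaussian weights with bandwidth tau (the real class may differ):
import numpy as np

class LocallyWeightedLinearRegression:
    """LWR sketch: solves a weighted normal equation per query point."""

    def __init__(self, tau):
        self.tau = tau
        self.x = None
        self.y = None

    def fit(self, x, y):
        # LWR is non-parametric: "fitting" just memorizes the training set
        self.x = x
        self.y = y

    def predict(self, x):
        m = x.shape[0]
        y_pred = np.zeros(m)
        for i in range(m):
            # Gaussian weights centered at the query point
            w = np.exp(-np.sum((self.x - x[i]) ** 2, axis=1)
                       / (2 * self.tau ** 2))
            W = np.diag(w)
            # Weighted normal equation: theta = (X^T W X)^{-1} X^T W y
            theta = np.linalg.solve(self.x.T @ W @ self.x,
                                    self.x.T @ W @ self.y)
            y_pred[i] = x[i] @ theta
        return y_pred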
def main(train_path, eval_path, pred_path):
    """Problem 1(b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # Initial guess of parameters (one per column, including the intercept)
    theta_0 = np.zeros(x_train.shape[1])

    # Get the model
    model = LogisticRegression(theta_0=theta_0)
    model.fit(x_train, y_train)

    # Predict using the trained model
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_eval)

    # Plot decision boundary on top of validation set
    util.plot(x_eval, y_eval, model.theta,
              'output/{ds}_log_reg.pdf'.format(ds=eval_path.split('/')[-1]))

    # Use np.savetxt to save predictions on eval set to pred_path
    np.savetxt(pred_path, y_pred)
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier
    clf = GDA()
    clf.fit(x_train, y_train)

    # Plot decision boundary on validation set
    x_test, y_test = util.load_dataset(valid_path, add_intercept=True)
    y_test_pred = clf.predict(x_test[:, 1:])
    # plot_decision_boundary(x_test, y_test, clf.theta, save_path)
    plot(x_test, y_test, clf.theta, os.path.splitext(save_path)[0] + '_fig.png')

    # Use np.savetxt to save predictions on eval set to save_path
    np.savetxt(save_path, y_test_pred)

    base, ext = os.path.splitext(save_path)
    theta_save_path = base + '_theta' + ext
    np.savetxt(theta_save_path, clf.theta)
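# NOTE: The `GDA` class is also not shown in this section. A minimal sketch,
# assuming binary labels and inputs WITHOUT an intercept column at fit time
# (the snippets here differ in whether theta_0 is stored separately or packed
# into theta; this sketch packs it into theta[0]):
import numpy as np

class GDA:
    """GDA sketch: class-conditional Gaussians with a shared covariance."""

    def __init__(self, verbose=False):
        self.theta = None
        self.verbose = verbose

    def fit(self, x, y):
        m, n = x.shape
        phi = np.mean(y)
        mu_0 = x[y == 0].mean(axis=0)
        mu_1 = x[y == 1].mean(axis=0)
        # Shared covariance of the class-centered data
        centered = x - np.where(y[:, None] == 1, mu_1, mu_0)
        sigma = centered.T @ centered / m
        sigma_inv = np.linalg.inv(sigma)
        # Posterior p(y=1|x) = sigmoid(theta^T x + theta_0)
        theta = sigma_inv @ (mu_1 - mu_0)
        theta_0 = 0.5 * (mu_0 @ sigma_inv @ mu_0
                         - mu_1 @ sigma_inv @ mu_1) + np.log(phi / (1 - phi))
        self.theta = np.hstack([theta_0, theta])
        return self.theta

    def predict(self, x):
        # Expects x without the intercept column
        z = x @ self.theta[1:] + self.theta[0]
        return 1 / (1 + np.exp(-z))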
def main(train_path, eval_path, pred_path, k=0):
    """Problem 1(b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)

    # *** START CODE HERE ***
    # Train a logistic regression classifier
    # Plot decision boundary on top of validation set
    # Use np.savetxt to save predictions on eval set to pred_path
    clf = LogisticRegression()
    theta = clf.fit(x_train, y_train)
    p = clf.predict(x_eval)
    if k == 0:
        np.savetxt(pred_path, p, delimiter=',')
        sp = 'output/p01b_plot'
        util.plot(x_eval, y_eval, theta, sp)
    elif k == 1:
        # Threshold the probabilities at 0.5 to get hard labels
        p[p < 0.5] = 0
        p[p >= 0.5] = 1
    return theta, p
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier
    clf = GDA()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_eval)

    # Plot decision boundary on validation set
    theta_ = np.insert(clf.theta, 0, clf.theta_zero)
    # str.strip removes a *set of characters* from both ends, not a suffix,
    # so drop the '.txt' extension by slicing instead
    save_path_ = save_path[:-4]
    util.plot(x_eval, y_eval, theta_, save_path_)

    # Use np.savetxt to save outputs from validation set to save_path
    np.savetxt(save_path, preds)
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    model = LocallyWeightedLinearRegression(tau=tau)
    model.fit(x_train, y_train)

    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_eval)
    mse = np.mean((y_pred - y_eval) ** 2)
    print(f'MSE={mse}')

    # Plot (skipping the intercept column)
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx', linewidth=2)
    plt.plot(x_eval[:, 1:], y_pred, 'ro', linewidth=2)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.savefig('output/p05b.png')
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # Get the model
    model = GDA()
    model.fit(x_train, y_train)

    # Predict using the trained model
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=False)
    y_pred = model.predict(x_eval)

    # Plot decision boundary on top of validation set
    theta = list(model.theta)
    theta_0 = [model.theta_0]
    util.plot(x_eval, y_eval, theta_0 + theta,
              'output/{ds}_GDA.pdf'.format(ds=eval_path.split('/')[-1]))

    # Use np.savetxt to save predictions on eval set to pred_path
    np.savetxt(pred_path, y_pred)
def main(lr, train_path, eval_path, save_path):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    # Fit a Poisson Regression model
    model = PoissonRegression(step_size=lr)
    model.fit(x_train, y_train)

    # Run on the validation set, and use np.savetxt to save outputs to save_path
    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    pred_val = model.predict(x_val)
    np.savetxt(save_path, pred_val)

    # Plot the result
    plt.scatter(x=y_val, y=pred_val, label="Predictions")
    plt.xlabel("True Count")
    plt.ylabel("Predicted Expected Count")
    l = np.array([min(y_val), max(y_val)])
    plt.plot(l, l, alpha=0.6, color="red", label="45-degree Line")
    plt.legend()
    image_path = save_path[:-3] + "png"
    plt.savefig(image_path)
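# NOTE: `PoissonRegression` is referenced throughout but never defined here.
# A minimal sketch of the canonical GLM trained by batch gradient ascent,
# consistent with the constructor arguments seen in these drivers (step_size,
# max_iter, theta_0, verbose); the actual class may differ:
import numpy as np

class PoissonRegression:
    """Poisson regression sketch: response exp(theta^T x), gradient ascent."""

    def __init__(self, step_size=1e-5, max_iter=10000, eps=1e-5,
                 theta_0=None, verbose=False):
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.theta = theta_0
        self.verbose = verbose

    def fit(self, x, y):
        m, n = x.shape
        if self.theta is None:
            self.theta = np.zeros(n)
        for i in range(self.max_iter):
            # Gradient of the log-likelihood: x^T (y - exp(x theta)) / m
            grad = x.T.dot(y - np.exp(x.dot(self.theta))) / m
            step = self.step_size * grad
            self.theta = self.theta + step
            if np.linalg.norm(step, 1) < self.eps:
                if self.verbose:
                    print('Converged after {} iterations'.format(i + 1))
                break

    def predict(self, x):
        # Expected count E[y|x] = exp(theta^T x)
        return np.exp(x.dot(self.theta))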
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)

    # *** START CODE HERE ***
    # Train a GDA classifier
    # Plot decision boundary on validation set
    # Use np.savetxt to save outputs from validation set to pred_path
    gda = GDA(verbose=True)
    gda.fit(x_train, y_train)

    prediction = gda.predict(x_eval)
    plot_path = pred_path + ".plot.png"
    util.plot(x_eval, y_eval, gda.theta, plot_path, correction=1.0)
    np.savetxt(pred_path, prediction)
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    # Use the tau that was passed in rather than a hard-coded bandwidth
    LWR = LocallyWeightedLinearRegression(tau)
    LWR.fit(x_train, y_train)

    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    y_pred = LWR.predict(x_val)

    # Plot
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx')
    plt.plot(x_val[:, 1:], y_pred, 'ro')
    plt.show()

    mse = ((y_pred - y_val) ** 2).mean()
    print(mse)
def main(tau_values, train_path, valid_path, test_path, pred_path):
    """Problem 5(b): Tune the bandwidth parameter tau for LWR.

    Args:
        tau_values: List of tau values to try.
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)

    # *** START CODE HERE ***
    # Search tau_values for the best tau (lowest MSE on the validation set)
    MSE_values = []
    model_list = []
    for tau in tau_values:
        clf = LocallyWeightedLinearRegression(tau)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_eval)
        MSE = np.linalg.norm(y_pred - y_eval) ** 2 / y_eval.shape[0]
        print("tau {}, MSE {}".format(tau, MSE))
        MSE_values.append(MSE)
        model_list.append(clf)
        plot_lwr(x_train, y_train, x_eval, y_pred, "output/tau_{}.png".format(tau))

    # Evaluate the LWR model with the best tau value on the test split
    idx = np.argmin(MSE_values)
    best_model, best_tau = model_list[idx], tau_values[idx]
    y_test_pred = best_model.predict(x_test)
    test_MSE = np.linalg.norm(y_test_pred - y_test) ** 2 / y_test.shape[0]
    print("best tau {}, MSE on the test split {}".format(best_tau, test_MSE))
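# NOTE: `plot_lwr` is called above but not defined in this section. A plausible
# helper, assuming the intercept-augmented inputs used throughout (the name and
# signature are taken from the call site; the body is an assumption):
import matplotlib.pyplot as plt

def plot_lwr(x_train, y_train, x_eval, y_pred, save_path):
    """Overlay LWR validation predictions on the training data."""
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx', label='train')
    plt.plot(x_eval[:, 1:], y_pred, 'ro', label='prediction')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.savefig(save_path)
    plt.close()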
def load_model_input(args, fold):
    logging.info("Loading preprocessed {} dataset".format(fold))
    path = get_model_path(args)
    X = load_dataset(path / "X_{}.json".format(fold))
    Y = load_dataset(path / "Y_{}.json".format(fold))
    return X, np.array(Y)
def parallel(self):
    X_train, y_train = load_dataset(self.train_set)
    X_test, y_test = load_dataset(self.test_set)
    rows = X_train.shape[0]
    hor_X, hor_y = horizontal_split_data(X_train, y_train, self.part)
    lambda_accuracy = np.zeros(self.part)
    weights = np.zeros(self.part)
    for i in self.number_of_learners:
        for j in self.height:
            predict_of_test = np.zeros(y_test.shape[0])
            adaboost_set = []
            # Train one differentially private AdaBoost instance per partition
            for k in hor_X:
                lambda_accuracy[k], adaboost_instance = self.AdaBoost_DP(
                    hor_X[k], hor_y[k], self.privacy_epsilon[k], j, i, rows)
                adaboost_set.append(adaboost_instance)
            weights = get_weight(lambda_accuracy)
            for k in range(self.part):
                predict_of_test += weights[k] * adaboost_set[k].predict(X_test)
            predict_of_test = [0.0 if predict <= 0.5 else 1.0
                               for predict in predict_of_test]
            # The scores are floats, so convert to str before writing
            with open("adaboostdp_output.txt", 'a') as outputFile:
                outputFile.write(str(f1_score(y_test, predict_of_test,
                                              average="micro")) + '\n')
                outputFile.write(str(roc_auc_score(y_test, predict_of_test)) + '\n')
def transforming_stuff(train_path, valid_path, save_path):
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_train2 = np.stack([x_train[:, 0], np.log(x_train[:, 1])]).T

    # *** START CODE HERE ***
    # Train a GDA classifier on the transformed features
    clf = GDA()
    clf.fit(x_train2, y_train)

    # Plot decision boundary on validation set
    x_test, y_test = util.load_dataset(valid_path, add_intercept=True)
    y_test_pred = clf.predict(x_test[:, 1:])

    x, y = x_test, y_test
    plt.figure()
    plt.plot(x[y == 0, -2], x[y == 0, -1], 'bx', linewidth=2)
    plt.plot(x[y == 1, -2], x[y == 1, -1], 'go', linewidth=2)
    x1_min, x1_max = x[:, -2].min(), x[:, -2].max()
    x_pts = np.arange(x1_min, x1_max, (x1_max - x1_min) / 100)
    theta0, theta1, theta2 = clf.theta
    # Boundary theta0 + theta1*x1 + theta2*log(x2) = 0
    # solves to x2 = exp(-(theta0 + theta1*x1) / theta2)
    y_pts = np.exp((-1 / theta2) * (theta1 * x_pts + theta0))
    plt.plot(x_pts, y_pts)
    plt.xlabel('x1')
    plt.ylabel('x2')
    if save_path:
        plt.savefig(os.path.splitext(save_path)[0] + '_fig.png')

    # Use np.savetxt to save predictions on eval set to save_path
    np.savetxt(save_path, y_test_pred)
    base, ext = os.path.splitext(save_path)
    theta_save_path = base + '_theta' + ext
    np.savetxt(theta_save_path, clf.theta)
def main(train_path, valid_path, save_path):
    """Problem: Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=False)
    x_train = x_train[:, 1:]
    x_valid = x_valid[:, 1:]

    # Normalize the data (skip the binary feature in the last column)
    x_train[:, :-1] = (x_train[:, :-1] - np.mean(x_train[:, :-1], axis=0)) \
        / np.std(x_train[:, :-1], axis=0)
    x_valid[:, :-1] = (x_valid[:, :-1] - np.mean(x_valid[:, :-1], axis=0)) \
        / np.std(x_valid[:, :-1], axis=0)

    # Add intercept for logistic regression
    x_train = util.add_intercept(x_train)
    x_valid = util.add_intercept(x_valid)

    clf = logistic.LogisticRegression(step_size=1, max_iter=100000000)
    clf.fit(x_train, y_train)
    y_pred_prob = clf.predict(x_valid)
    y_pred = y_pred_prob.round()
    print(classification_report(y_valid, y_pred))
    print(confusion_matrix(y_valid, y_pred))
    print(np.sum(y_valid))
    np.savetxt(save_path, y_pred)
def main(train_path, valid_path, save_path):
    """Problem: Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    # Train a logistic regression classifier
    # Plot decision boundary on top of validation set
    # Use np.savetxt to save predictions on eval set to save_path
    model = LogisticRegression()
    model.fit(x_train, y_train)

    x_val, y_val = util.load_dataset(valid_path, add_intercept=True)
    util.plot(x_val, y_val, model.theta,
              save_path=save_path.replace(".txt", ".jpg"))
    yhat = model.predict(x_val)
    np.savetxt(save_path, yhat)
    print(f"LogReg acc: {util.compute_accuracy(y_val, yhat)}")
    print(f"LogReg log loss: {util.compute_log_loss(y_val, yhat)}")
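# NOTE: `util.compute_accuracy` and `util.compute_log_loss` are referenced here
# and in the GDA driver below but are not shown. Hypothetical one-liners
# consistent with how they are called (these are assumptions, not the actual
# util module):
import numpy as np

def compute_accuracy(y_true, y_prob, threshold=0.5):
    # Fraction of thresholded predictions that match the labels
    return np.mean((y_prob >= threshold) == y_true)

def compute_log_loss(y_true, y_prob, eps=1e-12):
    # Binary cross-entropy, with clipping to avoid log(0)
    p = np.clip(y_prob, eps, 1 - eps)
    return -np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))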
def main(train_path, test_path):
    # Load the data, concatenating a second training file if one is given
    x_train, y_train = util.load_dataset(train_path[0])
    if len(train_path) == 2:
        x_train2, y_train2 = util.load_dataset(train_path[1])
        x_train = np.concatenate((x_train, x_train2), axis=0)
        y_train = np.concatenate((y_train, y_train2), axis=0)
    x_test, y_test = util.load_dataset(test_path)

    # Hidden-layer sizes and number of training epochs
    h1 = 300
    h2 = 70
    h3 = 12
    h4 = 8
    epoch = 100

    # acc_train_all, acc_test_all = simple_nn_all(x_train, y_train, x_test, y_test, h1, h2, h3, h4, epoch)
    # acc_train_orig, acc_test_orig = simple_nn_orig(x_train, y_train, x_test, y_test, 10, 10, 10, epoch)
    acc_train, acc_test = keras_cat_nn(x_train, y_train, x_test, y_test,
                                       h1, h2, h3, h4, epoch, feature=False)
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    # Fit a LWR model using the tau that was passed in
    model = LocallyWeightedLinearRegression(tau)
    model.fit(x_train, y_train)

    # Get MSE value on the validation set
    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_val)
    mse = ((y_pred - y_val) ** 2).mean()
    print(mse)

    # Plot validation predictions on top of training set
    # (no need to save anything; skip the intercept column when plotting)
    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx')
    plt.plot(x_val[:, 1:], y_pred, 'ro')
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier
    # Plot decision boundary on validation set
    # Use np.savetxt to save outputs from validation set to save_path
    model = GDA()
    model.fit(x_train, y_train)

    x_val, y_val = util.load_dataset(valid_path, add_intercept=False)
    util.plot(x_val, y_val, model.theta,
              save_path=save_path.replace(".txt", ".jpg"))
    yhat = model.predict(x_val)
    np.savetxt(save_path, yhat)
    print(f"GDA acc: {util.compute_accuracy(y_val, yhat)}")
    print(f"GDA log loss: {util.compute_log_loss(y_val, yhat)}")
def run_trial_lms(n):
    """Run one LMS training trial of logistic regression.

    Args:
        n: Index into the list of learning rates to try.
    """
    rates = [0.05, 0.01, 0.05, 0.1, 0.5, 1]
    train_path = 'ds1_train.csv'
    valid_path = 'ds1_valid.csv'
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    # Train a logistic regression classifier with the LMS update rule
    LR = LogisticRegression(step_size=rates[n])
    it = LR.lms_fit(x_train, y_train)

    # Evaluate on the validation set
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    ac = LR.predict(x_valid, y_valid)
    return n, it, ac
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # *** START CODE HERE ***
    # Train a GDA classifier
    # NOTE: GDA drops the x0 = 1 convention used in regression examples;
    # we need to account for this to write the boundary in terms of theta
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=False)
    model = GDA()
    model.fit(x_train, y_train)
    predictions = model.predict(x_eval)
    np.savetxt(pred_path, predictions)

    # Train a logistic regression classifier for comparison
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    model2 = LogisticRegression()
    model2.fit(x_train, y_train)

    # Plot both decision boundaries on the validation set
    thetas = [model.theta, model2.theta]
    fig_path = pred_path[:-4] + "_fig.jpg"
    colours = ["red", "orange"]
    title = "LogisticReg (Orange) vs. GDA (Red)"
    util.plot_multiple(x_eval, y_eval, thetas, colours, fig_path, title=title)
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier
    model = GDA()

    # Fit model to the training data; defines theta
    model.fit(x_train, y_train)

    # Read validation set
    x_val, y_val = util.load_dataset(valid_path, add_intercept=True)

    # Save predictions to save_path
    np.savetxt(save_path, model.predict(x_val))

    # Plot boundaries
    util.plot(x_val, y_val, model.theta, save_path[:-4])
def main(lr, train_path, eval_path, save_path):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(eval_path, add_intercept=True)

    # Train Poisson model
    reg = PoissonRegression(step_size=lr)
    reg.fit(x_train, y_train)
    preds = reg.predict(x_valid)
    np.savetxt(save_path, preds)

    # Plot predictions against true counts
    plt.scatter(y_valid, preds)
    plt.xlabel('True count')
    plt.ylabel('Predicted count')
    plt.axis('equal')
    plt.savefig('poisson.jpg')
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier
    # Plot decision boundary on validation set
    # Use np.savetxt to save outputs from validation set to save_path
    x_val, y_val = util.load_dataset(valid_path, add_intercept=False)

    # Uncomment to normalize the training and validation sets,
    # which can improve the GDA performance:
    # x_train = (x_train - np.mean(x_train, axis=0)) / np.std(x_train, axis=0)
    # x_val = (x_val - np.mean(x_val, axis=0)) / np.std(x_val, axis=0)
    # x_train = (x_train - np.min(x_train, axis=0)) / (np.max(x_train, axis=0) - np.min(x_train, axis=0))
    # x_val = (x_val - np.min(x_val, axis=0)) / (np.max(x_val, axis=0) - np.min(x_val, axis=0))

    clf = GDA()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)
    np.savetxt(save_path, y_predict)

    # Plot the predicted labels, then the real distribution
    util.plot(x_val, (y_predict >= 0.5), clf.theta,
              save_path[:-4] + "validation_expected")
    util.plot(x_val, y_val, clf.theta, save_path[:-4] + "validation_real")
def main(lr, train_path, eval_path, pred_path):
    """Problem 3(d): Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load training and evaluation sets
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=False)

    # *** START CODE HERE ***
    # Fit a Poisson Regression model
    # Run on the validation set, and use np.savetxt to save outputs to pred_path
    initial_theta = np.zeros(x_train.shape[1])
    # Took 2008 iterations to converge using the given learning rate and the
    # default epsilon value
    poisson_reg = PoissonRegression(step_size=lr, max_iter=10000,
                                    theta_0=initial_theta, verbose=True)
    poisson_reg.fit(x_train, y_train)

    prediction = poisson_reg.predict(x_eval)
    np.savetxt(pred_path, prediction)

    # Compare prediction and y_eval; plot adapted from:
    # https://scikit-learn.org/0.16/auto_examples/plot_cv_predict.html
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.scatter(y_eval, prediction)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    fig.savefig(pred_path + ".comparison.png")
def main(lr, train_path, eval_path, save_path, plot_path):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
        plot_path: Path to save the train/eval diagnostic plots.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # *** START CODE HERE ***
    # Fit a Poisson Regression model
    # Run on the validation set, and use np.savetxt to save outputs to save_path
    clf = PoissonRegression(step_size=lr)
    clf.fit(x_train, y_train)

    # Evaluation
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    pred_eval = clf.predict(x_eval)
    pred_train = clf.predict(x_train)
    np.savetxt(save_path, pred_eval)

    # Plot
    plot_path_train = plot_path.replace(".png", "_train.png")
    plot_path_eval = plot_path.replace(".png", "_eval.png")
    util.plot_poisson(y_train, pred_train, plot_path_train)
    util.plot_poisson(y_eval, pred_eval, plot_path_eval)
def main():
    # Load dataset and flatten the n x n images into vectors
    x_train, y_train = util.load_dataset(training_mode=True)
    m = x_train.shape[0]
    n = x_train.shape[1]
    x_train = x_train.reshape(m, n * n)

    x_test, y_test = util.load_dataset(training_mode=False)
    m = x_test.shape[0]
    n = x_test.shape[1]
    x_test = x_test.reshape(m, n * n)

    # Scale pixel values to [0, 1]
    x_train = x_train / 255.0
    x_test = x_test / 255.0

    # Build SVM model
    svm = SVC(C=5, gamma=0.05)

    start_time = dt.datetime.now()
    svm.fit(x_train, y_train)
    end_time = dt.datetime.now()
    elapsed_time = end_time - start_time
    print('Elapsed learning {}'.format(str(elapsed_time)))

    # Predict
    expected = y_test
    predicted = svm.predict(x_test)

    # Confusion matrix
    cm = metrics.confusion_matrix(expected, predicted)
    print("Confusion matrix:\n%s" % cm)
    plot_confusion_matrix(cm)

    # Get accuracy
    print("Training Accuracy={}".format(metrics.accuracy_score(y_train, svm.predict(x_train))))
    print("Testing Accuracy={}".format(metrics.accuracy_score(y_test, svm.predict(x_test))))
def registry(filename, nf, ptitle, kfstart=2, kfend=5, kstart=1, kend=5):
    '''Starts the project. For each fold it calculates the mean accuracy and
    standard deviation and plots the corresponding graph.
    '''
    dataset = load_dataset(filename)
    kf_accuracy = []
    for kf in range(kfstart, kfend + 1):
        kf_accuracy.append(get_Allknn_acc_for_kfold(dataset, kf, kstart, kend, nf))
    kf_mean_acc = [sum(acclist) / len(acclist) for acclist in kf_accuracy]
    sd = [numpy.std(acclist) for acclist in kf_accuracy]
    for kf, acclist in zip(range(kfstart, kfend + 1), kf_accuracy):
        print(kf, "fold validation ===> accuracy of", sum(acclist) / len(acclist))
    # print(kf_mean_acc)
    mean_sd = sum(sd) / len(sd)
    mean_acc = sum(kf_mean_acc) / len(kf_mean_acc)
    print("Mean accuracy : ", mean_acc)
    print("Mean S.D : ", mean_sd)
    plot_graph(kf_accuracy, kstart, kend, sd, ptitle)
PROJ_GRAD = False    # Should we project the gradient onto the tangent space to the Stiefel manifold (orthogonal matrices)?
RETRACT = False      # Should we do a retraction step?
THRESHOLD = 0        # Error threshold at which we do the retraction step
GAIN = 1             # A multiplicative constant applied to all orthogonal matrices
RETRACT_SKIP = 1     # How many batches to wait before we do a retraction

opt_methods_set = ['SGD', 'ADAM']
OPT_METHOD = opt_methods_set[0]
algorithm = {'ORT_INIT': ORT_INIT, 'PROJ_GRAD': PROJ_GRAD, 'RETRACT': RETRACT,
             'THRESHOLD': THRESHOLD, 'GAIN': GAIN, 'RETRACT_SKIP': RETRACT_SKIP,
             'OPT_METHOD': OPT_METHOD}
params = {'network': network, 'training': training, 'algorithm': algorithm}
DO_SAVE = True  # Should we save results?
save_file_name = get_file_name(params)

#%% Initialize network model
data, vocab, data_ranges = load_dataset(DATASET)

# Define a list of parameters to orthogonalize (recurrent connectivities)
param2orthogonlize = []

# The number of features is the number of different letters + 1 unknown letter
FEATURES_NUM = len(vocab) + 1

# Construct network
# Input layer: the input has SEQUENCE_LENGTH - 1 elements, since we throw away
# the last character (it is only predicted, in the output)
l_in = lasagne.layers.InputLayer(
    (BATCH_SIZE, SEQUENCE_LENGTH - 1, FEATURES_NUM))

layers_to_concat = []

# All recurrent layers
for dd in range(DEPTH):
    if ORT_INIT:
        W_in_to_hid_init = lasagne.init.Orthogonal(gain=GAIN)