def main(train_path, valid_path, save_path):
    """Problem: Logistic regression with Newton's Method.

    Loads the training and validation CSVs, drops the first feature column,
    standardizes every remaining column except the last one, fits a logistic
    regression model and reports validation metrics.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path where the rounded validation predictions are written
            with np.savetxt().
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=False)

    # Drop the first column of both feature matrices.
    x_train = x_train[:, 1:]
    x_valid = x_valid[:, 1:]

    # Standardize all columns except the last one (the original comment
    # describes the skipped column as a binary feature).
    # The statistics come from the TRAINING split only and are reused for the
    # validation split, so both splits live in the same feature space and no
    # validation information leaks into preprocessing.
    mean = np.mean(x_train[:, :-1], axis=0)
    std = np.std(x_train[:, :-1], axis=0)
    x_train[:, :-1] = (x_train[:, :-1] - mean) / std
    x_valid[:, :-1] = (x_valid[:, :-1] - mean) / std

    # Add intercept for logistic regression.
    x_train = util.add_intercept(x_train)
    x_valid = util.add_intercept(x_valid)

    clf = logistic.LogisticRegression(step_size=1, max_iter=100000000)
    clf.fit(x_train, y_train)

    y_pred_prob = clf.predict(x_valid)
    y_pred = y_pred_prob.round()

    print(classification_report(y_valid, y_pred))
    print(confusion_matrix(y_valid, y_pred))
    print(np.sum(y_valid))

    # NOTE: the rounded predictions (not the raw probabilities) are saved.
    np.savetxt(save_path, y_pred)
def backward_selection(x_train, y_train, x_valid, y_valid, remove_all=True):
    """Wrapper feature selection by backward elimination.

    Starting from the full feature set, each iteration refits a logistic
    regression model once per remaining feature (with that feature left out),
    scores it by validation accuracy, and removes the feature(s) whose
    removal gives the highest accuracy.

    Args:
        x_train: Training inputs, indexed as x_train[:, columns].
        y_train: Training labels.
        x_valid: Validation inputs with the same columns as x_train.
        y_valid: Validation labels.
        remove_all: If True (default, the previous hard-coded behavior),
            every feature tied for the best accuracy is removed in the same
            iteration. If False, exactly one feature is removed per
            iteration, with ties broken by random.choice.

    Returns:
        Tuple (remove_list, score_all, index):
            remove_list: feature indices in the order they were removed.
            score_all: best validation accuracy recorded for each removal.
            index: list of length d; entry j is the iteration number in
                which the j-th removal happened.
    """
    d = x_train.shape[1]  # number of features

    remove_list = []
    F_list = list(range(d))  # features still in the model (kept ascending)
    score_all = []
    index = list(range(d))
    i = 0  # iteration counter

    while F_list:
        i += 1
        remove_f = []  # candidate feature removed in each trial
        score_f = []   # validation accuracy of each trial
        for k in F_list:
            remove_f.append(k)
            f = [j for j in F_list if j != k]

            # Add intercept for logistic regression.
            x_train_f = util.add_intercept(x_train[:, f])
            x_valid_f = util.add_intercept(x_valid[:, f])

            clf = logistic.LogisticRegression(step_size=1,
                                              max_iter=100000000,
                                              verbose=False)
            clf.fit(x_train_f, y_train)
            y_pred_f = clf.predict(x_valid_f).round()
            f_accuracy = np.mean(y_pred_f == y_valid)
            score_f.append(f_accuracy)
            print('Acc = %.6f' % (f_accuracy), f)

        best_score = np.amax(score_f)
        # Positions of every trial that reached the best accuracy.
        best_f_index = np.argwhere(
            np.asarray(score_f) == best_score).flatten().tolist()

        if remove_all:
            for f_index in best_f_index:
                best_f = remove_f[f_index]
                remove_list.append(best_f)
                F_list.remove(best_f)
                score_all.append(best_score)
                index[len(remove_list) - 1] = i
                print('')
                print('Acc_best = %.6f' % (best_score), F_list)
                print('')
        else:
            if len(best_f_index) == 1:
                f_index = best_f_index[0]
            else:
                # More than one best choice: pick one at random.
                f_index = random.choice(best_f_index)
            best_f = remove_f[f_index]
            remove_list.append(best_f)
            F_list.remove(best_f)
            score_all.append(best_score)
            index[len(remove_list) - 1] = i
            print('Acc_best = %.6f' % (best_score), F_list)

    return remove_list, score_all, index
def fit(self, x, y):
    """Run Newton's Method to minimize J(theta) for logistic regression.

    Iterates theta := theta - H^{-1} G until the L1 norm of the update is
    below self.eps. The result is stored in self.theta; nothing is returned.

    Args:
        x: Training example inputs. Shape (m, n).
        y: Training example labels. Shape (m,).
    """
    if self.intercept is True:
        x = util.add_intercept(x)

    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    m, n = x.shape

    # Initialize theta.
    if self.theta is None:
        self.theta = np.zeros(n)

    # Optimize theta.
    while True:
        theta = self.theta
        h = sigmoid(x.dot(theta))

        # Gradient of the average negative log-likelihood.
        grad = -(1 / m) * (y - h).dot(x)

        # Hessian: (1/m) * X^T diag(h * (1 - h)) X. Each example is weighted
        # by its own h_i * (1 - h_i); a single scalar factor on X^T X is not
        # the Hessian of the logistic loss.
        hess = (x.T * (h * (1 - h))).dot(x) / m

        # Solve H * step = G instead of forming the explicit inverse.
        self.theta = theta - np.linalg.solve(hess, grad)

        # If the update is small, terminate.
        if np.linalg.norm(self.theta - theta, ord=1) < self.eps:
            break
def forward_selection(x_train, y_train, x_valid, y_valid):
    """Wrapper feature selection by forward search.

    Starting from an empty feature set, each round refits a logistic
    regression model once per unused feature (added to the current set),
    scores it by validation accuracy, and keeps the feature that gives the
    highest accuracy. Ties are broken with random.choice.

    Args:
        x_train: Training inputs, indexed as x_train[:, columns].
        y_train: Training labels.
        x_valid: Validation inputs with the same columns as x_train.
        y_valid: Validation labels.

    Returns:
        Tuple (F_best, F_list, score_all):
            F_best: prefix of F_list ending at the first round that reached
                the overall best accuracy.
            F_list: all feature indices in the order they were added.
            score_all: best validation accuracy of each round.
    """
    d = x_train.shape[1]  # number of features

    F_list = []
    score_all = []
    for _ in range(d):
        add_f = []    # candidate feature added in each trial
        score_f = []  # validation accuracy of each trial
        for k in range(d):
            if k in F_list:
                continue
            add_f.append(k)
            f = F_list + [k]

            # Add intercept for logistic regression.
            x_train_f = util.add_intercept(x_train[:, f])
            x_valid_f = util.add_intercept(x_valid[:, f])

            clf = logistic.LogisticRegression(step_size=1,
                                              max_iter=100000000,
                                              verbose=False)
            clf.fit(x_train_f, y_train)
            y_pred_f = clf.predict(x_valid_f).round()
            f_accuracy = np.mean(y_pred_f == y_valid)
            score_f.append(f_accuracy)
            print(f, f_accuracy)
        print(score_f)

        best_score = np.max(score_f)
        ties = np.argwhere(
            np.asarray(score_f) == best_score).flatten().tolist()
        if len(ties) == 1:
            best_f_index = ties[0]
        else:
            # More than one best choice: pick one at random.
            best_f_index = random.choice(ties)

        best_f = add_f[best_f_index]
        F_list.append(best_f)
        score_all.append(best_score)
        print('%.8f' % (best_score), F_list)
        # The former `remove_list.append(best_f)` was dropped here:
        # `remove_list` is never defined in this function, so the call raised
        # NameError at the end of the first round.

    # With zero features there is nothing to rank (np.argmax would raise).
    if not score_all:
        return [], F_list, score_all

    best_score_index = np.argmax(score_all)
    F_best = F_list[:int(best_score_index) + 1]
    return F_best, F_list, score_all
def predict(self, x):
    """Classify inputs by the sign of the linear score.

    An intercept column is added to x via util.add_intercept before the
    score x . theta is computed.

    Args:
        x: Inputs of shape (m, n).

    Returns:
        Boolean outputs of shape (m,): True where the linear score is >= 0.
    """
    # *** START CODE HERE ***
    design = util.add_intercept(x)
    scores = design.dot(self.theta)
    return scores >= 0
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Loads the train/eval splits, standardizes the features, fits a locally
    weighted linear regression model and prints the evaluation predictions
    next to the evaluation labels.

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training and evaluation sets.
    x_train_org, y_train, x_eval_org, y_eval, data_frame = \
        util.load_dataset_new(train_path, eval_path)

    # Feature scaling. The scaler is fitted on the training data only and
    # then applied unchanged to the evaluation data; re-fitting it on the
    # evaluation split would put the two splits in different feature spaces.
    sc_X = StandardScaler()
    x_train = util.add_intercept(sc_X.fit_transform(x_train_org))
    x_eval = util.add_intercept(sc_X.transform(x_eval_org))

    print("Train shape:" + str(x_train.shape))
    print("Eval shape:" + str(x_eval.shape))

    # Fit a LWR model.
    clf = LocallyWeightedLinearRegression(tau)
    clf.fit(x_train, y_train, 0.1)

    p_eval = clf.predict(x_eval)

    def give_error(y_out, y):
        # Debugging aid (currently not called). Despite its name it returns
        # the fraction of positions where y_out equals y.
        cnt = 0
        for i in range(len(y_out)):
            if y_out[i] == y[i]:
                cnt += 1
        return cnt / len(y_out)

    print(p_eval, y_eval)
def main_h(train_path, eval_path, fig_path):
    """Fit logistic regression and GDA on log-scaled features and plot both.

    Features are transformed as log((1 + x) / (1 + median)), where the
    per-column median is taken from the training set and reused for the
    validation set.

    Args:
        train_path: Path to the training dataset.
        eval_path: Path to the validation dataset.
        fig_path: Passed through to plot_all as the figure destination.
    """
    # Load dataset.
    raw_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # Original note: it's okay without / (1. + median)
    median = np.median(raw_train, axis=0)

    def log_scale_with_intercept(features):
        # Log-scale relative to the training median, then add the intercept.
        scaled = np.log((1. + features) / (1. + median))
        return util.add_intercept(scaled)

    x_train = log_scale_with_intercept(raw_train)

    # Train a logistic regression classifier.
    lr = p01b_logreg.LogisticRegression()
    lr.fit(x_train, y_train)

    # Train a GDA classifier on the same features minus the first column.
    gda = p01e_gda.GDA()
    gda.fit(x_train[:, 1:], y_train)

    # Plot decision boundaries on top of the validation set.
    raw_val, y_val = util.load_dataset(eval_path, add_intercept=False)
    x_val = log_scale_with_intercept(raw_val)
    plot_all(x_val, y_val, lr.theta, gda.theta, fig_path)
def predict(self, x):
    """Make a prediction given new inputs x.

    An intercept column is added to x via util.add_intercept, the sigmoid of
    x . theta is computed, and probabilities are thresholded at 0.5.

    Args:
        x: Inputs of shape (n_examples, dim).

    Returns:
        Integer 0/1 outputs of shape (n_examples,).
    """
    # *** START CODE HERE ***
    x = util.add_intercept(x)
    probs = sigmoid(x.dot(self.theta))
    # Use the builtin int: the np.int alias was removed in NumPy 1.24 and
    # raises AttributeError there.
    preds = (probs >= 0.5).astype(int)
    return preds
def predict(self, x):
    """Predict 0/1 labels for new inputs.

    The intercept column is not assumed to be present; it is added here via
    util.add_intercept before the logistic probabilities are computed.

    Args:
        x: Inputs of shape (m, n).

    Returns:
        Integer outputs of shape (m,): 1 where the probability is >= 0.5.
    """
    # *** START CODE HERE ***
    design = util.add_intercept(x)
    logits = design.dot(self.theta.T)
    probabilities = 1 / (1 + np.exp(-logits))
    return (probabilities >= 0.5).astype('int')
def predict(self, x):
    """Make a prediction given new inputs x.

    The intercept column is not assumed to be present; it is added here via
    util.add_intercept. Probabilities are thresholded at 0.5.

    Args:
        x: Inputs of shape (m, n).

    Returns:
        Integer 0/1 outputs of shape (m,).
    """
    # *** START CODE HERE ***
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    x = util.add_intercept(x)
    probs = sigmoid(x.dot(self.theta))
    # Use the builtin int: the np.int alias was removed in NumPy 1.24 and
    # raises AttributeError there.
    preds = (probs >= 0.5).astype(int)
    return preds
def predict(self, x, p):
    """Predict 0/1 labels using a caller-supplied probability cut-off.

    Args:
        x: Inputs of shape (m, n). An intercept column is added first when
            self.intercept is True.
        p: Cut-off probability; examples with probability >= p map to 1.

    Returns:
        float64 outputs of shape (m,) containing 0.0 or 1.0.
    """
    if self.intercept is True:
        x = util.add_intercept(x)

    logits = np.dot(x, self.theta)
    probabilities = 1 / (1 + np.exp(- logits))
    return (probabilities >= p).astype(np.float64)
def predict(self, x):
    """Make a prediction given new inputs x.

    An intercept column is added to x via util.add_intercept, and the
    logistic probabilities are thresholded at 0.5.

    Args:
        x: Inputs of shape (m, n).

    Returns:
        Integer 0/1 outputs of shape (m,).
    """
    # *** START CODE HERE ***
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    x = util.add_intercept(x)
    probability = sigmoid(x.dot(self.theta))
    # Use the builtin int: the np.int alias was removed in NumPy 1.24 and
    # raises AttributeError there.
    predictions = (probability >= 0.5).astype(int)
    return predictions
def predict(self, x, p=None):
    """Return probabilities, or 0/1 labels when a cut-off is given.

    Args:
        x: Inputs of shape (m, n). An intercept column is added first when
            self.intercept is True.
        p: Optional cut-off probability. When None, the raw logistic
            probabilities are returned.

    Returns:
        Outputs of shape (m,): probabilities if p is None, otherwise float64
        values of 0.0 / 1.0 (1.0 where the probability is >= p).
    """
    if self.intercept is True:
        x = util.add_intercept(x)

    logits = x.dot(self.theta)
    probabilities = 1 / (1 + np.exp(-logits))
    if p is None:
        return probabilities
    return (probabilities >= p).astype(np.float64)
def main(lr, train_path, eval_path, pred_path):
    """Problem 3(d): Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load training set (without intercept; it is added explicitly below).
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Fit a Poisson Regression model on the intercept-augmented inputs.
    train_design = util.add_intercept(x_train)
    model = PoissonRegression(step_size=lr, max_iter=1000)
    model.fit(train_design, y_train)

    # Run on the validation set (loaded with its intercept column) and save
    # the outputs to pred_path.
    x_val, _ = util.load_dataset(eval_path, add_intercept=True)
    predictions = model.predict(x_val)
    np.savetxt(pred_path, predictions)
def predict(self, x):
    """Predict hard labels from the logistic probabilities.

    An intercept column is added to x via util.add_intercept first.

    Args:
        x: Inputs of shape (m, n).

    Returns:
        Outputs of shape (m,): 1 where the probability is strictly greater
        than 0.5 and 0 otherwise, stored in the probabilities' dtype.
    """
    # *** START CODE HERE ***
    design = util.add_intercept(x)
    probabilities = 1 / (1 + np.exp(-(design.dot(self.theta))))
    # Vectorized thresholding; the cast keeps the floating dtype that the
    # copy-and-overwrite approach would produce.
    return (probabilities > 0.5).astype(probabilities.dtype)
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path where the validation predictions are written with
            np.savetxt(). The plot goes to the same path with '.txt'
            replaced by '.png'.
    """
    # Load dataset.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier.
    classification = GDA()
    classification.fit(x_train, y_train)

    # Plot decision boundary on validation set.
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)
    figure_path = save_path.replace('.txt', '.png')
    util.plot(x_eval, y_eval, classification.theta, figure_path)

    # Predict on the intercept-augmented validation inputs and save.
    eval_design = util.add_intercept(x_eval)
    predictions = classification.predict(eval_design)
    np.savetxt(save_path, predictions)
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions. The plot is written to
            pred_path with '.png' appended.
    """
    # Load dataset.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier.
    clf = GDA()
    clf.fit(x_train, y_train)

    # Predict on the raw validation inputs (before the intercept is added).
    x_val, y_val = util.load_dataset(eval_path, add_intercept=False)
    y_pred = clf.predict(x_val)

    # The plot uses the intercept-augmented validation inputs.
    x_val_plot = util.add_intercept(x_val)
    print(clf.theta_0)
    figure_path = '{}.png'.format(pred_path)
    util.plot(x_val_plot, y_val, clf.theta_0, figure_path)

    # Save outputs from validation set to pred_path.
    np.savetxt(pred_path, y_pred)
def main(file1):
    """Run the flight-data experiments and print accuracy figures.

    Steps: load and standardize the data, save a correlation-matrix plot,
    fit the normal-equation model and the gradient-descent linear model
    (once per regularization setting), and print the fraction of matching
    predictions on the validation and training sets for each.

    Args:
        file1: Not used by this function.
    """
    print("Running main")
    train_path = "output/flights_pass_1_na_0.csv"
    eval_path = "testinput/all_test_with_failures_clean.csv"
    x_train_org, y_train, x_valid_org, y_valid, dataset = \
        util.load_dataset_new(train_path, eval_path)

    # The scaler is fitted on the training data only and then applied
    # unchanged to the validation data; re-fitting it on the validation split
    # would put the two splits in different feature spaces.
    sc_X = StandardScaler()
    x_train = util.add_intercept(sc_X.fit_transform(x_train_org))
    x_valid = util.add_intercept(sc_X.transform(x_valid_org))

    # Plot correlation matrix.
    corr_after_dropping = dataset.corr()
    labels = corr_after_dropping.columns.values
    # NOTE(review): this first matshow opens its own figure, which is never
    # saved; only the figure created below is written out.
    plt.matshow(corr_after_dropping)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(corr_after_dropping, vmin=-1, vmax=1)
    fig.colorbar(cax)
    ticks = np.arange(0, len(corr_after_dropping.columns), 1)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_yticklabels(labels, size=5)
    plot_path = 'output/correlation_plot'
    plt.savefig(plot_path)

    def give_error(y_out, y):
        # Despite its name, returns the fraction of positions where y_out
        # equals y.
        cnt = 0
        for i in range(len(y_out)):
            if y_out[i] == y[i]:
                cnt += 1
        return cnt / len(y_out)

    # Normal equation.
    tau = 0.1
    lwr = LinearReg_normal_eq_locally_weighted(tau)
    lwr.x_train = x_train
    lwr.y_train = y_train
    lwr.x_valid = x_valid
    theta_train = lwr.fit(x_train, y_train, 0.05)
    y_train_out = sigmoid(x_train, theta_train)
    y_valid_out_ne = sigmoid(x_valid, theta_train)
    y_train_out_1 = np.where(y_train_out > 0.65, 1, 0)
    y_valid_out_ne_1 = np.where(y_valid_out_ne > 0.65, 1, 0)
    print(give_error(y_valid_out_ne_1, y_valid))
    print(give_error(y_train_out_1, y_train))

    # NOTE: a locally weighted prediction sweep over tau (lwr.predict on
    # x_valid, thresholded at 0.65) used to live here and is disabled.

    # Gradient descent.
    linear_reg = LinearRegression_gradient_descent()
    linear_reg.x_train = x_train
    linear_reg.y_train = y_train
    l1_l2_factor = np.array([1, 2])
    lambda_array = np.array([10, 0.5])
    learning_rate = 1e-5
    cost_limit = 1e-12
    # One fit per (lambda, L1/L2 factor) pair.
    for lam, norm_factor in zip(lambda_array, l1_l2_factor):
        theta_train = linear_reg.fit(x_train, y_train, lam, learning_rate,
                                     cost_limit, norm_factor)
        y_train_out = linear_reg.predict(x_train)
        y_valid_out = linear_reg.predict(x_valid)
        y_train_out_1 = np.where(y_train_out > 0.6, 1, 0)
        y_valid_out_1 = np.where(y_valid_out > 0.6, 1, 0)
        print(give_error(y_valid_out_1, y_valid))
        print(give_error(y_train_out_1, y_train))
import util import numpy as np import matplotlib.pyplot as plt from sklearn import svm, datasets from sklearn.decomposition import PCA iris = datasets.load_iris() from sklearn.preprocessing import StandardScaler train_path = "output/flights_pass_1_na_0.csv" eval_path = "testinput/all_test_with_failures_clean.csv" #X, Y, X_test, Y_test, dataset = util.load_dataset_new(train_path, eval_path) x_train_org, y, x_valid_org, y_eval, dataset = util.load_dataset_new(train_path, eval_path) sc_X = StandardScaler() X_Train = util.add_intercept(sc_X.fit_transform(x_train_org)) X_Test = util.add_intercept(sc_X.fit_transform(x_valid_org)) ##X = iris.data ##y = iris.target X = X_Train y = y pca = PCA(n_components=2) Xreduced = pca.fit_transform(X) Xtestreduced = pca.transform(X_Test) def give_error(y_out, y): cnt = 0 cntfour = 0 for i in range(len(y_out)):
def main(profile, input_col, label_col, ransac, cross, profile_test):
    """Fit one linear model per label and report cross-validated MSE.

    :param profile: read profile_id, int type
    :param input_col: list of X name
    :param label_col: list of y name
    :param ransac: if truthy, fit RANSACRegressor instead of LinearRegression
    :param cross: whether it is a cross test between two profiles
    :param profile_test: the test profile if cross == True
    :return: n/a
    """
    path = 'profile_data/Profile_' + str(profile) + '.csv'
    if not cross:
        x, y = util.load_dataset(path, input_col, label_col)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.3, random_state=5)
    else:
        x_train, y_train = util.load_dataset(path, input_col, label_col)

    n_label = len(label_col)  # number of kinds of labels
    n_input = len(input_col)  # number of kinds of inputs

    # One row of parameters per label: [intercept, coefficients...].
    theta = np.zeros((n_label, n_input + 1))

    if cross:
        # Test data comes from a different profile.
        path = 'profile_data/Profile_' + str(profile_test) + '.csv'
        x_test, y_test = util.load_dataset(path, input_col, label_col)

    high_score = 0
    # Stays None when no label ever exceeds the initial high_score, so the
    # summary print below cannot hit an unbound name.
    label = None
    for i in range(n_label):
        print('For label', label_col[i], " : ")
        kNeighbor(x_train, y_train[:, i], x_test, y_test[:, i])

        if not ransac:
            reg = LinearRegression().fit(x_train, y_train[:, i])
            theta[i, 0] = reg.intercept_
            theta[i, 1:] = reg.coef_
        else:
            reg = RANSACRegressor().fit(x_train, y_train[:, i])
            theta[i, 0] = reg.estimator_.intercept_
            theta[i, 1:] = reg.estimator_.coef_

        # The scorer returns negated MSE, so flip the sign back.
        scores = cross_val_score(reg, x_test, y_test[:, i], cv=5,
                                 scoring='neg_mean_squared_error')
        score = -scores.mean()
        if score > high_score:
            high_score = score
            label = label_col[i]

        # Density plot of actual vs. predicted values for this label.
        x_new = util.add_intercept(x_test)
        pred = x_new.dot(theta[i, :])
        den = pd.DataFrame({
            'Actual': y_test[:, i],
            'Prediction': pred,
        })
        p = den.plot.kde()
        fig = p.get_figure()
        fig.savefig("density_plot/" + label_col[i] + '_density.png')

        print('Multiple Linear Regression MSE for', label_col[i], 'is',
              score, '+-', scores.std() * 2)

    print('In profile', profile, 'the highest score for', label, 'is',
          high_score)

    # If only one input, we can plot it.
    if n_input == 1:
        for i in range(n_label):
            save_path = "plots/" + \
                input_col[0] + '_vs_' + label_col[i] + '.png'
            # Pass the label that matches the plotted column and the file
            # name (this used to be label_col[0] for every plot).
            util.plot(x_test, y_test[:, i], theta[i, :], input_col[0],
                      label_col[i], save_path)

    print("theta is: ")
    print(theta)