def test_ridge_singular(self):
    # test on a singular matrix
    rng = np.random.RandomState(0)
    n_samples, n_features = 6, 6
    y = rng.randn(n_samples // 2)
    y = np.concatenate((y, y))
    X = rng.randn(n_samples // 2, n_features)
    X = np.concatenate((X, X), axis=0)

    ridge = RidgeRegression(alpha=0)
    ridge.train(X, y)
    self.assertGreater(ridge.score(X, y), 0.9)
def test_ridge_vs_lstsq(self):
    # With alpha=0, Ridge and ordinary linear regression yield the same solution.
    rng = np.random.RandomState(0)
    # we need more samples than features
    n_samples, n_features = 5, 4
    y = rng.randn(n_samples)
    X = rng.randn(n_samples, n_features)

    ridge = RidgeRegression(alpha=0)
    ols = LinearRegression(fit_intercept=False)

    ridge.fit(X, y)
    ols.fit(X, y)
    assert_array_almost_equal(ridge.w, ols.coef_)
def test_ridge_vs_lstsq(self):
    # With alpha=0, Ridge and ordinary linear regression should yield
    # nearly the same solution.
    rng = np.random.RandomState(0)
    # we need more samples than features
    n_samples, n_features = 6, 4
    y = rng.randn(n_samples)
    X = rng.randn(n_samples, n_features)

    ridge = RidgeRegression(alpha=0, epoch_num=600, learning_rate=0.1)
    ols = LinearRegression(fit_intercept=True)

    np.random.seed(2020)
    ridge.fit(X, y)
    ols.fit(X, y)
    self.assertTrue(
        np.linalg.norm(ridge.theta.reshape([4]) - ols.coef_) < 0.01)
def test_ridge(self):
    # Ridge regression convergence test: compare against the exact
    # solution computed by sklearn's Ridge
    rng = np.random.RandomState(0)
    alpha = 1.0

    # With more samples than features
    n_samples, n_features = 6, 5
    y = rng.randn(n_samples)
    X = rng.randn(n_samples, n_features)

    ridge = Ridge(alpha=alpha, fit_intercept=False)
    custom_implemented_ridge = RidgeRegression(alpha=alpha)

    ridge.fit(X, y)
    custom_implemented_ridge.fit(X, y)

    self.assertEqual(custom_implemented_ridge.w.shape, (X.shape[1], ))
    self.assertAlmostEqual(ridge.score(X, y),
                           custom_implemented_ridge.score(X, y))
def test_ridge(self):
    # Ridge regression convergence test
    # compare to the implementation of sklearn
    rng = np.random.RandomState(0)
    alpha = 1.0

    # With more samples than features
    n_samples, n_features = 6, 5
    y = rng.randn(n_samples)
    X = rng.randn(n_samples, n_features)

    ridge = Ridge(alpha=alpha, fit_intercept=True, solver='sag')
    custom_implemented_ridge = RidgeRegression(alpha=alpha)

    ridge.fit(X, y)
    np.random.seed(2020)
    custom_implemented_ridge.fit(X, y)

    self.assertEqual(custom_implemented_ridge.theta.shape, (X.shape[1], 1))
    self.assertTrue(
        custom_implemented_ridge.score(X, y) > ridge.score(X, y) - 0.1)
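# For reference, a minimal closed-form ridge fit that the comparisons above are
# effectively checking. This is only an illustrative sketch: the custom
# RidgeRegression class under test is defined elsewhere in the repository and
# may well train by gradient descent rather than by the normal equations.
import numpy as np


def ridge_closed_form(X, y, alpha):
    """Return w minimizing ||Xw - y||^2 + alpha * ||w||^2 (no intercept)."""
    n_features = X.shape[1]
    return np.linalg.solve(X.T @ X + alpha * np.eye(n_features), X.T @ y)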
def run_2_1(X_train, y_train, X_val, y_val, l2reg_search,
            print_table=True, PLOT=True):
    num_features = X_train.shape[1]
    scores = np.zeros(len(l2reg_search))
    for i, l2reg in enumerate(l2reg_search):
        ridge_regression = RidgeRegression(l2reg=l2reg)
        ridge_regression.fit(X_train, y_train)
        scores[i] = ridge_regression.score(X_val, y_val)

    if PLOT:
        fig, ax = plt.subplots()
        ax.semilogx(l2reg_search, scores)
        ax.grid()
        ax.set_title("Validation Performance vs. L2 Regularization Parameter")
        ax.set_xlabel("L2-Penalty Regularization Parameter")
        ax.set_ylabel("Average Square Error")
        plt.show()

    # print vertical table of (l2reg, score)
    # TODO later, figure out cleaner way with pandas
    if print_table:
        print("L2_Parameter | Average Square Error")
        for i in range(len(l2reg_search)):
            print(l2reg_search[i], "|", scores[i])

    # choose the L2 parameter that minimizes the validation error
    l2reg_opt = l2reg_search[np.argmin(scores)]
    return l2reg_opt
def do_grid_search_ridge(X_train, y_train, X_val, y_val):
    # Now let's use sklearn to help us do hyperparameter tuning.
    # GridSearchCV.fit by default splits the data into training and
    # validation itself; we want to use our own splits, so we need to stack our
    # training and validation sets together, and supply an index
    # (validation_fold) to specify which entries are train and which are
    # validation.
    X_train_val = np.vstack((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))
    val_fold = [-1] * len(X_train) + [0] * len(X_val)  # 0 corresponds to validation

    # Now we set up and do the grid search over l2reg. The commented-out
    # np.concatenate command illustrates my search for the best hyperparameter.
    # In each line, I'm zooming in to a particular hyperparameter range that
    # showed promise in the previous grid. This approach works reasonably well
    # when performance is convex as a function of the hyperparameter, which it
    # seems to be here.
    # param_grid = [{'l2reg': np.unique(np.concatenate((10.**np.arange(-6, 1, 1),
    #                                                   np.arange(1, 3, .3))))}]
    param_grid = [{'l2reg': np.unique(10.**np.arange(-3, 0.5, 0.1))}]

    ridge_regression_estimator = RidgeRegression()  # initialize estimator
    grid = GridSearchCV(
        ridge_regression_estimator,  # makes use of the BaseEstimator wrapper
        param_grid,
        return_train_score=True,
        cv=PredefinedSplit(test_fold=val_fold),
        refit=True,
        scoring=make_scorer(mean_squared_error, greater_is_better=False))
    grid.fit(X_train_val, y_train_val)

    df = pd.DataFrame(grid.cv_results_)
    # Flip the sign of the score back: GridSearchCV always maximizes its score,
    # so it negates the metric when greater_is_better=False.
    df['mean_test_score'] = -df['mean_test_score']
    df['mean_train_score'] = -df['mean_train_score']
    cols_to_keep = ["param_l2reg", "mean_test_score", "mean_train_score"]
    df_toshow = df[cols_to_keep].fillna('-')
    df_toshow = df_toshow.sort_values(by=["param_l2reg"])
    return grid, df_toshow
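# A minimal, self-contained sketch of the PredefinedSplit pattern used above,
# shown with sklearn's own Ridge so it runs without the custom RidgeRegression
# class; the data and the alpha grid here are made up purely for illustration.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, PredefinedSplit

rng = np.random.RandomState(0)
X_tr, X_va = rng.randn(80, 5), rng.randn(20, 5)
y_tr, y_va = rng.randn(80), rng.randn(20)

X_all = np.vstack((X_tr, X_va))
y_all = np.concatenate((y_tr, y_va))
fold = [-1] * len(X_tr) + [0] * len(X_va)  # -1 = always train, 0 = validation fold

grid = GridSearchCV(Ridge(), {'alpha': 10.0**np.arange(-3, 1)},
                    cv=PredefinedSplit(test_fold=fold))
grid.fit(X_all, y_all)
print(grid.best_params_)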
# # # # # # # # # # # # # # # #
print("Running Nystroem Approximation")
factor = np.max(np.linalg.norm(train_features, axis=1))
train_features /= factor
test_features /= factor

dim = min(train_features.shape[1] * 2, train_features.shape[0])
print("Nystroem dim is {}".format(dim))

# Use the Nyström approximation in sklearn
approx = Nystroem(kernel='rbf', gamma=1., n_components=dim)
approx.fit(train_features)
train_features = approx.transform(train_features)
test_features = approx.transform(test_features)

# # # # # # # # # # # # # # # #
# Ridge regression with cross validation
# # # # # # # # # # # # # # # #
style = 'c' if train_features.shape[0] > train_features.shape[1] else 'k'
clf = GridSearchCV(RidgeRegression(),
                   {'alpha': [10**i for i in range(-7, 0)],
                    'style': [style]},
                   n_jobs=4)
clf.fit(train_features, train_onehot)

y_pred_ = np.argmax(clf.predict(test_features), axis=-1)
acc = sum(y_pred_ == targets["test"]) * 1.0 / len(targets["test"])
print(acc)
def main():
    # Load problem
    lasso_data_fname = "lasso_data.pickle"
    x_train, y_train, x_val, y_val, target_fn, coefs_true, featurize = load_problem(
        lasso_data_fname)

    # Generate features
    X_train = featurize(x_train)
    X_val = featurize(x_val)

    # Visualize training data
    # fig, ax = plt.subplots()
    # ax.imshow(X_train)
    # ax.set_title("Design Matrix: Color is Feature Value")
    # ax.set_xlabel("Feature Index")
    # ax.set_ylabel("Example Number")
    # plt.show(block=False)

    # Do hyperparameter tuning with our ridge regression;
    # this is done on the training and validation set
    grid, results = do_grid_search_ridge(X_train, y_train, X_val, y_val)
    print(results)

    # Plot validation performance vs regularization parameter
    fig, ax = plt.subplots()
    # ax.loglog(results["param_l2reg"], results["mean_test_score"])
    ax.semilogx(results["param_l2reg"], results["mean_test_score"])
    ax.grid()
    ax.set_title("Validation Performance vs L2 Regularization")
    ax.set_xlabel("L2-Penalty Regularization Parameter")
    ax.set_ylabel("Mean Squared Error")
    plt.show()

    # Let's plot prediction functions and compare coefficients for several fits
    # and the target function.
    pred_fns = []
    x = np.sort(np.concatenate([np.arange(0, 1, .001), x_train]))

    name = "Target Parameter Values (i.e. Bayes Optimal)"
    pred_fns.append({"name": name, "coefs": coefs_true, "preds": target_fn(x)})

    l2regs = [0, grid.best_params_['l2reg'], 1]
    X = featurize(x)
    for l2reg in l2regs:
        # for every chosen regularization constant, fit a ridge regression
        ridge_regression_estimator = RidgeRegression(l2reg=l2reg)
        ridge_regression_estimator.fit(X_train, y_train)
        name = "Ridge with L2Reg=" + str(l2reg)
        pred_fns.append({
            "name": name,
            "coefs": ridge_regression_estimator.w_,
            "preds": ridge_regression_estimator.predict(X)
        })

    # f = plot_prediction_functions(x, pred_fns, x_train, y_train, legend_loc="best")
    # plt.show()
    # f = compare_parameter_vectors(pred_fns)
    # plt.show()

    # confusion matrix for different cutoff params
    cutoffs = [10**(-3), 10**(-2), 10**(-1)]
    best = pred_fns[1]
    ctf_fns = []
    for cutoff in cutoffs:
        ridge_regression_estimator = RidgeRegression()
        W = [w * (abs(w) > cutoff) for w in best["coefs"]]
        ridge_regression_estimator.w_ = W
        name = "Ridge with cutoff=" + str(cutoff)
        ctf_fns.append({
            "name": name,
            "coefs": W,
            "preds": ridge_regression_estimator.predict(X)
        })

    f = plot_prediction_functions(x, ctf_fns, x_train, y_train, legend_loc="best")
    plt.show()
def main():
    if len(argv) == 2:
        program, sol = argv
    else:
        raise RuntimeError("USAGE: python solution.py OPTION[2 or 3]")

    # load data and split data
    lasso_data_fname = "lasso_data.pickle"
    x_train, y_train, x_val, y_val, target_fn, coefs_true, featurize = load_problem(
        lasso_data_fname)

    # turn the 1D inputs into high-dimensional featurized data
    X_train = featurize(x_train)
    X_val = featurize(x_val)

    if sol == "2":
        #### 2.1
        # create array of possible L2Reg parameters
        l2reg_search = 10.**np.arange(-6, +1, 1)
        # search through l2reg_search
        l2reg_opt = run_2_1(X_train, y_train, X_val, y_val, l2reg_search,
                            print_table=False, PLOT=False)

        #### 2.2
        # x has many inputs from 0 to 1, as well as the x_train inputs, to help plotting
        x = np.sort(np.concatenate([np.arange(0, 1, .001), x_train]))
        X = featurize(x)

        # pred_fns is a list of dicts with "name", "coefs" and "preds"
        pred_fns = []
        coefs_opt = 0  # for question 2.3

        # first entry: Target function
        pred_fns.append({
            "name": "Target",
            "coefs": coefs_true,
            "preds": target_fn(x)
        })

        l2reg_values = [0, l2reg_opt]
        # next entries: prediction functions for L2Reg parameters in l2reg_values
        for l2reg in l2reg_values:
            ridge = RidgeRegression(l2reg=l2reg)
            ridge.fit(X_train, y_train)
            pred_fns.append({
                "name": "Ridge with L2Reg=" + str(l2reg),
                "coefs": ridge.w_,
                "preds": ridge.predict(X)
            })
            # for question 2.3
            if l2reg == l2reg_opt:
                coefs_opt = ridge.w_

        # with pred_fns populated, plot
        # "PRED": prediction functions
        # "COEF": coefficients
        plots = ["PRED", "COEF"]
        # plots = []
        run_2_2(x, x_train, y_train, pred_fns, plot=plots)

        #### 2.3
        epsilon = []
        # epsilon = [1e-6, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1]
        for e in epsilon:
            run_2_3(coefs_true, coefs_opt, epsilon=e)

    if sol == "3":
        #### 3.2 - experiment with Lasso
        # Found that start="RR", order="cyclic", epsilon=1e-8 works MARGINALLY better
        # run_3_2(X_train, y_train, X_val, y_val, l1reg=1, epsilons=[1e-8, 1e-3])

        #### 3.3
        #### Part a: find optimal l1reg
        # create array of possible L1Reg parameters
        # l1reg_search = 10.**np.arange(-6, 2, 1)
        # search through l1reg_search
        # l1reg_opt = run_3_3_a(X_train, y_train, X_val, y_val, l1reg_search)
        l1reg_opt = 1.0  # found from above

        #### 3.3
        #### Part b: plot corresponding prediction function
        # x has many inputs from 0 to 1, as well as the x_train inputs, to help plotting
        x = np.sort(np.concatenate([np.arange(0, 1, .001), x_train]))
        X = featurize(x)

        # pred_fns is a list of dicts with "name", "coefs" and "preds"
        pred_fns = []

        # first entry: Target function
        pred_fns.append({
            "name": "Target",
            "coefs": coefs_true,
            "preds": target_fn(x)
        })

        lasso = LassoRegression(l1reg=l1reg_opt)
        lasso.shooting_alg(X_train, y_train)
        pred_fns.append({
            "name": "Lasso with L1Reg=" + str(l1reg_opt),
            "coefs": lasso.w,
            "preds": lasso.predict(X)
        })

        # with pred_fns populated, plot
        # "PRED": prediction functions
        # "COEF": coefficients
        run_3_3_b(x, x_train, y_train, pred_fns, plot=[])

        run_3_4(X_train, y_train, X_val, y_val, p=0.8)
                           bounds=[(0.0, None)],
                           options={
                               'xtol': 1e-6,
                               'disp': True
                           })
    loss = f(res.x)
    print("fixed design opt reg and loss", res.x, loss)
    args.l2_reg = max(res.x[0], EPS)
else:
    # construct model
    if args.model == "logistic_regression":
        model = LogisticRegression(input_dim=kernel_approx.n_feat,
                                   n_class=n_class,
                                   reg_lambda=args.l2_reg)
    elif args.model == "ridge_regression":
        model = RidgeRegression(input_dim=kernel_approx.n_feat,
                                reg_lambda=args.l2_reg)
    if use_cuda:
        model.cuda()
    model.double()

    # set up optimizer
    if args.opt == "sgd":
        print("using sgd optimizer")
        optimizer = optim.SGD(model.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_reg)
    elif args.opt == "lpsgd":
        print("using lp sgd optimizer")
        optimizer = halp.optim.LPSGD(model.parameters(),
                                     lr=args.learning_rate,
                                     scale_factor=args.scale_model,
        mlp = MLP(epoch_num=400, batch_size=batch_size, learning_rate=0.1)
        np.random.seed(2020)
        mlp.train(x_train, y_train)
        y_predict = mlp.predict(x_train)
        self.assertTrue(numerical_accuracy(y_predict, y_train) > 0.95)

    def test_xor(self):
        X = np.array([0, 0, 1, 1, 0, 1, 1, 0], dtype=np.float32).reshape(4, 2)
        Y = np.array([0, 0, 1, 1], dtype=np.float32)
        mlp = MLP(hidden_layer_sizes=(2, ), epoch_num=1600, learning_rate=0.22)
        np.random.seed(2020)
        mlp.train(X, Y)
        self.assertAlmostEqual(numerical_accuracy(mlp.predict(X), Y), 1.0)


@unittest.skipIf(RidgeRegression().skip, 'skip bonus question')
class TestRidgeModel(unittest.TestCase):
    def test_ridge(self):
        # Ridge regression convergence test
        # compare to the implementation of sklearn
        rng = np.random.RandomState(0)
        alpha = 1.0

        # With more samples than features
        n_samples, n_features = 6, 5
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)

        ridge = Ridge(alpha=alpha, fit_intercept=True, solver='sag')
        custom_implemented_ridge = RidgeRegression(alpha=alpha)
        ridge.fit(X, y)
def expr(A, b, lmbd, num_cols_list, num_iters=50):
    """
    Perform experiments with the iterative ridge regression algorithm,
    comparing sampling methods (uniform, leverage scores, and ridge leverage
    scores) by evaluating the relative errors and objective errors of the
    solutions obtained with each sampling method, for varying sketch sizes
    and at each iteration.

    Args:
        A: array, design matrix.
        b: array, response vector.
        lmbd: float, ridge regularization parameter.
        num_cols_list: array, list of sketch sizes used in the experiments.
        num_iters: int, maximum number of iterations to run the algorithm.

    Returns:
        rel_errs_unif, rel_errs_levr, rel_errs_rdge: each of type
            array(len(num_cols_list), num_iters), relative errors.
        obj_errs_unif, obj_errs_levr, obj_errs_rdge: each of type
            array(len(num_cols_list), num_iters), objective errors.
    """
    model = RidgeRegression(A, b, lmbd=lmbd)
    x_opt = model.direct_solver()
    obj_opt = model.obj_vals(x_opt)

    num_cols_list = list(map(int, num_cols_list))
    rel_errs_unif = np.zeros((len(num_cols_list), num_iters))
    rel_errs_levr = np.zeros((len(num_cols_list), num_iters))
    rel_errs_rdge = np.zeros((len(num_cols_list), num_iters))
    obj_errs_unif = np.zeros((len(num_cols_list), num_iters))
    obj_errs_levr = np.zeros((len(num_cols_list), num_iters))
    obj_errs_rdge = np.zeros((len(num_cols_list), num_iters))

    d = A.shape[1]  # number of columns of the design matrix (candidates for sampling)

    for k, num_cols in enumerate(num_cols_list):
        print("k = %d; number of sampled columns = %d\n" % (k, num_cols))

        probs_unif = np.ones(d) / d
        probs_levr = model.leverage_scores()
        probs_rdge = model.ridge_leverage_scores()

        _, x_unif = model.iterative_solver(num_cols, num_iters, probs=probs_unif)
        _, x_levr = model.iterative_solver(num_cols, num_iters, probs=probs_levr)
        _, x_rdge = model.iterative_solver(num_cols, num_iters, probs=probs_rdge)

        rel_errs_unif[k] = model.rel_err(x_unif, x_opt)
        rel_errs_levr[k] = model.rel_err(x_levr, x_opt)
        rel_errs_rdge[k] = model.rel_err(x_rdge, x_opt)

        obj_errs_unif[k] = model.obj_vals(x_unif) / obj_opt - 1.
        obj_errs_levr[k] = model.obj_vals(x_levr) / obj_opt - 1.
        obj_errs_rdge[k] = model.obj_vals(x_rdge) / obj_opt - 1.

    return rel_errs_unif, rel_errs_levr, rel_errs_rdge, \
        obj_errs_unif, obj_errs_levr, obj_errs_rdge
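# Hypothetical driver for expr(), included only as a usage sketch: the data,
# sketch sizes, and regularization value below are made up, and the
# RidgeRegression sampling class is assumed to be importable from this project.
import numpy as np

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    A = rng.randn(200, 50)                       # design matrix (n x d)
    b = A.dot(rng.randn(50)) + 0.1 * rng.randn(200)
    sketch_sizes = [10, 20, 40]

    results = expr(A, b, lmbd=1.0, num_cols_list=sketch_sizes, num_iters=20)
    rel_errs_unif = results[0]
    print(rel_errs_unif.shape)                   # expected: (len(sketch_sizes), num_iters)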