def test_grad_search(Optimizer, model, crit):
    """Check that the paths are the same in the line search."""
    n_outer = 2

    criterion = HeldOutMSE(idx_train, idx_val)
    monitor1 = Monitor()
    algo = Forward()
    optimizer = Optimizer(n_outer=n_outer, tol=1e-16)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor1)

    criterion = HeldOutMSE(idx_train, idx_val)
    monitor2 = Monitor()
    algo = Implicit()
    optimizer = Optimizer(n_outer=n_outer, tol=1e-16)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor2)

    criterion = HeldOutMSE(idx_train, idx_val)
    monitor3 = Monitor()
    algo = ImplicitForward(tol_jac=1e-8, n_iter_jac=5000)
    optimizer = Optimizer(n_outer=n_outer, tol=1e-16)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor3)

    np.testing.assert_allclose(
        np.array(monitor1.alphas), np.array(monitor3.alphas))
    np.testing.assert_allclose(
        np.array(monitor1.grads), np.array(monitor3.grads), rtol=1e-5)
    np.testing.assert_allclose(
        np.array(monitor1.objs), np.array(monitor3.objs))
    assert not np.allclose(
        np.array(monitor1.times), np.array(monitor3.times))
def test_val_grad(model_name, criterion_name, algo):
    """Check that all methods return the same gradient, comparing to
    cvxpylayer."""
    if model_name == 'svr':
        pytest.xfail("svr needs to be fixed")
    if criterion_name == 'logistic':
        pytest.xfail("cvxpylayer seems broken for logistic")

    if criterion_name == 'MSE':
        criterion = HeldOutMSE(idx_train, idx_val)
    elif criterion_name == 'logistic':
        criterion = HeldOutLogistic(idx_train, idx_val)
    elif criterion_name == 'SURE':
        criterion = FiniteDiffMonteCarloSure(sigma_star)

    log_alpha = dict_log_alpha[model_name]
    model = models[model_name]

    val, grad = criterion.get_val_grad(
        model, X, y, log_alpha, algo.compute_beta_grad, tol=tol)

    np.testing.assert_allclose(
        dict_vals_cvxpy[model_name, criterion_name], val,
        rtol=1e-5, atol=1e-5)
    np.testing.assert_allclose(
        dict_grads_cvxpy[model_name, criterion_name], grad,
        rtol=1e-5, atol=1e-5)
def test_grad_search(model, crit):
    """Check that the paths are the same in the line search."""
    if crit == 'cv':
        n_outer = 2
        criterion = HeldOutMSE(idx_train, idx_val)
    else:
        n_outer = 2
        criterion = SmoothedSURE(sigma_star)

    # TODO MM@QBE if else scheme surprising
    criterion = HeldOutMSE(idx_train, idx_val)
    monitor1 = Monitor()
    algo = Forward()
    grad_search(algo, criterion, model, X, y, log_alpha, monitor1,
                n_outer=n_outer, tol=1e-16)

    criterion = HeldOutMSE(idx_train, idx_val)
    monitor2 = Monitor()
    algo = Implicit()
    grad_search(algo, criterion, model, X, y, log_alpha, monitor2,
                n_outer=n_outer, tol=1e-16)

    criterion = HeldOutMSE(idx_train, idx_val)
    monitor3 = Monitor()
    algo = ImplicitForward(tol_jac=1e-8, n_iter_jac=5000)
    grad_search(algo, criterion, model, X, y, log_alpha, monitor3,
                n_outer=n_outer, tol=1e-16)

    np.testing.assert_allclose(
        np.array(monitor1.log_alphas), np.array(monitor3.log_alphas))
    np.testing.assert_allclose(
        np.array(monitor1.grads), np.array(monitor3.grads), atol=1e-8)
    np.testing.assert_allclose(
        np.array(monitor1.objs), np.array(monitor3.objs))
    assert not np.allclose(
        np.array(monitor1.times), np.array(monitor3.times))
def test_grad_search(model, crit):
    """Check that the paths are the same in the line search."""
    if crit == 'cv':
        n_outer = 2
        criterion = HeldOutMSE(X_val, y_val, model,
                               X_test=X_test, y_test=y_test)
    else:
        n_outer = 2
        criterion = SURE(X_train, y_train, model, sigma=sigma_star,
                         X_test=X_test, y_test=y_test)

    criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
    monitor1 = Monitor()
    algo = Forward()
    grad_search(algo, criterion, log_alpha, monitor1,
                n_outer=n_outer, tol=1e-16)

    criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
    monitor2 = Monitor()
    algo = Implicit()
    grad_search(algo, criterion, log_alpha, monitor2,
                n_outer=n_outer, tol=1e-16)

    criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
    monitor3 = Monitor()
    algo = ImplicitForward(tol_jac=1e-8, n_iter_jac=5000)
    grad_search(algo, criterion, log_alpha, monitor3,
                n_outer=n_outer, tol=1e-16)

    assert np.allclose(
        np.array(monitor1.log_alphas), np.array(monitor3.log_alphas))
    assert np.allclose(np.array(monitor1.grads), np.array(monitor3.grads))
    assert np.allclose(np.array(monitor1.objs), np.array(monitor3.objs))
    assert np.allclose(
        np.array(monitor1.objs_test), np.array(monitor3.objs_test))
    assert not np.allclose(
        np.array(monitor1.times), np.array(monitor3.times))
def test_grad_search():
    n_outer = 3

    criterion = HeldOutMSE(X_val, y_val, model, X_test=None, y_test=None)
    monitor1 = Monitor()
    algo = Forward()
    grad_search(algo, criterion, np.array([log_alpha1, log_alpha2]),
                monitor1, n_outer=n_outer, tol=1e-16)

    criterion = HeldOutMSE(X_val, y_val, model, X_test=None, y_test=None)
    monitor2 = Monitor()
    algo = Implicit()
    grad_search(algo, criterion, np.array([log_alpha1, log_alpha2]),
                monitor2, n_outer=n_outer, tol=1e-16)

    criterion = HeldOutMSE(X_val, y_val, model, X_test=None, y_test=None)
    monitor3 = Monitor()
    algo = ImplicitForward(tol_jac=1e-3, n_iter_jac=1000)
    grad_search(algo, criterion, np.array([log_alpha1, log_alpha2]),
                monitor3, n_outer=n_outer, tol=1e-16)

    assert np.allclose(
        np.array(monitor1.log_alphas), np.array(monitor3.log_alphas))
    assert np.allclose(np.array(monitor1.grads), np.array(monitor3.grads))
    assert np.allclose(np.array(monitor1.objs), np.array(monitor3.objs))
    assert not np.allclose(
        np.array(monitor1.times), np.array(monitor3.times))

    assert np.allclose(
        np.array(monitor1.log_alphas), np.array(monitor2.log_alphas),
        atol=1e-2)
    assert np.allclose(
        np.array(monitor1.grads), np.array(monitor2.grads), atol=1e-2)
    assert np.allclose(
        np.array(monitor1.objs), np.array(monitor2.objs), atol=1e-2)
    assert not np.allclose(
        np.array(monitor1.times), np.array(monitor2.times))
def test_check_grad_sparse_ho(model_name, criterion, algo):
    """Check that all methods return a good gradient using check_grad."""
    if criterion == 'MSE':
        criterion = HeldOutMSE(idx_train, idx_val)
    elif criterion == 'SURE':
        criterion = FiniteDiffMonteCarloSure(sigma_star)
    elif criterion == 'logistic':
        criterion = HeldOutLogistic(idx_train, idx_val)

    model = models[model_name]
    log_alpha = dict_log_alpha[model_name]

    def get_val(log_alpha):
        val, _ = criterion.get_val_grad(
            model, X, y, np.squeeze(log_alpha), algo.get_beta_jac_v, tol=tol)
        return val

    def get_grad(log_alpha):
        _, grad = criterion.get_val_grad(
            model, X, y, np.squeeze(log_alpha), algo.get_beta_jac_v, tol=tol)
        return grad

    for log_alpha in dict_list_log_alphas[model_name]:
        grad_error = check_grad(get_val, get_grad, log_alpha)
        assert grad_error < 1e-1
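# For reference, a tiny self-contained demo of scipy.optimize.check_grad as
# used above: it returns the 2-norm of the difference between the analytical
# gradient and a finite-difference approximation at a point, so the test's
# assertion simply bounds that discrepancy.
import numpy as np
from scipy.optimize import check_grad

err = check_grad(
    lambda x: np.sum(x ** 2),  # function
    lambda x: 2 * x,           # its analytical gradient
    np.array([1.0, 2.0]))      # point at which to compare
print(err)  # ~0 for an exact gradient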
def test_grid_search():
    max_evals = 5

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_grid, max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_random, max_evals=max_evals, tol=1e-5, samp="random")

    assert monitor_random.log_alphas[
        np.argmin(monitor_random.objs)] == log_alpha_opt_random
    assert monitor_grid.log_alphas[
        np.argmin(monitor_grid.objs)] == log_alpha_opt_grid

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = SmoothedSURE(sigma=sigma_star)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_grid, max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = SmoothedSURE(sigma=sigma_star)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_random, max_evals=max_evals, tol=1e-5, samp="random")

    assert monitor_random.log_alphas[
        np.argmin(monitor_random.objs)] == log_alpha_opt_random
    assert monitor_grid.log_alphas[
        np.argmin(monitor_grid.objs)] == log_alpha_opt_grid
def test_monitor():
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = ImplicitForward()
    monitor = Monitor(callback=callback)
    optimizer = LineSearch(n_outer=10, tol=tol)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor)
    np.testing.assert_allclose(np.array(monitor.objs), np.array(objs))
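# A hedged sketch of the module-level ``objs`` and ``callback`` that
# test_monitor above relies on: their real definitions (presumably at module
# scope) are not shown in this file, so the exact callback signature below is
# an assumption for illustration. The idea is that the callback records every
# objective value the Monitor logs, so the final assert can check that both
# records agree.
objs = []


def callback(obj, *args, **kwargs):
    # record each objective value the Monitor reports
    objs.append(obj)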
def test_cross_val_criterion(model_name, XX):
    model = models[model_name]
    alpha_min = alpha_max / 10
    max_iter = 10000
    n_alphas = 10
    kf = KFold(n_splits=5, shuffle=True, random_state=56)

    monitor_grid = Monitor()
    if model_name.startswith("lasso"):
        sub_crit = HeldOutMSE(None, None)
    else:
        sub_crit = HeldOutLogistic(None, None)
    criterion = CrossVal(sub_crit, cv=kf)
    grid_search(
        criterion, model, XX, y, alpha_min, alpha_max, monitor_grid,
        max_evals=n_alphas, tol=tol)

    if model_name.startswith("lasso"):
        reg = linear_model.LassoCV(
            cv=kf, verbose=True, tol=tol, fit_intercept=False,
            alphas=np.geomspace(alpha_max, alpha_min, num=n_alphas),
            max_iter=max_iter).fit(X, y)
    else:
        reg = linear_model.LogisticRegressionCV(
            cv=kf, verbose=True, tol=tol, fit_intercept=False,
            Cs=len(idx_train) / np.geomspace(alpha_max, alpha_min,
                                             num=n_alphas),
            max_iter=max_iter, penalty='l1', solver='liblinear').fit(X, y)
    reg.score(XX, y)

    if model_name.startswith("lasso"):
        objs_grid_sk = reg.mse_path_.mean(axis=1)
    else:
        objs_grid_sk = reg.scores_[1.0].mean(axis=1)

    # these two sets of values should be the same
    np.testing.assert_allclose(objs_grid_sk, monitor_grid.objs)
def __init__(self, X, y, Model, cv=None, max_iter=1000, estimator=None):
    """
    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Data.
    y : {ndarray, sparse matrix} of shape (n_samples,)
        Target.
    Model : class
        The Model class definition (e.g. Lasso or SparseLogreg).
    cv : int, cross-validation generator or iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross-validation,
        - int, to specify the number of folds,
        - scikit-learn CV splitter,
        - an iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, KFold is used.
    max_iter : int
        Maximal number of iterations for the state-of-the-art solver.
    estimator : instance of ``sklearn.base.BaseEstimator``
        An estimator that follows the scikit-learn API.
    """
    self.X = X
    self.y = y
    self.dict_crits = {}
    self.dict_models = {}
    self.rmse = None
    self.estimator = estimator

    cv = check_cv(cv)
    for i, (idx_train, idx_val) in enumerate(cv.split(X)):
        X_train = X[idx_train, :]
        y_train = y[idx_train]
        if issparse(X_train):
            X_train = X_train.tocsc()  # TODO get rid of this
        self.dict_models[i] = Model(
            X_train, y_train, max_iter=max_iter, estimator=estimator)
        criterion = HeldOutMSE(idx_train, idx_val)
        self.dict_crits[i] = criterion
    self.n_splits = cv.n_splits
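# A minimal, runnable sketch of the cross-validation plumbing the constructor
# above relies on (nothing sparse_ho-specific here): scikit-learn's check_cv
# turns a None/int/splitter ``cv`` argument into a splitter whose .split(X)
# yields one (train, val) pair of index arrays per fold, which is exactly how
# the loop above builds one model and one HeldOutMSE criterion per split.
import numpy as np
from sklearn.model_selection import check_cv

X_demo = np.random.randn(20, 3)
cv = check_cv(5)  # an int is promoted to KFold(n_splits=5)
for i, (idx_train, idx_val) in enumerate(cv.split(X_demo)):
    print(i, idx_train.shape, idx_val.shape)
print(cv.n_splits)  # -> 5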
def test_grad_search(model, crit):
    """Check that the paths are the same in the line search."""
    if crit == 'MSE':
        n_outer = 2
        criterion = HeldOutMSE(idx_train, idx_val)
    else:
        n_outer = 2
        criterion = FiniteDiffMonteCarloSure(sigma_star)

    # TODO MM@QBE if else scheme surprising
    criterion = HeldOutMSE(idx_train, idx_val)
    monitor1 = Monitor()
    algo = Forward()
    optimizer = LineSearch(n_outer=n_outer, tol=1e-16)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor1)

    criterion = HeldOutMSE(idx_train, idx_val)
    monitor2 = Monitor()
    algo = Implicit()
    optimizer = LineSearch(n_outer=n_outer, tol=1e-16)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor2)

    criterion = HeldOutMSE(idx_train, idx_val)
    monitor3 = Monitor()
    algo = ImplicitForward(tol_jac=1e-8, n_iter_jac=5000)
    optimizer = LineSearch(n_outer=n_outer, tol=1e-16)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor3)

    np.testing.assert_allclose(
        np.array(monitor1.alphas), np.array(monitor3.alphas))
    np.testing.assert_allclose(
        np.array(monitor1.grads), np.array(monitor3.grads), rtol=1e-5)
    np.testing.assert_allclose(
        np.array(monitor1.objs), np.array(monitor3.objs))
    assert not np.allclose(
        np.array(monitor1.times), np.array(monitor3.times))
def test_monitor():
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = ImplicitForward()
    monitor = Monitor(callback=callback)
    grad_search(algo, criterion, model, X, y, np.log(alpha_max / 10),
                monitor, n_outer=10, tol=tol)
    np.testing.assert_allclose(np.array(monitor.objs), np.array(objs))
def test_val_grad():
    #######################################################################
    # Not all methods compute the full Jacobian, but all compute the
    # gradients. Check that the gradients returned by all methods are
    # the same.
    criterion = HeldOutMSE(X_val, y_val, model)
    algo = Forward()
    val_fwd, grad_fwd = criterion.get_val_grad(
        np.array([log_alpha1, log_alpha2]), algo.get_beta_jac_v, tol=tol)

    criterion = HeldOutMSE(X_val, y_val, model)
    algo = ImplicitForward(tol_jac=1e-16, n_iter_jac=5000)
    val_imp_fwd, grad_imp_fwd = criterion.get_val_grad(
        np.array([log_alpha1, log_alpha2]), algo.get_beta_jac_v, tol=tol)

    criterion = HeldOutMSE(X_val, y_val, model)
    algo = ImplicitForward(tol_jac=1e-16, n_iter_jac=5000)
    val_imp_fwd_custom, grad_imp_fwd_custom = criterion.get_val_grad(
        np.array([log_alpha1, log_alpha2]), algo.get_beta_jac_v, tol=tol)

    criterion = HeldOutMSE(X_val, y_val, model)
    algo = Implicit()
    val_imp, grad_imp = criterion.get_val_grad(
        np.array([log_alpha1, log_alpha2]), algo.get_beta_jac_v, tol=tol)

    assert np.allclose(val_fwd, val_imp_fwd)
    assert np.allclose(grad_fwd, grad_imp_fwd)
    assert np.allclose(val_imp_fwd, val_imp)
    assert np.allclose(val_imp_fwd, val_imp_fwd_custom)
    # for the implicit method the conjugate gradient does not fully
    # converge, hence the atol=1e-3
    assert np.allclose(grad_imp_fwd, grad_imp, atol=1e-3)
    assert np.allclose(grad_imp_fwd, grad_imp_fwd_custom)
def parallel_function(
        dataset_name, method, tol=1e-5, n_outer=50,
        tolerance_decrease='constant'):
    # load data
    X, y = fetch_libsvm(dataset_name)
    y -= np.mean(y)
    # compute alpha_max
    alpha_max = np.abs(X.T @ y).max() / len(y)

    if model_name == "logreg":
        alpha_max /= 2
    alpha_min = alpha_max * dict_palphamin[dataset_name]

    if model_name == "enet":
        estimator = linear_model.ElasticNet(
            fit_intercept=False, max_iter=10_000, warm_start=True, tol=tol)
        model = ElasticNet(estimator=estimator)
    elif model_name == "logreg":
        model = SparseLogreg(estimator=estimator)

    # TODO improve this
    try:
        n_outer = dict_n_outers[dataset_name, method]
    except Exception:
        n_outer = 20

    size_loop = 2
    for _ in range(size_loop):
        if model_name == "lasso" or model_name == "enet":
            sub_criterion = HeldOutMSE(None, None)
        elif model_name == "logreg":
            sub_criterion = HeldOutLogistic(None, None)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        criterion = CrossVal(sub_criterion, cv=kf)

        algo = ImplicitForward(tol_jac=1e-3)
        monitor = Monitor()
        t_max = dict_t_max[dataset_name]
        if method == 'grid_search':
            num1D = dict_point_grid_search[dataset_name]
            alpha1D = np.geomspace(alpha_max, alpha_min, num=num1D)
            alphas = [np.array(i) for i in product(alpha1D, alpha1D)]
            grid_search(
                algo, criterion, model, X, y, alpha_min, alpha_max,
                monitor, max_evals=100, tol=tol, alphas=alphas)
        elif method == 'random' or method == 'bayesian':
            hyperopt_wrapper(
                algo, criterion, model, X, y, alpha_min, alpha_max,
                monitor, max_evals=30, tol=tol, method=method,
                size_space=2, t_max=t_max)
        elif method.startswith("implicit_forward"):
            # do gradient descent to find the optimal lambda
            alpha0 = np.array([alpha_max / 100, alpha_max / 100])
            n_outer = 30
            if method == 'implicit_forward':
                optimizer = GradientDescent(
                    n_outer=n_outer, p_grad_norm=1, verbose=True, tol=tol,
                    t_max=t_max)
            else:
                optimizer = GradientDescent(
                    n_outer=n_outer, p_grad_norm=1, verbose=True, tol=tol,
                    t_max=t_max, tol_decrease="geom")
            grad_search(
                algo, criterion, model, optimizer, X, y, alpha0, monitor)
        else:
            raise NotImplementedError

    monitor.times = np.array(monitor.times)
    monitor.objs = np.array(monitor.objs)
    monitor.objs_test = 0  # TODO
    monitor.alphas = np.array(monitor.alphas)
    return (dataset_name, method, tol, n_outer, tolerance_decrease,
            monitor.times, monitor.objs, monitor.objs_test,
            monitor.alphas, alpha_max, model_name)
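# Hedged usage sketch: benchmark scripts like the one above typically fan
# parallel_function out over (dataset, method) pairs with joblib. The actual
# driver is not shown in this snippet, so the lists below are placeholders.
from itertools import product

from joblib import Parallel, delayed

datasets = ["rcv1_train"]            # placeholder
methods = ["grid_search", "random"]  # placeholder
results = Parallel(n_jobs=2)(
    delayed(parallel_function)(dataset_name, method)
    for dataset_name, method in product(datasets, methods))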
import pytest
import numpy as np

from sparse_ho.criterion import (
    HeldOutMSE, HeldOutLogistic, FiniteDiffMonteCarloSure)
from sparse_ho.utils import Monitor
from sparse_ho import Forward
from sparse_ho.tests.common import (
    X, y, sigma_star, idx_train, idx_val, models, dict_list_log_alphas)

list_model_crit = [
    ('lasso', HeldOutMSE(idx_train, idx_val)),
    ('enet', HeldOutMSE(idx_train, idx_val)),
    ('wLasso', HeldOutMSE(idx_train, idx_val)),
    ('lasso', FiniteDiffMonteCarloSure(sigma_star)),
    ('logreg', HeldOutLogistic(idx_train, idx_val))]

tol = 1e-15


@pytest.mark.parametrize('model_name,criterion', list_model_crit)
def test_cross_val_criterion(model_name, criterion):
    # verify dtype from criterion, and the good shape
    algo = Forward()
    monitor_get_val = Monitor()
    monitor_get_val_grad = Monitor()
    model = models[model_name]
    for log_alpha in dict_list_log_alphas[model_name]:
        criterion.get_val(
            model, X, y, log_alpha, monitor=monitor_get_val, tol=tol)
        criterion.get_val_grad(
            model, X, y, log_alpha, algo.compute_beta_grad,
            monitor=monitor_get_val_grad, tol=tol)
    np.testing.assert_allclose(
        np.array(monitor_get_val.objs), np.array(monitor_get_val_grad.objs))
def test_val_grad():
    #######################################################################
    # Not all methods compute the full Jacobian, but all compute the
    # gradients. Check that the gradients returned by all methods are
    # the same.
    for key in models.keys():
        log_alpha = dict_log_alpha[key]
        model = models[key]

        criterion = HeldOutMSE(X_val, y_val, model)
        algo = Forward()
        val_fwd, grad_fwd = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        criterion = HeldOutMSE(X_val, y_val, model)
        algo = ImplicitForward(tol_jac=1e-8, n_iter_jac=5000)
        val_imp_fwd, grad_imp_fwd = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        criterion = HeldOutMSE(X_val, y_val, model)
        algo = Implicit()
        val_imp, grad_imp = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        criterion = HeldOutMSE(X_val, y_val, model)
        algo = Backward()
        val_bwd, grad_bwd = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        assert np.allclose(val_fwd, val_imp_fwd)
        assert np.allclose(grad_fwd, grad_imp_fwd)
        # assert np.allclose(val_imp_fwd, val_imp)
        assert np.allclose(val_bwd, val_fwd)
        assert np.allclose(val_bwd, val_imp_fwd)
        assert np.allclose(grad_fwd, grad_bwd)
        assert np.allclose(grad_bwd, grad_imp_fwd)
        # for the implicit method the conjugate gradient does not fully
        # converge, hence the atol=1e-3
        assert np.allclose(grad_imp_fwd, grad_imp, atol=1e-3)

    for key in models.keys():
        log_alpha = dict_log_alpha[key]
        model = models[key]

        criterion = SURE(X_train, y_train, model, sigma_star)
        algo = Forward()
        val_fwd, grad_fwd = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        criterion = SURE(X_train, y_train, model, sigma_star)
        algo = ImplicitForward(tol_jac=1e-8, n_iter_jac=5000)
        val_imp_fwd, grad_imp_fwd = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        criterion = SURE(X_train, y_train, model, sigma_star)
        algo = Implicit(criterion)
        val_imp, grad_imp = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        criterion = SURE(X_train, y_train, model, sigma_star)
        algo = Backward()
        val_bwd, grad_bwd = criterion.get_val_grad(
            log_alpha, algo.get_beta_jac_v, tol=tol)

        assert np.allclose(val_fwd, val_imp_fwd)
        assert np.allclose(grad_fwd, grad_imp_fwd)
        assert np.allclose(val_imp_fwd, val_imp)
        assert np.allclose(val_bwd, val_fwd)
        assert np.allclose(val_bwd, val_imp_fwd)
        assert np.allclose(grad_fwd, grad_bwd)
        assert np.allclose(grad_bwd, grad_imp_fwd)
cv=2, n_jobs=2).fit(X_train_val, y_train_val)

# Measure mse on test
mse_cv = mean_squared_error(y_test, model_cv.predict(X_test))
print("Vanilla LassoCV: Mean-squared error on test data %f" % mse_cv)
##############################################################################

##############################################################################
# Weighted Lasso with sparse-ho.
# We use the vanilla LassoCV alpha as a starting point.
log_alpha0 = np.log(model_cv.alpha_) * np.ones(X_train.shape[1])

# Weighted Lasso: sparse-ho: 1 param per feature
estimator = Lasso(fit_intercept=False, max_iter=10, warm_start=True)
model = WeightedLasso(X_train, y_train, estimator=estimator)
criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
algo = ImplicitForward()
monitor = Monitor()
grad_search(algo, criterion, log_alpha0, monitor, n_outer=20, tol=1e-6)
##############################################################################

##############################################################################
# MSE on validation set
mse_sho_val = mean_squared_error(y_val, estimator.predict(X_val))

# MSE on test set, ie unseen data
mse_sho_test = mean_squared_error(y_test, estimator.predict(X_test))

print("Sparse-ho: Mean-squared error on validation data %f" % mse_sho_val)
print("Sparse-ho: Mean-squared error on test (unseen) data %f" % mse_sho_test)
cv=2, n_jobs=2).fit(X, y)

# Measure mse on test
mse_cv = mean_squared_error(y_test, model_cv.predict(X_test))
print("Vanilla LassoCV: Mean-squared error on test data %f" % mse_cv)
##############################################################################

##############################################################################
# Weighted Lasso with sparse-ho.
# We use the vanilla LassoCV alpha as a starting point.
log_alpha0 = np.log(model_cv.alpha_) * np.ones(n_features)

# Weighted Lasso: sparse-ho: 1 param per feature
estimator = Lasso(fit_intercept=False, max_iter=10, warm_start=True)
model = WeightedLasso(estimator=estimator)
criterion = HeldOutMSE(idx_train, idx_val)
algo = ImplicitForward()
monitor = Monitor()
grad_search(
    algo, criterion, model, X, y, log_alpha0, monitor, n_outer=20, tol=1e-6)
##############################################################################

##############################################################################
# MSE on validation set
def parallel_function(
        dataset_name, method, tol=1e-5, n_outer=50,
        tolerance_decrease='exponential'):
    # load data
    X_train, X_val, X_test, y_train, y_val, y_test = get_data(dataset_name)
    n_samples, n_features = X_train.shape
    print('n_samples', n_samples)
    print('n_features', n_features)
    y_train[y_train == 0.0] = -1.0
    y_val[y_val == 0.0] = -1.0
    y_test[y_test == 0.0] = -1.0

    alpha_max = np.max(np.abs(X_train.T @ y_train))
    alpha_max /= X_train.shape[0]
    log_alpha_max = np.log(alpha_max)
    alpha_min = alpha_max * 1e-2

    log_alpha1_0 = np.log(0.1 * alpha_max)
    log_alpha2_0 = np.log(0.1 * alpha_max)

    n_outer = 25
    size_loop = 2
    model = ElasticNet(
        X_train, y_train, log_alpha1_0, log_alpha2_0, log_alpha_max,
        max_iter=1000, tol=tol)
    for i in range(size_loop):
        monitor = Monitor()

        if method == "implicit_forward":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
            algo = ImplicitForward(criterion, tol_jac=1e-3, n_iter_jac=100)
            _, _, _ = grad_search(
                algo=algo, verbose=False,
                log_alpha0=np.array([log_alpha1_0, log_alpha2_0]), tol=tol,
                n_outer=n_outer, monitor=monitor,
                t_max=dict_t_max[dataset_name],
                tolerance_decrease=tolerance_decrease)
        elif method == "forward":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
            algo = Forward(criterion)
            _, _, _ = grad_search(
                algo=algo,
                log_alpha0=np.array([log_alpha1_0, log_alpha2_0]), tol=tol,
                n_outer=n_outer, monitor=monitor,
                t_max=dict_t_max[dataset_name],
                tolerance_decrease=tolerance_decrease)
        elif method == "implicit":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
            algo = Implicit(criterion)
            _, _, _ = grad_search(
                algo=algo,
                log_alpha0=np.array([log_alpha1_0, log_alpha2_0]), tol=tol,
                n_outer=n_outer, monitor=monitor,
                t_max=dict_t_max[dataset_name],
                tolerance_decrease=tolerance_decrease)
        elif method == "grid_search":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
            algo = Forward(criterion)
            log_alpha_min = np.log(alpha_min)
            log_alpha_opt, min_g_func = grid_search(
                algo, log_alpha_min, log_alpha_max, monitor, max_evals=10,
                tol=tol, samp="grid", t_max=dict_t_max[dataset_name],
                log_alphas=None, nb_hyperparam=2)
            print(log_alpha_opt)
        elif method == "random":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
            algo = Forward(criterion)
            log_alpha_min = np.log(alpha_min)
            log_alpha_opt, min_g_func = grid_search(
                algo, log_alpha_min, np.log(alpha_max), monitor,
                max_evals=10, tol=tol, samp="random",
                t_max=dict_t_max[dataset_name], nb_hyperparam=2)
            print(log_alpha_opt)
        elif method == "lhs":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
            algo = Forward(criterion)
            log_alpha_min = np.log(alpha_min)
            log_alpha_opt, min_g_func = grid_search(
                algo, log_alpha_min, np.log(alpha_max), monitor,
                max_evals=10, tol=tol, samp="lhs",
                t_max=dict_t_max[dataset_name])
            print(log_alpha_opt)

    monitor.times = np.array(monitor.times).copy()
    monitor.objs = np.array(monitor.objs).copy()
    monitor.objs_test = np.array(monitor.objs_test).copy()
    monitor.log_alphas = np.array(monitor.log_alphas).copy()
    return (dataset_name, method, tol, n_outer, tolerance_decrease,
            monitor.times, monitor.objs, monitor.objs_test,
            monitor.log_alphas, norm(y_val), norm(y_test), log_alpha_max)
def parallel_function(name_model, div_alpha):
    index_col = np.arange(10)
    alpha_max = (np.abs(
        X[np.ix_(idx_train, index_col)].T
        @ y[idx_train])).max() / len(idx_train)
    if name_model == "lasso":
        log_alpha = np.log(alpha_max / div_alpha)
    elif name_model == "enet":
        alpha0 = alpha_max / div_alpha
        alpha1 = (1 - l1_ratio) * alpha0 / l1_ratio
        log_alpha = np.log(np.array([alpha0, alpha1]))

    # one call of each method outside the timed loop
    # (results unused; likely a warm-up)
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = Forward()
    monitor = Monitor()
    val, grad = criterion.get_val_grad(
        dict_models[name_model], X[:, index_col], y, log_alpha,
        algo.compute_beta_grad, tol=tol, monitor=monitor)

    criterion = HeldOutMSE(idx_train, idx_val)
    algo = Backward()
    monitor = Monitor()
    val, grad = criterion.get_val_grad(
        dict_models[name_model], X[:, index_col], y, log_alpha,
        algo.compute_beta_grad, tol=tol, monitor=monitor)

    val_cvxpy, grad_cvxpy = dict_cvxpy[name_model](
        X[:, index_col], y, np.exp(log_alpha), idx_train, idx_val)

    list_times_fwd = []
    list_times_bwd = []
    list_times_cvxpy = []
    for n_col in dict_ncols[div_alpha]:
        temp_fwd = []
        temp_bwd = []
        temp_cvxpy = []
        for i in range(repeat):
            rng = np.random.RandomState(i)
            index_col = rng.choice(n_features, n_col, replace=False)
            alpha_max = (np.abs(
                X[np.ix_(idx_train, index_col)].T
                @ y[idx_train])).max() / len(idx_train)
            if name_model == "lasso":
                log_alpha = np.log(alpha_max / div_alpha)
            elif name_model == "enet":
                alpha0 = alpha_max / div_alpha
                alpha1 = (1 - l1_ratio) * alpha0 / l1_ratio
                log_alpha = np.log(np.array([alpha0, alpha1]))

            criterion = HeldOutMSE(idx_train, idx_val)
            algo = Forward()
            monitor = Monitor()
            val, grad = criterion.get_val_grad(
                dict_models[name_model], X[:, index_col], y, log_alpha,
                algo.compute_beta_grad, tol=tol, monitor=monitor)
            temp_fwd.append(monitor.times)

            criterion = HeldOutMSE(idx_train, idx_val)
            algo = Backward()
            monitor = Monitor()
            val, grad = criterion.get_val_grad(
                dict_models[name_model], X[:, index_col], y, log_alpha,
                algo.compute_beta_grad, tol=tol, monitor=monitor)
            temp_bwd.append(monitor.times)

            t0 = time.time()
            val_cvxpy, grad_cvxpy = dict_cvxpy[name_model](
                X[:, index_col], y, np.exp(log_alpha), idx_train, idx_val)
            temp_cvxpy.append(time.time() - t0)
            print(np.abs(grad - grad_cvxpy * np.exp(log_alpha)))

        list_times_fwd.append(np.mean(np.array(temp_fwd)))
        list_times_bwd.append(np.mean(np.array(temp_bwd)))
        list_times_cvxpy.append(np.mean(np.array(temp_cvxpy)))

    np.save("results/times_%s_forward_%s" % (name_model, div_alpha),
            list_times_fwd)
    np.save("results/times_%s_backward_%s" % (name_model, div_alpha),
            list_times_bwd)
    np.save("results/times_%s_cvxpy_%s" % (name_model, div_alpha),
            list_times_cvxpy)
    np.save("results/nfeatures_%s_%s" % (name_model, div_alpha),
            dict_ncols[div_alpha])
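# Hedged companion sketch: reading back the timing arrays saved above.
# np.save appends the ".npy" extension automatically, so the load paths
# simply mirror the save calls; name_model and div_alpha are assumed to be
# the same values used when saving.
import numpy as np

times_fwd = np.load(
    "results/times_%s_forward_%s.npy" % (name_model, div_alpha))
times_bwd = np.load(
    "results/times_%s_backward_%s.npy" % (name_model, div_alpha))
times_cvxpy = np.load(
    "results/times_%s_cvxpy_%s.npy" % (name_model, div_alpha))
n_cols = np.load("results/nfeatures_%s_%s.npy" % (name_model, div_alpha))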
estimator = linear_model.ElasticNet(
    fit_intercept=False, max_iter=1000, warm_start=True, tol=tol)

dict_monitor = {}

all_algo_name = ['grid_search']
# , 'implicit_forward', "implicit_forward_approx", 'bayesian']
# , 'random_search']
# all_algo_name = ['random_search']

for algo_name in all_algo_name:
    model = ElasticNet(estimator=estimator)
    sub_criterion = HeldOutMSE(None, None)
    alpha0 = np.array([alpha_max / 10, alpha_max / 10])
    monitor = Monitor()
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    criterion = CrossVal(sub_criterion, cv=kf)
    algo = ImplicitForward(tol_jac=1e-3)
    # optimizer = LineSearch(n_outer=10, tol=tol)
    if algo_name.startswith('implicit_forward'):
        if algo_name == "implicit_forward_approx":
            optimizer = GradientDescent(
                n_outer=30, p_grad_norm=1., verbose=True, tol=tol,
                tol_decrease="geom")
        else:
            optimizer = GradientDescent(
                n_outer=30, p_grad_norm=1., verbose=True, tol=tol)
def parallel_function(
        dataset_name, method, tol=1e-5, n_outer=50,
        tolerance_decrease='constant'):
    # load data
    X_train, X_val, X_test, y_train, y_val, y_test = get_data(dataset_name)
    n_samples, _ = X_train.shape
    # compute alpha_max
    alpha_max = np.abs(X_train.T @ y_train).max() / n_samples

    if model_name == "logreg":
        alpha_max /= 2
    alpha_min = alpha_max / 10_000
    log_alpha_max = np.log(alpha_max)
    log_alpha_min = np.log(alpha_min)
    log_alpha0 = np.log(0.1 * alpha_max)

    if model_name == "lasso":
        model = Lasso(X_train, y_train)
    elif model_name == "logreg":
        model = SparseLogreg(X_train, y_train)

    try:
        n_outer = dict_n_outers[dataset_name, method]
    except Exception:
        n_outer = 20

    size_loop = 2
    for _ in range(size_loop):
        if model_name == "lasso":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
        elif model_name == "logreg":
            criterion = HeldOutLogistic(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
        algo = dict_algo[method](criterion)
        monitor = Monitor()
        if method == 'grid_search':
            log_alphas = np.log(np.geomspace(alpha_max, alpha_min, num=100))
            grid_search(
                algo, None, None, monitor, log_alphas=log_alphas, tol=tol)
        elif method == 'random':
            grid_search(
                algo, log_alpha_max, log_alpha_min, monitor, tol=tol,
                max_evals=n_alphas, t_max=dict_t_max[dataset_name])
        elif method == "bayesian":
            hyperopt_wrapper(
                algo, log_alpha_min, log_alpha_max, monitor,
                max_evals=n_alphas, tol=tol, method='bayesian',
                t_max=dict_t_max[dataset_name])
        else:
            # do line search to find the optimal lambda
            grad_search(
                algo, log_alpha0, monitor, n_outer=n_outer, tol=tol,
                tolerance_decrease=tolerance_decrease,
                t_max=dict_t_max[dataset_name])

    monitor.times = np.array(monitor.times)
    monitor.objs = np.array(monitor.objs)
    monitor.objs_test = np.array(monitor.objs_test)
    monitor.log_alphas = np.array(monitor.log_alphas)
    return (dataset_name, method, tol, n_outer, tolerance_decrease,
            monitor.times, monitor.objs, monitor.objs_test,
            monitor.log_alphas, norm(y_val), norm(y_test), log_alpha_max,
            model_name)
def parallel_function(dataset_name, div_alpha, method):
    X, y = fetch_libsvm(dataset_name)
    n_samples = len(y)
    if dataset_name == "news20" and div_alpha == 100:
        rng = np.random.RandomState(42)
        y += rng.randn(n_samples) * 0.01
    for maxit in dict_maxits[(dataset_name, div_alpha)]:
        print("Dataset %s, method %s, maxit %i" % (
            dataset_name, method, maxit))
        for i in range(2):
            rng = np.random.RandomState(i)
            idx_train = rng.choice(n_samples, n_samples // 2, replace=False)
            idx = np.arange(0, n_samples)
            idx_val = idx[np.logical_not(np.isin(idx, idx_train))]
            alpha_max = np.max(np.abs(X[idx_train, :].T.dot(y[idx_train])))
            alpha_max /= len(idx_train)
            log_alpha = np.log(alpha_max / div_alpha)
            monitor = Monitor()
            if method == "celer":
                clf = Lasso_celer(
                    alpha=np.exp(log_alpha), fit_intercept=False,
                    tol=1e-12, max_iter=maxit)
                model = Lasso(estimator=clf, max_iter=maxit)
                criterion = HeldOutMSE(idx_train, idx_val)
                algo = ImplicitForward(
                    tol_jac=1e-32, n_iter_jac=maxit, use_stop_crit=False)
                algo.max_iter = maxit
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.compute_beta_grad,
                    tol=1e-12, monitor=monitor, max_iter=maxit)
            elif method == "ground_truth":
                for file in os.listdir("results/"):
                    if file.startswith("hypergradient_%s_%i_%s" % (
                            dataset_name, div_alpha, method)):
                        return
                clf = Lasso_celer(
                    alpha=np.exp(log_alpha), fit_intercept=False,
                    warm_start=True, tol=1e-14, max_iter=10000)
                criterion = HeldOutMSE(idx_train, idx_val)
                if dataset_name == "news20":
                    algo = ImplicitForward(tol_jac=1e-11, n_iter_jac=100000)
                else:
                    algo = Implicit(criterion)
                model = Lasso(estimator=clf, max_iter=10000)
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.compute_beta_grad,
                    tol=1e-14, monitor=monitor)
            else:
                model = Lasso(max_iter=maxit)
                criterion = HeldOutMSE(idx_train, idx_val)
                if method == "forward":
                    algo = Forward(use_stop_crit=False)
                elif method == "implicit_forward":
                    algo = ImplicitForward(
                        tol_jac=1e-8, n_iter_jac=maxit, use_stop_crit=False)
                elif method == "implicit":
                    algo = Implicit(max_iter=1000)
                elif method == "backward":
                    algo = Backward()
                else:
                    raise NotImplementedError
                algo.max_iter = maxit
                algo.use_stop_crit = False
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.compute_beta_grad,
                    tol=tol, monitor=monitor, max_iter=maxit)

            results = (
                dataset_name, div_alpha, method, maxit, val, grad,
                monitor.times[0])
            df = pandas.DataFrame(results).transpose()
            df.columns = [
                'dataset', 'div_alpha', 'method', 'maxit', 'val', 'grad',
                'time']
            str_results = "results/hypergradient_%s_%i_%s_%i.pkl" % (
                dataset_name, div_alpha, method, maxit)
            df.to_pickle(str_results)
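# Hedged companion sketch: each call above pickles a one-row DataFrame per
# (dataset, div_alpha, method, maxit) combination; the results can be
# gathered back with pandas.read_pickle and concatenated for analysis.
# The glob pattern below assumes the naming scheme used in str_results.
import glob

import pandas

frames = [pandas.read_pickle(f)
          for f in glob.glob("results/hypergradient_*.pkl")]
all_results = pandas.concat(frames, ignore_index=True)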
def test_grid_search():
    max_evals = 5

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    alpha_opt_grid, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = HeldOutMSE(idx_train, idx_val)
    alpha_opt_random, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    np.testing.assert_allclose(
        monitor_random.alphas[np.argmin(monitor_random.objs)],
        alpha_opt_random)
    np.testing.assert_allclose(
        monitor_grid.alphas[np.argmin(monitor_grid.objs)], alpha_opt_grid)

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = FiniteDiffMonteCarloSure(sigma=sigma_star)
    alpha_opt_grid, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = FiniteDiffMonteCarloSure(sigma=sigma_star)
    alpha_opt_random, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    np.testing.assert_allclose(
        monitor_random.alphas[np.argmin(monitor_random.objs)],
        alpha_opt_random)
    np.testing.assert_allclose(
        monitor_grid.alphas[np.argmin(monitor_grid.objs)], alpha_opt_grid)
def parallel_function(
        dataset_name, div_alpha, method, ind_rep, random_state=10):
    maxit = dict_maxits[(dataset_name, div_alpha)][ind_rep]
    print("Dataset %s, algo %s, maxit %i" % (dataset_name, method, maxit))
    X, y = fetch_libsvm(dataset_name)
    n_samples = len(y)
    kf = KFold(n_splits=5, random_state=random_state, shuffle=True)

    for i in range(2):
        alpha_max = np.max(np.abs(X.T.dot(y))) / n_samples
        log_alpha = np.log(alpha_max / div_alpha)
        monitor = Monitor()
        if method == "celer":
            clf = Lasso_celer(
                alpha=np.exp(log_alpha), fit_intercept=False,
                # TODO maybe change this tol
                tol=1e-8, max_iter=maxit)
            model = Lasso(estimator=clf, max_iter=maxit)
            criterion = HeldOutMSE(None, None)
            cross_val = CrossVal(cv=kf, criterion=criterion)
            algo = ImplicitForward(
                tol_jac=1e-8, n_iter_jac=maxit, use_stop_crit=False)
            algo.max_iter = maxit
            val, grad = cross_val.get_val_grad(
                model, X, y, log_alpha, algo.get_beta_jac_v, tol=tol,
                monitor=monitor, max_iter=maxit)
        elif method == "ground_truth":
            for file in os.listdir("results/"):
                if file.startswith("hypergradient_%s_%i_%s" % (
                        dataset_name, div_alpha, method)):
                    return
            clf = Lasso_celer(
                alpha=np.exp(log_alpha), fit_intercept=False,
                warm_start=True, tol=1e-13, max_iter=10000)
            criterion = HeldOutMSE(None, None)
            cross_val = CrossVal(cv=kf, criterion=criterion)
            algo = Implicit(criterion)
            model = Lasso(estimator=clf, max_iter=10000)
            val, grad = cross_val.get_val_grad(
                model, X, y, log_alpha, algo.get_beta_jac_v, tol=1e-13,
                monitor=monitor)
        else:
            model = Lasso(max_iter=maxit)
            criterion = HeldOutMSE(None, None)
            cross_val = CrossVal(cv=kf, criterion=criterion)
            if method == "forward":
                algo = Forward(use_stop_crit=False)
            elif method == "implicit_forward":
                algo = ImplicitForward(
                    use_stop_crit=False, tol_jac=1e-8, n_iter_jac=maxit,
                    max_iter=1000)
            elif method == "implicit":
                algo = Implicit(use_stop_crit=False, max_iter=1000)
            elif method == "backward":
                algo = Backward()
            else:
                raise NotImplementedError
            algo.max_iter = maxit
            algo.use_stop_crit = False
            val, grad = cross_val.get_val_grad(
                model, X, y, log_alpha, algo.get_beta_jac_v, tol=tol,
                monitor=monitor, max_iter=maxit)

        results = (
            dataset_name, div_alpha, method, maxit, val, grad,
            monitor.times[0])
        df = pandas.DataFrame(results).transpose()
        df.columns = [
            'dataset', 'div_alpha', 'method', 'maxit', 'val', 'grad', 'time']
        str_results = "results/hypergradient_%s_%i_%s_%i.pkl" % (
            dataset_name, div_alpha, method, maxit)
        df.to_pickle(str_results)
dict_res = {}
for maxit in maxits:
    for method in methods:
        print("Method %s, maxit %i" % (method, maxit))
        for i in range(2):
            alpha_max = np.max(np.abs(X.T.dot(y))) / n_samples
            log_alpha = np.log(alpha_max * p_alpha_max)
            monitor = Monitor()
            if method == "celer":
                clf = Lasso_celer(
                    alpha=np.exp(log_alpha), fit_intercept=False,
                    tol=1e-12, max_iter=maxit)
                model = Lasso(estimator=clf, max_iter=maxit)
                criterion = HeldOutMSE(idx_train, idx_val)
                algo = ImplicitForward(
                    tol_jac=1e-32, n_iter_jac=maxit, use_stop_crit=False)
                algo.max_iter = maxit
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.get_beta_jac_v,
                    tol=1e-12, monitor=monitor, max_iter=maxit)
            else:
                model = Lasso(max_iter=maxit)
                criterion = HeldOutMSE(idx_train, idx_val)
def test_grid_search():
    max_evals = 5

    monitor_grid = Monitor()
    model = Lasso(X_train, y_train, estimator=estimator)
    criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    assert (monitor_random.log_alphas[
        np.argmin(monitor_random.objs)] == log_alpha_opt_random)
    assert (monitor_grid.log_alphas[
        np.argmin(monitor_grid.objs)] == log_alpha_opt_grid)

    monitor_grid = Monitor()
    model = Lasso(X_train, y_train, estimator=estimator)
    criterion = SURE(
        X_train, y_train, model, sigma=sigma_star, X_test=X_test,
        y_test=y_test)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = SURE(
        X_train, y_train, model, sigma=sigma_star, X_test=X_test,
        y_test=y_test)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    assert (monitor_random.log_alphas[
        np.argmin(monitor_random.objs)] == log_alpha_opt_random)
    assert (monitor_grid.log_alphas[
        np.argmin(monitor_grid.objs)] == log_alpha_opt_grid)