def test_grid_search():
    max_evals = 5

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_grid, max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_random, max_evals=max_evals, tol=1e-5, samp="random")

    assert (monitor_random.log_alphas[np.argmin(monitor_random.objs)]
            == log_alpha_opt_random)
    assert (monitor_grid.log_alphas[np.argmin(monitor_grid.objs)]
            == log_alpha_opt_grid)

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = SmoothedSURE(sigma=sigma_star)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_grid, max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = SmoothedSURE(sigma=sigma_star)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, model, X, y, log_alpha_min, log_alpha_max,
        monitor_random, max_evals=max_evals, tol=1e-5, samp="random")

    assert (monitor_random.log_alphas[np.argmin(monitor_random.objs)]
            == log_alpha_opt_random)
    assert (monitor_grid.log_alphas[np.argmin(monitor_grid.objs)]
            == log_alpha_opt_grid)
def test_monitor():
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = ImplicitForward()
    monitor = Monitor(callback=callback)
    optimizer = LineSearch(n_outer=10, tol=tol)
    grad_search(algo, criterion, model, optimizer, X, y, alpha0, monitor)

    np.testing.assert_allclose(np.array(monitor.objs), np.array(objs))
def test_monitor():
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    algo = ImplicitForward()
    monitor = Monitor(callback=callback)
    grad_search(algo, criterion, model, X, y, np.log(alpha_max / 10),
                monitor, n_outer=10, tol=tol)

    np.testing.assert_allclose(np.array(monitor.objs), np.array(objs))
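# The two test_monitor variants above rely on a module-level `objs` list and
# `callback` that are not shown here. A minimal sketch of what they might
# look like, assuming the Monitor forwards the objective value as the first
# callback argument (the exact callback signature is an assumption, hence
# the *args/**kwargs):
objs = []


def callback(obj, *args, **kwargs):
    # hypothetical: record every objective value the Monitor sees, so the
    # assertion `monitor.objs == objs` holds
    objs.append(obj)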
def parallel_function(dataset_name, div_alpha, method):
    X, y = fetch_libsvm(dataset_name)
    n_samples = len(y)
    if dataset_name == "news20" and div_alpha == 100:
        rng = np.random.RandomState(42)
        y += rng.randn(n_samples) * 0.01
    for maxit in dict_maxits[(dataset_name, div_alpha)]:
        print("Dataset %s, maxit %i" % (dataset_name, maxit))
        for i in range(2):
            rng = np.random.RandomState(i)
            idx_train = rng.choice(n_samples, n_samples // 2, replace=False)
            idx = np.arange(0, n_samples)
            idx_val = idx[np.logical_not(np.isin(idx, idx_train))]
            alpha_max = np.max(np.abs(X[idx_train, :].T.dot(y[idx_train])))
            alpha_max /= len(idx_train)
            log_alpha = np.log(alpha_max / div_alpha)
            monitor = Monitor()
            if method == "celer":
                clf = Lasso_celer(
                    alpha=np.exp(log_alpha), fit_intercept=False,
                    tol=1e-12, max_iter=maxit)
                model = Lasso(estimator=clf, max_iter=maxit)
                criterion = HeldOutMSE(idx_train, idx_val)
                algo = ImplicitForward(
                    tol_jac=1e-32, n_iter_jac=maxit, use_stop_crit=False)
                algo.max_iter = maxit
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.compute_beta_grad,
                    tol=1e-12, monitor=monitor, max_iter=maxit)
            elif method == "ground_truth":
                for file in os.listdir("results/"):
                    if file.startswith("hypergradient_%s_%i_%s" % (
                            dataset_name, div_alpha, method)):
                        return
                clf = Lasso_celer(
                    alpha=np.exp(log_alpha), fit_intercept=False,
                    warm_start=True, tol=1e-14, max_iter=10000)
                criterion = HeldOutMSE(idx_train, idx_val)
                if dataset_name == "news20":
                    algo = ImplicitForward(tol_jac=1e-11, n_iter_jac=100000)
                else:
                    algo = Implicit(criterion)
                model = Lasso(estimator=clf, max_iter=10000)
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.compute_beta_grad,
                    tol=1e-14, monitor=monitor)
            else:
                model = Lasso(max_iter=maxit)
                criterion = HeldOutMSE(idx_train, idx_val)
                if method == "forward":
                    algo = Forward(use_stop_crit=False)
                elif method == "implicit_forward":
                    algo = ImplicitForward(
                        tol_jac=1e-8, n_iter_jac=maxit, use_stop_crit=False)
                elif method == "implicit":
                    algo = Implicit(max_iter=1000)
                elif method == "backward":
                    algo = Backward()
                else:
                    raise NotImplementedError
                algo.max_iter = maxit
                algo.use_stop_crit = False
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.compute_beta_grad,
                    tol=tol, monitor=monitor, max_iter=maxit)

            results = (dataset_name, div_alpha, method, maxit, val, grad,
                       monitor.times[0])
            df = pandas.DataFrame(results).transpose()
            df.columns = [
                'dataset', 'div_alpha', 'method', 'maxit', 'val', 'grad',
                'time']
            str_results = "results/hypergradient_%s_%i_%s_%i.pkl" % (
                dataset_name, div_alpha, method, maxit)
            df.to_pickle(str_results)
log_alphas = np.log(alphas)
tol = 1e-7

# grid search
# model = Lasso(X_train, y_train, np.log(alpha_max/10))
# criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
# algo = Forward(criterion)
# monitor_grid_sk = Monitor()
# grid_search(
#     algo, None, None, monitor_grid_sk, log_alphas=log_alphas, tol=tol)
# np.save("p_alphas.npy", p_alphas)
# objs = np.array(monitor_grid_sk.objs)
# np.save("objs.npy", objs)

# grad_search
estimator = linear_model.Lasso(fit_intercept=False, warm_start=True)
model = Lasso(X_train, y_train, np.log(alpha_max / 10), estimator=estimator)
criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
algo = ImplicitForward(criterion)
monitor_grad = Monitor()
grad_search(algo, np.log(alpha_max / 10), monitor_grad, n_outer=10, tol=tol)

p_alphas_grad = np.exp(np.array(monitor_grad.log_alphas)) / alpha_max
np.save("p_alphas_grad.npy", p_alphas_grad)
objs_grad = np.array(monitor_grad.objs)
np.save("objs_grad.npy", objs_grad)
alpha_max = (np.abs(X[idx_train, :].T @ y[idx_train])).max() / n_samples
p_alpha = 0.9
alpha = p_alpha * alpha_max
log_alpha = np.log(alpha)
log_alphas = np.log(alpha_max * np.geomspace(1, 0.1))
tol = 1e-16

dict_log_alpha = {}
dict_log_alpha["lasso"] = log_alpha
tab = np.linspace(1, 1000, n_features)
dict_log_alpha["wlasso"] = log_alpha + np.log(tab / tab.max())

models = {}
models["lasso"] = Lasso(estimator=None)
models["wlasso"] = WeightedLasso(estimator=None)


def get_v(mask, dense):
    return 2 * (X[np.ix_(idx_val, mask)].T @ (
        X[np.ix_(idx_val, mask)] @ dense - y[idx_val])) / len(idx_val)


def test_beta_jac():
    #########################################################################
    # check that the methods computing the full Jacobian compute the same sol
    # maybe we could add a test comparing with sklearn
    for key in models.keys():
        supp1, dense1, jac1 = get_beta_jac_iterdiff(
            X[idx_train, :], y[idx_train],
def linear_cv(dataset_name, tol=1e-3, compute_jac=True, model_name="lasso"):
    X, y = load_libsvm(dataset_name)
    X = csc_matrix(X)
    n_samples, n_features = X.shape
    p_alpha = p_alphas[dataset_name, model_name]

    max_iter = max_iters[dataset_name]
    if model_name == "lasso":
        model = Lasso(X, y, 0, max_iter=max_iter, tol=tol)
    elif model_name == "logreg":
        model = SparseLogreg(X, y, 0, max_iter=max_iter, tol=tol)

    alpha_max = np.exp(model.compute_alpha_max())
    alpha = p_alpha * alpha_max

    if model_name == "lasso":
        clf = Lasso_cel(
            alpha=alpha, fit_intercept=False, warm_start=True,
            tol=tol * norm(y) ** 2 / 2, max_iter=10000)
        clf.fit(X, y)
        beta_star = clf.coef_
        mask = beta_star != 0
        dense = beta_star[mask]
    elif model_name == "logreg":
        # clf = LogisticRegression(
        #     penalty='l1', C=(1 / (alpha * n_samples)),
        #     fit_intercept=False, warm_start=True, max_iter=10000,
        #     tol=tol, verbose=True).fit(X, y)
        # clf = LogisticRegression(
        #     penalty='l1', C=(1 / (alpha * n_samples)),
        #     fit_intercept=False, warm_start=True, max_iter=10000,
        #     tol=tol, verbose=True, solver='liblinear').fit(X, y)
        # beta_star = clf.coef_[0]
        blitzl1.set_use_intercept(False)
        blitzl1.set_tolerance(1e-32)
        blitzl1.set_verbose(True)
        # blitzl1.set_min_time(60)
        prob = blitzl1.LogRegProblem(X, y)
        # lammax = prob.compute_lambda_max()
        clf = prob.solve(alpha * n_samples)
        beta_star = clf.x
        mask = np.array(beta_star != 0)
        dense = beta_star[mask]

    # if model == "lasso":
    v = -n_samples * alpha * np.sign(beta_star[mask])
    mat_to_inv = model.get_hessian(mask, dense, np.log(alpha))
    # mat_to_inv = X[:, mask].T @ X[:, mask]
    jac_temp = cg(mat_to_inv, v, tol=1e-10)
    jac_star = np.zeros(n_features)
    jac_star[mask] = jac_temp[0]
    # elif model == "logreg":
    #     v = - n_samples * alpha * np.sign(beta_star[mask])

    log_alpha = np.log(alpha)

    list_beta, list_jac = get_beta_jac_iterdiff(
        X, y, log_alpha, model, save_iterates=True, tol=tol,
        max_iter=max_iter, compute_jac=compute_jac)

    diff_beta = norm(list_beta - beta_star, axis=1)
    diff_jac = norm(list_jac - jac_star, axis=1)

    supp_star = beta_star != 0
    n_iter = list_beta.shape[0]
    # find the last iterate whose support still differs from the final one
    supp_id = 0
    for i in np.arange(n_iter)[::-1]:
        supp = list_beta[i, :] != 0
        if not np.all(supp == supp_star):
            supp_id = i + 1
            break

    return dataset_name, p_alpha, diff_beta, diff_jac, n_iter, supp_id
def parallel_function(dataset_name, div_alpha, method, ind_rep,
                      random_state=10):
    maxit = dict_maxits[(dataset_name, div_alpha)][ind_rep]
    print("Dataset %s, algo %s, maxit %i" % (dataset_name, method, maxit))
    X, y = fetch_libsvm(dataset_name)
    n_samples = len(y)
    kf = KFold(n_splits=5, random_state=random_state, shuffle=True)

    for i in range(2):
        alpha_max = np.max(np.abs(X.T.dot(y))) / n_samples
        log_alpha = np.log(alpha_max / div_alpha)
        monitor = Monitor()
        if method == "celer":
            clf = Lasso_celer(
                alpha=np.exp(log_alpha), fit_intercept=False,
                # TODO maybe change this tol
                tol=1e-8, max_iter=maxit)
            model = Lasso(estimator=clf, max_iter=maxit)
            criterion = HeldOutMSE(None, None)
            cross_val = CrossVal(cv=kf, criterion=criterion)
            algo = ImplicitForward(
                tol_jac=1e-8, n_iter_jac=maxit, use_stop_crit=False)
            algo.max_iter = maxit
            val, grad = cross_val.get_val_grad(
                model, X, y, log_alpha, algo.get_beta_jac_v, tol=tol,
                monitor=monitor, max_iter=maxit)
        elif method == "ground_truth":
            for file in os.listdir("results/"):
                if file.startswith("hypergradient_%s_%i_%s" % (
                        dataset_name, div_alpha, method)):
                    return
            clf = Lasso_celer(
                alpha=np.exp(log_alpha), fit_intercept=False,
                warm_start=True, tol=1e-13, max_iter=10000)
            criterion = HeldOutMSE(None, None)
            cross_val = CrossVal(cv=kf, criterion=criterion)
            algo = Implicit(criterion)
            model = Lasso(estimator=clf, max_iter=10000)
            val, grad = cross_val.get_val_grad(
                model, X, y, log_alpha, algo.get_beta_jac_v, tol=1e-13,
                monitor=monitor)
        else:
            model = Lasso(max_iter=maxit)
            criterion = HeldOutMSE(None, None)
            cross_val = CrossVal(cv=kf, criterion=criterion)
            if method == "forward":
                algo = Forward(use_stop_crit=False)
            elif method == "implicit_forward":
                algo = ImplicitForward(
                    use_stop_crit=False, tol_jac=1e-8, n_iter_jac=maxit,
                    max_iter=1000)
            elif method == "implicit":
                algo = Implicit(use_stop_crit=False, max_iter=1000)
            elif method == "backward":
                algo = Backward()
            else:
                raise NotImplementedError
            algo.max_iter = maxit
            algo.use_stop_crit = False
            val, grad = cross_val.get_val_grad(
                model, X, y, log_alpha, algo.get_beta_jac_v, tol=tol,
                monitor=monitor, max_iter=maxit)

        results = (dataset_name, div_alpha, method, maxit, val, grad,
                   monitor.times[0])
        df = pandas.DataFrame(results).transpose()
        df.columns = [
            'dataset', 'div_alpha', 'method', 'maxit', 'val', 'grad', 'time']
        str_results = "results/hypergradient_%s_%i_%s_%i.pkl" % (
            dataset_name, div_alpha, method, maxit)
        df.to_pickle(str_results)
import time

import numpy as np

from andersoncd.data.real import get_gina_agnostic
from sparse_ho import Forward, Backward
from sparse_ho.models import Lasso, ElasticNet
from sparse_ho.tests.cvxpylayer import lasso_cvxpy, enet_cvxpy
from sparse_ho.criterion import HeldOutMSE

X, y = get_gina_agnostic(normalize_y=False)
n_samples, n_features = X.shape
idx_train = np.arange(0, n_samples // 2)
idx_val = np.arange(n_samples // 2, n_samples)

name_models = ["lasso", "enet"]
dict_models = {}
dict_models["lasso"] = Lasso()
dict_models["enet"] = ElasticNet()

dict_cvxpy = {}
dict_cvxpy["lasso"] = lasso_cvxpy
dict_cvxpy["enet"] = enet_cvxpy

dict_ncols = {}
dict_ncols[10] = np.geomspace(100, n_features, num=10, dtype=int)
dict_ncols[100] = np.geomspace(100, n_features, num=10, dtype=int)

tol = 1e-6
l1_ratio = 0.8
repeat = 10
div_alphas = [10, 100]
alpha_max = np.max(np.abs(X[idx_train, :].T @ y[idx_train])) / len(idx_train)
p_alpha = 0.7
alpha0 = p_alpha * alpha_max
# log_alpha = np.log(alpha)
log_alphas = np.log(alpha_max * np.geomspace(1, 0.1))
tol = 1e-16
max_iter = 1000

# dict_log_alpha0 = {}
# dict_log_alpha0["lasso"] = log_alpha
# tab = np.linspace(1, 1000, n_features)
# dict_log_alpha0["wlasso"] = log_alpha + np.log(tab / tab.max())

models = [
    Lasso(max_iter=max_iter, estimator=None),
]

estimator = linear_model.Lasso(
    fit_intercept=False, max_iter=1000, warm_start=True)
models_custom = [
    Lasso(max_iter=max_iter, estimator=estimator),
]


@pytest.mark.parametrize('model', models)
@pytest.mark.parametrize('crit', ['MSE', 'sure'])
def test_grad_search(model, crit):
    """check that the paths are the same in the line search"""
    if crit == 'MSE':
X_test_s = csc_matrix(X_test)

alpha_max = (X_train.T @ y_train).max() / n_samples
p_alpha = 0.7
alpha = p_alpha * alpha_max
log_alpha = np.log(alpha)
log_alphas = np.log(alpha_max * np.geomspace(1, 0.1))
tol = 1e-16

dict_log_alpha = {}
dict_log_alpha["lasso"] = log_alpha
tab = np.linspace(1, 1000, n_features)
dict_log_alpha["wlasso"] = log_alpha + np.log(tab / tab.max())

models = [Lasso(X_train, y_train, dict_log_alpha["lasso"])]


def test_cross_val_criterion():
    alpha_min = alpha_max / 10
    log_alpha_max = np.log(alpha_max)
    log_alpha_min = np.log(alpha_min)
    max_iter = 10000
    n_alphas = 10
    kf = KFold(n_splits=5, shuffle=True, random_state=56)

    estimator = sklearn.linear_model.Lasso(
        fit_intercept=False, max_iter=1000, warm_start=True)
    monitor_grid = Monitor()
    criterion = CrossVal(X, y, Lasso, cv=kf, estimator=estimator)
alpha_max = np.max(np.abs(X[idx_train, :].T @ y[idx_train])) / len(idx_train)
p_alpha = 0.7
alpha0 = p_alpha * alpha_max
# log_alpha = np.log(alpha)
log_alphas = np.log(alpha_max * np.geomspace(1, 0.1))
tol = 1e-16
max_iter = 1000

# dict_log_alpha0 = {}
# dict_log_alpha0["lasso"] = log_alpha
# tab = np.linspace(1, 1000, n_features)
# dict_log_alpha0["wlasso"] = log_alpha + np.log(tab / tab.max())

models = [
    Lasso(estimator=None),
]

estimator = linear_model.Lasso(
    fit_intercept=False, max_iter=1000, warm_start=True)
models_custom = [
    Lasso(estimator=estimator),
]

Optimizers = [LineSearch, GradientDescent]


@pytest.mark.parametrize('Optimizer', Optimizers)
@pytest.mark.parametrize('model', models)
@pytest.mark.parametrize('crit', ['MSE', 'sure'])
tol = 1e-7
max_iter = 1e5

##############################################################################
# Grid-search with scikit-learn
# -----------------------------

estimator = linear_model.Lasso(
    fit_intercept=False, max_iter=1000, warm_start=True)

print('scikit-learn started')

t0 = time.time()
model = Lasso(X_train, y_train, estimator=estimator)
criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
algo = Forward(criterion)
monitor_grid_sk = Monitor()
grid_search(
    algo, criterion, None, None, monitor_grid_sk, log_alphas=log_alphas,
    tol=tol)
objs = np.array(monitor_grid_sk.objs)
t_sk = time.time() - t0

print('scikit-learn finished')
def parallel_function(dataset_name, method, tol=1e-5, n_outer=50,
                      tolerance_decrease='constant'):

    # load data
    X_train, X_val, X_test, y_train, y_val, y_test = get_data(dataset_name)
    n_samples, _ = X_train.shape
    # compute alpha_max
    alpha_max = np.abs(X_train.T @ y_train).max() / n_samples

    if model_name == "logreg":
        alpha_max /= 2
    alpha_min = alpha_max / 10_000
    log_alpha_max = np.log(alpha_max)
    log_alpha_min = np.log(alpha_min)
    log_alpha0 = np.log(0.1 * alpha_max)

    if model_name == "lasso":
        model = Lasso(X_train, y_train)
    elif model_name == "logreg":
        model = SparseLogreg(X_train, y_train)

    try:
        n_outer = dict_n_outers[dataset_name, method]
    except Exception:
        n_outer = 20

    size_loop = 2
    for _ in range(size_loop):
        if model_name == "lasso":
            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
        elif model_name == "logreg":
            criterion = HeldOutLogistic(
                X_val, y_val, model, X_test=X_test, y_test=y_test)
        algo = dict_algo[method](criterion)
        monitor = Monitor()
        if method == 'grid_search':
            log_alphas = np.log(np.geomspace(alpha_max, alpha_min, num=100))
            grid_search(
                algo, None, None, monitor, log_alphas=log_alphas, tol=tol)
        elif method == 'random':
            grid_search(
                algo, log_alpha_max, log_alpha_min, monitor, tol=tol,
                max_evals=n_alphas, t_max=dict_t_max[dataset_name])
        elif method == "bayesian":
            hyperopt_wrapper(
                algo, log_alpha_min, log_alpha_max, monitor,
                max_evals=n_alphas, tol=tol, method='bayesian',
                t_max=dict_t_max[dataset_name])
        else:
            # do line search to find the optimal lambda
            grad_search(
                algo, log_alpha0, monitor, n_outer=n_outer, tol=tol,
                tolerance_decrease=tolerance_decrease,
                t_max=dict_t_max[dataset_name])

    monitor.times = np.array(monitor.times)
    monitor.objs = np.array(monitor.objs)
    monitor.objs_test = np.array(monitor.objs_test)
    monitor.log_alphas = np.array(monitor.log_alphas)
    return (dataset_name, method, tol, n_outer, tolerance_decrease,
            monitor.times, monitor.objs, monitor.objs_test,
            monitor.log_alphas, norm(y_val), norm(y_test), log_alpha_max,
            model_name)
print('scikit finished')

##############################################################################
# Now do the hyperparameter optimization with implicit differentiation
# --------------------------------------------------------------------

estimator = sklearn.linear_model.Lasso(
    fit_intercept=False, max_iter=1000, warm_start=True, tol=tol)

print('sparse-ho started')

t0 = time.time()
model = Lasso()
criterion = HeldOutMSE(None, None)
alpha0 = alpha_max / 10
monitor_grad = Monitor()
cross_val_criterion = CrossVal(criterion, cv=kf)
algo = ImplicitForward()
optimizer = LineSearch(n_outer=10, tol=tol)
grad_search(
    algo, cross_val_criterion, model, optimizer, X, y, alpha0, monitor_grad)
t_grad_search = time.time() - t0

print('sparse-ho finished')

##############################################################################
# Plot results
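# The "Plot results" section itself is not shown here. A minimal sketch of
# what it could contain, using only Monitor attributes already relied on in
# these scripts (times and objs); the matplotlib styling is an assumption:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
# cumulative time vs. held-out objective for the gradient-based search
ax.plot(monitor_grad.times, monitor_grad.objs,
        label='sparse-ho (grad_search)')
ax.set_xlabel("Time (s)")
ax.set_ylabel("Cross-validation loss")
ax.legend()
plt.show()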
def test_grid_search():
    max_evals = 5

    monitor_grid = Monitor()
    model = Lasso(X_train, y_train, estimator=estimator)
    criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = HeldOutMSE(X_val, y_val, model, X_test=X_test, y_test=y_test)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    assert (monitor_random.log_alphas[np.argmin(monitor_random.objs)]
            == log_alpha_opt_random)
    assert (monitor_grid.log_alphas[np.argmin(monitor_grid.objs)]
            == log_alpha_opt_grid)

    monitor_grid = Monitor()
    model = Lasso(X_train, y_train, estimator=estimator)
    criterion = SURE(
        X_train, y_train, model, sigma=sigma_star, X_test=X_test,
        y_test=y_test)
    algo = Forward()
    log_alpha_opt_grid, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = SURE(
        X_train, y_train, model, sigma=sigma_star, X_test=X_test,
        y_test=y_test)
    algo = Forward()
    log_alpha_opt_random, _ = grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    assert (monitor_random.log_alphas[np.argmin(monitor_random.objs)]
            == log_alpha_opt_random)
    assert (monitor_grid.log_alphas[np.argmin(monitor_grid.objs)]
            == log_alpha_opt_grid)
alpha_1 = p_alpha * alpha_max
alpha_2 = 0.1
log_alpha1 = np.log(alpha_1)
log_alpha2 = np.log(alpha_2)
dict_log_alpha = {}
dict_log_alpha["lasso"] = log_alpha
dict_log_alpha["enet"] = np.array([log_alpha1, log_alpha2])
tab = np.linspace(1, 1000, n_features)
dict_log_alpha["wLasso"] = log_alpha + np.log(tab / tab.max())
dict_log_alpha["logreg"] = (log_alpha - np.log(2))
dict_log_alpha["svm"] = 1e-4
dict_log_alpha["svr"] = np.array([1e-2, 1e-2])

# Set models to be tested
models = {}
models["lasso"] = Lasso(estimator=None)
models["enet"] = ElasticNet(estimator=None)
models["wLasso"] = WeightedLasso(estimator=None)
models["logreg"] = SparseLogreg(estimator=None)
models["svm"] = SVM(estimator=None)
models["svr"] = SVR(estimator=None)

custom_models = {}
custom_models["lasso"] = Lasso(estimator=celer.Lasso(
    warm_start=True, fit_intercept=False))
custom_models["enet"] = ElasticNet(
    estimator=linear_model.ElasticNet(warm_start=True, fit_intercept=False))
custom_models["logreg"] = SparseLogreg(
    estimator=celer.LogisticRegression(warm_start=True, fit_intercept=False))

# Compute "ground truth" with cvxpylayer
tol = 1e-7
max_iter = 1e5

##############################################################################
# Grid-search with scikit-learn
# -----------------------------

estimator = linear_model.Lasso(
    fit_intercept=False, max_iter=1000, warm_start=True)

print('scikit-learn started')

t0 = time.time()
model = Lasso(estimator=estimator)
criterion = HeldOutMSE(idx_train, idx_val)
algo = Forward()
monitor_grid_sk = Monitor()
grid_search(
    algo, criterion, model, X, y, None, None, monitor_grid_sk,
    log_alphas=log_alphas, tol=tol)
objs = np.array(monitor_grid_sk.objs)
t_sk = time.time() - t0
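# Follow-up sketch: recovering the best grid point from the monitor. This
# uses only attributes already relied on elsewhere in these scripts
# (monitor_grid_sk.objs and monitor_grid_sk.log_alphas):
best_idx = np.argmin(objs)
log_alpha_best = monitor_grid_sk.log_alphas[best_idx]
print("Best alpha on the grid: %.2e" % np.exp(log_alpha_best))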
def parallel_function(dataset_name, method, tol=1e-5, n_outer=50,
                      tolerance_decrease='constant'):

    # load data
    X, y = fetch_libsvm(dataset_name)
    y -= y.mean()
    # compute alpha_max
    alpha_max = np.abs(X.T @ y).max() / len(y)

    if model_name == "logreg":
        alpha_max /= 2
    alpha_min = alpha_max / 10_000

    if model_name == "lasso":
        estimator = celer.Lasso(
            fit_intercept=False, max_iter=100, warm_start=True, tol=tol)
        model = Lasso(estimator=estimator)
    elif model_name == "logreg":
        model = SparseLogreg(estimator=estimator)

    # TODO improve this
    try:
        n_outer = dict_n_outers[dataset_name, method]
    except Exception:
        n_outer = 20

    size_loop = 2
    for _ in range(size_loop):
        if model_name == "lasso":
            sub_criterion = HeldOutMSE(None, None)
        elif model_name == "logreg":
            sub_criterion = HeldOutLogistic(None, None)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        criterion = CrossVal(sub_criterion, cv=kf)

        algo = ImplicitForward(tol_jac=1e-3)
        monitor = Monitor()
        t_max = dict_t_max[dataset_name]
        if method == 'grid_search':
            grid_search(
                algo, criterion, model, X, y, alpha_min, alpha_max,
                monitor, max_evals=100, tol=tol)
        elif method == 'random' or method == 'bayesian':
            hyperopt_wrapper(
                algo, criterion, model, X, y, alpha_min, alpha_max,
                monitor, max_evals=30, tol=tol, method=method,
                size_space=1, t_max=t_max)
        elif method.startswith("implicit_forward"):
            # do gradient descent to find the optimal lambda
            alpha0 = alpha_max / 100
            n_outer = 30
            if method == 'implicit_forward':
                optimizer = GradientDescent(
                    n_outer=n_outer, p_grad_norm=1, verbose=True, tol=tol,
                    t_max=t_max)
            else:
                optimizer = GradientDescent(
                    n_outer=n_outer, p_grad_norm=1, verbose=True, tol=tol,
                    t_max=t_max, tol_decrease="geom")
            grad_search(
                algo, criterion, model, optimizer, X, y, alpha0, monitor)
        else:
            raise NotImplementedError

    monitor.times = np.array(monitor.times)
    monitor.objs = np.array(monitor.objs)
    monitor.objs_test = 0  # TODO
    monitor.alphas = np.array(monitor.alphas)
    return (dataset_name, method, tol, n_outer, tolerance_decrease,
            monitor.times, monitor.objs, monitor.objs_test, monitor.alphas,
            alpha_max, model_name)
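# The parallel_function variants above are presumably dispatched over
# datasets and methods; a minimal joblib sketch (the argument grids below
# are hypothetical, not taken from the original scripts):
from itertools import product

from joblib import Parallel, delayed

datasets = ["rcv1.binary", "news20.binary"]  # hypothetical choices
methods = ["grid_search", "random", "bayesian", "implicit_forward"]

results = Parallel(n_jobs=-1, verbose=10)(
    delayed(parallel_function)(dataset_name, method)
    for dataset_name, method in product(datasets, methods))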
alpha_max = (X_train.T @ y_train).max() / n_samples
p_alpha = 0.9
alpha = p_alpha * alpha_max
log_alpha = np.log(alpha)
log_alphas = np.log(alpha_max * np.geomspace(1, 0.1))
tol = 1e-16

dict_log_alpha = {}
dict_log_alpha["lasso"] = log_alpha
tab = np.linspace(1, 1000, n_features)
dict_log_alpha["wlasso"] = log_alpha + np.log(tab / tab.max())

models = {}
models["lasso"] = Lasso(X_train, y_train, estimator=None)
models["wlasso"] = WeightedLasso(X_train, y_train, estimator=None)


def get_v(mask, dense):
    return 2 * (
        X_val[:, mask].T @ (X_val[:, mask] @ dense - y_val)) / X_val.shape[0]


def test_beta_jac():
    #########################################################################
    # check that the methods computing the full Jacobian compute the same sol
    # maybe we could add a test comparing with sklearn
    for key in models.keys():
        supp1, dense1, jac1 = get_beta_jac_iterdiff(
            X_train, y_train,
alpha_max = (X_train.T @ y_train).max() / n_samples
p_alpha = 0.7
alpha = p_alpha * alpha_max
log_alpha = np.log(alpha)
log_alphas = np.log(alpha_max * np.geomspace(1, 0.1))
tol = 1e-16
max_iter = 1000

dict_log_alpha0 = {}
dict_log_alpha0["lasso"] = log_alpha
tab = np.linspace(1, 1000, n_features)
dict_log_alpha0["wlasso"] = log_alpha + np.log(tab / tab.max())

models = [
    Lasso(X_train, y_train, max_iter=max_iter, estimator=None),
]

estimator = linear_model.Lasso(
    fit_intercept=False, max_iter=1000, warm_start=True)
models_custom = [
    Lasso(X_train, y_train, max_iter=max_iter, estimator=estimator),
]


@pytest.mark.parametrize('model', models)
@pytest.mark.parametrize('crit', ['cv', 'sure'])
def test_grad_search(model, crit):
    """check that the paths are the same in the line search"""
    if crit == 'cv':
dict_res = {}
for maxit in maxits:
    for method in methods:
        print("Method %s, maxit %i" % (method, maxit))
        for i in range(2):
            alpha_max = np.max(np.abs(X.T.dot(y))) / n_samples
            log_alpha = np.log(alpha_max * p_alpha_max)
            monitor = Monitor()
            if method == "celer":
                clf = Lasso_celer(
                    alpha=np.exp(log_alpha), fit_intercept=False,
                    tol=1e-12, max_iter=maxit)
                model = Lasso(estimator=clf, max_iter=maxit)
                criterion = HeldOutMSE(idx_train, idx_val)
                algo = ImplicitForward(
                    tol_jac=1e-32, n_iter_jac=maxit, use_stop_crit=False)
                algo.max_iter = maxit
                val, grad = criterion.get_val_grad(
                    model, X, y, log_alpha, algo.get_beta_jac_v,
                    tol=1e-12, monitor=monitor, max_iter=maxit)
            else:
                model = Lasso(max_iter=maxit)
def test_grid_search():
    max_evals = 5

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = HeldOutMSE(idx_train, idx_val)
    alpha_opt_grid, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = HeldOutMSE(idx_train, idx_val)
    alpha_opt_random, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    np.testing.assert_allclose(
        monitor_random.alphas[np.argmin(monitor_random.objs)],
        alpha_opt_random)
    np.testing.assert_allclose(
        monitor_grid.alphas[np.argmin(monitor_grid.objs)], alpha_opt_grid)

    monitor_grid = Monitor()
    model = Lasso(estimator=estimator)
    criterion = FiniteDiffMonteCarloSure(sigma=sigma_star)
    alpha_opt_grid, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_grid,
        max_evals=max_evals, tol=1e-5, samp="grid")

    monitor_random = Monitor()
    criterion = FiniteDiffMonteCarloSure(sigma=sigma_star)
    alpha_opt_random, _ = grid_search(
        criterion, model, X, y, alpha_min, alpha_max, monitor_random,
        max_evals=max_evals, tol=1e-5, samp="random")

    np.testing.assert_allclose(
        monitor_random.alphas[np.argmin(monitor_random.objs)],
        alpha_opt_random)
    np.testing.assert_allclose(
        monitor_grid.alphas[np.argmin(monitor_grid.objs)], alpha_opt_grid)