def test_optimize(accelerated, loss, penalty):
    """Test a method on both the line_search and fixed step size strategies."""
    max_iter = 200
    for alpha in np.logspace(-1, 3, 3):
        obj = loss(A, b, alpha)
        if penalty is not None:
            prox = penalty(1e-3).prox
        else:
            prox = None

        opt = cp.minimize_proximal_gradient(
            obj.f_grad,
            np.zeros(n_features),
            prox=prox,
            jac=True,
            step="backtracking",
            max_iter=max_iter,
            accelerated=accelerated,
        )
        grad_x = obj.f_grad(opt.x)[1]
        assert certificate(opt.x, grad_x, prox) < 1e-5

        opt_2 = cp.minimize_proximal_gradient(
            obj.f_grad,
            np.zeros(n_features),
            prox=prox,
            jac=True,
            max_iter=max_iter,
            step=lambda x: 1 / obj.lipschitz,
            accelerated=accelerated,
        )
        grad_2x = obj.f_grad(opt_2.x)[1]
        assert certificate(opt_2.x, grad_2x, prox) < 1e-5
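# .. the test above relies on a `certificate` helper that is not shown here; ..
# .. a minimal sketch consistent with its usage (the norm of the ..
# .. proximal-gradient fixed-point residual with unit step) could be: ..
def certificate(x, grad_x, prox):
    if prox is None:
        # without a penalty, the residual reduces to the gradient norm
        def prox(z, s):
            return z
    return np.linalg.norm(x - prox(x - grad_x, 1.0))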
def run(self, n_iter):
    X, y, solver = self.X, self.y, self.solver
    n_features = X.shape[1]
    x0 = np.zeros(n_features)

    if n_iter == 0:
        self.beta = x0
        return

    f = cp.loss.LogLoss(X, y)
    g = cp.penalty.L1Norm(self.lmbd / X.shape[0])

    warnings.filterwarnings('ignore', category=RuntimeWarning)

    if solver == 'pgd':
        if self.line_search:
            step = 'backtracking'
        else:
            def step(x):
                return 1.0 / f.lipschitz

        result = cp.minimize_proximal_gradient(
            f.f_grad,
            x0,
            g.prox,
            step=step,
            tol=0,
            max_iter=n_iter,
            jac=True,
            accelerated=self.accelerated,
        )
    elif solver == 'saga':
        step_size = 1.0 / (3 * f.max_lipschitz)
        result = cp.minimize_saga(
            f.partial_deriv,
            X,
            y,
            x0,
            prox=g.prox_factory(n_features),
            step_size=step_size,
            tol=0,
            max_iter=n_iter,
        )
    else:
        assert solver == 'svrg'
        step_size = 1.0 / (3 * f.max_lipschitz)
        result = cp.minimize_svrg(
            f.partial_deriv,
            X,
            y,
            x0,
            prox=g.prox_factory(n_features),
            step_size=step_size,
            tol=0,
            max_iter=n_iter,
        )

    self.beta = result.x
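# .. this `run` method reads like a benchopt-style solver; a hypothetical ..
# .. enclosing class (names and interface assumed, not from the source) ..
# .. might look like: ..
from benchopt import BaseSolver

class Solver(BaseSolver):
    name = 'copt'
    parameters = {'accelerated': [False, True], 'line_search': [False, True]}

    def set_objective(self, X, y, lmbd):
        # store the problem data; `run` is called with a varying budget
        self.X, self.y, self.lmbd = X, y, lmbd
        self.solver = 'pgd'

    def get_result(self):
        return self.beta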
def test_vrtos_fl(A_data):
    """Test on the fused lasso."""
    n_samples, n_features = A_data.shape
    alpha = 1.0 / n_samples
    f = cp.utils.LogLoss(A_data, b, alpha)
    for beta in np.logspace(-3, 3, 3):
        pen = cp.utils.FusedLasso(beta)
        L = cp.utils.get_max_lipschitz(A_data, "logloss") + alpha / density
        opt_vrtos = cp.minimize_vrtos(
            f.partial_deriv,
            A_data,
            b,
            np.zeros(n_features),
            1 / (3 * L),
            alpha=alpha,
            max_iter=2000,
            prox_1=pen.prox_1_factory(n_features),
            prox_2=pen.prox_2_factory(n_features),
            tol=0,
        )

        opt_pgd = cp.minimize_proximal_gradient(
            f.f_grad, np.zeros(n_features), prox=pen.prox, max_iter=2000, tol=0
        )

        norm = np.linalg.norm(opt_pgd.x)
        if norm < 1e-10:
            norm = 1
        assert np.linalg.norm(opt_vrtos.x - opt_pgd.x) / norm < 1e-4

        # check also the gradient mapping
        ss = 1.0 / L
        grad = f.f_grad(opt_vrtos.x)[1]
        grad_map = (opt_vrtos.x - pen.prox(opt_vrtos.x - ss * grad, ss)) / ss
        assert np.linalg.norm(grad_map) < 1e-6
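# .. a small helper (hypothetical, mirroring the check above) makes the ..
# .. gradient-mapping convention explicit and reusable across tests: ..
def gradient_mapping(x, f_grad, prox, step):
    # distance between x and one proximal-gradient step from x, rescaled;
    # this vanishes exactly at stationary points of f + penalty
    _, grad = f_grad(x)
    return (x - prox(x - step * grad, step)) / step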
import jax
from jax import numpy as np
import numpy as onp
import matplotlib.pyplot as plt
from sklearn import datasets

import copt as cp

# .. construct (random) dataset ..
X, y = datasets.make_regression()
n_samples, n_features = X.shape


def loss(w):
    """Squared error loss."""
    z = np.dot(X, w) - y
    return np.sum(z * z) / n_samples


# .. use JAX to compute the gradient of the loss: value_and_grad ..
# .. returns both the objective value and the gradient, which is ..
# .. the format that COPT accepts (hence jac=True below) ..
f_grad = jax.value_and_grad(loss)

w0 = onp.zeros(n_features)

l1_ball = cp.penalty.L1Norm(0.1)
cb = cp.utils.Trace(lambda x: loss(x) + l1_ball(x))
sol = cp.minimize_proximal_gradient(
    f_grad, w0, prox=l1_ball.prox, callback=cb, jac=True
)

plt.plot(cb.trace_fx, lw=3)
plt.yscale("log")
plt.xlabel("# Iterations")
plt.ylabel("Objective value")
plt.grid()
plt.show()
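# .. (optional) JIT-compiling the value-and-grad oracle with jax.jit is a ..
# .. standard JAX idiom that typically speeds up each call; this is an ..
# .. addition for illustration, not part of the original example: ..
f_grad_jit = jax.jit(jax.value_and_grad(loss))
sol_jit = cp.minimize_proximal_gradient(
    f_grad_jit, w0, prox=l1_ball.prox, jac=True
)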
def minimize_accelerated(*args, **kw):
    kw["accelerated"] = True
    return cp.minimize_proximal_gradient(*args, **kw)
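# .. the wrapper keeps the exact signature of minimize_proximal_gradient ..
# .. with acceleration forced on; a minimal usage sketch (data made up ..
# .. here for illustration): ..
import numpy as np
import copt as cp

A = np.random.randn(50, 20)
b = np.random.rand(50)
f = cp.utils.SquareLoss(A, b)
opt = minimize_accelerated(f.f_grad, np.zeros(20), jac=True, tol=1e-10)
print(opt.success, opt.nit)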
all_betas = [0, 1e-2, 1e-1, 0.2]
all_trace_ls, all_trace_nols = [], []
out_img = []
for i, beta in enumerate(all_betas):
    print("beta = %s" % beta)
    G1 = cp.utils.GroupL1(beta, groups)

    def loss(x):
        return f(x) + G1(x)

    x0 = np.zeros(n_features)
    pgd = cp.minimize_proximal_gradient(
        f.f_grad,
        x0,
        G1.prox,
        jac=True,
        max_iter=max_iter,
        tol=1e-10,
        trace_certificate=True,
    )
    out_img.append(pgd.x)


# .. plot the results ..
fig, ax = plt.subplots(2, 4, sharey=False)
xlim = [0.02, 0.02, 0.1]
markevery = [1000, 1000, 100, 100]
for i, beta in enumerate(all_betas):
    ax[0, i].set_title("regularization=%s" % beta)
    ax[0, i].plot(out_img[i])
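# .. the snippet above assumes `f`, `groups`, `n_features`, and `max_iter` ..
# .. are defined elsewhere; a setup consistent with copt's API (values ..
# .. here are illustrative assumptions) could be: ..
import numpy as np
import matplotlib.pyplot as plt
import copt as cp

n_samples, n_features = 100, 100
A = np.random.randn(n_samples, n_features)
b = np.random.rand(n_samples)
f = cp.utils.SquareLoss(A, b)
# ten contiguous, non-overlapping groups of ten features each
groups = [np.arange(10 * i, 10 * (i + 1)) for i in range(10)]
max_iter = 5000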
import numpy as np
import pylab as plt
import copt as cp

# .. construct (random) dataset ..
n_samples, n_features = 1000, 200
np.random.seed(0)
X = np.random.randn(n_samples, n_features)
y = np.random.rand(n_samples)

f = cp.utils.LogLoss(X, y)
step_size = 1. / f.lipschitz

cb_pgd = cp.utils.Trace(f)
result_pgd = cp.minimize_proximal_gradient(
    f.f_grad,
    np.zeros(n_features),
    step_size=step_size,
    callback=cb_pgd,
    tol=0,
    accelerated=False,
)

cb_apgd = cp.utils.Trace(f)
result_apgd = cp.minimize_proximal_gradient(
    f.f_grad,
    np.zeros(n_features),
    step_size=step_size,
    callback=cb_apgd,
    tol=0,
    accelerated=True,
)

# .. plot the result: accelerated PGD enjoys an O(1/t^2) rate, ..
# .. versus O(1/t) for plain PGD ..
fmin = min(np.min(cb_pgd.trace_fx), np.min(cb_apgd.trace_fx))
plt.title('Comparison of full gradient optimizers')
plt.plot(cb_apgd.trace_fx - fmin, lw=4, label='accelerated gradient descent')
plt.plot(cb_pgd.trace_fx - fmin, lw=4, label='gradient descent')
plt.yscale('log')
plt.legend()
plt.show()
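# .. cp.utils.Trace also records wall-clock time (trace_time); plotting ..
# .. suboptimality against time instead of iterations is often a fairer ..
# .. comparison when per-iteration costs differ: ..
plt.plot(cb_apgd.trace_time, cb_apgd.trace_fx - fmin, lw=4, label='accelerated')
plt.plot(cb_pgd.trace_time, cb_pgd.trace_fx - fmin, lw=4, label='plain')
plt.yscale('log')
plt.xlabel('Time (s)')
plt.legend()
plt.show()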
def sgl_estimator(
    x_train,
    y_train,
    x_test,
    y_test,
    groups,
    bias_index=None,
    beta0=None,
    alpha1=0.0,
    alpha2=0.0,
    eta=1.0,
    transform_type=None,
    max_iter=5000,
    tol=1e-6,
    verbose=0,
    suppress_warnings=True,
    cb_trace=False,
    accelerate=False,
    loss_type="logloss",
    clf_threshold=0.5,
    random_state=None,
):
    """Find solution to sparse group lasso problem by proximal gradient descent

    Solve the sparse group lasso [1]_ problem for feature matrix `x_train`
    and target vector `y_train`, with features partitioned into groups,
    using the proximal gradient descent (PGD) algorithm. Compute accuracy
    and ROC AUC using `x_test` and `y_test`.

    Parameters
    ----------
    x_train : numpy.ndarray
        Training feature matrix

    y_train : numpy.ndarray
        Training target array

    x_test : numpy.ndarray
        Testing feature matrix

    y_test : numpy.ndarray
        Testing target array

    groups : numpy.ndarray
        Array of non-overlapping indices for each group. For example, if
        nine features are grouped into equal contiguous groups of three,
        then groups would be an ndarray like
        [[0, 1, 2], [3, 4, 5], [6, 7, 8]].

    bias_index : int or None, default=None
        The index of the bias feature in x_train and x_test. If None,
        assume no bias feature.

    beta0 : numpy.ndarray
        Initial guess for coefficient array

    alpha1 : float, default=0.0
        Group lasso regularization parameter. This encourages groupwise
        sparsity.

    alpha2 : float, default=0.0
        Lasso regularization parameter. This encourages within-group
        sparsity.

    eta : float, default=1.0
        Target variable transformation parameter.

    transform_type : ["power", "exponentiation", None], default=None
        Type of transformation, see insight.target_transformation

    max_iter : int, default=5000
        Maximum number of iterations for PGD algorithm.

    tol : float, default=1e-6
        Convergence tolerance for PGD algorithm.

    verbose : int, default=0
        Verbosity flag for PGD algorithm.

    suppress_warnings : bool, default=True
        If True, suppress convergence warnings from PGD algorithm. This is
        useful for hyperparameter tuning when some combinations of
        hyperparameters may not converge.

    cb_trace : bool, default=False
        If True, include copt.utils.Trace() object in return

    accelerate : bool, default=False
        If True, use accelerated PGD algorithm, otherwise use standard PGD.

    loss_type : {'logloss', 'square', 'huber'}
        The type of loss function to use. If 'logloss', treat this problem
        as a binary classification problem using logistic regression.
        Otherwise, treat this problem as a regression problem using either
        the mean square error or the Huber loss.

    clf_threshold : float, default=0.5
        Decision threshold for binary classification

    random_state : int, numpy.RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if numpy.RandomState instance, random_state is the
        random number generator; if None, the random number generator is
        the RandomState instance used by `np.random`.

    Returns
    -------
    dict
        dict with keys:
        alpha1 - group lasso regularization parameter,
        alpha2 - lasso regularization parameter,
        eta - target variable transformation parameter,
        beta_hat - estimate of the optimal beta,
        test - scores dict for test set,
        train - scores dict for train set,
        trace - copt.utils.Trace object if cb_trace is True, None otherwise

    References
    ----------
    .. [1] Noah Simon, Jerome Friedman, Trevor Hastie & Robert Tibshirani,
       "A Sparse-Group Lasso," Journal of Computational and Graphical
       Statistics, vol. 22:2, pp. 231-245, 2012.
       DOI: 10.1080/10618600.2012.681250
    """
    n_features = x_train.shape[1]
    rng = check_random_state(random_state)
    np.random.set_state(rng.get_state())

    if beta0 is None:
        beta0 = np.zeros(n_features)

    sg1 = SparseGroupL1(alpha1, alpha2, groups, bias_index=bias_index)

    if loss_type not in ["logloss", "square", "huber"]:
        raise ValueError("loss_type must be one of "
                         "['logloss', 'square', 'huber'].")

    ind = np.ones(x_train.shape[1], bool)
    if bias_index is not None:
        ind[bias_index] = False

    # Inverse transform target variables
    if loss_type != "logloss":
        y_train = target_transformation(y=y_train, eta=eta,
                                        transform_type=transform_type,
                                        direction="inverse")

    if loss_type == "logloss":
        f = cp.utils.LogLoss(x_train, y_train)
    elif loss_type == "huber":
        f = cp.utils.HuberLoss(x_train, y_train)
    else:
        f = cp.utils.SquareLoss(x_train, y_train)

    step_size = 1.0 / f.lipschitz

    if cb_trace:
        cb_tos = cp.utils.Trace(f)
    else:
        cb_tos = None

    if suppress_warnings:
        ctx_mgr = warnings.catch_warnings()
    else:
        ctx_mgr = contextlib.suppress()

    with ctx_mgr:
        # For some metaparameters, minimize_PGD or minimize_APGD might not
        # reach the desired tolerance level. This might be okay during
        # hyperparameter optimization. So ignore the warning if the user
        # specifies suppress_warnings=True
        if suppress_warnings:
            warnings.filterwarnings("ignore", category=RuntimeWarning)

        pgd = cp.minimize_proximal_gradient(
            f.f_grad,
            beta0,
            sg1.prox,
            step_size=step_size,
            max_iter=max_iter,
            tol=tol,
            verbose=verbose,
            callback=cb_tos,
            accelerated=accelerate,
        )

    beta_hat = np.copy(pgd.x)

    # Transform the target variables back to original
    if loss_type != "logloss":
        y_train = target_transformation(y=y_train, eta=eta,
                                        transform_type=transform_type,
                                        direction="forward")

    if loss_type == "logloss":
        train = classification_scores(x=x_train, y=y_train,
                                      beta_hat=beta_hat,
                                      clf_threshold=clf_threshold)
        test = classification_scores(x=x_test, y=y_test,
                                     beta_hat=beta_hat,
                                     clf_threshold=clf_threshold)
    else:
        train = regression_scores(
            x=x_train,
            y=y_train,
            beta_hat=beta_hat,
            eta=eta,
            transform_type=transform_type,
        )
        test = regression_scores(
            x=x_test,
            y=y_test,
            beta_hat=beta_hat,
            eta=eta,
            transform_type=transform_type,
        )

    return dict(
        alpha1=alpha1,
        alpha2=alpha2,
        eta=eta,
        transform_type=transform_type,
        beta_hat=beta_hat,
        test=test,
        train=train,
        trace=cb_tos,
        init_random_state=random_state,
    )
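# .. a hypothetical call (array names, group layout, and values assumed ..
# .. for illustration; assumes n_features is divisible by 3): ..
res = sgl_estimator(
    x_train, y_train, x_test, y_test,
    groups=np.arange(x_train.shape[1]).reshape(-1, 3),
    alpha1=0.1,
    alpha2=0.01,
    loss_type="logloss",
)
print(res["train"], res["test"])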
def fit(self, X, y, loss="squared_loss"):
    """Fit a linear model using the sparse group lasso.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The training input samples.

    y : array-like, shape (n_samples,)
        The target values (class labels in classification, real numbers
        in regression).

    loss : ["squared_loss", "huber", "log"]
        The type of loss function to use in the PGD solver.

    Returns
    -------
    self : object
        Returns self.
    """
    if not isinstance(self.warm_start, bool):
        raise ValueError("The argument warm_start must be bool;"
                         " got {0}".format(self.warm_start))

    allowed_losses = ["squared_loss", "huber"]
    if is_regressor(self) and loss.lower() not in allowed_losses:
        raise ValueError(
            "For regression, the argument loss must be one of {0};"
            " got {1}".format(allowed_losses, loss))

    if not 0 <= self.l1_ratio <= 1:
        raise ValueError(
            "The parameter l1_ratio must satisfy 0 <= l1_ratio <= 1;"
            " got {0}".format(self.l1_ratio))

    if y is None:
        raise ValueError(
            "requires y to be passed, but the target y is None")

    X, y = check_X_y(
        X,
        y,
        accept_sparse=False,
        dtype=[np.float64, np.float32],
        y_numeric=not is_classifier(self),
        multi_output=False,
    )

    _, self.n_features_in_ = X.shape

    if is_classifier(self):
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        y = np.logical_not(y == self.classes_[0]).astype(int)

    n_samples, n_features = X.shape
    if self.fit_intercept:
        X = np.hstack([X, np.ones((n_samples, 1))])

    if self.warm_start and hasattr(self, "coef_"):
        # pylint: disable=access-member-before-definition
        if self.fit_intercept:
            coef = np.concatenate(
                (self.coef_, np.array([self.intercept_])))
        else:
            coef = self.coef_
    else:
        if self.fit_intercept:
            coef = np.zeros(n_features + 1)
            # Initial bias condition gives 50/50 for binary classification
            coef[-1] = 0.5
        else:
            coef = np.zeros(n_features)

    if loss == "huber":
        f = cp.utils.HuberLoss(X, y)
    elif loss == "log":
        f = cp.utils.LogLoss(X, y)
    else:
        f = cp.utils.SquareLoss(X, y)

    if self.include_solver_trace:
        self.solver_trace_ = cp.utils.Trace(f)
    else:
        self.solver_trace_ = None

    if self.suppress_solver_warnings:
        ctx_mgr = warnings.catch_warnings()
    else:
        ctx_mgr = contextlib.suppress()

    groups = check_groups(self.groups, X, allow_overlap=False,
                          fit_intercept=self.fit_intercept)

    if self.scale_l2_by not in ["group_length", None]:
        raise ValueError("scale_l2_by must be 'group_length' or None;"
                         " got {0}".format(self.scale_l2_by))

    bias_index = n_features if self.fit_intercept else None
    sg1 = SparseGroupL1(
        l1_ratio=self.l1_ratio,
        alpha=self.alpha,
        groups=groups,
        bias_index=bias_index,
        scale_l2_by=self.scale_l2_by,
    )

    with ctx_mgr:
        # For some metaparameters, minimize_PGD might not reach the desired
        # tolerance level. This might be okay during hyperparameter
        # optimization. So ignore the warning if the user specifies
        # suppress_solver_warnings=True
        if self.suppress_solver_warnings:
            warnings.filterwarnings("ignore", category=RuntimeWarning)

        pgd = cp.minimize_proximal_gradient(
            f.f_grad,
            coef,
            sg1.prox,
            jac=True,
            step="backtracking",
            max_iter=self.max_iter,
            tol=self.tol,
            verbose=self.verbose,
            callback=self.solver_trace_,
            accelerated=False,
        )

    if self.fit_intercept:
        self.intercept_ = pgd.x[-1]
        self.coef_ = pgd.x[:-1]
    else:
        # set intercept to zero as the other linear models do
        self.intercept_ = 0.0
        self.coef_ = pgd.x

    self.n_iter_ = pgd.nit
    self.is_fitted_ = True

    return self
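# .. hypothetical usage, assuming this `fit` belongs to a groupyr-style ..
# .. sparse-group-lasso estimator class (class name and constructor ..
# .. parameters assumed for illustration): ..
model = SGLRegressor(
    l1_ratio=0.5, alpha=0.1, groups=groups, fit_intercept=True,
    max_iter=1000, tol=1e-6,
)
model.fit(X, y, loss="squared_loss")
print(model.coef_.shape, model.intercept_, model.n_iter_)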
all_trace_ls, all_trace_nols = [], []
out_img = []
for i, beta in enumerate(all_betas):
    print("beta = %s" % beta)
    G1 = cp.utils.GroupL1(beta, groups)

    def loss(x):
        return f(x) + G1(x)

    cb_tosls = cp.utils.Trace()
    x0 = np.zeros(n_features)
    pgd_ls = cp.minimize_proximal_gradient(
        f.f_grad,
        x0,
        G1.prox,
        step_size=step_size,
        max_iter=max_iter,
        tol=1e-14,
        verbose=1,
        callback=cb_tosls,
    )
    trace_ls = np.array([loss(x) for x in cb_tosls.trace_x])
    all_trace_ls.append(trace_ls)

    cb_tos = cp.utils.Trace()
    x0 = np.zeros(n_features)
    pgd = cp.minimize_proximal_gradient(
        f.f_grad,
        x0,
        G1.prox,
        step_size=step_size,
        max_iter=max_iter,
        tol=1e-14,
        verbose=1,
        callback=cb_tos,
    )
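    # .. the source is cut off here; presumably (given the variable names ..
    # .. above) the second trace is stored for the later comparison, e.g.: ..
    trace_nols = np.array([loss(x) for x in cb_tos.trace_x])
    all_trace_nols.append(trace_nols)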