def minimize_pfw_l1(f_grad,
                    alpha,
                    n_features,
                    lipschitz=None,
                    max_iter=1000,
                    tol=1e-12,
                    backtracking=True,
                    callback=None,
                    verbose=0):
    """Pairwise Frank-Wolfe algorithm on the L1 ball.

    .. warning::
        This feature is experimental, API is likely to change.

    Args:
      f_grad: callable
          Takes the current iterate (ndarray of shape (n_features,)) and
          returns the objective value and its gradient.
      alpha: float
          Radius of the L1 ball.
      n_features: int
          Dimension of the problem.
      lipschitz: float, optional
          Initial estimate of the Lipschitz constant of the gradient.
      max_iter: int
          Maximum number of iterations.
      tol: float
          Tolerance on the pairwise Frank-Wolfe gap.
      backtracking: bool
          Whether to adapt the Lipschitz estimate through backtracking.
      callback: callable, optional
          Called at each iteration with the dictionary of local variables.
      verbose: int
          Verbosity level; 0 is silent.

    Returns:
      res : scipy.optimize.OptimizeResult
          With attributes ``x``, ``nit`` and ``certificate``.
    """
    x = np.zeros(n_features)
    if lipschitz is None:
        lipschitz_t = utils.init_lipschitz(f_grad, x)
    else:
        lipschitz_t = lipschitz

    # active_set stores the weights of the 2 * n_features vertices of the L1
    # ball (positive and negative spikes); the last entry is the weight of the
    # zero vertex, which is the initial iterate.
    active_set = np.zeros(2 * n_features + 1)
    active_set[2 * n_features] = 1.
    all_lipschitz = []
    num_bad_steps = 0

    # .. initial function value and gradient ..
    f_t, grad = f_grad(x)

    pbar = trange(max_iter, disable=(verbose == 0))
    it = 0
    for it in pbar:
        # .. Frank-Wolfe oracle: vertex of the L1 ball most correlated
        # .. with the negative gradient ..
        idx_oracle = np.argmax(np.abs(grad))
        if grad[idx_oracle] > 0:
            idx_oracle += n_features
        mag_oracle = alpha * np.sign(-grad[idx_oracle % n_features])

        # .. away oracle: active vertex most correlated with the gradient ..
        _, idx_oracle_away = max_active(
            grad, active_set, n_features, include_zero=False)

        mag_away = alpha * np.sign(float(n_features - idx_oracle_away))

        is_away_zero = False
        if idx_oracle_away < 0 or active_set[2 * n_features] > 0 and grad[
                idx_oracle_away % n_features] * mag_away < 0:
            is_away_zero = True
            mag_away = 0.
            gamma_max = active_set[2 * n_features]
        else:
            assert grad[idx_oracle_away % n_features] * mag_away > grad.dot(x) - 1e-3
            gamma_max = active_set[idx_oracle_away]

        if gamma_max <= 0:
            pbar.close()
            raise ValueError("The maximum admissible step size is not positive")

        g_t = grad[idx_oracle_away % n_features] * mag_away - \
            grad[idx_oracle % n_features] * mag_oracle
        if g_t <= tol:
            break

        d2_t = 2 * (alpha**2)
        if backtracking:
            # because of the specific form of the update
            # we can achieve some extra efficiency this way
            for i in range(100):
                x_next = x.copy()
                step_size = min(g_t / (d2_t * lipschitz_t), gamma_max)
                x_next[idx_oracle % n_features] += step_size * mag_oracle
                x_next[idx_oracle_away % n_features] -= step_size * mag_away
                f_next, grad_next = f_grad(x_next)
                if step_size < 1e-7:
                    break
                elif f_next - f_t <= -g_t * step_size + 0.5 * (
                        step_size**2) * lipschitz_t * d2_t:
                    if i == 0:
                        lipschitz_t *= 0.999
                    break
                else:
                    lipschitz_t *= 2
        else:
            x_next = x.copy()
            step_size = min(g_t / (d2_t * lipschitz_t), gamma_max)
            x_next[idx_oracle % n_features] = x[
                idx_oracle % n_features] + step_size * mag_oracle
            x_next[idx_oracle_away % n_features] = x[
                idx_oracle_away % n_features] - step_size * mag_away
            f_next, grad_next = f_grad(x_next)

        if lipschitz_t >= 1e10:
            raise ValueError("Lipschitz estimate diverged during backtracking")

        # .. update the iterate and the active set weights ..
        x = x_next
        active_set[idx_oracle] += step_size
        if is_away_zero:
            active_set[2 * n_features] -= step_size
        else:
            active_set[idx_oracle_away] -= step_size

        if active_set[idx_oracle_away] < 0:
            raise ValueError("Active set weight became negative")
        if active_set[idx_oracle] > 1:
            raise ValueError("Active set weight exceeded one")

        f_t, grad = f_next, grad_next

        # .. a "bad" (drop) step: the away atom's weight was driven to zero ..
        if gamma_max < 1 and step_size == gamma_max:
            num_bad_steps += 1

        if it % 100 == 0:
            all_lipschitz.append(lipschitz_t)
        pbar.set_postfix(
            tol=g_t,
            gmax=gamma_max,
            gamma=step_size,
            L_t_mean=np.mean(all_lipschitz),
            L_t=lipschitz_t,
            bad_steps_quot=num_bad_steps / (it + 1))

        if callback is not None:
            callback(locals())

    if callback is not None:
        callback(locals())
    pbar.close()
    return optimize.OptimizeResult(x=x, nit=it, certificate=g_t)
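
# A minimal usage sketch for `minimize_pfw_l1` (illustrative only, not part of
# the original module): a least-squares objective constrained to the L1 ball of
# radius ``alpha``. The helper name ``_example_minimize_pfw_l1`` and the data
# are hypothetical; the module's own imports and helpers are assumed available,
# as they are for the solver itself.
def _example_minimize_pfw_l1():
    import numpy as np  # assumed to be the same ``np`` used above

    rng = np.random.RandomState(0)
    A = rng.randn(40, 15)
    b = rng.randn(40)

    def f_grad(x):
        # objective value and gradient of 0.5 * ||Ax - b||^2
        residual = A.dot(x) - b
        return 0.5 * residual.dot(residual), A.T.dot(residual)

    # pass an explicit Lipschitz estimate (largest squared singular value of A)
    res = minimize_pfw_l1(f_grad, alpha=1.0, n_features=15,
                          lipschitz=np.linalg.norm(A, ord=2) ** 2)
    # res.x holds the solution, res.certificate the final pairwise FW gap
    return res
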
def minimize_proximal_gradient(
        fun,
        x0,
        prox=None,
        jac="2-point",
        tol=1e-6,
        max_iter=500,
        args=(),
        verbose=0,
        callback=None,
        step="backtracking",
        accelerated=False,
        eps=1e-8,
        max_iter_backtracking=1000,
        backtracking_factor=0.6,
        trace_certificate=False,
):
    """Proximal gradient descent.

    Solves problems of the form

        minimize_x f(x) + g(x)

    where f is a differentiable function and we have access to the proximal
    operator of g.

    Args:
      fun : callable
          The objective function to be minimized.

              ``fun(x, *args) -> float``

          where x is a 1-D array with shape (n,) and `args` is a tuple of the
          fixed parameters needed to completely specify the function.

      x0 : ndarray, shape (n,)
          Initial guess. Array of real elements of size (n,), where 'n' is the
          number of independent variables.

      prox : callable, optional
          Proximal operator of g. Called as ``prox(x, step_size)``.

      jac : {callable, '2-point', bool}, optional
          Method for computing the gradient vector. If it is a callable, it
          should be a function that returns the gradient vector:

              ``jac(x, *args) -> array_like, shape (n,)``

          where x is an array with shape (n,) and `args` is a tuple with the
          fixed parameters. Alternatively, '2-point' selects a finite
          difference scheme for numerical estimation of the gradient. If `jac`
          is a Boolean and is True, `fun` is assumed to return the gradient
          along with the objective function. If False, the gradient will be
          estimated using '2-point' finite difference estimation.

      tol : float, optional
          Tolerance of the optimization procedure. The iteration stops when
          the gradient mapping (a generalization of the gradient to non-smooth
          functions) is below this tolerance.

      max_iter : int, optional
          Maximum number of iterations.

      args : tuple, optional
          Extra arguments passed to the objective function and its derivative
          (`fun` and `jac`).

      verbose : int, optional
          Verbosity level, from 0 (no output) to 2 (output on each iteration).

      callback : callable, optional
          Called at each iteration with the dictionary of local variables. The
          algorithm will exit if the callback returns False.

      step : "backtracking" or callable
          Step-size strategy to use. "backtracking" will use a backtracking
          line-search, while a callable will use the value returned by
          ``step(locals())``.

      accelerated : boolean
          Whether to use the accelerated variant of the algorithm.

      eps : float or ndarray
          If jac is approximated, the step size used for the finite-difference
          approximation of the gradient.

      max_iter_backtracking : int
          Maximum number of backtracking line-search iterations.

      backtracking_factor : float
          Factor in (0, 1) by which the step size is multiplied at each
          backtracking iteration.

      trace_certificate : bool
          Whether to store the certificate (norm of the gradient mapping) at
          each iteration.

    Returns:
      res : The optimization result represented as a
          ``scipy.optimize.OptimizeResult`` object. Important attributes are:
          ``x`` the solution array, ``success`` a Boolean flag indicating if
          the optimizer exited successfully and ``message`` which describes
          the cause of the termination. See `scipy.optimize.OptimizeResult`
          for a description of other attributes.

    References:
      Beck, Amir, and Marc Teboulle. "Gradient-based algorithms with
      applications to signal recovery." Convex Optimization in Signal
      Processing and Communications (2009).

    Examples:
      * :ref:`sphx_glr_auto_examples_plot_group_lasso.py`
    """
    x = np.asarray(x0).flatten()
    if max_iter_backtracking <= 0:
        raise ValueError("Line search iterations need to be greater than 0")

    if prox is None:

        def _prox(x, _):
            return x

        prox = _prox

    success = False
    certificate = np.NaN

    if not callable(jac):
        # .. check the string option before the boolean one,
        # .. since bool("2-point") is True ..
        if jac == "2-point" or jac is False:
            jac = None
        elif jac is True:
            fun = optimize.optimize.MemoizeJac(fun)
            jac = fun.derivative
        else:
            raise NotImplementedError("jac has unexpected value.")

    if jac is None:

        def func_and_grad(x):
            f = fun(x, *args)
            g = optimize._approx_fprime_helper(x, fun, eps, args=args, f0=f)
            return f, g

    else:

        def func_and_grad(x):
            f = fun(x, *args)
            g = jac(x, *args)
            return f, g

    # .. find initial step-size ..
    if step == "backtracking":
        step_size = 1.8 / utils.init_lipschitz(func_and_grad, x0)
    else:
        # to avoid step_size being undefined upon return
        step_size = None

    n_iterations = 0
    certificate_list = []
    # .. a while loop instead of a for loop ..
    # .. allows for infinite or floating point max_iter ..
    if not accelerated:
        fk, grad_fk = func_and_grad(x)
        while True:
            if callback is not None:
                if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                    break
            # .. compute gradient and step size ..
            if hasattr(step, "__call__"):
                step_size = step(locals())
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                f_next, grad_next = func_and_grad(x_next)
            elif step == "backtracking":
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                step_size *= 1.1
                for _ in range(max_iter_backtracking):
                    f_next, grad_next = func_and_grad(x_next)
                    rhs = (
                        fk
                        + grad_fk.dot(update_direction)
                        + update_direction.dot(update_direction)
                        / (2.0 * step_size)
                    )
                    if f_next <= rhs:
                        # .. step size found ..
                        break
                    else:
                        # .. backtracking, reduce step size ..
                        step_size *= backtracking_factor
                        x_next = prox(x - step_size * grad_fk, step_size)
                        update_direction = x_next - x
                else:
                    warnings.warn("Maximum number of line-search iterations reached")
            elif step == "fixed":
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                f_next, grad_next = func_and_grad(x_next)
            else:
                raise ValueError("Step-size strategy not understood")
            certificate = np.linalg.norm((x - x_next) / step_size)
            if trace_certificate:
                certificate_list.append(certificate)
            x[:] = x_next
            fk = f_next
            grad_fk = grad_next
            if certificate < tol:
                success = True
                break
            if n_iterations >= max_iter:
                break
            else:
                n_iterations += 1
    else:
        tk = 1
        # .. a while loop instead of a for loop ..
        # .. allows for infinite or floating point max_iter ..
        yk = x.copy()
        while True:
            grad_fk = func_and_grad(yk)[1]
            if callback is not None:
                if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                    break
            # .. compute gradient and step size ..
            if hasattr(step, "__call__"):
                current_step_size = step(locals())
                x_next = prox(yk - current_step_size * grad_fk, current_step_size)
                t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2
                yk = x_next + ((tk - 1.0) / t_next) * (x_next - x)
                x_prox = prox(
                    x_next - current_step_size * func_and_grad(x_next)[1],
                    current_step_size,
                )
                certificate = np.linalg.norm((x - x_prox) / current_step_size)
                if trace_certificate:
                    certificate_list.append(certificate)
                tk = t_next
                x = x_next.copy()
            elif step == "backtracking":
                current_step_size = step_size
                x_next = prox(yk - current_step_size * grad_fk, current_step_size)
                for _ in range(max_iter_backtracking):
                    update_direction = x_next - yk
                    rhs = (
                        func_and_grad(yk)[0]
                        + grad_fk.dot(update_direction)
                        + update_direction.dot(update_direction)
                        / (2.0 * current_step_size)
                    )
                    if func_and_grad(x_next)[0] <= rhs:
                        # .. step size found ..
                        break
                    else:
                        # .. backtracking, reduce step size ..
                        current_step_size *= backtracking_factor
                        x_next = prox(
                            yk - current_step_size * grad_fk, current_step_size
                        )
                else:
                    warnings.warn("Maximum number of line-search iterations reached")
                t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2
                yk = x_next + ((tk - 1.0) / t_next) * (x_next - x)
                x_prox = prox(
                    x_next - current_step_size * func_and_grad(x_next)[1],
                    current_step_size,
                )
                certificate = np.linalg.norm((x - x_prox) / current_step_size)
                if trace_certificate:
                    certificate_list.append(certificate)
                tk = t_next
                x = x_next.copy()
            else:
                raise ValueError("Step-size strategy not understood")
            if certificate < tol:
                success = True
                break
            if n_iterations >= max_iter:
                break
            else:
                n_iterations += 1

    if n_iterations >= max_iter:
        warnings.warn(
            "minimize_proximal_gradient did not reach the desired tolerance level",
            RuntimeWarning,
        )
    return optimize.OptimizeResult(
        x=x,
        success=success,
        certificate=certificate,
        nit=n_iterations,
        step_size=step_size,
        trace_certificate=certificate_list,
    )
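
# A minimal usage sketch for `minimize_proximal_gradient` (illustrative only,
# not part of the original module): a least-squares objective with an L1
# penalty handled through its proximal operator (soft-thresholding). The helper
# name ``_example_minimize_proximal_gradient``, the data and the regularization
# value are hypothetical.
def _example_minimize_proximal_gradient():
    import numpy as np  # assumed to be the same ``np`` used above

    rng = np.random.RandomState(0)
    A = rng.randn(50, 10)
    b = rng.randn(50)
    reg = 0.1

    def fun(x):
        # smooth part f(x) = 0.5 * ||Ax - b||^2
        return 0.5 * np.linalg.norm(A.dot(x) - b) ** 2

    def jac(x):
        return A.T.dot(A.dot(x) - b)

    def prox_l1(x, step_size):
        # proximal operator of g(x) = reg * ||x||_1 (soft-thresholding)
        return np.sign(x) * np.maximum(np.abs(x) - reg * step_size, 0)

    # default backtracking step-size strategy; see also accelerated=True
    return minimize_proximal_gradient(fun, np.zeros(10), prox=prox_l1, jac=jac)
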
def minimize_frank_wolfe(f_grad,
                         lmo,
                         x0,
                         lipschitz=None,
                         max_iter=1000,
                         tol=1e-12,
                         line_search=True,
                         callback=None,
                         verbose=0):
    r"""Frank-Wolfe algorithm.

    This method solves optimization problems of the form

    .. math::
        \argmin_{\bs{x} \in \mathcal{D}} f(\bs{x})

    where f is a differentiable function for which we have access to its
    gradient and :math:`\mathcal{D}` is a compact set for which we have access
    to its linear minimization oracle (lmo), i.e., a routine that given a
    vector :math:`\bs{u}` returns a solution to

    .. math::
        \argmax_{\bs{x} \in \mathcal{D}}\, \langle\bs{u}, \bs{x}\rangle\,.

    The algorithm calls the lmo with :math:`\bs{u} = -\nabla f(\bs{x})`, so
    the returned vertex minimizes :math:`\langle\nabla f(\bs{x}), \cdot\rangle`
    over :math:`\mathcal{D}`.

    Args:
      f_grad: callable
          Takes as input the current iterate (a vector of same size as x0) and
          returns the function value and gradient of the objective function.
          It should accept the optional argument return_gradient, and when
          False it should return only the function value.

      lmo: callable
          Takes as input a vector u of same size as x0 and returns a solution
          to the linear minimization oracle (defined above).

      x0 : array-like
          Initial guess for solution.

      lipschitz: float, optional
          Estimate for the Lipschitz constant of the gradient.

      max_iter: integer
          Maximum number of iterations.

      tol: float
          Tolerance on the Frank-Wolfe gap; the algorithm stops once the gap
          is below this value.

      line_search: boolean or callable
          If True, use the adaptive (backtracking) step size. If a callable,
          it is called with the dictionary of local variables and should
          return the step size. If False, use the step size
          ``min(g_t / (L * ||d_t||^2), 1)`` with the given Lipschitz estimate.

      callback: callable, optional
          Called at each iteration with the dictionary of local variables.

      verbose: int
          Verbosity level; 0 is silent.

    Returns:
      res : scipy.optimize.OptimizeResult
          The optimization result represented as a
          ``scipy.optimize.OptimizeResult`` object. Important attributes are:
          ``x`` the solution array, ``success`` a Boolean flag indicating if
          the optimizer exited successfully and ``message`` which describes
          the cause of the termination. See `scipy.optimize.OptimizeResult`
          for a description of other attributes.

    References:
      Jaggi, Martin. `"Revisiting Frank-Wolfe: Projection-Free Sparse Convex
      Optimization." <http://proceedings.mlr.press/v28/jaggi13-supp.pdf>`_
      ICML 2013.

      Pedregosa, Fabian. `"Notes on the Frank-Wolfe Algorithm"
      <http://fa.bianp.net/blog/2018/notes-on-the-frank-wolfe-algorithm-part-i/>`_,
      2018.

      Pedregosa, Fabian, et al. `"Step-Size Adaptivity in Projection-Free
      Optimization." <https://arxiv.org/pdf/1806.05123.pdf>`_ arXiv preprint
      arXiv:1806.05123 (2018).
    """
    x0 = sparse.csr_matrix(x0).T
    if tol < 0:
        raise ValueError('Tol must be non-negative')
    x = x0.copy()
    pbar = trange(max_iter, disable=(verbose == 0))
    f_t, grad = f_grad(x)
    if lipschitz is None:
        lipschitz_t = utils.init_lipschitz(f_grad, x0)
    else:
        lipschitz_t = lipschitz
    it = 0
    for it in pbar:
        # .. Frank-Wolfe vertex and update direction ..
        s_t = lmo(-grad)
        d_t = s_t - x

        # .. Frank-Wolfe gap, used as stopping criterion ..
        g_t = -safe_sparse_dot(d_t.T, grad)
        if sparse.issparse(g_t):
            g_t = g_t[0, 0]
        else:
            g_t = g_t[0]
        if g_t <= tol:
            break
        d2_t = splinalg.norm(d_t)**2

        if hasattr(line_search, '__call__'):
            step_size = line_search(locals())
            f_next, grad_next = f_grad(x + step_size * d_t)
        elif line_search:
            # .. adaptive step size: adjust the Lipschitz estimate until the
            # .. quadratic upper bound is satisfied ..
            ratio_decrease = 0.999
            ratio_increase = 2
            for i in range(max_iter):
                step_size = min(g_t / (d2_t * lipschitz_t), 1)
                rhs = f_t - step_size * g_t + 0.5 * (
                    step_size**2) * lipschitz_t * d2_t
                f_next, grad_next = f_grad(x + step_size * d_t)
                if f_next <= rhs + 1e-6:
                    if i == 0:
                        lipschitz_t *= ratio_decrease
                    break
                else:
                    lipschitz_t *= ratio_increase
        else:
            step_size = min(g_t / (d2_t * lipschitz_t), 1)
            f_next, grad_next = f_grad(x + step_size * d_t)

        if callback is not None:
            callback(locals())
        x += step_size * d_t
        pbar.set_postfix(tol=g_t, iter=it, L_t=lipschitz_t)

        f_t, grad = f_next, grad_next
    if callback is not None:
        callback(locals())
    pbar.close()
    x_final = x.toarray().ravel()
    return optimize.OptimizeResult(x=x_final, nit=it, certificate=g_t)
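
# A minimal usage sketch for `minimize_frank_wolfe` (illustrative only, not
# part of the original module): least-squares over the L1 ball, with a
# hand-written linear minimization oracle that returns a sparse column vector,
# matching how the solver manipulates its iterates. The helper names
# ``_example_minimize_frank_wolfe`` and ``lmo_l1`` are hypothetical.
def _example_minimize_frank_wolfe():
    import numpy as np         # assumed to be the same ``np`` used above
    from scipy import sparse   # assumed to be the same ``sparse`` used above

    rng = np.random.RandomState(0)
    A = rng.randn(30, 10)
    b = rng.randn(30)
    alpha = 1.0  # radius of the L1 ball

    def f_grad(x, return_gradient=True):
        # the solver passes a sparse column vector; work on a dense copy
        x = np.asarray(x.todense()).ravel()
        residual = A.dot(x) - b
        f = 0.5 * residual.dot(residual)
        if not return_gradient:
            return f
        return f, A.T.dot(residual)

    def lmo_l1(u):
        # vertex of the L1 ball maximizing <u, s>: +/- alpha on the largest
        # |u_i|; the solver calls this with u = -gradient
        u = np.asarray(u).ravel()
        idx = int(np.argmax(np.abs(u)))
        s = sparse.lil_matrix((u.size, 1))
        s[idx, 0] = alpha * np.sign(u[idx])
        return s.tocsr()

    # pass an explicit Lipschitz estimate to sidestep automatic initialization
    return minimize_frank_wolfe(f_grad, lmo_l1, np.zeros(10),
                                lipschitz=np.linalg.norm(A, ord=2) ** 2)
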