Example 1

# Assumed imports for the snippets below; utils, max_active and
# safe_sparse_dot are internal helpers of the source library (copt) that are
# not reproduced here.
import warnings

import numpy as np
from scipy import optimize, sparse
from scipy.sparse import linalg as splinalg
from tqdm import trange

def minimize_pfw_l1(f_grad,
                    alpha,
                    n_features,
                    lipschitz=None,
                    max_iter=1000,
                    tol=1e-12,
                    backtracking=True,
                    callback=None,
                    verbose=0):
    """Pairwise FW on the L1 ball.

.. warning::
    This feature is experimental, API is likely to change.

    """

    x = np.zeros(n_features)
    if lipschitz is None:
        lipschitz_t = utils.init_lipschitz(f_grad, x)
    else:
        lipschitz_t = lipschitz

    # Weights of the active set: entries [0, n_features) correspond to the
    # vertices +alpha * e_i, entries [n_features, 2 * n_features) to the
    # vertices -alpha * e_i, and the last entry to the origin (the start point).
    active_set = np.zeros(2 * n_features + 1)
    active_set[2 * n_features] = 1.
    all_lipschitz = []
    num_bad_steps = 0

    # evaluate the objective and its gradient at the starting point
    f_t, grad = f_grad(x)

    pbar = trange(max_iter, disable=(verbose == 0))
    it = 0
    for it in pbar:

        # FW oracle
        idx_oracle = np.argmax(np.abs(grad))
        if grad[idx_oracle] > 0:
            idx_oracle += n_features
        mag_oracle = alpha * np.sign(-grad[idx_oracle % n_features])

        # Away Oracle
        _, idx_oracle_away = max_active(grad,
                                        active_set,
                                        n_features,
                                        include_zero=False)

        mag_away = alpha * np.sign(float(n_features - idx_oracle_away))

        is_away_zero = False
        if idx_oracle_away < 0 or (active_set[2 * n_features] > 0 and
                                   grad[idx_oracle_away % n_features] * mag_away < 0):
            is_away_zero = True
            mag_away = 0.
            gamma_max = active_set[2 * n_features]
        else:
            assert grad[idx_oracle_away %
                        n_features] * mag_away > grad.dot(x) - 1e-3
            gamma_max = active_set[idx_oracle_away]

        if gamma_max <= 0:
            pbar.close()
            raise ValueError("The maximum admissible step size should be positive")

        g_t = grad[idx_oracle_away % n_features] * mag_away - \
              grad[idx_oracle % n_features] * mag_oracle
        if g_t <= tol:
            break

        d2_t = 2 * (alpha**2)
        if backtracking:
            # because of the specific form of the update
            # we can achieve some extra efficiency this way
            for i in range(100):
                x_next = x.copy()
                step_size = min(g_t / (d2_t * lipschitz_t), gamma_max)

                x_next[idx_oracle % n_features] += step_size * mag_oracle
                x_next[idx_oracle_away % n_features] -= step_size * mag_away
                f_next, grad_next = f_grad(x_next)
                if step_size < 1e-7:
                    break
                elif f_next - f_t <= -g_t * step_size + 0.5 * (
                        step_size**2) * lipschitz_t * d2_t:
                    if i == 0:
                        lipschitz_t *= 0.999
                    break
                else:
                    lipschitz_t *= 2
        else:
            x_next = x.copy()
            step_size = min(g_t / (d2_t * lipschitz_t), gamma_max)
            x_next[idx_oracle % n_features] = (
                x[idx_oracle % n_features] + step_size * mag_oracle)
            x_next[idx_oracle_away % n_features] = (
                x[idx_oracle_away % n_features] - step_size * mag_away)
            f_next, grad_next = f_grad(x_next)

        if lipschitz_t >= 1e10:
            raise ValueError("The Lipschitz estimate diverged during backtracking")
        # update the iterate and the active set weights
        x = x_next
        active_set[idx_oracle] += step_size
        if is_away_zero:
            active_set[2 * n_features] -= step_size
        else:
            active_set[idx_oracle_away] -= step_size
        if active_set[idx_oracle_away] < 0:
            raise ValueError("The active set weights should remain non-negative")
        if active_set[idx_oracle] > 1:
            raise ValueError("The active set weights should not exceed one")

        f_t, grad = f_next, grad_next

        if gamma_max < 1 and step_size == gamma_max:
            num_bad_steps += 1

        if it % 100 == 0:
            all_lipschitz.append(lipschitz_t)
        pbar.set_postfix(tol=g_t,
                         gmax=gamma_max,
                         gamma=step_size,
                         L_t_mean=np.mean(all_lipschitz),
                         L_t=lipschitz_t,
                         bad_steps_quot=(num_bad_steps) / (it + 1))

        if callback is not None:
            callback(locals())

    if callback is not None:
        callback(locals())
    pbar.close()
    return optimize.OptimizeResult(x=x, nit=it, certificate=g_t)
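
A minimal usage sketch for minimize_pfw_l1 as defined above. It illustrates the f_grad contract (return the objective value together with the gradient) on a synthetic least-squares problem; the names A, b and f_grad are made up for the illustration, and actually running the call assumes the helpers used inside the function (np, trange, utils, max_active, optimize) are importable, e.g. from the copt package.

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(20, 10)
b = rng.randn(20)

def f_grad(x):
    # least-squares objective 0.5 * ||A x - b||^2 and its gradient
    residual = A.dot(x) - b
    return 0.5 * residual.dot(residual), A.T.dot(residual)

result = minimize_pfw_l1(f_grad, alpha=1.0, n_features=10)
print(result.x, result.certificate)
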
Example 2
def minimize_proximal_gradient(
    fun,
    x0,
    prox=None,
    jac="2-point",
    tol=1e-6,
    max_iter=500,
    args=(),
    verbose=0,
    callback=None,
    step="backtracking",
    accelerated=False,
    eps=1e-8,
    max_iter_backtracking=1000,
    backtracking_factor=0.6,
    trace_certificate=False,
):
    """Proximal gradient descent.

  Solves problems of the form

          minimize_x f(x) + g(x)

  where f is a differentiable function and we have access to the proximal
  operator of g.

  Args:
    fun : callable
        The objective function to be minimized.
            ``fun(x, *args) -> float``
        where x is an 1-D array with shape (n,) and `args`
        is a tuple of the fixed parameters needed to completely
        specify the function.

    x0 : ndarray, shape (n,)
        Initial guess. Array of real elements of size (n,),
        where 'n' is the number of independent variables.

    jac : {callable,  '2-point', bool}, optional
        Method for computing the gradient vector. If it is a callable,
        it should be a function that returns the gradient vector:
            ``jac(x, *args) -> array_like, shape (n,)``
        where x is an array with shape (n,) and `args` is a tuple with
        the fixed parameters. Alternatively, '2-point' selects a finite
        difference scheme for numerical estimation of the gradient.
        If `jac` is a Boolean and is True, `fun` is assumed to return the
        gradient along with the objective function. If False, the gradient
        will be estimated using '2-point' finite difference estimation.

    prox : callable, optional.
        Proximal operator of g. Called as ``prox(x, step_size)``; defaults
        to the identity (no non-smooth term).

    args : tuple, optional
        Extra arguments passed to the objective function and its
        derivatives (`fun` and `jac`).


    tol: float, optional
        Tolerance of the optimization procedure. The iteration stops when the
        norm of the gradient mapping (a generalization of the gradient for
        non-smooth functions) is below this tolerance.

    max_iter : int, optional.
        Maximum number of iterations.

    verbose : int, optional.
        Verbosity level, from 0 (no output) to 2 (output on each iteration)

    callback : callable, optional
        Callback function. Called at each iteration with the dictionary of
        local variables. The algorithm will exit if callback returns False.

    step : "backtracking" or callable.
        Step-size strategy to use. "backtracking" will use a backtracking
        line-search, while a callable will use the value returned by
        step(locals()).

    accelerated: boolean
        Whether to use the accelerated variant of the algorithm.

    eps: float or ndarray
        If jac is approximated, use this value for the finite-difference
        step size.

    max_iter_backtracking: int
        Maximum number of backtracking line-search iterations per outer
        iteration.

    backtracking_factor: float
        Factor in (0, 1) by which the step size is multiplied when the
        sufficient decrease condition fails during backtracking.

    trace_certificate: bool
        Whether to store the certificate (norm of the gradient mapping) at
        each iteration in the returned result.

  Returns:
    res : The optimization result represented as a
        ``scipy.optimize.OptimizeResult`` object. Important attributes are:
        ``x`` the solution array, ``success`` a Boolean flag indicating if
        the optimizer exited successfully and ``message`` which describes
        the cause of the termination. See `scipy.optimize.OptimizeResult`
        for a description of other attributes.

  References:
    Beck, Amir, and Marc Teboulle. "Gradient-based algorithms with applications
    to signal recovery." Convex optimization in signal processing and
    communications (2009)

  Examples:
    * :ref:`sphx_glr_auto_examples_plot_group_lasso.py`
  """
    x = np.asarray(x0).flatten()
    if max_iter_backtracking <= 0:
        raise ValueError("Line search iterations need to be greater than 0")

    if prox is None:

        def _prox(x, _):
            return x

        prox = _prox

    success = False
    certificate = np.nan

    if not callable(jac):
        if jac is True:
            # fun is expected to return both the objective value and the gradient
            fun = optimize.optimize.MemoizeJac(fun)
            jac = fun.derivative
        elif jac == "2-point" or jac is False:
            # fall back to a finite-difference estimate of the gradient
            jac = None
        else:
            raise NotImplementedError("jac has unexpected value.")

    if jac is None:

        def func_and_grad(x):
            f = fun(x, *args)
            g = optimize._approx_fprime_helper(x, fun, eps, args=args, f0=f)
            return f, g

    else:

        def func_and_grad(x):
            f = fun(x, *args)
            g = jac(x, *args)
            return f, g

    # find initial step-size
    if step == "backtracking":
        step_size = 1.8 / utils.init_lipschitz(func_and_grad, x0)
    else:
        # to avoid step_size being undefined upon return
        step_size = None

    n_iterations = 0
    certificate_list = []
    # .. a while loop instead of a for loop ..
    # .. allows for infinite or floating point max_iter ..
    if not accelerated:
        fk, grad_fk = func_and_grad(x)
        while True:
            if callback is not None:
                if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                    break
            # .. compute gradient and step size
            if hasattr(step, "__call__"):
                step_size = step(locals())
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                f_next, grad_next = func_and_grad(x_next)
            elif step == "backtracking":
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                step_size *= 1.1
                for _ in range(max_iter_backtracking):
                    f_next, grad_next = func_and_grad(x_next)
                    rhs = (
                        fk
                        + grad_fk.dot(update_direction)
                        + update_direction.dot(update_direction) / (2.0 * step_size)
                    )
                    if f_next <= rhs:
                        # .. step size found ..
                        break
                    else:
                        # .. backtracking, reduce step size ..
                        step_size *= backtracking_factor
                        x_next = prox(x - step_size * grad_fk, step_size)
                        update_direction = x_next - x
                else:
                    warnings.warn("Maxium number of line-search iterations reached")
            elif step == "fixed":
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                f_next, grad_next = func_and_grad(x_next)
            else:
                raise ValueError("Step-size strategy not understood")
            certificate = np.linalg.norm((x - x_next) / step_size)
            if trace_certificate:
                certificate_list.append(certificate)
            x[:] = x_next
            fk = f_next
            grad_fk = grad_next

            if certificate < tol:
                success = True
                break

            if n_iterations >= max_iter:
                break
            else:
                n_iterations += 1
        if n_iterations >= max_iter:
            warnings.warn(
                "minimize_proximal_gradient did not reach the desired tolerance level",
                RuntimeWarning,
            )
    else:
        tk = 1
        # .. a while loop instead of a for loop ..
        # .. allows for infinite or floating point max_iter ..
        yk = x.copy()
        while True:
            grad_fk = func_and_grad(yk)[1]
            if callback is not None:
                if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                    break

            # .. compute gradient and step size
            if hasattr(step, "__call__"):
                current_step_size = step(locals())
                x_next = prox(yk - current_step_size * grad_fk, current_step_size)
                t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2
                yk = x_next + ((tk - 1.0) / t_next) * (x_next - x)

                x_prox = prox(
                    x_next - current_step_size * func_and_grad(x_next)[1],
                    current_step_size,
                )
                certificate = np.linalg.norm((x - x_prox) / current_step_size)
                tk = t_next
                x = x_next.copy()

            elif step == "backtracking":
                current_step_size = step_size
                x_next = prox(yk - current_step_size * grad_fk, current_step_size)
                for _ in range(max_iter_backtracking):
                    update_direction = x_next - yk
                    if func_and_grad(x_next)[0] <= func_and_grad(yk)[0] + grad_fk.dot(
                        update_direction
                    ) + update_direction.dot(update_direction) / (
                        2.0 * current_step_size
                    ):
                        # .. step size found ..
                        break
                    else:
                        # .. backtracking, reduce step size ..
                        current_step_size *= backtracking_factor
                        x_next = prox(
                            yk - current_step_size * grad_fk, current_step_size
                        )
                else:
                    warnings.warn("Maxium number of line-search iterations reached")
                t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2
                yk = x_next + ((tk - 1.0) / t_next) * (x_next - x)

                x_prox = prox(
                    x_next - current_step_size * func_and_grad(x_next)[1],
                    current_step_size,
                )
                certificate = np.linalg.norm((x - x_prox) / current_step_size)
                if trace_certificate:
                    certificate_list.append(certificate)
                tk = t_next
                x = x_next.copy()

            if certificate < tol:
                success = True
                break

            if n_iterations >= max_iter:
                break
            else:
                n_iterations += 1

        if n_iterations >= max_iter:
            warnings.warn(
                "minimize_proximal_gradient did not reach the desired tolerance level",
                RuntimeWarning,
            )

    return optimize.OptimizeResult(
        x=x,
        success=success,
        certificate=certificate,
        nit=n_iterations,
        step_size=step_size,
        trace_certificate=certificate_list,
    )
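
A minimal usage sketch for minimize_proximal_gradient above, solving a small lasso problem. With jac=True the objective is expected to return both the value and the gradient, and prox receives the point and the step size. The data and the names f_grad, soft_threshold and lam are made up for the illustration, and running the call assumes the helpers used inside the function (np, optimize, utils, warnings) are available, e.g. through the copt package.

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(30, 10)
b = rng.randn(30)
lam = 0.1  # strength of the l1 penalty

def f_grad(x):
    # smooth part 0.5 * ||A x - b||^2, returned together with its gradient
    residual = A.dot(x) - b
    return 0.5 * residual.dot(residual), A.T.dot(residual)

def soft_threshold(x, step_size):
    # proximal operator of step_size * lam * ||.||_1
    return np.sign(x) * np.maximum(np.abs(x) - step_size * lam, 0.0)

result = minimize_proximal_gradient(
    f_grad, np.zeros(10), prox=soft_threshold, jac=True)
print(result.x, result.certificate)
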
Example 3
def minimize_frank_wolfe(f_grad,
                         lmo,
                         x0,
                         lipschitz=None,
                         max_iter=1000,
                         tol=1e-12,
                         line_search=True,
                         callback=None,
                         verbose=0):
    r"""Frank-Wolfe algorithm.

  This method solves optimization problems of the form

  .. math::
      \argmin_{\bs{x} \in \mathcal{D}} f(\bs{x})

  where f is a differentiable function for which we have access to its
  gradient and D is a compact set for which we have access to its
  linear minimization oracle (lmo), i.e., a routine that, given a vector
  :math:`\bs{u}`, returns a solution to

  .. math::
      \argmax_{\bs{x} \in \mathcal{D}}\, \langle\bs{u}, \bs{x}\rangle

  Args:
    f_grad: callable
        Takes as input the current iterate (a vector of same size as x0) and
        returns the function value and gradient of the objective function.
        It should accept the optional argument return_gradient, and when False
        it should return only the function value.

    lmo: callable
        Takes as input a vector u of same size as x0 and returns a solution to
        the linear minimization oracle (defined above).

    x0 : array-like
        Initial guess for solution.

    lipschitz: float (optional)
        Estimate for the Lipschitz constant of the gradient.

    max_iter: integer
        Maximum number of iterations to run.

    tol: float
        Tolerance of the stopping criterion, measured on the Frank-Wolfe gap.

    line_search: boolean or callable
        If True, use an adaptive (backtracking) step size based on the local
        Lipschitz estimate. If False, use the step size derived from the
        Lipschitz estimate without backtracking. A callable is called with the
        dictionary of local variables and must return the step size.

    callback: callable
        Optional callback, called with the dictionary of local variables at
        each iteration.

    verbose: int
        Verbosity level. 0 disables the progress bar.


  Returns:
    res : scipy.optimize.OptimizeResult
        The optimization result represented as a
        ``scipy.optimize.OptimizeResult`` object. Important attributes are:
        ``x`` the solution array, ``success`` a Boolean flag indicating if
        the optimizer exited successfully and ``message`` which describes
        the cause of the termination. See `scipy.optimize.OptimizeResult`
        for a description of other attributes.


  References:
    Jaggi, Martin. `"Revisiting Frank-Wolfe: Projection-Free Sparse Convex
    Optimization." <http://proceedings.mlr.press/v28/jaggi13-supp.pdf>`_ ICML
    2013.

    Pedregosa, Fabian `"Notes on the Frank-Wolfe Algorithm"
    <http://fa.bianp.net/blog/2018/notes-on-the-frank-wolfe-algorithm-part-i/>`_,
    2018

    Pedregosa, Fabian, et al. `"Step-Size Adaptivity in Projection-Free
    Optimization." <https://arxiv.org/pdf/1806.05123.pdf>`_ arXiv preprint
    arXiv:1806.05123 (2018).
  """
    x0 = sparse.csr_matrix(x0).T
    if tol < 0:
        raise ValueError('Tol must be non-negative')
    x = x0.copy()
    pbar = trange(max_iter, disable=(verbose == 0))
    f_t, grad = f_grad(x)
    if lipschitz is None:
        lipschitz_t = utils.init_lipschitz(f_grad, x0)
    else:
        lipschitz_t = lipschitz
    it = 0
    for it in pbar:
        s_t = lmo(-grad)
        d_t = s_t - x

        g_t = -safe_sparse_dot(d_t.T, grad)
        if sparse.issparse(g_t):
            g_t = g_t[0, 0]
        else:
            g_t = g_t[0]
        if g_t <= tol:
            break
        d2_t = splinalg.norm(d_t)**2
        if hasattr(line_search, '__call__'):
            step_size = line_search(locals())
            f_next, grad_next = f_grad(x + step_size * d_t)
        elif line_search:
            ratio_decrease = 0.999
            ratio_increase = 2
            for i in range(max_iter):
                step_size = min(g_t / (d2_t * lipschitz_t), 1)
                rhs = (f_t - step_size * g_t +
                       0.5 * (step_size**2) * lipschitz_t * d2_t)
                f_next, grad_next = f_grad(x + step_size * d_t)
                if f_next <= rhs + 1e-6:
                    if i == 0:
                        lipschitz_t *= ratio_decrease
                    break
                else:
                    lipschitz_t *= ratio_increase
        else:
            step_size = min(g_t / (d2_t * lipschitz_t), 1)
            f_next, grad_next = f_grad(x + step_size * d_t)
        if callback is not None:
            callback(locals())
        x += step_size * d_t
        pbar.set_postfix(tol=g_t, iter=it, L_t=lipschitz_t)

        f_t, grad = f_next, grad_next
    if callback is not None:
        callback(locals())
    pbar.close()
    x_final = x.toarray().ravel()
    return optimize.OptimizeResult(x=x_final, nit=it, certificate=g_t)
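
A minimal usage sketch for minimize_frank_wolfe above, minimizing a least-squares objective over the l1 ball. The solver keeps its iterate as a sparse column vector, so the illustrative f_grad densifies its input and the illustrative lmo_l1 returns a sparse vertex; lmo_l1 maximizes <u, s> over the ball, which matches the lmo(-grad) call in the loop. Running the call assumes the helpers used inside the function (np, sparse, splinalg, safe_sparse_dot, trange, utils, optimize) are available, e.g. through the copt package.

import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
A = rng.randn(20, 10)
b = rng.randn(20)
alpha = 1.0  # radius of the l1 ball

def f_grad(x):
    # least-squares objective 0.5 * ||A x - b||^2; x may be a sparse (n, 1) column
    x = np.asarray(x.todense()).ravel() if sparse.issparse(x) else np.asarray(x).ravel()
    residual = A.dot(x) - b
    return 0.5 * residual.dot(residual), A.T.dot(residual)

def lmo_l1(u):
    # vertex of the l1 ball maximizing <u, s>: +/- alpha on the largest |u_i|
    u = np.asarray(u).ravel()
    idx = np.abs(u).argmax()
    s = sparse.lil_matrix((u.size, 1))
    s[idx, 0] = alpha * np.sign(u[idx])
    return s.tocsr()

L = np.linalg.norm(A, ord=2) ** 2  # Lipschitz constant of the gradient
result = minimize_frank_wolfe(f_grad, lmo_l1, np.zeros(10), lipschitz=L)
print(result.x)
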