Example #1
0
def adagrad(learning_rate: ScalarOrSchedule,
            initial_accumulator_value: float = 0.1,
            eps: float = 1e-6) -> Optimizer:
  """Builds the Adagrad optimizer.

  Adagrad adapts the step size per parameter during training by scaling the
  global learning rate with an accumulated sum of squared gradients.

  WARNING: the squared-gradient accumulator in the denominator only ever
  grows (every term is >= 0), so the effective learning rate shrinks
  monotonically and may eventually become vanishingly small.

  References:
    [Duchi et al, 2011](https://jmlr.org/papers/v12/duchi11a.html)

  Args:
    learning_rate: A fixed global scaling factor.
    initial_accumulator_value: Starting value for the accumulator.
    eps: Small constant added to the denominator inside the square root (as
      in RMSProp) to avoid division by zero when rescaling.

  Returns:
    The corresponding `Optimizer`.
  """
  # Delegate to optax and wrap the resulting gradient transformation.
  transform = optax.adagrad(
      learning_rate=learning_rate,
      initial_accumulator_value=initial_accumulator_value,
      eps=eps)
  return create_optimizer_from_optax(transform)
Example #2
0
 def __init__(
     self,
     learning_rate: float = 0.001,
     initial_accumulator_value: float = 0.1,
     eps: float = 1e-7,
 ):
     """Initializes the Adagrad optimizer wrapper.

     Args:
         learning_rate: Fixed global step-size scaling factor.
         initial_accumulator_value: Starting value of the per-parameter
             squared-gradient accumulator.
         eps: Small constant added to the denominator to avoid division
             by zero.
     """
     # Python 3 zero-argument form; equivalent to super(Adagrad, self).
     super().__init__(learning_rate=learning_rate)
     self._initial_accumulator_value = initial_accumulator_value
     self._eps = eps
     self._optimizer = optax.adagrad(
         learning_rate=learning_rate,
         initial_accumulator_value=initial_accumulator_value,
         eps=eps,
     )
     # JIT-compile the update step once so repeated calls avoid retracing.
     self._optimizer_update = jit(self._optimizer.update)
Example #3
0
def get_optimizer(optimizer_name: OptimizerName,
                  learning_rate: float,
                  momentum: float = 0.0,
                  adam_beta1: float = 0.9,
                  adam_beta2: float = 0.999,
                  adam_epsilon: float = 1e-8,
                  rmsprop_decay: float = 0.9,
                  rmsprop_epsilon: float = 1e-8,
                  adagrad_init_accumulator: float = 0.1,
                  adagrad_epsilon: float = 1e-6) -> Optimizer:
  """Builds the optimizer selected by `optimizer_name`.

  Args:
    optimizer_name: One of SGD, MOMENTUM, ADAM, RMSPROP, ADAGRAD.
    learning_rate: Learning rate used by every optimizer.
    momentum: Momentum parameter (MOMENTUM only).
    adam_beta1: beta1 parameter (ADAM only).
    adam_beta2: beta2 parameter (ADAM only).
    adam_epsilon: epsilon parameter (ADAM only).
    rmsprop_decay: decay parameter (RMSPROP only).
    rmsprop_epsilon: epsilon parameter (RMSPROP only).
    adagrad_init_accumulator: initial accumulator value (ADAGRAD only).
    adagrad_epsilon: epsilon parameter (ADAGRAD only).

  Returns:
    The Optimizer with the specified properties.

  Raises:
    ValueError: If `optimizer_name` is not one of SGD, MOMENTUM, ADAM,
      RMSPROP, or ADAGRAD.
  """
  # Build the underlying optax gradient transformation, then wrap it.
  if optimizer_name == OptimizerName.SGD:
    transform = optax.sgd(learning_rate)
  elif optimizer_name == OptimizerName.MOMENTUM:
    transform = optax.sgd(learning_rate, momentum)
  elif optimizer_name == OptimizerName.ADAM:
    transform = optax.adam(
        learning_rate, b1=adam_beta1, b2=adam_beta2, eps=adam_epsilon)
  elif optimizer_name == OptimizerName.RMSPROP:
    transform = optax.rmsprop(
        learning_rate, decay=rmsprop_decay, eps=rmsprop_epsilon)
  elif optimizer_name == OptimizerName.ADAGRAD:
    transform = optax.adagrad(
        learning_rate,
        initial_accumulator_value=adagrad_init_accumulator,
        eps=adagrad_epsilon)
  else:
    raise ValueError(f'Unsupported optimizer_name {optimizer_name}.')
  return Optimizer(*transform)
Example #4
0
    def test_adagrad(self):
        """precondition_by_rss + zero-decay EMA must reproduce optax.adagrad."""
        reference = optax.adagrad(0.7, initial_accumulator_value=0.3)
        candidate = transform_chain(
            ['precondition_by_rss', 'first_moment_ema'],
            [{'initial_accumulator_value': 0.3}, {'decay': 0.0}],
            learning_rate=0.7)

        expected_states = _optimizer_loop(reference)
        actual_states = _optimizer_loop(candidate)

        # Compare the trajectories step by step.
        for expected, actual in zip(expected_states, actual_states):
            chex.assert_trees_all_close(expected, actual)
Example #5
0
def AdaGrad(
    learning_rate: float = 0.001,
    epscut: float = 1.0e-7,
    initial_accumulator_value: float = 0.1,
):
    r"""AdaGrad Optimizer.

    In many cases, in Sgd the learning rate :math:`\eta` should
    decay as a function of training iteration to prevent overshooting
    as the optimum is approached. AdaGrad is an adaptive learning
    rate algorithm that automatically scales the learning rate with a sum
    over past gradients. The vector :math:`\mathbf{g}` is initialized to zero.
    Given a stochastic estimate of the gradient of the cost function
    :math:`G(\mathbf{p})`, the updates for :math:`g_k` and the parameter
    :math:`p_k` are

    .. math:: g^\prime_k &= g_k + G_k(\mathbf{p})^2\\
              p^\prime_k &= p_k - \frac{\eta}{\sqrt{g^\prime_k + \epsilon}}G_k(\mathbf{p})

    AdaGrad has been shown to perform particularly well when
    the gradients are sparse, but the learning rate may become too small
    after many updates because the sum over the squares of past gradients is
    cumulative.

    Args:
       learning_rate: Learning rate :math:`\eta`.
       epscut: Small :math:`\epsilon` cutoff.
       initial_accumulator_value: initial value of the accumulator

    Examples:
       Simple AdaGrad optimizer.

       >>> from netket.optimizer import AdaGrad
       >>> op = AdaGrad()
    """
    # Local import keeps optax an optional dependency until this is called.
    from optax import adagrad

    return adagrad(
        learning_rate, eps=epscut, initial_accumulator_value=initial_accumulator_value
    )
Example #6
0
 def update(
         self, gradient: Weights, state: GenericGradientState,
         parameters: Optional[Weights]
 ) -> Tuple[Weights, GenericGradientState]:
     """Applies one AdaGrad step and re-wraps the resulting optimizer state."""
     # Rebuild the optax transform from this dataclass's fields, then step it.
     optimizer = adagrad(**asdict(self))
     step_result = optimizer.update(gradient, state.data, parameters)
     return GenericGradientState.wrap(*step_result)
Example #7
0
 def init(self, parameters: Weights) -> GenericGradientState:
     """Creates the initial AdaGrad optimizer state for `parameters`."""
     # Rebuild the optax transform from this dataclass's fields.
     optimizer = adagrad(**asdict(self))
     return GenericGradientState(optimizer.init(parameters))