class RMSprop(optimizer.GradientMethod): """RMSprop optimizer. See: T. Tieleman and G. Hinton (2012). Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning. Args: lr (float): Learning rate. alpha (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. """ def __init__(self, lr=_default_hyperparam.lr, alpha=_default_hyperparam.alpha, eps=_default_hyperparam.eps): super(RMSprop, self).__init__() self.hyperparam.lr = lr self.hyperparam.alpha = alpha self.hyperparam.eps = eps lr = optimizer.HyperparameterProxy('lr') alpha = optimizer.HyperparameterProxy('alpha') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return RMSpropRule(self.hyperparam)
class RMSpropGraves(optimizer.GradientMethod): """Alex Graves's RMSprop. See: http://arxiv.org/abs/1308.0850 Args: lr (float): Learning rate. alpha (float): Exponential decay rate of the first and second order moments of the raw gradient. momentum (float): Exponential decay rate of the first order moment of the adjusted gradient. eps (float): Small value for the numerical stability. """ def __init__(self, lr=_default_hyperparam.lr, alpha=_default_hyperparam.alpha, momentum=_default_hyperparam.momentum, eps=_default_hyperparam.eps, model=None): super(RMSpropGraves, self).__init__(model) self.hyperparam.lr = lr self.hyperparam.alpha = alpha self.hyperparam.momentum = momentum self.hyperparam.eps = eps lr = optimizer.HyperparameterProxy('lr') alpha = optimizer.HyperparameterProxy('alpha') momentum = optimizer.HyperparameterProxy('momentum') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return RMSpropGravesRule(self.hyperparam)
class AdaDeltaWithLearningRate(optimizer.GradientMethod): """Zeiler's ADADELTA with a learning rate. See: http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf Args: lr (float): Learning rate. rho (float): Exponential decay rate of the first and second order moments. eps (float): Small value for the numerical stability. """ def __init__(self, lr=_default_hyperparam.lr, rho=_default_hyperparam.rho, eps=_default_hyperparam.eps): super(AdaDeltaWithLearningRate, self).__init__() self.hyperparam.lr = lr self.hyperparam.rho = rho self.hyperparam.eps = eps lr = optimizer.HyperparameterProxy('lr') rho = optimizer.HyperparameterProxy('rho') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return AdaDeltaRuleWithLearningRate(self.hyperparam)
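# A minimal NumPy sketch of what ``AdaDeltaRuleWithLearningRate`` (defined elsewhere)
# is assumed to compute: Zeiler's ADADELTA accumulators, with the extra ``lr`` simply
# scaling the resulting step.  The placement of ``lr`` is an assumption, not taken
# from the rule's actual source.
import numpy as np

def adadelta_lr_step(param, grad, msg, msdx, lr=1.0, rho=0.95, eps=1e-6):
    msg = rho * msg + (1 - rho) * grad * grad        # running average of g^2
    dx = np.sqrt((msdx + eps) / (msg + eps)) * grad  # adaptive step
    msdx = rho * msdx + (1 - rho) * dx * dx          # running average of dx^2
    return param - lr * dx, msg, msdx                # assumed lr scaling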
class RAdam(optimizer.GradientMethod): """Rectified Adam optimizer. See: `On the Variance of the Adaptive Learning Rate and Beyond \ <https://arxiv.org/abs/1908.03265>`_ Args: alpha (float): Coefficient of learning rate. beta1 (float): Exponential decay rate of the first order moment. beta2 (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. eta (float): Schedule multiplier, can be used for warm restarts. weight_decay_rate (float): Weight decay rate. """ def __init__(self, alpha=_default_hyperparam.alpha, beta1=_default_hyperparam.beta1, beta2=_default_hyperparam.beta2, eps=_default_hyperparam.eps, weight_decay_rate=_default_hyperparam.weight_decay_rate): super(RAdam, self).__init__() self.hyperparam.alpha = alpha self.hyperparam.beta1 = beta1 self.hyperparam.beta2 = beta2 self.hyperparam.eps = eps self.hyperparam.weight_decay_rate = weight_decay_rate alpha = optimizer.HyperparameterProxy('alpha') beta1 = optimizer.HyperparameterProxy('beta1') beta2 = optimizer.HyperparameterProxy('beta2') eps = optimizer.HyperparameterProxy('eps') weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate') def create_update_rule(self): return RAdamRule(self.hyperparam) @property def alpha_t(self): return _learning_rate(self.hyperparam, self.t) @property def lr(self): warnings.warn( 'RAdam.lr has been renamed to RAdamRule.alpha_t. ' 'Use of RAdam.lr is deprecated in Chainer v6.', DeprecationWarning) return self.alpha_t
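# ``_learning_rate`` (used by ``alpha_t`` above) is not shown in this file.  The
# helper below sketches the rectification from the RAdam paper (Liu et al., 2019);
# it illustrates the formula and is not necessarily the exact helper used here.
import math

def radam_step_size(alpha, beta1, beta2, t):
    rho_inf = 2.0 / (1.0 - beta2) - 1.0
    beta2_t = beta2 ** t
    rho_t = rho_inf - 2.0 * t * beta2_t / (1.0 - beta2_t)
    if rho_t > 4.0:  # the paper uses a threshold of 4; some implementations use 5
        r_t = math.sqrt(((rho_t - 4.0) * (rho_t - 2.0) * rho_inf)
                        / ((rho_inf - 4.0) * (rho_inf - 2.0) * rho_t))
        return alpha * r_t * math.sqrt(1.0 - beta2_t) / (1.0 - beta1 ** t)
    # early iterations: fall back to an unadapted, momentum-only style step
    return alpha / (1.0 - beta1 ** t)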
class Hamiltonian(optimizer.GradientMethod): def __init__(self, epsilon=_default_hyperparam.epsilon, delta=_default_hyperparam.delta): super(Hamiltonian, self).__init__() self.hyperparam.epsilon = epsilon self.hyperparam.delta = delta epsilon = optimizer.HyperparameterProxy('epsilon') delta = optimizer.HyperparameterProxy('delta') def create_update_rule(self): return HamiltonianRule(self.hyperparam)
class CorrectedMomentumSGD(optimizer.GradientMethod): """Momentum SGD optimizer. This implements momentum correction discussed in the third section of `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour <https://arxiv.org/abs/1706.02677>`_. :class:`~chainer.optimizers.MomentumSGD` implements the equation (10) of the paper. This optimizer implements the equation (9). To get better understanding between the two methods, we show the equivalence between the equation (9) and modification of the equation (10) that takes momentum correction into account. First, we set :math:`v_{t} = \\eta_{t} u_t`. We substitute this relation to the equation (10). .. math:: v_{t+1} &= m\\frac{\\eta_{t+1}}{\\eta_{t}}v_t + \\eta_{t+1}g_t \\\\ &= m\\frac{\\eta_{t+1}}{\\eta_{t}}\\eta_{t}u_t + \\eta_{t+1}g_t \\\\ &= \\eta_{t+1}(m u_t + g_t) \\\\ From this result, we derive :math:`u_{t+1} = m u_t + g_t`, which is how update tensors are calculated by :class:`~chainer.optimizers.CorrectedMomentumSGD`. Thus, the equivalence is shown. Args: lr (float): Learning rate. momentum (float): Exponential decay rate of the first order moment. """ def __init__(self, lr=_default_hyperparam.lr, momentum=_default_hyperparam.momentum): super(CorrectedMomentumSGD, self).__init__() self.hyperparam.lr = lr self.hyperparam.momentum = momentum lr = optimizer.HyperparameterProxy('lr') momentum = optimizer.HyperparameterProxy('momentum') def create_update_rule(self): return CorrectedMomentumSGDRule(self.hyperparam)
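# A tiny numeric check of the equivalence stated in the docstring above: the
# momentum-corrected form of equation (10) and the ``u``-based form of equation (9)
# trace out the same parameter trajectory under a changing learning-rate schedule.
# The gradients and schedule below are arbitrary and purely for illustration.
import numpy as np

rng = np.random.RandomState(0)
grads = rng.randn(5)                  # g_0 ... g_4
etas = [0.1, 0.1, 0.05, 0.05, 0.01]   # a decaying schedule eta_1 ... eta_5
m = 0.9

# equation (10) with momentum correction: v <- m * (eta_new / eta_old) * v + eta_new * g
p_v, v, prev_eta = 0.0, 0.0, etas[0]
for g, eta in zip(grads, etas):
    v = m * (eta / prev_eta) * v + eta * g
    p_v -= v
    prev_eta = eta

# equation (9): u <- m * u + g, scaled by the current learning rate when applied
p_u, u = 0.0, 0.0
for g, eta in zip(grads, etas):
    u = m * u + g
    p_u -= eta * u

assert abs(p_v - p_u) < 1e-12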
class LBFGS(optimizer.GradientMethod): """L-BFGS. """ def __init__(self, lr=_default_hyperparam.lr, stack_size=_default_hyperparam.stack_size, min_ro=_default_hyperparam.min_ro): super(LBFGS, self).__init__() self.hyperparam.lr = lr self.hyperparam.stack_size = stack_size self.hyperparam.min_ro = min_ro lr = optimizer.HyperparameterProxy('lr') stack_size = optimizer.HyperparameterProxy('stack_size') min_ro = optimizer.HyperparameterProxy('min_ro') def create_update_rule(self): return LBFGSRule(self.hyperparam)
class MSVAG(optimizer.GradientMethod): """M-SVAG optimizer. See: `Dissecting Adam: The Sign, Magnitude and Variance of Stochastic \ Gradients <https://arxiv.org/abs/1705.07774>`_ Modified for proper weight decay (also called AdamW). AdamW introduces the additional parameters ``eta`` and ``weight_decay_rate``, which can be used to properly scale the learning rate, and decouple the weight decay rate from ``lr``, as shown in the below paper. See: `Fixing Weight Decay Regularization in Adam \ <https://openreview.net/forum?id=rk6qdGgCZ>`_ Args: lr (float): Learning rate. beta (float): Exponential decay rate of the first and second order moments. eta (float): Schedule multiplier, can be used for warm restarts. weight_decay_rate (float): Weight decay rate. """ def __init__(self, lr=_default_hyperparam.lr, beta=_default_hyperparam.beta, eta=_default_hyperparam.eta, weight_decay_rate=_default_hyperparam.weight_decay_rate): super(MSVAG, self).__init__() self.hyperparam.lr = lr self.hyperparam.beta = beta self.hyperparam.eta = eta self.hyperparam.weight_decay_rate = weight_decay_rate lr = optimizer.HyperparameterProxy('lr') beta = optimizer.HyperparameterProxy('beta') eta = optimizer.HyperparameterProxy('eta') weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate') def create_update_rule(self): return MSVAGRule(self.hyperparam)
class Yogi(optimizer.GradientMethod): """Yogi optimizer. See: `Adaptive Methods for Nonconvex Optimization \ <https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization>`_ See :class:`~chainer.optimizers.Adam` for weight decay and AMSGrad options. Args: alpha (float): Coefficient of learning rate. beta1 (float): Exponential decay rate of the first order moment. beta2 (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. eta (float): Schedule multiplier, can be used for warm restarts. weight_decay_rate (float): Weight decay rate. amsgrad (bool): Whether to use AMSGrad variant of Yogi. """ def __init__(self, alpha=_default_hyperparam.alpha, beta1=_default_hyperparam.beta1, beta2=_default_hyperparam.beta2, eps=_default_hyperparam.eps, eta=_default_hyperparam.eta, weight_decay_rate=_default_hyperparam.weight_decay_rate, amsgrad=_default_hyperparam.amsgrad): super(Yogi, self).__init__() self.hyperparam.alpha = alpha self.hyperparam.beta1 = beta1 self.hyperparam.beta2 = beta2 self.hyperparam.eps = eps self.hyperparam.eta = eta self.hyperparam.weight_decay_rate = weight_decay_rate self.hyperparam.amsgrad = amsgrad alpha = optimizer.HyperparameterProxy('alpha') beta1 = optimizer.HyperparameterProxy('beta1') beta2 = optimizer.HyperparameterProxy('beta2') eps = optimizer.HyperparameterProxy('eps') eta = optimizer.HyperparameterProxy('eta') weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate') amsgrad = optimizer.HyperparameterProxy('amsgrad') def create_update_rule(self): return YogiRule(self.hyperparam) @property def alpha_t(self): return _learning_rate(self.hyperparam, self.t) @property def lr(self): warnings.warn( 'Yogi.lr has been renamed to YogiRule.alpha_t. ' 'Use of Yogi.lr is deprecated in Chainer v6.', DeprecationWarning) return self.alpha_t
class MomentumSGD(optimizer.GradientMethod): """Momentum SGD optimizer. Args: lr (float): Learning rate. momentum (float): Exponential decay rate of the first order moment. """ def __init__(self, lr=_default_hyperparam.lr, momentum=_default_hyperparam.momentum): super(MomentumSGD, self).__init__() self.hyperparam.lr = lr self.hyperparam.momentum = momentum lr = optimizer.HyperparameterProxy('lr') momentum = optimizer.HyperparameterProxy('momentum') def create_update_rule(self): return MomentumSGDRule(self.hyperparam)
class AdaGrad(optimizer.GradientMethod): """AdaGrad optimizer. See: http://jmlr.org/papers/v12/duchi11a.html Args: lr (float): Learning rate. eps (float): Small value for the numerical stability. """ def __init__(self, lr=_default_hyperparam.lr, eps=_default_hyperparam.eps): super(AdaGrad, self).__init__() self.hyperparam.lr = lr self.hyperparam.eps = eps lr = optimizer.HyperparameterProxy('lr') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return AdaGradRule(self.hyperparam)
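# ``AdaGradRule`` is defined elsewhere; the sketch below shows the textbook AdaGrad
# update the class above is assumed to delegate to.  Whether ``eps`` sits inside or
# outside the square root in the actual rule is not visible from this file.
import numpy as np

def adagrad_step(param, grad, h, lr=0.001, eps=1e-8):
    h = h + grad * grad                            # accumulated squared gradients
    return param - lr * grad / (np.sqrt(h) + eps), h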
class SMORMS3(optimizer.GradientMethod): """Simon Funk's SMORMS3. See http://sifter.org/~simon/journal/20150420.html. Args: lr (float): Learning rate. eps (float): Small value for the numerical stability. """ def __init__(self, lr=_default_hyperparam.lr, eps=_default_hyperparam.eps): super(SMORMS3, self).__init__() self.hyperparam.lr = lr self.hyperparam.eps = eps lr = optimizer.HyperparameterProxy('lr') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return SMORMS3Rule(self.hyperparam)
class RMSpropAsync(optimizer.GradientMethod): """RMSprop for asynchronous methods. The only difference from chainer.optimizers.RMSprop is that the epsilon is outside the square root. """ def __init__(self, lr=_default_hyperparam.lr, alpha=_default_hyperparam.alpha, eps=_default_hyperparam.eps): super(RMSpropAsync, self).__init__() self.hyperparam.lr = lr self.hyperparam.alpha = alpha self.hyperparam.eps = eps lr = optimizer.HyperparameterProxy('lr') alpha = optimizer.HyperparameterProxy('alpha') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return RMSpropAsyncRule(self.hyperparam)
class NesterovAG(optimizer.GradientMethod): """Nesterov's Accelerated Gradient. See: http://arxiv.org/abs/1212.0901 Args: lr (float): Learning rate. momentum (float): Exponential decay rate of the first order moment. """ def __init__(self, lr=_default_hyperparam.lr, momentum=_default_hyperparam.momentum): super(NesterovAG, self).__init__() self.hyperparam.lr = lr self.hyperparam.momentum = momentum lr = optimizer.HyperparameterProxy('lr') momentum = optimizer.HyperparameterProxy('momentum') def create_update_rule(self): return NesterovAGRule(self.hyperparam)
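# A sketch of Nesterov's accelerated gradient in the reformulation of the paper cited
# above (Bengio et al., arXiv:1212.0901), which works directly on the current
# parameters; ``NesterovAGRule`` is assumed to implement something equivalent.
def nesterov_ag_step(param, grad, v, lr=0.01, m=0.9):
    v = m * v - lr * grad                            # velocity update
    return param + m * m * v - (1 + m) * lr * grad, v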
class Adam(optimizer.GradientMethod): """Adam optimizer. See: `Adam: A Method for Stochastic Optimization \ <http://arxiv.org/abs/1412.6980v8>`_ Modified for proper weight decay (also called AdamW). AdamW introduces the additional parameters ``eta`` and ``weight_decay_rate``, which can be used to properly scale the learning rate, and decouple the weight decay rate from ``alpha``, as shown in the below paper. Note that with the default values ``eta = 1`` and ``weight_decay_rate = 0``, this implementation is identical to the standard Adam method. See: `Fixing Weight Decay Regularization in Adam \ <https://openreview.net/forum?id=rk6qdGgCZ>`_ Args: alpha (float): Step size. beta1 (float): Exponential decay rate of the first order moment. beta2 (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. eta (float): Schedule multiplier, can be used for warm restarts. weight_decay_rate (float): Weight decay rate. """ def __init__(self, alpha=_default_hyperparam.alpha, beta1=_default_hyperparam.beta1, beta2=_default_hyperparam.beta2, eps=_default_hyperparam.eps, eta=_default_hyperparam.eta, weight_decay_rate=_default_hyperparam.weight_decay_rate): super(Adam, self).__init__() self.hyperparam.alpha = alpha self.hyperparam.beta1 = beta1 self.hyperparam.beta2 = beta2 self.hyperparam.eps = eps self.hyperparam.eta = eta self.hyperparam.weight_decay_rate = weight_decay_rate alpha = optimizer.HyperparameterProxy('alpha') beta1 = optimizer.HyperparameterProxy('beta1') beta2 = optimizer.HyperparameterProxy('beta2') eps = optimizer.HyperparameterProxy('eps') eta = optimizer.HyperparameterProxy('eta') weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate') def create_update_rule(self): return AdamRule(self.hyperparam) @property def lr(self): fix1 = 1. - math.pow(self.hyperparam.beta1, self.t) fix2 = 1. - math.pow(self.hyperparam.beta2, self.t) return self.hyperparam.alpha * math.sqrt(fix2) / fix1
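# Usage sketch for the AdamW-style options documented in the class above.  The
# target link below is just a placeholder; any chainer.Link works.  With ``eta=1``
# and ``weight_decay_rate=0`` this reduces to plain Adam, as the docstring notes.
import chainer.links as L

model = L.Linear(3, 2)                   # placeholder target link
opt = Adam(alpha=1e-3, eta=1.0, weight_decay_rate=1e-4)
opt.setup(model)                         # GradientMethod API: attach the target
# in a training loop, either pass a loss function to opt.update(lossfun, ...)
# or compute gradients yourself and call opt.update() with no arguments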
class RMSprop(optimizer.GradientMethod): """RMSprop optimizer. See: T. Tieleman and G. Hinton (2012). Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning. Args: lr (float): Learning rate. alpha (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. eps_inside_sqrt (bool): When ``True``, gradient will be divided by :math:`\\sqrt{ms + eps}` where ``ms`` is the mean square. When ``False`` (default), gradient will be divided by :math:`\\sqrt{ms} + eps` instead. This option may be convenient for users porting code from other frameworks; see `#4754 <https://github.com/chainer/chainer/issues/4754>`__ for details. """ def __init__(self, lr=_default_hyperparam.lr, alpha=_default_hyperparam.alpha, eps=_default_hyperparam.eps, eps_inside_sqrt=_default_hyperparam.eps_inside_sqrt): super(RMSprop, self).__init__() self.hyperparam.lr = lr self.hyperparam.alpha = alpha self.hyperparam.eps = eps self.hyperparam.eps_inside_sqrt = eps_inside_sqrt lr = optimizer.HyperparameterProxy('lr') alpha = optimizer.HyperparameterProxy('alpha') eps = optimizer.HyperparameterProxy('eps') eps_inside_sqrt = optimizer.HyperparameterProxy('eps_inside_sqrt') def create_update_rule(self): return RMSpropRule(self.hyperparam)
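# The two denominators described for ``eps_inside_sqrt`` differ most when the mean
# square ``ms`` is small relative to ``eps``; the toy numbers below just make that
# visible and are not taken from the implementation.
import numpy as np

ms, eps, lr, grad = 1e-10, 1e-8, 0.01, 0.5
step_outside = lr * grad / (np.sqrt(ms) + eps)   # default: eps added after the sqrt
step_inside = lr * grad / np.sqrt(ms + eps)      # eps_inside_sqrt=True behaviour
print(step_outside, step_inside)                 # differ by roughly an order of magnitude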
class VaswaniAdam(chainer.optimizers.Adam): """Adam with the learning-rate schedule from "Attention Is All You Need" (Vaswani et al., 2017, https://arxiv.org/abs/1706.03762). Args: factor (float): Scaling factor of the schedule. warmup (int): Number of warmup steps. model_size (int): Model dimension used to scale the learning rate. inverse_square (bool): If ``True``, use the inverse-square-root schedule (``_learning_rate_fairseq``) instead of the original one. """ def __init__(self, factor, warmup, model_size, inverse_square=False, **kwargs): super(VaswaniAdam, self).__init__(**kwargs) # Vaswani self.hyperparam.factor = factor self.hyperparam.warmup = warmup self.hyperparam.model_size = model_size self.inverse_square = inverse_square def create_update_rule(self): return VaswaniAdamRule(self.hyperparam, inverse_square=self.inverse_square) # Vaswani factor = optimizer.HyperparameterProxy('factor') warmup = optimizer.HyperparameterProxy('warmup') model_size = optimizer.HyperparameterProxy('model_size') @property def lr(self): if self.inverse_square: return _learning_rate_fairseq(self.hyperparam, self.t) else: return _learning_rate(self.hyperparam, self.t)
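# Neither ``_learning_rate`` nor ``_learning_rate_fairseq`` is shown in this file.
# The sketches below are assumptions: the first is the warmup schedule from
# "Attention Is All You Need" (Vaswani et al., 2017), which the hyperparameters
# factor/warmup/model_size suggest; the second is a generic linear-warmup,
# inverse-square-root decay for the ``inverse_square`` branch.
def noam_lr(factor, model_size, warmup, step):
    step = max(step, 1)
    return factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

def inverse_sqrt_lr(factor, warmup, step):
    step = max(step, 1)
    if step < warmup:
        return factor * step / warmup           # linear warmup (assumed)
    return factor * (warmup / step) ** 0.5      # then decay proportional to 1/sqrt(step)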
class Adam(optimizer.GradientMethod): """Adam optimizer. See: http://arxiv.org/abs/1412.6980v8 Args: alpha (float): Step size. beta1 (float): Exponential decay rate of the first order moment. beta2 (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. """ def __init__(self, alpha=_default_hyperparam.alpha, beta1=_default_hyperparam.beta1, beta2=_default_hyperparam.beta2, eps=_default_hyperparam.eps, model=None): super(Adam, self).__init__(model) self.hyperparam.alpha = alpha self.hyperparam.beta1 = beta1 self.hyperparam.beta2 = beta2 self.hyperparam.eps = eps alpha = optimizer.HyperparameterProxy('alpha') beta1 = optimizer.HyperparameterProxy('beta1') beta2 = optimizer.HyperparameterProxy('beta2') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return AdamRule(self.hyperparam) @property def lr(self): fix1 = 1. - math.pow(self.hyperparam.beta1, self.t) fix2 = 1. - math.pow(self.hyperparam.beta2, self.t) return self.hyperparam.alpha * math.sqrt(fix2) / fix1
class SGD(optimizer.GradientMethod): """Vanilla Stochastic Gradient Descent. Args: lr (float): Learning rate. """ def __init__(self, lr=_default_hyperparam.lr): super(SGD, self).__init__() self.hyperparam.lr = lr lr = optimizer.HyperparameterProxy('lr') def create_update_rule(self): return SGDRule(self.hyperparam)
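# Minimal end-to-end usage sketch of the GradientMethod interface shared by the
# optimizers in this file, using plain SGD on a toy regression problem.
import numpy as np
import chainer.functions as F
import chainer.links as L

model = L.Linear(4, 1)
opt = SGD(lr=0.01)
opt.setup(model)

x = np.random.rand(8, 4).astype(np.float32)
t = np.random.rand(8, 1).astype(np.float32)

def lossfun():
    return F.mean_squared_error(model(x), t)

# update() clears gradients, runs backprop on the returned loss, and applies SGDRule
opt.update(lossfun)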
class RMSpropWarmup(optimizer.GradientMethod): """RMSprop optimizer with an SGD warmup phase. See: T. Tieleman and G. Hinton (2012). Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning. Args: lr (float): Learning rate. alpha_sgd (float): Learning rate for the SGD component. mu1 (float): Exponential decay rate of the first order moment. alpha_rmsprop (float): Learning rate for the RMSprop component. mu2 (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. lars (bool): Whether to apply layer-wise adaptive rate scaling (LARS). """ def __init__(self, lr=_default_hyperparam.lr, alpha_sgd=_default_hyperparam.alpha_sgd, mu1=_default_hyperparam.mu1, alpha_rmsprop=_default_hyperparam.alpha_rmsprop, mu2=_default_hyperparam.mu2, eps=_default_hyperparam.eps, lars=False): super(RMSpropWarmup, self).__init__() self.hyperparam.lr = lr self.hyperparam.alpha_sgd = alpha_sgd self.hyperparam.mu1 = mu1 self.hyperparam.alpha_rmsprop = alpha_rmsprop self.hyperparam.mu2 = mu2 self.hyperparam.eps = eps self._lars = lars lr = optimizer.HyperparameterProxy('lr') alpha_sgd = optimizer.HyperparameterProxy('alpha_sgd') mu1 = optimizer.HyperparameterProxy('mu1') alpha_rmsprop = optimizer.HyperparameterProxy('alpha_rmsprop') mu2 = optimizer.HyperparameterProxy('mu2') eps = optimizer.HyperparameterProxy('eps') def create_update_rule(self): return RMSpropWarmupRule(self.hyperparam, self._lars)
class Adam(optimizer.GradientMethod): """Adam optimizer. See: `Adam: A Method for Stochastic Optimization \ <https://arxiv.org/abs/1412.6980v8>`_ Modified for proper weight decay (also called AdamW). AdamW introduces the additional parameters ``eta`` and ``weight_decay_rate``, which can be used to properly scale the learning rate, and decouple the weight decay rate from ``alpha``, as shown in the below paper. Note that with the default values ``eta = 1`` and ``weight_decay_rate = 0``, this implementation is identical to the standard Adam method. See: `Fixing Weight Decay Regularization in Adam \ <https://openreview.net/forum?id=rk6qdGgCZ>`_ A flag ``amsgrad`` to use the AMSGrad variant of Adam from the paper: `On the Convergence of Adam and Beyond \ <https://openreview.net/forum?id=ryQu7f-RZ>`_ A flag ``adastand`` to use the Adastand variant of Adam from the paper: `Adaptive Learning Rate via Covariance Matrix Based Preconditioning for Deep Neural Networks \ <https://www.ijcai.org/proceedings/2017/0267.pdf>`_ Args: alpha (float): Step size. beta1 (float): Exponential decay rate of the first order moment. beta2 (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. eta (float): Schedule multiplier, can be used for warm restarts. weight_decay_rate (float): Weight decay rate. amsgrad (bool): Whether to use AMSGrad variant of Adam. adastand (bool): Whether to use Adastand variant of Adam. """ def __init__(self, alpha=_default_hyperparam.alpha, beta1=_default_hyperparam.beta1, beta2=_default_hyperparam.beta2, eps=_default_hyperparam.eps, eta=_default_hyperparam.eta, weight_decay_rate=_default_hyperparam.weight_decay_rate, amsgrad=_default_hyperparam.amsgrad, adastand=_default_hyperparam.adastand): super(Adam, self).__init__() self.hyperparam.alpha = alpha self.hyperparam.beta1 = beta1 self.hyperparam.beta2 = beta2 self.hyperparam.eps = eps self.hyperparam.eta = eta self.hyperparam.weight_decay_rate = weight_decay_rate self.hyperparam.amsgrad = amsgrad self.hyperparam.adastand = adastand alpha = optimizer.HyperparameterProxy('alpha') beta1 = optimizer.HyperparameterProxy('beta1') beta2 = optimizer.HyperparameterProxy('beta2') eps = optimizer.HyperparameterProxy('eps') eta = optimizer.HyperparameterProxy('eta') weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate') amsgrad = optimizer.HyperparameterProxy('amsgrad') adastand = optimizer.HyperparameterProxy('adastand') def create_update_rule(self): return AdamRule(self.hyperparam) @property def lr(self): return _learning_rate(self.hyperparam, self.t)
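# ``_learning_rate`` referenced by the ``lr`` property above is not defined in this
# file.  Judging from the explicit ``lr`` property of the Adam variant earlier in
# this file, it presumably returns the bias-corrected Adam step size sketched below
# (``_adam_alpha_t`` is a hypothetical name); ``eta``, weight decay, AMSGrad and
# Adastand are assumed to be handled inside ``AdamRule`` itself.
import math

def _adam_alpha_t(hp, t):
    assert t > 0, 'the step size is undefined before the first update'
    fix1 = 1.0 - math.pow(hp.beta1, t)
    fix2 = 1.0 - math.pow(hp.beta2, t)
    return hp.alpha * math.sqrt(fix2) / fix1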
class Eve(optimizer.GradientMethod): """Eve optimizer. See: https://arxiv.org/abs/1611.01505v3 Args: alpha (float): Coefficient of learning rate. beta1 (float): Exponential decay rate of the first order moment. beta2 (float): Exponential decay rate of the second order moment. beta3 (float): Exponential decay rate of the objective-dependent coefficient of learning rate. c (float): Constant used to clip the objective-dependent coefficient. eps (float): Small value for the numerical stability. eta (float): Schedule multiplier, can be used for warm restarts. f_star (float): Minimum value that the loss function can take. weight_decay_rate (float): Weight decay rate. amsgrad (bool): Whether to use AMSGrad variant of Eve. """ def __init__(self, alpha=_default_hyperparam.alpha, beta1=_default_hyperparam.beta1, beta2=_default_hyperparam.beta2, beta3=_default_hyperparam.beta3, c=_default_hyperparam.c, eps=_default_hyperparam.eps, eta=_default_hyperparam.eta, f_star=_default_hyperparam.f_star, weight_decay_rate=_default_hyperparam.weight_decay_rate, amsgrad=_default_hyperparam.amsgrad): super(Eve, self).__init__() self.hyperparam.alpha = alpha self.hyperparam.beta1 = beta1 self.hyperparam.beta2 = beta2 self.hyperparam.beta3 = beta3 self.hyperparam.c = c self.hyperparam.eps = eps self.hyperparam.eta = eta self.hyperparam.f_star = f_star self.hyperparam.weight_decay_rate = weight_decay_rate self.hyperparam.amsgrad = amsgrad alpha = optimizer.HyperparameterProxy('alpha') beta1 = optimizer.HyperparameterProxy('beta1') beta2 = optimizer.HyperparameterProxy('beta2') beta3 = optimizer.HyperparameterProxy('beta3') c = optimizer.HyperparameterProxy('c') eps = optimizer.HyperparameterProxy('eps') eta = optimizer.HyperparameterProxy('eta') f_star = optimizer.HyperparameterProxy('f_star') weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate') amsgrad = optimizer.HyperparameterProxy('amsgrad') def setup(self, link): """Sets a target link and initializes the optimizer states. Given link is set to the :attr:`target` attribute. It also prepares the optimizer state dictionaries corresponding to all parameters in the link hierarchy. The existing states are discarded. Args: link (~chainer.Link): Target link object. Returns: The optimizer instance. .. note:: As of v4.0.0, this function returns the optimizer instance itself so that you can instantiate and setup the optimizer in one line, e.g., ``optimizer = SomeOptimizer().setup(link)``. """ super(Eve, self).setup(link) self.d_tilde = numpy.nan self.f = numpy.nan return self def create_update_rule(self): return EveRule(self.hyperparam) @property def lr(self): return _learning_rate(self.hyperparam, self.t, self.d_tilde) def update(self, lossfun, *args, **kwds): """Updates parameters based on a loss function or computed gradients. Because Eve uses loss values, `lossfun` is required unlike in the case of other optimizers. Args: lossfun (callable): Callable that returns a ~chainer.Variable to be minimized. *args, **kwds: Arguments passed to `lossfun`. 
""" assert lossfun is not None, 'Eve requires lossfun to be specified' use_cleargrads = getattr(self, '_use_cleargrads', True) loss = lossfun(*args, **kwds) if use_cleargrads: self.target.cleargrads() else: self.target.zerograds() loss.backward(loss_scale=self._loss_scale) loss_value = float(loss.array) del loss self.reallocate_cleared_grads() self.call_hooks('pre') self.t += 1 self._update_d_tilde_and_f(loss_value) for param in self.target.params(): param.update_rule.d_tilde = self.d_tilde param.update() self.reallocate_cleared_grads() self.call_hooks('post') def serialize(self, serializer): """Serializes or deserializes the optimizer. It only saves or loads the following things: - Optimizer states - Global states (:attr:`t`, :attr:`epoch`, :attr:`d_tilde`, and :attr:`f`) **It does not saves nor loads the parameters of the target link.** They should be separately saved or loaded. Args: serializer (~chainer.AbstractSerializer): Serializer or deserializer object. """ super(Eve, self).serialize(serializer) self.d_tilde = serializer('d_tilde', self.d_tilde) self.f = serializer('f', self.f) def _update_d_tilde_and_f(self, loss): if self.t > 1: d = abs(loss - self.f) / (min(loss, self.f) - self.f_star) d_hat = numpy.clip(d, 1 / self.c, self.c) self.d_tilde = self.beta3 * self.d_tilde + (1 - self.beta3) * d_hat else: self.d_tilde = 1 self.f = loss
class KFAC(chainer.optimizer.GradientMethod): """K-FAC optimizer. See: `Optimizing Neural Networks with \ Kronecker-factored Approximate Curvature \ <https://arxiv.org/abs/1503.05671>`_ Args: lr (float): Learning rate. momentum (float): Exponential decay rate of the first order moment. cov_ema_decay (float): Decay factor used when calculating the covariance estimate Exponential Moving Average. inv_freq (int): Frequency to calculate the inverse of covariance estimate EMA for each layer. inv_alg (str): Algorithm used when calculating the inverse. damping (float): Damping factor used to stabilize training due to errors in the local approximation with the Fisher information matrix. Attributes: fisher_blocks (list): Keep data to compute Fisher block. """ def __init__( self, communicator=None, lr=_default_hyperparam.lr, momentum=_default_hyperparam.momentum, cov_ema_decay=_default_hyperparam.cov_ema_decay, inv_freq=_default_hyperparam.inv_freq, inv_alg=None, damping=_default_hyperparam.damping, ): super(KFAC, self).__init__() self.communicator = communicator self.hyperparam.lr = lr self.hyperparam.momentum = momentum self.hyperparam.cov_ema_decay = cov_ema_decay self.hyperparam.inv_freq = inv_freq self.hyperparam.damping = damping self.fisher_blocks = [] self.inv_alg = inv_alg lr = optimizer.HyperparameterProxy('lr') momentum = optimizer.HyperparameterProxy('momentum') cov_ema_decay = optimizer.HyperparameterProxy('cov_ema_decay') inv_freq = optimizer.HyperparameterProxy('inv_freq') damping = optimizer.HyperparameterProxy('damping') def setup(self, link): super(KFAC, self).setup(link) for linkname, sub_link in link.namedlinks(): if isinstance(sub_link, _linear_link): fb = fisher_block.FisherBlockLinear(sub_link, linkname) elif isinstance(sub_link, _convolution_2d_link): fb = fisher_block.FisherBlockConv2D(sub_link, linkname) elif isinstance(sub_link, _batch_norm_link): fb = fisher_block.FisherBlockBatchNorm(sub_link, linkname) else: continue self.fisher_blocks.append(fb) return self def create_update_rule(self): return KFACUpdateRule(self.hyperparam) def update(self, lossfun=None, *args, **kwds): if lossfun is not None: use_cleargrads = getattr(self, '_use_cleargrads', True) loss = lossfun(*args, **kwds) if use_cleargrads: self.target.cleargrads() else: self.target.zerograds() # We removed ``loss.backward()`` from here. # Do backprop, and obtain ``grads`` which contains the dependency # graph inside. backward_main = getattr(loss, '_backward_main') self.kfac_backward(self.target, backward_main) del loss # No more backward computation, free memory # Update param.kfgrad for each layer self.kfgrad_update() self.reallocate_cleared_grads() self.call_hooks('pre') self.t += 1 for param in self.target.params(): param.update() self.reallocate_cleared_grads() self.call_hooks('post') self.cov_ema_update() def kfac_backward(self, link, backward_main): """Backward function for KFAC optimizer. This function is invoked from ``KFAC.update()`` to: 1. calculate backprop 2. obtain the following data for each layer (`~chainer.link.Link`) - acts (inputs = activations after previous layer) - grads (gradients of outputs) - rank (`~chainer.FunctionNode.rank`) - conv2d_args (arguments of `~chainer.links.connection.\ convolution_2d.Convolution2D`) """ with chainer.using_config('enable_backprop', False): # To obtain grads, we need to edit a file ``variable.py`` grads = backward_main(retain_grad=True, loss_scale=None) namedparams = list(link.namedparams()) def get_linkname(param): # Get a linkname from a parameter. 
for _name, _param in namedparams: if param is _param: # Only return linkname NOT paramname. return _name[:_name.rfind('/')] return None def get_fisher_block(linkname): for fb in self.fisher_blocks: if fb.linkname == linkname: return fb return None for node, out_grads_var in grads.items(): creator_node = node.creator_node # parent function node if creator_node is not None: # ignore leaf node if not any( [isinstance(creator_node, t) for t in _target_functions]): continue (in_acts_var, param) = creator_node.get_retained_inputs() linkname = get_linkname(param) fb = get_fisher_block(linkname) fb.load_data(in_acts_var.data, out_grads_var.data) fb.load_conv2d_args(creator_node, param) def kfgrad_update(self): """Update ``param.kfgrad``, which is used for the K-FAC update of each layer. """ for fb in self.fisher_blocks: fb.update_kfgrads() def cov_ema_update(self): """Update the EMA of the covariance for each layer. """ for fb in self.fisher_blocks: fb.update_cov_emas(alpha=self.hyperparam.cov_ema_decay) def inv_update(self): """Update the inverse of the covariance EMA for each layer. """ comm = self.communicator if comm is not None: indices = [list(range(len(self.fisher_blocks)))[i::comm.size] for i in range(comm.size)] local_indices = indices[comm.rank] fisher_blocks = [self.fisher_blocks[i] for i in local_indices] else: fisher_blocks = self.fisher_blocks for fb in fisher_blocks: fb.update_invs(damping=self.hyperparam.damping)
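# Sketch of how the pieces of KFAC above are meant to fit together: ``update``
# accumulates the Kronecker-factored statistics and applies the preconditioned
# gradients, while ``inv_update`` refreshes the inverses only every ``inv_freq``
# steps.  This is illustrative only; as noted in ``kfac_backward``, the optimizer
# relies on a patched ``chainer.Variable`` to expose ``_backward_main``.
import numpy as np
import chainer.functions as F
import chainer.links as L

model = L.Linear(4, 1)            # Linear links are wrapped in FisherBlockLinear
opt = KFAC(lr=0.01, damping=0.03)
opt.setup(model)

x = np.random.rand(8, 4).astype(np.float32)
t = np.random.rand(8, 1).astype(np.float32)

for iteration in range(100):
    opt.update(lambda: F.mean_squared_error(model(x), t))
    if (iteration + 1) % opt.hyperparam.inv_freq == 0:
        opt.inv_update()          # refresh the inverse covariance estimates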
class Adam(optimizer.GradientMethod): """Adam optimizer. See: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980v8>`_ Modified for proper weight decay (also called :class:`~chainer.optimizers.AdamW`). AdamW introduces the additional parameters ``eta`` and ``weight_decay_rate``, which can be used to properly scale the learning rate, and decouple the weight decay rate from ``alpha``, as shown in the below paper. Note that with the default values ``eta = 1`` and ``weight_decay_rate = 0``, this implementation is identical to the standard Adam method. See: `Fixing Weight Decay Regularization in Adam <https://openreview.net/forum?id=rk6qdGgCZ>`_ A flag ``amsgrad`` to use the :class:`~chainer.optimizers.AMSGrad` variant of Adam from the paper: `On the Convergence of Adam and Beyond <https://openreview.net/forum?id=ryQu7f-RZ>`_ A flag ``adabound`` to use the :class:`~chainer.optimizers.AdaBound` variant of Adam from the paper: `Adaptive Gradient Methods with Dynamic Bound of Learning Rate <https://openreview.net/forum?id=Bkg3g2R9FX>`_ If both ``amsgrad`` and ``adabound`` are ``True``, the optimizer is equivalent to :class:`~chainer.optimizers.AMSBound` proposed in the AdaBound paper. Args: alpha (float): Coefficient of learning rate. beta1 (float): Exponential decay rate of the first order moment. beta2 (float): Exponential decay rate of the second order moment. eps (float): Small value for the numerical stability. eta (float): Schedule multiplier, can be used for warm restarts. weight_decay_rate (float): Weight decay rate. amsgrad (bool): Whether to use AMSGrad variant of Adam. adabound (bool): Whether to use the AdaBound variant of Adam. final_lr (float): Final (SGD) learning rate in AdaBound. gamma (float): Convergence speed of the bound functions in AdaBound. """ def __init__(self, alpha=_default_hyperparam.alpha, beta1=_default_hyperparam.beta1, beta2=_default_hyperparam.beta2, eps=_default_hyperparam.eps, eta=_default_hyperparam.eta, weight_decay_rate=_default_hyperparam.weight_decay_rate, amsgrad=_default_hyperparam.amsgrad, adabound=_default_hyperparam.adabound, final_lr=_default_hyperparam.final_lr, gamma=_default_hyperparam.gamma): super(Adam, self).__init__() self.hyperparam.alpha = alpha self.hyperparam.beta1 = beta1 self.hyperparam.beta2 = beta2 self.hyperparam.eps = eps self.hyperparam.eta = eta self.hyperparam.weight_decay_rate = weight_decay_rate self.hyperparam.amsgrad = amsgrad self.hyperparam.adabound = adabound self.hyperparam.final_lr = final_lr self.hyperparam.gamma = gamma alpha = optimizer.HyperparameterProxy('alpha') beta1 = optimizer.HyperparameterProxy('beta1') beta2 = optimizer.HyperparameterProxy('beta2') eps = optimizer.HyperparameterProxy('eps') eta = optimizer.HyperparameterProxy('eta') weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate') amsgrad = optimizer.HyperparameterProxy('amsgrad') adabound = optimizer.HyperparameterProxy('adabound') final_lr = optimizer.HyperparameterProxy('final_lr') gamma = optimizer.HyperparameterProxy('gamma') def create_update_rule(self): return AdamRule(self.hyperparam) @property def alpha_t(self): return _learning_rate(self.hyperparam, self.t) @property def lr(self): warnings.warn( 'Adam.lr has been renamed to AdamRule.alpha_t. ' 'Use of Adam.lr is deprecated in Chainer v6.', DeprecationWarning) return self.alpha_t
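# ``AdamRule`` (not shown here) performs the AdaBound clipping.  The bound functions
# below are the ones given in the AdaBound paper: as ``t`` grows they squeeze the
# per-element adaptive learning rate toward ``final_lr``, so the method interpolates
# from Adam toward SGD.  Any additional scaling of ``final_lr`` inside the rule is
# not visible from this file.
def adabound_bounds(final_lr, gamma, t):
    lower = final_lr * (1.0 - 1.0 / (gamma * t + 1.0))
    upper = final_lr * (1.0 + 1.0 / (gamma * t))
    return lower, upper
# the clipped rate, roughly clip(alpha_t / (sqrt(v_hat) + eps), lower, upper),
# then multiplies the first-moment estimate to form the parameter update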