Example #1
class RMSprop(optimizer.GradientMethod):
    """RMSprop optimizer.

    See: T. Tieleman and G. Hinton (2012). Lecture 6.5 - rmsprop, COURSERA:
    Neural Networks for Machine Learning.

    Args:
        lr (float): Learning rate.
        alpha (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.

    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 alpha=_default_hyperparam.alpha,
                 eps=_default_hyperparam.eps):
        super(RMSprop, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.alpha = alpha
        self.hyperparam.eps = eps

    lr = optimizer.HyperparameterProxy('lr')
    alpha = optimizer.HyperparameterProxy('alpha')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return RMSpropRule(self.hyperparam)
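
For reference, a minimal standalone sketch of the update these hyperparameters control, assuming the standard RMSprop form; the helper below is illustrative only and the actual RMSpropRule kernel may differ in detail.

import numpy

def rmsprop_step(ms, grad, lr, alpha, eps):
    # ms: running mean of squared gradients (the "second order moment").
    ms[:] = alpha * ms + (1 - alpha) * grad * grad
    return lr * grad / (numpy.sqrt(ms) + eps)   # amount subtracted from the parameter
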
Example #2
class RMSpropGraves(optimizer.GradientMethod):
    """Alex Graves's RMSprop.

    See: http://arxiv.org/abs/1308.0850

    Args:
        lr (float): Learning rate.
        alpha (float): Exponential decay rate of the first and second order
            moments of the raw gradient.
        momentum (float): Exponential decay rate of the first order moment of
            the adjusted gradient.
        eps (float): Small value for the numerical stability.

    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 alpha=_default_hyperparam.alpha,
                 momentum=_default_hyperparam.momentum,
                 eps=_default_hyperparam.eps,
                 model=None):
        super(RMSpropGraves, self).__init__(model)
        self.hyperparam.lr = lr
        self.hyperparam.alpha = alpha
        self.hyperparam.momentum = momentum
        self.hyperparam.eps = eps

    lr = optimizer.HyperparameterProxy('lr')
    alpha = optimizer.HyperparameterProxy('alpha')
    momentum = optimizer.HyperparameterProxy('momentum')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return RMSpropGravesRule(self.hyperparam)
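
A standalone sketch (illustrative helper, not part of the codebase) of the update described in Section 4 of the Graves (2013) paper referenced above; RMSpropGravesRule may differ in detail.

import numpy

def rmsprop_graves_step(n, g_bar, delta, grad, lr, alpha, momentum, eps):
    # n and g_bar are EMAs of the squared and raw gradient; delta is the
    # momentum-smoothed step that gets added to the parameter.
    n[:] = alpha * n + (1 - alpha) * grad * grad
    g_bar[:] = alpha * g_bar + (1 - alpha) * grad
    delta[:] = (momentum * delta
                - lr * grad / numpy.sqrt(n - g_bar * g_bar + eps))
    return delta
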
Example #3
class AdaDeltaWithLearningRate(optimizer.GradientMethod):

    """Zeiler's ADADELTA.

    See: http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf

    Args:
        lr (float): Learning rate.
        rho (float): Exponential decay rate of the first and second order
            moments.
        eps (float): Small value for the numerical stability.

    """

    def __init__(self, lr=_default_hyperparam.lr,
                 rho=_default_hyperparam.rho, eps=_default_hyperparam.eps):
        super(AdaDeltaWithLearningRate, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.rho = rho
        self.hyperparam.eps = eps

    lr = optimizer.HyperparameterProxy('lr')
    rho = optimizer.HyperparameterProxy('rho')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return AdaDeltaRuleWithLearningRate(self.hyperparam)
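
A standalone sketch of Zeiler's ADADELTA update, with the extra ``lr`` exposed by this variant applied as a final scale factor (an assumption about AdaDeltaRuleWithLearningRate; plain ADADELTA corresponds to ``lr = 1``).

import numpy

def adadelta_step(msg, msdx, grad, lr, rho, eps):
    # msg: EMA of squared gradients, msdx: EMA of squared updates.
    msg[:] = rho * msg + (1 - rho) * grad * grad
    dx = numpy.sqrt((msdx + eps) / (msg + eps)) * grad
    msdx[:] = rho * msdx + (1 - rho) * dx * dx
    return lr * dx   # amount subtracted from the parameter
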
Example #4
class RAdam(optimizer.GradientMethod):

    """Rectified Adam optimizer.

    See: `On the Variance of the Adaptive Learning Rate and Beyond \
          <https://arxiv.org/abs/1908.03265>`_

    Args:
        alpha (float): Coefficient of learning rate.
        beta1 (float): Exponential decay rate of the first order moment.
        beta2 (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.
        eta (float): Schedule multiplier, can be used for warm restarts.
        weight_decay_rate (float): Weight decay rate.

    """

    def __init__(self,
                 alpha=_default_hyperparam.alpha,
                 beta1=_default_hyperparam.beta1,
                 beta2=_default_hyperparam.beta2,
                 eps=_default_hyperparam.eps,
                 weight_decay_rate=_default_hyperparam.weight_decay_rate):
        super(RAdam, self).__init__()
        self.hyperparam.alpha = alpha
        self.hyperparam.beta1 = beta1
        self.hyperparam.beta2 = beta2
        self.hyperparam.eps = eps
        self.hyperparam.weight_decay_rate = weight_decay_rate

    alpha = optimizer.HyperparameterProxy('alpha')
    beta1 = optimizer.HyperparameterProxy('beta1')
    beta2 = optimizer.HyperparameterProxy('beta2')
    eps = optimizer.HyperparameterProxy('eps')
    weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate')

    def create_update_rule(self):
        return RAdamRule(self.hyperparam)

    @property
    def alpha_t(self):
        return _learning_rate(self.hyperparam, self.t)

    @property
    def lr(self):
        warnings.warn(
            'RAdam.lr has been renamed to RAdamRule.alpha_t. '
            'Use of RAdam.lr is deprecated in Chainer v6.',
            DeprecationWarning)
        return self.alpha_t
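
A standalone sketch of the rectified update from the RAdam paper linked above; weight decay and other RAdamRule implementation details are omitted.

import numpy

def radam_step(m, v, t, grad, alpha, beta1, beta2, eps):
    m[:] = beta1 * m + (1 - beta1) * grad
    v[:] = beta2 * v + (1 - beta2) * grad * grad
    m_hat = m / (1 - beta1 ** t)
    rho_inf = 2.0 / (1.0 - beta2) - 1.0
    rho_t = rho_inf - 2.0 * t * beta2 ** t / (1.0 - beta2 ** t)
    if rho_t > 4.0:
        # Variance of the adaptive term is tractable: rectify it.
        v_hat = numpy.sqrt(v / (1 - beta2 ** t))
        r_t = numpy.sqrt(((rho_t - 4) * (rho_t - 2) * rho_inf)
                         / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
        return alpha * r_t * m_hat / (v_hat + eps)
    # Warmup phase: fall back to an un-adapted momentum step.
    return alpha * m_hat   # amount subtracted from the parameter
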
Example #5
class Hamiltonian(optimizer.GradientMethod):
    def __init__(self,
                 epsilon=_default_hyperparam.epsilon,
                 delta=_default_hyperparam.delta):

        super(Hamiltonian, self).__init__()
        self.hyperparam.epsilon = epsilon
        self.hyperparam.delta = delta

    epsilon = optimizer.HyperparameterProxy('epsilon')
    delta = optimizer.HyperparameterProxy('delta')

    def create_update_rule(self):
        return HamiltonianRule(self.hyperparam)
Example #6
class CorrectedMomentumSGD(optimizer.GradientMethod):

    """Momentum SGD optimizer.

    This implements momentum correction discussed in the third section of
    `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour
    <https://arxiv.org/abs/1706.02677>`_.

    :class:`~chainer.optimizers.MomentumSGD` implements the equation (10) of
    the paper. This optimizer implements the equation (9).

    To get better understanding between the two methods,
    we show the equivalence between the equation (9) and modification of
    the equation (10) that takes momentum correction into account.
    First, we set :math:`v_{t} = \\eta_{t} u_t`.
    We substitute this relation to the equation (10).

    .. math::

        v_{t+1} &= m\\frac{\\eta_{t+1}}{\\eta_{t}}v_t + \\eta_{t+1}g_t  \\\\
                &= m\\frac{\\eta_{t+1}}{\\eta_{t}}\\eta_{t}u_t +
                \\eta_{t+1}g_t \\\\
                &= \\eta_{t+1}(m u_t + g_t) \\\\

    From this result, we derive :math:`u_{t+1} = m u_t + g_t`, which is how
    update tensors are calculated by
    :class:`~chainer.optimizers.CorrectedMomentumSGD`. Thus, the equivalence
    is shown.

    Args:
        lr (float): Learning rate.
        momentum (float): Exponential decay rate of the first order moment.

    """

    def __init__(self, lr=_default_hyperparam.lr,
                 momentum=_default_hyperparam.momentum):
        super(CorrectedMomentumSGD, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.momentum = momentum

    lr = optimizer.HyperparameterProxy('lr')
    momentum = optimizer.HyperparameterProxy('momentum')

    def create_update_rule(self):
        return CorrectedMomentumSGDRule(self.hyperparam)
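
A quick numerical check of the equivalence derived in the docstring, independent of the optimizer classes: iterating with the momentum-corrected update of equation (10) gives the same trajectory as iterating with the :math:`u_t` of equation (9).

import numpy

rng = numpy.random.RandomState(0)
etas = [0.1, 0.1, 0.05, 0.05, 0.01]          # decaying learning-rate schedule
grads = rng.randn(len(etas))
m = 0.9

u, x_u = 0.0, 1.0                            # eq. (9): u_{t+1} = m u_t + g_t
v, x_v = 0.0, 1.0                            # eq. (10) with momentum correction
prev_eta = etas[0]
for eta, g in zip(etas, grads):
    u = m * u + g
    x_u -= eta * u
    v = m * (eta / prev_eta) * v + eta * g
    x_v -= v
    prev_eta = eta

assert numpy.isclose(x_u, x_v)               # identical trajectories
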
Example #7
class LBFGS(optimizer.GradientMethod):
    """L-BFGS.
    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 stack_size=_default_hyperparam.stack_size,
                 min_ro=_default_hyperparam.min_ro):
        super(LBFGS, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.stack_size = stack_size
        self.hyperparam.min_ro = min_ro

    lr = optimizer.HyperparameterProxy('lr')
    stack_size = optimizer.HyperparameterProxy('stack_size')
    min_ro = optimizer.HyperparameterProxy('min_ro')

    def create_update_rule(self):
        return LBFGSRule(self.hyperparam)
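
For context, a standalone sketch of the classic L-BFGS two-loop recursion that a ``stack_size``-limited history enables; how ``min_ro`` enters the actual LBFGSRule is an assumption here (pairs with too little curvature are simply skipped).

def lbfgs_direction(grad, s_list, y_list, min_ro):
    # s_list/y_list hold the last ``stack_size`` parameter and gradient
    # differences, oldest first.  Pairs whose curvature y.s falls below
    # ``min_ro`` are skipped (an assumption about how min_ro is used).
    pairs = [(s, y) for s, y in zip(s_list, y_list) if float(y @ s) > min_ro]
    q = grad.copy()
    alphas = []
    for s, y in reversed(pairs):
        ro = 1.0 / float(y @ s)
        a = ro * float(s @ q)
        q -= a * y
        alphas.append(a)
    if pairs:
        s, y = pairs[-1]
        q *= float(s @ y) / float(y @ y)       # initial Hessian scaling
    for (s, y), a in zip(pairs, reversed(alphas)):
        ro = 1.0 / float(y @ s)
        b = ro * float(y @ q)
        q += (a - b) * s
    return q    # approximates H^{-1} grad; the step is -lr times this
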
Example #8
class MSVAG(optimizer.GradientMethod):

    """M-SVAG optimizer.

    See: `Dissecting Adam: The Sign, Magnitude and Variance of Stochastic \
          Gradients <https://arxiv.org/abs/1705.07774>`_

    Modified for decoupled weight decay in the style of AdamW.
    This adds the parameters ``eta`` and ``weight_decay_rate``, which can be
    used to properly scale the learning rate and decouple the weight decay
    rate from ``lr``, as shown in the paper below.

    See: `Fixing Weight Decay Regularization in Adam \
          <https://openreview.net/forum?id=rk6qdGgCZ>`_

    Args:
        lr (float): Learning rate.
        beta (float): Exponential decay rate of the first and second order
                      moments.
        eta (float): Schedule multiplier, can be used for warm restarts.
        weight_decay_rate (float): Weight decay rate.

    """

    def __init__(self,
                 lr=_default_hyperparam.lr,
                 beta=_default_hyperparam.beta,
                 eta=_default_hyperparam.eta,
                 weight_decay_rate=_default_hyperparam.weight_decay_rate):
        super(MSVAG, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.beta = beta
        self.hyperparam.eta = eta
        self.hyperparam.weight_decay_rate = weight_decay_rate

    lr = optimizer.HyperparameterProxy('lr')
    beta = optimizer.HyperparameterProxy('beta')
    eta = optimizer.HyperparameterProxy('eta')
    weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate')

    def create_update_rule(self):
        return MSVAGRule(self.hyperparam)
Example #9
class Yogi(optimizer.GradientMethod):
    """Yogi optimizer.

    See: `Adaptive Methods for Nonconvex Optimization \
          <https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization>`_


    See :class:`~chainer.optimizers.Adam` for weight decay and AMSGrad options.

    Args:
        alpha (float): Coefficient of learning rate.
        beta1 (float): Exponential decay rate of the first order moment.
        beta2 (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.
        eta (float): Schedule multiplier, can be used for warm restarts.
        weight_decay_rate (float): Weight decay rate.
        amsgrad (bool): Whether to use AMSGrad variant of Yogi.

    """
    def __init__(self,
                 alpha=_default_hyperparam.alpha,
                 beta1=_default_hyperparam.beta1,
                 beta2=_default_hyperparam.beta2,
                 eps=_default_hyperparam.eps,
                 eta=_default_hyperparam.eta,
                 weight_decay_rate=_default_hyperparam.weight_decay_rate,
                 amsgrad=_default_hyperparam.amsgrad):
        super(Yogi, self).__init__()
        self.hyperparam.alpha = alpha
        self.hyperparam.beta1 = beta1
        self.hyperparam.beta2 = beta2
        self.hyperparam.eps = eps
        self.hyperparam.eta = eta
        self.hyperparam.weight_decay_rate = weight_decay_rate
        self.hyperparam.amsgrad = amsgrad

    alpha = optimizer.HyperparameterProxy('alpha')
    beta1 = optimizer.HyperparameterProxy('beta1')
    beta2 = optimizer.HyperparameterProxy('beta2')
    eps = optimizer.HyperparameterProxy('eps')
    eta = optimizer.HyperparameterProxy('eta')
    weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate')
    amsgrad = optimizer.HyperparameterProxy('amsgrad')

    def create_update_rule(self):
        return YogiRule(self.hyperparam)

    @property
    def alpha_t(self):
        return _learning_rate(self.hyperparam, self.t)

    @property
    def lr(self):
        warnings.warn(
            'Yogi.lr has been renamed to YogiRule.alpha_t. '
            'Use of Yogi.lr is deprecated in Chainer v6.', DeprecationWarning)
        return self.alpha_t
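
The piece that distinguishes Yogi from Adam is its second-moment update (Algorithm 2 in the paper above); a minimal sketch of just that piece, with the first moment and the final step following Adam.

import numpy

def yogi_second_moment(v, grad, beta2):
    # Unlike Adam's exponential moving average, Yogi moves v towards
    # grad**2 additively, with the direction controlled by a sign term,
    # which keeps the effective learning rate from growing too quickly.
    g2 = grad * grad
    v[:] = v - (1 - beta2) * numpy.sign(v - g2) * g2
    return v
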
Example #10
class MomentumSGD(optimizer.GradientMethod):
    """Momentum SGD optimizer.

    Args:
        lr (float): Learning rate.
        momentum (float): Exponential decay rate of the first order moment.

    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 momentum=_default_hyperparam.momentum):
        super(MomentumSGD, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.momentum = momentum

    lr = optimizer.HyperparameterProxy('lr')
    momentum = optimizer.HyperparameterProxy('momentum')

    def create_update_rule(self):
        return MomentumSGDRule(self.hyperparam)
Example #11
class AdaGrad(optimizer.GradientMethod):
    """AdaGrad optimizer.

    See: http://jmlr.org/papers/v12/duchi11a.html

    Args:
        lr (float): Learning rate.
        eps (float): Small value for the numerical stability.

    """
    def __init__(self, lr=_default_hyperparam.lr, eps=_default_hyperparam.eps):
        super(AdaGrad, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.eps = eps

    lr = optimizer.HyperparameterProxy('lr')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return AdaGradRule(self.hyperparam)
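
A minimal standalone sketch of the AdaGrad update the two hyperparameters control; AdaGradRule may differ in detail.

import numpy

def adagrad_step(h, grad, lr, eps):
    # h accumulates squared gradients, so each coordinate's effective
    # step size shrinks as its gradients accumulate.
    h += grad * grad
    return lr * grad / (numpy.sqrt(h) + eps)   # amount subtracted from the parameter
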
Example #12
class SMORMS3(optimizer.GradientMethod):
    """Simon Funk's SMORMS3.

    See http://sifter.org/~simon/journal/20150420.html.

    Args:
        lr (float): Learning rate.
        eps (float): Small value for the numerical stability.

    """
    def __init__(self, lr=_default_hyperparam.lr, eps=_default_hyperparam.eps):
        super(SMORMS3, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.eps = eps

    lr = optimizer.HyperparameterProxy('lr')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return SMORMS3Rule(self.hyperparam)
Example #13
class RMSpropAsync(optimizer.GradientMethod):
    """RMSprop for asynchronous methods.
    The only difference from chainer.optimizers.RMSprop is that the epsilon
    is outside the square root.
    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 alpha=_default_hyperparam.alpha,
                 eps=_default_hyperparam.eps):
        super(RMSpropAsync, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.alpha = alpha
        self.hyperparam.eps = eps

    lr = optimizer.HyperparameterProxy('lr')
    alpha = optimizer.HyperparameterProxy('alpha')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return RMSpropAsyncRule(self.hyperparam)
Example #14
class NesterovAG(optimizer.GradientMethod):
    """Nesterov's Accelerated Gradient.

    See: http://arxiv.org/abs/1212.0901

    Args:
        lr (float): Learning rate.
        momentum (float): Exponential decay rate of the first order moment.

    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 momentum=_default_hyperparam.momentum):
        super(NesterovAG, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.momentum = momentum

    lr = optimizer.HyperparameterProxy('lr')
    momentum = optimizer.HyperparameterProxy('momentum')

    def create_update_rule(self):
        return NesterovAGRule(self.hyperparam)
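
For reference, the classical "lookahead" form of Nesterov's accelerated gradient as a standalone sketch; NesterovAGRule applies an algebraically equivalent reformulation that only needs the gradient at the stored parameters.

def nesterov_step(param, v, grad_fn, lr, momentum):
    # grad_fn evaluates the gradient at an arbitrary point; classical NAG
    # looks ahead along the momentum direction before taking the step.
    g = grad_fn(param + momentum * v)
    v[:] = momentum * v - lr * g
    param += v
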
Example #15
class Adam(optimizer.GradientMethod):
    """Adam optimizer.

    See: `Adam: A Method for Stochastic Optimization \
          <http://arxiv.org/abs/1412.6980v8>`_

    Modified for proper weight decay (also called AdamW).
    AdamW introduces the additional parameters ``eta``
    and ``weight_decay_rate``, which can be used to properly scale the
    learning rate, and decouple the weight decay rate from ``alpha``,
    as shown in the paper below.

    Note that with the default values ``eta = 1`` and
    ``weight_decay_rate = 0``, this implementation is identical to
    the standard Adam method.

    See: `Fixing Weight Decay Regularization in Adam \
          <https://openreview.net/forum?id=rk6qdGgCZ>`_

    Args:
        alpha (float): Step size.
        beta1 (float): Exponential decay rate of the first order moment.
        beta2 (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.
        eta (float): Schedule multiplier, can be used for warm restarts.
        weight_decay_rate (float): Weight decay rate.

    """
    def __init__(self,
                 alpha=_default_hyperparam.alpha,
                 beta1=_default_hyperparam.beta1,
                 beta2=_default_hyperparam.beta2,
                 eps=_default_hyperparam.eps,
                 eta=_default_hyperparam.eta,
                 weight_decay_rate=_default_hyperparam.weight_decay_rate):
        super(Adam, self).__init__()
        self.hyperparam.alpha = alpha
        self.hyperparam.beta1 = beta1
        self.hyperparam.beta2 = beta2
        self.hyperparam.eps = eps
        self.hyperparam.eta = eta
        self.hyperparam.weight_decay_rate = weight_decay_rate

    alpha = optimizer.HyperparameterProxy('alpha')
    beta1 = optimizer.HyperparameterProxy('beta1')
    beta2 = optimizer.HyperparameterProxy('beta2')
    eps = optimizer.HyperparameterProxy('eps')
    eta = optimizer.HyperparameterProxy('eta')
    weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate')

    def create_update_rule(self):
        return AdamRule(self.hyperparam)

    @property
    def lr(self):
        fix1 = 1. - math.pow(self.hyperparam.beta1, self.t)
        fix2 = 1. - math.pow(self.hyperparam.beta2, self.t)
        return self.hyperparam.alpha * math.sqrt(fix2) / fix1
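
A standalone sketch of the decoupled (AdamW-style) step the docstring describes, reusing the bias-corrected step size computed by the ``lr`` property above; AdamRule itself may differ in detail.

import numpy

def adamw_step(param, m, v, t, grad, alpha, beta1, beta2, eps, eta,
               weight_decay_rate):
    m[:] = beta1 * m + (1 - beta1) * grad
    v[:] = beta2 * v + (1 - beta2) * grad * grad
    # Bias-corrected step size, identical to the ``lr`` property above.
    alpha_t = alpha * numpy.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    # Weight decay is decoupled from the adaptive term and scaled by eta.
    param -= eta * (alpha_t * m / (numpy.sqrt(v) + eps)
                    + weight_decay_rate * param)
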
Example #16
class RMSprop(optimizer.GradientMethod):
    """RMSprop optimizer.

    See: T. Tieleman and G. Hinton (2012). Lecture 6.5 - rmsprop, COURSERA:
    Neural Networks for Machine Learning.

    Args:
        lr (float): Learning rate.
        alpha (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.
        eps_inside_sqrt (bool): When ``True``, gradient will be divided by
            :math:`\\sqrt{ms + eps}` where ``ms`` is the mean square. When
            ``False`` (default), gradient will be divided by
            :math:`\\sqrt{ms} + eps` instead.
            This option may be convenient for users porting code from other
            frameworks;
            see `#4754 <https://github.com/chainer/chainer/issues/4754>`__ for
            details.

    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 alpha=_default_hyperparam.alpha,
                 eps=_default_hyperparam.eps,
                 eps_inside_sqrt=_default_hyperparam.eps_inside_sqrt):
        super(RMSprop, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.alpha = alpha
        self.hyperparam.eps = eps
        self.hyperparam.eps_inside_sqrt = eps_inside_sqrt

    lr = optimizer.HyperparameterProxy('lr')
    alpha = optimizer.HyperparameterProxy('alpha')
    eps = optimizer.HyperparameterProxy('eps')
    eps_inside_sqrt = optimizer.HyperparameterProxy('eps_inside_sqrt')

    def create_update_rule(self):
        return RMSpropRule(self.hyperparam)
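
The two denominators that the ``eps_inside_sqrt`` flag switches between, as a standalone sketch rather than the actual RMSpropRule kernel:

import numpy

def rmsprop_denominator(ms, eps, eps_inside_sqrt):
    # ms is the mean of squared gradients maintained by the update rule.
    if eps_inside_sqrt:
        return numpy.sqrt(ms + eps)      # gradient divided by sqrt(ms + eps)
    return numpy.sqrt(ms) + eps          # gradient divided by sqrt(ms) + eps
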
Example #17
class VaswaniAdam(chainer.optimizers.Adam):
    def __init__(self, factor, warmup, model_size, inverse_square=False, **kwargs):
        super(VaswaniAdam, self).__init__(**kwargs)
        # Vaswani
        self.hyperparam.factor = factor
        self.hyperparam.warmup = warmup
        self.hyperparam.model_size = model_size
        self.inverse_square = inverse_square

    def create_update_rule(self):
        return VaswaniAdamRule(self.hyperparam, inverse_square=self.inverse_square)

    # Vaswani
    factor = optimizer.HyperparameterProxy('factor')
    warmup = optimizer.HyperparameterProxy('warmup')
    model_size = optimizer.HyperparameterProxy('model_size')

    @property
    def lr(self):
        if self.inverse_square:
            return _learning_rate_fairseq(self.hyperparam, self.t)
        else:
            return _learning_rate(self.hyperparam, self.t)
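
The ``factor``, ``warmup`` and ``model_size`` hyperparameters suggest the warmup schedule of Vaswani et al. (2017); a sketch of that schedule, which ``_learning_rate`` presumably implements (the helper name is illustrative only).

def noam_lr(t, factor, warmup, model_size):
    # Linear warmup for ``warmup`` steps, then inverse-square-root decay.
    t = max(t, 1)
    return factor * model_size ** -0.5 * min(t ** -0.5, t * warmup ** -1.5)
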
Example #18
class Adam(optimizer.GradientMethod):
    """Adam optimizer.

    See: http://arxiv.org/abs/1412.6980v8

    Args:
        alpha (float): Step size.
        beta1 (float): Exponential decay rate of the first order moment.
        beta2 (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.

    """
    def __init__(self,
                 alpha=_default_hyperparam.alpha,
                 beta1=_default_hyperparam.beta1,
                 beta2=_default_hyperparam.beta2,
                 eps=_default_hyperparam.eps,
                 model=None):
        super(Adam, self).__init__(model)
        self.hyperparam.alpha = alpha
        self.hyperparam.beta1 = beta1
        self.hyperparam.beta2 = beta2
        self.hyperparam.eps = eps

    alpha = optimizer.HyperparameterProxy('alpha')
    beta1 = optimizer.HyperparameterProxy('beta1')
    beta2 = optimizer.HyperparameterProxy('beta2')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return AdamRule(self.hyperparam)

    @property
    def lr(self):
        fix1 = 1. - math.pow(self.hyperparam.beta1, self.t)
        fix2 = 1. - math.pow(self.hyperparam.beta2, self.t)
        return self.hyperparam.alpha * math.sqrt(fix2) / fix1
Example #19
class SGD(optimizer.GradientMethod):
    """Vanilla Stochastic Gradient Descent.

    Args:
        lr (float): Learning rate.

    """
    def __init__(self, lr=_default_hyperparam.lr):
        super(SGD, self).__init__()
        self.hyperparam.lr = lr

    lr = optimizer.HyperparameterProxy('lr')

    def create_update_rule(self):
        return SGDRule(self.hyperparam)
Example #20
class RMSpropWarmup(optimizer.GradientMethod):
    """RMSprop optimizer.

    See: T. Tieleman and G. Hinton (2012). Lecture 6.5 - rmsprop, COURSERA:
    Neural Networks for Machine Learning.

    Args:
        lr (float): Learning rate.
        alpha_sgd (float): Learning rate.
        mu1 (float): Exponential decay rate of the first order moment.
        alpha_rmsprop (float): Learning rate.
        mu2 (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.

    """
    def __init__(self,
                 lr=_default_hyperparam.lr,
                 alpha_sgd=_default_hyperparam.alpha_sgd,
                 mu1=_default_hyperparam.mu1,
                 alpha_rmsprop=_default_hyperparam.alpha_rmsprop,
                 mu2=_default_hyperparam.mu2,
                 eps=_default_hyperparam.eps,
                 lars=False):
        super(RMSpropWarmup, self).__init__()
        self.hyperparam.lr = lr
        self.hyperparam.alpha_sgd = alpha_sgd
        self.hyperparam.mu1 = mu1
        self.hyperparam.alpha_rmsprop = alpha_rmsprop
        self.hyperparam.mu2 = mu2
        self.hyperparam.eps = eps
        self._lars = lars

    lr = optimizer.HyperparameterProxy('lr')
    alpha_sgd = optimizer.HyperparameterProxy('alpha_sgd')
    mu1 = optimizer.HyperparameterProxy('mu1')
    alpha_rmsprop = optimizer.HyperparameterProxy('alpha_rmsprop')
    mu2 = optimizer.HyperparameterProxy('mu2')
    eps = optimizer.HyperparameterProxy('eps')

    def create_update_rule(self):
        return RMSpropWarmupRule(self.hyperparam, self._lars)
Example #21
class Adam(optimizer.GradientMethod):

    """Adam optimizer.

    See: `Adam: A Method for Stochastic Optimization \
          <https://arxiv.org/abs/1412.6980v8>`_

    Modified for proper weight decay (also called AdamW).
    AdamW introduces the additional parameters ``eta``
    and ``weight_decay_rate``, which can be used to properly scale the
    learning rate, and decouple the weight decay rate from ``alpha``,
    as shown in the paper below.

    Note that with the default values ``eta = 1`` and
    ``weight_decay_rate = 0``, this implementation is identical to
    the standard Adam method.

    See: `Fixing Weight Decay Regularization in Adam \
          <https://openreview.net/forum?id=rk6qdGgCZ>`_

    A flag ``amsgrad`` to use the AMSGrad variant of Adam from
    the paper: `On the Convergence of Adam and Beyond \
               <https://openreview.net/forum?id=ryQu7f-RZ>`_

    A flag ``adastand`` to use the Adastand variant of Adam from
    the paper: `Adaptive Learning Rate via Covariance Matrix Based Preconditioning for Deep Neural Networks \
               <https://www.ijcai.org/proceedings/2017/0267.pdf>`_

    Args:
        alpha (float): Step size.
        beta1 (float): Exponential decay rate of the first order moment.
        beta2 (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.
        eta (float): Schedule multiplier, can be used for warm restarts.
        weight_decay_rate (float): Weight decay rate.
        amsgrad (bool): Whether to use AMSGrad variant of Adam.
        adastand (bool): Whether to use Adastand variant of Adam.

    """

    def __init__(self,
                 alpha=_default_hyperparam.alpha,
                 beta1=_default_hyperparam.beta1,
                 beta2=_default_hyperparam.beta2,
                 eps=_default_hyperparam.eps,
                 eta=_default_hyperparam.eta,
                 weight_decay_rate=_default_hyperparam.weight_decay_rate,
                 amsgrad=_default_hyperparam.amsgrad,
                 adastand=_default_hyperparam.adastand):
        super(Adam, self).__init__()
        self.hyperparam.alpha = alpha
        self.hyperparam.beta1 = beta1
        self.hyperparam.beta2 = beta2
        self.hyperparam.eps = eps
        self.hyperparam.eta = eta
        self.hyperparam.weight_decay_rate = weight_decay_rate
        self.hyperparam.amsgrad = amsgrad
        self.hyperparam.adastand = adastand

    alpha = optimizer.HyperparameterProxy('alpha')
    beta1 = optimizer.HyperparameterProxy('beta1')
    beta2 = optimizer.HyperparameterProxy('beta2')
    eps = optimizer.HyperparameterProxy('eps')
    eta = optimizer.HyperparameterProxy('eta')
    weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate')
    amsgrad = optimizer.HyperparameterProxy('amsgrad')
    adastand = optimizer.HyperparameterProxy('adastand')

    def create_update_rule(self):
        return AdamRule(self.hyperparam)

    @property
    def lr(self):
        return _learning_rate(self.hyperparam, self.t)
Example #22
class Eve(optimizer.GradientMethod):
    """Eve optimizer.

    See: https://arxiv.org/abs/1611.01505v3

    Args:
        alpha (float): Coefficient of learning rate.
        beta1 (float): Exponential decay rate of the first order moment.
        beta2 (float): Exponential decay rate of the second order moment.
        beta3 (float): Exponential decay rate of the objective-dependent
            coefficient of learning rate.
        c (float): Constant used to clip the objective-dependent coefficient.
        eps (float): Small value for the numerical stability.
        eta (float): Schedule multiplier, can be used for warm restarts.
        f_star (float): Minimum value that the loss function can take.
        weight_decay_rate (float): Weight decay rate.
        amsgrad (bool): Whether to use AMSGrad variant of Eve.

    """
    def __init__(self,
                 alpha=_default_hyperparam.alpha,
                 beta1=_default_hyperparam.beta1,
                 beta2=_default_hyperparam.beta2,
                 beta3=_default_hyperparam.beta3,
                 c=_default_hyperparam.c,
                 eps=_default_hyperparam.eps,
                 eta=_default_hyperparam.eta,
                 f_star=_default_hyperparam.f_star,
                 weight_decay_rate=_default_hyperparam.weight_decay_rate,
                 amsgrad=_default_hyperparam.amsgrad):
        super(Eve, self).__init__()
        self.hyperparam.alpha = alpha
        self.hyperparam.beta1 = beta1
        self.hyperparam.beta2 = beta2
        self.hyperparam.beta3 = beta3
        self.hyperparam.c = c
        self.hyperparam.eps = eps
        self.hyperparam.eta = eta
        self.hyperparam.f_star = f_star
        self.hyperparam.weight_decay_rate = weight_decay_rate
        self.hyperparam.amsgrad = amsgrad

    alpha = optimizer.HyperparameterProxy('alpha')
    beta1 = optimizer.HyperparameterProxy('beta1')
    beta2 = optimizer.HyperparameterProxy('beta2')
    beta3 = optimizer.HyperparameterProxy('beta3')
    c = optimizer.HyperparameterProxy('c')
    eps = optimizer.HyperparameterProxy('eps')
    eta = optimizer.HyperparameterProxy('eta')
    f_star = optimizer.HyperparameterProxy('f_star')
    weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate')
    amsgrad = optimizer.HyperparameterProxy('amsgrad')

    def setup(self, link):
        """Sets a target link and initializes the optimizer states.

        Given link is set to the :attr:`target` attribute. It also prepares the
        optimizer state dictionaries corresponding to all parameters in the
        link hierarchy. The existing states are discarded.

        Args:
            link (~chainer.Link): Target link object.

        Returns:
            The optimizer instance.

        .. note::
           As of v4.0.0, this function returns the optimizer instance itself
           so that you can instantiate and setup the optimizer in one line,
           e.g., ``optimizer = SomeOptimizer().setup(link)``.

        """
        super(Eve, self).setup(link)
        self.d_tilde = numpy.nan
        self.f = numpy.nan
        return self

    def create_update_rule(self):
        return EveRule(self.hyperparam)

    @property
    def lr(self):
        return _learning_rate(self.hyperparam, self.t, self.d_tilde)

    def update(self, lossfun, *args, **kwds):
        """Updates parameters based on a loss function or computed gradients.

        Because Eve uses loss values, ``lossfun`` is required, unlike with
        other optimizers.

        Args:
            lossfun (callable): Callable that returns a ~chainer.Variable to be
                minimized.
            *args, **kwds: Arguments passed to `lossfun`.

        """
        assert lossfun is not None, 'Eve requires lossfun to be specified'
        use_cleargrads = getattr(self, '_use_cleargrads', True)
        loss = lossfun(*args, **kwds)
        if use_cleargrads:
            self.target.cleargrads()
        else:
            self.target.zerograds()
        loss.backward(loss_scale=self._loss_scale)
        loss_value = float(loss.array)
        del loss

        self.reallocate_cleared_grads()

        self.call_hooks('pre')

        self.t += 1
        self._update_d_tilde_and_f(loss_value)
        for param in self.target.params():
            param.update_rule.d_tilde = self.d_tilde
            param.update()

        self.reallocate_cleared_grads()

        self.call_hooks('post')

    def serialize(self, serializer):
        """Serializes or deserializes the optimizer.

        It only saves or loads the following things:

        - Optimizer states
        - Global states (:attr:`t`, :attr:`epoch`, :attr:`d_tilde`, and
            :attr:`f`)

        **It does not save or load the parameters of the target link.** They
        should be separately saved or loaded.

        Args:
            serializer (~chainer.AbstractSerializer): Serializer or
                deserializer object.
        """
        super(Eve, self).serialize(serializer)
        self.d_tilde = serializer('d_tilde', self.d_tilde)
        self.f = serializer('f', self.f)

    def _update_d_tilde_and_f(self, loss):
        if self.t > 1:
            d = abs(loss - self.f) / (min(loss, self.f) - self.f_star)
            d_hat = numpy.clip(d, 1 / self.c, self.c)
            self.d_tilde = self.beta3 * self.d_tilde + (1 - self.beta3) * d_hat
        else:
            self.d_tilde = 1
        self.f = loss
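
A hypothetical usage sketch with a toy model and random data, illustrating the point made in ``update()``: Eve must be given the loss function on every call so that it can track loss values.

import numpy
import chainer.links as L

model = L.Classifier(L.Linear(10, 2))            # toy classifier
opt = Eve()
opt.setup(model)
x = numpy.random.randn(8, 10).astype(numpy.float32)
t = numpy.random.randint(0, 2, size=8).astype(numpy.int32)
opt.update(model, x, t)                          # lossfun=model, args forwarded to it
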
Example #23
class KFAC(chainer.optimizer.GradientMethod):
    """K-FAC optimizer.

    See: `Optimizing Neural Networks with \
          Kronecker-factored Approximate Curvature \
          <https://arxiv.org/abs/1503.05671>`_

    Args:
        lr (float): Learning rate.
        momentum (float): Exponential decay rate of the first order moment.
        cov_ema_decay (float): Decay factor used when calculating the
                               covariance estimate Exponential Moving Average.
        inv_freq (int): Frequency to calculate the inverse of covariance
                        estimate EMA for each layer.
        inv_alg (str): Algorithm used when calculating the inverse.
        damping (float): Damping factor used to stabilize training
                         due to errors in the local approximation with the
                         Fisher information matrix.

    Attributes:
        fisher_blocks (list): Keep data to compute Fisher block.

    """
    def __init__(
        self,
        communicator=None,
        lr=_default_hyperparam.lr,
        momentum=_default_hyperparam.momentum,
        cov_ema_decay=_default_hyperparam.cov_ema_decay,
        inv_freq=_default_hyperparam.inv_freq,
        inv_alg=None,
        damping=_default_hyperparam.damping,
    ):
        super(KFAC, self).__init__()
        self.communicator = communicator
        self.hyperparam.lr = lr
        self.hyperparam.momentum = momentum
        self.hyperparam.cov_ema_decay = cov_ema_decay
        self.hyperparam.inv_freq = inv_freq
        self.hyperparam.damping = damping

        self.fisher_blocks = []
        self.inv_alg = inv_alg

    lr = optimizer.HyperparameterProxy('lr')
    momentum = optimizer.HyperparameterProxy('momentum')
    cov_ema_decay = optimizer.HyperparameterProxy('cov_ema_decay')
    inv_freq = optimizer.HyperparameterProxy('inv_freq')
    damping = optimizer.HyperparameterProxy('damping')

    def setup(self, link):
        super(KFAC, self).setup(link)
        for linkname, sub_link in link.namedlinks():
            if isinstance(sub_link, _linear_link):
                fb = fisher_block.FisherBlockLinear(sub_link, linkname)
            elif isinstance(sub_link, _convolution_2d_link):
                fb = fisher_block.FisherBlockConv2D(sub_link, linkname)
            elif isinstance(sub_link, _batch_norm_link):
                fb = fisher_block.FisherBlockBatchNorm(sub_link, linkname)
            else:
                continue
            self.fisher_blocks.append(fb)
        return self

    def create_update_rule(self):
        return KFACUpdateRule(self.hyperparam)

    def update(self, lossfun=None, *args, **kwds):
        if lossfun is not None:
            use_cleargrads = getattr(self, '_use_cleargrads', True)
            loss = lossfun(*args, **kwds)
            if use_cleargrads:
                self.target.cleargrads()
            else:
                self.target.zerograds()

            # We removed ``loss.backward()`` from here.
            # Do backprop, and obtain ``grads`` which contains the dependency
            # graph inside.
            backward_main = getattr(loss, '_backward_main')
            self.kfac_backward(self.target, backward_main)

            del loss  # No more backward computation, free memory

            # Update param.kfgrad for each layer
            self.kfgrad_update()

        self.reallocate_cleared_grads()
        self.call_hooks('pre')

        self.t += 1
        for param in self.target.params():
            param.update()

        self.reallocate_cleared_grads()
        self.call_hooks('post')

        self.cov_ema_update()

    def kfac_backward(self, link, backward_main):
        """Backward function for KFAC optimizer.
        This function is invoked from ``KFAC.update()`` to:
            1. calculate backprop
            2. obtain the following data for each layer (`~chainer.link.Link`)
                - acts (inputs = activations after previous layer)
                - grads (gradients of outputs)
                - rank (`~chainer.FunctionNode.rank`)
                - conv2d_args (arguments of `~chainer.links.connection.\
                                           convolution_2d.Convolution2D`)
        """
        with chainer.using_config('enable_backprop', False):
            # To obtain grads, we need to edit a file ``variable.py``
            grads = backward_main(retain_grad=True, loss_scale=None)

        namedparams = list(link.namedparams())

        def get_linkname(param):
            # Get a linkname from a parameter.
            for _name, _param in namedparams:
                if param is _param:
                    # Only return linkname NOT paramname.
                    return _name[:_name.rfind('/')]
            return None

        def get_fisher_block(linkname):
            for fb in self.fisher_blocks:
                if fb.linkname == linkname:
                    return fb
            return None

        for node, out_grads_var in grads.items():
            creator_node = node.creator_node  # parent function node
            if creator_node is not None:  # ignore leaf node
                if not any(
                    [isinstance(creator_node, t) for t in _target_functions]):
                    continue
                (in_acts_var, param) = creator_node.get_retained_inputs()
                linkname = get_linkname(param)
                fb = get_fisher_block(linkname)
                fb.load_data(in_acts_var.data, out_grads_var.data)
                fb.load_conv2d_args(creator_node, param)

    def kfgrad_update(self):
        """Update param.kfgrad which used for K-FAC updates for each laeyer.
        """
        for fb in self.fisher_blocks:
            fb.update_kfgrads()

    def cov_ema_update(self):
        """Update EMA of covariance for each laeyer.
        """
        for fb in self.fisher_blocks:
            fb.update_cov_emas(alpha=self.hyperparam.cov_ema_decay)

    def inv_update(self):
        """Update inverse of EMA of covariance for each laeyer.
        """
        comm = self.communicator
        if comm is not None:
            # The original partitioning logic is missing here; as a simple
            # reconstruction, split the Fisher blocks round-robin across
            # workers.
            indices = [list(range(len(self.fisher_blocks)))[r::comm.size]
                       for r in range(comm.size)]
            local_indices = indices[comm.rank]
            fisher_blocks = [self.fisher_blocks[i] for i in local_indices]
        else:
            fisher_blocks = self.fisher_blocks
        for fb in fisher_blocks:
            fb.update_invs(damping=self.hyperparam.damping)
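
One possible training loop for this optimizer, sketched with a toy model and random data; how ``inv_update()`` is actually driven (e.g. via a trainer extension) and the patched ``variable.py`` mentioned in ``kfac_backward()`` are left to the surrounding project.

import numpy
import chainer.links as L

model = L.Classifier(L.Linear(10, 2))            # toy classifier
opt = KFAC(lr=0.01)
opt.setup(model)
x = numpy.random.randn(8, 10).astype(numpy.float32)
t = numpy.random.randint(0, 2, size=8).astype(numpy.int32)
for i in range(10):
    opt.update(model, x, t)                      # backprop + kfgrad + cov EMA update
    if (i + 1) % opt.hyperparam.inv_freq == 0:
        opt.inv_update()                         # periodically refresh the inverses
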
Example #24
class Adam(optimizer.GradientMethod):
    """Adam optimizer.

    See: `Adam: A Method for Stochastic Optimization
    <https://arxiv.org/abs/1412.6980v8>`_

    Modified for proper weight decay (also called
    :class:`~chainer.optimizers.AdamW`).
    AdamW introduces the additional parameters ``eta``
    and ``weight_decay_rate``, which can be used to properly scale the
    learning rate, and decouple the weight decay rate from ``alpha``,
    as shown in the paper below.

    Note that with the default values ``eta = 1`` and
    ``weight_decay_rate = 0``, this implementation is identical to
    the standard Adam method.

    See: `Fixing Weight Decay Regularization in Adam
    <https://openreview.net/forum?id=rk6qdGgCZ>`_

    A flag ``amsgrad`` to use the :class:`~chainer.optimizers.AMSGrad`
    variant of Adam from the paper:
    `On the Convergence of Adam and Beyond
    <https://openreview.net/forum?id=ryQu7f-RZ>`_

    A flag ``adabound`` to use the :class:`~chainer.optimizers.AdaBound`
    variant of Adam from the paper:
    `Adaptive Gradient Methods with Dynamic Bound of Learning Rate
    <https://openreview.net/forum?id=Bkg3g2R9FX>`_

    If both ``amsgrad`` and ``adabound`` are ``True``, the optimizer is
    equivalent to :class:`~chainer.optimizers.AMSBound` proposed in the
    AdaBound paper.

    Args:
        alpha (float): Coefficient of learning rate.
        beta1 (float): Exponential decay rate of the first order moment.
        beta2 (float): Exponential decay rate of the second order moment.
        eps (float): Small value for the numerical stability.
        eta (float): Schedule multiplier, can be used for warm restarts.
        weight_decay_rate (float): Weight decay rate.
        amsgrad (bool): Whether to use AMSGrad variant of Adam.
        adabound (bool): Whether to use the AdaBound variant of Adam.
        final_lr (float): Final (SGD) learning rate in AdaBound.
        gamma (float): Convergence speed of the bound functions in AdaBound.

    """
    def __init__(self,
                 alpha=_default_hyperparam.alpha,
                 beta1=_default_hyperparam.beta1,
                 beta2=_default_hyperparam.beta2,
                 eps=_default_hyperparam.eps,
                 eta=_default_hyperparam.eta,
                 weight_decay_rate=_default_hyperparam.weight_decay_rate,
                 amsgrad=_default_hyperparam.amsgrad,
                 adabound=_default_hyperparam.adabound,
                 final_lr=_default_hyperparam.final_lr,
                 gamma=_default_hyperparam.gamma):
        super(Adam, self).__init__()
        self.hyperparam.alpha = alpha
        self.hyperparam.beta1 = beta1
        self.hyperparam.beta2 = beta2
        self.hyperparam.eps = eps
        self.hyperparam.eta = eta
        self.hyperparam.weight_decay_rate = weight_decay_rate
        self.hyperparam.amsgrad = amsgrad
        self.hyperparam.adabound = adabound
        self.hyperparam.final_lr = final_lr
        self.hyperparam.gamma = gamma

    alpha = optimizer.HyperparameterProxy('alpha')
    beta1 = optimizer.HyperparameterProxy('beta1')
    beta2 = optimizer.HyperparameterProxy('beta2')
    eps = optimizer.HyperparameterProxy('eps')
    eta = optimizer.HyperparameterProxy('eta')
    weight_decay_rate = optimizer.HyperparameterProxy('weight_decay_rate')
    amsgrad = optimizer.HyperparameterProxy('amsgrad')
    adabound = optimizer.HyperparameterProxy('adabound')
    final_lr = optimizer.HyperparameterProxy('final_lr')
    gamma = optimizer.HyperparameterProxy('gamma')

    def create_update_rule(self):
        return AdamRule(self.hyperparam)

    @property
    def alpha_t(self):
        return _learning_rate(self.hyperparam, self.t)

    @property
    def lr(self):
        warnings.warn(
            'Adam.lr has been renamed to AdamRule.alpha_t. '
            'Use of Adam.lr is deprecated in Chainer v6.', DeprecationWarning)
        return self.alpha_t
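
A standalone sketch of the AdaBound clipping that ``final_lr`` and ``gamma`` control, following the bound functions in the AdaBound paper; the exact scaling inside AdamRule may differ.

import numpy

def adabound_step(m_hat, v_hat, t, alpha, final_lr, gamma, eps):
    # The bound functions start wide and converge to final_lr as t grows,
    # so the behaviour moves from Adam-like towards SGD-like.
    lower = final_lr * (1.0 - 1.0 / (gamma * t + 1.0))
    upper = final_lr * (1.0 + 1.0 / (gamma * t))
    step_size = numpy.clip(alpha / (numpy.sqrt(v_hat) + eps), lower, upper)
    return step_size * m_hat   # amount subtracted from the parameter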