def AdaHessian(
    params,
    lr=0.1,
    hessian_power=1,
    hutchinson_trace=None,
    mom=0.9,
    sqr_mom=0.98,
    eps=1e-4,
    wd=0.0,
    decouple_wd=True,
):
    "An `Optimizer` for AdaHessian with `lr`, `mom`, `sqr_mom`, `hessian_power`, `eps` and `params`"
    # Decoupled weight decay (AdamW-style) vs. classic L2 regularization,
    # consistent with the other optimizer constructors in this file.
    cbs = [weight_decay] if decouple_wd else [l2_reg]
    # Callback chain: dampened gradient average, squared diagonal-Hessian
    # average, step counter, then the AdaHessian parameter update.
    cbs += [
        partial(average_grad, dampening=True),
        average_sqr_diag_hessian,
        step_stat,
        adahessian_step,
    ]
    # NOTE(review): `hutchinson_trace` is accepted but never forwarded to
    # `Optimizer` or any callback; kept for backward compatibility — confirm
    # whether callers expect it to seed the Hessian-diagonal estimate.
    return Optimizer(
        params,
        cbs,
        lr=lr,
        mom=mom,
        sqr_mom=sqr_mom,
        hessian_power=hessian_power,
        eps=eps,
        wd=wd,
    )
def RAdamAdabelief(
    params, lr, mom=0.9, sqr_mom=0.99, eps=1e-5, wd=0.0, beta=0.0, decouple_wd=True
):
    "An `Optimizer` for RAdam with AdaBelief with `lr`, `mom`, `sqr_mom`, `eps`, `beta` and `params`"
    # Decoupled weight decay (AdamW-style) vs. classic L2 regularization,
    # consistent with the other optimizer constructors in this file.
    cbs = [weight_decay] if decouple_wd else [l2_reg]
    # Callback chain: dampened gradient average, AdaBelief squared-gradient
    # statistic (`asqg`), step counter, then the rectified-Adam/AdaBelief update.
    cbs += [
        partial(average_grad, dampening=True),
        asqg,
        step_stat,
        radam_adabelief_step,
    ]
    # `beta` is forwarded as a hyperparameter for the step callback to consume.
    return Optimizer(
        params, cbs, lr=lr, mom=mom, sqr_mom=sqr_mom, eps=eps, wd=wd, beta=beta
    )
""" --- title: Optimizers summary: > A set of PyTorch implementations/tutorials of popular gradient descent based optimizers. Currently includes Adam, AMSGrad and RAdam optimizers. --- # Optimizers ## Optimizer Implementations * [Adam Optimizer](adam.html) * [AMSGrad Optimizer](amsgrad.html) * [Adam Optimizer with warmup](adam_warmup.html) * [Noam Optimizer](noam.html) * [Rectified Adam Optimizer](radam.html) * [AdaBelief Optimizer](ada_belief.html) This [MNIST example](mnist_experiment.html) uses these optimizers. ## Generic Adaptive Optimizer Base class and Weight Decay This file defines a common base class for *Adam* and extensions of it. The base class helps use implement other optimizers with minimal code because of re-usability. We also define a special class for L2 weight decay, so that we don't have to implement it inside each of the optimizers, and can easily extend to other weight decays like L1 without changing the optimizers.