def __init__(self, params,
             lr: float = 1e-3, betas: Tuple[float, float] = (0.9, 0.999), eps: float = 1e-16,
             weight_decay: WeightDecay = WeightDecay(),
             optimized_update: bool = True,
             defaults: Optional[Dict[str, Any]] = None):
    """
    ### Initialize the optimizer

    * `params` is the list of parameters
    * `lr` is the learning rate $\alpha$
    * `betas` is a tuple of ($\beta_1$, $\beta_2$)
    * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
    * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
    * `optimized_update` is a flag whether to optimize the bias correction of the
      second moment by doing it after adding $\epsilon$
    * `defaults` is a dictionary of defaults for group values.
      This is useful when you want to extend the class `Adam`.
    """
    defaults = {} if defaults is None else defaults
    defaults.update(weight_decay.defaults())
    super().__init__(params, defaults, lr, betas, eps)

    self.weight_decay = weight_decay
    self.optimized_update = optimized_update
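# A minimal usage sketch (illustrative, not part of the original source). It
# follows the standard PyTorch training-loop pattern; `nn.Linear` is a
# stand-in for a real model:
#
#     import torch
#     import torch.nn as nn
#
#     model = nn.Linear(16, 4)
#     optimizer = Adam(model.parameters(), lr=2.5e-4,
#                      weight_decay=WeightDecay(weight_decay=0.01))
#     loss = model(torch.randn(8, 16)).sum()
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()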
def __init__(self, params,
             lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
             weight_decay: WeightDecay = WeightDecay(),
             optimized_update: bool = True,
             amsgrad=False, warmup=0, defaults=None):
    """
    ### Initialize the optimizer

    * `params` is the list of parameters
    * `lr` is the learning rate $\alpha$
    * `betas` is a tuple of ($\beta_1$, $\beta_2$)
    * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
    * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
    * `optimized_update` is a flag whether to optimize the bias correction of the
      second moment by doing it after adding $\epsilon$
    * `amsgrad` is a flag indicating whether to use AMSGrad or fall back to plain Adam
    * `warmup` is the number of warmup steps
    * `defaults` is a dictionary of defaults for group values.
      This is useful when you want to extend the class `AdamWarmup`.
    """
    defaults = {} if defaults is None else defaults
    defaults.update(dict(warmup=warmup))
    super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
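# A hedged sketch of how `warmup` is typically used (assuming, per this class,
# that the effective learning rate ramps up linearly to `lr` over the first
# `warmup` steps; `model` is the stand-in defined above):
#
#     # Ramp the learning rate up over the first 2,000 optimizer steps
#     optimizer = AdamWarmup(model.parameters(), lr=1e-3,
#                            betas=(0.9, 0.999), warmup=2_000)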
def __init__(self, params,
             lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
             weight_decay: WeightDecay = WeightDecay(),
             amsgrad=False,
             degenerate_to_sgd=True,
             rectify=True, defaults=None):
    """
    ### Initialize the optimizer

    * `params` is the list of parameters
    * `lr` is the learning rate $\alpha$
    * `betas` is a tuple of ($\beta_1$, $\beta_2$)
    * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
    * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
    * `optimized_update` is a flag whether to optimize the bias correction of the
      second moment by doing it after adding $\epsilon$
    * `amsgrad` is a flag indicating whether to use AMSGrad or fall back to plain Adam
    * `degenerate_to_sgd` is whether to use SGD when the rectification term $r_t$ is intractable
    * `rectify` is whether to use the RAdam update
    * `defaults` is a dictionary of defaults for group values.
      This is useful when you want to extend the class `AdaBelief`.
    """
    defaults = {} if defaults is None else defaults
    super().__init__(params, lr, betas, eps, weight_decay, amsgrad, degenerate_to_sgd, defaults)
    self.rectify = rectify
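# An illustrative sketch of the two main configurations (reusing the stand-in
# `model` from above): with `rectify=True` the RAdam-style rectified update is
# used, and with `rectify=False` it reduces to the Adam-style update on the
# belief variance:
#
#     optimizer = AdaBelief(model.parameters(), lr=1e-3, rectify=True)
#     optimizer_plain = AdaBelief(model.parameters(), lr=1e-3, rectify=False)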
def _weight_decay(c: OptimizerConfigs):
    # Build a `WeightDecay` instance from the optimizer configurations
    return WeightDecay(c.weight_decay, c.weight_decouple, c.weight_decay_absolute)
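# A sketch of how this helper might be consumed (an assumption: that
# `OptimizerConfigs` exposes the three attributes read above and that its
# fields can be set directly, as with other config classes in this codebase):
#
#     conf = OptimizerConfigs()
#     conf.weight_decay = 0.01
#     weight_decay = _weight_decay(conf)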