def adagrad(
    learning_rate: ScalarOrSchedule,
    initial_accumulator_value: float = 0.1,
    eps: float = 1e-7
) -> base.GradientTransformation:
  """The Adagrad optimizer.

  Adagrad is an algorithm for gradient based optimisation that anneals the
  learning rate for each parameter during the course of training.

  WARNING: Adagrad's main limit is the monotonic accumulation of squared
  gradients in the denominator: since all terms are > 0, the sum keeps growing
  during training, and the learning rate eventually becomes vanishingly small.

  References:
    [Duchi et al, 2011](https://jmlr.org/papers/v12/duchi11a.html)

  Args:
    learning_rate: this is a fixed global scaling factor.
    initial_accumulator_value: initialisation for the accumulator.
    eps: a small constant applied to the denominator inside of the square root
      (as in RMSProp) to avoid dividing by zero when rescaling.

  Returns:
    the corresponding `GradientTransformation`.
  """
  return combine.chain(
      transform.scale_by_rss(
          initial_accumulator_value=initial_accumulator_value, eps=eps),
      _scale_by_learning_rate(learning_rate),
  )
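# The following is a minimal, self-contained sketch (not the library
# implementation) of the update rule that the chain above expresses:
# scale_by_rss accumulates squared gradients and rescales by their root,
# and the result is then scaled by the learning rate. It also illustrates
# the WARNING in the docstring: the accumulator is a sum of non-negative
# terms, so the effective per-parameter step can only shrink over training.
# The name `adagrad_step` is illustrative and not part of the API.
import jax.numpy as jnp

def adagrad_step(params, grads, accumulator, learning_rate=0.1, eps=1e-7):
  # The accumulator only grows: each squared gradient is non-negative.
  accumulator = accumulator + grads ** 2
  # Effective per-parameter step: learning_rate / sqrt(accumulator + eps),
  # which decays towards zero as the accumulator keeps growing.
  new_params = params - learning_rate * grads / jnp.sqrt(accumulator + eps)
  return new_params, accumulator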
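# A hedged usage sketch, assuming this alias is exposed as `optax.adagrad`
# and follows the standard init/update/apply_updates cycle; the toy loss
# and parameter shapes below are illustrative only.
import jax
import jax.numpy as jnp
import optax

params = {'w': jnp.ones((3,))}
optimizer = optax.adagrad(learning_rate=0.1)
opt_state = optimizer.init(params)

def loss_fn(p):
  return jnp.sum(p['w'] ** 2)

grads = jax.grad(loss_fn)(params)
updates, opt_state = optimizer.update(grads, opt_state)
params = optax.apply_updates(params, updates)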