def apply_gradients(self, grads_and_vars):
    self._iterations += 1
    grads, var_list = list(zip(*grads_and_vars))
    new_grads = []

    if self._summaries:
        summary.scalar("optimizer/scale", self._scale,
                       utils.get_global_step())

    for grad in grads:
        if grad is None:
            new_grads.append(None)
            continue

        norm = grad.data.norm()

        if not torch.isfinite(norm):
            # Overflow: skip this update entirely and adjust the scale.
            self._update_if_not_finite_grads()
            return
        else:
            # Rescale gradients back to their true magnitude
            new_grads.append(grad.data.float().mul_(1.0 / self._scale))

    self._update_if_finite_grads()
    self._optimizer.apply_gradients(zip(new_grads, var_list))
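# Hedged sketch of the two update helpers called above, following the usual
# dynamic loss-scaling recipe (grow the scale after a run of overflow-free
# steps, shrink it on overflow). The attribute names _num_good_steps,
# _increment_period, _increase_factor, and _decrease_factor are assumptions
# for illustration, not necessarily this class's actual fields.
def _update_if_finite_grads(self):
    # Count overflow-free steps; enlarge the scale periodically.
    self._num_good_steps += 1
    if self._num_good_steps >= self._increment_period:
        self._scale *= self._increase_factor  # e.g. 2.0
        self._num_good_steps = 0

def _update_if_not_finite_grads(self):
    # Overflow detected: shrink the scale and restart the counter.
    self._scale = max(self._scale * self._decrease_factor, 1.0)  # e.g. 0.5
    self._num_good_steps = 0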
def _save_summary(grads_and_vars):
    total_norm = 0.0

    for grad, var in grads_and_vars:
        if grad is None:
            continue

        # Each entry pairs a gradient with a (name, variable) tuple.
        _, var = var
        grad_norm = grad.data.norm()
        total_norm += grad_norm ** 2
        summary.histogram(var.tensor_name, var,
                          utils.get_global_step())
        summary.scalar("norm/" + var.tensor_name, var.norm(),
                       utils.get_global_step())
        summary.scalar("grad_norm/" + var.tensor_name, grad_norm,
                       utils.get_global_step())

    total_norm = total_norm ** 0.5
    summary.scalar("grad_norm", total_norm, utils.get_global_step())

    return float(total_norm)
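# Hedged usage sketch: the `_, var = var` unpacking above implies each entry
# of grads_and_vars is (grad, (something, variable)), and that each variable
# exposes a `tensor_name` attribute. The pairing below illustrates that
# assumed structure; it is not this repo's actual API.
#
#     pairs = [(p.grad, (name, p)) for name, p in model.named_parameters()]
#     total_norm = _save_summary(pairs)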
def __call__(self, step):
    if step <= self._warmup_steps:
        lr_step = self._maximum_learning_rate - self._initial_learning_rate
        lr_step /= self._warmup_steps
        lr = self._initial_learning_rate + lr_step * step
    else:
        lr = self._maximum_learning_rate

        if self._warmup_steps != 0:
            # approximately hidden_size ** -0.5
            lr = lr * self._warmup_steps ** 0.5

        lr = lr * (step ** -0.5)

    if self._summary:
        summary.scalar("learning_rate", lr, utils.get_global_step())

    return lr
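# Worked example of the schedule above as a standalone function (the
# constants are illustrative, not defaults taken from this repo). With
# warmup_steps=4000 and max_lr=7e-4: step 2000 gives 3.5e-4 (halfway up the
# linear ramp), step 4000 gives the 7e-4 peak, and step 16000 gives
# 7e-4 * (4000 / 16000) ** 0.5 = 3.5e-4 on the inverse-sqrt decay.
def rsqrt_schedule(step, warmup_steps=4000, init_lr=0.0, max_lr=7e-4):
    if step <= warmup_steps:
        return init_lr + (max_lr - init_lr) / warmup_steps * step
    return max_lr * warmup_steps ** 0.5 * step ** -0.5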
def __call__(self, step):
    # See reference: The Best of Both Worlds: Combining Recent Advances
    # in Neural Machine Translation
    n = self._n
    p = self._warmup_steps / n
    s = n * self._start_decay_step
    e = n * self._end_decay_step

    learning_rate = self._learning_rate
    # Three regimes: linear warmup, constant plateau, exponential decay
    learning_rate *= min(
        1.0 + (n - 1) * step / float(n * p),
        n,
        n * ((2 * n) ** (float(s - n * step) / float(e - s))))

    if self._summary:
        summary.scalar("learning_rate", learning_rate,
                       utils.get_global_step())

    return learning_rate
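# Standalone restatement of the multiplier inside min(...) above, making the
# three regimes explicit. The regime reading is my interpretation of the
# formula, with n as the number of concurrent replicas as in the RNMT+
# ("Best of Both Worlds") paper: a linear ramp from 1 to n over the warmup,
# a plateau at n, then an exponential decay from n at start_decay_step down
# to 1/2 at end_decay_step.
def bob_multiplier(step, n, p, s, e):
    warmup = 1.0 + (n - 1) * step / float(n * p)  # 1 -> n over warmup
    plateau = float(n)                            # hold at n
    decay = n * ((2 * n) ** (float(s - n * step) / float(e - s)))
    return min(warmup, plateau, decay)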
def __call__(self, step):
    boundaries = self._boundaries
    values = self._values
    learning_rate = values[0]

    if step <= boundaries[0]:
        learning_rate = values[0]
    elif step > boundaries[-1]:
        learning_rate = values[-1]
    else:
        # Find the interval (low, high] that contains the current step
        for low, high, v in zip(boundaries[:-1], boundaries[1:],
                                values[1:-1]):
            if low < step <= high:
                learning_rate = v
                break

    if self._summary:
        summary.scalar("learning_rate", learning_rate,
                       utils.get_global_step())

    return learning_rate
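# Standalone sketch of the same lookup with illustrative numbers (the
# boundaries and values here are examples, not defaults from this repo):
# a step falls into the interval (boundaries[i-1], boundaries[i]] and gets
# values[i]; anything past the last boundary gets values[-1].
def piecewise(step, boundaries, values):
    for boundary, value in zip(boundaries, values):
        if step <= boundary:
            return value
    return values[-1]

assert piecewise(5000, [10000, 20000], [1e-3, 1e-4, 1e-5]) == 1e-3
assert piecewise(15000, [10000, 20000], [1e-3, 1e-4, 1e-5]) == 1e-4
assert piecewise(25000, [10000, 20000], [1e-3, 1e-4, 1e-5]) == 1e-5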