Example No. 1
    def apply_gradients(self, grads_and_vars):
        self._iterations += 1
        grads, var_list = list(zip(*grads_and_vars))
        new_grads = []

        if self._summaries:
            summary.scalar("optimizer/scale", self._scale,
                           utils.get_global_step())

        # Rescale each gradient by 1 / scale; if any gradient norm is
        # non-finite, skip this update entirely (dynamic loss scaling).
        for grad in grads:
            if grad is None:
                new_grads.append(None)
                continue

            norm = grad.data.norm()

            if not torch.isfinite(norm):
                self._update_if_not_finite_grads()
                return
            else:
                # Rescale gradients
                new_grads.append(grad.data.float().mul_(1.0 / self._scale))

        self._update_if_finite_grads()
        self._optimizer.apply_gradients(zip(new_grads, var_list))
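
This `apply_gradients` appears to come from a loss-scaling optimizer wrapper: the loss is assumed to have been multiplied by `self._scale` before the backward pass, so gradients must be divided by the same factor before the wrapped optimizer sees them. A minimal, self-contained sketch of that unscaling step (hypothetical helper, not part of the original class):

import torch

def unscale_gradients(grads, scale):
    """Divide every gradient by `scale`; return None if any gradient
    contains a non-finite value, in which case the caller should skip
    the optimizer step."""
    unscaled = []
    for grad in grads:
        if grad is None:
            unscaled.append(None)
            continue
        if not torch.isfinite(grad).all():
            return None
        unscaled.append(grad.float() / scale)
    return unscaled
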
    def __call__(self, step):
        # Linear warmup from the initial to the maximum learning rate,
        # followed by inverse square-root decay.
        if step <= self._warmup_steps:
            lr_step = self._maximum_learning_rate - self._initial_learning_rate
            lr_step /= self._warmup_steps
            lr = self._initial_learning_rate + lr_step * step
        else:
            step = step / self._warmup_steps
            lr = self._maximum_learning_rate * (step**-0.5)

        if self._summary:
            summary.scalar("learning_rate", lr, utils.get_global_step())

        return lr
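
For reference, the same warmup-then-rsqrt shape as a standalone function with hypothetical constants (the actual `_initial_learning_rate`, `_maximum_learning_rate` and `_warmup_steps` depend on how the schedule object is constructed):

def warmup_rsqrt_lr(step, initial_lr=1e-7, max_lr=7e-4, warmup_steps=4000):
    # Linear warmup from initial_lr to max_lr over warmup_steps, then decay
    # proportionally to 1 / sqrt(step / warmup_steps).
    if step <= warmup_steps:
        return initial_lr + (max_lr - initial_lr) / warmup_steps * step
    return max_lr * (step / warmup_steps) ** -0.5

# warmup_rsqrt_lr(4000) ~= 7e-4, warmup_rsqrt_lr(16000) ~= 3.5e-4
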
    def __call__(self, step):
        # See reference: "The Best of Both Worlds: Combining Recent Advances
        # in Neural Machine Translation" (Chen et al., 2018): ramp up linearly
        # over the warmup steps, hold, then decay exponentially between the
        # start and end decay steps.
        n = self._n
        p = self._warmup_steps / n
        s = n * self._start_decay_step
        e = n * self._end_decay_step

        learning_rate = self._learning_rate

        learning_rate *= min(
            1.0 + (n - 1) * step / float(n * p), n,
            n * ((2 * n)**(float(s - n * step) / float(e - s))))

        if self._summary:
            summary.scalar("learning_rate", learning_rate,
                           utils.get_global_step())

        return learning_rate
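
A standalone sketch of the same formula with hypothetical constants, to make its shape explicit: a linear ramp from the base rate to n times the base rate over the warmup steps, a constant plateau, then an exponential decay that reaches half the base rate at the end-decay step.

def gnmt_lr(step, base_lr=1e-3, warmup_steps=500, start_decay_step=600000,
            end_decay_step=900000, n=8):
    # Mirrors the expression above; all default values are illustrative only.
    p = warmup_steps / n
    s = n * start_decay_step
    e = n * end_decay_step
    return base_lr * min(
        1.0 + (n - 1) * step / float(n * p), n,
        n * ((2 * n) ** (float(s - n * step) / float(e - s))))
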
    def step_fn(features, step):
        # One training step: compute the loss and gradients for a batch,
        # optionally clip, then apply the update. `train_fn`, `optimizer`,
        # `model`, `params` and `epoch` come from the enclosing training loop.
        t = time.time()
        features = data.lookup(features, "train", params)
        loss = train_fn(features)
        gradients = optimizer.compute_gradients(loss, list(model.parameters()))
        if params.clip_grad_norm:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           params.clip_grad_norm)

        optimizer.apply_gradients(
            zip(gradients, list(model.named_parameters())))

        t = time.time() - t

        summary.scalar("loss", loss, step, write_every_n_steps=1)
        summary.scalar("global_step/sec", t, step)

        print("epoch = %d, step = %d, loss = %.3f (%.3f sec)" %
              (epoch + 1, step, float(loss), t))
    def __call__(self, step):
        # Piecewise-constant schedule: values[i] is used while
        # boundaries[i - 1] < step <= boundaries[i]; values has one more
        # entry than boundaries.
        boundaries = self._boundaries
        values = self._values
        learning_rate = values[0]

        if step <= boundaries[0]:
            learning_rate = values[0]
        elif step > boundaries[-1]:
            learning_rate = values[-1]
        else:
            for low, high, v in zip(boundaries[:-1], boundaries[1:],
                                    values[1:-1]):
                if step > low and step <= high:
                    learning_rate = v
                    break

        if self._summary:
            summary.scalar("learning_rate", learning_rate,
                           utils.get_global_step())

        return learning_rate
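
The same lookup can be written more compactly with `bisect`; a sketch with hypothetical boundaries and values (one more value than boundaries):

import bisect

def piecewise_constant_lr(step, boundaries, values):
    # values[i] applies while boundaries[i - 1] < step <= boundaries[i];
    # the last value covers every step beyond the last boundary.
    return values[bisect.bisect_left(boundaries, step)]

# piecewise_constant_lr(10000, [10000, 20000], [1e-3, 5e-4, 1e-4])  -> 1e-3
# piecewise_constant_lr(10001, [10000, 20000], [1e-3, 5e-4, 1e-4])  -> 5e-4
# piecewise_constant_lr(25000, [10000, 20000], [1e-3, 5e-4, 1e-4])  -> 1e-4
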
Example No. 6
def _save_summary(grads_and_vars):
    # Write per-parameter histogram and norm summaries and return the global
    # gradient norm. Each entry pairs a gradient with a (name, parameter)
    # tuple from model.named_parameters().
    total_norm = 0.0

    for grad, var in grads_and_vars:
        if grad is None:
            continue

        _, var = var  # keep the parameter, drop its name
        grad_norm = grad.data.norm()
        total_norm += grad_norm**2
        summary.histogram(var.tensor_name, var, utils.get_global_step())
        summary.scalar("norm/" + var.tensor_name, var.norm(),
                       utils.get_global_step())
        summary.scalar("grad_norm/" + var.tensor_name, grad_norm,
                       utils.get_global_step())

    total_norm = total_norm**0.5
    summary.scalar("grad_norm", total_norm, utils.get_global_step())

    return float(total_norm)
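
Stripped of the summary calls, the global-norm computation above amounts to the following (hypothetical helper operating directly on a parameter iterable):

def global_grad_norm(parameters):
    # Global L2 norm over all parameter gradients, i.e. the square root of
    # the sum of squared per-parameter gradient norms.
    total = 0.0
    for p in parameters:
        if p.grad is not None:
            total += float(p.grad.data.norm()) ** 2
    return total ** 0.5
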