Example 1
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      An `Operation` that applies the gradients. If `global_step` was not None,
      that operation also increments `global_step`.

    Raises:
      ValueError: If the grads_and_vars is malformed.
    """
        summed_grads_and_vars = []
        for (grad, var) in grads_and_vars:
            if grad is None:
                summed_grads_and_vars.append((grad, var))
            else:
                with ops.colocate_with(grad):
                    summed_grads_and_vars.append(
                        (gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            grad,
                            num_mini_batches=self._num_mini_batches,
                            verify_usage=self._verify_usage), var))
        return self._opt.apply_gradients(summed_grads_and_vars, global_step,
                                         name)
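
Example 1 is a single method of an optimizer wrapper. For context, here is a minimal sketch of the surrounding class, assuming it subclasses the standard TF1 optimizer.Optimizer base class. The class name and constructor are illustrative, not taken from the source; only the attributes the method reads (self._opt, self._num_mini_batches, self._verify_usage) come from the snippet itself.

from tensorflow.python.training import optimizer


class GradientAccumulationOptimizer(optimizer.Optimizer):  # assumed name
    def __init__(self, opt, num_mini_batches, verify_usage=True,
                 name="GradientAccumulationOptimizer"):
        super().__init__(False, name)
        self._opt = opt                          # the wrapped optimizer
        self._num_mini_batches = num_mini_batches
        self._verify_usage = verify_usage

    def compute_gradients(self, loss, var_list=None, **kwargs):
        # Gradient computation is delegated to the wrapped optimizer;
        # accumulation happens in apply_gradients (Example 1 above).
        return self._opt.compute_gradients(loss, var_list=var_list, **kwargs)
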
Example 2
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        summed_grads_and_vars = []
        for (grad, var) in grads_and_vars:
            if grad is None:
                summed_grads_and_vars.append((grad, var))
            else:
                with ops.colocate_with(grad):
                    # gradient accumulation
                    if self._gradients_to_accumulate > 1 and not self._pipelining:
                        grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            grad / self._gradients_to_accumulate,
                            num_mini_batches=self._gradients_to_accumulate)

                    # replication
                    if self._replicas > 1:
                        grad = gen_poputil_ops.ipu_replication_normalise(
                            cross_replica_ops.cross_replica_sum(grad))

                    grad = math_ops.cast(grad, var.dtype)
                    summed_grads_and_vars.append((grad, var))

        if self._pipelining:
            # Weight decay can be applied here because apply_gradients is only
            # called on the last accumulation step.
            summed_grads_and_vars = self.add_WD(summed_grads_and_vars)

        ret = self._optimizer.apply_gradients(summed_grads_and_vars,
                                              global_step, name)
        if self._sharded:
            sharding.propagate_sharding(ops.get_default_graph())
        return ret
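
Note the difference from Example 6: here each gradient is divided by self._gradients_to_accumulate before being accumulated, so the summed gradient that is eventually applied equals the mean over the accumulated mini-batches. A tiny plain-Python illustration of that equivalence (the numbers are made up):

grads = [1.0, 2.0, 3.0, 6.0]             # per-mini-batch gradients (made up)
n = len(grads)                           # plays the role of _gradients_to_accumulate
accumulated = sum(g / n for g in grads)  # what Example 2 effectively applies
mean_grad = sum(grads) / n
assert accumulated == mean_grad == 3.0
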
Example 3
def body(i, y):
    ga = gen_poputil_ops.ipu_stateful_gradient_accumulate(
        array_ops.ones_like(y), num_mini_batches=5)
    cr = gen_popops_ops.ipu_cross_replica_sum(ga)
    y = y + cr
    i = i + 1
    return (i, y)
Example 4
def body(i, x, y):
    x = x + gen_poputil_ops.ipu_stateful_gradient_accumulate(
        array_ops.ones_like(x),
        num_mini_batches=5,
        verify_usage=False)
    y = y + array_ops.ones_like(x)
    i = i + 1
    return (i, x, y)
Example 5
def body(i, y):
    cr = gen_popops_ops.ipu_cross_replica_sum(
        array_ops.ones_like(y))
    norm = gen_poputil_ops.ipu_replication_normalise(cr)
    ga = gen_poputil_ops.ipu_stateful_gradient_accumulate(
        norm, num_mini_batches=5)
    y = y + ga
    i = i + 1
    return (i, y)
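
Examples 3-5 are bare loop bodies taken from test-style code. Below is a hedged sketch of how such a body is typically driven in the Graphcore TensorFlow 1.x port: wrapped in loops.repeat, compiled with ipu_compiler.compile inside an IPU device scope, and run in a session. Example 4's body is reused verbatim (it sets verify_usage=False, so the accumulation op can run outside an optimizer); the import path for the generated op module and the ipu_compiler/loops/scopes/utils helpers are assumptions about the IPU API, not something shown in the examples.

import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.compiler.plugin.poplar.ops import gen_poputil_ops  # assumed path
from tensorflow.python.ipu import ipu_compiler, loops, scopes, utils
from tensorflow.python.ops import array_ops

tf.disable_v2_behavior()


def body(i, x, y):
    # Example 4's body: accumulate a gradient of ones over 5 mini-batches.
    x = x + gen_poputil_ops.ipu_stateful_gradient_accumulate(
        array_ops.ones_like(x),
        num_mini_batches=5,
        verify_usage=False)
    y = y + array_ops.ones_like(x)
    i = i + 1
    return (i, x, y)


def my_net(x, y):
    i = tf.constant(0)
    # Run the body for 10 iterations, i.e. two full accumulation periods of 5.
    return loops.repeat(10, body, [i, x, y])


with tf.device("cpu"):
    x = tf.placeholder(np.float32, shape=[2])
    y = tf.placeholder(np.float32, shape=[2])

with scopes.ipu_scope("/device:IPU:0"):
    result = ipu_compiler.compile(my_net, inputs=[x, y])

cfg = utils.create_ipu_config()
cfg = utils.auto_select_ipus(cfg, 1)
utils.configure_ipu_system(cfg)

with tf.Session() as sess:
    print(sess.run(result, feed_dict={x: np.zeros([2], np.float32),
                                      y: np.zeros([2], np.float32)}))
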
Example 6
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        summed_grads_and_vars = []
        for (grad, var) in grads_and_vars:
            if grad is None:
                summed_grads_and_vars.append((grad, var))
            else:
                with ops.colocate_with(grad):
                    # gradient accumulation
                    if self._gradient_accumulation_count > 1 and not self._pipelining:
                        grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            grad,
                            num_mini_batches=self._gradient_accumulation_count)

                    # replication
                    if self._replicas > 1:
                        grad = gen_poputil_ops.ipu_replication_normalise(
                            cross_replica_ops.cross_replica_sum(grad))

                    # distribution with IPUMultiWorkerStrategy needs
                    # additional normalisation by the number of workers
                    if isinstance(
                            distribute.get_strategy(),
                            ipu_multi_worker_strategy.IPUMultiWorkerStrategy):
                        grad /= distribute.get_strategy().num_replicas_in_sync

                    grad = math_ops.cast(grad, var.dtype)
                    summed_grads_and_vars.append((grad, var))

        if self._pipelining:
            # Weight decay can be applied here because apply_gradients is only
            # called on the last accumulation step.
            summed_grads_and_vars = self.add_WD(summed_grads_and_vars)

        if self._grad_scale != 1.0:
            # don't rescale batch norm moving average statistics as they are
            # not affected by loss scaling
            summed_grads_and_vars = [
                (grad, var) if 'batch_norm/moving_' in var.name else
                (grad / self._grad_scale, var)
                for grad, var in summed_grads_and_vars
            ]
        ret = self._optimizer.apply_gradients(summed_grads_and_vars,
                                              global_step, name)
        if self._sharded:
            sharding.propagate_sharding(ops.get_default_graph())
        return ret
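
Example 6 also undoes loss scaling by dividing every gradient by self._grad_scale, except the batch norm moving-average statistics, which the comment notes are not affected by loss scaling. A small stand-alone check of why that division recovers the unscaled gradient, using plain TensorFlow only (the names and numbers are illustrative):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

grad_scale = 128.0                      # stands in for self._grad_scale
x = tf.constant(3.0)
loss = x * x                            # dloss/dx = 2x = 6
scaled_loss = loss * grad_scale         # what loss scaling differentiates

g_plain = tf.gradients(loss, [x])[0]
g_rescaled = tf.gradients(scaled_loss, [x])[0] / grad_scale

with tf.Session() as sess:
    print(sess.run([g_plain, g_rescaled]))  # both print 6.0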