Example 1
  def Apply(self, lr, var_grad):
    p = self.params

    def _Acc(vg):
      """Updating accumulators."""

      v, g = vg
      with tf.variable_scope(v.op.name):
        a = py_utils.CreateVariable(
            'grad_accumulator',
            py_utils.WeightParams(v.get_shape(),
                                  py_utils.WeightInit.Constant(0.0),
                                  self.params.dtype),
            trainable=False)
        a = tf.assign_add(a, g)

      return py_utils.VarGrad(v, a)

    var_grad = var_grad.Transform(_Acc)

    def _ApplyAndReset():
      with tf.control_dependencies([
          self._opt.Apply(
              lr, py_utils.ApplyGradMultiplier(var_grad, 1. / p.accum_steps))
      ]):
        return tf.group(
            *[tf.assign(a, tf.zeros_like(a)) for _, a in var_grad.Flatten()])

    if self.params.add_summary_in_apply:
      self.AddSummary(lr, self.GetOptimizer(lr), var_grad)
    return tf.cond(
        tf.equal(
            tf.math.floormod(self.global_step, p.accum_steps),
            p.accum_steps - 1), _ApplyAndReset, lambda: tf.group(tf.no_op()))
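The snippet above accumulates gradients into non-trainable slot variables and only applies (and resets) them once every p.accum_steps steps, gated by the floormod condition. A minimal plain-TF1 sketch of the same accumulate-then-apply pattern (accumulate_and_apply and its arguments are illustrative names, not the Lingvo API):

import tensorflow.compat.v1 as tf

def accumulate_and_apply(optimizer, grads_and_vars, global_step, accum_steps):
  """Accumulates grads each step; applies the average every accum_steps steps."""
  accum_ops = []
  accumulators = []
  for grad, var in grads_and_vars:
    acc = tf.get_variable(
        var.op.name + '/grad_accum',
        shape=var.shape,
        initializer=tf.zeros_initializer(),
        trainable=False)
    accum_ops.append(tf.assign_add(acc, grad))
    accumulators.append((acc, var))

  def _apply_and_reset():
    # Apply the averaged gradient, then zero the accumulators.
    apply_op = optimizer.apply_gradients(
        [(acc / accum_steps, var) for acc, var in accumulators])
    with tf.control_dependencies([apply_op]):
      return tf.group(
          *[tf.assign(acc, tf.zeros_like(acc)) for acc, _ in accumulators])

  with tf.control_dependencies(accum_ops):
    # The global step is assumed to be incremented elsewhere.
    return tf.cond(
        tf.equal(tf.mod(global_step, accum_steps), accum_steps - 1),
        _apply_and_reset, lambda: tf.group(tf.no_op()))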
Example 2
    def invoke_async_preconditioner_computation(self, global_step_int32):
        """Invokes SVD preconditioner and graph runs on the CPU."""
        keys_stats_and_rank = []
        for var in self._all_vars_for_preconditioning:
            shape = var.get_shape()
            if not self._fallback_to_diagonal_for_shape(shape):
                partitioned_v = TensorPartitioner.partition_tensor(
                    var, self._partition_info)
                num_partitions = len(partitioned_v)
                for pt_idx, pt_v in enumerate(partitioned_v):
                    pt_v_shape = pt_v.get_shape()
                    preconditioner_exists_for_dim = (
                        self._preconditioner_available_for_dims(pt_v_shape))
                    for i in range(len(pt_v_shape)):
                        if preconditioner_exists_for_dim[i]:
                            rank = sum(preconditioner_exists_for_dim)
                            key = self._key_for_var(var, i, pt_idx)
                            stat = self.get_slot(
                                var,
                                self._statistics_key_for_partition_and_dim(
                                    i, pt_idx, num_partitions))
                            keys_stats_and_rank.append((key, stat, rank))

        if not keys_stats_and_rank:
            return tf.no_op()
        keys, stats, ranks = zip(*keys_stats_and_rank)

        return x_ops.compute_preconditioners(
            stats, [-1.0 / (2.0 * r) for r in ranks],
            global_step_int32,
            keys=keys,
            sync=self._synchronous_preconditioning,
            preconditioner_compute_graphdef=(
                self._preconditioner_compute_graphdef))
Example 3
    def ApplyPostTrainingLoop(self):
        """Applies any computation to run after each TPU training loop.

        Returns:
          Ops to run after training loop ends.
        """
        return tf.no_op()
Example 4
 def _OutfeedEnqueue(self, per_example_tensors):
     if not per_example_tensors:
         return tf.no_op()
     per_example_tensors = py_utils.NestedMap(per_example_tensors)
     device = tpu.core(0) if self.spmd else ''
     with tf.device(device):
         return tpu_ops.outfeed_enqueue_tuple(per_example_tensors.Flatten())
Example 5
    def FProp(self, theta, inputs, *extra_inputs):

        initial_step_seed = py_utils.GetStepSeed()
        final_step_seed = py_utils.GenerateSeedFromName(
            tf.no_op(name='new_step_seed').name)
        num_layers = len(self.sub_layers)

        def Bak(inputs, outputs, d_outputs):
            """Backward step."""
            del inputs  # unused
            output_acts, step_seeds = outputs
            d_outputs = d_outputs[0]

            d_layer_thetas = []
            for layer_idx in reversed(range(num_layers)):
                f_seed, g_seed = step_seeds[layer_idx]
                layer = self.sub_layers[layer_idx]
                layer_theta = theta.sub_layers[layer_idx]

                input_acts, d_inputs, d_theta = layer.ReverseAndGrad(
                    layer_theta, output_acts, d_outputs, f_seed, g_seed,
                    *extra_inputs)

                d_layer_thetas.append(d_theta)
                # Passes reconstructed inputs to the previous layer.
                output_acts = input_acts
                d_outputs = d_inputs
            py_utils.ResetStepSeed(final_step_seed)
            d_theta = py_utils.NestedMap()
            d_theta.sub_layers = list(reversed(d_layer_thetas))

            extra_grads = [tf.zeros_like(t) for t in extra_inputs]
            return [
                tf.zeros_like(initial_step_seed), d_theta, d_inputs,
                extra_grads
            ]

        def Fwd(xs):
            """Forward pass."""
            initial_step_seed, theta, acts, extra_inputs = xs

            py_utils.ResetStepSeed(initial_step_seed)
            layer_step_seeds = []

            for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
                acts, f_seed, g_seed = layer.FProp(layer_theta, acts,
                                                   *extra_inputs)
                layer_step_seeds += [(f_seed, g_seed)]
            return [acts, layer_step_seeds]

        if self.params.custom_gradient:
            acts, _ = py_utils.CallDefun(
                Fwd, [initial_step_seed, theta, inputs, extra_inputs], Bak)
            py_utils.ResetStepSeed(final_step_seed)
            return acts
        else:
            acts = inputs
            for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
                acts, _, _ = layer.FProp(layer_theta, acts, *extra_inputs)
            return acts
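The custom-gradient path above works because each sub-layer is reversible: ReverseAndGrad reconstructs the layer's inputs from its outputs, so Fwd does not need to keep intermediate activations alive for the backward pass. A minimal NumPy sketch of a RevNet-style coupling that has this property (generic f and g; the actual math lives in each sub-layer's FProp/ReverseAndGrad):

import numpy as np

def forward(x1, x2, f, g):
  y1 = x1 + f(x2)
  y2 = x2 + g(y1)
  return y1, y2

def reverse(y1, y2, f, g):
  # Inputs are recovered exactly from outputs, so no activations are stored.
  x2 = y2 - g(y1)
  x1 = y1 - f(x2)
  return x1, x2

# Round-trip check with toy functions.
f = lambda t: np.tanh(t)
g = lambda t: 0.5 * t
y1, y2 = forward(np.ones(3), 2 * np.ones(3), f, g)
x1, x2 = reverse(y1, y2, f, g)
assert np.allclose(x1, np.ones(3)) and np.allclose(x2, 2 * np.ones(3))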
Example 6
    def ApplyPostTrainingLoop(self, global_step):
        """Applies any computation to run after each TPU training loop.

        Args:
          global_step: Global step variable.

        Returns:
          Ops to run after training loop ends.
        """
        return tf.no_op()
Example 7
    def Apply(self, metrics, vmap, gradient_mask=None, gradient_adjuster=None):
        """Computes updates on 'vmap' to optimize 'loss'.

    TODO(rpang): explore merging gradient_mask and gradient_adjuster.

    Args:
      metrics: A Dict[str, (value, weight)], from which loss can be extracted
        according to p.loss_name.
      vmap: A `.NestedMap` object containing variables to optimize.
      gradient_mask: if not None, a dict mapping variable names to a 0/1 scalar.
      gradient_adjuster: if not None, a function that mutates a given var_grads.

    Returns:
      (losses, op, eval_metrics), where
        - losses is a list of scalar tensors;
        - op is a tf.Operation to update variables;
        - eval_metrics is a Dict[str, (value, weight)], where each value/weight
          is a scalar tensor.
    """
        # We apply gradients outside the name_scope to maintain backwards
        # compatibility on variables created by self.optimizer.Apply().
        losses, var_grads, eval_metrics = self._ComputeLossesAndGradients(
            metrics, vmap)
        if 'tpu_embedding_var_grads' in var_grads:
            tpu_embedding_var_grads = var_grads.tpu_embedding_var_grads
            del var_grads.tpu_embedding_var_grads

            tpu_embedding_collection = py_utils.GetTpuEmbeddingGraphCollection(
            )[0]
            assert tpu_embedding_collection
            tpu_emb_update_op, stats = tpu_embedding_collection.ApplyGradients(
                py_utils.GetTaskCallScope(),
                tpu_embedding_var_grads.Transform(
                    lambda var_grad: var_grad.grad))
            eval_metrics.update(stats)
        else:
            tpu_emb_update_op = tf.no_op()

        assert py_utils.GetGlobalStep() is not None
        lr = self.LearningRate()

        var_grads, stats = self.AdjustGradients(
            var_grads,
            gradient_mask=gradient_mask,
            gradient_adjuster=gradient_adjuster)
        eval_metrics.update(stats)
        self._var_grads = var_grads

        eval_metrics['learning_rate'] = (tf.convert_to_tensor(lr),
                                         tf.convert_to_tensor(1.))

        var_update_op = tf.group(
            [tpu_emb_update_op,
             self.optimizer.Apply(lr, var_grads)])
        return losses, var_update_op, eval_metrics
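For reference, the gradient_mask documented above is a per-variable 0/1 scalar, so applying it amounts to scaling each gradient before the optimizer update. A minimal sketch under that assumption (mask_var_grads and the (var, grad) pair layout are illustrative, not the Lingvo implementation):

def mask_var_grads(var_grads, gradient_mask):
  """Zeroes out gradients whose variable name is masked with 0."""
  masked = []
  for var, grad in var_grads:
    scale = gradient_mask.get(var.op.name, 1.0)  # default: keep the gradient
    masked.append((var, grad * scale))
  return masked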
Example 8
 def mask_update_op(self):
     with tf.name_scope(self._spec.name):
         if not self._assign_ops:
             self._get_mask_assign_ops()
         with tf.control_dependencies([
                 tf.assign(self._last_update_step,
                           self._global_step,
                           name='last_mask_update_step_assign')
         ]):
             with tf.control_dependencies(self._assign_ops):
                 tf.logging.info('Updating masks.')
                 return tf.no_op('mask_update')
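The nested control_dependencies blocks above follow a common TF1 idiom: a tf.no_op() placed under control dependencies packages a set of side-effecting assign ops into a single op that callers can fetch or group into the train op. A minimal sketch of the idiom (make_update_op is an illustrative name):

import tensorflow.compat.v1 as tf

def make_update_op(assign_ops, name='update'):
  # Running the returned op forces every op in assign_ops to run first.
  with tf.control_dependencies(assign_ops):
    return tf.no_op(name=name)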
Example 9
 def _GetMaskUpdateOp(self):
   """Returns op to update masks and threshold variables for model pruning."""
   p = self.params
   tp = p.train
   mask_update_op = tf.no_op()
   if tp.pruning_hparams_dict:
     assert isinstance(tp.pruning_hparams_dict, dict)
     pruning_hparams = pruning.get_pruning_hparams().override_from_dict(
         tp.pruning_hparams_dict)
     pruning_obj = pruning.Pruning(
         pruning_hparams, global_step=self.global_step)
     pruning_obj.add_pruning_summaries()
     mask_update_op = pruning_obj.conditional_mask_update_op()
   return mask_update_op
Example 10
 def _Apply():
     if not var_grad.Flatten():
         tf.logging.warning(
             'No gradients are available for optimizer.Apply(). '
             'Make sure this is expected.')
         return tf.no_op()
     if self.params.use_bf16_gradients_ar:
         return self._optimizer.apply_gradients(
             [(tf.cast(g, tf.float32), v)
              for (v, g) in var_grad.Flatten()],
             name='meta_backprop')
     else:
         return self._optimizer.apply_gradients(
             [(g, v) for (v, g) in var_grad.Flatten()],
             name='meta_backprop')
Example 11
 def testExponentialMovingAverage(self):
   p = base_model.SingleTaskModel.Params()
   p.task = BaseTaskTest.TestParams()
   p.task.input = base_input_generator.BaseSequenceInputGenerator.Params()
   p.train.ema_decay = 0.9
   model = p.Instantiate()
   model._task.CreateChild('a',
                           layers.BatchNormLayer.Params().Set(name='a', dim=1))
   model._task._train_op = tf.no_op()
   model._task.ApplyExponentialMovingAverage(model.ema)
   with tf.variable_scope('', reuse=True):
     beta = tf.get_variable('a/beta/var')
     mean = tf.get_variable('a/moving_mean/var')
     self.assertIsNotNone(model.ema.average(beta))
     self.assertIsNone(model.ema.average(mean))
Example 12
 def testExponentialMovingAverage(self):
     p = base_model.SingleTaskModel.Params()
     p.task = BaseTaskTest.TestParams()
     p.task.input = base_input_generator.BaseSequenceInputGenerator.Params()
     p.task.train.ema_decay = 0.9
     p.task.train.ema_decay_moving_vars = False
     model = p.Instantiate()
     task = model._task
     task._train_op = tf.no_op()
     task.ApplyExponentialMovingAverage(model.ema)
     with tf.variable_scope('base_mdl', reuse=True):
         beta = tf.get_variable('x/beta/var')
         mean = tf.get_variable('x/moving_mean/var')
         self.assertIsNotNone(model.ema.average(beta))
         self.assertIsNone(model.ema.average(mean))
Example 13
    def assign_preconditioner_to_host_vars(self):
        """Assign/Grab latest copy of preconditioners."""
        keys_shapes_and_preconditioner_vars = []
        assign_ops = []
        for var in self._all_vars_for_preconditioning:
            shape = var.get_shape()
            if not self._fallback_to_diagonal_for_shape(shape):
                partitioned_v = TensorPartitioner.partition_tensor(
                    var, self._partition_info)
                num_partitions = len(partitioned_v)
                for pt_idx, pt in enumerate(partitioned_v):
                    pt_shape = pt.get_shape()
                    preconditioner_exists_for_dim = (
                        self._preconditioner_available_for_dims(pt_shape))
                    var_rank = len(pt_shape)
                    for i in range(var_rank):
                        if preconditioner_exists_for_dim[i]:
                            key = self._key_for_var(var, i, pt_idx)
                            preconditioner = self.get_slot(
                                var,
                                self._preconditioner_key_for_partition_and_dim(
                                    i, pt_idx, num_partitions))
                            keys_shapes_and_preconditioner_vars.append(
                                (key, tf.shape(preconditioner),
                                 preconditioner))

        if not keys_shapes_and_preconditioner_vars:
            return tf.no_op()

        keys, shapes, preconditioner_vars = zip(
            *keys_shapes_and_preconditioner_vars)

        preconditioner_vals, successes = x_ops.get_preconditioners(
            shapes,
            keys=keys,
            preconditioner_compute_graphdef=(
                self._preconditioner_compute_graphdef))

        for preconditioner_var, preconditioner_val, success in zip(
                preconditioner_vars, preconditioner_vals, successes):
            success_mult = tf.cast(success, preconditioner.dtype)
            assign_ops.append(
                state_ops.assign(
                    preconditioner_var,
                    (1.0 - success_mult) * preconditioner_var +
                    success_mult * preconditioner_val))
        return tf.group(*assign_ops)
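The (1.0 - success_mult) * preconditioner_var + success_mult * preconditioner_val expression above is a branch-free conditional assignment: when success is true the variable takes the freshly computed preconditioner, otherwise it keeps its previous value, without building a tf.cond per variable. A minimal sketch of that pattern (blended_assign is an illustrative name):

import tensorflow.compat.v1 as tf

def blended_assign(var, new_value, success):
  """Assigns new_value to var if success is True, else keeps var unchanged."""
  s = tf.cast(success, var.dtype)  # 1.0 on success, 0.0 otherwise
  return tf.assign(var, (1.0 - s) * var + s * new_value)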
Example 14
    def _OutfeedDequeueLoop(self, per_example_tensors, num_loops, num_devices):
        """Process all per-example tensor outfeed data for a TPU sess.run.

    Args:
      per_example_tensors: dict of key -> tensor as generated by TpuTrainStep.
      num_loops: number of times that TpuTrainStep will be executed by TpuTrain.
      num_devices: number of TPU cores assigned to this process.

    Returns:
      A dict of per-example tensors from the latest TpuTrainStep.
    """
        if not per_example_tensors:
            return tf.no_op()

        tensor_shapes = [
            py_utils.GetShape(per_example_tensors[key])
            for key in sorted(per_example_tensors)
        ]
        tensor_types = [
            tf.as_dtype(per_example_tensors[key].dtype)
            for key in sorted(per_example_tensors)
        ]

        def LoopBody(i, *input_arrays):
            """Process outfeed data for a single TpuTrainStep.

      Args:
        i: current loop index.
        *input_arrays: One tf.TensorArray per outfeed tensor.

      Returns:
        i+1 (new index) plus post-write tf.TensorArray handles.
      """
            # Outfeed ops execute on each JF node, so they must be located on the
            # nodes.
            outfeed_devices = []
            device_assignment = py_utils.GetTpuDeviceAssignment()
            assert device_assignment
            for replica in range(device_assignment.num_replicas):
                for core in range(device_assignment.num_cores_per_replica):
                    with tf.device(device_assignment.host_device(
                            replica, core)):
                        outfeed_devices.append(
                            tpu_ops.outfeed_dequeue_tuple(
                                tensor_types,
                                tensor_shapes,
                                device_ordinal=device_assignment.tpu_ordinal(
                                    replica, core)))
            offset = i * num_devices
            output_arrays = list(input_arrays)
            # Each output_array holds a different per-example tensor. We get results
            # for each tensor from each TPU for each TpuTrainStep call.
            for j in range(len(output_arrays)):
                for k in range(len(outfeed_devices)):
                    output_arrays[j] = output_arrays[j].write(
                        offset + k, outfeed_devices[k][j])

            return tuple([i + 1] + output_arrays)

        def LoopCond(i, *output_arrays):
            del output_arrays
            return i < num_loops

        output_arrays = []
        for i in range(len(tensor_shapes)):
            output_arrays.append(
                tf.TensorArray(tensor_types[i],
                               size=num_loops * num_devices,
                               element_shape=tensor_shapes[i]))
        # Loop once for each time that TpuTrainStep runs.
        output_arrays = tf.while_loop(LoopCond,
                                      LoopBody, [0] + output_arrays,
                                      parallel_iterations=1)[1:]
        concatenated_arrays = [array.concat() for array in output_arrays]
        return dict(zip(sorted(per_example_tensors), concatenated_arrays))
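The dequeue loop above drives tf.while_loop with one tf.TensorArray per outfeed tensor, writing num_devices results per step and concatenating at the end. A minimal sketch of that accumulation pattern with a toy producer in place of the TPU outfeed (collect and produce_fn are illustrative names):

import tensorflow.compat.v1 as tf

def collect(num_loops, produce_fn, element_shape, dtype=tf.float32):
  """Runs produce_fn num_loops times and concatenates the results."""
  ta = tf.TensorArray(dtype, size=num_loops, element_shape=element_shape)

  def cond(i, ta):
    del ta  # unused
    return i < num_loops

  def body(i, ta):
    return i + 1, ta.write(i, produce_fn(i))

  _, ta = tf.while_loop(cond, body, [0, ta], parallel_iterations=1)
  return ta.concat()  # concatenates along axis 0

# Example usage with a toy producer:
#   collect(3, lambda i: tf.fill([2], tf.cast(i, tf.float32)), element_shape=[2])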
Example 15
 def _OutfeedEnqueue(self, per_example_tensors):
     if not per_example_tensors:
         return tf.no_op()
     per_example_tensors = py_utils.NestedMap(per_example_tensors)
     return tpu_ops.outfeed_enqueue_tuple(per_example_tensors.Flatten())
Example 16
 def _apply_sparse(self, grad, var):
     return tf.no_op()
Example 17
 def _resource_apply_sparse(self, grad_values, var, grad_indices):
     return tf.no_op()
Example 18
 def PostTrainingStepUpdate(self, global_step):
   summary_utils.scalar('cap', self._Value(global_step))
   return tf.no_op()
Example 19
 def NoOP(*args, **kwargs):
   return tf.no_op()
Example 20
  def testBatchNormLayer(self):
    p = base_model.SingleTaskModel.Params()
    p.task = self.TestParams(layers.BatchNormLayer.Params().Set(dim=1))
    p.task.train.ema_decay = 0.9
    p.task.train.ema_decay_moving_vars = True
    model = p.Instantiate()
    self.assertIsNotNone(model.ema)
    task = model._task
    task._train_op = tf.no_op()
    task.ApplyExponentialMovingAverage(model.ema)

    layer = task.encoder
    self.assertLen(layer.vars, 4)
    for var in layer.vars.Flatten():
      self.assertIsNotNone(model.ema.average(var), msg=var.name)
    beta = layer.vars.beta
    mean = layer.vars.moving_mean

    global_step = 100
    beta_1 = np.asarray([.2])
    mean_1 = np.asarray([.03])
    beta_1_ema = beta_1 * .1
    mean_1_ema = mean_1 * .1
    with self.session() as sess:
      # Test EMA values.
      sess.run(tf.global_variables_initializer())
      sess.run(tf.assign(py_utils.GetOrCreateGlobalStepVar(), global_step))
      sess.run(tf.assign(beta, beta_1))
      sess.run(tf.assign(mean, mean_1))
      sess.run(task._post_train_ops)

      self.assertAllClose([beta_1, beta_1_ema, mean_1, mean_1_ema],
                          sess.run([
                              beta,
                              model.ema.average(beta), mean,
                              model.ema.average(mean)
                          ]))

      # Test checkpointer.
      train_dir = os.path.join(self.get_temp_dir(), 'testSaveRestore')
      os.mkdir(train_dir)
      saver = checkpointer.Checkpointer(train_dir, model)
      saver.Save(sess, model.global_step)

      self.assertTrue(
          os.path.isfile(
              os.path.join(train_dir, 'ckpt-%08d.index' % global_step)))

    # Restore from ckpt in training mode.
    with self.session(graph=tf.Graph()) as sess:
      model = p.Instantiate()
      self.assertIsNotNone(model.ema)
      task = model._task
      task._train_op = tf.no_op()
      task.ApplyExponentialMovingAverage(model.ema)
      layer = task.encoder
      for var in layer.vars.Flatten():
        self.assertIsNotNone(model.ema.average(var), msg=var.name)
      beta = layer.vars.beta
      mean = layer.vars.moving_mean

      saver = checkpointer.Checkpointer(train_dir, model)
      saver.RestoreIfNeeded(sess)

      self.assertAllClose([beta_1, beta_1_ema, mean_1, mean_1_ema],
                          sess.run([
                              beta,
                              model.ema.average(beta), mean,
                              model.ema.average(mean)
                          ]))

    # Restore from ckpt in eval mode.
    with self.session(graph=tf.Graph()) as sess, self.SetEval(True):
      model = p.Instantiate()
      self.assertIsNotNone(model.ema)
      task = model._task
      # task._train_op = tf.no_op()
      # task.ApplyExponentialMovingAverage(model.ema)
      layer = task.encoder
      # for var in layer.vars.Flatten():
      #   self.assertIsNotNone(model.ema.average(var), msg=var.name)
      beta = layer.vars.beta
      mean = layer.vars.moving_mean

      saver = checkpointer.Checkpointer(train_dir, model)
      saver.RestoreIfNeeded(sess)

      # Both beta and mean should use the EMA value.
      self.assertAllClose([beta_1_ema, mean_1_ema], sess.run([beta, mean]))
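The expected values in the test above follow directly from how tf.train.ExponentialMovingAverage updates its shadow variables: the shadow starts at the variable's initial value (0 here), so a single update with decay=0.9 gives 0.9 * 0 + 0.1 * value, hence beta_1 * .1 and mean_1 * .1. A minimal plain-TF1 sketch of those mechanics, outside the Lingvo task wiring:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
var = tf.get_variable('w', shape=[1], initializer=tf.zeros_initializer())
ema = tf.train.ExponentialMovingAverage(decay=0.9)
ema_update_op = ema.apply([var])  # creates the shadow variable for `var`

assign_op = tf.assign(var, [0.2])
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(assign_op)      # var <- 0.2
  sess.run(ema_update_op)  # shadow <- 0.9 * 0.0 + 0.1 * 0.2 = 0.02
  print(sess.run([var, ema.average(var)]))  # [0.2] and [0.02]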
Example 21
 def _Accum():
     return tf.no_op()
Example 22
 def no_update_op():
     return tf.no_op()
Example 23
 def control_after_assigns(self):
     if not self._assign_ops:
         return tf.no_op()
     with tf.control_dependencies(self._assign_ops):
         return tf.no_op()