Example no. 1
    def __init__(self, model, optimizer, checkpoint=None, is_master=True):
        """Initializes the trainer.

        Args:
          model: A :class:`opennmt.models.Model` instance to train.
          optimizer: A ``tf.keras.optimizers.Optimizer`` instance.
          checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance. If
            not set, no checkpoints will be saved.
          is_master: Whether this trainer instance is the master trainer.
        """
        self._checkpoint = checkpoint
        self._is_master = is_master
        self._model = model
        if checkpoint is not None:
            self._summary_writer = tf.summary.create_file_writer(
                checkpoint.model_dir)
        else:
            self._summary_writer = tf.summary.create_noop_writer()
        self._training_stats = None
        self._gradient_accumulator = optimizer_util.GradientAccumulator()

        if optimizer is None:
            raise ValueError("No optimizer is defined")
        graph_optimizer_options = tf.config.optimizer.get_experimental_options()
        mixed_precision_enabled = graph_optimizer_options.get(
            "auto_mixed_precision")
        if (mixed_precision_enabled and not isinstance(
                optimizer,
                tf.keras.mixed_precision.experimental.LossScaleOptimizer)):
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, "dynamic")
        self._optimizer = optimizer
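
A minimal sketch of how this constructor might be wired up. The model choice, the optimizer, and the Checkpoint arguments below are illustrative assumptions, not taken from the snippet:

    import tensorflow as tf
    import opennmt
    from opennmt.utils import checkpoint as checkpoint_util

    model = opennmt.models.TransformerBase()  # assumed example model
    optimizer = tf.keras.optimizers.Adam()    # any tf.keras optimizer
    # The Checkpoint arguments here are assumed for illustration.
    checkpoint = checkpoint_util.Checkpoint(
        model, optimizer=optimizer, model_dir="run")
    trainer = Trainer(model, optimizer, checkpoint=checkpoint)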
Example no. 2
    def __init__(self, checkpoint, devices=None, mixed_precision=False):
        """Initializes the trainer.

        Args:
          checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance.
          devices: List of device strings to use for training.
          mixed_precision: Whether mixed precision is enabled or not.
        """
        if not devices:
            # Train with 1 device by default.
            devices = misc.get_devices(count=1)
        self._checkpoint = checkpoint
        self._mixed_precision = mixed_precision
        self._model = checkpoint.model
        self._strategy = tf.distribute.MirroredStrategy(devices=devices)
        self._summary_writer = tf.summary.create_file_writer(
            checkpoint.model_dir)

        optimizer = checkpoint.optimizer
        if optimizer is None:
            raise ValueError("No optimizer is defined")
        if mixed_precision:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, "dynamic")
        self._optimizer = optimizer

        with self._strategy.scope():
            # Create some variables under the strategy scope.
            _ = self._optimizer.iterations
            self._model.create_variables()
            self._gradient_accumulator = optimizer_util.GradientAccumulator()
Example no. 3
    def __init__(self, model, optimizer, checkpoint=None):
        """Initializes the trainer.

        Args:
          model: A :class:`opennmt.models.Model` instance to train.
          optimizer: A ``tf.keras.optimizers.Optimizer`` instance.
          checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance. If
            not set, no checkpoints will be saved.
        """
        self._checkpoint = checkpoint
        self._model = model
        if checkpoint is not None:
            self._summary_writer = tf.summary.create_file_writer(
                checkpoint.model_dir)
        else:
            self._summary_writer = tf.summary.create_noop_writer()
        self._training_stats = None
        self._gradient_accumulator = optimizer_util.GradientAccumulator()
        self._mixed_precision = misc.mixed_precision_enabled()

        if optimizer is None:
            raise ValueError("No optimizer is defined")
        if self._mixed_precision:
            optimizer = _add_mixed_precision_wrapper(optimizer)
        self._optimizer = optimizer
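
The helper _add_mixed_precision_wrapper is referenced here but not defined in this section. A plausible sketch, inferred from the equivalent inline logic in Examples 1 and 2 (the actual definition may differ):

    def _add_mixed_precision_wrapper(optimizer):
        # Inferred from Examples 1 and 2: wrap with dynamic loss scaling
        # unless the optimizer is already wrapped.
        if not isinstance(
                optimizer,
                tf.keras.mixed_precision.experimental.LossScaleOptimizer):
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, "dynamic")
        return optimizer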
Example no. 4
    def testGradientAccumulator(self):
        accumulator = utils.GradientAccumulator()
        accumulator([tf.constant([1.0, 2.0])])
        accumulator([tf.constant([-2.0, 1.0])])
        accumulator([tf.constant([-1.0, 2.0])])
        with self.assertRaises(ValueError):
            accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
        self.assertEqual(accumulator.step, 3)
        self.assertEqual(len(accumulator.gradients), 1)
        self.assertAllEqual(accumulator.gradients[0], [-2.0, 5.0])
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        self.assertAllEqual(accumulator.gradients[0], [0.0, 0.0])
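
This test pins down the accumulator's contract: gradients are summed element-wise, a step counter tracks the number of calls, a mismatched gradient count raises ValueError, and reset() zeroes everything. A minimal single-device sketch that satisfies it (the real opennmt.utils implementation additionally handles replica contexts, as the next example shows):

    import tensorflow as tf

    class GradientAccumulator:
        """Minimal sketch; single-device behavior only."""

        def __init__(self):
            self._gradients = []
            self._step = tf.Variable(0, dtype=tf.int64, trainable=False)

        @property
        def step(self):
            return self._step.value()

        @property
        def gradients(self):
            return [gradient.value() for gradient in self._gradients]

        def __call__(self, gradients):
            # Lazily create one accumulation variable per gradient.
            if not self._gradients:
                self._gradients = [
                    tf.Variable(tf.zeros_like(g), trainable=False)
                    for g in gradients]
            if len(gradients) != len(self._gradients):
                raise ValueError("Expected %d gradients, got %d"
                                 % (len(self._gradients), len(gradients)))
            for accum, gradient in zip(self._gradients, gradients):
                accum.assign_add(gradient)
            self._step.assign_add(1)

        def reset(self):
            self._step.assign(0)
            for gradient in self._gradients:
                gradient.assign(tf.zeros_like(gradient))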
Example no. 5
    def testGradientAccumulatorDistributionStrategy(self):
        devices = tf.config.list_logical_devices(device_type="CPU")
        strategy = tf.distribute.MirroredStrategy(devices=devices[:2])

        with strategy.scope():
            accumulator = utils.GradientAccumulator()
            variable = tf.Variable([4.0, 3.0])
            sgd = tf.keras.optimizers.SGD(1.0)
            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

        def accumulate_on_replica(gradient):
            accumulator([gradient])

        def apply_on_replica():
            sgd.apply_gradients(list(zip(accumulator.gradients, [variable])))

        @tf.function
        def accumulate(grad1, grad2):
            with strategy.scope():
                local_variables = strategy.experimental_local_results(
                    gradient_placeholder)
                local_variables[0].assign(grad1)
                local_variables[1].assign(grad2)
                strategy.run(accumulate_on_replica,
                             args=(gradient_placeholder, ))

        @tf.function
        def apply_grad():
            with strategy.scope():
                strategy.run(apply_on_replica)

        def _check_local_values(grad1, grad2):
            values = strategy.experimental_local_results(
                accumulator._gradients[0])
            self.assertAllEqual(values[0].value(), grad1)
            self.assertAllEqual(values[1].value(), grad2)

        accumulate([1.0, 2.0], [-1.0, 1.0])
        accumulate([3.0, -1.0], [-1.0, -1.0])
        accumulate([-2.0, 2.0], [3.0, -2.0])
        self.assertEqual(accumulator.step, 3)
        _check_local_values([2.0, 3.0], [1.0, -2.0])
        apply_grad()
        self.assertAllEqual(
            variable.value(),
            [1.0, 2.0])  # [4.0 - (2.0 + 1.0), 3.0 - (3.0 - 2.0)]
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        _check_local_values([0.0, 0.0], [0.0, 0.0])
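
Note that this test only exercises two replicas if at least two logical CPU devices exist. Example 7 below creates them through the older experimental API; with current TensorFlow, the equivalent setup is roughly the following sketch, which must run before any other TensorFlow op touches the devices:

    physical_devices = tf.config.list_physical_devices("CPU")
    tf.config.set_logical_device_configuration(
        physical_devices[0],
        [tf.config.LogicalDeviceConfiguration(),
         tf.config.LogicalDeviceConfiguration()])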
Example no. 6
  def __init__(self, checkpoint, devices=None):
    """Initializes the trainer.

    Args:
      checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance.
      devices: List of device strings to use for training.
    """
    super(DistributionStrategyTrainer, self).__init__(checkpoint)
    if not devices:
      devices = misc.get_devices(count=1)  # Train with 1 device by default.
    self._strategy = tf.distribute.MirroredStrategy(devices=devices)
    self._words_counters = {}
    with self._strategy.scope():
      # Create some variables under the strategy scope.
      _ = self._optimizer.iterations
      self._gradient_accumulator = optimizer_util.GradientAccumulator()
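
Reading self._optimizer.iterations here is not a no-op: accessing the property forces Keras to create the underlying iteration-counter variable, and doing so inside strategy.scope() ensures it is created under the distribution strategy like the other variables. The same pattern in isolation, as a small sketch:

    strategy = tf.distribute.MirroredStrategy()
    optimizer = tf.keras.optimizers.SGD(1.0)
    with strategy.scope():
        _ = optimizer.iterations  # creates the counter under the strategy scope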
Example no. 7
  def testGradientAccumulatorDistributionStrategy(self):
    physical_devices = tf.config.experimental.list_physical_devices("CPU")
    tf.config.experimental.set_virtual_device_configuration(
        physical_devices[0],
        [tf.config.experimental.VirtualDeviceConfiguration(),
         tf.config.experimental.VirtualDeviceConfiguration()])

    devices = tf.config.experimental.list_logical_devices(device_type="CPU")
    strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])

    with strategy.scope():
      accumulator = utils.GradientAccumulator()
      variable = tf.Variable([4.0, 3.0])
      sgd = tf.keras.optimizers.SGD(1.0)
      gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

    def accumulate_on_replica(gradient):
      accumulator([gradient])

    def apply_on_replica():
      sgd.apply_gradients(list(zip(accumulator.gradients, [variable])))

    @tf.function
    def accumulate(grad1, grad2):
      with strategy.scope():
        gradient_placeholder.values[0].assign(grad1)
        gradient_placeholder.values[1].assign(grad2)
        strategy.experimental_run_v2(accumulate_on_replica, args=(gradient_placeholder,))

    @tf.function
    def apply_grad():
      with strategy.scope():
        strategy.experimental_run_v2(apply_on_replica)

    accumulate([1.0, 2.0], [-1.0, 1.0])
    accumulate([3.0, -1.0], [-1.0, -1.0])
    accumulate([-2.0, 2.0], [3.0, -2.0])
    self.assertEqual(accumulator.step, 3)
    self.assertAllEqual(accumulator._gradients[0].values[0].value(), [2.0, 3.0])
    self.assertAllEqual(accumulator._gradients[0].values[1].value(), [1.0, -2.0])
    apply_grad()
    self.assertAllEqual(variable.value(), [1.0, 2.0])  # [4.0 - (2.0 + 1.0), 3.0 - (3.0 - 2.0)]
    accumulator.reset()
    self.assertEqual(accumulator.step, 0)
    self.assertAllEqual(accumulator._gradients[0].values[0].value(), [0.0, 0.0])
    self.assertAllEqual(accumulator._gradients[0].values[1].value(), [0.0, 0.0])
Example no. 8
    def __call__(self,
                 dataset,
                 max_step=None,
                 accum_steps=1,
                 report_steps=100,
                 save_steps=5000,
                 evaluator=None,
                 eval_steps=5000,
                 export_on_best=None):
        """Runs the training.

        Args:
          dataset: A training dataset.
          max_step: The final training step.
          accum_steps: The number of gradient accumulation steps.
          report_steps: Report status every this many steps.
          save_steps: Save a checkpoint every this many steps.
          evaluator: A :class:`opennmt.evaluation.Evaluator` instance to call for
            evaluation.
          eval_steps: Evaluate every this many steps.
          export_on_best: Export a SavedModel when this evaluation metric has the
            best value so far.
        """
        if (max_step is not None
                and self._optimizer.iterations.numpy() >= max_step):
            tf.get_logger().warning(
                "Model already reached max_step = %d. Exiting.", max_step)
            return
        if evaluator is not None and evaluator.should_stop():
            tf.get_logger().warning(
                "Early stopping conditions are already met. Exiting.")
            return

        with self._strategy.scope():
            self._model.create_variables(optimizer=self._optimizer)
            variables = self._model.trainable_variables
            base_dataset = dataset
            # We prefer not to use experimental_distribute_dataset here because it
            # sometimes fails to split the batches (noticed with tokens batch type).
            # We also assume for now that we are training with a single worker
            # otherwise we would need to correctly shard the input dataset.
            dataset = self._strategy.experimental_distribute_datasets_from_function(
                lambda _: base_dataset)
            gradient_accumulator = optimizer_util.GradientAccumulator()

        if self._mixed_precision:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                self._optimizer, "dynamic")
        else:
            optimizer = self._optimizer

        def _accumulate_gradients(source, target):
            outputs, _ = self._model(source,
                                     labels=target,
                                     training=True,
                                     step=self._optimizer.iterations)
            loss = self._model.compute_loss(outputs, target, training=True)
            if isinstance(loss, tuple):
                training_loss = loss[0] / loss[1]
                reported_loss = loss[0] / loss[2]
            else:
                training_loss, reported_loss = loss, loss
            training_loss = self._model.regularize_loss(training_loss,
                                                        variables=variables)
            gradients = optimizer.get_gradients(training_loss, variables)
            gradient_accumulator(gradients)
            tf.summary.scalar("gradients/global_norm",
                              tf.linalg.global_norm(gradients))
            num_words = {}
            if "length" in source:
                num_words["source"] = tf.reduce_sum(source["length"])
            if "length" in target:
                num_words["target"] = tf.reduce_sum(target["length"])
            return reported_loss, num_words

        def _apply_gradients():
            grads_and_vars = []
            for gradient, variable in zip(gradient_accumulator.gradients,
                                          variables):
                # optimizer.apply_gradients will sum the gradients across replicas.
                scaled_gradient = gradient / (
                    self._strategy.num_replicas_in_sync * accum_steps)
                grads_and_vars.append((scaled_gradient, variable))
            optimizer.apply_gradients(grads_and_vars)
            gradient_accumulator.reset()

        @dataset_util.function_on_next(dataset)
        def _forward(next_fn):
            tf.summary.experimental.set_step(self._optimizer.iterations)
            should_record_summaries = tf.logical_and(
                tf.equal(self._optimizer.iterations % report_steps, 0),
                tf.equal(gradient_accumulator.step, 0))
            with tf.summary.record_if(should_record_summaries):
                with self._strategy.scope():
                    per_replica_source, per_replica_target = next_fn()

                    def _run():
                        per_replica_loss, per_replica_words = self._strategy.experimental_run_v2(
                            _accumulate_gradients,
                            args=(per_replica_source, per_replica_target))

                        # TODO: these reductions could be delayed until _step is called.
                        loss = self._strategy.reduce(
                            tf.distribute.ReduceOp.MEAN, per_replica_loss,
                            None)
                        num_words = {
                            k:
                            self._strategy.reduce(tf.distribute.ReduceOp.SUM,
                                                  v, None)
                            for k, v in six.iteritems(per_replica_words)
                        }
                        return loss, num_words, False

                    def _skip():
                        loss = tf.constant(0, dtype=tf.float32)
                        num_words = {}
                        if "length" in per_replica_source:
                            num_words["source"] = tf.constant(0,
                                                              dtype=tf.int32)
                        if "length" in per_replica_target:
                            num_words["target"] = tf.constant(0,
                                                              dtype=tf.int32)
                        return loss, num_words, True

                    # We verify here that each replica receives a non-empty batch. If not,
                    # we skip this iteration. This typically happens at the last iteration
                    # when training on a finite dataset.
                    # TODO: is there a simpler way to handle this case?
                    per_replica_non_empty_batch = self._strategy.experimental_run_v2(
                        lambda tensor: tf.math.count_nonzero(
                            tf.shape(tensor)[0]),
                        args=(tf.nest.flatten(per_replica_source)[0], ))
                    non_empty_batch_count = self._strategy.reduce(
                        tf.distribute.ReduceOp.SUM,
                        per_replica_non_empty_batch, None)
                    return tf.cond(tf.math.equal(
                        non_empty_batch_count,
                        self._strategy.num_replicas_in_sync),
                                   true_fn=_run,
                                   false_fn=_skip)

        @tf.function
        def _step():
            with self._strategy.scope():
                self._strategy.experimental_run_v2(_apply_gradients)

        accum_num_words = collections.defaultdict(int)
        last_report_time = time.time()
        last_step = 0

        with self._summary_writer.as_default():
            if self._optimizer.iterations.numpy() == 0:
                self._checkpoint.save(0)
            self._model.visualize(self._checkpoint.model_dir)

            for i, (loss, num_words, skipped) in enumerate(_forward()):  # pylint: disable=no-value-for-parameter
                if skipped:
                    # We assume only the last partial batch can possibly be skipped.
                    tf.get_logger().warning(
                        "Batch %d is partial, i.e. some training replicas "
                        "received an empty batch as input. Skipping.", i + 1)
                    break
                if tf.math.is_nan(loss):
                    raise RuntimeError("Model diverged with loss = NaN.")
                if i == 0 or (i + 1) % accum_steps == 0:
                    _step()

                for key, value in six.iteritems(num_words):
                    accum_num_words[key] += value.numpy()
                step = self._optimizer.iterations.numpy()
                if step == last_step:
                    continue  # Do not process same step twice.
                last_step = step
                if step % report_steps == 0:
                    last_report_time = _report_training_status(
                        step, loss, self._optimizer.learning_rate,
                        accum_num_words, last_report_time)
                if save_steps is not None and step % save_steps == 0:
                    self._checkpoint.save(step)
                if evaluator is not None and eval_steps is not None and step % eval_steps == 0:
                    self._evaluate(evaluator,
                                   step,
                                   export_on_best=export_on_best)
                    if evaluator.should_stop():
                        tf.get_logger().warning(
                            "Early stopping conditions are met. Exiting.")
                        break
                if step == max_step:
                    break

        if evaluator is not None and step != evaluator.last_evaluated_step:
            self._evaluate(evaluator, step, export_on_best=export_on_best)
        self._checkpoint.save(step)
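
A usage sketch of this training entry point. The dataset construction call follows the usual OpenNMT-tf inputter API, but the file names, batch settings, and step counts are assumptions for illustration; trainer stands for an instance of the class this method belongs to:

    # Illustrative values only; "train.src"/"train.tgt" are assumed paths.
    dataset = model.examples_inputter.make_training_dataset(
        "train.src", "train.tgt", batch_size=3072, batch_type="tokens")
    trainer(
        dataset,
        max_step=500000,
        accum_steps=2,       # apply accumulated gradients every 2 batches
        report_steps=100,
        save_steps=5000)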