Code example #1
    def __init__(self, checkpoint, devices=None, mixed_precision=False):
        """Initializes the trainer.

        Args:
          checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance.
          devices: List of device strings to use for training.
          mixed_precision: Whether mixed precision is enabled or not.
        """
        if not devices:
            devices = misc.get_devices(count=1)  # Train with 1 device by default.
        self._checkpoint = checkpoint
        self._mixed_precision = mixed_precision
        self._model = checkpoint.model
        self._strategy = tf.distribute.MirroredStrategy(devices=devices)
        self._summary_writer = tf.summary.create_file_writer(
            checkpoint.model_dir)

        optimizer = checkpoint.optimizer
        if optimizer is None:
            raise ValueError("No optimizer is defined")
        if mixed_precision:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, "dynamic")
        self._optimizer = optimizer

        with self._strategy.scope():
            # Create some variables under the strategy scope.
            _ = self._optimizer.iterations
            self._model.create_variables()
            self._gradient_accumulator = optimizer_util.GradientAccumulator()
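A minimal construction sketch for this trainer, to show how the pieces above fit together. The enclosing class name (`Trainer` here), `config`, `model`, and `optimizer` are placeholders for illustration; only `Checkpoint.from_config` and `misc.get_devices` appear in the examples on this page.

    # Hypothetical usage; "Trainer" stands for the class owning the __init__ above.
    checkpoint = checkpoint_util.Checkpoint.from_config(config, model, optimizer=optimizer)
    trainer = Trainer(
        checkpoint,
        devices=misc.get_devices(count=2),  # override the 1-device default
        mixed_precision=True,  # the optimizer gets wrapped in a LossScaleOptimizer
    )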
Code example #2
  def __init__(self, checkpoint, devices=None):
    """Initializes the trainer.

    Args:
      checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance.
      devices: List of device strings to use for training.
    """
    super(DistributionStrategyTrainer, self).__init__(checkpoint)
    if not devices:
      devices = misc.get_devices(count=1)  # Train with 1 device by default.
    self._strategy = tf.distribute.MirroredStrategy(devices=devices)
    self._words_counters = {}
    with self._strategy.scope():
      # Create some variables under the strategy scope.
      _ = self._optimizer.iterations
      self._gradient_accumulator = optimizer_util.GradientAccumulator()
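The `GradientAccumulator` created under the strategy scope is what backs the `accum_steps` option used in the `train` examples further down. A rough sketch of the accumulate-then-apply pattern it enables (the `gradients` property, `reset()` method, and the `compute_gradients` helper are assumptions for illustration, not taken from these examples):

    # Pseudo training loop showing gradient accumulation over accum_steps batches.
    for step, batch in enumerate(batches):
        gradients = compute_gradients(model, batch)  # placeholder for the real step
        accumulator(gradients)  # add this batch's gradients to the running sum
        if (step + 1) % accum_steps == 0:
            optimizer.apply_gradients(
                zip(accumulator.gradients, model.trainable_variables))
            accumulator.reset()  # start a new accumulation window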
Code example #3
    def __init__(self, model, optimizer, checkpoint=None, devices=None):
        """Initializes the trainer.

        Args:
          model: The model to train.
          optimizer: The optimizer to use for training.
          checkpoint: An optional :class:`opennmt.utils.checkpoint.Checkpoint` instance.
          devices: List of device strings to use for training.
        """
        super(DistributionStrategyTrainer, self).__init__(
            model, optimizer, checkpoint=checkpoint)
        if not devices:
            devices = misc.get_devices(count=1)  # Train with 1 device by default.
        self._strategy = tf.distribute.MirroredStrategy(devices=devices)
        with self._strategy.scope():
            # Create some variables under the strategy scope.
            _ = self._optimizer.iterations
Code example #4
    def __init__(self, checkpoint, devices=None, mixed_precision=False):
        """Initializes the trainer.

        Args:
          checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance.
          devices: List of device strings to use for training.
          mixed_precision: Whether mixed precision is enabled or not.
        """
        if checkpoint.optimizer is None:
            raise ValueError("No optimizer is defined")
        if not devices:
            devices = misc.get_devices(count=1)  # Train with 1 device by default.
        self._checkpoint = checkpoint
        self._mixed_precision = mixed_precision
        self._model = checkpoint.model
        self._optimizer = checkpoint.optimizer
        self._strategy = tf.distribute.MirroredStrategy(devices=devices)
        self._summary_writer = tf.summary.create_file_writer(
            checkpoint.model_dir)
Code example #5
    def train(self, num_devices=1, with_eval=False, checkpoint_path=None):
        """Runs the training loop.

        Args:
          num_devices: Number of devices to use for training.
          with_eval: Enable evaluation during training.
          checkpoint_path: The checkpoint path to load the model weights from.

        Returns:
          The path to the final model directory.
        """
        checkpoint, config = self._init_run(num_devices=num_devices,
                                            training=True)
        # load teacher model.
        teacher_checkpoint_path = config["model"]["teacher"].get(
            "teacher_checkpoint_path", None)
        if not teacher_checkpoint_path:
            raise ValueError("teacher_checkpoint_path is None.")
        teacher_checkpoint = tf.train.Checkpoint(
            model=checkpoint.model.teacher_model)
        status = teacher_checkpoint.restore(
            tf.train.latest_checkpoint(teacher_checkpoint_path))
        # load student model.
        student_checkpoint_path = config["model"]["student"].get(
            "student_checkpoint_path", None)
        if student_checkpoint_path:
            student_checkpoint = tf.train.Checkpoint(
                model=checkpoint.model.student_model)
            student_checkpoint.restore(
                tf.train.latest_checkpoint(student_checkpoint_path))
        status = checkpoint.restore(checkpoint_path=checkpoint_path,
                                    weights_only=checkpoint_path is not None)

        model = checkpoint.model
        data_config = config["data"]
        train_config = config["train"]
        eval_config = config["eval"]

        batch_type = train_config["batch_type"]
        if batch_type == "tokens" and self._mixed_precision:
            batch_size_multiple = 8
        else:
            batch_size_multiple = 1

        dataset = model.student_model.examples_inputter.make_training_dataset(
            data_config["train_features_file"],
            data_config.get("train_labels_file"),
            train_config["batch_size"],
            batch_type=batch_type,
            batch_size_multiple=batch_size_multiple,
            shuffle_buffer_size=train_config["sample_buffer_size"],
            length_bucket_width=train_config["length_bucket_width"],
            maximum_features_length=train_config.get(
                "maximum_features_length"),
            maximum_labels_length=train_config.get("maximum_labels_length"),
            single_pass=train_config.get("single_pass", False),
            prefetch_buffer_size=train_config.get("prefetch_buffer_size"))
        # todo
        if with_eval:
            evaluator = evaluation.Evaluator.from_config(model, config)
        else:
            evaluator = None

        # Set gradients accumulation based on the requested effective batch size.
        if train_config.get("effective_batch_size") is not None:
            accum_steps = _count_batch_accum(
                train_config["batch_size"],
                train_config["effective_batch_size"],
                num_replicas=num_devices)
            tf.get_logger().info(
                "Accumulate gradients of %d iterations to reach effective batch size of %d",
                accum_steps, train_config["effective_batch_size"])
        else:
            accum_steps = 1

        trainer = training_util.DistributionStrategyTrainer(
            checkpoint, devices=misc.get_devices(count=num_devices))
        trainer(dataset,
                max_step=train_config.get("max_step"),
                accum_steps=accum_steps,
                report_steps=train_config.get("save_summary_steps", 100),
                save_steps=train_config.get("save_checkpoints_steps", 5000),
                evaluator=evaluator,
                eval_steps=eval_config.get("steps", 5000),
                export_on_best=eval_config.get("export_on_best"))
        average_last_checkpoints = train_config.get("average_last_checkpoints",
                                                    0)
        if average_last_checkpoints > 0:
            return self.average_checkpoints(
                os.path.join(checkpoint.model_dir, "avg"),
                max_count=average_last_checkpoints)
        return checkpoint.model_dir
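The `_count_batch_accum` helper is referenced but not shown in these examples. Given how it is called (per-replica batch size, requested effective batch size, number of replicas), a plausible implementation is a simple ceiling division:

    import math

    def _count_batch_accum(batch_size, target_batch_size, num_replicas=1):
        # Accumulation steps needed so that
        # batch_size * num_replicas * accum_steps >= target_batch_size.
        return int(math.ceil(float(target_batch_size) / (batch_size * num_replicas)))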
Code example #6
    def train(
        self,
        num_devices=1,
        with_eval=False,
        checkpoint_path=None,
        hvd=None,
        return_summary=False,
        fallback_to_cpu=True,
    ):
        """Runs the training loop.

        Args:
          num_devices: Number of devices to use for training.
          with_eval: Enable evaluation during training.
          checkpoint_path: The checkpoint path to load the model weights from.
          hvd: Optional Horovod module.
          return_summary: Return a summary of the training from this function.
          fallback_to_cpu: If no GPU is detected, allow the training to run on CPU.

        Returns:
          The path to the final model directory and, if :obj:`return_summary` is set,
          a dictionary with various training statistics.
        """
        if hvd is None:
            num_replicas = num_devices
            is_master = True
        else:
            if num_devices > 1:
                raise ValueError(
                    "num_devices (or num_gpus) should be set to 1 when using Horovod"
                )
            num_replicas = hvd.size()
            is_master = hvd.rank() == 0

        devices = misc.get_devices(count=num_devices,
                                   fallback_to_cpu=fallback_to_cpu)

        config = self._finalize_config(training=True,
                                       num_replicas=num_replicas,
                                       num_devices=num_devices)

        mixed_precision = self._mixed_precision and misc.enable_mixed_precision()
        model = self._init_model(config)
        optimizer = model.get_optimizer()

        data_config = config["data"]
        train_config = config["train"]
        eval_config = config["eval"]

        batch_type = train_config["batch_type"]
        batch_size_multiple = 8 if mixed_precision and batch_type == "tokens" else 1

        dataset_fn = (
            lambda input_context: model.examples_inputter.make_training_dataset(
                data_config["train_features_file"],
                data_config.get("train_labels_file"),
                train_config["batch_size"],
                batch_type=batch_type,
                batch_size_multiple=batch_size_multiple,
                shuffle_buffer_size=train_config["sample_buffer_size"],
                length_bucket_width=train_config["length_bucket_width"],
                maximum_features_length=train_config.get("maximum_features_length"),
                maximum_labels_length=train_config.get("maximum_labels_length"),
                single_pass=train_config.get("single_pass", False),
                num_shards=input_context.num_input_pipelines,
                shard_index=input_context.input_pipeline_id,
                prefetch_buffer_size=train_config.get("prefetch_buffer_size"),
                cardinality_multiple=input_context.num_replicas_in_sync,
                weights=data_config.get("train_files_weights"),
                batch_autotune_mode=train_config.get("batch_autotune_mode"),
            ))

        checkpoint = None
        evaluator = None
        if is_master:
            checkpoint = checkpoint_util.Checkpoint.from_config(
                config, model, optimizer=optimizer)
            checkpoint.restore(
                checkpoint_path=checkpoint_path,
                weights_only=checkpoint_path is not None,
            )
            if with_eval:
                evaluator = evaluation.Evaluator.from_config(model, config)

        # Set gradients accumulation based on the requested effective batch size.
        if train_config.get("effective_batch_size") is not None:
            accum_steps = _count_batch_accum(
                train_config["batch_size"],
                train_config["effective_batch_size"],
                num_replicas=num_replicas,
            )
            tf.get_logger().info(
                "Accumulate gradients of %d iterations to reach effective batch size of %d",
                accum_steps,
                train_config["effective_batch_size"],
            )
        else:
            accum_steps = 1

        if hvd is not None:
            trainer = training_util.HorovodTrainer(model,
                                                   optimizer,
                                                   hvd,
                                                   checkpoint=checkpoint)
        elif num_devices > 1:
            trainer = training_util.MirroredStrategyTrainer(
                model, optimizer, checkpoint=checkpoint, devices=devices)
        else:
            trainer = training_util.Trainer(model,
                                            optimizer,
                                            checkpoint=checkpoint)

        summary = trainer(
            dataset_fn,
            max_step=train_config.get("max_step"),
            accum_steps=accum_steps,
            report_steps=train_config.get("save_summary_steps", 100),
            save_steps=train_config.get("save_checkpoints_steps", 5000),
            evaluator=evaluator,
            eval_steps=eval_config.get("steps", 5000),
            moving_average_decay=train_config.get("moving_average_decay"),
        )

        average_last_checkpoints = train_config.get("average_last_checkpoints",
                                                    0)
        if checkpoint is None:
            output_dir = None
        elif average_last_checkpoints > 0:
            output_dir = self.average_checkpoints(
                os.path.join(checkpoint.model_dir, "avg"),
                max_count=average_last_checkpoints,
            )
        else:
            output_dir = checkpoint.model_dir

        if mixed_precision:
            misc.disable_mixed_precision()

        if return_summary:
            return output_dir, summary
        return output_dir
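A usage sketch for this `train` method, assuming the surrounding class is the OpenNMT-tf `Runner` and that `model` and `config` were built elsewhere; the constructor arguments shown here are assumptions for illustration:

    runner = Runner(model, config, auto_config=True, mixed_precision=True)
    output_dir, summary = runner.train(
        num_devices=2,       # selects MirroredStrategyTrainer internally
        with_eval=True,      # builds an Evaluator from the "eval" config section
        return_summary=True,
    )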
Code example #7
    def train(self, num_devices=1, with_eval=False, checkpoint_path=None):
        """Runs the training loop.

        Args:
          num_devices: Number of devices to use for training.
          with_eval: Enable evaluation during training.
          checkpoint_path: The checkpoint path to load the model weights from.

        Returns:
          The path to the final model directory.
        """
        devices = misc.get_devices(count=num_devices)
        checkpoint, config = self._init_run(num_devices=num_devices,
                                            training=True)
        checkpoint.restore(checkpoint_path=checkpoint_path,
                           weights_only=checkpoint_path is not None)

        model = checkpoint.model
        data_config = config["data"]
        train_config = config["train"]
        eval_config = config["eval"]

        batch_type = train_config["batch_type"]
        if batch_type == "tokens" and self._mixed_precision:
            batch_size_multiple = 8
        else:
            batch_size_multiple = 1

        dataset_fn = lambda input_context: model.examples_inputter.make_training_dataset(
            data_config["train_features_file"],
            data_config.get("train_labels_file"),
            train_config["batch_size"],
            batch_type=batch_type,
            batch_size_multiple=batch_size_multiple,
            shuffle_buffer_size=train_config["sample_buffer_size"],
            length_bucket_width=train_config["length_bucket_width"],
            maximum_features_length=train_config.get("maximum_features_length"),
            maximum_labels_length=train_config.get("maximum_labels_length"),
            single_pass=train_config.get("single_pass", False),
            num_shards=input_context.num_input_pipelines,
            shard_index=input_context.input_pipeline_id,
            prefetch_buffer_size=train_config.get("prefetch_buffer_size"),
            cardinality_multiple=input_context.num_replicas_in_sync,
            weights=data_config.get("train_files_weights"))

        if with_eval:
            evaluator = evaluation.Evaluator.from_config(model, config)
        else:
            evaluator = None

        # Set gradients accumulation based on the requested effective batch size.
        if train_config.get("effective_batch_size") is not None:
            accum_steps = _count_batch_accum(
                train_config["batch_size"],
                train_config["effective_batch_size"],
                num_replicas=num_devices)
            tf.get_logger().info(
                "Accumulate gradients of %d iterations to reach effective batch size of %d",
                accum_steps, train_config["effective_batch_size"])
        else:
            accum_steps = 1

        trainer = training_util.DistributionStrategyTrainer(checkpoint,
                                                            devices=devices)
        trainer(dataset_fn,
                max_step=train_config.get("max_step"),
                accum_steps=accum_steps,
                report_steps=train_config.get("save_summary_steps", 100),
                save_steps=train_config.get("save_checkpoints_steps", 5000),
                evaluator=evaluator,
                eval_steps=eval_config.get("steps", 5000),
                moving_average_decay=train_config.get("moving_average_decay"))
        average_last_checkpoints = train_config.get("average_last_checkpoints",
                                                    0)
        if average_last_checkpoints > 0:
            return self.average_checkpoints(
                os.path.join(checkpoint.model_dir, "avg"),
                max_count=average_last_checkpoints)
        return checkpoint.model_dir
Code example #8
File: runner.py  Project: danielinux7/OpenNMT-tf
    def train(self,
              num_devices=1,
              with_eval=False,
              checkpoint_path=None,
              hvd=None):
        """Runs the training loop.

        Args:
          num_devices: Number of devices to use for training.
          with_eval: Enable evaluation during training.
          checkpoint_path: The checkpoint path to load the model weights from.
          hvd: Optional Horovod module.

        Returns:
          The path to the final model directory.
        """
        if hvd is None:
            num_replicas = num_devices
            is_master = True
        else:
            num_replicas = hvd.size()
            is_master = hvd.rank() == 0

        config = self._finalize_config(training=True,
                                       num_replicas=num_replicas,
                                       num_devices=num_devices)
        model = self._init_model(config)
        optimizer = model.get_optimizer()

        data_config = config["data"]
        train_config = config["train"]
        eval_config = config["eval"]

        batch_type = train_config["batch_type"]
        if batch_type == "tokens" and self._mixed_precision:
            batch_size_multiple = 8
        else:
            batch_size_multiple = 1

        dataset_fn = lambda input_context: model.examples_inputter.make_training_dataset(
            data_config["train_features_file"],
            data_config.get("train_labels_file"),
            train_config["batch_size"],
            batch_type=batch_type,
            batch_size_multiple=batch_size_multiple,
            shuffle_buffer_size=train_config["sample_buffer_size"],
            length_bucket_width=train_config["length_bucket_width"],
            maximum_features_length=train_config.get("maximum_features_length"),
            maximum_labels_length=train_config.get("maximum_labels_length"),
            single_pass=train_config.get("single_pass", False),
            num_shards=input_context.num_input_pipelines,
            shard_index=input_context.input_pipeline_id,
            prefetch_buffer_size=train_config.get("prefetch_buffer_size"),
            cardinality_multiple=input_context.num_replicas_in_sync,
            weights=data_config.get("train_files_weights"))

        checkpoint = None
        evaluator = None
        if is_master:
            checkpoint = checkpoint_util.Checkpoint.from_config(
                config, model, optimizer=optimizer)
            checkpoint.restore(checkpoint_path=checkpoint_path,
                               weights_only=checkpoint_path is not None)
            if with_eval:
                evaluator = evaluation.Evaluator.from_config(model, config)

        # Set gradients accumulation based on the requested effective batch size.
        if train_config.get("effective_batch_size") is not None:
            accum_steps = _count_batch_accum(
                train_config["batch_size"],
                train_config["effective_batch_size"],
                num_replicas=num_replicas)
            tf.get_logger().info(
                "Accumulate gradients of %d iterations to reach effective batch size of %d",
                accum_steps, train_config["effective_batch_size"])
        else:
            accum_steps = 1

        if hvd is not None:
            if num_devices > 1:
                raise ValueError(
                    "num_devices (or num_gpus) should be set to 1 when using Horovod"
                )
            trainer = training_util.HorovodTrainer(model,
                                                   optimizer,
                                                   hvd,
                                                   checkpoint=checkpoint)
        elif num_devices > 1:
            devices = misc.get_devices(count=num_devices)
            if devices[0][1] == 'TPU':
                trainer = training_util.TPUStrategyTrainer(
                    model, optimizer, checkpoint=checkpoint, devices=devices)
            else:
                trainer = training_util.MirroredStrategyTrainer(
                    model, optimizer, checkpoint=checkpoint, devices=devices)
        else:
            trainer = training_util.Trainer(model,
                                            optimizer,
                                            checkpoint=checkpoint)

        trainer(dataset_fn,
                max_step=train_config.get("max_step"),
                accum_steps=accum_steps,
                report_steps=train_config.get("save_summary_steps", 100),
                save_steps=train_config.get("save_checkpoints_steps", 5000),
                evaluator=evaluator,
                eval_steps=eval_config.get("steps", 5000),
                moving_average_decay=train_config.get("moving_average_decay"))

        if checkpoint is None:
            return None
        average_last_checkpoints = train_config.get("average_last_checkpoints",
                                                    0)
        if average_last_checkpoints > 0:
            return self.average_checkpoints(
                os.path.join(checkpoint.model_dir, "avg"),
                max_count=average_last_checkpoints)
        return checkpoint.model_dir
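When the `hvd` argument is passed, each worker process is expected to have initialized Horovod before calling `train`. A minimal setup sketch (standard Horovod usage, not taken from this example; `runner` is assumed to be the object exposing `train`):

    import horovod.tensorflow as hvd

    hvd.init()  # one process per GPU, typically launched via `horovodrun -np N ...`
    runner.train(num_devices=1, hvd=hvd)  # num_devices must remain 1 with Horovod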