Example #1
 def test_export_best_ckpt(self, distribution):
   config = cfg.ExperimentConfig(
       trainer=cfg.TrainerConfig(
           best_checkpoint_export_subdir='best_ckpt',
           best_checkpoint_eval_metric='acc',
           optimizer_config=cfg.OptimizationConfig({
               'optimizer': {
                   'type': 'sgd'
               },
               'learning_rate': {
                   'type': 'constant'
               }
           })))
   model_dir = self.get_temp_dir()
   task = mock_task.MockTask(config.task, logging_dir=model_dir)
   ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
   trainer = trainer_lib.Trainer(
       config,
       task,
       model=task.build_model(),
       checkpoint_exporter=ckpt_exporter)
   trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
   trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
   self.assertTrue(
       tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json')))
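This test turns on the best-checkpoint exporter through `best_checkpoint_export_subdir` and `best_checkpoint_eval_metric`, runs one train and one evaluate step, and then checks that the exporter has written its `info.json` metadata file under the `best_ckpt` subdirectory of the model directory.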
Example #2
    def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
        config = cfg.ExperimentConfig(
            runtime=cfg.RuntimeConfig(
                mixed_precision_dtype=mixed_precision_dtype,
                loss_scale=loss_scale),
            trainer=cfg.TrainerConfig(optimizer_config=cfg.OptimizationConfig({
                'optimizer': {
                    'type': 'sgd'
                },
                'learning_rate': {
                    'type': 'constant'
                }
            })))
        task = mock_task.MockTask()
        trainer = trainer_lib.Trainer(config, task)
        if mixed_precision_dtype != 'float16':
            self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
        elif mixed_precision_dtype == 'float16' and loss_scale is None:
            self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
        else:
            self.assertIsInstance(
                trainer.optimizer,
                tf.keras.mixed_precision.experimental.LossScaleOptimizer)

        metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
        self.assertIn('training_loss', metrics)
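The assertions capture how the trainer wraps its optimizer under mixed precision: with a `mixed_precision_dtype` other than 'float16', or with 'float16' but no `loss_scale`, the optimizer stays a plain `tf.keras.optimizers.SGD`; otherwise it is wrapped in a `LossScaleOptimizer`.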
Example #3
def create_trainer(params, task, model_dir, train, evaluate):
    del model_dir
    logging.info('Running default trainer.')
    trainer = base_trainer.Trainer(params,
                                   task,
                                   train=train,
                                   evaluate=evaluate)
    return trainer
Example #4
 def create_test_trainer(self, config, model_dir=None):
   task = mock_task.MockTask(config.task, logging_dir=model_dir)
   ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
   trainer = trainer_lib.Trainer(
       config,
       task,
       model=task.build_model(),
       optimizer=trainer_lib.create_optimizer(config.trainer, config.runtime),
       checkpoint_exporter=ckpt_exporter)
   return trainer
    def test_recovery(self):
        config = cfg.ExperimentConfig(
            trainer=cfg.TrainerConfig(loss_upper_bound=0.5,
                                      recovery_max_trials=2,
                                      optimizer_config=cfg.OptimizationConfig({
                                          'optimizer': {
                                              'type': 'sgd'
                                          },
                                          'learning_rate': {
                                              'type': 'constant'
                                          }
                                      })))
        model_dir = self.get_temp_dir()
        trainer = self.create_test_trainer(config, model_dir=model_dir)
        checkpoint_manager = tf.train.CheckpointManager(trainer.checkpoint,
                                                        self.get_temp_dir(),
                                                        max_to_keep=2)
        checkpoint_manager.save()
        trainer.add_recovery(config.trainer,
                             checkpoint_manager=checkpoint_manager)
        before_weights = trainer.model.get_weights()
        _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
        # The training loss is 1.0 and loss_upper_bound is 0.5, so recovery kicks in.
        after_weights = trainer.model.get_weights()
        for left, right in zip(before_weights, after_weights):
            self.assertAllEqual(left, right)

        # Let the loss be NaN and recovery_max_trials = 0 to trigger a RuntimeError.
        config = cfg.ExperimentConfig(
            trainer=cfg.TrainerConfig(recovery_max_trials=0,
                                      optimizer_config=cfg.OptimizationConfig({
                                          'optimizer': {
                                              'type': 'sgd'
                                          },
                                          'learning_rate': {
                                              'type': 'constant'
                                          }
                                      })))
        task = mock_task.MockTask(config.task, logging_dir=model_dir)

        def build_losses(labels, model_outputs, aux_losses=None):
            del labels, model_outputs
            return tf.constant([np.nan], tf.float32) + aux_losses

        task.build_losses = build_losses
        trainer = trainer_lib.Trainer(config,
                                      task,
                                      model=task.build_model(),
                                      optimizer=task.create_optimizer(
                                          config.trainer.optimizer_config,
                                          config.runtime))
        trainer.add_recovery(config.trainer,
                             checkpoint_manager=checkpoint_manager)
        with self.assertRaises(RuntimeError):
            _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))
 def test_model_with_compiled_loss(self):
   task = mock_task.MockTask()
   model = task.build_model()
   model.compile(loss=tf.keras.losses.CategoricalCrossentropy())
   trainer = trainer_lib.Trainer(
       self._config,
       task,
       model=model,
       optimizer=task.create_optimizer(self._config.trainer.optimizer_config))
   logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
   self.assertIn('training_loss', logs)
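The recovery test above exercises two `TrainerConfig` knobs: with `loss_upper_bound=0.5` and a mock training loss of 1.0, the trainer restores the model weights from the last checkpoint, so the weights before and after the step compare equal; with `recovery_max_trials=0`, a NaN loss makes `train()` raise a `RuntimeError`. The last test shows that a model compiled with its own Keras loss (here `CategoricalCrossentropy`) can also be handed to the `Trainer`.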
Example #7
def create_trainer(
    params: config_definitions.ExperimentConfig,
    task: base_task.Task,
    model_dir: str,
    train: bool,
    evaluate: bool,
    checkpoint_exporter: Any = None):
  """Create trainer."""
  del model_dir
  logging.info('Running default trainer.')
  trainer = base_trainer.Trainer(
      params, task, train=train, evaluate=evaluate,
      checkpoint_exporter=checkpoint_exporter)
  return trainer
Example #8
def create_trainer(params: config_definitions.ExperimentConfig,
                   task: base_task.Task,
                   train: bool,
                   evaluate: bool,
                   checkpoint_exporter: Any = None) -> base_trainer.Trainer:
  """Create trainer."""
  logging.info('Running default trainer.')
  model = task.build_model()
  optimizer = base_trainer.create_optimizer(params.trainer, params.runtime)
  trainer = base_trainer.Trainer(
      params,
      task,
      model=model,
      optimizer=optimizer,
      train=train,
      evaluate=evaluate,
      checkpoint_exporter=checkpoint_exporter)
  return trainer
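As a usage illustration only (the `experiment_config` and `my_task` names below are hypothetical placeholders, not part of the snippet), a trainer built this way is typically driven for a fixed number of steps expressed as an int32 tensor, mirroring the tests above:

import tensorflow as tf

# Hypothetical inputs: experiment_config is a config_definitions.ExperimentConfig
# and my_task is a base_task.Task subclass, both constructed elsewhere.
trainer = create_trainer(
    params=experiment_config,
    task=my_task,
    train=True,
    evaluate=True)

# Train for 100 steps and evaluate for 10 steps; both calls return a logs dict.
train_logs = trainer.train(tf.convert_to_tensor(100, dtype=tf.int32))
eval_logs = trainer.evaluate(tf.convert_to_tensor(10, dtype=tf.int32))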
Example #9
 def test_trainer_passing_datasets(self, distribution):
   with distribution.scope():
     task = mock_task.MockTask(self._config)
     train_dataset = orbit.utils.make_distributed_dataset(
         distribution, task.build_inputs, self._config.task.train_data)
     validation_dataset = orbit.utils.make_distributed_dataset(
         distribution, task.build_inputs, self._config.task.validation_data)
     self._config.task.train_data = None
     self._config.task.validation_data = None
     trainer = trainer_lib.Trainer(
         self._config,
         task,
         model=task.build_model(),
         optimizer=task.create_optimizer(self._config.trainer.optimizer_config,
                                         self._config.runtime),
         train_dataset=train_dataset,
         validation_dataset=validation_dataset)
   logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
   self.assertIn('training_loss', logs)
   self.assertIn('learning_rate', logs)
   logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
   self.assertIn('validation_loss', logs)
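Here the distributed datasets are built up front with `orbit.utils.make_distributed_dataset` and passed to the `Trainer` through `train_dataset` and `validation_dataset`, while the task's `train_data` and `validation_data` configs are set to `None` so the trainer uses the datasets it was given.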
Example #10
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: List[base_task.Task],
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[core_lib.Trainer] = None) -> Tuple[tf.keras.Model, Any]:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy.
    train_task: A base_task.Task instance.
    eval_tasks: A list of evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run a post-training evaluation pass once; its
      metrics logs are returned alongside the model.
    save_summary: Whether to save train and validation summary.
    trainer: An optional core_lib.Trainer instance. It should be created
      within the strategy.scope(). If not provided, one is created by default
      when `mode` contains 'train'.

  Returns:
    A tuple of the `tf.keras.Model` instance and the post-eval metrics logs
    (an empty dict when `run_post_eval` is False).
  """

  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    if is_training:
      trainer = trainer or core_lib.Trainer(
          config=params,
          task=train_task,
          model=train_task.build_model(),
          optimizer=train_task.create_optimizer(params.trainer.optimizer_config,
                                                params.runtime),
          train=True,
          evaluate=False)
    else:
      trainer = None
    model = trainer.model if trainer else train_task.build_model()

    if is_eval:
      eval_steps = {
          task_routine.task_config.name: task_routine.eval_steps
          for task_routine in params.eval_tasks
      }
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None,
          eval_steps=eval_steps,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None

  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step

  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
      eval_summary_dir=(os.path.join(model_dir, 'validation')
                        if save_summary else None),
      summary_interval=(params.trainer.summary_interval
                        if save_summary else None))

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        return evaluator.global_step.numpy() >= params.trainer.train_steps

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)

    if run_post_eval:
      return model, evaluator.evaluate(
          tf.convert_to_tensor(params.trainer.validation_steps))
    else:
      return model, {}
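A minimal call sketch, assuming the tasks and the `MultiEvalExperimentConfig` have already been built; every name below (`my_train_task`, `my_eval_tasks`, `my_params`) is an illustrative placeholder rather than something defined in the example:

import tensorflow as tf

# Illustrative placeholders: my_train_task is a base_task.Task, my_eval_tasks is
# a list of base_task.Task instances, and my_params is a
# configs.MultiEvalExperimentConfig, all constructed elsewhere.
strategy = tf.distribute.MirroredStrategy()
model, eval_logs = run_experiment_with_multitask_eval(
    distribution_strategy=strategy,
    train_task=my_train_task,
    eval_tasks=my_eval_tasks,
    mode='train_and_eval',
    params=my_params,
    model_dir='/tmp/multitask_experiment',
    run_post_eval=True)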
Example #11
 def create_test_trainer(self):
     task = mock_task.MockTask()
     trainer = trainer_lib.Trainer(self._config, task)
     return trainer
Example #12
def run_experiment_with_multitask_eval(
        *, distribution_strategy: tf.distribute.Strategy,
        train_task: base_task.Task, eval_tasks: multitask.MultiTask, mode: str,
        params: configs.MultiEvalExperimentConfig,
        model_dir: str) -> tf.keras.Model:
    """Runs train/eval configured by the experiment params.

    Args:
      distribution_strategy: A distribution strategy.
      train_task: A base_task.Task instance.
      eval_tasks: A multitask.MultiTask with evaluation tasks.
      mode: A 'str', specifying the mode. Can be 'train', 'eval',
        'train_and_eval' or 'continuous_eval'.
      params: MultiEvalExperimentConfig instance.
      model_dir: A 'str', a path to store model checkpoints and summaries.

    Returns:
      model: `tf.keras.Model` instance.
    """

    is_training = 'train' in mode
    is_eval = 'eval' in mode
    with distribution_strategy.scope():
        optimizer = train_task.create_optimizer(params.trainer, params.runtime)
        model = train_task.build_model()
        if is_training:
            trainer = core_lib.Trainer(config=params,
                                       task=train_task,
                                       model=model,
                                       optimizer=optimizer,
                                       train=True,
                                       evaluate=False)
        else:
            trainer = None
        if is_eval:
            evaluator = evaluator_lib.MultiTaskEvaluator(
                task=eval_tasks,
                model=model,
                global_step=trainer.global_step if is_training else None)
        else:
            evaluator = None

    if trainer:
        checkpoint = trainer.checkpoint
        global_step = trainer.global_step
    else:
        checkpoint = evaluator.checkpoint
        global_step = evaluator.global_step

    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint,
        directory=model_dir,
        max_to_keep=params.trainer.max_to_keep,
        step_counter=global_step,
        checkpoint_interval=params.trainer.checkpoint_interval,
        init_fn=trainer.initialize if trainer else None)

    controller = orbit.Controller(
        strategy=distribution_strategy,
        trainer=trainer,
        evaluator=evaluator,
        global_step=global_step,
        steps_per_loop=params.trainer.steps_per_loop,
        checkpoint_manager=checkpoint_manager,
        summary_dir=os.path.join(model_dir, 'train'),
        eval_summary_dir=os.path.join(model_dir, 'validation'),
        summary_interval=params.trainer.summary_interval)

    logging.info('Starts to execute mode: %s', mode)
    with distribution_strategy.scope():
        if mode == 'train':
            controller.train(steps=params.trainer.train_steps)
        elif mode == 'train_and_eval':
            controller.train_and_evaluate(
                train_steps=params.trainer.train_steps,
                eval_steps=params.trainer.validation_steps,
                eval_interval=params.trainer.validation_interval)
        elif mode == 'eval':
            controller.evaluate(steps=params.trainer.validation_steps)
        elif mode == 'continuous_eval':

            def timeout_fn():
                return (evaluator.global_step.numpy() >=
                        params.trainer.train_steps)

            controller.evaluate_continuously(
                steps=params.trainer.validation_steps,
                timeout=params.trainer.continuous_eval_timeout,
                timeout_fn=timeout_fn)
        else:
            raise NotImplementedError('The mode is not implemented: %s' % mode)

        return model
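Compared with Example #10, this variant takes the evaluation tasks as a single `multitask.MultiTask`, builds the model and optimizer outside the trainer/evaluator branches, always writes summaries, does not attach a best-checkpoint exporter, and returns only the model.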
Example #13
 def create_test_trainer(self, config):
   task = mock_task.MockTask()
   trainer = trainer_lib.Trainer(config, task, model=task.build_model())
   return trainer
Example #14
  Returns:
      model: `tf.keras.Model` instance.
  """

  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    optimizer = train_task.create_optimizer(params.trainer.optimizer_config,
                                            params.runtime)
    model = train_task.build_model()
    if is_training:
      trainer = core_lib.Trainer(
          config=params,
          task=train_task,
          model=model,
          optimizer=optimizer,
          train=True,
          evaluate=False)
    else:
      trainer = None
    if is_eval:
      evaluator = evaluator_lib.MultiTaskEvaluator(
          task=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))