Example #1
0
    def __init__(self, task: bt.Task):
        """Store `task`, build its model, and let the task initialize it."""
        super().__init__()

        # Lazily-populated step functions and metrics; assigned elsewhere.
        self._train_step = None
        self._test_step = None
        self._metrics = None

        self._task = task
        self._model = task.build_model()
        task.initialize(self._model)
Example #2
0
def create_trainer(params: config_definitions.ExperimentConfig,
                   task: base_task.Task,
                   model_dir: str,
                   train: bool,
                   evaluate: bool,
                   checkpoint_exporter: Any = None) -> base_trainer.Trainer:
    """Build the default `base_trainer.Trainer` for `task`.

    `model_dir` is accepted for signature compatibility but unused here.
    """
    del model_dir  # Unused by the default trainer.
    logging.info('Running default trainer.')
    return base_trainer.Trainer(params,
                                task,
                                train=train,
                                evaluate=evaluate,
                                model=task.build_model(),
                                checkpoint_exporter=checkpoint_exporter)
Example #3
0
def create_trainer(params: config_definitions.ExperimentConfig,
                   task: base_task.Task,
                   train: bool,
                   evaluate: bool,
                   checkpoint_exporter: Optional[BestCheckpointExporter] = None,
                   trainer_cls=base_trainer.Trainer) -> base_trainer.Trainer:
  """Instantiate `trainer_cls` with a model and optimizer built by `task`."""
  logging.info('Running default trainer.')
  built_model = task.build_model()
  built_optimizer = task.create_optimizer(params.trainer, params.runtime)
  return trainer_cls(params,
                     task,
                     model=built_model,
                     optimizer=built_optimizer,
                     train=train,
                     evaluate=evaluate,
                     checkpoint_exporter=checkpoint_exporter)
Example #4
0
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: List[base_task.Task],
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[core_lib.Trainer] = None) -> tf.keras.Model:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy instance; all trainer,
      model and evaluator variables are created inside its scope.
    train_task: A base_task.Task instance used for training.
    eval_tasks: A list of evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    save_summary: Whether to save train and validation summary.
    trainer: the core_lib.Trainer instance. It should be created within the
      strategy.scope(). If not provided, an instance will be created by default
      if `mode` contains 'train'.

  Returns:
    A 2-tuple of (model, eval_logs):
      model: `tf.keras.Model` instance.
      eval_logs: eval metric logs when `run_post_eval` is True, otherwise `{}`.
        NOTE(review): the `-> tf.keras.Model` return annotation disagrees with
        the tuple actually returned below; confirm and fix the annotation.
  """

  is_training = 'train' in mode
  is_eval = 'eval' in mode
  # Trainer/evaluator construction must happen under the strategy scope so
  # their variables are distributed correctly.
  with distribution_strategy.scope():
    if is_training:
      trainer = trainer or core_lib.Trainer(
          config=params,
          task=train_task,
          model=train_task.build_model(),
          optimizer=train_task.create_optimizer(params.trainer.optimizer_config,
                                                params.runtime),
          train=True,
          evaluate=False)
    else:
      trainer = None
    # Reuse the trainer's model when training; otherwise build a fresh one.
    model = trainer.model if trainer else train_task.build_model()

    if is_eval:
      # Per-task eval step budgets, keyed by the task name from the config.
      eval_steps = dict([(task_routine.task_config.name,
                          task_routine.eval_steps)
                         for task_routine in params.eval_tasks])
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None,
          eval_steps=eval_steps,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None

  # NOTE(review): if `mode` contains neither 'train' nor 'eval', both trainer
  # and evaluator are None and the attribute access below raises.
  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step

  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
      eval_summary_dir=os.path.join(model_dir, 'validation') if
      (save_summary) else None,
      summary_interval=params.trainer.summary_interval if
      (save_summary) else None)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      # Stop continuous eval once the evaluator's step counter reaches the
      # configured number of train steps.
      def timeout_fn():
        if evaluator.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)

    if run_post_eval:
      return model, evaluator.evaluate(
          tf.convert_to_tensor(params.trainer.validation_steps))
    else:
      return model, {}
Example #5
0
def run_experiment_wtih_multitask_eval(
        *, distribution_strategy: tf.distribute.Strategy,
        train_task: base_task.Task, eval_tasks: multitask.MultiTask, mode: str,
        params: configs.MultiEvalExperimentConfig,
        model_dir: str) -> tf.keras.Model:
    # NOTE(review): 'wtih' in the function name is a typo for 'with'; renaming
    # would break existing callers, so it is only flagged here.
    """Runs train/eval configured by the experiment params.

    Args:
      distribution_strategy: A distribution strategy instance; model, trainer
        and evaluator variables are created inside its scope.
      train_task: A base_task.Task instance used for training.
      eval_tasks: A multitask.MultiTask with evaluation tasks.
      mode: A 'str', specifying the mode. Can be 'train', 'eval',
        'train_and_eval' or 'continuous_eval'.
      params: MultiEvalExperimentConfig instance.
      model_dir: A 'str', a path to store model checkpoints and summaries.

    Returns:
      model: `tf.keras.Model` instance.
    """

    is_training = 'train' in mode
    is_eval = 'eval' in mode
    # Optimizer and model are built unconditionally; the trainer/evaluator
    # only for the modes that need them.
    with distribution_strategy.scope():
        optimizer = train_task.create_optimizer(params.trainer, params.runtime)
        model = train_task.build_model()
        if is_training:
            trainer = core_lib.Trainer(config=params,
                                       task=train_task,
                                       model=model,
                                       optimizer=optimizer,
                                       train=True,
                                       evaluate=False)
        else:
            trainer = None
        if is_eval:
            evaluator = evaluator_lib.MultiTaskEvaluator(
                task=eval_tasks,
                model=model,
                global_step=trainer.global_step if is_training else None)
        else:
            evaluator = None

    # NOTE(review): if `mode` contains neither 'train' nor 'eval', both
    # trainer and evaluator are None and the attribute access below raises.
    if trainer:
        checkpoint = trainer.checkpoint
        global_step = trainer.global_step
    else:
        checkpoint = evaluator.checkpoint
        global_step = evaluator.global_step

    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint,
        directory=model_dir,
        max_to_keep=params.trainer.max_to_keep,
        step_counter=global_step,
        checkpoint_interval=params.trainer.checkpoint_interval,
        init_fn=trainer.initialize if trainer else None)

    controller = orbit.Controller(
        strategy=distribution_strategy,
        trainer=trainer,
        evaluator=evaluator,
        global_step=global_step,
        steps_per_loop=params.trainer.steps_per_loop,
        checkpoint_manager=checkpoint_manager,
        summary_dir=os.path.join(model_dir, 'train'),
        eval_summary_dir=os.path.join(model_dir, 'validation'),
        summary_interval=params.trainer.summary_interval)

    logging.info('Starts to execute mode: %s', mode)
    with distribution_strategy.scope():
        if mode == 'train':
            controller.train(steps=params.trainer.train_steps)
        elif mode == 'train_and_eval':
            controller.train_and_evaluate(
                train_steps=params.trainer.train_steps,
                eval_steps=params.trainer.validation_steps,
                eval_interval=params.trainer.validation_interval)
        elif mode == 'eval':
            controller.evaluate(steps=params.trainer.validation_steps)
        elif mode == 'continuous_eval':

            # Stop continuous eval once the evaluator's step counter reaches
            # the configured number of train steps.
            def timeout_fn():
                if evaluator.global_step.numpy() >= params.trainer.train_steps:
                    return True
                return False

            controller.evaluate_continuously(
                steps=params.trainer.validation_steps,
                timeout=params.trainer.continuous_eval_timeout,
                timeout_fn=timeout_fn)
        else:
            raise NotImplementedError('The mode is not implemented: %s' % mode)

        return model
    def __init__(self,
                 config: ExperimentConfig,
                 task: base_task.Task,
                 train: bool = True,
                 evaluate: bool = True,
                 model=None,
                 optimizer=None,
                 checkpoint_exporter=None):
        """Initialize common trainer for TensorFlow models.

        Args:
          config: An `ExperimentConfig` instance specifying experiment config.
          task: A base_task.Task instance.
          train: bool, whether or not this trainer will be used for training.
            Defaults to True.
          evaluate: bool, whether or not this trainer will be used for
            evaluation. Defaults to True.
          model: tf.keras.Model instance. If provided, it will be used instead
            of building the model via task.build_model(). Defaults to None.
          optimizer: tf.keras.optimizers.Optimizer instance. If provided, it
            will be used instead of the optimizer from config. Defaults to
            None.
          checkpoint_exporter: an object that has the
            `maybe_export_checkpoint` interface.
        """
        # Gets the current distribution strategy. If not inside any strategy scope,
        # it gets a single-replica no-op strategy.
        self._strategy = tf.distribute.get_strategy()
        self._config = config
        self._task = task

        self._model = model or task.build_model()

        # Build the optimizer from the experiment config unless the caller
        # supplied one explicitly.
        if optimizer is None:
            opt_factory = optimization.OptimizerFactory(
                config.trainer.optimizer_config)
            self._optimizer = opt_factory.build_optimizer(
                opt_factory.build_learning_rate())
        else:
            self._optimizer = optimizer

        self._checkpoint_exporter = checkpoint_exporter

        # Configuring optimizer when loss_scale is set in runtime config. This helps
        # avoiding overflow/underflow for float16 computations.
        if config.runtime.loss_scale:
            self._optimizer = performance.configure_optimizer(
                self._optimizer,
                use_float16=config.runtime.mixed_precision_dtype == 'float16',
                loss_scale=config.runtime.loss_scale)

        # global_step increases by 1 after each training iteration.
        # We should have global_step.numpy() == self.optimizer.iterations.numpy()
        # when there is only 1 optimizer.
        self._global_step = orbit.utils.create_global_step()
        # Models may expose extra trackables via `checkpoint_items`; include
        # them in the checkpoint alongside step, model and optimizer.
        if hasattr(self.model, 'checkpoint_items'):
            checkpoint_items = self.model.checkpoint_items
        else:
            checkpoint_items = {}
        self._checkpoint = tf.train.Checkpoint(global_step=self.global_step,
                                               model=self.model,
                                               optimizer=self.optimizer,
                                               **checkpoint_items)

        # Aggregate loss trackers plus task- and model-defined metrics for
        # both the training and validation phases.
        self._train_loss = tf.keras.metrics.Mean('training_loss',
                                                 dtype=tf.float32)
        self._validation_loss = tf.keras.metrics.Mean('validation_loss',
                                                      dtype=tf.float32)
        self._train_metrics = self.task.build_metrics(
            training=True) + self.model.metrics
        self._validation_metrics = self.task.build_metrics(
            training=False) + self.model.metrics

        # Initialize the orbit mixin bases only for the roles this trainer
        # will actually serve, each with its own distributed dataset.
        if train:
            train_dataset = orbit.utils.make_distributed_dataset(
                self.strategy, self.task.build_inputs,
                self.config.task.train_data)
            orbit.StandardTrainer.__init__(
                self,
                train_dataset,
                options=orbit.StandardTrainerOptions(
                    use_tf_while_loop=config.trainer.train_tf_while_loop,
                    use_tf_function=config.trainer.train_tf_function,
                    use_tpu_summary_optimization=config.trainer.
                    allow_tpu_summary))

        if evaluate:
            eval_dataset = orbit.utils.make_distributed_dataset(
                self.strategy, self.task.build_inputs,
                self.config.task.validation_data)
            orbit.StandardEvaluator.__init__(
                self,
                eval_dataset,
                options=orbit.StandardEvaluatorOptions(
                    use_tf_function=config.trainer.eval_tf_function))
def run_experiment(distribution_strategy: tf.distribute.Strategy,
                   task: base_task.Task,
                   mode: str,
                   params: config_definitions.ExperimentConfig,
                   model_dir: str,
                   run_post_eval: bool = False,
                   save_summary: bool = True) \
-> Tuple[tf.keras.Model, Mapping[str, Any]]:
    """Runs train/eval configured by the experiment params.

    Args:
      distribution_strategy: A distribution strategy instance; model and
        trainer variables are created inside its scope.
      task: A Task instance.
      mode: A 'str', specifying the mode. Can be 'train', 'eval',
        'train_and_eval' or 'continuous_eval'.
      params: ExperimentConfig instance.
      model_dir: A 'str', a path to store model checkpoints and summaries.
      run_post_eval: Whether to run post eval once after training, metrics logs
        are returned.
      save_summary: Whether to save train and validation summary.

    Returns:
      A 2-tuple of (model, eval_logs).
        model: `tf.keras.Model` instance.
        eval_logs: returns eval metrics logs when run_post_eval is set to True,
          otherwise, returns {}.
    """

    # Build the model and the trainer under the strategy scope. The trainer
    # is set up to evaluate whenever eval is requested now or post-training.
    with distribution_strategy.scope():
        model = task.build_model()
        trainer = train_utils.create_trainer(
            params,
            task,
            model=model,
            model_dir=model_dir,
            train='train' in mode,
            evaluate=('eval' in mode) or run_post_eval,
            checkpoint_exporter=maybe_create_best_ckpt_exporter(
                params, model_dir))

    # Only manage checkpoints when the trainer actually exposes one.
    if trainer.checkpoint:
        checkpoint_manager = tf.train.CheckpointManager(
            trainer.checkpoint,
            directory=model_dir,
            max_to_keep=params.trainer.max_to_keep,
            step_counter=trainer.global_step,
            checkpoint_interval=params.trainer.checkpoint_interval,
            init_fn=trainer.initialize)
    else:
        checkpoint_manager = None

    # The same trainer object doubles as the evaluator; it is passed as
    # `trainer` only when `mode` includes training.
    controller = orbit.Controller(
        distribution_strategy,
        trainer=trainer if 'train' in mode else None,
        evaluator=trainer,
        global_step=trainer.global_step,
        steps_per_loop=params.trainer.steps_per_loop,
        checkpoint_manager=checkpoint_manager,
        summary_dir=os.path.join(model_dir, 'train') if
        (save_summary) else None,
        eval_summary_dir=os.path.join(model_dir, 'validation') if
        (save_summary) else None,
        summary_interval=params.trainer.summary_interval if
        (save_summary) else None)

    logging.info('Starts to execute mode: %s', mode)
    with distribution_strategy.scope():
        if mode == 'train':
            controller.train(steps=params.trainer.train_steps)
        elif mode == 'train_and_eval':
            controller.train_and_evaluate(
                train_steps=params.trainer.train_steps,
                eval_steps=params.trainer.validation_steps,
                eval_interval=params.trainer.validation_interval)
        elif mode == 'eval':
            controller.evaluate(steps=params.trainer.validation_steps)
        elif mode == 'continuous_eval':

            # Stop continuous eval once the restored step counter reaches the
            # configured number of train steps.
            def timeout_fn():
                if trainer.global_step.numpy() >= params.trainer.train_steps:
                    return True
                return False

            controller.evaluate_continuously(
                steps=params.trainer.validation_steps,
                timeout=params.trainer.continuous_eval_timeout,
                timeout_fn=timeout_fn)
        else:
            raise NotImplementedError('The mode is not implemented: %s' % mode)

    if run_post_eval:
        with distribution_strategy.scope():
            return trainer.model, trainer.evaluate(
                tf.convert_to_tensor(params.trainer.validation_steps))
    else:
        return trainer.model, {}