def __init__(self, task: bt.Task):
  """Initializes the trainer with a task and a model built from it."""
  super().__init__()
  self._task = task
  self._train_step = None
  self._test_step = None
  self._metrics = None
  self._model = task.build_model()
  task.initialize(self._model)
def create_trainer(params: config_definitions.ExperimentConfig,
                   task: base_task.Task,
                   model_dir: str,
                   train: bool,
                   evaluate: bool,
                   checkpoint_exporter: Any = None) -> base_trainer.Trainer:
  """Create trainer."""
  del model_dir  # unused
  logging.info('Running default trainer.')
  model = task.build_model()
  trainer = base_trainer.Trainer(
      params,
      task,
      train=train,
      evaluate=evaluate,
      model=model,
      checkpoint_exporter=checkpoint_exporter)
  return trainer
def create_trainer(params: config_definitions.ExperimentConfig,
                   task: base_task.Task,
                   train: bool,
                   evaluate: bool,
                   checkpoint_exporter: Optional[BestCheckpointExporter] = None,
                   trainer_cls=base_trainer.Trainer) -> base_trainer.Trainer:
  """Create trainer."""
  logging.info('Running default trainer.')
  model = task.build_model()
  optimizer = task.create_optimizer(params.trainer, params.runtime)
  return trainer_cls(
      params,
      task,
      model=model,
      optimizer=optimizer,
      train=train,
      evaluate=evaluate,
      checkpoint_exporter=checkpoint_exporter)
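# Usage sketch for `create_trainer` above. This is a minimal example, assuming
# the TF Model Garden helpers `exp_factory.get_exp_config` and
# `task_factory.get_task`; the experiment name 'mnist' is purely illustrative.
import tensorflow as tf

from official.core import exp_factory
from official.core import task_factory

params = exp_factory.get_exp_config('mnist')  # hypothetical experiment name
task = task_factory.get_task(params.task)
# Trainers hold distributed state, so build them inside a strategy scope.
with tf.distribute.MirroredStrategy().scope():
  trainer = create_trainer(params, task, train=True, evaluate=True)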
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: List[base_task.Task],
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[core_lib.Trainer] = None
) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy.
    train_task: A base_task.Task instance.
    eval_tasks: A list of evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run a post-training evaluation once; if True, the
      metrics logs are returned.
    save_summary: Whether to save train and validation summaries.
    trainer: the core_lib.Trainer instance. It should be created within the
      strategy.scope(). If not provided, an instance will be created by default
      if `mode` contains 'train'.

  Returns:
    A 2-tuple of (model, eval_logs).
      model: `tf.keras.Model` instance.
      eval_logs: eval metrics logs when run_post_eval is set to True,
        otherwise, an empty dict.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    if is_training:
      trainer = trainer or core_lib.Trainer(
          config=params,
          task=train_task,
          model=train_task.build_model(),
          optimizer=train_task.create_optimizer(
              params.trainer.optimizer_config, params.runtime),
          train=True,
          evaluate=False)
    else:
      trainer = None
    model = trainer.model if trainer else train_task.build_model()

    if is_eval:
      eval_steps = dict([(task_routine.task_config.name,
                          task_routine.eval_steps)
                         for task_routine in params.eval_tasks])
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None,
          eval_steps=eval_steps,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None

  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
      eval_summary_dir=os.path.join(model_dir, 'validation')
      if save_summary else None,
      summary_interval=params.trainer.summary_interval
      if save_summary else None)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        if evaluator.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)

  if run_post_eval:
    return model, evaluator.evaluate(
        tf.convert_to_tensor(params.trainer.validation_steps))
  else:
    return model, {}
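# A hedged usage sketch for `run_experiment_with_multitask_eval`. The
# `train_task`, `eval_task_a`/`eval_task_b` and `experiment_params` objects are
# assumptions, built elsewhere from a MultiEvalExperimentConfig.
strategy = tf.distribute.MirroredStrategy()
model, eval_logs = run_experiment_with_multitask_eval(
    distribution_strategy=strategy,
    train_task=train_task,                  # a base_task.Task
    eval_tasks=[eval_task_a, eval_task_b],  # one Task per eval routine
    mode='train_and_eval',
    params=experiment_params,               # configs.MultiEvalExperimentConfig
    model_dir='/tmp/multitask_model',       # illustrative path
    run_post_eval=True)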
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: multitask.MultiTask,
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str) -> tf.keras.Model:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy.
    train_task: A base_task.Task instance.
    eval_tasks: A multitask.MultiTask with evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.

  Returns:
    model: `tf.keras.Model` instance.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    optimizer = train_task.create_optimizer(params.trainer, params.runtime)
    model = train_task.build_model()
    if is_training:
      trainer = core_lib.Trainer(
          config=params,
          task=train_task,
          model=model,
          optimizer=optimizer,
          train=True,
          evaluate=False)
    else:
      trainer = None
    if is_eval:
      evaluator = evaluator_lib.MultiTaskEvaluator(
          task=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None)
    else:
      evaluator = None

  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'validation'),
      summary_interval=params.trainer.summary_interval)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        if evaluator.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
  return model
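# The older variant above expects a `multitask.MultiTask` container rather than
# a list of tasks; a sketch under that assumption (the constructor signature is
# an assumption based on official.modeling.multitask, not confirmed here):
eval_group = multitask.MultiTask(tasks=eval_task_list)
model = run_experiment_with_multitask_eval(
    distribution_strategy=strategy,
    train_task=train_task,
    eval_tasks=eval_group,
    mode='eval',
    params=experiment_params,
    model_dir='/tmp/multitask_model')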
def __init__(self,
             config: ExperimentConfig,
             task: base_task.Task,
             train: bool = True,
             evaluate: bool = True,
             model=None,
             optimizer=None,
             checkpoint_exporter=None):
  """Initialize common trainer for TensorFlow models.

  Args:
    config: An `ExperimentConfig` instance specifying experiment config.
    task: A base_task.Task instance.
    train: bool, whether or not this trainer will be used for training.
      Defaults to True.
    evaluate: bool, whether or not this trainer will be used for evaluation.
      Defaults to True.
    model: tf.keras.Model instance. If provided, it will be used instead of
      building the model via task.build_model(). Defaults to None.
    optimizer: tf.keras.optimizers.Optimizer instance. If provided, it will be
      used instead of the optimizer from config. Defaults to None.
    checkpoint_exporter: an object that has the `maybe_export_checkpoint`
      interface.
  """
  # Gets the current distribution strategy. If not inside any strategy scope,
  # it gets a single-replica no-op strategy.
  self._strategy = tf.distribute.get_strategy()
  self._config = config
  self._task = task
  self._model = model or task.build_model()

  if optimizer is None:
    opt_factory = optimization.OptimizerFactory(
        config.trainer.optimizer_config)
    self._optimizer = opt_factory.build_optimizer(
        opt_factory.build_learning_rate())
  else:
    self._optimizer = optimizer
  self._checkpoint_exporter = checkpoint_exporter

  # Configures the optimizer when loss_scale is set in the runtime config.
  # This helps avoid overflow/underflow in float16 computations.
  if config.runtime.loss_scale:
    self._optimizer = performance.configure_optimizer(
        self._optimizer,
        use_float16=config.runtime.mixed_precision_dtype == 'float16',
        loss_scale=config.runtime.loss_scale)

  # global_step increases by 1 after each training iteration.
  # We should have global_step.numpy() == self.optimizer.iterations.numpy()
  # when there is only 1 optimizer.
  self._global_step = orbit.utils.create_global_step()
  if hasattr(self.model, 'checkpoint_items'):
    checkpoint_items = self.model.checkpoint_items
  else:
    checkpoint_items = {}
  self._checkpoint = tf.train.Checkpoint(
      global_step=self.global_step,
      model=self.model,
      optimizer=self.optimizer,
      **checkpoint_items)

  self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
  self._validation_loss = tf.keras.metrics.Mean(
      'validation_loss', dtype=tf.float32)
  self._train_metrics = self.task.build_metrics(
      training=True) + self.model.metrics
  self._validation_metrics = self.task.build_metrics(
      training=False) + self.model.metrics

  if train:
    train_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs, self.config.task.train_data)
    orbit.StandardTrainer.__init__(
        self,
        train_dataset,
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function,
            use_tpu_summary_optimization=config.trainer.allow_tpu_summary))

  if evaluate:
    eval_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs,
        self.config.task.validation_data)
    orbit.StandardEvaluator.__init__(
        self,
        eval_dataset,
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function))
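# Direct-construction sketch for the Trainer above; `config` and `task` are
# assumed to exist (an ExperimentConfig and a base_task.Task). This mirrors
# what `create_trainer` does rather than showing an additional API.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():  # tf.distribute.get_strategy() in __init__ picks this up
  model = task.build_model()
  optimizer = task.create_optimizer(config.trainer.optimizer_config,
                                    config.runtime)
  trainer = Trainer(
      config, task, train=True, evaluate=True, model=model,
      optimizer=optimizer)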
def run_experiment(
    distribution_strategy: tf.distribute.Strategy,
    task: base_task.Task,
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy.
    task: A Task instance.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run a post-training evaluation once; if True, the
      metrics logs are returned.
    save_summary: Whether to save train and validation summaries.

  Returns:
    A 2-tuple of (model, eval_logs).
      model: `tf.keras.Model` instance.
      eval_logs: eval metrics logs when run_post_eval is set to True,
        otherwise, an empty dict.
  """
  with distribution_strategy.scope():
    model = task.build_model()
    trainer = train_utils.create_trainer(
        params,
        task,
        model=model,
        model_dir=model_dir,
        train='train' in mode,
        evaluate=('eval' in mode) or run_post_eval,
        checkpoint_exporter=maybe_create_best_ckpt_exporter(
            params, model_dir))

  if trainer.checkpoint:
    checkpoint_manager = tf.train.CheckpointManager(
        trainer.checkpoint,
        directory=model_dir,
        max_to_keep=params.trainer.max_to_keep,
        step_counter=trainer.global_step,
        checkpoint_interval=params.trainer.checkpoint_interval,
        init_fn=trainer.initialize)
  else:
    checkpoint_manager = None

  controller = orbit.Controller(
      distribution_strategy,
      trainer=trainer if 'train' in mode else None,
      evaluator=trainer,
      global_step=trainer.global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
      eval_summary_dir=os.path.join(model_dir, 'validation')
      if save_summary else None,
      summary_interval=params.trainer.summary_interval
      if save_summary else None)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        if trainer.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)

  if run_post_eval:
    with distribution_strategy.scope():
      return trainer.model, trainer.evaluate(
          tf.convert_to_tensor(params.trainer.validation_steps))
  else:
    return trainer.model, {}
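# End-to-end driver sketch for `run_experiment`, following the pattern of the
# Model Garden train.py entry point. The experiment name and directories are
# assumptions; `distribute_utils` lives in official.common.
from official.common import distribute_utils
from official.core import exp_factory
from official.core import task_factory

params = exp_factory.get_exp_config('my_experiment')  # hypothetical name
model_dir = '/tmp/my_experiment'
strategy = distribute_utils.get_distribution_strategy(
    distribution_strategy='mirrored')
with strategy.scope():
  task = task_factory.get_task(params.task, logging_dir=model_dir)
model, eval_logs = run_experiment(
    distribution_strategy=strategy,
    task=task,
    mode='train_and_eval',
    params=params,
    model_dir=model_dir,
    run_post_eval=True)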