def test_export_best_ckpt(self, distribution):
  config = cfg.ExperimentConfig(
      trainer=cfg.TrainerConfig(
          best_checkpoint_export_subdir='best_ckpt',
          best_checkpoint_eval_metric='acc',
          optimizer_config=cfg.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd'
              },
              'learning_rate': {
                  'type': 'constant'
              }
          })))
  model_dir = self.get_temp_dir()
  task = mock_task.MockTask(config.task, logging_dir=model_dir)
  ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
  trainer = trainer_lib.Trainer(
      config,
      task,
      model=task.build_model(),
      checkpoint_exporter=ckpt_exporter)
  trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
  trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
  self.assertTrue(
      tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json')))
def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale),
      trainer=cfg.TrainerConfig(
          optimizer_config=cfg.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd'
              },
              'learning_rate': {
                  'type': 'constant'
              }
          })))
  task = mock_task.MockTask()
  trainer = trainer_lib.Trainer(config, task)
  if mixed_precision_dtype != 'float16':
    self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
  elif mixed_precision_dtype == 'float16' and loss_scale is None:
    self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
  else:
    self.assertIsInstance(
        trainer.optimizer,
        tf.keras.mixed_precision.experimental.LossScaleOptimizer)
  metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
  self.assertIn('training_loss', metrics)
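# A minimal sketch (not part of the test above) of what the 'float16' +
# loss_scale branch asserts: the SGD optimizer gets wrapped in a loss-scale
# optimizer. This sketch uses the current tf.keras.mixed_precision
# .LossScaleOptimizer API, whereas the test asserts the older experimental
# symbol; the learning rate value is an illustrative choice.
import tensorflow as tf

inner = tf.keras.optimizers.SGD(learning_rate=0.1)
wrapped = tf.keras.mixed_precision.LossScaleOptimizer(inner, dynamic=True)
# The wrapper scales the loss up before the backward pass and unscales the
# gradients before applying them, so float16 gradients do not underflow.
assert isinstance(wrapped, tf.keras.mixed_precision.LossScaleOptimizer)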
def create_trainer(params, task, model_dir, train, evaluate):
  del model_dir
  logging.info('Running default trainer.')
  trainer = base_trainer.Trainer(params, task, train=train, evaluate=evaluate)
  return trainer
def create_test_trainer(self, config, model_dir=None):
  task = mock_task.MockTask(config.task, logging_dir=model_dir)
  ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
  trainer = trainer_lib.Trainer(
      config,
      task,
      model=task.build_model(),
      optimizer=trainer_lib.create_optimizer(config.trainer, config.runtime),
      checkpoint_exporter=ckpt_exporter)
  return trainer
def test_recovery(self):
  config = cfg.ExperimentConfig(
      trainer=cfg.TrainerConfig(
          loss_upper_bound=0.5,
          recovery_max_trials=2,
          optimizer_config=cfg.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd'
              },
              'learning_rate': {
                  'type': 'constant'
              }
          })))
  model_dir = self.get_temp_dir()
  trainer = self.create_test_trainer(config, model_dir=model_dir)
  checkpoint_manager = tf.train.CheckpointManager(
      trainer.checkpoint, self.get_temp_dir(), max_to_keep=2)
  checkpoint_manager.save()
  trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
  before_weights = trainer.model.get_weights()
  _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
  # The training loss is 1.0 and loss_upper_bound is 0.5, so recovery is
  # triggered and the weights are restored from the checkpoint.
  after_weights = trainer.model.get_weights()
  for left, right in zip(before_weights, after_weights):
    self.assertAllEqual(left, right)

  # Make the loss NaN with recovery_max_trials=0 to trigger a RuntimeError.
  config = cfg.ExperimentConfig(
      trainer=cfg.TrainerConfig(
          recovery_max_trials=0,
          optimizer_config=cfg.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd'
              },
              'learning_rate': {
                  'type': 'constant'
              }
          })))
  task = mock_task.MockTask(config.task, logging_dir=model_dir)

  def build_losses(labels, model_outputs, aux_losses=None):
    del labels, model_outputs
    return tf.constant([np.nan], tf.float32) + aux_losses

  task.build_losses = build_losses
  trainer = trainer_lib.Trainer(
      config,
      task,
      model=task.build_model(),
      optimizer=task.create_optimizer(config.trainer.optimizer_config,
                                      config.runtime))
  trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
  with self.assertRaises(RuntimeError):
    _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))
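# A minimal sketch, assuming the Trainer's recovery behavior matches what the
# test above exercises: after a train loop, if the reported loss is NaN or
# exceeds `loss_upper_bound`, the model is restored from the latest checkpoint,
# up to `recovery_max_trials` times before a RuntimeError is raised. The class
# name `_Recovery` and its methods are hypothetical, for illustration only.
import math


class _Recovery:

  def __init__(self, checkpoint_manager, loss_upper_bound,
               recovery_max_trials):
    self._manager = checkpoint_manager
    self._loss_upper_bound = loss_upper_bound
    self._max_trials = recovery_max_trials
    self._trials = 0

  def maybe_recover(self, loss_value: float):
    """Restores the last saved checkpoint if the loss blows up."""
    if not (math.isnan(loss_value) or loss_value > self._loss_upper_bound):
      return
    if self._trials >= self._max_trials:
      raise RuntimeError('The loss value is blowing up: %s' % loss_value)
    self._trials += 1
    self._manager.checkpoint.restore(self._manager.latest_checkpoint)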
def test_model_with_compiled_loss(self):
  task = mock_task.MockTask()
  model = task.build_model()
  model.compile(loss=tf.keras.losses.CategoricalCrossentropy())
  trainer = trainer_lib.Trainer(
      self._config,
      task,
      model=model,
      optimizer=task.create_optimizer(self._config.trainer.optimizer_config))
  logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
  self.assertIn('training_loss', logs)
def create_trainer(params: config_definitions.ExperimentConfig,
                   task: base_task.Task,
                   model_dir: str,
                   train: bool,
                   evaluate: bool,
                   checkpoint_exporter: Any = None):
  """Create trainer."""
  del model_dir
  logging.info('Running default trainer.')
  trainer = base_trainer.Trainer(
      params,
      task,
      train=train,
      evaluate=evaluate,
      checkpoint_exporter=checkpoint_exporter)
  return trainer
def create_trainer(params: config_definitions.ExperimentConfig,
                   task: base_task.Task,
                   train: bool,
                   evaluate: bool,
                   checkpoint_exporter: Any = None) -> base_trainer.Trainer:
  """Create trainer."""
  logging.info('Running default trainer.')
  model = task.build_model()
  optimizer = base_trainer.create_optimizer(params.trainer, params.runtime)
  trainer = base_trainer.Trainer(
      params,
      task,
      model=model,
      optimizer=optimizer,
      train=train,
      evaluate=evaluate,
      checkpoint_exporter=checkpoint_exporter)
  return trainer
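# A minimal usage sketch for the create_trainer variant above, which builds
# the model and optimizer itself. The MockTask instance and the SGD/constant
# optimizer config are illustrative choices, not requirements; any registered
# ExperimentConfig and base_task.Task subclass would do.
params = cfg.ExperimentConfig(
    trainer=cfg.TrainerConfig(
        optimizer_config=cfg.OptimizationConfig({
            'optimizer': {'type': 'sgd'},
            'learning_rate': {'type': 'constant'}
        })))
task = mock_task.MockTask(params.task)
trainer = create_trainer(params, task, train=True, evaluate=True)
logs = trainer.train(tf.convert_to_tensor(10, dtype=tf.int32))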
def test_trainer_passing_datasets(self, distribution):
  with distribution.scope():
    task = mock_task.MockTask(self._config)
    train_dataset = orbit.utils.make_distributed_dataset(
        distribution, task.build_inputs, self._config.task.train_data)
    validation_dataset = orbit.utils.make_distributed_dataset(
        distribution, task.build_inputs, self._config.task.validation_data)
    self._config.task.train_data = None
    self._config.task.validation_data = None
    trainer = trainer_lib.Trainer(
        self._config,
        task,
        model=task.build_model(),
        optimizer=task.create_optimizer(self._config.trainer.optimizer_config,
                                        self._config.runtime),
        train_dataset=train_dataset,
        validation_dataset=validation_dataset)
  logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
  self.assertIn('training_loss', logs)
  self.assertIn('learning_rate', logs)
  logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
  self.assertIn('validation_loss', logs)
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: List[base_task.Task],
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[core_lib.Trainer] = None
) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy.
    train_task: A base_task.Task instance.
    eval_tasks: A list of evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval',
      'train_and_eval' or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run one evaluation pass after training; if True,
      the evaluation metrics logs are returned along with the model.
    save_summary: Whether to save train and validation summary.
    trainer: the core_lib.Trainer instance. It should be created within the
      strategy.scope(). If not provided, an instance will be created by
      default if `mode` contains 'train'.

  Returns:
    A 2-tuple of (model, eval_logs).
      model: `tf.keras.Model` instance.
      eval_logs: the eval metrics logs when run_post_eval is set to True,
        otherwise an empty dict.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    if is_training:
      trainer = trainer or core_lib.Trainer(
          config=params,
          task=train_task,
          model=train_task.build_model(),
          optimizer=train_task.create_optimizer(
              params.trainer.optimizer_config, params.runtime),
          train=True,
          evaluate=False)
    else:
      trainer = None
    model = trainer.model if trainer else train_task.build_model()

    if is_eval:
      eval_steps = dict([(task_routine.task_config.name,
                          task_routine.eval_steps)
                         for task_routine in params.eval_tasks])
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None,
          eval_steps=eval_steps,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None

  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
      eval_summary_dir=os.path.join(model_dir, 'validation')
      if save_summary else None,
      summary_interval=params.trainer.summary_interval
      if save_summary else None)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        return evaluator.global_step.numpy() >= params.trainer.train_steps

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)

  if run_post_eval:
    return model, evaluator.evaluate(
        tf.convert_to_tensor(params.trainer.validation_steps))
  else:
    return model, {}
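# A minimal usage sketch for run_experiment_with_multitask_eval, assuming a
# MultiEvalExperimentConfig `params`, a train task, and a list of eval tasks
# are already constructed; the one-device strategy and the model_dir path are
# illustrative choices.
strategy = tf.distribute.OneDeviceStrategy('/cpu:0')
model, eval_logs = run_experiment_with_multitask_eval(
    distribution_strategy=strategy,
    train_task=train_task,
    eval_tasks=eval_tasks,
    mode='train_and_eval',
    params=params,
    model_dir='/tmp/multitask_model',  # hypothetical path
    run_post_eval=True)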
def create_test_trainer(self):
  task = mock_task.MockTask()
  trainer = trainer_lib.Trainer(self._config, task)
  return trainer
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: multitask.MultiTask,
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str) -> tf.keras.Model:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy.
    train_task: A base_task.Task instance.
    eval_tasks: A multitask.MultiTask with evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval',
      'train_and_eval' or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.

  Returns:
    model: `tf.keras.Model` instance.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    optimizer = train_task.create_optimizer(params.trainer, params.runtime)
    model = train_task.build_model()
    if is_training:
      trainer = core_lib.Trainer(
          config=params,
          task=train_task,
          model=model,
          optimizer=optimizer,
          train=True,
          evaluate=False)
    else:
      trainer = None
    if is_eval:
      evaluator = evaluator_lib.MultiTaskEvaluator(
          task=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None)
    else:
      evaluator = None

  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'validation'),
      summary_interval=params.trainer.summary_interval)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        return evaluator.global_step.numpy() >= params.trainer.train_steps

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
  return model
def create_test_trainer(self, config):
  task = mock_task.MockTask()
  trainer = trainer_lib.Trainer(config, task, model=task.build_model())
  return trainer
  Returns:
    model: `tf.keras.Model` instance.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    optimizer = train_task.create_optimizer(params.trainer.optimizer_config,
                                            params.runtime)
    model = train_task.build_model()
    if is_training:
      trainer = core_lib.Trainer(
          config=params,
          task=train_task,
          model=model,
          optimizer=optimizer,
          train=True,
          evaluate=False)
    else:
      trainer = None
    if is_eval:
      evaluator = evaluator_lib.MultiTaskEvaluator(
          task=eval_tasks,
          model=model,
          global_step=trainer.global_step if is_training else None,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))