def test_step_per_loop_callable(self):
  """Trains with a callable `steps_per_loop` and checks the final step count."""
  runner = TestRunner()
  ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step,
      checkpoint_interval=10)

  def variable_steps_per_loop(global_step):
    # Short loops (2 steps) at first, longer loops (4 steps) once past step 4.
    return 4 if global_step > 4 else 2

  ctrl = controller.Controller(
      trainer=runner,
      global_step=runner.global_step,
      steps_per_loop=variable_steps_per_loop,
      checkpoint_manager=manager,
  )
  ctrl.train(steps=10)
  self.assertEqual(runner.global_step, 10)
def test_evaluate_with_nested_summaries(self):
  """Nested summary dicts are written into per-dataset subdirectories."""
  evaluator = TestEvaluatorWithNestedSummary()
  ctrl = controller.Controller(
      evaluator=evaluator,
      global_step=tf.Variable(0, dtype=tf.int64),
      eval_summary_dir=self.model_dir)
  ctrl.evaluate(steps=5)

  # Each nested summary key becomes its own subdirectory containing both
  # "loss" and "accuracy" events.
  for subdir in ("dataset", "dataset2"):
    summary_dir = os.path.join(self.model_dir, subdir)
    self.assertNotEmpty(tf.io.gfile.listdir(summary_dir))
    for keyword in ("loss", "accuracy"):
      self.assertNotEmpty(
          summaries_with_matching_keyword(keyword, summary_dir))
def test_eval_and_checkpoint_interval(self):
  """Checks the checkpoint and evaluation cadence of train_and_evaluate."""
  runner = TestRunner()
  ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step,
      checkpoint_interval=5)
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      global_step=runner.global_step,
      steps_per_loop=10,
      checkpoint_manager=manager)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=5)

  # Expect 3 checkpoints to be saved at step: 0, 5, 10.
  self.assertLen(
      tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt-*.data*")), 3)
  # Expect evaluation is performed 2 times at step: 5, 10.
  self.assertLen(
      summaries_with_matching_keyword("eval_loss", self.model_dir), 2)
def test_train_and_evaluate_with_same_summary_dir(self):
  """Train and eval summaries may share one summary directory."""
  runner = TestRunner()
  ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step)
  shared_dir = os.path.join(self.model_dir, "summaries")
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      global_step=runner.global_step,
      steps_per_loop=2,
      summary_dir=shared_dir,
      checkpoint_manager=manager,
      eval_summary_dir=shared_dir)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

  # Loss and accuracy values should be written into summaries.
  self.assertNotEmpty(tf.io.gfile.listdir(shared_dir))
  for keyword in ("loss", "eval_loss"):
    self.assertNotEmpty(summaries_with_matching_keyword(keyword, shared_dir))
def test_train_and_evaluate_reset_datasets(self):
  """A second train_and_evaluate run works after swapping in new datasets."""
  runner = TestRunner()
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      global_step=runner.global_step,
      steps_per_loop=2)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

  # Replace both datasets with freshly distributed ones and run again.
  distribute = (
      runner.strategy.experimental_distribute_datasets_from_function)
  runner.train_dataset = distribute(dataset_fn)
  runner.eval_dataset = distribute(dataset_fn)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
def test_has_checkpoint_eval_summary_only(self):
  """With a checkpoint manager but no train summary dir, only eval events exist."""
  runner = TestRunner()
  # Has checkpoint, but no summary directories.
  ckpt = tf.train.Checkpoint(model=runner.model)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step)
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      global_step=runner.global_step,
      checkpoint_manager=manager,
      eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
      steps_per_loop=2)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
  self.assertEqual(runner.global_step, 10)

  # Training summaries are not saved.
  self.assertEmpty(
      tf.io.gfile.glob(os.path.join(manager.directory, "events.*")))
  # Evaluation summaries are saved.
  self.assertNotEmpty(
      tf.io.gfile.glob(
          os.path.join(self.model_dir, "summaries/eval/events.*")))
def test_summaries_inside_train_fn(self):
  """Summaries written inside the trainer's train fn go to the train dir only."""
  test_runner = TestTrainerWithSummaries()
  checkpoint = tf.train.Checkpoint(model=test_runner.model,
                                   optimizer=test_runner.optimizer)
  # Note: no `checkpoint_interval` is passed to the manager here.
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      self.model_dir,
      max_to_keep=None,
      step_counter=test_runner.global_step)
  test_controller = controller.Controller(
      trainer=test_runner,
      global_step=test_runner.global_step,
      steps_per_loop=2,
      summary_dir=os.path.join(self.model_dir, "summaries/train"),
      summary_interval=2,
      checkpoint_manager=checkpoint_manager,
  )
  test_controller.train(steps=10)

  # No checkpoint files are expected, since `checkpoint_interval` was not set
  # on the manager. (NOTE(review): the previous comment here read "Checkpoints
  # are saved.", which contradicts the `assertEmpty` below.)
  self.assertEmpty(
      tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
  # Only train summaries are written.
  self.assertNotEmpty(
      tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
  self.assertNotEmpty(
      summaries_with_matching_keyword(
          "loss", os.path.join(self.model_dir, "summaries/train")))
  self.assertFalse(
      tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/eval")))
def test_no_checkpoint(self):
  """train_and_evaluate works without a checkpoint manager and writes summaries."""
  runner = TestRunner()
  # No checkpoint manager and no strategy.
  train_dir = os.path.join(self.model_dir, "summaries/train")
  eval_dir = os.path.join(self.model_dir, "summaries/eval")
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      global_step=runner.global_step,
      steps_per_loop=2,
      summary_dir=train_dir,
      eval_summary_dir=eval_dir)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
  self.assertEqual(runner.global_step, 10)

  # Loss and accuracy values should be written into summaries.
  self.assertNotEmpty(tf.io.gfile.listdir(train_dir))
  self.assertNotEmpty(summaries_with_matching_keyword("loss", train_dir))
  self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
  self.assertNotEmpty(summaries_with_matching_keyword("eval_loss", eval_dir))

  # No checkpoint, so global step starts from 0.
  runner.global_step.assign(0)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
  self.assertEqual(runner.global_step, 10)
def test_evaluate_only(self):
  """Eval-only controller writes eval summaries and supports continuous eval."""
  runner = TestRunner()
  ckpt = tf.train.Checkpoint(model=runner.model)
  ckpt.save(os.path.join(self.model_dir, "ckpt"))
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step)
  train_dir = os.path.join(self.model_dir, "summaries/train")
  eval_dir = os.path.join(self.model_dir, "summaries/eval")
  ctrl = controller.Controller(
      evaluator=runner,
      global_step=runner.global_step,
      checkpoint_manager=manager,
      summary_dir=train_dir,
      eval_summary_dir=eval_dir)
  eval_results = ctrl.evaluate(steps=2)

  # Only eval summaries are written
  self.assertFalse(tf.io.gfile.exists(train_dir))
  self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
  self.assertNotEmpty(summaries_with_matching_keyword("eval_loss", eval_dir))
  self.assertIn("eval_loss", eval_results)

  # Tests continuous eval with timeout and timeout_fn.
  done_file = os.path.join(self.model_dir, "summaries/eval/Done")

  def timeout_fn():
    # Drop a marker file so the test can confirm the timeout path ran.
    with tf.io.gfile.GFile(done_file, "w") as f:
      f.write("DONE")
    return True

  ctrl = controller.Controller(
      evaluator=runner,
      global_step=runner.global_step,
      checkpoint_manager=manager,
      eval_summary_dir=eval_dir)
  ctrl.evaluate_continuously(timeout=1, timeout_fn=timeout_fn, steps=2)
  self.assertNotEmpty(tf.io.gfile.glob(done_file))
def test_no_checkpoint_and_summaries(self):
  """train_and_evaluate runs with neither checkpoints nor summary dirs."""
  runner = TestRunner()
  # No checkpoint + summary directories.
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      global_step=runner.global_step,
      steps_per_loop=2)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
  self.assertEqual(runner.global_step, 10)
def test_no_eval_steps(self):
  """`evaluate` may be called without a `steps` argument."""
  runner = TestRunner()
  ckpt = tf.train.Checkpoint(model=runner.model)
  ckpt.save(os.path.join(self.model_dir, "ckpt"))
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step)
  ctrl = controller.Controller(
      evaluator=runner,
      global_step=runner.global_step,
      checkpoint_manager=manager)
  # Should not raise even though no step count is given.
  ctrl.evaluate()
def test_actions(self):
  """Train and eval actions receive every step's outputs."""
  runner = TestRunner()
  ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step,
      checkpoint_interval=10)

  class OutputRecorderAction:
    """Simple `Action` that just saves the outputs passed to `__call__`."""

    def __init__(self):
      self.outputs = []

    def __call__(self, output):
      self.outputs.append(output)

  train_recorder = OutputRecorderAction()
  eval_recorder = OutputRecorderAction()
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      train_actions=[train_recorder],
      eval_actions=[eval_recorder],
      global_step=runner.global_step,
      steps_per_loop=2,
      summary_dir=os.path.join(self.model_dir, "summaries/train"),
      checkpoint_manager=manager,
      eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"))
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

  # 10 train steps at 2 steps per loop -> 5 recorded train outputs.
  self.assertLen(train_recorder.outputs, 5)
  for output in train_recorder.outputs:
    self.assertIn("loss", output)
    self.assertGreaterEqual(output["loss"], 0)
  # Evaluation ran twice, so two eval outputs were recorded.
  self.assertLen(eval_recorder.outputs, 2)
  for output in eval_recorder.outputs:
    self.assertIn("eval_loss", output)
    self.assertGreaterEqual(output["eval_loss"], 0)
def test_already_trained_model(self):
  """`train` is a no-op when `global_step` already equals the target steps."""
  runner = TestRunner()
  runner.global_step.assign(10)
  ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step,
      checkpoint_interval=10)
  ctrl = controller.Controller(
      trainer=runner,
      global_step=runner.global_step,
      steps_per_loop=2,
      checkpoint_manager=manager)
  # `global_step` is already `train_steps`.
  ctrl.train(steps=10)
def test_restore_from_most_recent_checkpoint(self):
  """`restore_checkpoint` restores the latest checkpoint the manager saved."""
  runner = TestRunner()
  ckpt = tf.train.Checkpoint(model=runner.model)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step,
      checkpoint_interval=5)
  ctrl = controller.Controller(
      trainer=runner,
      global_step=runner.global_step,
      checkpoint_manager=manager,
      eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
      steps_per_loop=5)
  ctrl.train(20)
  # 20 steps with checkpoint_interval=5 -> 4 checkpoints.
  self.assertLen(manager.checkpoints, 4)
  restored_path = ctrl.restore_checkpoint()
  self.assertEqual(restored_path, manager.checkpoints[-1])
def test_evaluate_with_loss_output(self):
  """An evaluator whose output has a loss writes eval_loss summaries."""
  evaluator = TestEvaluator()
  ckpt = tf.train.Checkpoint(model=evaluator.model)
  ckpt.save(os.path.join(self.model_dir, "ckpt"))
  manager = tf.train.CheckpointManager(
      ckpt, self.model_dir, max_to_keep=None)
  eval_dir = os.path.join(self.model_dir, "summaries/eval")
  ctrl = controller.Controller(
      evaluator=evaluator,
      global_step=tf.Variable(0, dtype=tf.int64),
      checkpoint_manager=manager,
      eval_summary_dir=eval_dir)
  ctrl.evaluate(steps=5)

  # Only eval summaries are written
  self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
  self.assertNotEmpty(summaries_with_matching_keyword("eval_loss", eval_dir))
def test_train_and_evaluate(self, return_numpy):
  """End-to-end train_and_evaluate: checkpoints plus train and eval summaries."""
  runner = TestRunner(return_numpy=return_numpy)
  ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
  manager = tf.train.CheckpointManager(
      ckpt,
      self.model_dir,
      max_to_keep=None,
      step_counter=runner.global_step,
      checkpoint_interval=10)
  train_dir = os.path.join(self.model_dir, "summaries/train")
  eval_dir = os.path.join(self.model_dir, "summaries/eval")
  ctrl = controller.Controller(
      trainer=runner,
      evaluator=runner,
      global_step=runner.global_step,
      steps_per_loop=2,
      summary_dir=train_dir,
      checkpoint_manager=manager,
      eval_summary_dir=eval_dir)
  ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

  # Checkpoints are saved.
  self.assertNotEmpty(
      tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
  # Loss and accuracy values should be written into summaries.
  self.assertNotEmpty(tf.io.gfile.listdir(train_dir))
  self.assertTrue(check_eventfile_for_keyword("loss", train_dir))
  self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
  self.assertTrue(check_eventfile_for_keyword("eval_loss", eval_dir))
def test_evaluate_with_no_output(self):
  """An evaluator that produces no output yields an empty results dict."""
  ctrl = controller.Controller(
      evaluator=TestEvaluatorNoOutput(),
      global_step=tf.Variable(0, dtype=tf.int64),
      eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"))
  results = ctrl.evaluate(steps=5)
  self.assertEqual(results, {})