def testController(self):
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'controller_test' + str(random.random()))
  FLAGS.logdir = logdir
  cfg = self._GetTestConfig()

  runner_manager = trainer.RunnerManager(cfg.name)
  runner_manager.StartRunners(
      [self._CreateController(cfg), self._CreateTrainer(cfg)])

  train_files = tf.io.gfile.glob(logdir + '/train/*')
  self.assertTrue(self._HasFile(train_files, 'ckpt'))
  self.assertTrue(self._HasFile(train_files, 'tfevents'))

  control_files = tf.io.gfile.glob(logdir + '/control/*')
  self.assertTrue(self._HasFile(control_files, 'params.txt'))
  self.assertTrue(self._HasFile(control_files, 'model_analysis.txt'))
  self.assertTrue(self._HasFile(control_files, 'train.pbtxt'))
  self.assertTrue(self._HasFile(control_files, 'tfevents'))

  # EvalerDev may not run concurrently with Controller in a single process
  # because EvalerDev loads checkpoints and overwrites states like global
  # steps.
  runner_manager.StartRunners([self._CreateEvalerDev(cfg)])

  dev_files = tf.io.gfile.glob(logdir + '/eval_dev/*')
  self.assertTrue(self._HasFile(dev_files, 'params.txt'))
  self.assertTrue(self._HasFile(dev_files, 'eval_dev.pbtxt'))
  self.assertTrue(self._HasFile(dev_files, 'tfevents'))
  self.assertTrue(self._HasFile(dev_files, 'score'))
  self.assertTrue(
      self._HasLine(self._GetMatchedFileName(dev_files, 'score'), 'log_pplx'))
def testWriteInferenceGraph(self):
  random.seed()
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'inference_graphs' + str(random.random()))
  FLAGS.logdir = logdir
  cfg = 'punctuator.codelab.RNMTModel'
  trainer.RunnerManager(cfg).WriteInferenceGraph()

  inference_files = tf.io.gfile.glob(logdir + '/inference_graphs/*')
  self.assertTrue(self._HasFile(inference_files, 'inference.pbtxt'))
  self.assertTrue(self._HasFile(inference_files, 'inference_tpu.pbtxt'))
def testWriteInferenceGraph(self):
  random.seed()
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'inference_graphs' + str(random.random()))
  FLAGS.logdir = logdir
  cfg = 'lm.one_billion_wds.WordLevelOneBwdsSimpleSampledSoftmax'
  trainer.RunnerManager(cfg).WriteInferenceGraph()

  inference_files = tf.io.gfile.glob(logdir + '/inference_graphs/*')
  self.assertTrue(self._HasFile(inference_files, 'inference.pbtxt'))
  self.assertTrue(self._HasFile(inference_files, 'inference_tpu.pbtxt'))
def testWriteMultiTaskInferenceGraph(self):
  random.seed()
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'inference_graphs' + str(random.random()))
  FLAGS.logdir = logdir
  cfg = 'test.EmptyMultiTaskParams'
  trainer.RunnerManager(cfg).WriteInferenceGraph()

  inference_files = tf.io.gfile.glob(logdir + '/inference_graphs/*')
  self.assertTrue(self._HasFile(inference_files, 'a_inference.pbtxt'))
  self.assertTrue(self._HasFile(inference_files, 'a_inference_tpu.pbtxt'))
  self.assertTrue(self._HasFile(inference_files, 'b_inference.pbtxt'))
  self.assertTrue(self._HasFile(inference_files, 'b_inference_tpu.pbtxt'))
def testDecoder(self):
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'decoder_test' + str(random.random()))
  FLAGS.logdir = logdir
  cfg = self._GetTestConfig()

  runner_manager = trainer.RunnerManager(cfg.name)
  runner_manager.StartRunners(
      [self._CreateController(cfg), self._CreateTrainer(cfg)])
  runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

  dec_files = tf.io.gfile.glob(logdir + '/decoder_dev/*')
  self.assertTrue(self._HasFile(dec_files, 'params.txt'))
  self.assertTrue(self._HasFile(dec_files, 'decoder_dev.pbtxt'))
  self.assertTrue(self._HasFile(dec_files, 'tfevents'))
  # Only the score for the 2-step checkpoint should be present.
  self.assertTrue(
      tf.io.gfile.exists(
          os.path.join(logdir, 'decoder_dev/score-00000002.txt')))
  self.assertFalse(
      tf.io.gfile.exists(
          os.path.join(logdir, 'decoder_dev/score-00000000.txt')))
  self.assertTrue(
      self._HasLine(self._GetMatchedFileName(dec_files, 'score'),
                    'examples/sec'))

  # Test customization of an eval checkpoint. Create a new logdir / decoder
  # but point the eval checkpoint to the 0th checkpoint of the most
  # recent experiment.
  new_logdir = os.path.join(tf.test.get_temp_dir(),
                            'decoder_test' + str(random.random()))
  FLAGS.logdir = new_logdir
  cfg = self._GetTestConfig()
  cfg.task.eval.load_checkpoint_from = os.path.join(logdir,
                                                    'train/ckpt-00000000')
  runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

  # Only the score for the 0th checkpoint should be present.
  self.assertTrue(
      tf.io.gfile.exists(
          os.path.join(new_logdir, 'decoder_dev/score-00000000.txt')))
  self.assertFalse(
      tf.io.gfile.exists(
          os.path.join(new_logdir, 'decoder_dev/score-00000002.txt')))
def testDecoder(self):
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'decoder_test' + str(random.random()))
  FLAGS.logdir = logdir
  cfg = self._GetTestConfig()

  runner_manager = trainer.RunnerManager(cfg.name)
  runner_manager.StartRunners(
      [self._CreateController(cfg), self._CreateTrainer(cfg)])
  runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

  dec_files = tf.io.gfile.glob(logdir + '/decoder_dev/*')
  self.assertTrue(self._HasFile(dec_files, 'params.txt'))
  self.assertTrue(self._HasFile(dec_files, 'decoder_dev.pbtxt'))
  self.assertTrue(self._HasFile(dec_files, 'tfevents'))
  self.assertTrue(self._HasFile(dec_files, 'score'))
  self.assertTrue(
      self._HasLine(self._GetMatchedFileName(dec_files, 'score'),
                    'examples/sec'))
def testIdentityRegressionModel(self):
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'identity_regression_test' + str(random.random()))
  FLAGS.logdir = logdir
  steps = 100

  cfg = trainer_test_utils.IdentityRegressionModel.Params()
  cfg.cluster.task = 0
  cfg.cluster.mode = 'sync'
  cfg.cluster.job = 'trainer_client'
  cfg.cluster.worker.name = '/job:localhost'
  cfg.cluster.worker.replicas = 1
  cfg.cluster.worker.gpus_per_replica = 0
  cfg.cluster.ps.name = '/job:localhost'
  cfg.cluster.ps.replicas = 1
  cfg.cluster.ps.gpus_per_replica = 0
  cfg.train.max_steps = steps
  cfg.task.train.learning_rate = 0.025

  runners = [self._CreateController(cfg), self._CreateTrainer(cfg)]
  runner_manager = trainer.RunnerManager(cfg.name)
  runner_manager.StartRunners(runners)
  train = runners[1]

  # ProcessFPropResults should have been called <steps> times on the task
  # and <steps> times on the model.
  # There are always 2 samples in the batch.
  expected_samples_in_batch = [(2, 1.0) for _ in range(steps)]
  self.assertAllEqual(
      expected_samples_in_batch,
      [m['num_samples_in_batch'] for m in train._model.metrics])
  self.assertAllEqual(
      expected_samples_in_batch,
      [m['num_samples_in_batch'] for m in train._model._task.metrics])

  # Global steps should increment by 1 for each batch.
  expected_global_steps = [i + 1 for i in range(steps)]
  self.assertAllEqual(expected_global_steps, train._model.global_steps)
  self.assertAllEqual(expected_global_steps, train._model._task.global_steps)

  # The CountingInputGenerator makes [2, 2] inputs that increment for each
  # batch, like:
  #   [[0, 1], [2, 3]],
  #   [[4, 5], [6, 7]],
  #   ...
  expected_input_tensors = [{
      'input': np.array([[4 * i, 4 * i + 1], [4 * i + 2, 4 * i + 3]])
  } for i in range(steps)]

  def keep_input_tensors(tensors):
    return [{'input': d['input']} for d in tensors]

  self.assertAllClose(
      expected_input_tensors,
      keep_input_tensors(train._model.result_per_example_tensors))
  self.assertAllClose(
      expected_input_tensors,
      keep_input_tensors(train._model._task.result_per_example_tensors))

  # This model is training parameters m and b such that:
  #   m * (input[0] + input[1]) + b = (input[0] + input[1])
  # So we expect m = 1 and b = 0 after training.
  # m is more stable, so that's the one we test with a tight tolerance.
  self.assertNear(
      1.0, train._model._task.result_per_example_tensors[-1]['m'][0], 0.1)
  self.assertNear(
      1.0, train._model.result_per_example_tensors[-1]['m'][0], 0.1)
  # b isn't as stable but shouldn't be too large in magnitude.
  self.assertNear(
      0.0, train._model._task.result_per_example_tensors[-1]['b'][0], 10.0)
  self.assertNear(
      0.0, train._model.result_per_example_tensors[-1]['b'][0], 10.0)
def testControllerTrainerEvaler(self):
  trial = tf.test.mock.create_autospec(base_trial.Trial, instance=True)
  self._trial = trial

  logdir = os.path.join(tf.test.get_temp_dir(),
                        'controller_test' + str(random.random()))
  FLAGS.logdir = logdir
  cfg = self._GetTestConfig()

  trial.Name.return_value = 'trial1'

  def override_model_params(model_params):
    model_params.task.softmax.num_classes = 20
    model_params.task.filter_shapes = [(5, 5, 1, 10), (5, 5, 10, 50)]
    model_params.task.train.lr_schedule.decay_start = 100
    return model_params

  trial.OverrideModelParams.side_effect = override_model_params
  trial.ShouldStop.return_value = False
  trial.ShouldStopAndMaybeReport.return_value = False
  # Stop the trial once ReportEvalMeasure is called.
  trial.ReportEvalMeasure.return_value = True

  runners = [self._CreateController(cfg), self._CreateTrainer(cfg)]
  # Param override works.
  for runner in runners:
    self.assertEqual(runner.params.task.softmax.num_classes, 20)
    self.assertEqual(runner.params.task.filter_shapes,
                     [(5, 5, 1, 10), (5, 5, 10, 50)])
    self.assertEqual(runner.params.task.train.lr_schedule.decay_start, 100)

  runner_manager = trainer.RunnerManager(cfg.name)
  runner_manager.StartRunners(runners)

  # Controller and trainer check whether the trial is stopped.
  self.assertGreater(trial.OverrideModelParams.call_count, 0)
  self.assertGreater(trial.ShouldStop.call_count, 0)
  self.assertGreater(trial.ShouldStopAndMaybeReport.call_count, 0)
  # Controller and trainer do not report eval measures.
  self.assertEqual(trial.ReportEvalMeasure.call_count, 0)

  train_files = tf.io.gfile.glob(logdir + '/train/*')
  self.assertTrue(self._HasFile(train_files, 'params.txt'))
  self.assertTrue(self._HasFile(train_files, 'trainer_params.txt'))
  self.assertTrue(self._HasFile(train_files, 'ckpt'))
  self.assertTrue(self._HasFile(train_files, 'tfevents'))

  control_files = tf.io.gfile.glob(logdir + '/control/*')
  self.assertTrue(self._HasFile(control_files, 'params.txt'))
  self.assertTrue(self._HasFile(control_files, 'model_analysis.txt'))
  self.assertTrue(self._HasFile(control_files, 'train.pbtxt'))
  self.assertTrue(self._HasFile(control_files, 'tfevents'))

  # EvalerDev may not run concurrently with Controller in a single process
  # because EvalerDev loads checkpoints and overwrites states like global
  # steps.
  self._CreateEvalerDev(cfg).EvalLatestCheckpoint()
  # EvalerDev does not report eval measures; only the decoder does.
  after_eval_count = trial.ReportEvalMeasure.call_count
  self.assertEqual(after_eval_count, 0)

  # DecoderDev reports eval measures via ReportEvalMeasure.
  self._CreateDecoderDev(cfg).DecodeLatestCheckpoint()
  after_decoder_count = trial.ReportEvalMeasure.call_count
  self.assertGreater(after_decoder_count, 0)

  dev_files = tf.io.gfile.glob(logdir + '/eval_dev/*')
  self.assertTrue(self._HasFile(dev_files, 'params.txt'))
  self.assertTrue(self._HasFile(dev_files, 'eval_dev.pbtxt'))
  self.assertTrue(self._HasFile(dev_files, 'tfevents'))
  self.assertTrue(self._HasFile(dev_files, 'score'))
  self.assertTrue(
      self._HasLine(self._GetMatchedFileName(dev_files, 'score'), 'log_pplx'))
def testDecoder(self):
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'decoder_test' + str(random.random()))
  FLAGS.logdir = logdir
  dec_dir = os.path.join(logdir, 'decoder_dev')
  cfg = self._GetSimpleTestConfig()

  runner_manager = trainer.RunnerManager(cfg.name)
  runner_manager.StartRunners([
      self._CreateController(cfg),
      self._CreateTrainer(cfg),
  ])

  # Test decoding with default settings.
  with self.subTest(name='DefaultDecoder'):
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    self.assertTrue(self._HasFile(dec_files, 'params.txt'))
    self.assertTrue(self._HasFile(dec_files, 'decoder_dev.pbtxt'))
    self.assertTrue(self._HasFile(dec_files, 'tfevents'))
    self.assertTrue(self._HasFile(dec_files, 'processed_ckpts.txt'))
    # Only the score for the 2-step checkpoint should be present.
    self.assertFalse(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))
    self.assertTrue(
        self._HasLine(self._GetMatchedFileName(dec_files, 'score'),
                      'examples/sec'))

  # Test that checkpoints are not reevaluated when a job is interrupted.
  score_2_path = os.path.join(dec_dir, 'score-00000002.txt')
  score_2_mod_time = pathlib.Path(score_2_path).stat().st_mtime
  with self.subTest(name='DefaultDecoderNoOp'):
    cfg = self._GetSimpleTestConfig()
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    self.assertFalse(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertEqual(score_2_mod_time,
                     pathlib.Path(score_2_path).stat().st_mtime)

  # Test decoding a specific checkpoint.
  with self.subTest(name='LoadCheckpointFrom'):
    cfg = self._GetSimpleTestConfig()
    cfg.task.eval.load_checkpoint_from = os.path.join(logdir,
                                                      'train/ckpt-00000000')
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    # Scores for both checkpoints should be present...
    self.assertTrue(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))
    # ... but only the score for the 0-step checkpoint should be modified.
    self.assertEqual(score_2_mod_time,
                     pathlib.Path(score_2_path).stat().st_mtime)

  # Reset the decoder's cached state and test decoding all checkpoints.
  shutil.rmtree(dec_dir)
  with self.subTest(name='DecodeAllCheckpoints'):
    cfg = self._GetSimpleTestConfig()
    cfg.task.eval.decode_all_checkpoints = True
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))

  # Test that decode_all_checkpoints on an already decoded dir is a no-op.
  score_2_mod_time = pathlib.Path(score_2_path).stat().st_mtime
  with self.subTest(name='DecodeAllCheckpointsNoOp'):
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
    self.assertEqual(score_2_mod_time,
                     pathlib.Path(score_2_path).stat().st_mtime)
def testDecoder(self):
  logdir = os.path.join(tf.test.get_temp_dir(),
                        'decoder_test' + str(random.random()))
  FLAGS.logdir = logdir
  dec_dir = os.path.join(logdir, 'decoder_dev')
  cfg = self._GetSimpleTestConfig()

  runner_manager = trainer.RunnerManager(cfg.name)
  runner_manager.StartRunners([
      self._CreateController(cfg),
      self._CreateTrainer(cfg),
  ])

  # Test decoding with default settings.
  with self.subTest(name='DefaultDecoder'):
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    self.assertTrue(self._HasFile(dec_files, 'params.txt'))
    self.assertTrue(self._HasFile(dec_files, 'decoder_dev.pbtxt'))
    self.assertTrue(self._HasFile(dec_files, 'tfevents'))
    self.assertTrue(self._HasFile(dec_files, 'processed_ckpts.txt'))
    # Only the score for the 2-step checkpoint should be present.
    self.assertFalse(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))
    self.assertTrue(
        self._HasLine(self._GetMatchedFileName(dec_files, 'score'),
                      'examples/sec'))

  # Test that checkpoints are not reevaluated when a job is interrupted.
  score_2_path = os.path.join(dec_dir, 'score-00000002.txt')
  score_2_mod_time = pathlib.Path(score_2_path).stat().st_mtime
  with self.subTest(name='DefaultDecoderNoOp'):
    cfg = self._GetSimpleTestConfig()
    # base_runner uses os._exit to forcibly terminate the program after
    # encountering the ValueError we expect this to raise. Since it uses
    # os._exit instead of sys.exit, we cannot use
    # self.assertRaises(SystemExit) to prevent this termination. Instead, we
    # use a mock function to indirectly test that the ValueError is raised.
    # pylint: disable=protected-access
    _os_exit = os._exit  # pylint: disable=invalid-name
    os._exit = unittest.mock.MagicMock()
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
    self.assertTrue(os._exit.called)
    os._exit = _os_exit
    # pylint: enable=protected-access

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    self.assertFalse(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertEqual(score_2_mod_time,
                     pathlib.Path(score_2_path).stat().st_mtime)

  # Test decoding a specific checkpoint.
  with self.subTest(name='LoadCheckpointFrom'):
    cfg = self._GetSimpleTestConfig()
    cfg.task.eval.load_checkpoint_from = os.path.join(logdir,
                                                      'train/ckpt-00000000')
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    # Scores for both checkpoints should be present...
    self.assertTrue(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))
    # ... but only the score for the 0-step checkpoint should be modified.
    self.assertEqual(score_2_mod_time,
                     pathlib.Path(score_2_path).stat().st_mtime)

  # Reset the decoder's cached state and test decoding all checkpoints.
  shutil.rmtree(dec_dir)
  with self.subTest(name='DecodeAllCheckpoints'):
    cfg = self._GetSimpleTestConfig()
    cfg.task.eval.decode_all_checkpoints = True
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

    dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000000.txt'))
    self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))

  # Test that decode_all_checkpoints on an already decoded dir is a no-op.
  score_2_mod_time = pathlib.Path(score_2_path).stat().st_mtime
  with self.subTest(name='DecodeAllCheckpointsNoOp'):
    runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
    self.assertEqual(score_2_mod_time,
                     pathlib.Path(score_2_path).stat().st_mtime)
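

# NOTE: The tests above rely on small file/content helpers (self._HasFile,
# self._HasLine, self._GetMatchedFileName) that are defined elsewhere in the
# test class. The mixin below is a minimal sketch of what such helpers could
# look like, given purely for illustration of how the assertions read; it is
# an assumption, not the actual implementation used by these tests.
class _FileAssertionsMixinSketch:
  """Illustrative helpers matching the usage in the tests above."""

  def _HasFile(self, files, substr):
    # True if any path in `files` contains `substr` (e.g. 'ckpt', 'tfevents').
    return any(substr in os.path.basename(f) for f in files)

  def _GetMatchedFileName(self, files, substr):
    # Returns the first path whose basename contains `substr`, or None.
    for f in files:
      if substr in os.path.basename(f):
        return f
    return None

  def _HasLine(self, filename, substr):
    # True if any line of `filename` contains `substr` (e.g. 'log_pplx').
    with tf.io.gfile.GFile(filename, 'r') as fin:
      return any(substr in line for line in fin)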