Example #1
0
    def testController(self):
        """Runs Controller+Trainer, then EvalerDev, and checks their outputs."""
        logdir = os.path.join(tf.test.get_temp_dir(),
                              'controller_test' + str(random.random()))
        FLAGS.logdir = logdir
        cfg = self._GetTestConfig()

        runner_manager = trainer.RunnerManager(cfg.name)

        # Controller and trainer run together in a single process.
        runner_manager.StartRunners(
            [self._CreateController(cfg),
             self._CreateTrainer(cfg)])

        # The trainer directory should contain checkpoints and event files.
        train_files = tf.io.gfile.glob(logdir + '/train/*')
        for fragment in ('ckpt', 'tfevents'):
            self.assertTrue(self._HasFile(train_files, fragment))
        # The controller directory should contain params, analysis, graph,
        # and event files.
        control_files = tf.io.gfile.glob(logdir + '/control/*')
        for fragment in ('params.txt', 'model_analysis.txt', 'train.pbtxt',
                         'tfevents'):
            self.assertTrue(self._HasFile(control_files, fragment))

        # EvalerDev may not run concurrently with Controller in a single process
        # because EvalerDev loads checkpoints and overwrites states like global
        # steps.
        runner_manager.StartRunners([self._CreateEvalerDev(cfg)])

        dev_files = tf.io.gfile.glob(logdir + '/eval_dev/*')
        for fragment in ('params.txt', 'eval_dev.pbtxt', 'tfevents', 'score'):
            self.assertTrue(self._HasFile(dev_files, fragment))
        # The score file should report perplexity.
        self.assertTrue(
            self._HasLine(self._GetMatchedFileName(dev_files, 'score'),
                          'log_pplx'))
Example #2
0
 def testWriteInferenceGraph(self):
     """Checks that CPU and TPU inference graphs are written for RNMT."""
     random.seed()
     # Randomized subdirectory so repeated runs do not collide.
     logdir = os.path.join(tf.test.get_temp_dir(),
                           'inference_graphs' + str(random.random()))
     FLAGS.logdir = logdir
     trainer.RunnerManager(
         'punctuator.codelab.RNMTModel').WriteInferenceGraph()
     graph_files = tf.io.gfile.glob(logdir + '/inference_graphs/*')
     for expected in ('inference.pbtxt', 'inference_tpu.pbtxt'):
         self.assertTrue(self._HasFile(graph_files, expected))
Example #3
0
 def testWriteInferenceGraph(self):
     """Writes inference graphs for the 1B-words LM and checks the outputs.

     Verifies that both the CPU (`inference.pbtxt`) and TPU
     (`inference_tpu.pbtxt`) graphs are produced under
     `<logdir>/inference_graphs/`.
     """
     random.seed()
     logdir = os.path.join(tf.test.get_temp_dir(),
                           'inference_graphs' + str(random.random()))
     FLAGS.logdir = logdir
     cfg = 'lm.one_billion_wds.WordLevelOneBwdsSimpleSampledSoftmax'
     trainer.RunnerManager(cfg).WriteInferenceGraph()
     # Use tf.io.gfile.glob (TF2 API) instead of the deprecated TF1 alias
     # tf.gfile.Glob, matching the sibling tests in this file.
     inference_files = tf.io.gfile.glob(logdir + '/inference_graphs/*')
     self.assertTrue(self._HasFile(inference_files, 'inference.pbtxt'))
     self.assertTrue(self._HasFile(inference_files, 'inference_tpu.pbtxt'))
Example #4
0
 def testWriteMultiTaskInferenceGraph(self):
   """Checks per-task CPU and TPU inference graphs for a multi-task model."""
   random.seed()
   logdir = os.path.join(tf.test.get_temp_dir(),
                         'inference_graphs' + str(random.random()))
   FLAGS.logdir = logdir
   trainer.RunnerManager('test.EmptyMultiTaskParams').WriteInferenceGraph()
   graph_files = tf.io.gfile.glob(logdir + '/inference_graphs/*')
   # Each task ('a' and 'b') gets its own CPU and TPU graph file.
   for task in ('a', 'b'):
     self.assertTrue(self._HasFile(graph_files, task + '_inference.pbtxt'))
     self.assertTrue(
         self._HasFile(graph_files, task + '_inference_tpu.pbtxt'))
Example #5
0
    def testDecoder(self):
        """Trains, runs DecoderDev, and verifies which score files appear."""
        logdir = os.path.join(tf.test.get_temp_dir(),
                              'decoder_test' + str(random.random()))
        FLAGS.logdir = logdir
        cfg = self._GetTestConfig()

        runner_manager = trainer.RunnerManager(cfg.name)

        # Train first so checkpoints exist, then decode the latest one.
        runner_manager.StartRunners(
            [self._CreateController(cfg),
             self._CreateTrainer(cfg)])
        runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

        decoder_files = tf.io.gfile.glob(logdir + '/decoder_dev/*')
        for fragment in ('params.txt', 'decoder_dev.pbtxt', 'tfevents'):
            self.assertTrue(self._HasFile(decoder_files, fragment))
        # Only the score for the 2-step checkpoint should be present.
        self.assertTrue(
            tf.io.gfile.exists(
                os.path.join(logdir, 'decoder_dev/score-00000002.txt')))
        self.assertFalse(
            tf.io.gfile.exists(
                os.path.join(logdir, 'decoder_dev/score-00000000.txt')))
        self.assertTrue(
            self._HasLine(self._GetMatchedFileName(decoder_files, 'score'),
                          'examples/sec'))

        # Test customization of an eval checkpoint: decode into a fresh
        # logdir, but point the eval checkpoint at the 0th checkpoint of the
        # run above.
        new_logdir = os.path.join(tf.test.get_temp_dir(),
                                  'decoder_test' + str(random.random()))
        FLAGS.logdir = new_logdir
        cfg = self._GetTestConfig()
        cfg.task.eval.load_checkpoint_from = os.path.join(
            logdir, 'train/ckpt-00000000')

        runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
        # This time only the 0th checkpoint's score should be present.
        self.assertTrue(
            tf.io.gfile.exists(
                os.path.join(new_logdir, 'decoder_dev/score-00000000.txt')))
        self.assertFalse(
            tf.io.gfile.exists(
                os.path.join(new_logdir, 'decoder_dev/score-00000002.txt')))
Example #6
0
    def testDecoder(self):
        """Trains a model, runs DecoderDev, and checks its output files.

        Verifies that the decoder directory contains params, graph, event,
        and score files, and that the score file reports throughput
        ('examples/sec').
        """
        logdir = os.path.join(tf.test.get_temp_dir(),
                              'decoder_test' + str(random.random()))
        FLAGS.logdir = logdir
        cfg = self._GetTestConfig()

        runner_manager = trainer.RunnerManager(cfg.name)

        runner_manager.StartRunners(
            [self._CreateController(cfg),
             self._CreateTrainer(cfg)])
        runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

        # Use tf.io.gfile.glob (TF2 API) instead of the deprecated TF1 alias
        # tf.gfile.Glob, matching the sibling tests in this file.
        dec_files = tf.io.gfile.glob(logdir + '/decoder_dev/*')
        self.assertTrue(self._HasFile(dec_files, 'params.txt'))
        self.assertTrue(self._HasFile(dec_files, 'decoder_dev.pbtxt'))
        self.assertTrue(self._HasFile(dec_files, 'tfevents'))
        self.assertTrue(self._HasFile(dec_files, 'score'))
        self.assertTrue(
            self._HasLine(self._GetMatchedFileName(dec_files, 'score'),
                          'examples/sec'))
Example #7
0
    def testIdentityRegressionModel(self):
        """End-to-end sync training of the identity-regression toy model."""
        logdir = os.path.join(
            tf.test.get_temp_dir(),
            'identity_regression_test' + str(random.random()))
        FLAGS.logdir = logdir

        num_steps = 100
        cfg = trainer_test_utils.IdentityRegressionModel.Params()
        # Single-replica, CPU-only, synchronous trainer_client setup.
        cfg.cluster.task = 0
        cfg.cluster.mode = 'sync'
        cfg.cluster.job = 'trainer_client'
        cfg.cluster.worker.name = '/job:localhost'
        cfg.cluster.worker.replicas = 1
        cfg.cluster.worker.gpus_per_replica = 0
        cfg.cluster.ps.name = '/job:localhost'
        cfg.cluster.ps.replicas = 1
        cfg.cluster.ps.gpus_per_replica = 0
        cfg.train.max_steps = num_steps
        cfg.task.train.learning_rate = 0.025

        runners = [self._CreateController(cfg), self._CreateTrainer(cfg)]

        runner_manager = trainer.RunnerManager(cfg.name)
        runner_manager.StartRunners(runners)
        trainer_runner = runners[1]

        # ProcessFPropResults should have been called <num_steps> times on the
        # task and <num_steps> times on the model; every batch holds exactly
        # 2 samples.
        expected_batches = [(2, 1.0)] * num_steps
        self.assertAllEqual(
            expected_batches,
            [m['num_samples_in_batch']
             for m in trainer_runner._model.metrics])
        self.assertAllEqual(
            expected_batches,
            [m['num_samples_in_batch']
             for m in trainer_runner._model._task.metrics])

        # The global step advances by exactly 1 per batch.
        expected_steps = list(range(1, num_steps + 1))
        self.assertAllEqual(expected_steps,
                            trainer_runner._model.global_steps)
        self.assertAllEqual(expected_steps,
                            trainer_runner._model._task.global_steps)

        # CountingInputGenerator emits [2, 2] inputs that count upward batch
        # by batch:
        #   [[0, 1], [2, 3]],
        #   [[4, 5], [6, 7]],
        #   ...
        expected_inputs = [{
            'input':
            np.array([[4 * i, 4 * i + 1], [4 * i + 2, 4 * i + 3]])
        } for i in range(num_steps)]

        def keep_input_tensors(tensors):
            # Keep only the 'input' entry of each dict for comparison.
            return [{'input': d['input']} for d in tensors]

        self.assertAllClose(
            expected_inputs,
            keep_input_tensors(
                trainer_runner._model.result_per_example_tensors))
        self.assertAllClose(
            expected_inputs,
            keep_input_tensors(
                trainer_runner._model._task.result_per_example_tensors))

        # The model trains parameters m and b such that:
        #    m * (input[0] + input[1]) + b = (input[0] + input[1])
        # so after training we expect m ~= 1 and b ~= 0.
        last_task = trainer_runner._model._task.result_per_example_tensors[-1]
        last_model = trainer_runner._model.result_per_example_tensors[-1]

        # m converges reliably, so it gets the tight tolerance.
        self.assertNear(1.0, last_task['m'][0], 0.1)
        self.assertNear(1.0, last_model['m'][0], 0.1)

        # b is noisier; just check it stays within a sane range.
        self.assertNear(0.0, last_task['b'][0], 10.0)
        self.assertNear(0.0, last_model['b'][0], 10.0)
Example #8
0
    def testControllerTrainerEvaler(self):
        """Exercises trial plumbing through controller, trainer, and evalers."""
        mock_trial = tf.test.mock.create_autospec(base_trial.Trial,
                                                  instance=True)
        self._trial = mock_trial

        logdir = os.path.join(tf.test.get_temp_dir(),
                              'controller_test' + str(random.random()))
        FLAGS.logdir = logdir
        cfg = self._GetTestConfig()

        mock_trial.Name.return_value = 'trial1'

        def override_model_params(model_params):
            # Simulate a trial overriding a few hyperparameters.
            model_params.task.softmax.num_classes = 20
            model_params.task.filter_shapes = [(5, 5, 1, 10), (5, 5, 10, 50)]
            model_params.task.train.lr_schedule.decay_start = 100
            return model_params

        mock_trial.OverrideModelParams.side_effect = override_model_params
        mock_trial.ShouldStop.return_value = False
        mock_trial.ShouldStopAndMaybeReport.return_value = False
        # Stop trial once ReportEvalMeasure is called.
        mock_trial.ReportEvalMeasure.return_value = True

        runners = [self._CreateController(cfg), self._CreateTrainer(cfg)]
        # The overridden params must be visible on every runner.
        for runner in runners:
            self.assertEqual(runner.params.task.softmax.num_classes, 20)
            self.assertEqual(runner.params.task.filter_shapes,
                             [(5, 5, 1, 10), (5, 5, 10, 50)])
            self.assertEqual(runner.params.task.train.lr_schedule.decay_start,
                             100)

        runner_manager = trainer.RunnerManager(cfg.name)
        runner_manager.StartRunners(runners)
        # Controller and trainer check whether the trial is stopped.
        self.assertGreater(mock_trial.OverrideModelParams.call_count, 0)
        self.assertGreater(mock_trial.ShouldStop.call_count, 0)
        self.assertGreater(mock_trial.ShouldStopAndMaybeReport.call_count, 0)
        # Controller and trainer do not call report_measure,
        # request_trial_stop, or report_done.
        self.assertEqual(mock_trial.ReportEvalMeasure.call_count, 0)

        train_files = tf.io.gfile.glob(logdir + '/train/*')
        for fragment in ('params.txt', 'trainer_params.txt', 'ckpt',
                         'tfevents'):
            self.assertTrue(self._HasFile(train_files, fragment))
        control_files = tf.io.gfile.glob(logdir + '/control/*')
        for fragment in ('params.txt', 'model_analysis.txt', 'train.pbtxt',
                         'tfevents'):
            self.assertTrue(self._HasFile(control_files, fragment))

        # EvalerDev may not run concurrently with Controller in a single process
        # because EvalerDev loads checkpoints and overwrites states like global
        # steps.
        self._CreateEvalerDev(cfg).EvalLatestCheckpoint()
        # The evaler does not report eval measures.
        self.assertEqual(mock_trial.ReportEvalMeasure.call_count, 0)

        self._CreateDecoderDev(cfg).DecodeLatestCheckpoint()
        # Only the decoder reports eval measures.
        self.assertGreater(mock_trial.ReportEvalMeasure.call_count, 0)

        dev_files = tf.io.gfile.glob(logdir + '/eval_dev/*')
        for fragment in ('params.txt', 'eval_dev.pbtxt', 'tfevents', 'score'):
            self.assertTrue(self._HasFile(dev_files, fragment))
        self.assertTrue(
            self._HasLine(self._GetMatchedFileName(dev_files, 'score'),
                          'log_pplx'))
Example #9
0
    def testDecoder(self):
        """Covers default decoding, restart no-op, explicit ckpt, decode-all."""
        logdir = os.path.join(tf.test.get_temp_dir(),
                              'decoder_test' + str(random.random()))
        FLAGS.logdir = logdir
        dec_dir = os.path.join(logdir, 'decoder_dev')
        cfg = self._GetSimpleTestConfig()
        runner_manager = trainer.RunnerManager(cfg.name)
        runner_manager.StartRunners([
            self._CreateController(cfg),
            self._CreateTrainer(cfg),
        ])

        def list_decoder_outputs():
            # Everything the decoder has written so far.
            return tf.io.gfile.glob(os.path.join(dec_dir, '*'))

        # Decoding with default settings.
        with self.subTest(name='DefaultDecoder'):
            runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
            outputs = list_decoder_outputs()
            for fragment in ('params.txt', 'decoder_dev.pbtxt', 'tfevents',
                             'processed_ckpts.txt'):
                self.assertTrue(self._HasFile(outputs, fragment))
            # Only the score for the 2-step checkpoint should be present.
            self.assertFalse(self._HasFile(outputs, 'score-00000000.txt'))
            self.assertTrue(self._HasFile(outputs, 'score-00000002.txt'))
            self.assertTrue(
                self._HasLine(self._GetMatchedFileName(outputs, 'score'),
                              'examples/sec'))

        # Checkpoints must not be reevaluated when a job is interrupted.
        score_2_path = os.path.join(dec_dir, 'score-00000002.txt')
        score_2_mtime = pathlib.Path(score_2_path).stat().st_mtime
        with self.subTest(name='DefaultDecoderNoOp'):
            cfg = self._GetSimpleTestConfig()
            runner_manager.StartRunners([self._CreateDecoderDev(cfg)])

            outputs = list_decoder_outputs()
            self.assertFalse(self._HasFile(outputs, 'score-00000000.txt'))
            self.assertEqual(score_2_mtime,
                             pathlib.Path(score_2_path).stat().st_mtime)

        # Decoding a specific checkpoint.
        with self.subTest(name='LoadCheckpointFrom'):
            cfg = self._GetSimpleTestConfig()
            cfg.task.eval.load_checkpoint_from = os.path.join(
                logdir, 'train/ckpt-00000000')
            runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
            outputs = list_decoder_outputs()

            # Scores for both checkpoints should be present...
            self.assertTrue(self._HasFile(outputs, 'score-00000000.txt'))
            self.assertTrue(self._HasFile(outputs, 'score-00000002.txt'))
            # ... but only the score for the 0-step checkpoint was written.
            self.assertEqual(score_2_mtime,
                             pathlib.Path(score_2_path).stat().st_mtime)

        # Reset the decoder's cached state and decode every checkpoint.
        shutil.rmtree(dec_dir)
        with self.subTest(name='DecodeAllCheckpoints'):
            cfg = self._GetSimpleTestConfig()
            cfg.task.eval.decode_all_checkpoints = True
            runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
            outputs = list_decoder_outputs()
            self.assertTrue(self._HasFile(outputs, 'score-00000000.txt'))
            self.assertTrue(self._HasFile(outputs, 'score-00000002.txt'))

        # decode_all_checkpoints on an already decoded dir is a no-op.
        score_2_mtime = pathlib.Path(score_2_path).stat().st_mtime
        with self.subTest(name='DecodeAllCheckpointsNoOp'):
            runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
            self.assertEqual(score_2_mtime,
                             pathlib.Path(score_2_path).stat().st_mtime)
Example #10
0
  def testDecoder(self):
    """Trains briefly, then exercises the decoder in several configurations.

    Subtests:
      DefaultDecoder: decodes the latest checkpoint only.
      DefaultDecoderNoOp: re-running the decoder redoes no work; this variant
        is also expected to hit os._exit via base_runner (see inline comment).
      LoadCheckpointFrom: decodes one explicitly named checkpoint.
      DecodeAllCheckpoints: decodes every checkpoint in the train dir.
      DecodeAllCheckpointsNoOp: decode-all on a finished dir changes nothing.
    """
    logdir = os.path.join(tf.test.get_temp_dir(),
                          'decoder_test' + str(random.random()))
    FLAGS.logdir = logdir
    dec_dir = os.path.join(logdir, 'decoder_dev')
    cfg = self._GetSimpleTestConfig()
    runner_manager = trainer.RunnerManager(cfg.name)
    # Produce the checkpoints (the score files asserted below imply steps 0
    # and 2) that the decoder subtests consume.
    runner_manager.StartRunners([
        self._CreateController(cfg),
        self._CreateTrainer(cfg),
    ])

    # Test decoding with default settings.
    with self.subTest(name='DefaultDecoder'):
      runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
      dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
      self.assertTrue(self._HasFile(dec_files, 'params.txt'))
      self.assertTrue(self._HasFile(dec_files, 'decoder_dev.pbtxt'))
      self.assertTrue(self._HasFile(dec_files, 'tfevents'))
      self.assertTrue(self._HasFile(dec_files, 'processed_ckpts.txt'))
      # Only the score for the 2-step checkpoint should be present.
      self.assertFalse(self._HasFile(dec_files, 'score-00000000.txt'))
      self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))
      self.assertTrue(
          self._HasLine(
              self._GetMatchedFileName(dec_files, 'score'), 'examples/sec'))

    # Test that checkpoints are not reevaluated when a job is interrupted.
    # Capture the mtime now so later subtests can assert the score file was
    # not rewritten.
    score_2_path = os.path.join(dec_dir, 'score-00000002.txt')
    score_2_mod_time = pathlib.Path(score_2_path).stat().st_mtime
    with self.subTest(name='DefaultDecoderNoOp'):
      cfg = self._GetSimpleTestConfig()
      # base_runner uses os._exit to forcibly terminate the program after
      # encountering the ValueError we expect this to raise. Since it uses
      # os._exit instead of sys.exit, we cannot use
      # self.assertRaises(SystemExit) to prevent this termination. Instead, we
      # use a mock function to indirectly test that the ValueError is raised.
      # pylint: disable=protected-access
      _os_exit = os._exit  # pylint: disable=invalid-name
      os._exit = unittest.mock.MagicMock()
      runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
      self.assertTrue(os._exit.called)
      # NOTE: os._exit is restored only on success; an assertion failure above
      # would leave the mock installed for the rest of the process.
      os._exit = _os_exit
      # pylint: enable=protected-access

      dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
      self.assertFalse(self._HasFile(dec_files, 'score-00000000.txt'))
      self.assertEqual(score_2_mod_time,
                       pathlib.Path(score_2_path).stat().st_mtime)

    # Test decoding a specific checkpoint.
    with self.subTest(name='LoadCheckpointFrom'):
      cfg = self._GetSimpleTestConfig()
      cfg.task.eval.load_checkpoint_from = os.path.join(logdir,
                                                        'train/ckpt-00000000')
      runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
      dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))

      # Scores for both checkpoints should be present...
      self.assertTrue(self._HasFile(dec_files, 'score-00000000.txt'))
      self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))
      # ... but only the score for the 0-step checkpoint should be modified.
      self.assertEqual(score_2_mod_time,
                       pathlib.Path(score_2_path).stat().st_mtime)

    # Reset the decoder's cached state and test decoding all checkpoints.
    shutil.rmtree(dec_dir)
    with self.subTest(name='DecodeAllCheckpoints'):
      cfg = self._GetSimpleTestConfig()
      cfg.task.eval.decode_all_checkpoints = True
      runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
      dec_files = tf.io.gfile.glob(os.path.join(dec_dir, '*'))
      self.assertTrue(self._HasFile(dec_files, 'score-00000000.txt'))
      self.assertTrue(self._HasFile(dec_files, 'score-00000002.txt'))

    # Test that decode_all_checkpoints on an already decoded dir is a no-op.
    score_2_mod_time = pathlib.Path(score_2_path).stat().st_mtime
    with self.subTest(name='DecodeAllCheckpointsNoOp'):
      runner_manager.StartRunners([self._CreateDecoderDev(cfg)])
      self.assertEqual(score_2_mod_time,
                       pathlib.Path(score_2_path).stat().st_mtime)