def continuous_eval_on_train_data(self):
  """Evaluate on train data until checkpoints stop being produced."""
  for ckpt_path in next_checkpoint(self._hparams.model_dir,
                                   self._hparams.eval_timeout_mins):
    # Skip the zeroth step: there is nothing meaningful to evaluate yet.
    train_step = decoding.get_step_from_ckpt_path(ckpt_path)
    if train_step == 0:
      tf.logging.info("Skipping evaluation at step 0")
      continue
    self.evaluate_on_train_data()
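
# For reference, a minimal sketch of the `next_checkpoint` generator the
# loops here consume. This is an assumption about its contract, not
# necessarily the library's exact implementation (the `_sketch` name is
# ours): it polls `model_dir`, yields each newly written checkpoint path,
# and stops once no new checkpoint appears within `timeout_mins`
# (-1 is assumed to mean "wait indefinitely").
def next_checkpoint_sketch(model_dir, timeout_mins=240):
  """Yields successive checkpoint paths from model_dir until timeout."""
  last_ckpt = None
  timeout_secs = timeout_mins * 60 if timeout_mins != -1 else None
  while True:
    # Blocks until a checkpoint newer than last_ckpt exists, or times out.
    last_ckpt = tf.contrib.training.wait_for_new_checkpoint(
        model_dir, last_ckpt, seconds_to_sleep=60, timeout=timeout_secs)
    if last_ckpt is None:
      tf.logging.info(
          "Timeout: no new checkpoints within %d minutes" % timeout_mins)
      break
    yield last_ckpt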
def continuous_decode_on_eval_data(self):
  """Decode from the eval dataset on each new checkpoint."""
  if self._hparams.mlperf_mode:
    ckpt_generator = next_undecoded_checkpoint(
        self._hparams.model_dir, self._decode_hparams.decode_timeout_mins)
  else:
    ckpt_generator = next_checkpoint(
        self._hparams.model_dir, self._decode_hparams.decode_timeout_mins)

  for ckpt in ckpt_generator:
    current_step = decoding.get_step_from_ckpt_path(ckpt)
    tf.logging.info("Decoding step %d" % current_step)
    # Skip checkpoint 0.
    if current_step == 0:
      continue
    # Decode the latest checkpoint by default; in mlperf mode, pin the
    # decode to this specific checkpoint and record its step.
    checkpoint_path = None
    if self._hparams.mlperf_mode:
      self._decode_hparams.mlperf_decode_step = current_step
      checkpoint_path = ckpt

    mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
    self.decode(dataset_split=tf.estimator.ModeKeys.EVAL,
                checkpoint_path=checkpoint_path)
    # Stop decoding as soon as the mlperf target quality is reached.
    d_hparams = self._decode_hparams
    if self._hparams.mlperf_mode and d_hparams.mlperf_success:
      mlperf_log.transformer_print(
          key=mlperf_log.RUN_STOP, value={"success": "true"})
      break

  # The generator was exhausted (timeout) without reaching target quality.
  d_hparams = self._decode_hparams
  if self._hparams.mlperf_mode and not d_hparams.mlperf_success:
    mlperf_log.transformer_print(
        key=mlperf_log.RUN_STOP, value={"success": "false"})
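
# In mlperf mode the loop above uses `next_undecoded_checkpoint`, which
# differs from `next_checkpoint` in that it replays, in step order,
# checkpoints written while a previous decode was still running, rather
# than jumping straight to the newest one. A simplified sketch under that
# assumption; the `_sketch` name and the step-parsing scheme (paths like
# "model.ckpt-<step>") are illustrative, not confirmed from the source.
import os

def next_undecoded_checkpoint_sketch(model_dir, timeout_mins=240):
  """Yields checkpoint paths in step order, including backlogged ones."""
  last_ckpt = None
  last_step = 0
  while True:
    # Wait for something newer than the last checkpoint we handed out.
    last_ckpt = tf.contrib.training.wait_for_new_checkpoint(
        model_dir, last_ckpt, seconds_to_sleep=60, timeout=60 * timeout_mins)
    ckpt_state = tf.train.get_checkpoint_state(model_dir)
    # Among all checkpoints still on disk, pick the oldest not yet yielded.
    next_ckpt, next_step = None, float("inf")
    for ckpt in ckpt_state.all_model_checkpoint_paths:
      step = int(os.path.basename(ckpt).split("-")[1])
      if last_step < step < next_step:
        next_ckpt, next_step = ckpt, step
    if last_ckpt is None and next_ckpt is None:
      break  # Timed out with no backlog left to decode.
    if next_ckpt is not None:
      last_step, last_ckpt = next_step, next_ckpt
    yield last_ckpt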