def __init__(self,
             model,
             dataset_config,
             eval_config,
             skip_evaluated_checkpoints=True,
             eval_wait_interval=30,
             do_kitti_native_eval=True):
    """Evaluator class for evaluating model's detection output.

    Args:
        model: An instance of DetectionModel
        dataset_config: Dataset protobuf configuration
        eval_config: Evaluation protobuf configuration
        skip_evaluated_checkpoints: (optional) Enables checking the
            evaluation results directory; if a folder named with the
            checkpoint index already exists, that checkpoint is assumed
            to have been evaluated and is skipped.
        eval_wait_interval: (optional) The number of seconds to wait
            between looking for a new checkpoint.
        do_kitti_native_eval: (optional) Flag to enable running the
            kitti native eval code.
    """
    # Get model configurations
    self.model = model
    self.dataset_config = dataset_config
    self.eval_config = eval_config

    self.model_config = model.model_config
    self.model_name = self.model_config.model_name
    self.full_model = isinstance(self.model, AvodModel)

    self.paths_config = self.model_config.paths_config
    self.checkpoint_dir = self.paths_config.checkpoint_dir

    self.skip_evaluated_checkpoints = skip_evaluated_checkpoints
    self.eval_wait_interval = eval_wait_interval
    self.do_kitti_native_eval = do_kitti_native_eval

    # Create a variable tensor to hold the global step
    self.global_step_tensor = tf.Variable(
        0, trainable=False, name='global_step')

    eval_mode = eval_config.eval_mode
    if eval_mode not in ['val', 'test']:
        raise ValueError('Evaluation mode can only be set to `val` '
                         'or `test`')

    if not os.path.exists(self.checkpoint_dir):
        raise ValueError(
            '{} must have at least one checkpoint entry.'.format(
                self.checkpoint_dir))

    if self.do_kitti_native_eval:
        if self.eval_config.eval_mode == 'val':
            # Copy kitti native eval code into the predictions folder
            evaluator_utils.copy_kitti_native_code(
                self.model_config.checkpoint_name)

    allow_gpu_mem_growth = self.eval_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        self._sess = tf.Session(config=config)
    else:
        self._sess = tf.Session()

    # The model should return a dictionary of predictions
    self._prediction_dict = self.model.build()

    if eval_mode == 'val':
        # Set up the loss and summary writer in val mode only
        self._loss_dict, self._total_loss = \
            self.model.loss(self._prediction_dict)

        self.summary_writer, self.summary_merged = \
            evaluator_utils.set_up_summary_writer(self.model_config,
                                                  self._sess)
    else:
        self._loss_dict = None
        self._total_loss = None
        self.summary_writer = None
        self.summary_merged = None

    self._saver = tf.train.Saver()

    # Add a maximum memory usage summary op.
    # This op can only be run on a device with a GPU,
    # so it is skipped on Travis.
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf 1.4
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())
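# A minimal usage sketch (not part of this module): building the configs and
# model before constructing the evaluator, then running the repeated
# evaluation loop. The builder helpers, the pipeline config path, and the
# `Evaluator` class name follow the usual avod repo layout and are
# assumptions here; adapt them to your setup.
#
#     from avod.builders import config_builder_util as config_builder
#     from avod.builders.dataset_builder import DatasetBuilder
#
#     model_config, _, eval_config, dataset_config = \
#         config_builder.get_configs_from_pipeline_file(
#             pipeline_config_path, is_training=False)
#     dataset = DatasetBuilder.build_kitti_dataset(dataset_config)
#
#     with tf.Graph().as_default():
#         model = AvodModel(model_config,
#                           train_val_test=eval_config.eval_mode,
#                           dataset=dataset)
#         evaluator = Evaluator(model, dataset_config, eval_config)
#         evaluator.repeated_checkpoint_run()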
def repeated_checkpoint_run(self):
    """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

    This function evaluates all the existing checkpoints as they are
    being generated. If there are none, it sleeps until new checkpoints
    become available. Since there is no synchronization guarantee for
    the trainer and evaluator, at each iteration it reloads all the
    checkpoints and searches for the last checkpoint to continue from.
    This is meant to be called in parallel to the trainer to evaluate
    the models regularly.

    Raises:
        ValueError: if model.checkpoint_dir doesn't have at least one
            element.
    """
    if not os.path.exists(self.checkpoint_dir):
        raise ValueError(
            '{} must have at least one checkpoint entry.'.format(
                self.checkpoint_dir))

    # Copy kitti native eval code into the predictions folder
    if self.do_kitti_native_eval:
        evaluator_utils.copy_kitti_native_code(
            self.model_config.checkpoint_name)

    if self.skip_evaluated_checkpoints:
        already_evaluated_ckpts = self.get_evaluated_ckpts(
            self.model_config, self.full_model)
    else:
        already_evaluated_ckpts = []

    tf.logging.info('Starting evaluation at ' +
                    time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    # Per-sample inference summaries are not needed during repeated
    # evaluation; only the average loss at each checkpoint step matters.
    evaluated_ckpts = list(already_evaluated_ckpts)

    while True:
        # Load the checkpoints currently available
        trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
        num_checkpoints = len(self._saver.last_checkpoints)

        no_new_ckpts = True
        start = time.time()

        for ckpt_idx in range(num_checkpoints):
            checkpoint_to_restore = \
                self._saver.last_checkpoints[ckpt_idx]
            ckpt_id = evaluator_utils.strip_checkpoint_id(
                checkpoint_to_restore)

            # Skip the initial checkpoint and any checkpoint
            # that has already been evaluated
            if ckpt_id == 0 or ckpt_id in evaluated_ckpts:
                continue

            no_new_ckpts = False
            tf.logging.info('Evaluating checkpoint %d', ckpt_id)
            self.run_checkpoint_once(checkpoint_to_restore)
            evaluated_ckpts.append(ckpt_id)

        if no_new_ckpts:
            tf.logging.info(
                'No new checkpoints found in %s. '
                'Will try again in %d seconds',
                self.checkpoint_dir, self.eval_wait_interval)

        time_to_next_eval = start + self.eval_wait_interval - time.time()
        if time_to_next_eval > 0:
            time.sleep(time_to_next_eval)
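# The loop above relies on evaluator_utils.strip_checkpoint_id() to map a
# checkpoint path to its integer global step. A minimal sketch of the
# assumed behavior (the real helper lives in evaluator_utils; this is an
# illustrative assumption, not the actual implementation):
#
#     def strip_checkpoint_id(checkpoint_path):
#         """'.../checkpoint_name-00120000' -> 120000"""
#         checkpoint_name = checkpoint_path.split('/')[-1]
#         return int(checkpoint_name.split('-')[-1])
#
# With ids as plain integers, `ckpt_id == 0` skips the untrained initial
# checkpoint, and the `ckpt_id in evaluated_ckpts` membership test ensures
# each checkpoint is evaluated exactly once across loop iterations.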