Code example #1
    def __init__(self,
                 model,
                 dataset_config,
                 eval_config,
                 skip_evaluated_checkpoints=True,
                 eval_wait_interval=30,
                 do_kitti_native_eval=True):
        """Evaluator class for evaluating model's detection output.

        Args:
            model: An instance of DetectionModel
            dataset_config: Dataset protobuf configuration
            eval_config: Evaluation protobuf configuration
            skip_evaluated_checkpoints: (optional) If True, checks the
                evaluation results directory and, when a folder named after a
                checkpoint index already exists, assumes that checkpoint has
                been evaluated and skips it.
            eval_wait_interval: (optional) The number of seconds between
                looking for a new checkpoint.
            do_kitti_native_eval: (optional) Flag to enable running the KITTI
                native evaluation code.
        """

        # Get model configurations
        self.model = model
        self.dataset_config = dataset_config
        self.eval_config = eval_config

        self.model_config = model.model_config
        self.model_name = self.model_config.model_name
        self.full_model = isinstance(self.model, AvodModel)

        self.paths_config = self.model_config.paths_config
        self.checkpoint_dir = self.paths_config.checkpoint_dir

        self.skip_evaluated_checkpoints = skip_evaluated_checkpoints
        self.eval_wait_interval = eval_wait_interval

        self.do_kitti_native_eval = do_kitti_native_eval

        # Create a variable tensor to hold the global step
        self.global_step_tensor = tf.Variable(0,
                                              trainable=False,
                                              name='global_step')

        eval_mode = eval_config.eval_mode
        if eval_mode not in ['val', 'test']:
            raise ValueError('Evaluation mode can only be set to `val` '
                             'or `test`')

        if not os.path.exists(self.checkpoint_dir):
            raise ValueError(
                '{} must have at least one checkpoint entry.'.format(
                    self.checkpoint_dir))

        if self.do_kitti_native_eval:
            if self.eval_config.eval_mode == 'val':
                # Copy kitti native eval code into the predictions folder
                evaluator_utils.copy_kitti_native_code(
                    self.model_config.checkpoint_name)

        allow_gpu_mem_growth = self.eval_config.allow_gpu_mem_growth
        if allow_gpu_mem_growth:
            # GPU memory config
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = allow_gpu_mem_growth
            self._sess = tf.Session(config=config)
        else:
            self._sess = tf.Session()

        # The model should return a dictionary of predictions
        self._prediction_dict = self.model.build()
        if eval_mode == 'val':
            # Setup loss and summary writer in val mode only
            self._loss_dict, self._total_loss = \
                self.model.loss(self._prediction_dict)

            self.summary_writer, self.summary_merged = \
                evaluator_utils.set_up_summary_writer(self.model_config,
                                                      self._sess)

        else:
            self._loss_dict = None
            self._total_loss = None
            self.summary_writer = None
            self.summary_merged = None

        self._saver = tf.train.Saver()

        # Add maximum memory usage summary op
        # This op can only be run on a device with a GPU,
        # so it is skipped on Travis
        is_travis = 'TRAVIS' in os.environ
        if not is_travis:
            # tf 1.4
            # tf.summary.scalar('bytes_in_use',
            #                   tf.contrib.memory_stats.BytesInUse())
            tf.summary.scalar('max_bytes',
                              tf.contrib.memory_stats.MaxBytesInUse())
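
A minimal usage sketch for context, assuming the class above is named Evaluator; build_detection_model and the three config objects are hypothetical placeholders, not the project's actual builder API:

    # Hypothetical wiring; in the real project the model and the config
    # objects come from the repository's own config/model builders.
    model = build_detection_model(model_config, dataset_config)  # placeholder
    evaluator = Evaluator(model,
                          dataset_config,
                          eval_config,
                          skip_evaluated_checkpoints=True,
                          eval_wait_interval=30,
                          do_kitti_native_eval=True)
    evaluator.repeated_checkpoint_run()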
Code example #2
    def repeated_checkpoint_run(self):
        """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

        This function evaluates all the existing checkpoints as they are being
        generated. If there are none, it sleeps until new checkpoints become
        available. Since there is no synchronization guarantee for the trainer
        and evaluator, at each iteration it reloads all the checkpoints and
        searches for the last checkpoint to continue from. This is meant to be
        called in parallel to the trainer to evaluate the models regularly.

        Raises:
            ValueError: if model.checkpoint_dir doesn't have at least one
                element.
        """

        if not os.path.exists(self.checkpoint_dir):
            raise ValueError(
                '{} must have at least one checkpoint entry.'.format(
                    self.checkpoint_dir))

        # Copy kitti native eval code into the predictions folder
        if self.do_kitti_native_eval:
            evaluator_utils.copy_kitti_native_code(
                self.model_config.checkpoint_name)

        if self.skip_evaluated_checkpoints:
            already_evaluated_ckpts = self.get_evaluated_ckpts(
                self.model_config, self.full_model)
        else:
            already_evaluated_ckpts = []
        tf.logging.info('Starting evaluation at ' +
                        time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        last_checkpoint_id = -1
        number_of_evaluations = 0
        while True:
            # Load current checkpoints available
            trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
            num_checkpoints = len(self._saver.last_checkpoints)

            start = time.time()

            if number_of_evaluations >= num_checkpoints:
                tf.logging.info(
                    'No new checkpoints found in %s. '
                    'Will try again in %d seconds', self.checkpoint_dir,
                    self.eval_wait_interval)
            else:
                for ckpt_idx in range(num_checkpoints):
                    checkpoint_to_restore = \
                        self._saver.last_checkpoints[ckpt_idx]
                    ckpt_id = evaluator_utils.strip_checkpoint_id(
                        checkpoint_to_restore)

                    # Check if checkpoint has been evaluated already
                    already_evaluated = ckpt_id in already_evaluated_ckpts
                    if already_evaluated or ckpt_id <= last_checkpoint_id:
                        number_of_evaluations = max(
                            (ckpt_idx + 1, number_of_evaluations))
                        continue

                    self.run_checkpoint_once(checkpoint_to_restore)
                    number_of_evaluations += 1

                    # Save the id of the latest evaluated checkpoint
                    last_checkpoint_id = ckpt_id

            time_to_next_eval = start + self.eval_wait_interval - time.time()
            if time_to_next_eval > 0:
                time.sleep(time_to_next_eval)
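
The skip logic above depends on mapping a checkpoint path to a numeric id. A standalone sketch of that step, assuming TensorFlow-style paths ending in '-<global_step>' (this is an assumption about what evaluator_utils.strip_checkpoint_id does, not its actual implementation):

    def strip_checkpoint_id_sketch(checkpoint_path):
        # e.g. 'checkpoints/model.ckpt-12000' -> 12000
        return int(checkpoint_path.split('-')[-1])

    assert strip_checkpoint_id_sketch('checkpoints/model.ckpt-12000') == 12000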
Code example #3
    def repeated_checkpoint_run(self):
        """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

        This function evaluates all the existing checkpoints as they are being
        generated. If there are none, it sleeps until new checkpoints become
        available. Since there is no synchronization guarantee for the trainer
        and evaluator, at each iteration it reloads all the checkpoints and
        searches for the last checkpoint to continue from. This is meant to be
        called in parallel to the trainer to evaluate the models regularly.

        Raises:
            ValueError: if model.checkpoint_dir doesn't have at least one
                element.
        """

        if not os.path.exists(self.checkpoint_dir):
            raise ValueError(
                '{} must have at least one checkpoint entry.'.format(
                    self.checkpoint_dir))

        # Copy kitti native eval code into the predictions folder
        if self.do_kitti_native_eval:
            evaluator_utils.copy_kitti_native_code(
                self.model_config.checkpoint_name)

        if self.skip_evaluated_checkpoints:
            already_evaluated_ckpts = self.get_evaluated_ckpts(
                self.model_config)
        else:
            already_evaluated_ckpts = []
        tf.logging.info('Starting evaluation at ' +
                        time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        last_checkpoint_id = -1
        number_of_evaluations = 0
        # No per-sample inference summaries are needed during repeated
        # evaluation; only the average loss at each checkpoint step matters.
        # self.summary_merged = None
        evaluated_ckpts = list(already_evaluated_ckpts)
        while True:
            # Load current checkpoints available
            trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
            num_checkpoints = len(self._saver.last_checkpoints)
            no_newckpts = True
            evaluated_ckpts.sort()
            start = time.time()
            for ckpt_idx in range(num_checkpoints):
                checkpoint_to_restore = \
                    self._saver.last_checkpoints[ckpt_idx]
                ckpt_id = evaluator_utils.strip_checkpoint_id(
                    checkpoint_to_restore)

                # Skip the initial checkpoint (id 0) and checkpoints that
                # have already been evaluated
                if ckpt_id == 0 or ckpt_id in evaluated_ckpts:
                    continue
                no_newckpts = False
                print('evaluated ckpts: ', evaluated_ckpts)
                print('processing ckpt id: ', ckpt_id)
                self.run_checkpoint_once(checkpoint_to_restore)
                evaluated_ckpts.append(ckpt_id)
            time_to_next_eval = start + self.eval_wait_interval - time.time()
            if no_newckpts:
                tf.logging.info(
                    'No new checkpoints found in %s. '
                    'Will try again in %d seconds', self.checkpoint_dir,
                    self.eval_wait_interval)
            if time_to_next_eval > 0:
                time.sleep(time_to_next_eval)
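
Both variants share the same wait pattern: time one pass over the checkpoints, then sleep only for whatever remains of eval_wait_interval. A generic sketch of that pattern in isolation, where check_for_new_checkpoints is a hypothetical stand-in for the body of the loop above:

    import time

    def poll_forever(check_for_new_checkpoints, wait_interval=30):
        # Run one pass, then sleep for the remainder of the interval so that
        # successive passes start roughly every `wait_interval` seconds.
        while True:
            start = time.time()
            check_for_new_checkpoints()
            time_to_next_eval = start + wait_interval - time.time()
            if time_to_next_eval > 0:
                time.sleep(time_to_next_eval)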