Example #1
    def run_latest_checkpoints(self):
        """Evaluation function for evaluating all the existing checkpoints.
        This function just runs through all the existing checkpoints.

        Raises:
            ValueError: if model.checkpoint_dir doesn't have at least one
                element.
        """

        if not os.path.exists(self.checkpoint_dir):
            raise ValueError(
                '{} must have at least one checkpoint entry.'.format(
                    self.checkpoint_dir))

        # Load the latest checkpoints available
        trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)

        num_checkpoints = len(self._saver.last_checkpoints)

        if self.skip_evaluated_checkpoints:
            already_evaluated_ckpts = self.get_evaluated_ckpts(
                self.model_config, self.model_name)
        else:
            # Avoid a NameError in the loop below when skipping is disabled
            already_evaluated_ckpts = []

        # Check the raw config value before converting: np.asarray(None)
        # returns a 0-d object array, which is never None, so converting
        # first would make this guard always pass.
        ckpt_indices = self.eval_config.ckpt_indices
        if ckpt_indices is not None:
            ckpt_indices = np.asarray(ckpt_indices)
            if ckpt_indices[0] == -1:
                # Restore only the most recent checkpoint
                ckpt_indices = [num_checkpoints - 1]
            for ckpt_idx in ckpt_indices:
                checkpoint_to_restore = self._saver.last_checkpoints[ckpt_idx]
                self.run_checkpoint_once(checkpoint_to_restore)

        else:
            last_checkpoint_id = -1
            number_of_evaluations = 0
            # go through all existing checkpoints
            for ckpt_idx in range(num_checkpoints):
                checkpoint_to_restore = self._saver.last_checkpoints[ckpt_idx]
                ckpt_id = evaluator_utils.strip_checkpoint_id(
                    checkpoint_to_restore)

                # Check if checkpoint has been evaluated already
                already_evaluated = ckpt_id in already_evaluated_ckpts
                if already_evaluated or ckpt_id <= last_checkpoint_id:
                    number_of_evaluations = max(
                        (ckpt_idx + 1, number_of_evaluations))
                    continue

                self.run_checkpoint_once(checkpoint_to_restore)
                number_of_evaluations += 1

                # Save the id of the latest evaluated checkpoint
                last_checkpoint_id = ckpt_id
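
A note on the ckpt_indices handling above: numpy's asarray never returns
None, which is why the fixed code checks the config value before the
conversion. A minimal standalone sketch of the pitfall:

    import numpy as np

    # np.asarray(None) yields a 0-d object array, not None, so a guard of
    # the form `if np.asarray(x) is not None` can never fail.
    print(np.asarray(None) is not None)  # True
    print(np.asarray(None).shape)        # ()
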
Example #2
    def repeated_checkpoint_run(self):
        """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

        This function evaluates all the existing checkpoints as they are being
        generated. If there are none, it sleeps until new checkpoints become
        available. Since there is no synchronization guarantee for the trainer
        and evaluator, at each iteration it reloads all the checkpoints and
        searches for the last checkpoint to continue from. This is meant to be
        called in parallel to the trainer to evaluate the models regularly.

        Raises:
            ValueError: if model.checkpoint_dir doesn't have at least one
                element.
        """

        if not os.path.exists(self.checkpoint_dir):
            raise ValueError(
                '{} must have at least one checkpoint entry.'.format(
                    self.checkpoint_dir))

        # Copy kitti native eval code into the predictions folder
        if self.do_kitti_native_eval:
            evaluator_utils.copy_kitti_native_code(
                self.model_config.checkpoint_name)

        if self.skip_evaluated_checkpoints:
            already_evaluated_ckpts = self.get_evaluated_ckpts(
                self.model_config, self.full_model)
        else:
            # Avoid a NameError in the loop below when skipping is disabled
            already_evaluated_ckpts = []
        tf.logging.info('Starting evaluation at ' +
                        time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        last_checkpoint_id = -1
        number_of_evaluations = 0
        while True:
            # Load current checkpoints available
            trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
            num_checkpoints = len(self._saver.last_checkpoints)

            start = time.time()

            if number_of_evaluations >= num_checkpoints:
                tf.logging.info(
                    'No new checkpoints found in %s. '
                    'Will try again in %d seconds.', self.checkpoint_dir,
                    self.eval_wait_interval)
            else:
                for ckpt_idx in range(num_checkpoints):
                    checkpoint_to_restore = \
                        self._saver.last_checkpoints[ckpt_idx]
                    ckpt_id = evaluator_utils.strip_checkpoint_id(
                        checkpoint_to_restore)

                    # Check if checkpoint has been evaluated already
                    already_evaluated = ckpt_id in already_evaluated_ckpts
                    if already_evaluated or ckpt_id <= last_checkpoint_id:
                        number_of_evaluations = max(
                            (ckpt_idx + 1, number_of_evaluations))
                        continue

                    self.run_checkpoint_once(checkpoint_to_restore)
                    number_of_evaluations += 1

                    # Save the id of the latest evaluated checkpoint
                    last_checkpoint_id = ckpt_id

            time_to_next_eval = start + self.eval_wait_interval - time.time()
            if time_to_next_eval > 0:
                time.sleep(time_to_next_eval)
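
The timing at the end of the loop above implements fixed-rate polling: the
sleep is shortened by however long the evaluation pass took, so iterations
start roughly every eval_wait_interval seconds. A minimal sketch of the same
pattern, where poll_and_evaluate is a hypothetical stand-in for the
checkpoint scan:

    import time

    def poll_forever(poll_and_evaluate, wait_interval=60.0):
        """Run poll_and_evaluate roughly every wait_interval seconds."""
        while True:
            start = time.time()
            poll_and_evaluate()
            # Sleep only for whatever remains of the interval, if anything
            time_to_next = start + wait_interval - time.time()
            if time_to_next > 0:
                time.sleep(time_to_next)
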
Example #3
    def repeated_checkpoint_run(self):
        """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

        This function evaluates all the existing checkpoints as they are being
        generated. If there are none, it sleeps until new checkpoints become
        available. Since there is no synchronization guarantee for the trainer
        and evaluator, at each iteration it reloads all the checkpoints and
        searches for the last checkpoint to continue from. This is meant to be
        called in parallel to the trainer to evaluate the models regularly.

        Raises:
            ValueError: if model.checkpoint_dir doesn't have at least one
                element.
        """

        if not os.path.exists(self.checkpoint_dir):
            raise ValueError(
                '{} must have at least one checkpoint entry.'.format(
                    self.checkpoint_dir))

        # Copy kitti native eval code into the predictions folder
        if self.do_kitti_native_eval:
            evaluator_utils.copy_kitti_native_code(
                self.model_config.checkpoint_name)

        if self.skip_evaluated_checkpoints:
            already_evaluated_ckpts = self.get_evaluated_ckpts(
                self.model_config)
        else:
            already_evaluated_ckpts = []
        tf.logging.info('Starting evaluation at ' +
                        time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        last_checkpoint_id = -1
        number_of_evaluations = 0
        # Per-sample inference summaries are not needed during repeated
        # evaluation; only the average loss at each checkpoint step matters.
        # self.summary_merged = None
        evaluated_ckpts = list(already_evaluated_ckpts)
        while True:
            # Load current checkpoints available
            trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
            num_checkpoints = len(self._saver.last_checkpoints)
            no_newckpts = True
            evaluated_ckpts.sort()
            start = time.time()
            for ckpt_idx in range(num_checkpoints):
                checkpoint_to_restore = \
                    self._saver.last_checkpoints[ckpt_idx]
                ckpt_id = evaluator_utils.strip_checkpoint_id(
                    checkpoint_to_restore)

                # Skip step-0 checkpoints and any already-evaluated ones
                if ckpt_id == 0 or ckpt_id in evaluated_ckpts:
                    continue
                no_newckpts = False
                print('evaluated ckpts: ', evaluated_ckpts)
                print('processing ckpt id: ', ckpt_id)
                self.run_checkpoint_once(checkpoint_to_restore)
                evaluated_ckpts.append(ckpt_id)
            time_to_next_eval = start + self.eval_wait_interval - time.time()
            if no_newckpts:
                tf.logging.info(
                    'No new checkpoints found in %s. '
                    'Will try again in %d seconds.', self.checkpoint_dir,
                    self.eval_wait_interval)
            if time_to_next_eval > 0:
                time.sleep(time_to_next_eval)
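
evaluator_utils.strip_checkpoint_id is not shown in these examples; from the
way it is used, it maps a checkpoint path to its integer global step. A
hypothetical sketch, assuming the conventional '<name>.ckpt-<step>'
TensorFlow checkpoint naming:

    import os

    def strip_checkpoint_id(checkpoint_path):
        # e.g. '/tmp/ckpts/model.ckpt-12000' -> 12000 (assumed naming)
        return int(os.path.basename(checkpoint_path).rsplit('-', 1)[-1])

    print(strip_checkpoint_id('/tmp/ckpts/model.ckpt-12000'))  # 12000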