Code example #1
    def testReturnsSingleCheckpointIfOneShardedCheckpoint(self):
        checkpoint_dir = os.path.join(self.get_temp_dir(),
                                      "one_checkpoint_found_sharded")
        if not gfile.Exists(checkpoint_dir):
            gfile.MakeDirs(checkpoint_dir)

        global_step = variables.Variable(0, name="v0")

        # This will result in 3 different checkpoint shard files.
        with ops.device("/cpu:0"):
            variables.Variable(10, name="v1")
        with ops.device("/cpu:1"):
            variables.Variable(20, name="v2")

        saver = saver_lib.Saver(sharded=True)

        with session_lib.Session(target="",
                                 config=config_pb2.ConfigProto(
                                     device_count={"CPU": 2})) as session:

            session.run(variables.global_variables_initializer())
            save_path = os.path.join(checkpoint_dir, "model.ckpt")
            saver.save(session, save_path, global_step=global_step)

        num_found = 0
        for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir,
                                                       timeout=0):
            num_found += 1
        self.assertEqual(num_found, 1)
Code example #2
File: sidecar_evaluator.py  Project: chrisvon62/AiBot
    def start(self):
        """Starts the evaluation loop."""
        optimizer_checkpoint = tracking_util.Checkpoint(iter=self._iterations)
        checkpoint = tracking_util.Checkpoint(model=self.model,
                                              optimizer=optimizer_checkpoint)

        for latest_checkpoint in checkpoint_utils.checkpoints_iterator(
                self.checkpoint_dir):
            try:
                # `expect_partial` because the checkpoint can have other `Trackable`s
                # such as `optimizer`.
                checkpoint.restore(latest_checkpoint).expect_partial()
            except (errors_impl.OpError, ) as e:
                # A couple of errors can happen here with the coordinator racing to
                # write the checkpoint:
                # 1) OpError: open failed for <file path>: No such file or directory
                # 2) NotFoundError (subclass of OpError): Unsuccessful
                # TensorSliceReader constructor.
                # TODO(rchao): Remove this except block once b/150954027 is resolved.
                logging.info(
                    'SidecarEvaluator has an error loading '
                    'checkpoint: %s. Retrying. Error: %s: %s',
                    latest_checkpoint, e.__class__.__name__, e)
                continue

            if self._iterations.numpy() == _ITERATIONS_UNINITIALIZED:
                raise RuntimeError(
                    '`iterations` cannot be loaded from the '
                    'checkpoint file. Please ensure `iterations` is '
                    'tracked in the `checkpoint` saved by the coordinator.')

            logging.info(
                'Evaluation starts: Model weights loaded from latest '
                'checkpoint file: %s.', latest_checkpoint)

            # TODO(rchao): Support arbitrary callback for extensibility.
            self.model.evaluate(self.data, steps=self.steps)

            logging.info('End of evaluation. Accuracy: %r', [
                metric.result().numpy()
                for metric in self.model.compiled_metrics.metrics
            ])

            if self._summary_writer:
                with summary_ops_v2.always_record_summaries(
                ), self._summary_writer.as_default():
                    for metric in self.model.compiled_metrics.metrics:
                        summary_ops_v2.scalar(
                            metric.name,
                            metric.result(),
                            step=self._iterations.read_value())

            # TODO(rchao): Make the max evaluation robust in case users save the
            # checkpoints with epoch format {epoch:03d}.
            if (self.max_evaluations and latest_checkpoint.endswith(
                    '-{}'.format(self.max_evaluations))):
                # Exit the loop because we have evaluated the final checkpoint file.
                logging.info(
                    'Last checkpoint evaluated. SidecarEvaluator stops.')
                return
Code example #3
  def testReturnsEmptyIfNoCheckpointsFound(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), "no_checkpoints_found")

    num_found = 0
    for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 0)
Code example #4
  def testTimeoutFn(self):
    timeout_fn_calls = [0]
    def timeout_fn():
      timeout_fn_calls[0] += 1
      return timeout_fn_calls[0] > 3

    results = list(
        checkpoint_utils.checkpoints_iterator(
            "/non-existent-dir", timeout=0.1, timeout_fn=timeout_fn))
    self.assertEqual([], results)
    self.assertEqual(4, timeout_fn_calls[0])
Code example #5
  def testWorksWithFSPath(self):
    checkpoint_dir = pathlib.Path(self.get_temp_dir()) / "one_checkpoint_found"
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    save_path = checkpoint_dir / "model.ckpt"

    a = resource_variable_ops.ResourceVariable(5)
    self.evaluate(a.initializer)
    checkpoint = trackable_utils.Checkpoint(a=a)
    checkpoint.save(file_prefix=save_path)

    num_found = 0
    for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
Code example #6
  def testReturnsSingleCheckpointIfOneCheckpointFound(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), "one_checkpoint_found")
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    save_path = os.path.join(checkpoint_dir, "model.ckpt")

    a = resource_variable_ops.ResourceVariable(5)
    self.evaluate(a.initializer)
    checkpoint = trackable_utils.Checkpoint(a=a)
    checkpoint.save(file_prefix=save_path)

    num_found = 0
    for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
Code example #7
    def start(self):
        """Starts the evaluation loop."""
        optimizer_checkpoint = tracking_util.Checkpoint(iter=self._iterations)
        checkpoint = tracking_util.Checkpoint(model=self.model,
                                              optimizer=optimizer_checkpoint)

        for latest_checkpoint in checkpoint_utils.checkpoints_iterator(
                self.checkpoint_dir):
            try:
                # `expect_partial` because the checkpoint can have other `Trackable`s
                # such as `optimizer`.
                checkpoint.restore(latest_checkpoint).expect_partial()
                checkpoint_attributes = list_checkpoint_attributes(
                    latest_checkpoint)
                # The checkpoint should contain the model and optimizer for
                # SidecarEvaluator to work. However, the weights saved by the
                # ModelCheckpoint callback do not include the model as an attribute.
                # To keep SidecarEvaluator working in that case, use model.load_weights
                # to load the model's weights, while self._iterations is still restored
                # from the checkpoint variable.
                if 'model' not in checkpoint_attributes:
                    self.model.load_weights(latest_checkpoint)
                # The model checkpoint might not include the optimizer in some cases,
                # e.g. when using a custom training loop. Directly assign the iterations
                # property so it can be used in callbacks.
                if self.model.optimizer:
                    self.model.optimizer.iterations.assign(self._iterations)
            except (errors_impl.OpError, ) as e:
                # A couple of errors can happen here with the coordinator racing to
                # write the checkpoint:
                # 1) OpError: open failed for <file path>: No such file or directory
                # 2) NotFoundError (subclass of OpError): Unsuccessful
                # TensorSliceReader constructor.
                # TODO(rchao): Remove this except block once b/150954027 is resolved.
                logging.info(
                    'SidecarEvaluator has an error loading '
                    'checkpoint: %s. Retrying. Error: %s: %s',
                    latest_checkpoint, e.__class__.__name__, e)
                continue

            if self._iterations.numpy() == _ITERATIONS_UNINITIALIZED:
                raise RuntimeError(
                    '`iterations` cannot be loaded from the '
                    'checkpoint file. Please ensure `iterations` is '
                    'tracked in the `checkpoint` saved by the coordinator.')

            logging.info(
                'Evaluation starts: Model weights loaded from latest '
                'checkpoint file: %s.', latest_checkpoint)

            self.model.evaluate(self.data,
                                steps=self.steps,
                                callbacks=self.callbacks,
                                verbose=2)

            return_metrics = {}
            for metric in self.model.metrics:
                result = metric.result()
                if isinstance(result, dict):
                    return_metrics.update(result)
                else:
                    return_metrics[metric.name] = result

            logging.info(
                'End of evaluation. Metrics: %s', ' '.join([
                    '{}={}'.format(name, value.numpy())
                    for name, value in return_metrics.items()
                ]))

            # TODO(rchao): Make the max evaluation robust in case users save the
            # checkpoints with epoch format {epoch:03d}.
            if (self.max_evaluations and latest_checkpoint.endswith(
                    '-{}'.format(self.max_evaluations))):
                # Exit the loop because we have evaluated the final checkpoint file.
                logging.info(
                    'Last checkpoint evaluated. SidecarEvaluator stops.')
                return
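
Both this example and example #8 below call list_checkpoint_attributes, a helper defined elsewhere in sidecar_evaluator.py that is not reproduced in these snippets. As a rough, minimal sketch (not necessarily the project's exact implementation), such a helper can be built on the public checkpoint-reader API, which exposes the checkpoint keys whose first path element is the attribute name:

import tensorflow as tf


def list_checkpoint_attributes(ckpt_dir_or_file):
    """Minimal sketch of the helper used by the SidecarEvaluator snippets.

    Checkpoint keys are slash-separated paths such as
    'optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE'; the first path element
    ('optimizer', 'model', 'layer_with_weights-0', ...) is the attribute
    name that the evaluator checks for.
    """
    reader = tf.train.load_checkpoint(ckpt_dir_or_file)
    variable_map = reader.get_variable_to_shape_map()
    return {name.split('/')[0] for name in variable_map.keys()}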
Code example #8
  def start(self):
    """Starts the evaluation loop."""
    optimizer_checkpoint = tracking_util.Checkpoint(iter=self._iterations)
    checkpoint = tracking_util.Checkpoint(
        model=self.model, optimizer=optimizer_checkpoint)

    for latest_checkpoint in checkpoint_utils.checkpoints_iterator(
        self.checkpoint_dir):
      try:
        # `expect_partial` because the checkpoint can have other `Trackable`s
        # such as `optimizer`.
        checkpoint.restore(latest_checkpoint).expect_partial()
        checkpoint_attributes = list_checkpoint_attributes(latest_checkpoint)
        # The checkpoint should contain the model and optimizer for SidecarEvaluator
        # to work. However, the weights saved by the ModelCheckpoint callback do not
        # include the model as an attribute. To keep SidecarEvaluator working in that
        # case, if the model attribute is not found but a layer_with_weights attribute
        # is found, use model.load_weights to load the model's weights, while
        # self._iterations is still restored from the checkpoint variable.
        if 'model' not in checkpoint_attributes:
          for attribute in checkpoint_attributes:
            # check whether the checkpoint has the required attributes for
            # model.load_weights to work.
            if re.match(r'^layer_with_weights-\d+', attribute) is not None:
              self.model.load_weights(latest_checkpoint)
              break
      except (errors_impl.OpError,) as e:
        # A couple of errors can happen here with the coordinator racing to
        # write the checkpoint:
        # 1) OpError: open failed for <file path>: No such file or directory
        # 2) NotFoundError (subclass of OpError): Unsuccessful
        # TensorSliceReader constructor.
        # TODO(rchao): Remove this except block once b/150954027 is resolved.
        logging.info(
            'SidecarEvaluator has an error loading '
            'checkpoint: %s. Retrying. Error: %s: %s', latest_checkpoint,
            e.__class__.__name__, e)
        continue

      if self._iterations.numpy() == _ITERATIONS_UNINITIALIZED:
        raise RuntimeError(
            '`iterations` cannot be loaded from the '
            'checkpoint file. Please ensure `iterations` is '
            'tracked in the `checkpoint` saved by the coordinator.')

      logging.info(
          'Evaluation starts: Model weights loaded from latest '
          'checkpoint file: %s.', latest_checkpoint)

      # TODO(rchao): Support arbitrary callback for extensibility.
      self.model.evaluate(self.data, steps=self.steps)

      logging.info(
          'End of evaluation. Metrics: %s', ' '.join([
              '{}={}'.format(metric.name,
                             metric.result().numpy())
              for metric in self.model.metrics
          ]))

      if self._summary_writer:
        with summary_ops_v2.record_if(True), self._summary_writer.as_default():
          for metric in self.model.metrics:
            summary_ops_v2.scalar(
                metric.name,
                metric.result(),
                step=self._iterations.read_value())

      # TODO(rchao): Make the max evaluation robust in case users save the
      # checkpoints with epoch format {epoch:03d}.
      if (self.max_evaluations and
          latest_checkpoint.endswith('-{}'.format(self.max_evaluations))):
        # Exit the loop because we have evaluated the final checkpoint file.
        logging.info('Last checkpoint evaluated. SidecarEvaluator stops.')
        return
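
For orientation, the snippets above use TensorFlow-internal module paths (checkpoint_utils, tracking_util, saver_lib, and so on). A minimal, self-contained sketch of the same wait-and-iterate pattern written against the public tf.train API might look like the following; the scratch directory and the single saved variable are purely illustrative:

import os
import tempfile

import tensorflow as tf

# Illustrative scratch directory for a demo checkpoint.
checkpoint_dir = tempfile.mkdtemp()

# Save one checkpoint so the iterator has something to yield.
step = tf.Variable(1)
tf.train.Checkpoint(step=step).save(os.path.join(checkpoint_dir, "model.ckpt"))

# With timeout=0 the iterator stops waiting as soon as no new checkpoint
# appears, instead of blocking until the next one is written.
for checkpoint_path in tf.train.checkpoints_iterator(checkpoint_dir, timeout=0):
    print("Found checkpoint:", checkpoint_path)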