def testReturnsSingleCheckpointIfOneShardedCheckpoint(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(),
                                "one_checkpoint_found_sharded")
  if not gfile.Exists(checkpoint_dir):
    gfile.MakeDirs(checkpoint_dir)
  global_step = variables.Variable(0, name="v0")

  # This will result in 3 different checkpoint shard files.
  with ops.device("/cpu:0"):
    variables.Variable(10, name="v1")
  with ops.device("/cpu:1"):
    variables.Variable(20, name="v2")

  saver = saver_lib.Saver(sharded=True)

  with session_lib.Session(
      target="",
      config=config_pb2.ConfigProto(device_count={"CPU": 2})) as session:
    session.run(variables.global_variables_initializer())
    save_path = os.path.join(checkpoint_dir, "model.ckpt")
    saver.save(session, save_path, global_step=global_step)

  num_found = 0
  for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir, timeout=0):
    num_found += 1
  self.assertEqual(num_found, 1)
def start(self):
  """Starts the evaluation loop."""
  optimizer_checkpoint = tracking_util.Checkpoint(iter=self._iterations)
  checkpoint = tracking_util.Checkpoint(
      model=self.model, optimizer=optimizer_checkpoint)

  for latest_checkpoint in checkpoint_utils.checkpoints_iterator(
      self.checkpoint_dir):
    try:
      # `expect_partial` because the checkpoint can have other `Trackable`s
      # such as `optimizer`.
      checkpoint.restore(latest_checkpoint).expect_partial()
    except (errors_impl.OpError,) as e:
      # A couple of errors can happen here with the coordinator racing to
      # write the checkpoint:
      # 1) OpError: open failed for <file path>: No such file or directory
      # 2) NotFoundError (subclass of OpError): Unsuccessful
      #    TensorSliceReader constructor.
      # TODO(rchao): Remove this except block once b/150954027 is resolved.
      logging.info(
          'SidecarEvaluator has an error loading '
          'checkpoint: %s. Retrying. Error: %s: %s', latest_checkpoint,
          e.__class__.__name__, e)
      continue

    if self._iterations.numpy() == _ITERATIONS_UNINITIALIZED:
      raise RuntimeError(
          '`iterations` cannot be loaded from the '
          'checkpoint file. Please ensure `iterations` is '
          'tracked in the `checkpoint` saved by the coordinator.')

    logging.info(
        'Evaluation starts: Model weights loaded from latest '
        'checkpoint file: %s.', latest_checkpoint)

    # TODO(rchao): Support arbitrary callback for extensibility.
    self.model.evaluate(self.data, steps=self.steps)

    logging.info('End of evaluation. Accuracy: %r', [
        metric.result().numpy()
        for metric in self.model.compiled_metrics.metrics
    ])

    if self._summary_writer:
      with summary_ops_v2.always_record_summaries(), \
          self._summary_writer.as_default():
        for metric in self.model.compiled_metrics.metrics:
          summary_ops_v2.scalar(
              metric.name, metric.result(), step=self._iterations.read_value())

    # TODO(rchao): Make the max evaluation robust in case users save the
    # checkpoints with epoch format {epoch:03d}.
    if (self.max_evaluations and
        latest_checkpoint.endswith('-{}'.format(self.max_evaluations))):
      # Exit the loop because we have evaluated the final checkpoint file.
      logging.info('Last checkpoint evaluated. SidecarEvaluator stops.')
      return
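# A hedged sketch (not part of this code) of the coordinator side that the
# loop above expects: the saved checkpoint must track the optimizer, whose
# `iterations` counter is typically serialized under `optimizer/iter` and is
# what the nested `Checkpoint(iter=self._iterations)` restores; otherwise the
# RuntimeError above fires. `save_checkpoint_for_sidecar`, `model`, and
# `checkpoint_dir` are illustrative names, not taken from this code.
def save_checkpoint_for_sidecar(model, checkpoint_dir):
  checkpoint = tracking_util.Checkpoint(model=model, optimizer=model.optimizer)
  # Each call produces a new numbered prefix (e.g. 'ckpt-1', 'ckpt-2') that
  # `checkpoints_iterator` later yields to the evaluator.
  return checkpoint.save(os.path.join(checkpoint_dir, 'ckpt'))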
def testReturnsEmptyIfNoCheckpointsFound(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(), "no_checkpoints_found")

  num_found = 0
  for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir, timeout=0):
    num_found += 1
  self.assertEqual(num_found, 0)
def testTimeoutFn(self):
  timeout_fn_calls = [0]

  def timeout_fn():
    timeout_fn_calls[0] += 1
    return timeout_fn_calls[0] > 3

  results = list(
      checkpoint_utils.checkpoints_iterator(
          "/non-existent-dir", timeout=0.1, timeout_fn=timeout_fn))
  self.assertEqual([], results)
  self.assertEqual(4, timeout_fn_calls[0])
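# A usage sketch of the `timeout`/`timeout_fn` contract that `testTimeoutFn`
# above exercises: `checkpoints_iterator` waits up to `timeout` seconds for a
# new checkpoint, then calls `timeout_fn`; returning True ends the iteration,
# returning False resumes waiting. `is_training_finished` is a hypothetical
# callable standing in for a real job's completion signal.
def watch_checkpoints(checkpoint_dir, is_training_finished):
  for checkpoint_path in checkpoint_utils.checkpoints_iterator(
      checkpoint_dir, timeout=30, timeout_fn=is_training_finished):
    logging.info('New checkpoint to process: %s', checkpoint_path)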
def testWorksWithFSPath(self):
  checkpoint_dir = pathlib.Path(self.get_temp_dir()) / "one_checkpoint_found"
  if not gfile.Exists(checkpoint_dir):
    gfile.MakeDirs(checkpoint_dir)
  save_path = checkpoint_dir / "model.ckpt"

  a = resource_variable_ops.ResourceVariable(5)
  self.evaluate(a.initializer)
  checkpoint = trackable_utils.Checkpoint(a=a)
  checkpoint.save(file_prefix=save_path)

  num_found = 0
  for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir, timeout=0):
    num_found += 1
  self.assertEqual(num_found, 1)
def testReturnsSingleCheckpointIfOneCheckpointFound(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(), "one_checkpoint_found")
  if not gfile.Exists(checkpoint_dir):
    gfile.MakeDirs(checkpoint_dir)
  save_path = os.path.join(checkpoint_dir, "model.ckpt")

  a = resource_variable_ops.ResourceVariable(5)
  self.evaluate(a.initializer)
  checkpoint = trackable_utils.Checkpoint(a=a)
  checkpoint.save(file_prefix=save_path)

  num_found = 0
  for _ in checkpoint_utils.checkpoints_iterator(checkpoint_dir, timeout=0):
    num_found += 1
  self.assertEqual(num_found, 1)
def start(self):
  """Starts the evaluation loop."""
  optimizer_checkpoint = tracking_util.Checkpoint(iter=self._iterations)
  checkpoint = tracking_util.Checkpoint(
      model=self.model, optimizer=optimizer_checkpoint)

  for latest_checkpoint in checkpoint_utils.checkpoints_iterator(
      self.checkpoint_dir):
    try:
      # `expect_partial` because the checkpoint can have other `Trackable`s
      # such as `optimizer`.
      checkpoint.restore(latest_checkpoint).expect_partial()
      checkpoint_attributes = list_checkpoint_attributes(latest_checkpoint)
      # The checkpoint should contain the model and optimizer for
      # SidecarEvaluator to work. But the model weights saved by the
      # ModelCheckpoint callback do not contain the model as an attribute.
      # To make SidecarEvaluator work in this case as well, use
      # model.load_weights to load the model's weights, while
      # self._iterations is still restored from the checkpoint variable.
      if 'model' not in checkpoint_attributes:
        self.model.load_weights(latest_checkpoint)
      # The model checkpoint might not include the optimizer in some cases,
      # e.g. when a custom training loop is used. Directly assign the
      # `iterations` property to be used in callbacks.
      if self.model.optimizer:
        self.model.optimizer.iterations.assign(self._iterations)
    except (errors_impl.OpError,) as e:
      # A couple of errors can happen here with the coordinator racing to
      # write the checkpoint:
      # 1) OpError: open failed for <file path>: No such file or directory
      # 2) NotFoundError (subclass of OpError): Unsuccessful
      #    TensorSliceReader constructor.
      # TODO(rchao): Remove this except block once b/150954027 is resolved.
      logging.info(
          'SidecarEvaluator has an error loading '
          'checkpoint: %s. Retrying. Error: %s: %s', latest_checkpoint,
          e.__class__.__name__, e)
      continue

    if self._iterations.numpy() == _ITERATIONS_UNINITIALIZED:
      raise RuntimeError(
          '`iterations` cannot be loaded from the '
          'checkpoint file. Please ensure `iterations` is '
          'tracked in the `checkpoint` saved by the coordinator.')

    logging.info(
        'Evaluation starts: Model weights loaded from latest '
        'checkpoint file: %s.', latest_checkpoint)

    self.model.evaluate(
        self.data, steps=self.steps, callbacks=self.callbacks, verbose=2)

    return_metrics = {}
    for metric in self.model.metrics:
      result = metric.result()
      if isinstance(result, dict):
        return_metrics.update(result)
      else:
        return_metrics[metric.name] = result

    logging.info(
        'End of evaluation. Metrics: %s', ' '.join([
            '{}={}'.format(name, value.numpy())
            for name, value in return_metrics.items()
        ]))

    # TODO(rchao): Make the max evaluation robust in case users save the
    # checkpoints with epoch format {epoch:03d}.
    if (self.max_evaluations and
        latest_checkpoint.endswith('-{}'.format(self.max_evaluations))):
      # Exit the loop because we have evaluated the final checkpoint file.
      logging.info('Last checkpoint evaluated. SidecarEvaluator stops.')
      return
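# A sketch, under stated assumptions, of the `list_checkpoint_attributes`
# helper that the `start` variants above and below call but do not define
# here. Keys in a trackable checkpoint look like
# 'model/layer_with_weights-0/kernel/.ATTRIBUTES/VARIABLE_VALUE' (or
# 'layer_with_weights-0/...' for weights saved via Model.save_weights), so
# the first '/'-separated component is the top-level attribute name that the
# `'model' not in checkpoint_attributes` check inspects. The real helper may
# differ.
def list_checkpoint_attributes(ckpt_dir_or_file):
  reader = checkpoint_utils.load_checkpoint(ckpt_dir_or_file)
  variable_map = reader.get_variable_to_shape_map()
  return {name.split('/')[0] for name in variable_map.keys()}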
def start(self):
  """Starts the evaluation loop."""
  optimizer_checkpoint = tracking_util.Checkpoint(iter=self._iterations)
  checkpoint = tracking_util.Checkpoint(
      model=self.model, optimizer=optimizer_checkpoint)

  for latest_checkpoint in checkpoint_utils.checkpoints_iterator(
      self.checkpoint_dir):
    try:
      # `expect_partial` because the checkpoint can have other `Trackable`s
      # such as `optimizer`.
      checkpoint.restore(latest_checkpoint).expect_partial()
      checkpoint_attributes = list_checkpoint_attributes(latest_checkpoint)
      # The checkpoint should contain the model and optimizer for
      # SidecarEvaluator to work. But the model weights saved by the
      # ModelCheckpoint callback do not contain the model as an attribute.
      # To make SidecarEvaluator work in this case as well, if the model
      # attribute is not found but a layer_with_weights attribute is, use
      # model.load_weights to load the model's weights, while
      # self._iterations is still restored from the checkpoint variable.
      if 'model' not in checkpoint_attributes:
        for attribute in checkpoint_attributes:
          # Check whether the checkpoint has the required attributes for
          # model.load_weights to work.
          if re.match(r'^layer_with_weights-\d+', attribute) is not None:
            self.model.load_weights(latest_checkpoint)
            break
    except (errors_impl.OpError,) as e:
      # A couple of errors can happen here with the coordinator racing to
      # write the checkpoint:
      # 1) OpError: open failed for <file path>: No such file or directory
      # 2) NotFoundError (subclass of OpError): Unsuccessful
      #    TensorSliceReader constructor.
      # TODO(rchao): Remove this except block once b/150954027 is resolved.
      logging.info(
          'SidecarEvaluator has an error loading '
          'checkpoint: %s. Retrying. Error: %s: %s', latest_checkpoint,
          e.__class__.__name__, e)
      continue

    if self._iterations.numpy() == _ITERATIONS_UNINITIALIZED:
      raise RuntimeError(
          '`iterations` cannot be loaded from the '
          'checkpoint file. Please ensure `iterations` is '
          'tracked in the `checkpoint` saved by the coordinator.')

    logging.info(
        'Evaluation starts: Model weights loaded from latest '
        'checkpoint file: %s.', latest_checkpoint)

    # TODO(rchao): Support arbitrary callback for extensibility.
    self.model.evaluate(self.data, steps=self.steps)

    logging.info(
        'End of evaluation. Metrics: %s', ' '.join([
            '{}={}'.format(metric.name, metric.result().numpy())
            for metric in self.model.metrics
        ]))

    if self._summary_writer:
      with summary_ops_v2.record_if(True), self._summary_writer.as_default():
        for metric in self.model.metrics:
          summary_ops_v2.scalar(
              metric.name, metric.result(), step=self._iterations.read_value())

    # TODO(rchao): Make the max evaluation robust in case users save the
    # checkpoints with epoch format {epoch:03d}.
    if (self.max_evaluations and
        latest_checkpoint.endswith('-{}'.format(self.max_evaluations))):
      # Exit the loop because we have evaluated the final checkpoint file.
      logging.info('Last checkpoint evaluated. SidecarEvaluator stops.')
      return
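# A hedged usage sketch of the evaluator whose `start` loop appears above.
# The constructor arguments mirror the attributes `start` reads (`model`,
# `data`, `steps`, `checkpoint_dir`, `max_evaluations`); the exact signature
# is an assumption, as are `build_eval_model` and `eval_dataset`.
evaluator = SidecarEvaluator(
    model=build_eval_model(),        # a compiled Keras model to evaluate
    data=eval_dataset,               # e.g. a tf.data.Dataset of eval batches
    checkpoint_dir='/tmp/ckpt_dir',  # directory the coordinator writes to
    steps=100,                       # batches per evaluation run
    max_evaluations=5)               # stop after the checkpoint ending in '-5'
evaluator.start()  # blocks, evaluating each new checkpoint as it appears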