def setUp(self):
    self.checkpoint_dir = os.path.join(
        ray._private.utils.get_user_temp_dir(), "tune", "MyTrainable123"
    )
    self.checkpoint_dir = TrainableUtil.make_checkpoint_dir(
        self.checkpoint_dir, "0"
    )
def save(self, checkpoint_dir: Optional[str] = None) -> str:
    """Saves the current model state to a checkpoint.

    Subclasses should override ``save_checkpoint()`` instead to save state.
    This method dumps additional metadata alongside the saved path.

    If a remote checkpoint dir is given, this will also sync up to remote
    storage.

    Args:
        checkpoint_dir: Optional dir to place the checkpoint.

    Returns:
        str: path that points to xxx.pkl file.

    Note the return path should match up with what is expected of
    ``restore()``.
    """
    checkpoint_dir = TrainableUtil.make_checkpoint_dir(
        checkpoint_dir or self.logdir, index=self.iteration
    )
    checkpoint_dict_or_path = self.save_checkpoint(checkpoint_dir)
    trainable_state = self.get_state()
    checkpoint_path = TrainableUtil.process_checkpoint(
        checkpoint_dict_or_path,
        parent_dir=checkpoint_dir,
        trainable_state=trainable_state,
    )

    # Maybe sync to cloud
    self._maybe_save_to_cloud(checkpoint_dir)

    return checkpoint_path
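# A minimal sketch of how ``save()`` is meant to be used, per its docstring:
# subclasses override ``save_checkpoint()`` and let ``save()`` handle
# directory creation, metadata, and cloud syncing. The class name
# ``MyPickleTrainable`` and the "model.pkl" filename are illustrative
# assumptions, not part of the snippets here.
import os
import pickle

from ray.tune import Trainable


class MyPickleTrainable(Trainable):
    def step(self):
        # One training iteration; Tune records the returned metrics.
        return {"mean_loss": 0.0}

    def save_checkpoint(self, checkpoint_dir):
        # ``save()`` passes in the checkpoint_xxxxx directory it created;
        # write state into it and return the directory (a dict also works).
        with open(os.path.join(checkpoint_dir, "model.pkl"), "wb") as f:
            pickle.dump({"iteration": self.iteration}, f)
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_dir):
        # Mirror of ``save_checkpoint()`` so ``restore()`` can round-trip.
        with open(os.path.join(checkpoint_dir, "model.pkl"), "rb") as f:
            self.state = pickle.load(f)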
def _create_checkpoint_dir(
    self, checkpoint_dir: Optional[str] = None
) -> Optional[str]:
    # Create checkpoint_xxxxx directory and drop checkpoint marker
    checkpoint_dir = TrainableUtil.make_checkpoint_dir(
        checkpoint_dir or self.logdir, index=self.iteration
    )
    return checkpoint_dir
def write_checkpoint(trial: Trial, index: int):
    # Create the checkpoint_xxxxx directory and write a minimal result file.
    checkpoint_dir = TrainableUtil.make_checkpoint_dir(trial.logdir, index=index)
    result = {"training_iteration": index}
    with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f:
        json.dump(result, f)

    # Track the directory as a persistent checkpoint and mark the trial as
    # currently saving to it.
    tune_cp = _TrackedCheckpoint(
        dir_or_data=checkpoint_dir,
        storage_mode=CheckpointStorage.PERSISTENT,
        metrics=result,
    )
    trial.saving_to = tune_cp

    return checkpoint_dir