Example 1
    def setUp(self):
        self.checkpoint_dir = os.path.join(
            ray._private.utils.get_user_temp_dir(), "tune", "MyTrainable123"
        )
        self.checkpoint_dir = TrainableUtil.make_checkpoint_dir(
            self.checkpoint_dir, "0"
        )
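
setUp builds a per-test checkpoint directory under the user temp dir and then replaces it with the checkpoint subdirectory returned by make_checkpoint_dir. The excerpt does not show the matching cleanup; a minimal sketch of one, assuming `import shutil` at module level (the tearDown below is illustrative, not taken from the original test):

    def tearDown(self):
        # Remove the checkpoint directory created in setUp so repeated
        # test runs start from a clean temp location.
        shutil.rmtree(self.checkpoint_dir, ignore_errors=True)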
Example 2
    def save(self, checkpoint_dir: Optional[str] = None) -> str:
        """Saves the current model state to a checkpoint.

        Subclasses should override ``save_checkpoint()`` instead to save state.
        This method dumps additional metadata alongside the saved path.

        If a remote checkpoint dir is given, this will also sync up to remote
        storage.

        Args:
            checkpoint_dir: Optional dir to place the checkpoint.

        Returns:
            str: path that points to xxx.pkl file.

        Note the return path should match up with what is expected of
        `restore()`.
        """
        checkpoint_dir = TrainableUtil.make_checkpoint_dir(
            checkpoint_dir or self.logdir, index=self.iteration)
        checkpoint_dict_or_path = self.save_checkpoint(checkpoint_dir)
        trainable_state = self.get_state()
        checkpoint_path = TrainableUtil.process_checkpoint(
            checkpoint_dict_or_path,
            parent_dir=checkpoint_dir,
            trainable_state=trainable_state,
        )

        # Maybe sync to cloud
        self._maybe_save_to_cloud(checkpoint_dir)

        return checkpoint_path
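
The docstring's contract is that whatever path save() returns is exactly what restore() consumes. A minimal round-trip sketch of that contract, assuming the directory-based Trainable checkpoint API from the same Ray era as these snippets (the class name, file name, and state below are illustrative, not from the original source):

import os
import pickle

from ray.tune import Trainable


class MyTrainable(Trainable):
    def setup(self, config):
        self.value = 0

    def step(self):
        self.value += 1
        return {"value": self.value}

    def save_checkpoint(self, checkpoint_dir):
        # Called by save(); write state into the directory it created.
        with open(os.path.join(checkpoint_dir, "state.pkl"), "wb") as f:
            pickle.dump(self.value, f)
        return checkpoint_dir

    def load_checkpoint(self, checkpoint):
        # Called by restore(); receives what save_checkpoint() returned.
        with open(os.path.join(checkpoint, "state.pkl"), "rb") as f:
            self.value = pickle.load(f)


trainable = MyTrainable(config={})
trainable.train()
path = trainable.save()   # wraps save_checkpoint() as shown in the example
trainable.restore(path)   # expects exactly the path that save() returned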
Example 3
    def _create_checkpoint_dir(self,
                               checkpoint_dir: Optional[str] = None
                               ) -> Optional[str]:
        # Create checkpoint_xxxxx directory and drop checkpoint marker
        checkpoint_dir = TrainableUtil.make_checkpoint_dir(
            checkpoint_dir or self.logdir, index=self.iteration)
        return checkpoint_dir
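
For reference, the same helper can be exercised directly. A minimal sketch, with two caveats: TrainableUtil is an internal utility whose import path has moved across Ray releases (the path below is an assumption), and the exact name of the marker file it drops is version dependent:

import os
import tempfile

# Assumed import path; TrainableUtil has lived in different modules across
# Ray versions (it is internal API, not a stable public import).
from ray.tune.utils.trainable import TrainableUtil

base = tempfile.mkdtemp()
checkpoint_dir = TrainableUtil.make_checkpoint_dir(base, index=5)

# A checkpoint_<index> subdirectory now exists under `base`, containing the
# checkpoint marker file referenced by the comment above.
print(checkpoint_dir)
print(os.listdir(checkpoint_dir))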
Example 4
        def write_checkpoint(trial: Trial, index: int):
            checkpoint_dir = TrainableUtil.make_checkpoint_dir(trial.logdir,
                                                               index=index)
            result = {"training_iteration": index}
            with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f:
                json.dump(result, f)

            tune_cp = _TrackedCheckpoint(
                dir_or_data=checkpoint_dir,
                storage_mode=CheckpointStorage.PERSISTENT,
                metrics=result,
            )
            trial.saving_to = tune_cp

            return checkpoint_dir
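
write_checkpoint is a test helper: it materializes a checkpoint directory for a trial, dumps the reported metrics to cp.json, and attaches the resulting persistent _TrackedCheckpoint to the trial. Reading such a checkpoint back only needs the file it wrote; a trivial sketch (the reader function is hypothetical, not part of the original test):

import json
import os

def read_checkpoint_metrics(checkpoint_dir: str) -> dict:
    # Load the metrics that write_checkpoint() dumped to cp.json.
    with open(os.path.join(checkpoint_dir, "cp.json")) as f:
        return json.load(f)

# e.g. read_checkpoint_metrics(write_checkpoint(trial, 3))
# -> {"training_iteration": 3}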