Example 1
    def testConvertTempToPermanent(self):
        checkpoint_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
        new_checkpoint_dir = FuncCheckpointUtil.create_perm_checkpoint(
            checkpoint_dir, self.logdir, step=4)
        assert new_checkpoint_dir == TrainableUtil.find_checkpoint_dir(
            new_checkpoint_dir)
        assert os.path.exists(new_checkpoint_dir)
        assert not FuncCheckpointUtil.is_temp_checkpoint_dir(
            new_checkpoint_dir)

        tmp_checkpoint_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(
            self.logdir)
        assert tmp_checkpoint_dir != new_checkpoint_dir
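
The test methods in Examples 1, 4, and 5 rely on a self.logdir fixture that is not shown. Below is a minimal sketch of such a fixture; the class name, the setUp/tearDown bodies, and the Ray 1.x import paths are assumptions, not part of the original snippets:

import os
import shutil
import tempfile
import unittest

# Import paths are an assumption for Ray 1.x and may differ in other versions.
from ray.tune.function_runner import FuncCheckpointUtil
from ray.tune.utils.trainable import TrainableUtil


class FuncCheckpointUtilTest(unittest.TestCase):
    def setUp(self):
        # Temporary log directory used as self.logdir by the test methods above
        self.logdir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.logdir, ignore_errors=True)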
Example 2
    def pause(self, trial_runner):
        """ Pause the AdaptDLTrial with a checkpoint. We try to remove the PG
        attached to this trial"""
        assert self.runner is not None
        checkpoint_obj = ray.get(
            self.runner.save_all_states.remote(self.runner.get_state.remote()))
        # Serialize to disk
        temp_checkpoint_dir = (FuncCheckpointUtil.mk_temp_checkpoint_dir(
            self.logdir))
        checkpoint_path = TrainableUtil.create_from_pickle(
            checkpoint_obj, temp_checkpoint_dir)

        # Trial will be restored from the checkpoint_path when it's resumed
        self.restore_path = checkpoint_path

        # Clear the allocation. This is a hack to clear the PG associated with
        # the trial. We assign a temporary PG which will get replaced with a
        # real PG once we resume the trial. This is needed because Tune likes
        # to keep the PGs around even for PAUSED trials.
        self.placement_group_factory = PlacementGroupFactory([{"CPU": 0.001}])
        # This forces Tune to garbage-collect unneeded PGs, which can then be
        # reused.
        trial_runner.trial_executor._pg_manager.reconcile_placement_groups(
            [self])
        logger.debug(f"PAUSING {self} w/ checkpoint at {checkpoint_path}")
Example 3
    def create_from(cls,
                    trial: Trial,
                    trial_runner: "trial_runner.TrialRunner",
                    new_allocation: List[str],
                    copy_state=False) -> "AdaptDLTrial":
        """ Create a new AdaptDLTrial from a Trial or AdaptDLTrial with new
        allocations. This also replaces the existing Trial."""
        checkpoint_path = None
        logger.debug(f"Creating {trial} with {len(new_allocation)} replicas")
        if copy_state:
            if trial.runner is not None:
                # Fetch the state from the other trial
                checkpoint_obj = ray.get(
                    trial.runner.save_all_states.remote(
                        trial.runner.get_state.remote()))
                # Dump it to disk
                temp_checkpoint_dir = (
                    FuncCheckpointUtil.mk_temp_checkpoint_dir(trial.logdir))
                checkpoint_path = TrainableUtil.create_from_pickle(
                    checkpoint_obj, temp_checkpoint_dir)
            else:
                # trial was PAUSED
                checkpoint_path = trial.restore_path

        # Spawn a new trial
        new_trial = cls._clone_from(trial,
                                    new_allocation,
                                    restore_path=checkpoint_path)
        # Keep a reference to the PG manager's trial_in_use for later use
        new_trial._trial_in_use = (
            trial_runner.trial_executor._pg_manager.trial_in_use)
        # Replace the old trial in the trial runner
        new_trial._requeue(trial, trial_runner)
        assert new_trial.restore_path == checkpoint_path
        assert new_trial.status == Trial.PENDING
        return new_trial
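
A hypothetical call site for create_from, sketching how a scheduler hook with access to the trial runner might replace a trial when its allocation changes; the variable names and the node list are illustrative and not from the original:

# trial and trial_runner come from the surrounding scheduler hook (illustrative).
new_allocation = ["node-1", "node-1", "node-2"]  # one entry per desired replica
new_trial = AdaptDLTrial.create_from(
    trial, trial_runner, new_allocation, copy_state=True)
# create_from re-queues new_trial in place of trial; the new trial starts out
# PENDING and, with copy_state=True, restores from the checkpoint written above.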
Example 4
    def testTempCheckpointDir(self):
        checkpoint_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
        assert FuncCheckpointUtil.is_temp_checkpoint_dir(checkpoint_dir)
Example 5
    def testEmptyCheckpoint(self):
        checkpoint_dir = FuncCheckpointUtil.mk_null_checkpoint_dir(self.logdir)
        assert FuncCheckpointUtil.is_null_checkpoint(checkpoint_dir)
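
Examples 1, 4, and 5 together exercise the three kinds of checkpoint directories FuncCheckpointUtil distinguishes: temporary, null (empty), and permanent. A combined sketch under the same import assumptions as above; the logdir value and the step number are illustrative:

import tempfile

from ray.tune.function_runner import FuncCheckpointUtil  # assumed Ray 1.x path

logdir = tempfile.mkdtemp()  # illustrative log directory

# Temporary checkpoint directories are recognized by is_temp_checkpoint_dir...
temp_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(logdir)
assert FuncCheckpointUtil.is_temp_checkpoint_dir(temp_dir)

# ...null checkpoint directories by is_null_checkpoint...
null_dir = FuncCheckpointUtil.mk_null_checkpoint_dir(logdir)
assert FuncCheckpointUtil.is_null_checkpoint(null_dir)

# ...and a temporary checkpoint can be promoted to a permanent one.
perm_dir = FuncCheckpointUtil.create_perm_checkpoint(temp_dir, logdir, step=0)
assert not FuncCheckpointUtil.is_temp_checkpoint_dir(perm_dir)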