def testConvertTempToPermanent(self):
    """Promoting a temp checkpoint dir yields a distinct, existing,
    non-temporary directory that is discoverable as a checkpoint dir."""
    temp_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
    perm_dir = FuncCheckpointUtil.create_perm_checkpoint(
        temp_dir, self.logdir, step=4)
    # The permanent dir must be self-locating via TrainableUtil.
    assert TrainableUtil.find_checkpoint_dir(perm_dir) == perm_dir
    assert os.path.exists(perm_dir)
    # It must no longer be classified as temporary.
    assert not FuncCheckpointUtil.is_temp_checkpoint_dir(perm_dir)
    # A fresh temp dir is always distinct from the promoted one.
    another_temp_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
    assert another_temp_dir != perm_dir
def pause(self, trial_runner):
    """Pause this AdaptDLTrial, checkpointing its state to disk and
    releasing the placement group (PG) currently attached to it.

    The trial's ``restore_path`` is set so that resuming later restores
    from the checkpoint written here.
    """
    assert self.runner is not None
    # Pull the full state off the remote runner as a pickled blob.
    state_future = self.runner.get_state.remote()
    state_blob = ray.get(self.runner.save_all_states.remote(state_future))
    # Persist the blob into a temporary checkpoint directory.
    staging_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
    checkpoint_path = TrainableUtil.create_from_pickle(
        state_blob, staging_dir)
    # Resuming the trial later will restore from this path.
    self.restore_path = checkpoint_path
    # Clear the allocation. This is a hack to drop the PG associated
    # with the trial: assign a tiny placeholder PG that will be swapped
    # for a real one on resume. Needed because Tune keeps PGs around
    # even for PAUSED trials.
    self.placement_group_factory = PlacementGroupFactory([{"CPU": 0.001}])
    # Force Tune to garbage-collect unneeded PGs so they can be reused.
    pg_manager = trial_runner.trial_executor._pg_manager
    pg_manager.reconcile_placement_groups([self])
    logger.debug(f"PAUSING {self} w/ checkpoint at {checkpoint_path}")
def create_from(cls, trial: Trial,
                trial_runner: "trial_runner.TrialRunner",
                new_allocation: List[str],
                copy_state=False) -> "AdaptDLTrial":
    """Build a new AdaptDLTrial from an existing (AdaptDL)Trial with a
    fresh allocation, replacing the original trial in the runner.

    When ``copy_state`` is true the new trial restores from a checkpoint
    of the old trial's state: taken live from its runner when one exists,
    otherwise (trial was PAUSED) from its existing ``restore_path``.
    """
    restore_from = None
    logger.debug(f"Creating {trial} with {len(new_allocation)} replicas")
    if copy_state:
        if trial.runner is None:
            # Trial was PAUSED; a checkpoint was already written.
            restore_from = trial.restore_path
        else:
            # Snapshot the live runner's state...
            state_blob = ray.get(
                trial.runner.save_all_states.remote(
                    trial.runner.get_state.remote()))
            # ...and dump it into a temp checkpoint directory.
            staging_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(
                trial.logdir)
            restore_from = TrainableUtil.create_from_pickle(
                state_blob, staging_dir)
    # Spawn the replacement trial with the new allocation.
    new_trial = cls._clone_from(trial, new_allocation,
                                restore_path=restore_from)
    # Keep the PG-manager hook for later use by the trials.
    executor = trial_runner.trial_executor
    new_trial._trial_in_use = executor._pg_manager.trial_in_use
    # Swap the new trial in for the old one.
    new_trial._requeue(trial, trial_runner)
    assert new_trial.restore_path == restore_from
    assert new_trial.status == Trial.PENDING
    return new_trial
def testTempCheckpointDir(self):
    """A freshly created temp checkpoint dir is recognized as temporary."""
    temp_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
    assert FuncCheckpointUtil.is_temp_checkpoint_dir(temp_dir)
def testEmptyCheckpoint(self):
    """A null checkpoint dir is recognized as a null checkpoint."""
    null_dir = FuncCheckpointUtil.mk_null_checkpoint_dir(self.logdir)
    assert FuncCheckpointUtil.is_null_checkpoint(null_dir)