def resume(self):
    """Resume all checkpointed trials from the previous run.

    Requires the user to manually re-register their objects. Any
    ongoing trials are stopped as part of resuming.
    """
    ckpt_path = _find_newest_ckpt(self._local_checkpoint_dir)
    with open(ckpt_path, "r") as fh:
        state = json.load(fh, cls=_TuneFunctionDecoder)
    self.checkpoint_file = ckpt_path
    # Resuming is best-effort: warn that new spec changes are ignored.
    logger.warning("".join([
        "Attempting to resume experiment from {}. ".format(
            self._local_checkpoint_dir),
        "This feature is experimental, "
        "and may not work with all search algorithms. ",
        "This will ignore any new changes to the specification."
    ]))
    self.__setstate__(state["runner_data"])

    # Rebuild each Trial object from its serialized checkpoint dict.
    restored = []
    for cp in state["checkpoints"]:
        t = Trial(cp["trainable_name"])
        t.__setstate__(cp)
        restored.append(t)

    # Re-add trials, most recently updated first.
    restored.sort(key=lambda t: t.last_update_time, reverse=True)
    for t in restored:
        self.add_trial(t)
def resume(self, run_errored_only=False):
    """Resume all checkpointed trials from the previous run.

    Requires the user to manually re-register their objects. Any
    ongoing trials are stopped as part of resuming.

    Args:
        run_errored_only (bool): If True, trials that previously ended
            in the ERROR state are reset and re-run from scratch rather
            than restored as-is.
    """
    ckpt_path = _find_newest_ckpt(self._local_checkpoint_dir)
    with open(ckpt_path, "r") as fh:
        state = json.load(fh, cls=_TuneFunctionDecoder)
    self.checkpoint_file = ckpt_path
    logger.warning("".join([
        "Attempting to resume experiment from {}. ".format(
            self._local_checkpoint_dir),
        "This will ignore any new changes to the specification."
    ]))
    self.__setstate__(state["runner_data"])

    # Restore the search algorithm's own state if it checkpointed one.
    if self._search_alg.has_checkpoint(self._local_checkpoint_dir):
        self._search_alg.restore_from_dir(self._local_checkpoint_dir)

    # Rebuild each Trial object from its serialized checkpoint dict.
    restored = []
    for cp in state["checkpoints"]:
        t = Trial(cp["trainable_name"])
        t.__setstate__(cp)
        restored.append(t)

    # Re-add trials, most recently updated first. Errored trials are
    # optionally reset so they start fresh instead of staying failed.
    restored.sort(key=lambda t: t.last_update_time, reverse=True)
    for t in restored:
        if run_errored_only and t.status == Trial.ERROR:
            self.add_trial(t.reset())
        else:
            self.add_trial(t)
def restore(cls,
            metadata_checkpoint_dir,
            search_alg=None,
            scheduler=None,
            trial_executor=None):
    """Restore all checkpointed trials from a previous run.

    Requires the user to manually re-register their objects. Any
    ongoing trials are stopped as part of restoring.

    Args:
        metadata_checkpoint_dir (str): Path to metadata checkpoints.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment.
        trial_executor (TrialExecutor): Manage the execution of trials.

    Returns:
        runner (TrialRunner): A TrialRunner to resume experiments from.
    """
    ckpt_path = _find_newest_ckpt(metadata_checkpoint_dir)
    with open(ckpt_path, "r") as fh:
        state = json.load(fh)

    # Restoring is best-effort: warn that new spec changes are ignored.
    logger.warning("".join([
        "Attempting to resume experiment from {}. ".format(
            metadata_checkpoint_dir),
        "This feature is experimental, "
        "and may not work with all search algorithms. ",
        "This will ignore any new changes to the specification."
    ]))

    # Imported here to avoid a circular import at module load time.
    from ray.tune.suggest import BasicVariantGenerator
    runner = TrialRunner(
        search_alg or BasicVariantGenerator(),
        scheduler=scheduler,
        trial_executor=trial_executor)
    runner.__setstate__(state["runner_data"])

    # Rebuild each Trial object from its serialized checkpoint dict.
    restored = []
    for cp in state["checkpoints"]:
        t = Trial(cp["trainable_name"])
        t.__setstate__(cp)
        restored.append(t)

    # Re-add trials, most recently updated first.
    restored.sort(key=lambda t: t.last_update_time, reverse=True)
    for t in restored:
        runner.add_trial(t)
    return runner
def make_stub_if_needed(trial: Trial) -> Trial:
    """Return ``trial`` unchanged if it is already a stub.

    Otherwise build a stub Trial with the same trainable name and copy
    the original trial's full state onto it.
    """
    if trial.stub:
        return trial
    stub = Trial(trial.trainable_name, stub=True)
    stub.__setstate__(trial.__getstate__())
    return stub