Example #1
0
    def resume(self):
        """Resumes all checkpointed trials from previous run.

        Requires user to manually re-register their objects. Also stops
        all ongoing trials.
        """
        # Find the most recent experiment-level checkpoint on disk.
        newest_ckpt_path = _find_newest_ckpt(self._local_checkpoint_dir)
        with open(newest_ckpt_path, "r") as f:
            # _TuneFunctionDecoder restores callables serialized in the state.
            runner_state = json.load(f, cls=_TuneFunctionDecoder)
            self.checkpoint_file = newest_ckpt_path

        logger.warning("".join([
            "Attempting to resume experiment from {}. ".format(
                self._local_checkpoint_dir), "This feature is experimental, "
            "and may not work with all search algorithms. ",
            "This will ignore any new changes to the specification."
        ]))

        # Restore runner-level state before re-creating trials.
        self.__setstate__(runner_state["runner_data"])

        # Rebuild each trial from its serialized checkpoint dict.
        trials = []
        for trial_cp in runner_state["checkpoints"]:
            new_trial = Trial(trial_cp["trainable_name"])
            new_trial.__setstate__(trial_cp)
            trials.append(new_trial)
        # Re-add trials most-recently-updated first.
        for trial in sorted(
                trials, key=lambda t: t.last_update_time, reverse=True):
            self.add_trial(trial)
Example #2
0
    def resume(self, run_errored_only=False):
        """Resumes all checkpointed trials from previous run.

        Requires user to manually re-register their objects. Also stops
        all ongoing trials.

        Args:
            run_errored_only (bool): If True, trials that previously ended
                in the ERROR state are reset before being re-added, so they
                run again from scratch.
        """
        # Find the most recent experiment-level checkpoint on disk.
        newest_ckpt_path = _find_newest_ckpt(self._local_checkpoint_dir)
        with open(newest_ckpt_path, "r") as f:
            # _TuneFunctionDecoder restores callables serialized in the state.
            runner_state = json.load(f, cls=_TuneFunctionDecoder)
            self.checkpoint_file = newest_ckpt_path

        logger.warning("".join([
            "Attempting to resume experiment from {}. ".format(
                self._local_checkpoint_dir),
            "This will ignore any new changes to the specification."
        ]))

        # Restore runner-level state, then the search algorithm's own
        # checkpoint if one exists in the same directory.
        self.__setstate__(runner_state["runner_data"])
        if self._search_alg.has_checkpoint(self._local_checkpoint_dir):
            self._search_alg.restore_from_dir(self._local_checkpoint_dir)

        # Rebuild each trial from its serialized checkpoint dict.
        trials = []
        for trial_cp in runner_state["checkpoints"]:
            new_trial = Trial(trial_cp["trainable_name"])
            new_trial.__setstate__(trial_cp)
            trials.append(new_trial)
        # Re-add trials most-recently-updated first.
        for trial in sorted(trials,
                            key=lambda t: t.last_update_time,
                            reverse=True):
            if run_errored_only and trial.status == Trial.ERROR:
                # Reset errored trials so they re-run instead of staying
                # in the terminal ERROR state.
                new_trial = trial.reset()
                self.add_trial(new_trial)
            else:
                self.add_trial(trial)
Example #3
0
    def restore(cls,
                metadata_checkpoint_dir,
                search_alg=None,
                scheduler=None,
                trial_executor=None):
        """Restores all checkpointed trials from previous run.

        Requires user to manually re-register their objects. Also stops
        all ongoing trials.

        Args:
            metadata_checkpoint_dir (str): Path to metadata checkpoints.
            search_alg (SearchAlgorithm): Search Algorithm. Defaults to
                BasicVariantGenerator.
            scheduler (TrialScheduler): Scheduler for executing
                the experiment.
            trial_executor (TrialExecutor): Manage the execution of trials.

        Returns:
            runner (TrialRunner): A TrialRunner to resume experiments from.
        """

        # Find the most recent experiment-level checkpoint on disk.
        newest_ckpt_path = _find_newest_ckpt(metadata_checkpoint_dir)
        with open(newest_ckpt_path, "r") as f:
            runner_state = json.load(f)

        logger.warning("".join([
            "Attempting to resume experiment from {}. ".format(
                metadata_checkpoint_dir), "This feature is experimental, "
            "and may not work with all search algorithms. ",
            "This will ignore any new changes to the specification."
        ]))

        # Local import avoids a circular dependency at module load time.
        from ray.tune.suggest import BasicVariantGenerator
        runner = TrialRunner(
            search_alg or BasicVariantGenerator(),
            scheduler=scheduler,
            trial_executor=trial_executor)

        # Restore runner-level state before re-creating trials.
        runner.__setstate__(runner_state["runner_data"])

        # Rebuild each trial from its serialized checkpoint dict.
        trials = []
        for trial_cp in runner_state["checkpoints"]:
            new_trial = Trial(trial_cp["trainable_name"])
            new_trial.__setstate__(trial_cp)
            trials.append(new_trial)
        # Re-add trials most-recently-updated first.
        for trial in sorted(
                trials, key=lambda t: t.last_update_time, reverse=True):
            runner.add_trial(trial)
        return runner
Example #4
0
    def restore(cls,
                metadata_checkpoint_dir,
                search_alg=None,
                scheduler=None,
                trial_executor=None):
        """Restores all checkpointed trials from previous run.

        Requires user to manually re-register their objects. Also stops
        all ongoing trials.

        Args:
            metadata_checkpoint_dir (str): Path to metadata checkpoints.
            search_alg (SearchAlgorithm): Search Algorithm. Defaults to
                BasicVariantGenerator.
            scheduler (TrialScheduler): Scheduler for executing
                the experiment.
            trial_executor (TrialExecutor): Manage the execution of trials.

        Returns:
            runner (TrialRunner): A TrialRunner to resume experiments from.
        """

        # Load the most recent serialized runner state from disk.
        ckpt_path = _find_newest_ckpt(metadata_checkpoint_dir)
        with open(ckpt_path, "r") as f:
            state = json.load(f)

        logger.warning("".join([
            "Attempting to resume experiment from {}. ".format(
                metadata_checkpoint_dir), "This feature is experimental, "
            "and may not work with all search algorithms. ",
            "This will ignore any new changes to the specification."
        ]))

        # Imported here rather than at module scope.
        from ray.tune.suggest import BasicVariantGenerator
        runner = TrialRunner(
            search_alg or BasicVariantGenerator(),
            scheduler=scheduler,
            trial_executor=trial_executor)

        runner.__setstate__(state["runner_data"])

        # Reconstruct every trial from its checkpoint dict, then hand them
        # back to the runner with the freshest trials first.
        restored = []
        for cp in state["checkpoints"]:
            trial = Trial(cp["trainable_name"])
            trial.__setstate__(cp)
            restored.append(trial)
        restored.sort(key=lambda t: t.last_update_time, reverse=True)
        for trial in restored:
            runner.add_trial(trial)
        return runner
Example #5
0
 def make_stub_if_needed(trial: Trial) -> Trial:
     """Return *trial* unchanged if it is already a stub, else a stub copy.

     The copy is a fresh stub ``Trial`` carrying the full serialized state
     of the original.
     """
     if not trial.stub:
         stub = Trial(trial.trainable_name, stub=True)
         stub.__setstate__(trial.__getstate__())
         return stub
     return trial