Example #1
0
    def testUserCheckpoint(self):
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

        ray.init(num_cpus=3)
        runner = TrialRunner(local_checkpoint_dir=self.tmpdir,
                             checkpoint_period=0)
        runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
        trials = runner.get_trials()

        runner.step()  # Start trial
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
        runner.step()  # Process result, dispatch save
        runner.step()  # Process save
        self.assertTrue(trials[0].has_checkpoint())

        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
        runner2.step()  # 5: Start trial and dispatch restore
        trials2 = runner2.get_trials()
        self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)
Example #2
0
    def testCheckpointing(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 1
            },
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()  # Start trial
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
        runner.step()  # Process result, dispatch save
        runner.step()  # Process save, stop trial
        kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
        self.assertEqual(trials[0].status, Trial.TERMINATED)

        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()
        self.assertEqual(trials[1].status, Trial.PENDING)
        runner.step()  # Start trial, dispatch restore
        self.assertEqual(trials[1].status, Trial.RUNNING)

        runner.step()  # Process restore
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[1].runner.get_info.remote()), 1)
        self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)
Example #3
0
    def testTrialErrorResumeFalse(self):
        ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
        runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 4
            },
            "resources": Resources(cpu=1, gpu=0),
        }
        trials = [
            Trial("__fake", config={"mock_error": True}, **kwargs),
            Trial("__fake", **kwargs),
            Trial("__fake", **kwargs),
        ]
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()

        runner.checkpoint(force=True)

        assert trials[0].status == Trial.ERROR
        del runner

        new_runner = TrialRunner(resume=True, local_checkpoint_dir=self.tmpdir)
        assert len(new_runner.get_trials()) == 3
        assert Trial.ERROR in (t.status for t in new_runner.get_trials())
Example #4
0
    def testTrialNoCheckpointSave(self):
        """Check that non-checkpointing trials *are* saved."""
        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

        ray.init(num_cpus=3)

        runner = TrialRunner(local_checkpoint_dir=self.tmpdir,
                             checkpoint_period=0)
        runner.add_trial(
            Trial(
                "__fake",
                trial_id="non_checkpoint",
                stopping_criterion={"training_iteration": 2},
            ))

        while not all(t.status == Trial.TERMINATED
                      for t in runner.get_trials()):
            runner.step()

        runner.add_trial(
            Trial(
                "__fake",
                trial_id="checkpoint",
                checkpoint_at_end=True,
                stopping_criterion={"training_iteration": 2},
            ))

        while not all(t.status == Trial.TERMINATED
                      for t in runner.get_trials()):
            runner.step()

        runner.add_trial(
            Trial(
                "__fake",
                trial_id="pending",
                stopping_criterion={"training_iteration": 2},
            ))

        old_trials = runner.get_trials()
        while not old_trials[2].has_reported_at_least_once:
            runner.step()

        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
        new_trials = runner2.get_trials()
        self.assertEqual(len(new_trials), 3)
        self.assertTrue(
            runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
        self.assertTrue(
            runner2.get_trial("checkpoint").status == Trial.TERMINATED)
        self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
        self.assertTrue(
            runner2.get_trial("pending").has_reported_at_least_once)
        runner2.step()
Example #5
0
    def testSearchAlgStalled(self):
        """Checks that runner and searcher state is maintained when stalled."""
        ray.init(num_cpus=4, num_gpus=2)
        experiment_spec = {
            "run": "__fake",
            "num_samples": 3,
            "stop": {
                "training_iteration": 1
            },
        }
        experiments = [Experiment.from_json("test", experiment_spec)]
        search_alg = _MockSuggestionAlgorithm(max_concurrent=1)
        search_alg.add_configurations(experiments)
        searcher = search_alg.searcher
        runner = TrialRunner(search_alg=search_alg)
        runner.step()
        trials = runner.get_trials()
        while trials[0].status != Trial.TERMINATED:
            runner.step()

        runner.step()
        trials = runner.get_trials()
        self.assertEqual(trials[1].status, Trial.RUNNING)
        self.assertEqual(len(searcher.live_trials), 1)

        searcher.stall = True

        while trials[1].status != Trial.TERMINATED:
            runner.step()
        self.assertEqual(trials[1].status, Trial.TERMINATED)
        self.assertEqual(len(searcher.live_trials), 0)

        self.assertTrue(all(trial.is_finished() for trial in trials))
        self.assertFalse(search_alg.is_finished())
        self.assertFalse(runner.is_finished())

        searcher.stall = False

        runner.step()
        trials = runner.get_trials()
        self.assertEqual(trials[2].status, Trial.RUNNING)
        self.assertEqual(len(searcher.live_trials), 1)

        while trials[2].status != Trial.TERMINATED:
            runner.step()

        self.assertEqual(len(searcher.live_trials), 0)
        self.assertTrue(search_alg.is_finished())
        self.assertTrue(runner.is_finished())
Example #6
0
    def testFailureRecoveryEnabled(self):
        ray.init(num_cpus=1, num_gpus=1)
        searchalg, scheduler = create_mock_components()

        runner = TrialRunner(searchalg, scheduler=scheduler)

        kwargs = {
            "stopping_criterion": {
                "training_iteration": 2
            },
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 1,
            "config": {
                "mock_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[0].num_failures, 1)
        self.assertEqual(len(searchalg.errored_trials), 0)
        # Notice this is 1 since during recovery, the previously errored trial
        # is "requeued". This will call scheduler.on_trial_error.
        # Searcher.on_trial_error is, however, not called in this process.
        self.assertEqual(len(scheduler.errored_trials), 1)
Example #7
0
    def _test_repeater(self, num_samples, repeat):
        class TestSuggestion(Searcher):
            index = 0

            def suggest(self, trial_id):
                self.index += 1
                return {"test_variable": 5 + self.index}

            def on_trial_complete(self, *args, **kwargs):
                return

        searcher = TestSuggestion(metric="episode_reward_mean")
        repeat_searcher = Repeater(searcher, repeat=repeat, set_index=False)
        alg = SearchGenerator(repeat_searcher)
        experiment_spec = {
            "run": "__fake",
            "num_samples": num_samples,
            "stop": {
                "training_iteration": 1
            },
        }
        alg.add_configurations({"test": experiment_spec})
        runner = TrialRunner(search_alg=alg)
        while not runner.is_finished():
            runner.step()

        return runner.get_trials()
Example #8
0
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")


tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()
Example #9
0
    def testTrialErrorResumeTrue(self):
        ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
        runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 4
            },
            "resources": Resources(cpu=1, gpu=0),
        }
        trials = [
            Trial("__fake", config={"mock_error": True}, **kwargs),
            Trial("__fake", **kwargs),
            Trial("__fake", **kwargs),
        ]
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()

        runner.checkpoint(force=True)

        assert trials[0].status == Trial.ERROR
        del runner

        new_runner = TrialRunner(resume="ERRORED_ONLY",
                                 local_checkpoint_dir=self.tmpdir)
        assert len(new_runner.get_trials()) == 3
        assert Trial.ERROR not in (t.status for t in new_runner.get_trials())
        # The below is just a check for standard behavior.
        disable_error = False
        for t in new_runner.get_trials():
            if t.config.get("mock_error"):
                t.config["mock_error"] = False
                disable_error = True
        assert disable_error

        while not new_runner.is_finished():
            new_runner.step()
        assert Trial.ERROR not in (t.status for t in new_runner.get_trials())
Example #10
0
    def testRestoreMetricsAfterCheckpointing(self):
        ray.init(num_cpus=1, num_gpus=1)

        observer = TrialResultObserver()
        runner = TrialRunner(callbacks=[observer])
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 2
            },
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()

        self.assertEqual(trials[0].status, Trial.TERMINATED)

        kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
        kwargs.pop("stopping_criterion")
        kwargs.pop("checkpoint_freq")  # No checkpointing for next trial
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        observer.reset()
        while not observer.just_received_a_result():
            runner.step()
        self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
        self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
        self.assertGreater(trials[1].last_result["time_since_restore"], 0)

        while not observer.just_received_a_result():
            runner.step()

        self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
        self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
        self.assertGreater(trials[1].last_result["time_since_restore"], 0)
        self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)
Example #11
0
def test_counting_resources(start_connected_cluster):
    """Tests that Tune accounting is consistent with actual cluster."""

    cluster = start_connected_cluster
    nodes = []
    assert ray.cluster_resources()["CPU"] == 1
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {"stopping_criterion": {"training_iteration": 10}}

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert ray.available_resources().get("CPU", 0) == 0
    nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 2
    cluster.remove_node(nodes.pop())
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1
    runner.step()
    # Only 1 trial can be running due to resource limitation.
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1

    for i in range(5):
        nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 6

    # This is to make sure that pg is ready for the previous pending trial,
    # so that when runner.step() is called next, the trial can be started in
    # the same event loop.
    time.sleep(5)
    runner.step()
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 2
Example #12
0
    def testResultDone(self):
        """Tests that last_result is marked `done` after trial is complete."""
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].last_result[DONE], True)
Example #13
0
    def testCheckpointingAtEnd(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "checkpoint_at_end": True,
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].last_result[DONE], True)
        self.assertEqual(trials[0].has_checkpoint(), True)
Example #14
0
def _report_progress(runner: TrialRunner,
                     reporter: ProgressReporter,
                     done: bool = False):
    """Reports experiment progress.

    Args:
        runner: Trial runner to report on.
        reporter: Progress reporter.
        done: Whether this is the last progress report attempt.
    """
    trials = runner.get_trials()
    if reporter.should_report(trials, done=done):
        sched_debug_str = runner.scheduler_alg.debug_string()
        executor_debug_str = runner.trial_executor.debug_string()
        reporter.report(trials, done, sched_debug_str, executor_debug_str)
Example #15
0
    def testUserCheckpointBuffered(self):
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "8"
        os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

        def num_checkpoints(trial):
            return sum(
                item.startswith("checkpoint_")
                for item in os.listdir(trial.logdir))

        ray.init(num_cpus=3)
        runner = TrialRunner(local_checkpoint_dir=self.tmpdir,
                             checkpoint_period=0)
        runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 10}))
        trials = runner.get_trials()

        runner.step()  # Start trial, schedule 1-8
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
        self.assertEqual(num_checkpoints(trials[0]), 0)

        runner.step()  # Process results 0-8, schedule 9-11 (CP)
        self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 8)
        self.assertFalse(trials[0].has_checkpoint())
        self.assertEqual(num_checkpoints(trials[0]), 0)

        runner.step()  # Process results 9-11
        runner.step()  # handle CP, schedule 12-19
        self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 11)
        self.assertTrue(trials[0].has_checkpoint())
        self.assertEqual(num_checkpoints(trials[0]), 1)

        runner.step()  # Process results 12-19, schedule 20-21
        self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 19)
        self.assertTrue(trials[0].has_checkpoint())
        self.assertEqual(num_checkpoints(trials[0]), 1)

        runner.step()  # Process results 20-21
        runner.step()  # handle CP, schedule 21-29
        self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 21)
        self.assertTrue(trials[0].has_checkpoint())
        self.assertEqual(num_checkpoints(trials[0]), 2)

        runner.step()  # Process results 21-29, schedule 30-31
        self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 29)
        self.assertTrue(trials[0].has_checkpoint())
        self.assertTrue(trials[0].has_checkpoint())
        self.assertEqual(num_checkpoints(trials[0]), 2)
Example #16
0
    def testSearchAlgFinished(self):
        """Checks that SearchAlg is Finished before all trials are done."""
        ray.init(num_cpus=4, local_mode=True, include_dashboard=False)
        experiment_spec = {"run": "__fake", "stop": {"training_iteration": 1}}
        experiments = [Experiment.from_json("test", experiment_spec)]
        searcher = _MockSuggestionAlgorithm()
        searcher.add_configurations(experiments)
        runner = TrialRunner(search_alg=searcher)
        runner.step()
        trials = runner.get_trials()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertTrue(searcher.is_finished())
        self.assertFalse(runner.is_finished())

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(len(searcher.live_trials), 0)
        self.assertTrue(searcher.is_finished())
        self.assertTrue(runner.is_finished())
Example #17
0
    def testFailureRecoveryMaxFailures(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 2,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[0].num_failures, 3)
Example #18
0
    def testPauseThenResume(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()  # Start trial
        runner.step()  # Process result
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.get_info.remote()), None)

        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

        runner.trial_executor.pause_trial(trials[0])
        self.assertEqual(trials[0].status, Trial.PAUSED)
Example #19
0
    def testCheckpointWithFunction(self):
        ray.init(num_cpus=2)

        trial = Trial(
            "__fake",
            config={"callbacks": {
                "on_episode_start": lambda i: i,
            }},
            checkpoint_freq=1,
        )
        runner = TrialRunner(local_checkpoint_dir=self.tmpdir,
                             checkpoint_period=0)
        runner.add_trial(trial)
        for _ in range(5):
            runner.step()
        # force checkpoint
        runner.checkpoint()
        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
        new_trial = runner2.get_trials()[0]
        self.assertTrue("callbacks" in new_trial.config)
        self.assertTrue("on_episode_start" in new_trial.config["callbacks"])
Example #20
0
    def testSearchAlgSchedulerInteraction(self):
        """Checks that TrialScheduler killing trial will notify SearchAlg."""
        class _MockScheduler(FIFOScheduler):
            def on_trial_result(self, *args, **kwargs):
                return TrialScheduler.STOP

        ray.init(num_cpus=4, local_mode=True, include_dashboard=False)
        experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
        experiments = [Experiment.from_json("test", experiment_spec)]
        searcher = _MockSuggestionAlgorithm()
        searcher.add_configurations(experiments)
        runner = TrialRunner(search_alg=searcher, scheduler=_MockScheduler())
        runner.step()
        trials = runner.get_trials()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertTrue(searcher.is_finished())
        self.assertFalse(runner.is_finished())

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(len(searcher.live_trials), 0)
        self.assertTrue(searcher.is_finished())
        self.assertTrue(runner.is_finished())
Example #21
0
    def testFailureRecoveryDisabled(self):
        ray.init(num_cpus=1, num_gpus=1)
        searchalg, scheduler = create_mock_components()

        runner = TrialRunner(searchalg, scheduler=scheduler)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()

        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[0].num_failures, 1)
        self.assertEqual(len(searchalg.errored_trials), 1)
        self.assertEqual(len(scheduler.errored_trials), 1)
Example #22
0
    def testFailFast(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(fail_fast=True)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        # Somehow with `fail_fast=True`, if one errors out, the others are
        # then stopped with `TERMINATED` status.
        self.assertEqual(trials[1].status, Trial.TERMINATED)
        self.assertRaises(TuneError, lambda: runner.step())
Example #23
0
    def testFailFastRaise(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(fail_fast=TrialRunner.RAISE)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        with self.assertRaises(Exception):
            while not runner.is_finished():
                runner.step()

        # Not critical checks. Only to showcase the difference
        # with none raise type FailFast.
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.PENDING)
Example #24
0
def run(
    run_or_experiment: Union[str, Callable, Type],
    name: Optional[str] = None,
    metric: Optional[str] = None,
    mode: Optional[str] = None,
    stop: Optional[Union[Mapping, Stopper, Callable[[str, Mapping],
                                                    bool]]] = None,
    time_budget_s: Optional[Union[int, float, datetime.timedelta]] = None,
    config: Optional[Dict[str, Any]] = None,
    resources_per_trial: Union[None, Mapping[str, Union[float, int, Mapping]],
                               PlacementGroupFactory] = None,
    num_samples: int = 1,
    local_dir: Optional[str] = None,
    search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None,
    scheduler: Optional[Union[TrialScheduler, str]] = None,
    keep_checkpoints_num: Optional[int] = None,
    checkpoint_score_attr: Optional[str] = None,
    checkpoint_freq: int = 0,
    checkpoint_at_end: bool = False,
    verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
    progress_reporter: Optional[ProgressReporter] = None,
    log_to_file: bool = False,
    trial_name_creator: Optional[Callable[[Trial], str]] = None,
    trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
    sync_config: Optional[SyncConfig] = None,
    export_formats: Optional[Sequence] = None,
    max_failures: int = 0,
    fail_fast: bool = False,
    restore: Optional[str] = None,
    server_port: Optional[int] = None,
    resume: Union[bool, str] = False,
    reuse_actors: Optional[bool] = None,
    trial_executor: Optional[RayTrialExecutor] = None,
    raise_on_failed_trial: bool = True,
    callbacks: Optional[Sequence[Callback]] = None,
    max_concurrent_trials: Optional[int] = None,
    # == internal only ==
    _experiment_checkpoint_dir: Optional[str] = None,
    _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Many aspects of Tune, such as the frequency of global checkpointing,
    maximum pending placement group trials and the path of the result
    directory be configured through environment variables. Refer to
    :ref:`tune-env-vars` for a list of environment variables available.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment: If function|class|str, this is the algorithm or
            model to train. This may refer to the name of a built-on algorithm
            (e.g. RLlib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric: Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode: Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name: Name of experiment.
        stop: Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s: Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config: Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial: Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples: Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir: Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg: Search algorithm for
            optimization. You can also use the name of the algorithm.
        scheduler: Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options. You can also use the
            name of the scheduler.
        keep_checkpoints_num: Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr: Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq: How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end: Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose: 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter: Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file: Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator: Optional function
            for generating the trial string representation.
        trial_dirname_creator: Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config: Configuration object for syncing. See
            tune.SyncConfig.
        export_formats: List of formats that exported at the end of
            the experiment. Default is None.
        max_failures: Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast: Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore: Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port: Port number for launching TuneServer.
        resume: One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY", "AUTO",
            or bool. "LOCAL"/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. "REMOTE" restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            "PROMPT" provides the CLI feedback.
            False forces a new experiment. "ERRORED_ONLY" resets and reruns
            errored trials upon resume - previous trial artifacts will
            be left untouched.
            "AUTO" will attempt to resume from a checkpoint and otherwise
            start a new experiment.
            If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors: Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
            Defaults to ``True`` for function trainables and ``False`` for
            class and registered trainables.
        trial_executor: Manage the execution of trials.
        raise_on_failed_trial: Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        callbacks: List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        max_concurrent_trials: Maximum number of trials to run
            concurrently. Must be non-negative. If None or 0, no limit will
            be applied. This is achieved by wrapping the ``search_alg`` in
            a :class:`ConcurrencyLimiter`, and thus setting this argument
            will raise an exception if the ``search_alg`` is already a
            :class:`ConcurrencyLimiter`. Defaults to None.
        _remote: Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """
    # NO CODE IS TO BE ADDED ABOVE THIS COMMENT
    # remote_run_kwargs must be defined before any other
    # code is ran to ensure that at this point,
    # `locals()` is equal to args and kwargs
    remote_run_kwargs = locals().copy()
    remote_run_kwargs.pop("_remote")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run)

        # Make sure tune.run is called on the sever node.
        remote_run = force_on_current_node(remote_run)

        set_verbosity(verbose)
        progress_reporter = progress_reporter or detect_reporter()

        # JupyterNotebooks don't work with remote tune runs out of the box
        # (e.g. via Ray client) as they don't have access to the main
        # process stdout. So we introduce a queue here that accepts
        # strings, which will then be displayed on the driver side.
        if isinstance(progress_reporter, RemoteReporterMixin):
            string_queue = Queue(actor_options={
                "num_cpus": 0,
                **force_on_current_node(None)
            })
            progress_reporter.output_queue = string_queue

            def get_next_queue_item():
                try:
                    return string_queue.get(block=False)
                except Empty:
                    return None

        else:
            # If we don't need a queue, use this dummy get fn instead of
            # scheduling an unneeded actor
            def get_next_queue_item():
                return None

        def _handle_string_queue():
            string_item = get_next_queue_item()
            while string_item is not None:
                # This happens on the driver side
                progress_reporter.display(string_item)

                string_item = get_next_queue_item()

        # Override with detected progress reporter
        remote_run_kwargs["progress_reporter"] = progress_reporter
        remote_future = remote_run.remote(_remote=False, **remote_run_kwargs)

        # ray.wait(...)[1] returns futures that are not ready, yet
        while ray.wait([remote_future], timeout=0.2)[1]:
            # Check if we have items to execute
            _handle_string_queue()

        # Handle queue one last time
        _handle_string_queue()

        return ray.get(remote_future)

    del remote_run_kwargs

    all_start = time.time()

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    _validate_upload_dir(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    result_buffer_length = None

    # Create scheduler here as we need access to some of its properties
    if isinstance(scheduler, str):
        # importing at top level causes a recursive dependency
        from ray.tune.schedulers import create_scheduler

        scheduler = create_scheduler(scheduler)
    scheduler = scheduler or FIFOScheduler()

    if not scheduler.supports_buffered_results:
        # Result buffering with e.g. a Hyperband scheduler is a bad idea, as
        # hyperband tries to stop trials when processing brackets. With result
        # buffering, we might trigger this multiple times when evaluating
        # a single trial, which leads to unexpected behavior.
        env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "")
        if env_result_buffer_length:
            warnings.warn(
                f"You are using a {type(scheduler)} scheduler, but "
                f"TUNE_RESULT_BUFFER_LENGTH is set "
                f"({env_result_buffer_length}). This can lead to undesired "
                f"and faulty behavior, so the buffer length was forcibly set "
                f"to 1 instead.")
        result_buffer_length = 1

    # If reuse_actors is unset, default to False for string and class trainables,
    # and default to True for everything else (i.e. function trainables)
    if reuse_actors is None:
        trainable = (run_or_experiment.run_identifier if isinstance(
            run_or_experiment, Experiment) else run_or_experiment)
        reuse_actors = (
            # Only default to True for function trainables that meet certain conditions
            is_function_trainable(trainable) and not (
                # Changing resources requires restarting actors
                scheduler and isinstance(scheduler, ResourceChangingScheduler))
            and not (
                # If GPUs are requested we could run into problems with device memory
                _check_gpus_in_resources(resources_per_trial))
            and not (
                # If the resource request is overridden, we don't know if GPUs
                # will be requested, yet, so default to False
                _check_default_resources_override(trainable)))

    if (isinstance(scheduler,
                   (PopulationBasedTraining, PopulationBasedTrainingReplay))
            and not reuse_actors):
        warnings.warn(
            "Consider boosting PBT performance by enabling `reuse_actors` as "
            "well as implementing `reset_config` for Trainable.")

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, result_buffer_length=result_buffer_length)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                _experiment_checkpoint_dir=_experiment_checkpoint_dir,
                sync_config=sync_config,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore,
            )
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if isinstance(search_alg, str):
        search_alg = create_searcher(search_alg)

    # if local_mode=True is set during ray.init().
    is_local_mode = ray._private.worker._mode(
    ) == ray._private.worker.LOCAL_MODE

    if is_local_mode:
        max_concurrent_trials = 1

    if not search_alg:
        search_alg = BasicVariantGenerator(
            max_concurrent=max_concurrent_trials or 0)
    elif max_concurrent_trials or is_local_mode:
        if isinstance(search_alg, ConcurrencyLimiter):
            if not is_local_mode:
                if search_alg.max_concurrent != max_concurrent_trials:
                    raise ValueError(
                        "You have specified `max_concurrent_trials="
                        f"{max_concurrent_trials}`, but the `search_alg` is "
                        "already a `ConcurrencyLimiter` with `max_concurrent="
                        f"{search_alg.max_concurrent}. FIX THIS by setting "
                        "`max_concurrent_trials=None`.")
                else:
                    logger.warning(
                        "You have specified `max_concurrent_trials="
                        f"{max_concurrent_trials}`, but the `search_alg` is "
                        "already a `ConcurrencyLimiter`. "
                        "`max_concurrent_trials` will be ignored.")
        else:
            if max_concurrent_trials < 1:
                raise ValueError(
                    "`max_concurrent_trials` must be greater or equal than 1, "
                    f"got {max_concurrent_trials}.")
            if isinstance(search_alg, Searcher):
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=max_concurrent_trials)
            elif not is_local_mode:
                logger.warning(
                    "You have passed a `SearchGenerator` instance as the "
                    "`search_alg`, but `max_concurrent_trials` requires a "
                    "`Searcher` instance`. `max_concurrent_trials` "
                    "will be ignored.")

    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if config and not searcher_set_search_props(
            search_alg.set_search_properties,
            metric,
            mode,
            config,
            **experiments[0].public_spec,
    ):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    if not scheduler_set_search_props(scheduler.set_search_properties, metric,
                                      mode, **experiments[0].public_spec):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(callbacks, sync_config, metric=metric)

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_config=sync_config,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric,
        # Driver should only sync trial checkpoints if
        # checkpoints are not synced to cloud
        driver_sync_trial_checkpoints=not bool(sync_config.upload_dir),
    )

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment but "
                    "updating trial resources.")
        if resources_per_trial:
            runner.update_pending_trial_resources(resources_per_trial)

    # Calls setup on callbacks
    runner.setup_experiments(experiments=experiments,
                             total_num_samples=search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if _check_gpus_in_resources(resources=resources_per_trial):
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {"signal": None}

    def signal_interrupt_tune_run(sig: int, frame):
        logger.warning(
            "Stop signal received (e.g. via SIGINT/Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state["signal"] = sig
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    # We should only install the handler when it is safe to do so.
    # When tune.run() is called from worker thread, signal.signal will
    # fail.
    allow_signal_catching = True
    if threading.current_thread() != threading.main_thread():
        allow_signal_catching = False

    if allow_signal_catching:
        if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
            signal.signal(signal.SIGINT, signal_interrupt_tune_run)

        # Always register SIGUSR1 if available (not available e.g. on Windows)
        if hasattr(signal, "SIGUSR1"):
            signal.signal(signal.SIGUSR1, signal_interrupt_tune_run)

    progress_reporter = progress_reporter or detect_reporter()

    tune_start = time.time()

    progress_reporter.setup(
        start_time=tune_start,
        total_samples=search_alg.total_samples,
        metric=metric,
        mode=mode,
    )
    while not runner.is_finished() and not state["signal"]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    # Wait for syncing to finish
    for callback in callbacks:
        if isinstance(callback, SyncerCallback):
            try:
                callback.wait_for_all()
            except TuneError as e:
                logger.error(e)

    runner.cleanup()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state["signal"]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state["signal"]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode,
        sync_config=sync_config,
    )
Example #25
0
    def testPauseResumeCheckpointCount(self):
        ray.init(num_cpus=2)
        tempdir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tempdir)

        trial = Trial("__fake", keep_checkpoints_num=2)
        trial.init_logdir()
        trial.checkpoint_manager.set_delete_fn(
            lambda cp: shutil.rmtree(cp.dir_or_data))

        def write_checkpoint(trial: Trial, index: int):
            checkpoint_dir = TrainableUtil.make_checkpoint_dir(trial.logdir,
                                                               index=index)
            result = {"training_iteration": index}
            with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f:
                json.dump(result, f)

            tune_cp = _TrackedCheckpoint(
                dir_or_data=checkpoint_dir,
                storage_mode=CheckpointStorage.PERSISTENT,
                metrics=result,
            )
            trial.saving_to = tune_cp

            return checkpoint_dir

        def get_checkpoint_dirs(trial: Trial):
            return [
                d for d in os.listdir(trial.logdir)
                if d.startswith("checkpoint_")
            ]

        runner = TrialRunner(local_checkpoint_dir=tempdir)
        runner.add_trial(trial)

        # Write 1 checkpoint
        result = write_checkpoint(trial, 1)
        runner._on_saving_result(trial, result)

        # Expect 1 checkpoint
        cp_dirs = get_checkpoint_dirs(trial)
        self.assertEqual(len(cp_dirs), 1, msg=f"Checkpoint dirs: {cp_dirs}")

        # Write second checkpoint
        result = write_checkpoint(trial, 2)
        runner._on_saving_result(trial, result)

        # Expect 2 checkpoints
        cp_dirs = get_checkpoint_dirs(trial)
        self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}")

        # Write third checkpoint
        result = write_checkpoint(trial, 3)
        runner._on_saving_result(trial, result)

        # Expect 2 checkpoints because keep_checkpoints_num = 2
        cp_dirs = get_checkpoint_dirs(trial)
        self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}")

        # Re-instantiate trial runner and resume
        runner.checkpoint(force=True)
        runner = TrialRunner(local_checkpoint_dir=tempdir)
        runner.resume()

        trial = runner.get_trials()[0]
        trial.checkpoint_manager.set_delete_fn(
            lambda cp: shutil.rmtree(cp.dir_or_data))

        # Write fourth checkpoint
        result = write_checkpoint(trial, 4)
        runner._on_saving_result(trial, result)

        # Expect 2 checkpoints because keep_checkpoints_num = 2
        cp_dirs = get_checkpoint_dirs(trial)
        self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}")

        # Write fifth checkpoint
        result = write_checkpoint(trial, 5)
        runner._on_saving_result(trial, result)

        # Expect 2 checkpoints because keep_checkpoints_num = 2
        cp_dirs = get_checkpoint_dirs(trial)
        self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}")

        # Checkpoints before restore should be deleted
        self.assertIn("checkpoint_000004", cp_dirs)
        self.assertIn("checkpoint_000005", cp_dirs)

        self.assertNotIn("checkpoint_000002", cp_dirs)
        self.assertNotIn("checkpoint_000003", cp_dirs)
def test_cluster_interrupt_searcher(start_connected_cluster, tmpdir, searcher):
    """Tests restoration of HyperOptSearch experiment on cluster shutdown
    with actual interrupt.

    Restoration should restore both state of trials
    and previous search algorithm (HyperOptSearch) state.
    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    from ray.tune import register_trainable

    register_trainable("trainable", MyTrainableClass)

    def execute_script_with_args(*args):
        current_dir = os.path.dirname(__file__)
        script = os.path.join(current_dir,
                              "_test_cluster_interrupt_searcher.py")
        subprocess.Popen([sys.executable, script] + list(args))

    args = (
        "--ray-address",
        cluster.address,
        "--local-dir",
        dirpath,
        "--searcher",
        searcher,
    )
    execute_script_with_args(*args)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    trials = []
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            if trials and len(trials) >= 10:
                break
        time.sleep(0.5)
    else:
        raise ValueError(f"Didn't generate enough trials: {len(trials)}")

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError(
            f"Checkpoint file didn't appear in {local_checkpoint_dir}. "
            f"Current list: {os.listdir(local_checkpoint_dir)}.")

    ray.shutdown()
    cluster.shutdown()

    cluster = _start_new_cluster()
    execute_script_with_args(*(args + ("--resume", )))

    time.sleep(2)

    register_trainable("trainable", MyTrainableClass)
    reached = False
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()

            if len(trials) == 0:
                continue  # nonblocking script hasn't resumed yet, wait

            reached = True
            assert len(trials) >= 10
            assert len(trials) <= 20
            if len(trials) == 20:
                break
            else:
                stop_fn = runner.trial_executor.stop_trial
                [stop_fn(t) for t in trials if t.status is not Trial.ERROR]
        time.sleep(0.5)
    assert reached is True

    ray.shutdown()
    cluster.shutdown()
Example #27
0
    def testBurnInPeriod(self):
        runner = TrialRunner(trial_executor=MagicMock())

        scheduler = PopulationBasedTraining(
            time_attr="training_iteration",
            metric="error",
            mode="min",
            perturbation_interval=5,
            burn_in_period=50,
            log_config=True,
            synch=True,
        )

        class MockTrial(Trial):
            @property
            def checkpoint(self):
                return _TrackedCheckpoint(
                    dir_or_data={"data": "None"},
                    storage_mode=CheckpointStorage.MEMORY,
                    metrics={},
                )

            @property
            def status(self):
                return Trial.PAUSED

            @status.setter
            def status(self, status):
                pass

        trial1 = MockTrial("PPO", config=dict(num=1))
        trial2 = MockTrial("PPO", config=dict(num=2))
        trial3 = MockTrial("PPO", config=dict(num=3))
        trial4 = MockTrial("PPO", config=dict(num=4))

        runner.add_trial(trial1)
        runner.add_trial(trial2)
        runner.add_trial(trial3)
        runner.add_trial(trial4)

        scheduler.on_trial_add(runner, trial1)
        scheduler.on_trial_add(runner, trial2)
        scheduler.on_trial_add(runner, trial3)
        scheduler.on_trial_add(runner, trial4)

        # Add initial results.
        scheduler.on_trial_result(runner,
                                  trial1,
                                  result=dict(training_iteration=1, error=50))
        scheduler.on_trial_result(runner,
                                  trial2,
                                  result=dict(training_iteration=1, error=50))
        scheduler.on_trial_result(runner,
                                  trial3,
                                  result=dict(training_iteration=1, error=10))
        scheduler.on_trial_result(runner,
                                  trial4,
                                  result=dict(training_iteration=1, error=100))

        # Add more results. Without burn-in, this would now exploit
        scheduler.on_trial_result(runner,
                                  trial1,
                                  result=dict(training_iteration=30, error=50))
        scheduler.on_trial_result(runner,
                                  trial2,
                                  result=dict(training_iteration=30, error=50))
        scheduler.on_trial_result(runner,
                                  trial3,
                                  result=dict(training_iteration=30, error=10))
        scheduler.on_trial_result(runner,
                                  trial4,
                                  result=dict(training_iteration=30,
                                              error=100))

        self.assertEqual(trial4.config["num"], 4)

        # Add more results. Since this is after burn-in, it should now exploit
        scheduler.on_trial_result(runner,
                                  trial1,
                                  result=dict(training_iteration=50, error=50))
        scheduler.on_trial_result(runner,
                                  trial2,
                                  result=dict(training_iteration=50, error=50))
        scheduler.on_trial_result(runner,
                                  trial3,
                                  result=dict(training_iteration=50, error=10))
        scheduler.on_trial_result(runner,
                                  trial4,
                                  result=dict(training_iteration=50,
                                              error=100))

        self.assertEqual(trial4.config["num"], 3)

        # Assert that trials do not hang after `burn_in_period`
        self.assertTrue(all(t.status == "PAUSED" for t in runner.get_trials()))
        self.assertTrue(scheduler.choose_trial_to_run(runner))

        # Assert that trials do not hang when a terminated trial is added
        trial5 = Trial("PPO", config=dict(num=5))
        runner.add_trial(trial5)
        scheduler.on_trial_add(runner, trial5)
        trial5.set_status(Trial.TERMINATED)
        self.assertTrue(scheduler.choose_trial_to_run(runner))
Example #28
0
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""
        def setup(self, config):
            self.state = {"hi": 0}

        def step(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def save_checkpoint(self, path):
            return self.state

        def load_checkpoint(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(line[4:] if len(line) else line
                            for line in inspect.getsource(_Mock).split("\n"))

    script = """
import os
import time
import ray
from ray import tune

os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"

ray.init(address="{address}")

{fail_class_code}

tune.run(
    {fail_class},
    name="experiment",
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address,
        checkpoint_dir=dirpath,
        fail_class_code=reformatted,
        fail_class=_Mock.__name__,
    )
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment.register_if_needed(_Mock)

    # Inspect the internal trialrunner
    runner = TrialRunner(resume="LOCAL",
                         local_checkpoint_dir=local_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    ray.shutdown()
    cluster.shutdown()
Example #29
0
    def testSearcherSaveRestore(self):
        ray.init(num_cpus=8, local_mode=True)

        def create_searcher():
            class TestSuggestion(Searcher):
                def __init__(self, index):
                    self.index = index
                    self.returned_result = []
                    super().__init__(metric="episode_reward_mean", mode="max")

                def suggest(self, trial_id):
                    self.index += 1
                    return {"test_variable": self.index}

                def on_trial_complete(self, trial_id, result=None, **kwargs):
                    self.returned_result.append(result)

                def save(self, checkpoint_path):
                    with open(checkpoint_path, "wb") as f:
                        pickle.dump(self.__dict__, f)

                def restore(self, checkpoint_path):
                    with open(checkpoint_path, "rb") as f:
                        self.__dict__.update(pickle.load(f))

            searcher = TestSuggestion(0)
            searcher = ConcurrencyLimiter(searcher, max_concurrent=2)
            searcher = Repeater(searcher, repeat=3, set_index=False)
            search_alg = SearchGenerator(searcher)
            experiment_spec = {
                "run": "__fake",
                "num_samples": 20,
                "stop": {
                    "training_iteration": 2
                },
            }
            experiments = [Experiment.from_json("test", experiment_spec)]
            search_alg.add_configurations(experiments)
            return search_alg

        searcher = create_searcher()
        runner = TrialRunner(search_alg=searcher,
                             local_checkpoint_dir=self.tmpdir,
                             checkpoint_period=-1)
        for i in range(6):
            runner.step()

        assert len(
            runner.get_trials()) == 6, [t.config for t in runner.get_trials()]
        runner.checkpoint()
        trials = runner.get_trials()
        [
            runner.trial_executor.stop_trial(t) for t in trials
            if t.status is not Trial.ERROR
        ]
        del runner
        # stop_all(runner.get_trials())

        searcher = create_searcher()
        runner2 = TrialRunner(search_alg=searcher,
                              local_checkpoint_dir=self.tmpdir,
                              resume="LOCAL")
        assert len(runner2.get_trials()) == 6, [
            t.config for t in runner2.get_trials()
        ]

        def trial_statuses():
            return [t.status for t in runner2.get_trials()]

        def num_running_trials():
            return sum(t.status == Trial.RUNNING for t in runner2.get_trials())

        for i in range(6):
            runner2.step()
        assert len(set(trial_statuses())) == 1
        assert Trial.RUNNING in trial_statuses()
        for i in range(20):
            runner2.step()
            assert 1 <= num_running_trials() <= 6
        evaluated = [
            t.evaluated_params["test_variable"] for t in runner2.get_trials()
        ]
        count = Counter(evaluated)
        assert all(v <= 3 for v in count.values())