Python TrialRunner Examples, ray.tune.execution.trial_runner.TrialRunner Python Examples

Example #1

0

Show file

File: test_cluster.py Project: ray-project/ray

def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")


tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()

Example #2

0

Show file

File: test_trial_runner_callbacks.py Project: vishalbelsare/ray

    def setUp(self):

        ray.init()
        self.tmpdir = tempfile.mkdtemp()
        self.callback = TestCallback()
        self.executor = _MockTrialExecutor()
        self.trial_runner = TrialRunner(trial_executor=self.executor,
                                        callbacks=[self.callback])
        # experiment would never be None normally, but it's fine for testing
        self.trial_runner.setup_experiments(experiments=[None],
                                            total_num_samples=1)

Example #3

0

Show file

File: test_trial_runner_callbacks.py Project: vishalbelsare/ray

    def testCallbackSetupBackwardsCompatible(self, mocked_warning_method):
        class NoExperimentInSetupCallback(Callback):
            # Old method definition didn't take in **experiment.public_spec
            def setup(self):
                return

        callback = NoExperimentInSetupCallback()
        trial_runner = TrialRunner(callbacks=[callback])
        trial_runner.setup_experiments(
            experiments=[Experiment("", lambda x: x)], total_num_samples=1)
        mocked_warning_method.assert_called_once()
        self.assertIn("Please update",
                      mocked_warning_method.call_args_list[0][0][0])

Example #4

0

Show file

File: test_cluster.py Project: ray-project/ray

def test_migration_checkpoint_removal(start_connected_emptyhead_cluster,
                                      tmpdir, durable):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    if durable:
        upload_dir = "file://" + str(tmpdir)
        syncer_callback = SyncerCallback()
    else:
        upload_dir = None
        syncer_callback = custom_driver_logdir_callback(str(tmpdir))

    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": upload_dir,
    }

    # Test recovery of trial that has been checkpointed
    t1 = Trial("__fake", **kwargs)
    runner.add_trial(t1)

    # Start trial, process result (x2), process save
    while not t1.has_checkpoint():
        runner.step()

    cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()

    # Remove checkpoint on "remote" node
    shutil.rmtree(os.path.dirname(t1.checkpoint.dir_or_data))

    if not durable:
        # Recover from driver file
        t1.checkpoint.dir_or_data = os.path.join(
            tmpdir,
            t1.relative_logdir,
            os.path.relpath(t1.checkpoint.dir_or_data, t1.logdir),
        )

    while not runner.is_finished():
        runner.step()
    assert t1.status == Trial.TERMINATED, runner.debug_string()

Example #5

0

Show file

File: test_trial_runner_2.py Project: parasj/ray

    def testFailureRecoveryEnabled(self):
        ray.init(num_cpus=1, num_gpus=1)
        searchalg, scheduler = create_mock_components()

        runner = TrialRunner(searchalg, scheduler=scheduler)

        kwargs = {
            "stopping_criterion": {
                "training_iteration": 2
            },
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 1,
            "config": {
                "mock_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[0].num_failures, 1)
        self.assertEqual(len(searchalg.errored_trials), 0)
        # Notice this is 1 since during recovery, the previously errored trial
        # is "requeued". This will call scheduler.on_trial_error.
        # Searcher.on_trial_error is, however, not called in this process.
        self.assertEqual(len(scheduler.errored_trials), 1)

Example #6

0

Show file

File: test_trial_runner_pg.py Project: vishalbelsare/ray

    def testResourceDeadlock(self):
        """Tests that resource deadlock is avoided for heterogeneous PGFs.

        We start 4 trials in a cluster with 2 CPUs. The first two trials
        require 1 CPU each, the third trial 2 CPUs, the fourth trial 1 CPU.

        The second trial needs a bit more time to finish. This means that the
        resources from the first trial will be freed, and the PG of the
        _fourth_ trial becomes ready (not that of the third trial, because that
        requires 2 CPUs - however, one is still occupied by trial 2).

        After the first two trials finished, the FIFOScheduler tries to start
        the third trial. However, it can't be started because its placement
        group is not ready. Instead, the placement group of the fourth
        trial is ready. Thus, we opt to run the fourth trial instead.
        """
        def train(config):
            time.sleep(config["sleep"])
            return 4

        ray.init(num_cpus=2)

        tune.register_trainable("het", train)
        pgf1 = PlacementGroupFactory([{"CPU": 1}])
        pgf2 = PlacementGroupFactory([{"CPU": 2}])

        trial1 = Trial("het",
                       config={"sleep": 0},
                       placement_group_factory=pgf1)
        trial2 = Trial("het",
                       config={"sleep": 2},
                       placement_group_factory=pgf1)
        trial3 = Trial("het",
                       config={"sleep": 0},
                       placement_group_factory=pgf2)
        trial4 = Trial("het",
                       config={"sleep": 0},
                       placement_group_factory=pgf1)

        runner = TrialRunner(fail_fast=True)
        runner.add_trial(trial1)
        runner.add_trial(trial2)
        runner.add_trial(trial3)
        runner.add_trial(trial4)

        timeout = time.monotonic() + 30
        while not runner.is_finished():
            # We enforce a timeout here
            self.assertLess(time.monotonic(),
                            timeout,
                            msg="Ran into a resource deadlock")

            runner.step()

Example #7

0

Show file

    def testChangeResources(self):
        """Checks that resource requirements can be changed on fly."""
        ray.init(num_cpus=2)

        class ChangingScheduler(FIFOScheduler):
            def __init__(self):
                self._has_received_one_trial_result = False

            # For figuring out how many runner.step there are.
            def has_received_one_trial_result(self):
                return self._has_received_one_trial_result

            def on_trial_result(self, trial_runner, trial, result):
                if result["training_iteration"] == 1:
                    self._has_received_one_trial_result = True
                    executor = trial_runner.trial_executor
                    executor.pause_trial(trial)
                    trial.update_resources(dict(cpu=2, gpu=0))
                return TrialScheduler.NOOP

        scheduler = ChangingScheduler()
        runner = TrialRunner(scheduler=scheduler)
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "resources": Resources(cpu=1, gpu=0),
        }
        trials = [Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(
            runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 1
        )
        self.assertRaises(
            ValueError, lambda: trials[0].update_resources(dict(cpu=2, gpu=0))
        )

        while not scheduler.has_received_one_trial_result():
            runner.step()
        self.assertEqual(trials[0].status, Trial.PAUSED)
        # extra step for tune loop to stage the resource requests.
        runner.step()
        self.assertEqual(
            runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 2
        )

Example #8

0

Show file

File: test_trial_runner_3.py Project: ray-project/ray

    def testSearchAlgNotification(self):
        """Checks notification of trial to the Search Algorithm."""
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

        ray.init(num_cpus=4, num_gpus=2)
        experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
        experiments = [Experiment.from_json("test", experiment_spec)]
        search_alg = _MockSuggestionAlgorithm()
        searcher = search_alg.searcher
        search_alg.add_configurations(experiments)
        runner = TrialRunner(search_alg=search_alg)

        while not runner.is_finished():
            runner.step()

        self.assertEqual(searcher.counter["result"], 1)
        self.assertEqual(searcher.counter["complete"], 1)

Example #9

0

Show file

File: test_trial_runner_2.py Project: ray-project/ray

    def testResultDone(self):
        """Tests that last_result is marked `done` after trial is complete."""
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].last_result[DONE], True)

Example #10

0

Show file

File: test_trial_runner_3.py Project: ray-project/ray

    def testStepHook(self):
        ray.init(num_cpus=4, num_gpus=2)
        runner = TrialRunner()

        def on_step_begin(self, trialrunner):
            self._resource_updater.update_avail_resources()
            cnt = self.pre_step if hasattr(self, "pre_step") else 0
            self.pre_step = cnt + 1

        def on_step_end(self, trialrunner):
            cnt = self.pre_step if hasattr(self, "post_step") else 0
            self.post_step = 1 + cnt

        import types

        runner.trial_executor.on_step_begin = types.MethodType(
            on_step_begin, runner.trial_executor)
        runner.trial_executor.on_step_end = types.MethodType(
            on_step_end, runner.trial_executor)

        kwargs = {
            "stopping_criterion": {
                "training_iteration": 5
            },
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.step()
        self.assertEqual(runner.trial_executor.pre_step, 1)
        self.assertEqual(runner.trial_executor.post_step, 1)

Example #11

0

Show file

File: test_trial_runner_3.py Project: ray-project/ray

    def _test_repeater(self, num_samples, repeat):
        class TestSuggestion(Searcher):
            index = 0

            def suggest(self, trial_id):
                self.index += 1
                return {"test_variable": 5 + self.index}

            def on_trial_complete(self, *args, **kwargs):
                return

        searcher = TestSuggestion(metric="episode_reward_mean")
        repeat_searcher = Repeater(searcher, repeat=repeat, set_index=False)
        alg = SearchGenerator(repeat_searcher)
        experiment_spec = {
            "run": "__fake",
            "num_samples": num_samples,
            "stop": {
                "training_iteration": 1
            },
        }
        alg.add_configurations({"test": experiment_spec})
        runner = TrialRunner(search_alg=alg)
        while not runner.is_finished():
            runner.step()

        return runner.get_trials()

Example #12

0

Show file

File: test_trial_runner_2.py Project: ray-project/ray

    def testCheckpointingAtEnd(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "checkpoint_at_end": True,
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].last_result[DONE], True)
        self.assertEqual(trials[0].has_checkpoint(), True)

Example #13

0

Show file

File: test_trial_runner_3.py Project: ray-project/ray

    def testSearchAlgFinished(self):
        """Checks that SearchAlg is Finished before all trials are done."""
        ray.init(num_cpus=4, local_mode=True, include_dashboard=False)
        experiment_spec = {"run": "__fake", "stop": {"training_iteration": 1}}
        experiments = [Experiment.from_json("test", experiment_spec)]
        searcher = _MockSuggestionAlgorithm()
        searcher.add_configurations(experiments)
        runner = TrialRunner(search_alg=searcher)
        runner.step()
        trials = runner.get_trials()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertTrue(searcher.is_finished())
        self.assertFalse(runner.is_finished())

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(len(searcher.live_trials), 0)
        self.assertTrue(searcher.is_finished())
        self.assertTrue(runner.is_finished())

Example #14

0

Show file

def _report_progress(runner: TrialRunner,
                     reporter: ProgressReporter,
                     done: bool = False):
    """Reports experiment progress.

    Args:
        runner: Trial runner to report on.
        reporter: Progress reporter.
        done: Whether this is the last progress report attempt.
    """
    trials = runner.get_trials()
    if reporter.should_report(trials, done=done):
        sched_debug_str = runner.scheduler_alg.debug_string()
        executor_debug_str = runner.trial_executor.debug_string()
        reporter.report(trials, done, sched_debug_str, executor_debug_str)

Example #15

0

Show file

File: test_tune_server.py Project: vishalbelsare/ray

    def basicSetup(self):

        ray.init(num_cpus=4, num_gpus=1)
        port = get_valid_port()
        self.runner = TrialRunner(server_port=port)
        runner = self.runner
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 3
            },
            "resources": Resources(cpu=1, gpu=1),
        }
        trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)
        client = TuneClient("localhost", port)
        return runner, client

Example #16

0

Show file

File: test_trial_runner_2.py Project: parasj/ray

    def testFailureRecoveryMaxFailures(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 2,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[0].num_failures, 3)

Example #17

0

Show file

File: test_trial_runner_2.py Project: ray-project/ray

    def testErrorHandling(self):
        ray.init(num_cpus=4, num_gpus=2)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 1},
            "resources": Resources(cpu=1, gpu=1),
        }
        _global_registry.register(TRAINABLE_CLASS, "asdf", None)
        trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[1].status, Trial.PENDING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[1].status, Trial.RUNNING)

Example #18

0

Show file

File: test_trial_runner_2.py Project: parasj/ray

    def testFailFast(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(fail_fast=True)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        # Somehow with `fail_fast=True`, if one errors out, the others are
        # then stopped with `TERMINATED` status.
        self.assertEqual(trials[1].status, Trial.TERMINATED)
        self.assertRaises(TuneError, lambda: runner.step())

Example #19

0

Show file

File: test_trial_runner_3.py Project: ray-project/ray

    def testSearchAlgSchedulerInteraction(self):
        """Checks that TrialScheduler killing trial will notify SearchAlg."""
        class _MockScheduler(FIFOScheduler):
            def on_trial_result(self, *args, **kwargs):
                return TrialScheduler.STOP

        ray.init(num_cpus=4, local_mode=True, include_dashboard=False)
        experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
        experiments = [Experiment.from_json("test", experiment_spec)]
        searcher = _MockSuggestionAlgorithm()
        searcher.add_configurations(experiments)
        runner = TrialRunner(search_alg=searcher, scheduler=_MockScheduler())
        runner.step()
        trials = runner.get_trials()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertTrue(searcher.is_finished())
        self.assertFalse(runner.is_finished())

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(len(searcher.live_trials), 0)
        self.assertTrue(searcher.is_finished())
        self.assertTrue(runner.is_finished())

Example #20

0

Show file

    def testMultiStepRun(self):
        ray.init(num_cpus=4, num_gpus=2)
        kwargs = {
            "stopping_criterion": {"training_iteration": 5},
            "resources": Resources(cpu=1, gpu=1),
        }
        trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
        snapshot = TrialStatusSnapshot()
        runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()

        self.assertTrue(snapshot.all_trials_are_terminated())

Example #21

0

Show file

    def testMultiStepRun2(self):
        """Checks that runner.step throws when overstepping."""
        ray.init(num_cpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "resources": Resources(cpu=1, gpu=0),
        }
        trials = [Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertRaises(TuneError, runner.step)

Example #22

0

Show file

    def testQueueFilling(self):
        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

        ray.init(num_cpus=4)

        def f1(config):
            for i in range(10):
                yield i

        tune.register_trainable("f1", f1)

        search_alg = BasicVariantGenerator()
        search_alg.add_configurations(
            {
                "foo": {
                    "run": "f1",
                    "num_samples": 100,
                    "config": {
                        "a": tune.sample_from(lambda spec: 5.0 / 7),
                        "b": tune.sample_from(lambda spec: "long" * 40),
                    },
                    "resources_per_trial": {"cpu": 2},
                }
            }
        )

        runner = TrialRunner(search_alg=search_alg)

        runner.step()
        runner.step()
        runner.step()
        self.assertEqual(len(runner._trials), 3)

        runner.step()
        self.assertEqual(len(runner._trials), 3)

        self.assertEqual(runner._trials[0].status, Trial.RUNNING)
        self.assertEqual(runner._trials[1].status, Trial.RUNNING)
        self.assertEqual(runner._trials[2].status, Trial.PENDING)

Example #23

0

Show file

File: test_trial_runner_2.py Project: ray-project/ray

    def testPauseThenResume(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()  # Start trial
        runner.step()  # Process result
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.get_info.remote()), None)

        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

        runner.trial_executor.pause_trial(trials[0])
        self.assertEqual(trials[0].status, Trial.PAUSED)

Example #24

0

Show file

    def testExtraResources(self):
        ray.init(num_cpus=4, num_gpus=2)
        snapshot = TrialStatusSnapshot()
        runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
        kwargs = {
            "stopping_criterion": {"training_iteration": 1},
            "placement_group_factory": PlacementGroupFactory(
                [{"CPU": 1}, {"CPU": 3, "GPU": 1}]
            ),
        }
        trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()

        self.assertLess(snapshot.max_running_trials(), 2)
        self.assertTrue(snapshot.all_trials_are_terminated())

Example #25

0

Show file

    def testExtraCustomResources(self):
        ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
        # Since each trial will occupy the full custom resources,
        # there are at most 1 trial running at any given moment.
        snapshot = TrialStatusSnapshot()
        runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
        kwargs = {
            "stopping_criterion": {"training_iteration": 1},
            "placement_group_factory": PlacementGroupFactory([{"CPU": 1}, {"a": 2}]),
        }
        trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()

        self.assertLess(snapshot.max_running_trials(), 2)
        self.assertTrue(snapshot.all_trials_are_terminated())

Example #26

0

Show file

File: test_trial_runner_2.py Project: parasj/ray

    def testFailureRecoveryDisabled(self):
        ray.init(num_cpus=1, num_gpus=1)
        searchalg, scheduler = create_mock_components()

        runner = TrialRunner(searchalg, scheduler=scheduler)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()

        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[0].num_failures, 1)
        self.assertEqual(len(searchalg.errored_trials), 1)
        self.assertEqual(len(scheduler.errored_trials), 1)

Example #27

0

Show file

File: test_trial_runner_2.py Project: parasj/ray

    def testFailFastRaise(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(fail_fast=TrialRunner.RAISE)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        with self.assertRaises(Exception):
            while not runner.is_finished():
                runner.step()

        # Not critical checks. Only to showcase the difference
        # with none raise type FailFast.
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.PENDING)

Example #28

0

Show file

File: test_trial_runner_3.py Project: ray-project/ray

    def testCheckpointAutoPeriod(self):
        ray.init(num_cpus=3)

        # This makes checkpointing take 2 seconds.

        class CustomSyncer(Syncer):
            def __init__(self, sync_period: float = 300.0):
                super(CustomSyncer, self).__init__(sync_period=sync_period)
                self._sync_status = {}

            def sync_up(self,
                        local_dir: str,
                        remote_dir: str,
                        exclude: list = None) -> bool:
                time.sleep(2)
                return True

            def sync_down(self,
                          remote_dir: str,
                          local_dir: str,
                          exclude: list = None) -> bool:
                time.sleep(2)
                return True

            def delete(self, remote_dir: str) -> bool:
                pass

        runner = TrialRunner(
            local_checkpoint_dir=self.tmpdir,
            checkpoint_period="auto",
            sync_config=SyncConfig(upload_dir="fake",
                                   syncer=CustomSyncer(),
                                   sync_period=0),
            remote_checkpoint_dir="fake",
        )
        runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 1}))

        runner.step()  # Run one step, this will trigger checkpointing

        self.assertGreaterEqual(runner._checkpoint_manager._checkpoint_period,
                                38.0)

Example #29

0

Show file

    def testFractionalGpus(self):
        ray.init(num_cpus=4, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "resources": Resources(cpu=1, gpu=0.5),
        }
        trials = [
            Trial("__fake", **kwargs),
            Trial("__fake", **kwargs),
            Trial("__fake", **kwargs),
            Trial("__fake", **kwargs),
        ]
        for t in trials:
            runner.add_trial(t)

        for _ in range(10):
            runner.step()

        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.RUNNING)
        self.assertEqual(trials[2].status, Trial.PENDING)
        self.assertEqual(trials[3].status, Trial.PENDING)

Example #30

0

Show file

def create_tune_experiment_checkpoint(trials: list, **runner_kwargs) -> str:
    experiment_dir = tempfile.mkdtemp()
    runner_kwargs.setdefault("local_checkpoint_dir", experiment_dir)

    # Update environment
    orig_env = os.environ.copy()

    # Set to 1 to disable ray cluster resource lookup. That way we can
    # create experiment checkpoints without initializing ray.
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    try:
        runner = TrialRunner(**runner_kwargs)

        for trial in trials:
            runner.add_trial(trial)

        runner.checkpoint(force=True)
    finally:
        os.environ.clear()
        os.environ.update(orig_env)

    return experiment_dir