def testCheckpointOverwrite(self):
    def count_checkpoints(cdir):
        return sum(
            (fname.startswith("experiment_state") and fname.endswith(".json"))
            for fname in os.listdir(cdir))

    ray.init(num_cpus=2)

    trial = Trial("__fake", checkpoint_freq=1)
    tmpdir = tempfile.mkdtemp()
    runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
    runner.add_trial(trial)
    for _ in range(5):
        runner.step()
    # force checkpoint
    runner.checkpoint()
    self.assertEqual(count_checkpoints(tmpdir), 1)

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
    for _ in range(5):
        runner2.step()
    self.assertEqual(count_checkpoints(tmpdir), 2)

    runner2.checkpoint()
    self.assertEqual(count_checkpoints(tmpdir), 2)
    shutil.rmtree(tmpdir)
def testUserCheckpoint(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    self.assertTrue(trials[0].has_checkpoint())

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    runner2.step()  # 5: Start trial and dispatch restore
    trials2 = runner2.get_trials()
    self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)
def testTrialErrorResumeFalse(self):
    ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [
        Trial("__fake", config={"mock_error": True}, **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    runner.checkpoint(force=True)

    assert trials[0].status == Trial.ERROR
    del runner

    new_runner = TrialRunner(resume=True, local_checkpoint_dir=self.tmpdir)
    assert len(new_runner.get_trials()) == 3
    assert Trial.ERROR in (t.status for t in new_runner.get_trials())
def testTrialNoCheckpointSave(self):
    """Check that non-checkpointing trials *are* saved."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(
        Trial(
            "__fake",
            trial_id="non_checkpoint",
            stopping_criterion={"training_iteration": 2},
        ))

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="checkpoint",
            checkpoint_at_end=True,
            stopping_criterion={"training_iteration": 2},
        ))

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="pending",
            stopping_criterion={"training_iteration": 2},
        ))

    old_trials = runner.get_trials()
    while not old_trials[2].has_reported_at_least_once:
        runner.step()

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    new_trials = runner2.get_trials()
    self.assertEqual(len(new_trials), 3)
    self.assertTrue(runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
    self.assertTrue(runner2.get_trial("pending").has_reported_at_least_once)
    runner2.step()
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")

tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved. The trainable returns every
    # 0.5 seconds, so this should not miss the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trial runner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()
def testCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    runner.step()  # Process result, dispatch save
    runner.step()  # Process save, stop trial
    kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()  # Start trial, dispatch restore
    self.assertEqual(trials[1].status, Trial.RUNNING)

    runner.step()  # Process restore
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[1].runner.get_info.remote()), 1)
    self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)
def testCheckpointFreqBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir))

    ray.init(num_cpus=2)

    trial = Trial("__fake", checkpoint_freq=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(trial)

    runner.step()  # start trial
    runner.step()  # run iteration 1-3
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
    self.assertEqual(num_checkpoints(trial), 1)

    runner.step()  # run iteration 4-6
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 6)
    self.assertEqual(num_checkpoints(trial), 2)

    runner.step()  # run iteration 7-9
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 9)
    self.assertEqual(num_checkpoints(trial), 3)
def _test_repeater(self, num_samples, repeat):
    class TestSuggestion(Searcher):
        index = 0

        def suggest(self, trial_id):
            self.index += 1
            return {"test_variable": 5 + self.index}

        def on_trial_complete(self, *args, **kwargs):
            return

    searcher = TestSuggestion(metric="episode_reward_mean")
    repeat_searcher = Repeater(searcher, repeat=repeat, set_index=False)
    alg = SearchGenerator(repeat_searcher)
    experiment_spec = {
        "run": "__fake",
        "num_samples": num_samples,
        "stop": {"training_iteration": 1},
    }
    alg.add_configurations({"test": experiment_spec})
    runner = TrialRunner(search_alg=alg)
    while not runner.is_finished():
        runner.step()

    return runner.get_trials()
def testStepHook(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()

    def on_step_begin(self, trialrunner):
        self._resource_updater.update_avail_resources()
        cnt = self.pre_step if hasattr(self, "pre_step") else 0
        self.pre_step = cnt + 1

    def on_step_end(self, trialrunner):
        cnt = self.post_step if hasattr(self, "post_step") else 0
        self.post_step = 1 + cnt

    import types

    runner.trial_executor.on_step_begin = types.MethodType(
        on_step_begin, runner.trial_executor)
    runner.trial_executor.on_step_end = types.MethodType(
        on_step_end, runner.trial_executor)

    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.step()
    self.assertEqual(runner.trial_executor.pre_step, 1)
    self.assertEqual(runner.trial_executor.post_step, 1)
def testFailureRecoveryEnabled(self):
    ray.init(num_cpus=1, num_gpus=1)
    searchalg, scheduler = create_mock_components()
    runner = TrialRunner(searchalg, scheduler=scheduler)
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 1,
        "config": {
            "mock_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[0].num_failures, 1)
    self.assertEqual(len(searchalg.errored_trials), 0)
    # Notice this is 1 since during recovery, the previously errored trial
    # is "requeued". This will call scheduler.on_trial_error.
    # Searcher.on_trial_error is, however, not called in this process.
    self.assertEqual(len(scheduler.errored_trials), 1)
def setUp(self):
    ray.init()
    self.tmpdir = tempfile.mkdtemp()
    self.callback = TestCallback()
    self.executor = _MockTrialExecutor()
    self.trial_runner = TrialRunner(
        trial_executor=self.executor, callbacks=[self.callback])
    # experiment would never be None normally, but it's fine for testing
    self.trial_runner.setup_experiments(experiments=[None], total_num_samples=1)
def testResourceDeadlock(self):
    """Tests that resource deadlock is avoided for heterogeneous PGFs.

    We start 4 trials in a cluster with 2 CPUs. The first two trials
    require 1 CPU each, the third trial 2 CPUs, the fourth trial 1 CPU.

    The second trial needs a bit more time to finish. This means that the
    resources from the first trial will be freed, and the PG of the
    _fourth_ trial becomes ready (not that of the third trial, because that
    requires 2 CPUs - however, one is still occupied by trial 2).

    After the first two trials finished, the FIFOScheduler tries to start
    the third trial. However, it can't be started because its placement
    group is not ready. Instead, the placement group of the fourth trial
    is ready. Thus, we opt to run the fourth trial instead.
    """

    def train(config):
        time.sleep(config["sleep"])
        return 4

    ray.init(num_cpus=2)
    tune.register_trainable("het", train)

    pgf1 = PlacementGroupFactory([{"CPU": 1}])
    pgf2 = PlacementGroupFactory([{"CPU": 2}])

    trial1 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)
    trial2 = Trial("het", config={"sleep": 2}, placement_group_factory=pgf1)
    trial3 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf2)
    trial4 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)

    runner = TrialRunner(fail_fast=True)
    runner.add_trial(trial1)
    runner.add_trial(trial2)
    runner.add_trial(trial3)
    runner.add_trial(trial4)

    timeout = time.monotonic() + 30
    while not runner.is_finished():
        # We enforce a timeout here
        self.assertLess(
            time.monotonic(), timeout, msg="Ran into a resource deadlock")
        runner.step()
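# Illustrative sketch, not part of the test above: PlacementGroupFactory
# describes the resource bundles a trial reserves, and (assuming Ray Tune's
# documented behavior) the first bundle is the one the trainable itself runs
# in. The two factories used in testResourceDeadlock therefore translate to a
# "1 CPU" and a "2 CPU" reservation on the 2-CPU test cluster; variable names
# below are only for illustration.
from ray.tune import PlacementGroupFactory

pgf_single = PlacementGroupFactory([{"CPU": 1}])  # fits next to another 1-CPU trial
pgf_whole = PlacementGroupFactory([{"CPU": 2}])  # needs both CPUs at once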
def testCheckpointWithFunction(self):
    ray.init(num_cpus=2)
    trial = Trial(
        "__fake",
        config={"callbacks": {"on_episode_start": lambda i: i}},
        checkpoint_freq=1,
    )
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(trial)
    for _ in range(5):
        runner.step()
    # force checkpoint
    runner.checkpoint()

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    new_trial = runner2.get_trials()[0]
    self.assertTrue("callbacks" in new_trial.config)
    self.assertTrue("on_episode_start" in new_trial.config["callbacks"])
def testCheckpointAtEndNotBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir))

    ray.init(num_cpus=2)

    trial = Trial(
        "__fake",
        checkpoint_at_end=True,
        stopping_criterion={"training_iteration": 4},
    )
    observer = TrialResultObserver()
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period=0,
        trial_executor=RayTrialExecutor(result_buffer_length=7),
        callbacks=[observer],
    )
    runner.add_trial(trial)

    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 1)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 2)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 4)

    while not runner.is_finished():
        runner.step()
    self.assertEqual(num_checkpoints(trial), 1)
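# Minimal sketch (an assumption, not the actual TrialResultObserver helper used
# above) of an observer built on ray.tune.Callback: it flags when a step
# delivered a new trial result, which is what the polling loops in
# testCheckpointAtEndNotBuffered rely on. The class name is hypothetical.
from ray.tune import Callback


class ResultFlagObserver(Callback):
    def __init__(self):
        self._new_result = False

    def on_trial_result(self, iteration, trials, trial, result, **info):
        # Invoked once per result the runner processes.
        self._new_result = True

    def just_received_a_result(self):
        # Report and clear the flag so callers can poll between runner.step()s.
        seen, self._new_result = self._new_result, False
        return seen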
def testTrialErrorResumeTrue(self):
    ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [
        Trial("__fake", config={"mock_error": True}, **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    runner.checkpoint(force=True)

    assert trials[0].status == Trial.ERROR
    del runner

    new_runner = TrialRunner(resume="ERRORED_ONLY", local_checkpoint_dir=self.tmpdir)
    assert len(new_runner.get_trials()) == 3
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())

    # The below is just a check for standard behavior.
    disable_error = False
    for t in new_runner.get_trials():
        if t.config.get("mock_error"):
            t.config["mock_error"] = False
            disable_error = True
    assert disable_error

    while not new_runner.is_finished():
        new_runner.step()
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())
def testCallbackSetupBackwardsCompatible(self, mocked_warning_method):
    class NoExperimentInSetupCallback(Callback):
        # Old method definition didn't take in **experiment.public_spec
        def setup(self):
            return

    callback = NoExperimentInSetupCallback()
    trial_runner = TrialRunner(callbacks=[callback])
    trial_runner.setup_experiments(
        experiments=[Experiment("", lambda x: x)], total_num_samples=1)
    mocked_warning_method.assert_called_once()
    self.assertIn("Please update", mocked_warning_method.call_args_list[0][0][0])
def testResultDone(self):
    """Tests that last_result is marked `done` after trial is complete."""
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].last_result[DONE], True)
def testSearchAlgFinishes(self):
    """Empty SearchAlg changing state in `next_trials` does not crash."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    class FinishFastAlg(_MockSuggestionAlgorithm):
        _index = 0

        def next_trial(self):
            spec = self._experiment.spec
            trial = None
            if self._index < spec["num_samples"]:
                trial = Trial(spec.get("run"), stopping_criterion=spec.get("stop"))
            self._index += 1

            if self._index > 4:
                self.set_finished()
            return trial

        def suggest(self, trial_id):
            return {}

    ray.init(num_cpus=2, local_mode=True, include_dashboard=False)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 2,
        "stop": {"training_iteration": 1},
    }
    searcher = FinishFastAlg()
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher.add_configurations(experiments)

    runner = TrialRunner(search_alg=searcher)
    self.assertFalse(runner.is_finished())
    runner.step()  # This launches a new run
    runner.step()  # This launches a 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the first run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This converts self._finished to True
    self.assertTrue(searcher.is_finished())
    self.assertRaises(TuneError, runner.step)
def test_migration_checkpoint_removal(
    start_connected_emptyhead_cluster, tmpdir, durable
):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    if durable:
        upload_dir = "file://" + str(tmpdir)
        syncer_callback = SyncerCallback()
    else:
        upload_dir = None
        syncer_callback = custom_driver_logdir_callback(str(tmpdir))

    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": upload_dir,
    }

    # Test recovery of trial that has been checkpointed
    t1 = Trial("__fake", **kwargs)
    runner.add_trial(t1)

    # Start trial, process result (x2), process save
    while not t1.has_checkpoint():
        runner.step()

    cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()

    # Remove checkpoint on "remote" node
    shutil.rmtree(os.path.dirname(t1.checkpoint.dir_or_data))

    if not durable:
        # Recover from driver file
        t1.checkpoint.dir_or_data = os.path.join(
            tmpdir,
            t1.relative_logdir,
            os.path.relpath(t1.checkpoint.dir_or_data, t1.logdir),
        )

    while not runner.is_finished():
        runner.step()
    assert t1.status == Trial.TERMINATED, runner.debug_string()
def testSearchAlgStalled(self):
    """Checks that runner and searcher state is maintained when stalled."""
    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 3,
        "stop": {"training_iteration": 1},
    }
    experiments = [Experiment.from_json("test", experiment_spec)]
    search_alg = _MockSuggestionAlgorithm(max_concurrent=1)
    search_alg.add_configurations(experiments)
    searcher = search_alg.searcher
    runner = TrialRunner(search_alg=search_alg)
    runner.step()
    trials = runner.get_trials()
    while trials[0].status != Trial.TERMINATED:
        runner.step()

    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(len(searcher.live_trials), 1)

    searcher.stall = True

    while trials[1].status != Trial.TERMINATED:
        runner.step()
    self.assertEqual(trials[1].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)

    self.assertTrue(all(trial.is_finished() for trial in trials))
    self.assertFalse(search_alg.is_finished())
    self.assertFalse(runner.is_finished())

    searcher.stall = False

    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(len(searcher.live_trials), 1)

    while trials[2].status != Trial.TERMINATED:
        runner.step()

    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(search_alg.is_finished())
    self.assertTrue(runner.is_finished())
def testCheckpointingAtEnd(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "checkpoint_at_end": True,
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].last_result[DONE], True)
    self.assertEqual(trials[0].has_checkpoint(), True)
def testMultiStepRun(self):
    ray.init(num_cpus=4, num_gpus=2)
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertTrue(snapshot.all_trials_are_terminated())
def testMultiStepRun2(self):
    """Checks that runner.step throws when overstepping."""
    ray.init(num_cpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertRaises(TuneError, runner.step)
def testChangeResources(self):
    """Checks that resource requirements can be changed on the fly."""
    ray.init(num_cpus=2)

    class ChangingScheduler(FIFOScheduler):
        def __init__(self):
            self._has_received_one_trial_result = False

        # For figuring out how many runner.step there are.
        def has_received_one_trial_result(self):
            return self._has_received_one_trial_result

        def on_trial_result(self, trial_runner, trial, result):
            if result["training_iteration"] == 1:
                self._has_received_one_trial_result = True
                executor = trial_runner.trial_executor
                executor.pause_trial(trial)
                trial.update_resources(dict(cpu=2, gpu=0))
            return TrialScheduler.NOOP

    scheduler = ChangingScheduler()
    runner = TrialRunner(scheduler=scheduler)
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 1)
    self.assertRaises(
        ValueError, lambda: trials[0].update_resources(dict(cpu=2, gpu=0)))

    while not scheduler.has_received_one_trial_result():
        runner.step()
    self.assertEqual(trials[0].status, Trial.PAUSED)

    # Extra step for tune loop to stage the resource requests.
    runner.step()
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 2)
def testUserCheckpointBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "8"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir))

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 10}))
    trials = runner.get_trials()

    runner.step()  # Start trial, schedule 1-8
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    self.assertEqual(num_checkpoints(trials[0]), 0)

    runner.step()  # Process results 1-8, schedule 9-11 (CP)
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 8)
    self.assertFalse(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 0)

    runner.step()  # Process results 9-11
    runner.step()  # Handle checkpoint, schedule 12-19
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 11)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 1)

    runner.step()  # Process results 12-19, schedule 20-21
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 19)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 1)

    runner.step()  # Process results 20-21
    runner.step()  # Handle checkpoint, schedule 22-29
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 21)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 2)

    runner.step()  # Process results 22-29, schedule 30-31
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 29)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 2)
def basicSetup(self):
    ray.init(num_cpus=4, num_gpus=1)
    port = get_valid_port()
    self.runner = TrialRunner(server_port=port)
    runner = self.runner
    kwargs = {
        "stopping_criterion": {"training_iteration": 3},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)
    client = TuneClient("localhost", port)
    return runner, client
def testSearchAlgNotification(self):
    """Checks notification of trial to the Search Algorithm."""
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    search_alg = _MockSuggestionAlgorithm()
    searcher = search_alg.searcher
    search_alg.add_configurations(experiments)
    runner = TrialRunner(search_alg=search_alg)

    while not runner.is_finished():
        runner.step()

    self.assertEqual(searcher.counter["result"], 1)
    self.assertEqual(searcher.counter["complete"], 1)
def test_trial_requeue(start_connected_emptyhead_cluster, tmpdir, durable):
    """Removing a node in full cluster causes Trial to be requeued."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    if durable:
        upload_dir = "file://" + str(tmpdir)
        syncer_callback = SyncerCallback()
    else:
        upload_dir = None
        syncer_callback = custom_driver_logdir_callback(str(tmpdir))

    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])  # noqa
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "checkpoint_freq": 1,
        "max_failures": 1,
        "remote_checkpoint_dir": upload_dir,
    }

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # Start trial
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save

    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])

    cluster.remove_node(node)
    cluster.wait_for_nodes()
    time.sleep(0.1)  # Sleep so that next step() refreshes cluster resources

    runner.step()  # Process result, dispatch save
    runner.step()  # Process save (detect error), requeue trial
    assert all(t.status == Trial.PENDING for t in trials), runner.debug_string()
def testFailureRecoveryMaxFailures(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 2,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 3)
def testStopTrial(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    # Stop trial while running
    runner.stop_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.PENDING)

    # Stop trial while pending
    runner.stop_trial(trials[-1])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

    time.sleep(2)  # Wait for stopped placement group to free resources
    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)