def testFailureRecoveryMaxFailures(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 2,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 3)
def testCustomResources(self):
    ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
    # Since each trial occupies the full custom resource "a",
    # at most one trial can be running at any given moment.
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "a": 2}]),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())
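# `TrialStatusSnapshot` and `TrialStatusSnapshotTaker` are test helpers defined
# elsewhere in the suite. A minimal sketch of what the custom-resource and
# extra-resource tests assume (a callback that records every trial's status
# after each runner step) could look like this; the actual helper may differ.
class TrialStatusSnapshot:
    def __init__(self):
        # One entry per runner.step(): {trial_id: status}.
        self._snapshots = []

    def record(self, trials):
        self._snapshots.append({t.trial_id: t.status for t in trials})

    def max_running_trials(self) -> int:
        # Largest number of concurrently RUNNING trials observed so far.
        return max(
            (
                sum(s == Trial.RUNNING for s in snap.values())
                for snap in self._snapshots
            ),
            default=0,
        )

    def all_trials_are_terminated(self) -> bool:
        if not self._snapshots:
            return False
        return all(s == Trial.TERMINATED for s in self._snapshots[-1].values())


class TrialStatusSnapshotTaker(Callback):
    # Assumes `from ray.tune.callback import Callback` is imported at module level.
    def __init__(self, snapshot: TrialStatusSnapshot):
        self._snapshot = snapshot

    def on_step_end(self, iteration, trials, **info):
        self._snapshot.record(trials)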
def testTrialNoCheckpointSave(self):
    """Check that non-checkpointing trials *are* saved."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(
        Trial(
            "__fake",
            trial_id="non_checkpoint",
            stopping_criterion={"training_iteration": 2},
        )
    )

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="checkpoint",
            checkpoint_at_end=True,
            stopping_criterion={"training_iteration": 2},
        )
    )

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="pending",
            stopping_criterion={"training_iteration": 2},
        )
    )

    old_trials = runner.get_trials()
    while not old_trials[2].has_reported_at_least_once:
        runner.step()

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    new_trials = runner2.get_trials()
    self.assertEqual(len(new_trials), 3)
    self.assertTrue(runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
    self.assertTrue(runner2.get_trial("pending").has_reported_at_least_once)
    runner2.step()
def testErrorHandling(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=1),
    }
    _global_registry.register(TRAINABLE_CLASS, "asdf", None)
    trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.RUNNING)
def testSearchAlgFinished(self):
    """Checks that SearchAlg is finished before all trials are done."""
    ray.init(num_cpus=4, local_mode=True, include_dashboard=False)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 1}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher = _MockSuggestionAlgorithm()
    searcher.add_configurations(experiments)
    runner = TrialRunner(search_alg=searcher)
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertTrue(searcher.is_finished())
    self.assertFalse(runner.is_finished())

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(searcher.is_finished())
    self.assertTrue(runner.is_finished())
def testPauseThenResume(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    runner.step()  # Process result
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.get_info.remote()), None)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    runner.trial_executor.pause_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.PAUSED)
def testExtraResources(self):
    ray.init(num_cpus=4, num_gpus=2)
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory(
            [{"CPU": 1}, {"CPU": 3, "GPU": 1}]
        ),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())
def testCheckpointAutoPeriod(self):
    ray.init(num_cpus=3)

    # This makes checkpointing take 2 seconds.
    class CustomSyncer(Syncer):
        def __init__(self, sync_period: float = 300.0):
            super(CustomSyncer, self).__init__(sync_period=sync_period)
            self._sync_status = {}

        def sync_up(
            self, local_dir: str, remote_dir: str, exclude: list = None
        ) -> bool:
            time.sleep(2)
            return True

        def sync_down(
            self, remote_dir: str, local_dir: str, exclude: list = None
        ) -> bool:
            time.sleep(2)
            return True

        def delete(self, remote_dir: str) -> bool:
            pass

    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period="auto",
        sync_config=SyncConfig(
            upload_dir="fake", syncer=CustomSyncer(), sync_period=0
        ),
        remote_checkpoint_dir="fake",
    )
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 1}))

    runner.step()  # Run one step, this will trigger checkpointing

    self.assertGreaterEqual(runner._checkpoint_manager._checkpoint_period, 38.0)
def testCheckpointWithFunction(self):
    ray.init(num_cpus=2)

    trial = Trial(
        "__fake",
        config={
            "callbacks": {
                "on_episode_start": lambda i: i,
            }
        },
        checkpoint_freq=1,
    )
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(trial)
    for _ in range(5):
        runner.step()
    # Force a checkpoint of the runner state.
    runner.checkpoint()

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    new_trial = runner2.get_trials()[0]
    self.assertTrue("callbacks" in new_trial.config)
    self.assertTrue("on_episode_start" in new_trial.config["callbacks"])
def testSearchAlgFinishes(self):
    """Empty SearchAlg changing state in `next_trial` does not crash."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    class FinishFastAlg(_MockSuggestionAlgorithm):
        _index = 0

        def next_trial(self):
            spec = self._experiment.spec
            trial = None
            if self._index < spec["num_samples"]:
                trial = Trial(spec.get("run"), stopping_criterion=spec.get("stop"))
            self._index += 1

            if self._index > 4:
                self.set_finished()

            return trial

        def suggest(self, trial_id):
            return {}

    ray.init(num_cpus=2, local_mode=True, include_dashboard=False)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 2,
        "stop": {"training_iteration": 1},
    }
    searcher = FinishFastAlg()
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher.add_configurations(experiments)

    runner = TrialRunner(search_alg=searcher)
    self.assertFalse(runner.is_finished())
    runner.step()  # This launches a new run
    runner.step()  # This launches a 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the first run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This sets self._finished to True
    self.assertTrue(searcher.is_finished())
    self.assertRaises(TuneError, runner.step)
def testTrialErrorResumeTrue(self):
    ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [
        Trial("__fake", config={"mock_error": True}, **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    runner.checkpoint(force=True)

    assert trials[0].status == Trial.ERROR
    del runner

    new_runner = TrialRunner(resume="ERRORED_ONLY", local_checkpoint_dir=self.tmpdir)
    assert len(new_runner.get_trials()) == 3
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())
    # The below is just a check for standard behavior.
    disable_error = False
    for t in new_runner.get_trials():
        if t.config.get("mock_error"):
            t.config["mock_error"] = False
            disable_error = True
    assert disable_error

    while not new_runner.is_finished():
        new_runner.step()
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())
def testFractionalGpus(self):
    ray.init(num_cpus=4, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "resources": Resources(cpu=1, gpu=0.5),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    for _ in range(10):
        runner.step()

    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.PENDING)
    self.assertEqual(trials[3].status, Trial.PENDING)
def testSearchAlgSchedulerInteraction(self):
    """Checks that a TrialScheduler stopping a trial notifies the SearchAlg."""

    class _MockScheduler(FIFOScheduler):
        def on_trial_result(self, *args, **kwargs):
            return TrialScheduler.STOP

    ray.init(num_cpus=4, local_mode=True, include_dashboard=False)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher = _MockSuggestionAlgorithm()
    searcher.add_configurations(experiments)
    runner = TrialRunner(search_alg=searcher, scheduler=_MockScheduler())
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertTrue(searcher.is_finished())
    self.assertFalse(runner.is_finished())

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(searcher.is_finished())
    self.assertTrue(runner.is_finished())
def testFailureRecoveryDisabled(self):
    ray.init(num_cpus=1, num_gpus=1)
    searchalg, scheduler = create_mock_components()
    runner = TrialRunner(searchalg, scheduler=scheduler)

    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 1)
    self.assertEqual(len(searchalg.errored_trials), 1)
    self.assertEqual(len(scheduler.errored_trials), 1)
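# `create_mock_components` is defined elsewhere in this test module. A minimal
# sketch of what testFailureRecoveryDisabled assumes (a search algorithm and a
# scheduler that both record errored trials) could look like the following;
# the actual helper may differ.
def create_mock_components():
    class _MockScheduler(FIFOScheduler):
        errored_trials = []

        def on_trial_error(self, trial_runner, trial):
            self.errored_trials += [trial]

    class _MockSearchAlg(BasicVariantGenerator):
        errored_trials = []

        def on_trial_complete(self, trial_id, error=False, **kwargs):
            if error:
                self.errored_trials += [trial_id]

    searchalg = _MockSearchAlg()
    scheduler = _MockScheduler()
    return searchalg, scheduler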
def testFailFast(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=True)
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.ERROR)
    # With `fail_fast=True`, once one trial errors out, the remaining
    # trials are stopped with `TERMINATED` status.
    self.assertEqual(trials[1].status, Trial.TERMINATED)
    self.assertRaises(TuneError, lambda: runner.step())
def testUserCheckpoint(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    self.assertTrue(trials[0].has_checkpoint())

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    runner2.step()  # 5: Start trial and dispatch restore
    trials2 = runner2.get_trials()
    self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)
def testFailFastRaise(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=TrialRunner.RAISE)
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    with self.assertRaises(Exception):
        while not runner.is_finished():
            runner.step()

    # Not critical checks. Only to showcase the difference from the
    # non-raising `fail_fast=True` behavior.
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)
def test_remove_node_before_result(start_connected_emptyhead_cluster):
    """Tune continues when node is removed before trial returns."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {"training_iteration": 3},
        "checkpoint_freq": 2,
        "max_failures": 2,
    }
    trial = Trial("__fake", **kwargs)
    runner.add_trial(trial)

    runner.step()  # Start trial, call _train once
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert not trial.has_reported_at_least_once
    assert trial.status == Trial.RUNNING
    cluster.remove_node(node)
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1

    # Process result: fetch data, invoke _train again
    runner.step()
    assert trial.last_result.get("training_iteration") == 1

    # Process result: discover failure, recover, _train (from scratch)
    while trial.status != Trial.TERMINATED:
        runner.step()

    assert trial.last_result.get("training_iteration") > 1

    with pytest.raises(TuneError):
        runner.step()
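# `_get_running_trials` and `_check_trial_running` are small helpers shared by
# the cluster tests in this file. A plausible sketch, assuming trials expose a
# `status` and a `runner` actor handle with a `get_info` method:
def _get_running_trials(runner):
    return [t for t in runner.get_trials() if t.status == Trial.RUNNING]


def _check_trial_running(trial):
    # Ping the trial's remote trainable to confirm the actor is alive.
    if trial.runner:
        ray.get(trial.runner.get_info.remote())
        return True
    return False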
def testChangeResources(self):
    """Checks that resource requirements can be changed on the fly."""
    ray.init(num_cpus=2)

    class ChangingScheduler(FIFOScheduler):
        def __init__(self):
            self._has_received_one_trial_result = False

        # For figuring out how many runner.step() calls are needed.
        def has_received_one_trial_result(self):
            return self._has_received_one_trial_result

        def on_trial_result(self, trial_runner, trial, result):
            if result["training_iteration"] == 1:
                self._has_received_one_trial_result = True
                executor = trial_runner.trial_executor
                executor.pause_trial(trial)
                trial.update_resources(dict(cpu=2, gpu=0))
            return TrialScheduler.NOOP

    scheduler = ChangingScheduler()
    runner = TrialRunner(scheduler=scheduler)
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 1
    )
    self.assertRaises(
        ValueError, lambda: trials[0].update_resources(dict(cpu=2, gpu=0))
    )

    while not scheduler.has_received_one_trial_result():
        runner.step()
    self.assertEqual(trials[0].status, Trial.PAUSED)
    # An extra step for the tune loop to stage the resource requests.
    runner.step()
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 2
    )
def test_trial_requeue(start_connected_emptyhead_cluster, tmpdir, durable):
    """Removing a node in a full cluster causes the Trial to be requeued."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    if durable:
        upload_dir = "file://" + str(tmpdir)
        syncer_callback = SyncerCallback()
    else:
        upload_dir = None
        syncer_callback = custom_driver_logdir_callback(str(tmpdir))

    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])  # noqa
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "checkpoint_freq": 1,
        "max_failures": 1,
        "remote_checkpoint_dir": upload_dir,
    }

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # Start trial
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save

    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    time.sleep(0.1)  # Sleep so that the next step() refreshes cluster resources
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save (detect error), requeue trial
    assert all(t.status == Trial.PENDING for t in trials), runner.debug_string()
def testStopTrial(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    # Stop trial while running
    runner.stop_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.PENDING)

    # Stop trial while pending
    runner.stop_trial(trials[-1])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

    time.sleep(2)  # Wait for the stopped placement group to free its resources
    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)
def test_trial_processed_after_node_failure(start_connected_emptyhead_cluster):
    """Tests that Tune processes a trial as failed if its node died."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    mock_process_failure = MagicMock(side_effect=runner._process_trial_failure)
    runner._process_trial_failure = mock_process_failure

    runner.add_trial(Trial("__fake"))
    runner.step()
    runner.step()
    assert not mock_process_failure.called

    cluster.remove_node(node)
    runner.step()
    if not mock_process_failure.called:
        runner.step()
    assert mock_process_failure.called
def testRestoreMetricsAfterCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)

    observer = TrialResultObserver()
    runner = TrialRunner(callbacks=[observer])
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.TERMINATED)

    kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
    kwargs.pop("stopping_criterion")
    kwargs.pop("checkpoint_freq")  # No checkpointing for the next trial
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    observer.reset()
    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)

    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)
    self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)
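# `TrialResultObserver` is another helper callback used above. A minimal
# sketch, assuming the intent is simply to step the runner until at least one
# new trial result has been processed; the upstream helper may differ.
class TrialResultObserver(Callback):
    def __init__(self):
        self._counter = 0
        self._last_counter_value = 0

    def reset(self):
        # Remember how many results had been seen when the test last checked.
        self._last_counter_value = self._counter

    def on_trial_result(self, **info):
        self._counter += 1

    def just_received_a_result(self):
        if self._last_counter_value == self._counter:
            return False
        self._last_counter_value = self._counter
        return True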
def test_counting_resources(start_connected_cluster):
    """Tests that Tune accounting is consistent with the actual cluster."""
    cluster = start_connected_cluster
    nodes = []
    assert ray.cluster_resources()["CPU"] == 1
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {"stopping_criterion": {"training_iteration": 10}}

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert ray.available_resources().get("CPU", 0) == 0
    nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 2
    cluster.remove_node(nodes.pop())
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1
    runner.step()
    # Only 1 trial can be running due to resource limitation.
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1

    for i in range(5):
        nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 6

    # Make sure the placement group is ready for the previously pending trial,
    # so that when runner.step() is called next, the trial can be started in
    # the same event loop.
    time.sleep(5)
    runner.step()
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 2
def testQueueFilling(self):
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=4)

    def f1(config):
        for i in range(10):
            yield i

    tune.register_trainable("f1", f1)

    search_alg = BasicVariantGenerator()
    search_alg.add_configurations(
        {
            "foo": {
                "run": "f1",
                "num_samples": 100,
                "config": {
                    "a": tune.sample_from(lambda spec: 5.0 / 7),
                    "b": tune.sample_from(lambda spec: "long" * 40),
                },
                "resources_per_trial": {"cpu": 2},
            }
        }
    )

    runner = TrialRunner(search_alg=search_alg)
    runner.step()
    runner.step()
    runner.step()
    self.assertEqual(len(runner._trials), 3)

    runner.step()
    self.assertEqual(len(runner._trials), 3)

    self.assertEqual(runner._trials[0].status, Trial.RUNNING)
    self.assertEqual(runner._trials[1].status, Trial.RUNNING)
    self.assertEqual(runner._trials[2].status, Trial.PENDING)
def testThrowOnOverstep(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    runner.step()
    self.assertRaises(TuneError, runner.step)
def testUserCheckpointBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "8"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir)
        )

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 10}))
    trials = runner.get_trials()

    runner.step()  # Start trial, schedule 1-8
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    self.assertEqual(num_checkpoints(trials[0]), 0)

    runner.step()  # Process results 0-8, schedule 9-11 (CP)
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 8)
    self.assertFalse(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 0)

    runner.step()  # Process results 9-11
    runner.step()  # Handle CP, schedule 12-19
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 11)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 1)

    runner.step()  # Process results 12-19, schedule 20-21
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 19)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 1)

    runner.step()  # Process results 20-21
    runner.step()  # Handle CP, schedule 21-29
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 21)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 2)

    runner.step()  # Process results 21-29, schedule 30-31
    self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 29)
    self.assertTrue(trials[0].has_checkpoint())
    self.assertEqual(num_checkpoints(trials[0]), 2)
def testTrialSaveRestore(self):
    """Creates different trials to test runner.checkpoint/restore."""
    ray.init(num_cpus=3)

    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    trials = [
        Trial(
            "__fake",
            trial_id="trial_terminate",
            stopping_criterion={"training_iteration": 1},
            checkpoint_freq=1,
        )
    ]
    runner.add_trial(trials[0])
    while not runner.is_finished():
        # Start trial, process result, dispatch save and process save.
        runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    trials += [
        Trial(
            "__fake",
            trial_id="trial_fail",
            stopping_criterion={"training_iteration": 3},
            checkpoint_freq=1,
            config={"mock_error": True},
        )
    ]
    runner.add_trial(trials[1])
    while not runner.is_finished():
        # Start trial, process result, dispatch save, process save and error.
        runner.step()
    self.assertEqual(trials[1].status, Trial.ERROR)

    trials += [
        Trial(
            "__fake",
            trial_id="trial_succ",
            stopping_criterion={"training_iteration": 2},
            checkpoint_freq=1,
        )
    ]
    runner.add_trial(trials[2])
    runner.step()  # Start trial
    self.assertEqual(len(runner.trial_executor.get_checkpoints()), 3)
    self.assertEqual(trials[2].status, Trial.RUNNING)

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    for tid in ["trial_terminate", "trial_fail"]:
        original_trial = runner.get_trial(tid)
        restored_trial = runner2.get_trial(tid)
        self.assertEqual(original_trial.status, restored_trial.status)

    restored_trial = runner2.get_trial("trial_succ")
    self.assertEqual(Trial.PENDING, restored_trial.status)

    while not runner2.is_finished():
        # Start trial, process result, dispatch save, process save,
        # process result, dispatch save, process save.
        runner2.step()
    self.assertEqual(restored_trial.status, Trial.TERMINATED)
def testSearcherSaveRestore(self):
    ray.init(num_cpus=8, local_mode=True)

    def create_searcher():
        class TestSuggestion(Searcher):
            def __init__(self, index):
                self.index = index
                self.returned_result = []
                super().__init__(metric="episode_reward_mean", mode="max")

            def suggest(self, trial_id):
                self.index += 1
                return {"test_variable": self.index}

            def on_trial_complete(self, trial_id, result=None, **kwargs):
                self.returned_result.append(result)

            def save(self, checkpoint_path):
                with open(checkpoint_path, "wb") as f:
                    pickle.dump(self.__dict__, f)

            def restore(self, checkpoint_path):
                with open(checkpoint_path, "rb") as f:
                    self.__dict__.update(pickle.load(f))

        searcher = TestSuggestion(0)
        searcher = ConcurrencyLimiter(searcher, max_concurrent=2)
        searcher = Repeater(searcher, repeat=3, set_index=False)
        search_alg = SearchGenerator(searcher)
        experiment_spec = {
            "run": "__fake",
            "num_samples": 20,
            "stop": {"training_iteration": 2},
        }
        experiments = [Experiment.from_json("test", experiment_spec)]
        search_alg.add_configurations(experiments)
        return search_alg

    searcher = create_searcher()
    runner = TrialRunner(
        search_alg=searcher, local_checkpoint_dir=self.tmpdir, checkpoint_period=-1
    )
    for i in range(6):
        runner.step()

    assert len(runner.get_trials()) == 6, [t.config for t in runner.get_trials()]

    runner.checkpoint()
    trials = runner.get_trials()
    for t in trials:
        if t.status != Trial.ERROR:
            runner.trial_executor.stop_trial(t)
    del runner

    searcher = create_searcher()
    runner2 = TrialRunner(
        search_alg=searcher, local_checkpoint_dir=self.tmpdir, resume="LOCAL"
    )
    assert len(runner2.get_trials()) == 6, [t.config for t in runner2.get_trials()]

    def trial_statuses():
        return [t.status for t in runner2.get_trials()]

    def num_running_trials():
        return sum(t.status == Trial.RUNNING for t in runner2.get_trials())

    for i in range(6):
        runner2.step()
    assert len(set(trial_statuses())) == 1
    assert Trial.RUNNING in trial_statuses()

    for i in range(20):
        runner2.step()
        assert 1 <= num_running_trials() <= 6

    evaluated = [t.evaluated_params["test_variable"] for t in runner2.get_trials()]
    count = Counter(evaluated)
    assert all(v <= 3 for v in count.values())
def testSearchAlgStalled(self):
    """Checks that runner and searcher state is maintained when stalled."""
    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 3,
        "stop": {"training_iteration": 1},
    }
    experiments = [Experiment.from_json("test", experiment_spec)]
    search_alg = _MockSuggestionAlgorithm(max_concurrent=1)
    search_alg.add_configurations(experiments)
    searcher = search_alg.searcher
    runner = TrialRunner(search_alg=search_alg)
    runner.step()
    trials = runner.get_trials()
    while trials[0].status != Trial.TERMINATED:
        runner.step()

    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(len(searcher.live_trials), 1)

    searcher.stall = True

    while trials[1].status != Trial.TERMINATED:
        runner.step()
    self.assertEqual(trials[1].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)

    self.assertTrue(all(trial.is_finished() for trial in trials))
    self.assertFalse(search_alg.is_finished())
    self.assertFalse(runner.is_finished())

    searcher.stall = False

    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(len(searcher.live_trials), 1)

    while trials[2].status != Trial.TERMINATED:
        runner.step()

    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(search_alg.is_finished())
    self.assertTrue(runner.is_finished())