def test_cluster_down_simple(start_connected_cluster, tmpdir):
    """Tests that TrialRunner save/restore works on cluster shutdown."""
    cluster = start_connected_cluster
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    dirpath = str(tmpdir)
    runner = TrialRunner(
        BasicVariantGenerator(), metadata_checkpoint_dir=dirpath)
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "checkpoint_freq": 1,
        "max_failures": 1,
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # start
    runner.step()  # start2
    runner.step()  # step
    assert all(t.status == Trial.RUNNING for t in runner.get_trials())
    runner.checkpoint()

    cluster.shutdown()
    ray.shutdown()

    cluster = _start_new_cluster()
    runner = TrialRunner.restore(dirpath)
    runner.step()  # start
    runner.step()  # start2

    for i in range(3):
        runner.step()

    with pytest.raises(TuneError):
        runner.step()

    assert all(t.status == Trial.TERMINATED for t in runner.get_trials())
    cluster.shutdown()
def testStopTrial(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 5
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    # Stop trial while running
    runner.stop_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.PENDING)

    # Stop trial while pending
    runner.stop_trial(trials[-1])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)
def test_trial_processed_after_node_failure(start_connected_emptyhead_cluster):
    """Tests that Tune processes a trial as failed if its node died."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    mock_process_failure = MagicMock(side_effect=runner._process_trial_failure)
    runner._process_trial_failure = mock_process_failure

    runner.add_trial(Trial("__fake"))
    runner.step()
    runner.step()
    assert not mock_process_failure.called

    cluster.remove_node(node)
    runner.step()
    if not mock_process_failure.called:
        runner.step()
    assert mock_process_failure.called
def testResourceScheduler(self):
    ray.init(num_cpus=4, num_gpus=1)
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]

    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())
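
# TrialStatusSnapshot and TrialStatusSnapshotTaker are not defined in this
# excerpt. A minimal sketch of what the test above assumes (names and exact
# bookkeeping are assumptions, not the real test-utility implementation):
class TrialStatusSnapshot:
    """Records the statuses of all trials after each runner step."""

    def __init__(self):
        self._snapshots = []  # one list of statuses per observed step

    def record(self, statuses):
        self._snapshots.append(list(statuses))

    def max_running_trials(self):
        # Largest number of concurrently RUNNING trials ever observed.
        return max(
            (s.count(Trial.RUNNING) for s in self._snapshots), default=0)

    def all_trials_are_terminated(self):
        return bool(self._snapshots) and all(
            status == Trial.TERMINATED for status in self._snapshots[-1])


class TrialStatusSnapshotTaker(Callback):
    """Tune callback that feeds trial statuses into a snapshot."""

    def __init__(self, snapshot):
        self._snapshot = snapshot

    def on_step_end(self, iteration, trials, **info):
        self._snapshot.record(t.status for t in trials)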
def testCheckpointingAtEnd(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "checkpoint_at_end": True,
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process result
    runner.step()  # Process result, dispatch save
    self.assertEqual(trials[0].last_result[DONE], True)
    runner.step()  # Process save
    self.assertEqual(trials[0].has_checkpoint(), True)
def testErrorHandling(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("CartPole-v0", "asdf", **kwargs),
        Trial("CartPole-v0", "__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.RUNNING)
def testSearchAlgNotification(self):
    """Checks notification of trial to the Search Algorithm."""
    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher = _MockSuggestionAlgorithm(max_concurrent=10)
    searcher.add_configurations(experiments)
    runner = TrialRunner(search_alg=searcher)
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    self.assertEqual(searcher.counter["result"], 1)
    self.assertEqual(searcher.counter["complete"], 1)
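
# _MockSuggestionAlgorithm lives in the shared test utilities. A sketch
# consistent with how it is used here and in testSearchAlgSchedulerInteraction
# below (which also passes experiments to the constructor and reads
# live_trials); the counter bookkeeping is the essential part, the rest is
# assumed:
class _MockSuggestionAlgorithm(SuggestionAlgorithm):
    def __init__(self, experiments=None, max_concurrent=2, **kwargs):
        self._max_concurrent = max_concurrent
        self.live_trials = {}
        self.counter = {"result": 0, "complete": 0}
        super().__init__(**kwargs)
        if experiments:
            self.add_configurations(experiments)

    def on_trial_result(self, trial_id, result):
        self.counter["result"] += 1

    def on_trial_complete(self, trial_id, **kwargs):
        self.counter["complete"] += 1
        self.live_trials.pop(trial_id, None)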
def testExtraResources(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "resources": Resources(cpu=1, gpu=0, extra_cpu=3, extra_gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)
def testCheckpointAutoPeriod(self):
    ray.init(num_cpus=3)

    # This makes checkpointing take 2 seconds.
    def sync_up(source, target, exclude=None):
        time.sleep(2)
        return True

    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period="auto",
        sync_config=SyncConfig(upload_dir="fake", syncer=sync_up),
        remote_checkpoint_dir="fake",
    )
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 1}))

    runner.step()  # Run one step, this will trigger checkpointing

    self.assertGreaterEqual(
        runner._checkpoint_manager._checkpoint_period, 38.0)
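
# Why 38.0: with checkpoint_period="auto", the runner scales the period to
# the observed checkpoint duration so that experiment-state checkpointing
# stays a small fraction of total runtime (on the order of 5%, i.e. roughly
# 19x the ~2 s the mocked sync_up takes). The exact multiplier is an
# assumption inferred from the asserted bound, not read from the source.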
def test_trial_requeue(start_connected_emptyhead_cluster, trainable_id): """Removing a node in full cluster causes Trial to be requeued.""" os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1" cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() syncer_callback = _PerTrialSyncerCallback( lambda trial: trial.trainable_name == "__fake") runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback]) # noqa kwargs = { "stopping_criterion": { "training_iteration": 5 }, "checkpoint_freq": 1, "max_failures": 1, } if trainable_id == "__fake_durable": kwargs["remote_checkpoint_dir"] = MOCK_REMOTE_DIR trials = [Trial(trainable_id, **kwargs), Trial(trainable_id, **kwargs)] for t in trials: runner.add_trial(t) runner.step() # Start trial runner.step() # Process result, dispatch save runner.step() # Process save running_trials = _get_running_trials(runner) assert len(running_trials) == 1 assert _check_trial_running(running_trials[0]) cluster.remove_node(node) cluster.wait_for_nodes() time.sleep(0.1) # Sleep so that next step() refreshes cluster resources runner.step() # Process result, dispatch save runner.step() # Process save (detect error), requeue trial assert all(t.status == Trial.PENDING for t in trials), runner.debug_string()
def testStopTrial(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    # Stop trial while running
    runner.stop_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.PENDING)

    # Stop trial while pending
    runner.stop_trial(trials[-1])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

    time.sleep(2)  # Wait for stopped placement group to free resources
    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)
def testPauseThenResume(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    runner.step()  # Process result
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.get_info.remote()), None)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    runner.trial_executor.pause_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.PAUSED)
def run_experiments(experiments, scheduler=None, **ray_args):
    if scheduler is None:
        scheduler = FIFOScheduler()

    runner = TrialRunner(scheduler)
    for name, spec in experiments.items():
        for trial in generate_trials(spec, name):
            runner.add_trial(trial)
    print(runner.debug_string())

    ray.init(**ray_args)

    while not runner.is_finished():
        runner.step()
        print(runner.debug_string())

    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            raise TuneError("Trial did not complete", trial)

    return runner.get_trials()
def testErrorHandling(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    _global_registry.register(TRAINABLE_CLASS, "asdf", None)
    trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.RUNNING)
def testFailureRecoveryDisabled(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 1)
def basicSetup(self):
    # Wait up to five seconds for placement groups when starting a trial
    os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
    # Block for results even when placement groups are pending
    os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"

    ray.init(num_cpus=4, num_gpus=1)
    port = get_valid_port()
    self.runner = TrialRunner(server_port=port)
    runner = self.runner
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)
    client = TuneClient("localhost", port)
    return runner, client
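
# Example of how tests built on basicSetup typically drive the server
# (a sketch; it assumes TuneClient's REST-backed get_all_trials() and
# stop_trial() helpers, and the _example_* name is hypothetical):
def _example_stop_first_trial(self):
    runner, client = self.basicSetup()
    runner.step()
    all_trials = client.get_all_trials()["trials"]
    client.stop_trial(all_trials[0]["id"])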
def testRestoreMetricsAfterCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)

    observer = TrialResultObserver()
    runner = TrialRunner(callbacks=[observer])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.TERMINATED)

    kwargs["restore_path"] = trials[0].checkpoint.value
    kwargs.pop("stopping_criterion")
    kwargs.pop("checkpoint_freq")  # No checkpointing for next trial
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    observer.reset()
    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)

    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)

    self.addCleanup(os.remove, trials[0].checkpoint.value)
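
# TrialResultObserver comes from the shared test utilities. A minimal
# sketch consistent with the reset()/just_received_a_result() usage above:
class TrialResultObserver(Callback):
    """Counts processed results so tests can step until the next one."""

    def __init__(self):
        self.counter = 0
        self._last_counter = 0

    def reset(self):
        self._last_counter = self.counter

    def just_received_a_result(self):
        # True exactly once per newly observed result.
        if self._last_counter == self.counter:
            return False
        self._last_counter = self.counter
        return True

    def on_trial_result(self, **kwargs):
        self.counter += 1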
def test_remove_node_before_result(start_connected_emptyhead_cluster):
    """Tune continues when node is removed before trial returns."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
    }
    trial = Trial("__fake", **kwargs)
    runner.add_trial(trial)

    runner.step()  # Start trial, call _train once
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert not trial.has_reported_at_least_once
    assert trial.status == Trial.RUNNING

    cluster.remove_node(node)
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1

    # Process result: fetch data, invoke _train again
    runner.step()
    assert trial.last_result.get("training_iteration") == 1

    # Process result: discover failure, recover, _train (from scratch)
    while trial.status != Trial.TERMINATED:
        runner.step()

    assert trial.last_result.get("training_iteration") > 1

    with pytest.raises(TuneError):
        runner.step()
def testCheckpointAtEndNotBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir)
        )

    ray.init(num_cpus=2)

    trial = Trial(
        "__fake",
        checkpoint_at_end=True,
        stopping_criterion={"training_iteration": 4},
    )
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period=0,
        trial_executor=RayTrialExecutor(result_buffer_length=7),
    )
    runner.add_trial(trial)

    runner.step()  # start trial

    runner.step()  # run iteration 1
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 1)
    self.assertEqual(num_checkpoints(trial), 0)

    runner.step()  # run iteration 2
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 2)
    self.assertEqual(num_checkpoints(trial), 0)

    runner.step()  # run iteration 3
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
    self.assertEqual(num_checkpoints(trial), 0)

    runner.step()  # run iteration 4
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 4)
    self.assertEqual(num_checkpoints(trial), 1)
def run_experiments(experiments, scheduler=None, **ray_args):
    if scheduler is None:
        # NOTE: `args` is not defined in this snippet; in the original
        # script it is presumably the module-level argparse namespace.
        scheduler = make_scheduler(args)

    runner = TrialRunner(scheduler)
    for name, spec in experiments.items():
        for trial in generate_trials(spec, name):
            runner.add_trial(trial)
    print(runner.debug_string())

    ray.init(**ray_args)

    while not runner.is_finished():
        runner.step()
        print(runner.debug_string())

    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            print("Exit 1")
            sys.exit(1)

    print("Exit 0")
def testCustomResources(self):
    ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "placement_group_factory": PlacementGroupFactory([{
            "CPU": 1,
            "a": 2
        }]),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)
def testFractionalGpus(self):
    ray.init(num_cpus=4, num_gpus=1)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "resources": Resources(cpu=1, gpu=0.5),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    for _ in range(10):
        runner.step()

    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.PENDING)
    self.assertEqual(trials[3].status, Trial.PENDING)
def test_counting_resources(start_connected_cluster):
    """Tests that Tune accounting is consistent with actual cluster."""
    cluster = start_connected_cluster
    nodes = []
    assert ray.cluster_resources()["CPU"] == 1

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {"stopping_criterion": {"training_iteration": 10}}
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert ray.available_resources().get("CPU", 0) == 0

    nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 2

    cluster.remove_node(nodes.pop())
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1

    runner.step()
    # Only 1 trial can be running due to resource limitation.
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1

    for i in range(5):
        nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 6

    # This is to make sure that pg is ready for the previous pending trial,
    # so that when runner.step() is called next, the trial can be started in
    # the same event loop.
    time.sleep(5)

    runner.step()
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 2
def testQueueFilling(self):
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=4)

    def f1(config):
        for i in range(10):
            yield i

    tune.register_trainable("f1", f1)

    search_alg = BasicVariantGenerator()
    search_alg.add_configurations(
        {
            "foo": {
                "run": "f1",
                "num_samples": 100,
                "config": {
                    "a": tune.sample_from(lambda spec: 5.0 / 7),
                    "b": tune.sample_from(lambda spec: "long" * 40),
                },
                "resources_per_trial": {"cpu": 2},
            }
        }
    )

    runner = TrialRunner(search_alg=search_alg)

    runner.step()
    runner.step()
    runner.step()
    self.assertEqual(len(runner._trials), 3)

    runner.step()
    self.assertEqual(len(runner._trials), 3)

    self.assertEqual(runner._trials[0].status, Trial.RUNNING)
    self.assertEqual(runner._trials[1].status, Trial.RUNNING)
    self.assertEqual(runner._trials[2].status, Trial.PENDING)
def testSearchAlgSchedulerInteraction(self):
    """Checks that TrialScheduler killing trial will notify SearchAlg."""

    class _MockScheduler(FIFOScheduler):
        def on_trial_result(self, *args, **kwargs):
            return TrialScheduler.STOP

    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher = _MockSuggestionAlgorithm(experiments, max_concurrent=10)
    runner = TrialRunner(search_alg=searcher, scheduler=_MockScheduler())
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertTrue(searcher.is_finished())
    self.assertFalse(runner.is_finished())

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(searcher.is_finished())
    self.assertTrue(runner.is_finished())
def test_trial_requeue(start_connected_emptyhead_cluster, trainable_id): """Removing a node in full cluster causes Trial to be requeued.""" cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() syncer_callback = _PerTrialSyncerCallback( lambda trial: trial.trainable_name == "__fake") runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback]) kwargs = { "stopping_criterion": { "training_iteration": 5 }, "checkpoint_freq": 1, "max_failures": 1, "remote_checkpoint_dir": MOCK_REMOTE_DIR, } trials = [Trial(trainable_id, **kwargs), Trial(trainable_id, **kwargs)] for t in trials: runner.add_trial(t) runner.step() # Start trial runner.step() # Process result, dispatch save runner.step() # Process save running_trials = _get_running_trials(runner) assert len(running_trials) == 1 assert _check_trial_running(running_trials[0]) cluster.remove_node(node) cluster.wait_for_nodes() runner.step() # Process result, dispatch save runner.step() # Process save (detect error), requeue trial assert all(t.status == Trial.PENDING for t in trials), runner.debug_string() with pytest.raises(TuneError): runner.step()
def run_experiments(experiments, scheduler=None, with_server=False,
                    server_port=TuneServer.DEFAULT_PORT, verbose=True):
    # Make sure rllib agents are registered
    from ray import rllib  # noqa # pylint: disable=unused-import

    if scheduler is None:
        scheduler = FIFOScheduler()

    runner = TrialRunner(
        scheduler, launch_web_server=with_server, server_port=server_port)

    for name, spec in experiments.items():
        for trial in generate_trials(spec, name):
            trial.set_verbose(verbose)
            runner.add_trial(trial)
    print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            print(runner.debug_string())
            last_debug = time.time()

    print(runner.debug_string(max_debug=99999))

    for trial in runner.get_trials():
        # TODO(rliaw): What about errored?
        if trial.status != Trial.TERMINATED:
            raise TuneError("Trial did not complete", trial)

    wait_for_log_sync()
    return runner.get_trials()
def testSearchAlgFinishes(self):
    """SearchAlg changing state in `next_trials` does not crash."""

    class FinishFastAlg(SuggestionAlgorithm):
        def next_trials(self):
            self._finished = True
            return []

    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 3,
        "stop": {
            "training_iteration": 1
        },
    }
    searcher = FinishFastAlg()
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher.add_configurations(experiments)

    runner = TrialRunner(search_alg=searcher)
    runner.step()  # This should not fail
    self.assertTrue(searcher.is_finished())
    self.assertTrue(runner.is_finished())
def testFailFastRaise(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=TrialRunner.RAISE)
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process result, dispatch save
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process save
    with self.assertRaises(Exception):
        runner.step()  # Error
def testMultiStepRun2(self):
    """Checks that runner.step throws when overstepping."""
    ray.init(num_cpus=1)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertRaises(TuneError, runner.step)