def test_cluster_rllib_restore(start_connected_cluster, tmpdir): cluster = start_connected_cluster dirpath = str(tmpdir) script = """ import time import ray from ray import tune ray.init(redis_address="{redis_address}") kwargs = dict( run="PG", env="CartPole-v1", stop=dict(training_iteration=10), local_dir="{checkpoint_dir}", checkpoint_freq=1, max_failures=1) tune.run_experiments( dict(experiment=kwargs), raise_on_failed_trial=False) """.format( redis_address=cluster.redis_address, checkpoint_dir=dirpath) run_string_as_driver_nonblocking(script) # Wait until the right checkpoint is saved. # The trainable returns every 0.5 seconds, so this should not miss # the checkpoint. metadata_checkpoint_dir = os.path.join(dirpath, "experiment") for i in range(100): if TrialRunner.checkpoint_exists(metadata_checkpoint_dir): # Inspect the internal trialrunner runner = TrialRunner.restore(metadata_checkpoint_dir) trials = runner.get_trials() last_res = trials[0].last_result if last_res and last_res.get("training_iteration"): break time.sleep(0.3) if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir): raise RuntimeError("Checkpoint file didn't appear.") ray.shutdown() cluster.shutdown() cluster = _start_new_cluster() cluster.wait_for_nodes() # Restore properly from checkpoint trials2 = tune.run_experiments( { "experiment": { "run": "PG", "checkpoint_freq": 1, "local_dir": dirpath } }, resume=True) assert all(t.status == Trial.TERMINATED for t in trials2) cluster.shutdown()
def test_remove_node_before_result(start_connected_emptyhead_cluster): """Tune continues when node is removed before trial returns.""" cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 3 }, "checkpoint_freq": 2, "max_failures": 2 } trial = Trial("__fake", **kwargs) runner.add_trial(trial) runner.step() # run 1 assert trial.status == Trial.RUNNING cluster.remove_node(node) cluster.add_node(num_cpus=1) cluster.wait_for_nodes() assert ray.global_state.cluster_resources()["CPU"] == 1 for i in range(3): runner.step() assert trial.status == Trial.TERMINATED with pytest.raises(TuneError): runner.step()
def test_trial_requeue(start_connected_emptyhead_cluster): """Removing a node in full cluster causes Trial to be requeued.""" cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 5 }, "checkpoint_freq": 1, "max_failures": 1 } trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)] for t in trials: runner.add_trial(t) runner.step() # start runner.step() # 1 result cluster.remove_node(node) cluster.wait_for_nodes() runner.step() assert all(t.status == Trial.PENDING for t in trials) with pytest.raises(TuneError): runner.step()
def try_restore_runner(checkpoint_dir, search_alg, scheduler, trial_executor): new_runner = None try: new_runner = TrialRunner.restore(checkpoint_dir, search_alg, scheduler, trial_executor) except Exception: logger.exception("Runner restore failed. Restarting experiment.") return new_runner
def test_counting_resources(start_connected_cluster): """Tests that Tune accounting is consistent with actual cluster.""" cluster = start_connected_cluster nodes = [] assert ray.global_state.cluster_resources()["CPU"] == 1 runner = TrialRunner(BasicVariantGenerator()) kwargs = {"stopping_criterion": {"training_iteration": 10}} trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)] for t in trials: runner.add_trial(t) runner.step() # run 1 nodes += [cluster.add_node(num_cpus=1)] cluster.wait_for_nodes() assert ray.global_state.cluster_resources()["CPU"] == 2 cluster.remove_node(nodes.pop()) cluster.wait_for_nodes() assert ray.global_state.cluster_resources()["CPU"] == 1 runner.step() # run 2 assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1 for i in range(5): nodes += [cluster.add_node(num_cpus=1)] cluster.wait_for_nodes() assert ray.global_state.cluster_resources()["CPU"] == 6 runner.step() # 1 result assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 2
def testCheckpointOverwrite(self): def count_checkpoints(cdir): return sum( (fname.startswith("experiment_state") and fname.endswith(".json")) for fname in os.listdir(cdir) ) ray.init(num_cpus=2) trial = Trial("__fake", checkpoint_freq=1) tmpdir = tempfile.mkdtemp() runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0) runner.add_trial(trial) for _ in range(5): runner.step() # force checkpoint runner.checkpoint() self.assertEqual(count_checkpoints(tmpdir), 1) runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir) for _ in range(5): runner2.step() self.assertEqual(count_checkpoints(tmpdir), 2) runner2.checkpoint() self.assertEqual(count_checkpoints(tmpdir), 2) shutil.rmtree(tmpdir)
def testThrowOnOverstep(self): ray.init(num_cpus=1, num_gpus=1) runner = TrialRunner() runner.step() self.assertRaises(TuneError, runner.step)
def testRestoreMetricsAfterCheckpointing(self): ray.init(num_cpus=1, num_gpus=1) observer = TrialResultObserver() runner = TrialRunner(callbacks=[observer]) kwargs = { "stopping_criterion": { "training_iteration": 2 }, "resources": Resources(cpu=1, gpu=1), "checkpoint_freq": 1, } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() while not runner.is_finished(): runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) kwargs["restore_path"] = trials[0].checkpoint.value kwargs.pop("stopping_criterion") kwargs.pop("checkpoint_freq") # No checkpointing for next trial runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() observer.reset() while not observer.just_received_a_result(): runner.step() self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10) self.assertEqual(trials[1].last_result["iterations_since_restore"], 1) self.assertGreater(trials[1].last_result["time_since_restore"], 0) while not observer.just_received_a_result(): runner.step() self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20) self.assertEqual(trials[1].last_result["iterations_since_restore"], 2) self.assertGreater(trials[1].last_result["time_since_restore"], 0) self.addCleanup(os.remove, trials[0].checkpoint.value)
def testFailureRecoveryDisabled(self): ray.init(num_cpus=1, num_gpus=1) runner = TrialRunner(BasicVariantGenerator()) kwargs = { "resources": Resources(cpu=1, gpu=1), "checkpoint_freq": 1, "max_failures": 0, "config": { "mock_error": True, }, } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) runner.step() self.assertEqual(trials[0].status, Trial.ERROR) self.assertEqual(trials[0].num_failures, 1)
def testSearchAlgStalled(self): """Checks that runner and searcher state is maintained when stalled.""" ray.init(num_cpus=4, num_gpus=2) experiment_spec = { "run": "__fake", "num_samples": 3, "stop": { "training_iteration": 1 } } experiments = [Experiment.from_json("test", experiment_spec)] searcher = _MockSuggestionAlgorithm(max_concurrent=1) searcher.add_configurations(experiments) runner = TrialRunner(search_alg=searcher) runner.step() trials = runner.get_trials() self.assertEqual(trials[0].status, Trial.RUNNING) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) trials = runner.get_trials() runner.step() self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(len(searcher.live_trials), 1) searcher.stall = True runner.step() self.assertEqual(trials[1].status, Trial.TERMINATED) self.assertEqual(len(searcher.live_trials), 0) self.assertTrue(all(trial.is_finished() for trial in trials)) self.assertFalse(searcher.is_finished()) self.assertFalse(runner.is_finished()) searcher.stall = False runner.step() trials = runner.get_trials() self.assertEqual(trials[2].status, Trial.RUNNING) self.assertEqual(len(searcher.live_trials), 1) runner.step() self.assertEqual(trials[2].status, Trial.TERMINATED) self.assertEqual(len(searcher.live_trials), 0) self.assertTrue(searcher.is_finished()) self.assertTrue(runner.is_finished())
def testPauseThenResume(self): ray.init(num_cpus=1, num_gpus=1) runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 2 }, "resources": Resources(cpu=1, gpu=1), } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(ray.get(trials[0].runner.get_info.remote()), None) self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1) runner.trial_executor.pause_trial(trials[0]) self.assertEqual(trials[0].status, Trial.PAUSED) runner.trial_executor.resume_trial(trials[0]) self.assertEqual(trials[0].status, Trial.RUNNING) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(ray.get(trials[0].runner.get_info.remote()), 1) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED)
def setUp(self): self.tmpdir = tempfile.mkdtemp() self.callback = TestCallback() self.executor = _MockTrialExecutor() self.trial_runner = TrialRunner( trial_executor=self.executor, callbacks=[self.callback])
class TrialRunnerCallbacks(unittest.TestCase): def setUp(self): self.tmpdir = tempfile.mkdtemp() self.callback = TestCallback() self.executor = _MockTrialExecutor() self.trial_runner = TrialRunner( trial_executor=self.executor, callbacks=[self.callback]) def tearDown(self): ray.shutdown() _register_all() # re-register the evicted objects if "CUDA_VISIBLE_DEVICES" in os.environ: del os.environ["CUDA_VISIBLE_DEVICES"] shutil.rmtree(self.tmpdir) def testCallbackSteps(self): trials = [ Trial("__fake", trial_id="one"), Trial("__fake", trial_id="two") ] for t in trials: self.trial_runner.add_trial(t) self.executor.next_trial = trials[0] self.trial_runner.step() # Trial 1 has been started self.assertEqual(self.callback.state["trial_start"]["iteration"], 0) self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id, "one") # All these events haven't happened, yet self.assertTrue( all(k not in self.callback.state for k in [ "trial_restore", "trial_save", "trial_result", "trial_complete", "trial_fail" ])) self.executor.next_trial = trials[1] self.trial_runner.step() # Iteration not increased yet self.assertEqual(self.callback.state["step_begin"]["iteration"], 1) # Iteration increased self.assertEqual(self.callback.state["step_end"]["iteration"], 2) # Second trial has been just started self.assertEqual(self.callback.state["trial_start"]["iteration"], 1) self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id, "two") cp = Checkpoint(Checkpoint.PERSISTENT, "__checkpoint", {TRAINING_ITERATION: 0}) # Let the first trial save a checkpoint self.executor.next_trial = trials[0] trials[0].saving_to = cp self.trial_runner.step() self.assertEqual(self.callback.state["trial_save"]["iteration"], 2) self.assertEqual(self.callback.state["trial_save"]["trial"].trial_id, "one") # Let the second trial send a result result = {TRAINING_ITERATION: 1, "metric": 800, "done": False} self.executor.results[trials[1]] = result self.executor.next_trial = trials[1] self.assertEqual(trials[1].last_result, {}) self.trial_runner.step() self.assertEqual(self.callback.state["trial_result"]["iteration"], 3) self.assertEqual(self.callback.state["trial_result"]["trial"].trial_id, "two") self.assertEqual( self.callback.state["trial_result"]["result"]["metric"], 800) self.assertEqual(trials[1].last_result["metric"], 800) # Let the second trial restore from a checkpoint trials[1].restoring_from = cp self.executor.results[trials[1]] = trials[1].last_result self.trial_runner.step() self.assertEqual(self.callback.state["trial_restore"]["iteration"], 4) self.assertEqual( self.callback.state["trial_restore"]["trial"].trial_id, "two") # Let the second trial finish trials[1].restoring_from = None self.executor.results[trials[1]] = { TRAINING_ITERATION: 2, "metric": 900, "done": True } self.trial_runner.step() self.assertEqual(self.callback.state["trial_complete"]["iteration"], 5) self.assertEqual( self.callback.state["trial_complete"]["trial"].trial_id, "two") # Let the first trial error self.executor.failed_trial = trials[0] self.trial_runner.step() self.assertEqual(self.callback.state["trial_fail"]["iteration"], 6) self.assertEqual(self.callback.state["trial_fail"]["trial"].trial_id, "one") def testCallbacksEndToEnd(self): def train(config): if config["do"] == "save": with tune.checkpoint_dir(0): pass tune.report(metric=1) elif config["do"] == "fail": raise RuntimeError("I am failing on purpose.") elif config["do"] == "delay": time.sleep(2) tune.report(metric=20) config = {"do": tune.grid_search(["save", "fail", "delay"])} tune.run( train, config=config, raise_on_failed_trial=False, callbacks=[self.callback]) self.assertEqual( self.callback.state["trial_fail"]["trial"].config["do"], "fail") self.assertEqual( self.callback.state["trial_save"]["trial"].config["do"], "save") self.assertEqual( self.callback.state["trial_result"]["trial"].config["do"], "delay") self.assertEqual( self.callback.state["trial_complete"]["trial"].config["do"], "delay")
def testStopTrial(self): ray.init(num_cpus=4, num_gpus=2) runner = TrialRunner() kwargs = { "stopping_criterion": {"training_iteration": 5}, "resources": Resources(cpu=1, gpu=1), } trials = [ Trial("__fake", **kwargs), Trial("__fake", **kwargs), Trial("__fake", **kwargs), Trial("__fake", **kwargs), ] for t in trials: runner.add_trial(t) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(trials[1].status, Trial.PENDING) # Stop trial while running runner.stop_trial(trials[0]) self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.PENDING) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(trials[-1].status, Trial.PENDING) # Stop trial while pending runner.stop_trial(trials[-1]) self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(trials[-1].status, Trial.TERMINATED) time.sleep(2) # Wait for stopped placement group to free resources runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(trials[2].status, Trial.RUNNING) self.assertEqual(trials[-1].status, Trial.TERMINATED)
def testUserCheckpointBuffered(self): os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "8" os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1" def num_checkpoints(trial): return sum( item.startswith("checkpoint_") for item in os.listdir(trial.logdir) ) ray.init(num_cpus=3) runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0) runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 10})) trials = runner.get_trials() runner.step() # Start trial, schedule 1-8 self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1) self.assertEqual(num_checkpoints(trials[0]), 0) runner.step() # Process results 0-8, schedule 9-11 (CP) self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 8) self.assertFalse(trials[0].has_checkpoint()) self.assertEqual(num_checkpoints(trials[0]), 0) runner.step() # Process results 9-11 runner.step() # handle CP, schedule 12-19 self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 11) self.assertTrue(trials[0].has_checkpoint()) self.assertEqual(num_checkpoints(trials[0]), 1) runner.step() # Process results 12-19, schedule 20-21 self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 19) self.assertTrue(trials[0].has_checkpoint()) self.assertEqual(num_checkpoints(trials[0]), 1) runner.step() # Process results 20-21 runner.step() # handle CP, schedule 21-29 self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 21) self.assertTrue(trials[0].has_checkpoint()) self.assertEqual(num_checkpoints(trials[0]), 2) runner.step() # Process results 21-29, schedule 30-31 self.assertEqual(trials[0].last_result.get(TRAINING_ITERATION), 29) self.assertTrue(trials[0].has_checkpoint()) self.assertTrue(trials[0].has_checkpoint()) self.assertEqual(num_checkpoints(trials[0]), 2)
def testUserCheckpoint(self): os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1" # Don't finish early os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1" ray.init(num_cpus=3) runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0) runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2})) trials = runner.get_trials() runner.step() # Start trial self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1) runner.step() # Process result self.assertFalse(trials[0].has_checkpoint()) runner.step() # Process result self.assertFalse(trials[0].has_checkpoint()) runner.step() # Process result, dispatch save runner.step() # Process save self.assertTrue(trials[0].has_checkpoint()) runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir) runner2.step() # 5: Start trial and dispatch restore trials2 = runner2.get_trials() self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)
def testCheckpointAtEndNotBuffered(self): os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7" os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5" def num_checkpoints(trial): return sum( item.startswith("checkpoint_") for item in os.listdir(trial.logdir) ) ray.init(num_cpus=2) trial = Trial( "__fake", checkpoint_at_end=True, stopping_criterion={"training_iteration": 4}, ) runner = TrialRunner( local_checkpoint_dir=self.tmpdir, checkpoint_period=0, trial_executor=RayTrialExecutor(result_buffer_length=7), ) runner.add_trial(trial) runner.step() # start trial runner.step() # run iteration 1 self.assertEqual(trial.last_result[TRAINING_ITERATION], 1) self.assertEqual(num_checkpoints(trial), 0) runner.step() # run iteration 2 self.assertEqual(trial.last_result[TRAINING_ITERATION], 2) self.assertEqual(num_checkpoints(trial), 0) runner.step() # run iteration 3 self.assertEqual(trial.last_result[TRAINING_ITERATION], 3) self.assertEqual(num_checkpoints(trial), 0) runner.step() # run iteration 4 self.assertEqual(trial.last_result[TRAINING_ITERATION], 4) self.assertEqual(num_checkpoints(trial), 1)
def testCheckpointFreqBuffered(self): os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7" os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1" def num_checkpoints(trial): return sum( item.startswith("checkpoint_") for item in os.listdir(trial.logdir) ) ray.init(num_cpus=2) trial = Trial("__fake", checkpoint_freq=3) runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0) runner.add_trial(trial) runner.step() # start trial runner.step() # run iteration 1-3 runner.step() # process save self.assertEqual(trial.last_result[TRAINING_ITERATION], 3) self.assertEqual(num_checkpoints(trial), 1) runner.step() # run iteration 4-6 runner.step() # process save self.assertEqual(trial.last_result[TRAINING_ITERATION], 6) self.assertEqual(num_checkpoints(trial), 2) runner.step() # run iteration 7-9 runner.step() # process save self.assertEqual(trial.last_result[TRAINING_ITERATION], 9) self.assertEqual(num_checkpoints(trial), 3)
def test_cluster_interrupt_searcher(start_connected_cluster, tmpdir, searcher): """Tests restoration of HyperOptSearch experiment on cluster shutdown with actual interrupt. Restoration should restore both state of trials and previous search algorithm (HyperOptSearch) state. This is an end-to-end test. """ cluster = start_connected_cluster dirpath = str(tmpdir) local_checkpoint_dir = os.path.join(dirpath, "experiment") from ray.tune import register_trainable register_trainable("trainable", MyTrainableClass) def execute_script_with_args(*args): current_dir = os.path.dirname(__file__) script = os.path.join(current_dir, "_test_cluster_interrupt_searcher.py") subprocess.Popen([sys.executable, script] + list(args)) args = ( "--ray-address", cluster.address, "--local-dir", dirpath, "--searcher", searcher, ) execute_script_with_args(*args) # Wait until the right checkpoint is saved. # The trainable returns every 0.5 seconds, so this should not miss # the checkpoint. trials = [] for i in range(100): if TrialRunner.checkpoint_exists(local_checkpoint_dir): # Inspect the internal trialrunner runner = TrialRunner(resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir) trials = runner.get_trials() if trials and len(trials) >= 10: break time.sleep(0.5) else: raise ValueError(f"Didn't generate enough trials: {len(trials)}") if not TrialRunner.checkpoint_exists(local_checkpoint_dir): raise RuntimeError( f"Checkpoint file didn't appear in {local_checkpoint_dir}. " f"Current list: {os.listdir(local_checkpoint_dir)}.") ray.shutdown() cluster.shutdown() cluster = _start_new_cluster() execute_script_with_args(*(args + ("--resume", ))) time.sleep(2) register_trainable("trainable", MyTrainableClass) reached = False for i in range(100): if TrialRunner.checkpoint_exists(local_checkpoint_dir): # Inspect the internal trialrunner runner = TrialRunner(resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir) trials = runner.get_trials() if len(trials) == 0: continue # nonblocking script hasn't resumed yet, wait reached = True assert len(trials) >= 10 assert len(trials) <= 20 if len(trials) == 20: break else: stop_fn = runner.trial_executor.stop_trial [stop_fn(t) for t in trials if t.status is not Trial.ERROR] time.sleep(0.5) assert reached is True ray.shutdown() cluster.shutdown()
def testTrialSaveRestore(self): """Creates different trials to test runner.checkpoint/restore.""" ray.init(num_cpus=3) runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0) trials = [ Trial( "__fake", trial_id="trial_terminate", stopping_criterion={"training_iteration": 1}, checkpoint_freq=1, ) ] runner.add_trial(trials[0]) runner.step() # Start trial runner.step() # Process result, dispatch save runner.step() # Process save self.assertEqual(trials[0].status, Trial.TERMINATED) trials += [ Trial( "__fake", trial_id="trial_fail", stopping_criterion={"training_iteration": 3}, checkpoint_freq=1, config={"mock_error": True}, ) ] runner.add_trial(trials[1]) runner.step() # Start trial runner.step() # Process result, dispatch save runner.step() # Process save runner.step() # Error self.assertEqual(trials[1].status, Trial.ERROR) trials += [ Trial( "__fake", trial_id="trial_succ", stopping_criterion={"training_iteration": 2}, checkpoint_freq=1, ) ] runner.add_trial(trials[2]) runner.step() # Start trial self.assertEqual(len(runner.trial_executor.get_checkpoints()), 3) self.assertEqual(trials[2].status, Trial.RUNNING) runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir) for tid in ["trial_terminate", "trial_fail"]: original_trial = runner.get_trial(tid) restored_trial = runner2.get_trial(tid) self.assertEqual(original_trial.status, restored_trial.status) restored_trial = runner2.get_trial("trial_succ") self.assertEqual(Trial.PENDING, restored_trial.status) runner2.step() # Start trial runner2.step() # Process result, dispatch save runner2.step() # Process save runner2.step() # Process result, dispatch save runner2.step() # Process save self.assertRaises(TuneError, runner2.step)
def testBurnInPeriod(self): runner = TrialRunner(trial_executor=MagicMock()) scheduler = PopulationBasedTraining( time_attr="training_iteration", metric="error", mode="min", perturbation_interval=5, burn_in_period=50, log_config=True, synch=True, ) class MockTrial(Trial): @property def checkpoint(self): return _TuneCheckpoint(_TuneCheckpoint.MEMORY, "None", {}) @property def status(self): return Trial.PAUSED @status.setter def status(self, status): pass trial1 = MockTrial("PPO", config=dict(num=1)) trial2 = MockTrial("PPO", config=dict(num=2)) trial3 = MockTrial("PPO", config=dict(num=3)) trial4 = MockTrial("PPO", config=dict(num=4)) runner.add_trial(trial1) runner.add_trial(trial2) runner.add_trial(trial3) runner.add_trial(trial4) scheduler.on_trial_add(runner, trial1) scheduler.on_trial_add(runner, trial2) scheduler.on_trial_add(runner, trial3) scheduler.on_trial_add(runner, trial4) # Add initial results. scheduler.on_trial_result(runner, trial1, result=dict(training_iteration=1, error=50)) scheduler.on_trial_result(runner, trial2, result=dict(training_iteration=1, error=50)) scheduler.on_trial_result(runner, trial3, result=dict(training_iteration=1, error=10)) scheduler.on_trial_result(runner, trial4, result=dict(training_iteration=1, error=100)) # Add more results. Without burn-in, this would now exploit scheduler.on_trial_result(runner, trial1, result=dict(training_iteration=30, error=50)) scheduler.on_trial_result(runner, trial2, result=dict(training_iteration=30, error=50)) scheduler.on_trial_result(runner, trial3, result=dict(training_iteration=30, error=10)) scheduler.on_trial_result(runner, trial4, result=dict(training_iteration=30, error=100)) self.assertEqual(trial4.config["num"], 4) # Add more results. Since this is after burn-in, it should now exploit scheduler.on_trial_result(runner, trial1, result=dict(training_iteration=50, error=50)) scheduler.on_trial_result(runner, trial2, result=dict(training_iteration=50, error=50)) scheduler.on_trial_result(runner, trial3, result=dict(training_iteration=50, error=10)) scheduler.on_trial_result(runner, trial4, result=dict(training_iteration=50, error=100)) self.assertEqual(trial4.config["num"], 3) # Assert that trials do not hang after `burn_in_period` self.assertTrue(all(t.status == "PAUSED" for t in runner.get_trials())) self.assertTrue(scheduler.choose_trial_to_run(runner)) # Assert that trials do not hang when a terminated trial is added trial5 = Trial("PPO", config=dict(num=5)) runner.add_trial(trial5) scheduler.on_trial_add(runner, trial5) trial5.set_status(Trial.TERMINATED) self.assertTrue(scheduler.choose_trial_to_run(runner))
def testStopTrial(self): ray.init(num_cpus=4, num_gpus=2) runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 5 }, "resources": Resources(cpu=1, gpu=1), } trials = [ Trial("__fake", **kwargs), Trial("__fake", **kwargs), Trial("__fake", **kwargs), Trial("__fake", **kwargs) ] for t in trials: runner.add_trial(t) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(trials[1].status, Trial.PENDING) # Stop trial while running runner.stop_trial(trials[0]) self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.PENDING) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(trials[-1].status, Trial.PENDING) # Stop trial while pending runner.stop_trial(trials[-1]) self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(trials[-1].status, Trial.TERMINATED) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(trials[2].status, Trial.RUNNING) self.assertEqual(trials[-1].status, Trial.TERMINATED)
def run( run_or_experiment: Union[str, Callable, Type], name: Optional[str] = None, metric: Optional[str] = None, mode: Optional[str] = None, stop: Union[None, Mapping, Stopper, Callable[[str, Mapping], bool]] = None, time_budget_s: Union[None, int, float, datetime.timedelta] = None, config: Optional[Dict[str, Any]] = None, resources_per_trial: Union[None, Mapping[str, Union[float, int, Mapping]], PlacementGroupFactory] = None, num_samples: int = 1, local_dir: Optional[str] = None, search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None, scheduler: Optional[Union[TrialScheduler, str]] = None, keep_checkpoints_num: Optional[int] = None, checkpoint_score_attr: Optional[str] = None, checkpoint_freq: int = 0, checkpoint_at_end: bool = False, verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, progress_reporter: Optional[ProgressReporter] = None, log_to_file: bool = False, trial_name_creator: Optional[Callable[[Trial], str]] = None, trial_dirname_creator: Optional[Callable[[Trial], str]] = None, sync_config: Optional[SyncConfig] = None, export_formats: Optional[Sequence] = None, max_failures: int = 0, fail_fast: bool = False, restore: Optional[str] = None, server_port: Optional[int] = None, resume: bool = False, reuse_actors: bool = False, trial_executor: Optional[RayTrialExecutor] = None, raise_on_failed_trial: bool = True, callbacks: Optional[Sequence[Callback]] = None, max_concurrent_trials: Optional[int] = None, # Deprecated args queue_trials: Optional[bool] = None, loggers: Optional[Sequence[Type[Logger]]] = None, _remote: Optional[bool] = None, ) -> ExperimentAnalysis: """Executes training. When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run will gracefully shut down and checkpoint the latest experiment state. Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step. Many aspects of Tune, such as the frequency of global checkpointing, maximum pending placement group trials and the path of the result directory be configured through environment variables. Refer to :ref:`tune-env-vars` for a list of environment variables available. Examples: .. code-block:: python # Run 10 trials (each trial is one instance of a Trainable). Tune runs # in parallel and automatically determines concurrency. tune.run(trainable, num_samples=10) # Run 1 trial, stop when trial has reached 10 iterations tune.run(my_trainable, stop={"training_iteration": 10}) # automatically retry failed trials up to 3 times tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3) # Run 1 trial, search over hyperparameters, stop after 10 iterations. space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)} tune.run(my_trainable, config=space, stop={"training_iteration": 10}) # Resumes training if a previous machine crashed tune.run(my_trainable, config=space, local_dir=<path/to/dir>, resume=True) # Rerun ONLY failed trials after an experiment is finished. tune.run(my_trainable, config=space, local_dir=<path/to/dir>, resume="ERRORED_ONLY") Args: run_or_experiment (function | class | str | :class:`Experiment`): If function|class|str, this is the algorithm or model to train. This may refer to the name of a built-on algorithm (e.g. RLLib's DQN or PPO), a user-defined trainable function or class, or the string identifier of a trainable function or class registered in the tune registry. If Experiment, then Tune will execute training based on Experiment.spec. If you want to pass in a Python lambda, you will need to first register the function: ``tune.register_trainable("lambda_id", lambda x: ...)``. You can then use ``tune.run("lambda_id")``. metric (str): Metric to optimize. This metric should be reported with `tune.report()`. If set, will be passed to the search algorithm and scheduler. mode (str): Must be one of [min, max]. Determines whether objective is minimizing or maximizing the metric attribute. If set, will be passed to the search algorithm and scheduler. name (str): Name of experiment. stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict, the keys may be any field in the return result of 'train()', whichever is reached first. If function, it must take (trial_id, result) as arguments and return a boolean (True if trial should be stopped, False otherwise). This can also be a subclass of ``ray.tune.Stopper``, which allows users to implement custom experiment-wide stopping (i.e., stopping an entire Tune run based on some time constraint). time_budget_s (int|float|datetime.timedelta): Global time budget in seconds after which all trials are stopped. Can also be a ``datetime.timedelta`` object. config (dict): Algorithm-specific configuration for Tune variant generation (e.g. env, hyperparams). Defaults to empty dict. Custom search algorithms may ignore this. resources_per_trial (dict|PlacementGroupFactory): Machine resources to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be assigned unless you specify them here. Defaults to 1 CPU and 0 GPUs in ``Trainable.default_resource_request()``. This can also be a PlacementGroupFactory object wrapping arguments to create a per-trial placement group. num_samples (int): Number of times to sample from the hyperparameter space. Defaults to 1. If `grid_search` is provided as an argument, the grid will be repeated `num_samples` of times. If this is -1, (virtually) infinite samples are generated until a stopping condition is met. local_dir (str): Local dir to save training results to. Defaults to ``~/ray_results``. search_alg (Searcher|SearchAlgorithm|str): Search algorithm for optimization. You can also use the name of the algorithm. scheduler (TrialScheduler|str): Scheduler for executing the experiment. Choose among FIFO (default), MedianStopping, AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to ray.tune.schedulers for more options. You can also use the name of the scheduler. keep_checkpoints_num (int): Number of checkpoints to keep. A value of `None` keeps all checkpoints. Defaults to `None`. If set, need to provide `checkpoint_score_attr`. checkpoint_score_attr (str): Specifies by which attribute to rank the best checkpoint. Default is increasing order. If attribute starts with `min-` it will rank attribute in decreasing order, i.e. `min-validation_loss`. checkpoint_freq (int): How many training iterations between checkpoints. A value of 0 (default) disables checkpointing. This has no effect when using the Functional Training API. checkpoint_at_end (bool): Whether to checkpoint at the end of the experiment regardless of the checkpoint_freq. Default is False. This has no effect when using the Functional Training API. verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief trial results, 3 = status and detailed trial results. Defaults to 3. progress_reporter (ProgressReporter): Progress reporter for reporting intermediate experiment progress. Defaults to CLIReporter if running in command-line, or JupyterNotebookReporter if running in a Jupyter notebook. log_to_file (bool|str|Sequence): Log stdout and stderr to files in Tune's trial directories. If this is `False` (default), no files are written. If `true`, outputs are written to `trialdir/stdout` and `trialdir/stderr`, respectively. If this is a single string, this is interpreted as a file relative to the trialdir, to which both streams are written. If this is a Sequence (e.g. a Tuple), it has to have length 2 and the elements indicate the files to which stdout and stderr are written, respectively. trial_name_creator (Callable[[Trial], str]): Optional function for generating the trial string representation. trial_dirname_creator (Callable[[Trial], str]): Function for generating the trial dirname. This function should take in a Trial object and return a string representing the name of the directory. The return value cannot be a path. sync_config (SyncConfig): Configuration object for syncing. See tune.SyncConfig. export_formats (list): List of formats that exported at the end of the experiment. Default is None. max_failures (int): Try to recover a trial at least this many times. Ray will recover from the latest checkpoint if present. Setting to -1 will lead to infinite recovery retries. Setting to 0 will disable retries. Defaults to 0. fail_fast (bool | str): Whether to fail upon the first error. If fail_fast='raise' provided, Tune will automatically raise the exception received by the Trainable. fail_fast='raise' can easily leak resources and should be used with caution (it is best used with `ray.init(local_mode=True)`). restore (str): Path to checkpoint. Only makes sense to set if running 1 trial. Defaults to None. server_port (int): Port number for launching TuneServer. resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY", or bool. LOCAL/True restores the checkpoint from the local experiment directory, determined by ``name`` and ``local_dir``. REMOTE restores the checkpoint from ``upload_dir`` (as passed to ``sync_config``). PROMPT provides CLI feedback. False forces a new experiment. ERRORED_ONLY resets and reruns ERRORED trials upon resume - previous trial artifacts will be left untouched. If resume is set but checkpoint does not exist, ValueError will be thrown. reuse_actors (bool): Whether to reuse actors between different trials when possible. This can drastically speed up experiments that start and stop actors often (e.g., PBT in time-multiplexing mode). This requires trials to have the same resource requirements. trial_executor (TrialExecutor): Manage the execution of trials. raise_on_failed_trial (bool): Raise TuneError if there exists failed trial (of ERROR state) when the experiments complete. callbacks (list): List of callbacks that will be called at different times in the training loop. Must be instances of the ``ray.tune.callback.Callback`` class. If not passed, `LoggerCallback` and `SyncerCallback` callbacks are automatically added. max_concurrent_trials (int): Maximum number of trials to run concurrently. Must be non-negative. If None or 0, no limit will be applied. This is achieved by wrapping the ``search_alg`` in a :class:`ConcurrencyLimiter`, and thus setting this argument will raise an exception if the ``search_alg`` is already a :class:`ConcurrencyLimiter`. Defaults to None. _remote (bool): Whether to run the Tune driver in a remote function. This is disabled automatically if a custom trial executor is passed in. This is enabled by default in Ray client mode. Returns: ExperimentAnalysis: Object for experiment analysis. Raises: TuneError: Any trials failed and `raise_on_failed_trial` is True. """ # To be removed in 1.9. if queue_trials is not None: raise DeprecationWarning( "`queue_trials` has been deprecated and is replaced by " "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. " "Per default at least one Trial is queued at all times, " "so you likely don't need to change anything other than " "removing this argument from your call to `tune.run()`") # NO CODE IS TO BE ADDED ABOVE THIS COMMENT # remote_run_kwargs must be defined before any other # code is ran to ensure that at this point, # `locals()` is equal to args and kwargs remote_run_kwargs = locals().copy() remote_run_kwargs.pop("_remote") if _remote is None: _remote = ray.util.client.ray.is_connected() if _remote is True and trial_executor: raise ValueError("cannot use custom trial executor") if not trial_executor or isinstance(trial_executor, RayTrialExecutor): _ray_auto_init() if _remote: remote_run = ray.remote(num_cpus=0)(run) # Make sure tune.run is called on the sever node. remote_run = force_on_current_node(remote_run) # JupyterNotebooks don't work with remote tune runs out of the box # (e.g. via Ray client) as they don't have access to the main # process stdout. So we introduce a queue here that accepts # callables, which will then be executed on the driver side. if isinstance(progress_reporter, JupyterNotebookReporter): execute_queue = Queue(actor_options={ "num_cpus": 0, **force_on_current_node(None) }) progress_reporter.set_output_queue(execute_queue) def get_next_queue_item(): try: return execute_queue.get(block=False) except Empty: return None else: # If we don't need a queue, use this dummy get fn instead of # scheduling an unneeded actor def get_next_queue_item(): return None def _handle_execute_queue(): execute_item = get_next_queue_item() while execute_item: if isinstance(execute_item, Callable): execute_item() execute_item = get_next_queue_item() remote_future = remote_run.remote(_remote=False, **remote_run_kwargs) # ray.wait(...)[1] returns futures that are not ready, yet while ray.wait([remote_future], timeout=0.2)[1]: # Check if we have items to execute _handle_execute_queue() # Handle queue one last time _handle_execute_queue() return ray.get(remote_future) del remote_run_kwargs all_start = time.time() if loggers: # Raise DeprecationWarning in 1.9, remove in 1.10/1.11 warnings.warn( "The `loggers` argument is deprecated. Please pass the respective " "`LoggerCallback` classes to the `callbacks` argument instead. " "See https://docs.ray.io/en/latest/tune/api_docs/logging.html") if mode and mode not in ["min", "max"]: raise ValueError( "The `mode` parameter passed to `tune.run()` has to be one of " "['min', 'max']") set_verbosity(verbose) config = config or {} sync_config = sync_config or SyncConfig() set_sync_periods(sync_config) if num_samples == -1: num_samples = sys.maxsize result_buffer_length = None # Create scheduler here as we need access to some of its properties if isinstance(scheduler, str): # importing at top level causes a recursive dependency from ray.tune.schedulers import create_scheduler scheduler = create_scheduler(scheduler) scheduler = scheduler or FIFOScheduler() if not scheduler.supports_buffered_results: # Result buffering with e.g. a Hyperband scheduler is a bad idea, as # hyperband tries to stop trials when processing brackets. With result # buffering, we might trigger this multiple times when evaluating # a single trial, which leads to unexpected behavior. env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "") if env_result_buffer_length: warnings.warn( f"You are using a {type(scheduler)} scheduler, but " f"TUNE_RESULT_BUFFER_LENGTH is set " f"({env_result_buffer_length}). This can lead to undesired " f"and faulty behavior, so the buffer length was forcibly set " f"to 1 instead.") result_buffer_length = 1 if (isinstance(scheduler, (PopulationBasedTraining, PopulationBasedTrainingReplay)) and not reuse_actors): warnings.warn( "Consider boosting PBT performance by enabling `reuse_actors` as " "well as implementing `reset_config` for Trainable.") trial_executor = trial_executor or RayTrialExecutor( reuse_actors=reuse_actors, result_buffer_length=result_buffer_length) if isinstance(run_or_experiment, list): experiments = run_or_experiment else: experiments = [run_or_experiment] for i, exp in enumerate(experiments): if not isinstance(exp, Experiment): experiments[i] = Experiment( name=name, run=exp, stop=stop, time_budget_s=time_budget_s, config=config, resources_per_trial=resources_per_trial, num_samples=num_samples, local_dir=local_dir, sync_config=sync_config, trial_name_creator=trial_name_creator, trial_dirname_creator=trial_dirname_creator, log_to_file=log_to_file, checkpoint_freq=checkpoint_freq, checkpoint_at_end=checkpoint_at_end, keep_checkpoints_num=keep_checkpoints_num, checkpoint_score_attr=checkpoint_score_attr, export_formats=export_formats, max_failures=max_failures, restore=restore, ) else: logger.debug("Ignoring some parameters passed into tune.run.") if fail_fast and max_failures != 0: raise ValueError("max_failures must be 0 if fail_fast=True.") if isinstance(search_alg, str): # importing at top level causes a recursive dependency from ray.tune.suggest import create_searcher search_alg = create_searcher(search_alg) # if local_mode=True is set during ray.init(). is_local_mode = ray.worker._mode() == ray.worker.LOCAL_MODE if is_local_mode: max_concurrent_trials = 1 if not search_alg: search_alg = BasicVariantGenerator( max_concurrent=max_concurrent_trials or 0) elif max_concurrent_trials or is_local_mode: if isinstance(search_alg, ConcurrencyLimiter): if not is_local_mode: if search_alg.max_concurrent != max_concurrent_trials: raise ValueError( "You have specified `max_concurrent_trials=" f"{max_concurrent_trials}`, but the `search_alg` is " "already a `ConcurrencyLimiter` with `max_concurrent=" f"{search_alg.max_concurrent}. FIX THIS by setting " "`max_concurrent_trials=None`.") else: logger.warning( "You have specified `max_concurrent_trials=" f"{max_concurrent_trials}`, but the `search_alg` is " "already a `ConcurrencyLimiter`. " "`max_concurrent_trials` will be ignored.") else: if max_concurrent_trials < 1: raise ValueError( "`max_concurrent_trials` must be greater or equal than 1, " f"got {max_concurrent_trials}.") if isinstance(search_alg, Searcher): search_alg = ConcurrencyLimiter( search_alg, max_concurrent=max_concurrent_trials) elif not is_local_mode: logger.warning( "You have passed a `SearchGenerator` instance as the " "`search_alg`, but `max_concurrent_trials` requires a " "`Searcher` instance`. `max_concurrent_trials` " "will be ignored.") if isinstance(search_alg, Searcher): search_alg = SearchGenerator(search_alg) if config and not set_search_properties_backwards_compatible( search_alg.set_search_properties, metric, mode, config, **experiments[0].public_spec, ): if has_unresolved_values(config): raise ValueError( "You passed a `config` parameter to `tune.run()` with " "unresolved parameters, but the search algorithm was already " "instantiated with a search space. Make sure that `config` " "does not contain any more parameter definitions - include " "them in the search algorithm's search space if necessary.") if not scheduler.set_search_properties(metric, mode): raise ValueError( "You passed a `metric` or `mode` argument to `tune.run()`, but " "the scheduler you are using was already instantiated with their " "own `metric` and `mode` parameters. Either remove the arguments " "from your scheduler or from your call to `tune.run()`") # Create syncer callbacks callbacks = create_default_callbacks(callbacks, sync_config, metric=metric, loggers=loggers) runner = TrialRunner( search_alg=search_alg, scheduler=scheduler, local_checkpoint_dir=experiments[0].checkpoint_dir, remote_checkpoint_dir=experiments[0].remote_checkpoint_dir, sync_config=sync_config, stopper=experiments[0].stopper, resume=resume, server_port=server_port, fail_fast=fail_fast, trial_executor=trial_executor, callbacks=callbacks, metric=metric, # Driver should only sync trial checkpoints if # checkpoints are not synced to cloud driver_sync_trial_checkpoints=not bool(sync_config.upload_dir), ) if not runner.resumed: for exp in experiments: search_alg.add_configurations([exp]) else: logger.info("TrialRunner resumed, ignoring new add_experiment but " "updating trial resources.") if resources_per_trial: runner.update_pending_trial_resources(resources_per_trial) progress_reporter = progress_reporter or detect_reporter() if not progress_reporter.set_search_properties(metric, mode): raise ValueError( "You passed a `metric` or `mode` argument to `tune.run()`, but " "the reporter you are using was already instantiated with their " "own `metric` and `mode` parameters. Either remove the arguments " "from your reporter or from your call to `tune.run()`") progress_reporter.set_total_samples(search_alg.total_samples) # Calls setup on callbacks runner.setup_experiments(experiments=experiments, total_num_samples=search_alg.total_samples) # User Warning for GPUs if trial_executor.has_gpus(): if isinstance(resources_per_trial, dict) and "gpu" in resources_per_trial: # "gpu" is manually set. pass elif _check_default_resources_override(experiments[0].run_identifier): # "default_resources" is manually overridden. pass else: logger.warning("Tune detects GPUs, but no trials are using GPUs. " "To enable trials to use GPUs, set " "tune.run(resources_per_trial={'gpu': 1}...) " "which allows Tune to expose 1 GPU to each trial. " "You can also override " "`Trainable.default_resource_request` if using the " "Trainable API.") original_handler = signal.getsignal(signal.SIGINT) state = {signal.SIGINT: False} def sigint_handler(sig, frame): logger.warning( "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. " "This will try to checkpoint the experiment state one last time. " "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) " "to skip. ") state[signal.SIGINT] = True # Restore original signal handler to react to future SIGINT signals signal.signal(signal.SIGINT, original_handler) if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")): signal.signal(signal.SIGINT, sigint_handler) tune_start = time.time() progress_reporter.set_start_time(tune_start) while not runner.is_finished() and not state[signal.SIGINT]: runner.step() if has_verbosity(Verbosity.V1_EXPERIMENT): _report_progress(runner, progress_reporter) tune_taken = time.time() - tune_start try: runner.checkpoint(force=True) except Exception as e: logger.warning(f"Trial Runner checkpointing failed: {str(e)}") if has_verbosity(Verbosity.V1_EXPERIMENT): _report_progress(runner, progress_reporter, done=True) wait_for_sync() runner.cleanup() incomplete_trials = [] for trial in runner.get_trials(): if trial.status != Trial.TERMINATED: incomplete_trials += [trial] if incomplete_trials: if raise_on_failed_trial and not state[signal.SIGINT]: raise TuneError("Trials did not complete", incomplete_trials) else: logger.error("Trials did not complete: %s", incomplete_trials) all_taken = time.time() - all_start if has_verbosity(Verbosity.V1_EXPERIMENT): logger.info(f"Total run time: {all_taken:.2f} seconds " f"({tune_taken:.2f} seconds for the tuning loop).") if state[signal.SIGINT]: logger.warning( "Experiment has been interrupted, but the most recent state was " "saved. You can continue running this experiment by passing " "`resume=True` to `tune.run()`") trials = runner.get_trials() return ExperimentAnalysis( runner.checkpoint_file, trials=trials, default_metric=metric, default_mode=mode, sync_config=sync_config, )
def testMultiStepRun(self): ray.init(num_cpus=4, num_gpus=2) runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 5 }, "resources": Resources(cpu=1, gpu=1), } trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)] for t in trials: runner.add_trial(t) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(trials[1].status, Trial.PENDING) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(trials[1].status, Trial.RUNNING) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(trials[1].status, Trial.RUNNING) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(trials[1].status, Trial.RUNNING)
def test_trial_migration(start_connected_emptyhead_cluster): """Removing a node while cluster has space should migrate trial. The trial state should also be consistent with the checkpoint. """ cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 3 }, "checkpoint_freq": 2, "max_failures": 2 } # Test recovery of trial that hasn't been checkpointed t = Trial("__fake", **kwargs) runner.add_trial(t) runner.step() # start runner.step() # 1 result assert t.last_result node2 = cluster.add_node(num_cpus=1) cluster.remove_node(node) cluster.wait_for_nodes() runner.step() # Recovery step # TODO(rliaw): This assertion is not critical but will not pass # because checkpoint handling is messy and should be refactored # rather than hotfixed. # assert t.last_result is None, "Trial result not restored correctly." for i in range(3): runner.step() assert t.status == Trial.TERMINATED # Test recovery of trial that has been checkpointed t2 = Trial("__fake", **kwargs) runner.add_trial(t2) runner.step() # start runner.step() # 1 result runner.step() # 2 result and checkpoint assert t2.has_checkpoint() node3 = cluster.add_node(num_cpus=1) cluster.remove_node(node2) cluster.wait_for_nodes() runner.step() # Recovery step assert t2.last_result["training_iteration"] == 2 for i in range(1): runner.step() assert t2.status == Trial.TERMINATED # Test recovery of trial that won't be checkpointed t3 = Trial("__fake", **{"stopping_criterion": {"training_iteration": 3}}) runner.add_trial(t3) runner.step() # start runner.step() # 1 result cluster.add_node(num_cpus=1) cluster.remove_node(node3) cluster.wait_for_nodes() runner.step() # Error handling step assert t3.status == Trial.ERROR with pytest.raises(TuneError): runner.step()
def testCheckpointing(self): ray.init(num_cpus=1, num_gpus=1) runner = TrialRunner() kwargs = { "stopping_criterion": { "training_iteration": 1 }, "resources": Resources(cpu=1, gpu=1), "checkpoint_freq": 1, } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() # Start trial self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1) runner.step() # Process result, dispatch save runner.step() # Process save, stop trial kwargs["restore_path"] = trials[0].checkpoint.value self.assertEqual(trials[0].status, Trial.TERMINATED) runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() self.assertEqual(trials[1].status, Trial.PENDING) runner.step() # Start trial, dispatch restore self.assertEqual(trials[1].status, Trial.RUNNING) runner.step() # Process restore self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(ray.get(trials[1].runner.get_info.remote()), 1) self.addCleanup(os.remove, trials[0].checkpoint.value)
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster): """Test checks that trial restarts if checkpoint is lost w/ node fail.""" cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 3 }, "checkpoint_freq": 2, "max_failures": 2 } # Test recovery of trial that has been checkpointed t1 = Trial("__fake", **kwargs) runner.add_trial(t1) runner.step() # start runner.step() # 1 result runner.step() # 2 result and checkpoint assert t1.has_checkpoint() cluster.add_node(num_cpus=1) cluster.remove_node(node) cluster.wait_for_nodes() shutil.rmtree(os.path.dirname(t1._checkpoint.value)) runner.step() # Recovery step for i in range(3): runner.step() assert t1.status == Trial.TERMINATED
def testPauseResumeCheckpointCount(self): ray.init(num_cpus=2) tempdir = tempfile.mkdtemp() self.addCleanup(shutil.rmtree, tempdir) trial = Trial("__fake", keep_checkpoints_num=2) trial.init_logdir() trial.checkpoint_manager.delete = lambda cp: shutil.rmtree(cp.value) def write_checkpoint(trial: Trial, index: int): checkpoint_dir = TrainableUtil.make_checkpoint_dir(trial.logdir, index=index) result = {"training_iteration": index} with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f: json.dump(result, f) tune_cp = _TuneCheckpoint(_TuneCheckpoint.PERSISTENT, checkpoint_dir, result) trial.saving_to = tune_cp trial.on_checkpoint(tune_cp) return checkpoint_dir def get_checkpoint_dirs(trial: Trial): return [ d for d in os.listdir(trial.logdir) if d.startswith("checkpoint_") ] runner = TrialRunner(local_checkpoint_dir=tempdir) runner.add_trial(trial) # Write 1 checkpoint result = write_checkpoint(trial, 1) runner._on_saving_result(trial, result) # Expect 1 checkpoint cp_dirs = get_checkpoint_dirs(trial) self.assertEqual(len(cp_dirs), 1, msg=f"Checkpoint dirs: {cp_dirs}") # Write second checkpoint result = write_checkpoint(trial, 2) runner._on_saving_result(trial, result) # Expect 2 checkpoints cp_dirs = get_checkpoint_dirs(trial) self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}") # Write third checkpoint result = write_checkpoint(trial, 3) runner._on_saving_result(trial, result) # Expect 2 checkpoints because keep_checkpoints_num = 2 cp_dirs = get_checkpoint_dirs(trial) self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}") # Re-instantiate trial runner and resume runner.checkpoint(force=True) runner = TrialRunner(local_checkpoint_dir=tempdir) runner.resume() trial = runner.get_trials()[0] trial.checkpoint_manager.delete = lambda cp: shutil.rmtree(cp.value) # Write fourth checkpoint result = write_checkpoint(trial, 4) runner._on_saving_result(trial, result) # Expect 2 checkpoints because keep_checkpoints_num = 2 cp_dirs = get_checkpoint_dirs(trial) self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}") # Write fifth checkpoint result = write_checkpoint(trial, 5) runner._on_saving_result(trial, result) # Expect 2 checkpoints because keep_checkpoints_num = 2 cp_dirs = get_checkpoint_dirs(trial) self.assertEqual(len(cp_dirs), 2, msg=f"Checkpoint dirs: {cp_dirs}") # Checkpoints before restore should be deleted self.assertIn("checkpoint_000004", cp_dirs) self.assertIn("checkpoint_000005", cp_dirs) self.assertNotIn("checkpoint_000002", cp_dirs) self.assertNotIn("checkpoint_000003", cp_dirs)
def testTrialNoCheckpointSave(self): """Check that non-checkpointing trials *are* saved.""" os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1" ray.init(num_cpus=3) runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0) runner.add_trial( Trial( "__fake", trial_id="non_checkpoint", stopping_criterion={"training_iteration": 2}, ) ) while not all(t.status == Trial.TERMINATED for t in runner.get_trials()): runner.step() runner.add_trial( Trial( "__fake", trial_id="checkpoint", checkpoint_at_end=True, stopping_criterion={"training_iteration": 2}, ) ) while not all(t.status == Trial.TERMINATED for t in runner.get_trials()): runner.step() runner.add_trial( Trial( "__fake", trial_id="pending", stopping_criterion={"training_iteration": 2}, ) ) runner.step() runner.step() runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir) new_trials = runner2.get_trials() self.assertEqual(len(new_trials), 3) self.assertTrue(runner2.get_trial("non_checkpoint").status == Trial.TERMINATED) self.assertTrue(runner2.get_trial("checkpoint").status == Trial.TERMINATED) self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING) self.assertTrue(runner2.get_trial("pending").has_reported_at_least_once) runner2.step()
def testResourceScheduler(self): ray.init(num_cpus=4, num_gpus=1) runner = TrialRunner() kwargs = { "stopping_criterion": { "training_iteration": 1 }, "resources": Resources(cpu=1, gpu=1), } trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)] for t in trials: runner.add_trial(t) runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(trials[1].status, Trial.PENDING) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.PENDING) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.TERMINATED)
def testCheckpointWithFunction(self): ray.init(num_cpus=2) trial = Trial( "__fake", config={ "callbacks": { "on_episode_start": lambda i: i, } }, checkpoint_freq=1, ) runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0) runner.add_trial(trial) for _ in range(5): runner.step() # force checkpoint runner.checkpoint() runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir) new_trial = runner2.get_trials()[0] self.assertTrue("callbacks" in new_trial.config) self.assertTrue("on_episode_start" in new_trial.config["callbacks"])
def run_experiments(experiments, search_alg=None, scheduler=None, with_server=False, server_port=TuneServer.DEFAULT_PORT, verbose=2, resume=False, queue_trials=False, reuse_actors=False, trial_executor=None, raise_on_failed_trial=True): """Runs and blocks until all trials finish. Args: experiments (Experiment | list | dict): Experiments to run. Will be passed to `search_alg` via `add_configurations`. search_alg (SearchAlgorithm): Search Algorithm. Defaults to BasicVariantGenerator. scheduler (TrialScheduler): Scheduler for executing the experiment. Choose among FIFO (default), MedianStopping, AsyncHyperBand, and HyperBand. with_server (bool): Starts a background Tune server. Needed for using the Client API. server_port (int): Port number for launching TuneServer. verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and trial results. resume (bool|"prompt"): If checkpoint exists, the experiment will resume from there. If resume is "prompt", Tune will prompt if checkpoint detected. queue_trials (bool): Whether to queue trials when the cluster does not currently have enough resources to launch one. This should be set to True when running on an autoscaling cluster to enable automatic scale-up. reuse_actors (bool): Whether to reuse actors between different trials when possible. This can drastically speed up experiments that start and stop actors often (e.g., PBT in time-multiplexing mode). This requires trials to have the same resource requirements. trial_executor (TrialExecutor): Manage the execution of trials. raise_on_failed_trial (bool): Raise TuneError if there exists failed trial (of ERROR state) when the experiments complete. Examples: >>> experiment_spec = Experiment("experiment", my_func) >>> run_experiments(experiments=experiment_spec) >>> experiment_spec = {"experiment": {"run": my_func}} >>> run_experiments(experiments=experiment_spec) >>> run_experiments( >>> experiments=experiment_spec, >>> scheduler=MedianStoppingRule(...)) >>> run_experiments( >>> experiments=experiment_spec, >>> search_alg=SearchAlgorithm(), >>> scheduler=MedianStoppingRule(...)) Returns: List of Trial objects, holding data for each executed trial. """ # This is important to do this here # because it schematize the experiments # and it conducts the implicit registration. experiments = convert_to_experiment_list(experiments) checkpoint_dir = _find_checkpoint_dir(experiments) runner = None restore = False if TrialRunner.checkpoint_exists(checkpoint_dir): if resume == "prompt": msg = ("Found incomplete experiment at {}. " "Would you like to resume it?".format(checkpoint_dir)) restore = click.confirm(msg, default=False) if restore: logger.info("Tip: to always resume, " "pass resume=True to run_experiments()") else: logger.info("Tip: to always start a new experiment, " "pass resume=False to run_experiments()") elif resume: restore = True else: logger.info( "Tip: to resume incomplete experiments, " "pass resume='prompt' or resume=True to run_experiments()") else: logger.info( "Did not find checkpoint file in {}.".format(checkpoint_dir)) if restore: runner = try_restore_runner(checkpoint_dir, search_alg, scheduler, trial_executor) else: logger.info("Starting a new experiment.") if not runner: if scheduler is None: scheduler = FIFOScheduler() if search_alg is None: search_alg = BasicVariantGenerator() search_alg.add_configurations(experiments) runner = TrialRunner(search_alg, scheduler=scheduler, metadata_checkpoint_dir=checkpoint_dir, launch_web_server=with_server, server_port=server_port, verbose=bool(verbose > 1), queue_trials=queue_trials, reuse_actors=reuse_actors, trial_executor=trial_executor) if verbose: print(runner.debug_string(max_debug=99999)) last_debug = 0 while not runner.is_finished(): runner.step() if time.time() - last_debug > DEBUG_PRINT_INTERVAL: if verbose: print(runner.debug_string()) last_debug = time.time() if verbose: print(runner.debug_string(max_debug=99999)) wait_for_log_sync() errored_trials = [] for trial in runner.get_trials(): if trial.status != Trial.TERMINATED: errored_trials += [trial] if errored_trials: if raise_on_failed_trial: raise TuneError("Trials did not complete", errored_trials) else: logger.error("Trials did not complete: %s", errored_trials) return runner.get_trials()
def run(run_or_experiment, name=None, stop=None, config=None, resources_per_trial=None, num_samples=1, local_dir=None, upload_dir=None, trial_name_creator=None, loggers=None, sync_to_cloud=None, sync_to_driver=None, checkpoint_freq=0, checkpoint_at_end=False, sync_on_checkpoint=True, keep_checkpoints_num=None, checkpoint_score_attr=None, global_checkpoint_period=10, export_formats=None, max_failures=0, restore=None, search_alg=None, scheduler=None, with_server=False, server_port=TuneServer.DEFAULT_PORT, verbose=2, resume=False, queue_trials=False, reuse_actors=False, trial_executor=None, raise_on_failed_trial=True, return_trials=False, ray_auto_init=True, sync_function=None): """Executes training. Args: run_or_experiment (function|class|str|Experiment): If function|class|str, this is the algorithm or model to train. This may refer to the name of a built-on algorithm (e.g. RLLib's DQN or PPO), a user-defined trainable function or class, or the string identifier of a trainable function or class registered in the tune registry. If Experiment, then Tune will execute training based on Experiment.spec. name (str): Name of experiment. stop (dict|func): The stopping criteria. If dict, the keys may be any field in the return result of 'train()', whichever is reached first. If function, it must take (trial_id, result) as arguments and return a boolean (True if trial should be stopped, False otherwise). config (dict): Algorithm-specific configuration for Tune variant generation (e.g. env, hyperparams). Defaults to empty dict. Custom search algorithms may ignore this. resources_per_trial (dict): Machine resources to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be assigned unless you specify them here. Defaults to 1 CPU and 0 GPUs in ``Trainable.default_resource_request()``. num_samples (int): Number of times to sample from the hyperparameter space. Defaults to 1. If `grid_search` is provided as an argument, the grid will be repeated `num_samples` of times. local_dir (str): Local dir to save training results to. Defaults to ``~/ray_results``. upload_dir (str): Optional URI to sync training results and checkpoints to (e.g. ``s3://bucket`` or ``gs://bucket``). trial_name_creator (func): Optional function for generating the trial string representation. loggers (list): List of logger creators to be used with each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS. See `ray/tune/logger.py`. sync_to_cloud (func|str): Function for syncing the local_dir to and from upload_dir. If string, then it must be a string template that includes `{source}` and `{target}` for the syncer to run. If not provided, the sync command defaults to standard S3 or gsutil sync commands. sync_to_driver (func|str|bool): Function for syncing trial logdir from remote node to local. If string, then it must be a string template that includes `{source}` and `{target}` for the syncer to run. If True or not provided, it defaults to using rsync. If False, syncing to driver is disabled. checkpoint_freq (int): How many training iterations between checkpoints. A value of 0 (default) disables checkpointing. checkpoint_at_end (bool): Whether to checkpoint at the end of the experiment regardless of the checkpoint_freq. Default is False. sync_on_checkpoint (bool): Force sync-down of trial checkpoint to driver. If set to False, checkpoint syncing from worker to driver is asynchronous and best-effort. This does not affect persistent storage syncing. Defaults to True. keep_checkpoints_num (int): Number of checkpoints to keep. A value of `None` keeps all checkpoints. Defaults to `None`. If set, need to provide `checkpoint_score_attr`. checkpoint_score_attr (str): Specifies by which attribute to rank the best checkpoint. Default is increasing order. If attribute starts with `min-` it will rank attribute in decreasing order, i.e. `min-validation_loss`. global_checkpoint_period (int): Seconds between global checkpointing. This does not affect `checkpoint_freq`, which specifies frequency for individual trials. export_formats (list): List of formats that exported at the end of the experiment. Default is None. max_failures (int): Try to recover a trial at least this many times. Ray will recover from the latest checkpoint if present. Setting to -1 will lead to infinite recovery retries. Setting to 0 will disable retries. Defaults to 3. restore (str): Path to checkpoint. Only makes sense to set if running 1 trial. Defaults to None. search_alg (SearchAlgorithm): Search Algorithm. Defaults to BasicVariantGenerator. scheduler (TrialScheduler): Scheduler for executing the experiment. Choose among FIFO (default), MedianStopping, AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to ray.tune.schedulers for more options. with_server (bool): Starts a background Tune server. Needed for using the Client API. server_port (int): Port number for launching TuneServer. verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and trial results. resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", or bool. LOCAL/True restores the checkpoint from the local_checkpoint_dir. REMOTE restores the checkpoint from remote_checkpoint_dir. PROMPT provides CLI feedback. False forces a new experiment. If resume is set but checkpoint does not exist, ValueError will be thrown. queue_trials (bool): Whether to queue trials when the cluster does not currently have enough resources to launch one. This should be set to True when running on an autoscaling cluster to enable automatic scale-up. reuse_actors (bool): Whether to reuse actors between different trials when possible. This can drastically speed up experiments that start and stop actors often (e.g., PBT in time-multiplexing mode). This requires trials to have the same resource requirements. trial_executor (TrialExecutor): Manage the execution of trials. raise_on_failed_trial (bool): Raise TuneError if there exists failed trial (of ERROR state) when the experiments complete. ray_auto_init (bool): Automatically starts a local Ray cluster if using a RayTrialExecutor (which is the default) and if Ray is not initialized. Defaults to True. sync_function: Deprecated. See `sync_to_cloud` and `sync_to_driver`. Returns: List of Trial objects. Raises: TuneError if any trials failed and `raise_on_failed_trial` is True. Examples: >>> tune.run(mytrainable, scheduler=PopulationBasedTraining()) >>> tune.run(mytrainable, num_samples=5, reuse_actors=True) >>> tune.run( >>> "PG", >>> num_samples=5, >>> config={ >>> "env": "CartPole-v0", >>> "lr": tune.sample_from(lambda _: np.random.rand()) >>> } >>> ) """ trial_executor = trial_executor or RayTrialExecutor( queue_trials=queue_trials, reuse_actors=reuse_actors, ray_auto_init=ray_auto_init) if isinstance(run_or_experiment, list): experiments = run_or_experiment else: experiments = [run_or_experiment] if len(experiments) > 1: logger.info( "Running multiple concurrent experiments is experimental and may " "not work with certain features.") for i, exp in enumerate(experiments): if not isinstance(exp, Experiment): run_identifier = Experiment.register_if_needed(exp) experiments[i] = Experiment( name=name, run=run_identifier, stop=stop, config=config, resources_per_trial=resources_per_trial, num_samples=num_samples, local_dir=local_dir, upload_dir=upload_dir, sync_to_driver=sync_to_driver, trial_name_creator=trial_name_creator, loggers=loggers, checkpoint_freq=checkpoint_freq, checkpoint_at_end=checkpoint_at_end, sync_on_checkpoint=sync_on_checkpoint, keep_checkpoints_num=keep_checkpoints_num, checkpoint_score_attr=checkpoint_score_attr, export_formats=export_formats, max_failures=max_failures, restore=restore, sync_function=sync_function) else: logger.debug("Ignoring some parameters passed into tune.run.") if sync_to_cloud: for exp in experiments: assert exp.remote_checkpoint_dir, ( "Need `upload_dir` if `sync_to_cloud` given.") runner = TrialRunner( search_alg=search_alg or BasicVariantGenerator(), scheduler=scheduler or FIFOScheduler(), local_checkpoint_dir=experiments[0].checkpoint_dir, remote_checkpoint_dir=experiments[0].remote_checkpoint_dir, sync_to_cloud=sync_to_cloud, checkpoint_period=global_checkpoint_period, resume=resume, launch_web_server=with_server, server_port=server_port, verbose=bool(verbose > 1), trial_executor=trial_executor) for exp in experiments: runner.add_experiment(exp) if IS_NOTEBOOK: reporter = JupyterNotebookReporter(overwrite=verbose < 2) else: reporter = CLIReporter() # User Warning for GPUs if trial_executor.has_gpus(): if isinstance(resources_per_trial, dict) and "gpu" in resources_per_trial: # "gpu" is manually set. pass elif _check_default_resources_override(experiments[0].run_identifier): # "default_resources" is manually overriden. pass else: logger.warning("Tune detects GPUs, but no trials are using GPUs. " "To enable trials to use GPUs, set " "tune.run(resources_per_trial={'gpu': 1}...) " "which allows Tune to expose 1 GPU to each trial. " "You can also override " "`Trainable.default_resource_request` if using the " "Trainable API.") last_debug = 0 while not runner.is_finished(): runner.step() if time.time() - last_debug > DEBUG_PRINT_INTERVAL: if verbose: reporter.report(runner) last_debug = time.time() try: runner.checkpoint(force=True) except Exception: logger.exception("Trial Runner checkpointing failed.") if verbose: reporter.report(runner) wait_for_sync() errored_trials = [] for trial in runner.get_trials(): if trial.status != Trial.TERMINATED: errored_trials += [trial] if errored_trials: if raise_on_failed_trial: raise TuneError("Trials did not complete", errored_trials) else: logger.error("Trials did not complete: %s", errored_trials) trials = runner.get_trials() if return_trials: return trials logger.info("Returning an analysis object by default. You can call " "`analysis.trials` to retrieve a list of trials. " "This message will be removed in future versions of Tune.") return ExperimentAnalysis(runner.checkpoint_file, trials=trials)
def test_cluster_down_simple(start_connected_cluster, tmpdir): """Tests that TrialRunner save/restore works on cluster shutdown.""" cluster = start_connected_cluster cluster.add_node(num_cpus=1) cluster.wait_for_nodes() dirpath = str(tmpdir) runner = TrialRunner( BasicVariantGenerator(), metadata_checkpoint_dir=dirpath) kwargs = { "stopping_criterion": { "training_iteration": 2 }, "checkpoint_freq": 1, "max_failures": 1 } trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)] for t in trials: runner.add_trial(t) runner.step() # start runner.step() # start2 runner.step() # step assert all(t.status == Trial.RUNNING for t in runner.get_trials()) runner.checkpoint() cluster.shutdown() ray.shutdown() cluster = _start_new_cluster() runner = TrialRunner.restore(dirpath) runner.step() # start runner.step() # start2 for i in range(3): runner.step() with pytest.raises(TuneError): runner.step() assert all(t.status == Trial.TERMINATED for t in runner.get_trials()) cluster.shutdown()
def testCheckpointing(self): ray.init(num_cpus=1, num_gpus=1) runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 1 }, "resources": Resources(cpu=1, gpu=1), } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1) path = runner.trial_executor.save(trials[0]) kwargs["restore_path"] = path runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.PENDING) runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) self.assertEqual(ray.get(trials[1].runner.get_info.remote()), 1) self.addCleanup(os.remove, path)
def test_cluster_interrupt(start_connected_cluster, tmpdir): """Tests run_experiment on cluster shutdown with actual interrupt. This is an end-to-end test. """ cluster = start_connected_cluster dirpath = str(tmpdir) # Needs to be in scope for pytest class _Mock(tune.Trainable): """Finishes on the 4th iteration.""" def _setup(self, config): self.state = {"hi": 0} def _train(self): self.state["hi"] += 1 time.sleep(0.5) return {"done": self.state["hi"] >= 4} def _save(self, path): return self.state def _restore(self, state): self.state = state # Removes indent from class. reformatted = "\n".join(line[4:] if len(line) else line for line in inspect.getsource(_Mock).split("\n")) script = """ import time import ray from ray import tune ray.init(redis_address="{redis_address}") {fail_class_code} kwargs = dict( run={fail_class}, stop=dict(training_iteration=5), local_dir="{checkpoint_dir}", checkpoint_freq=1, max_failures=1) tune.run_experiments( dict(experiment=kwargs), raise_on_failed_trial=False) """.format( redis_address=cluster.redis_address, checkpoint_dir=dirpath, fail_class_code=reformatted, fail_class=_Mock.__name__) run_string_as_driver_nonblocking(script) # Wait until the right checkpoint is saved. # The trainable returns every 0.5 seconds, so this should not miss # the checkpoint. metadata_checkpoint_dir = os.path.join(dirpath, "experiment") for i in range(50): if TrialRunner.checkpoint_exists(metadata_checkpoint_dir): # Inspect the internal trialrunner runner = TrialRunner.restore(metadata_checkpoint_dir) trials = runner.get_trials() last_res = trials[0].last_result if last_res and last_res.get("training_iteration") == 3: break time.sleep(0.2) if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir): raise RuntimeError("Checkpoint file didn't appear.") ray.shutdown() cluster.shutdown() cluster = _start_new_cluster() Experiment._register_if_needed(_Mock) # Inspect the internal trialrunner runner = TrialRunner.restore(metadata_checkpoint_dir) trials = runner.get_trials() assert trials[0].last_result["training_iteration"] == 3 assert trials[0].status == Trial.PENDING # Restore properly from checkpoint trials2 = tune.run_experiments( { "experiment": { "run": _Mock, "local_dir": dirpath, "checkpoint_freq": 1 } }, resume=True, raise_on_failed_trial=False) assert all(t.status == Trial.TERMINATED for t in trials2) assert {t.trial_id for t in trials2} == {t.trial_id for t in trials} cluster.shutdown()
def testRestoreMetricsAfterCheckpointing(self): ray.init(num_cpus=1, num_gpus=1) runner = TrialRunner(BasicVariantGenerator()) kwargs = { "resources": Resources(cpu=1, gpu=1), } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1) path = runner.trial_executor.save(trials[0]) runner.trial_executor.stop_trial(trials[0]) kwargs["restore_path"] = path runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() self.assertEqual(trials[0].status, Trial.TERMINATED) self.assertEqual(trials[1].status, Trial.RUNNING) runner.step() self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10) self.assertEqual(trials[1].last_result["iterations_since_restore"], 1) self.assertGreater(trials[1].last_result["time_since_restore"], 0) runner.step() self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20) self.assertEqual(trials[1].last_result["iterations_since_restore"], 2) self.assertGreater(trials[1].last_result["time_since_restore"], 0) self.addCleanup(os.remove, path)
def run_experiments(experiments, scheduler=None, with_server=False, server_port=TuneServer.DEFAULT_PORT, verbose=True): # Make sure rllib agents are registered from ray import rllib # noqa # pylint: disable=unused-import if scheduler is None: scheduler = FIFOScheduler() runner = TrialRunner( scheduler, launch_web_server=with_server, server_port=server_port) for name, spec in experiments.items(): for trial in generate_trials(spec, name): trial.set_verbose(verbose) runner.add_trial(trial) print(runner.debug_string(max_debug=99999)) last_debug = 0 while not runner.is_finished(): runner.step() if time.time() - last_debug > DEBUG_PRINT_INTERVAL: print(runner.debug_string()) last_debug = time.time() print(runner.debug_string(max_debug=99999)) for trial in runner.get_trials(): # TODO(rliaw): What about errored? if trial.status != Trial.TERMINATED: raise TuneError("Trial did not complete", trial) wait_for_log_sync() return runner.get_trials()
def testResultDone(self): """Tests that last_result is marked `done` after trial is complete.""" ray.init(num_cpus=1, num_gpus=1) runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 2 }, "resources": Resources(cpu=1, gpu=1), } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() runner.step() self.assertEqual(trials[0].status, Trial.RUNNING) runner.step() self.assertNotEqual(trials[0].last_result[DONE], True) runner.step() self.assertEqual(trials[0].last_result[DONE], True)
def run_experiments(experiments, search_alg=None, scheduler=None, with_server=False, server_port=TuneServer.DEFAULT_PORT, verbose=True, resume=False, queue_trials=False, trial_executor=None, raise_on_failed_trial=True): """Runs and blocks until all trials finish. Args: experiments (Experiment | list | dict): Experiments to run. Will be passed to `search_alg` via `add_configurations`. search_alg (SearchAlgorithm): Search Algorithm. Defaults to BasicVariantGenerator. scheduler (TrialScheduler): Scheduler for executing the experiment. Choose among FIFO (default), MedianStopping, AsyncHyperBand, and HyperBand. with_server (bool): Starts a background Tune server. Needed for using the Client API. server_port (int): Port number for launching TuneServer. verbose (bool): How much output should be printed for each trial. resume (bool|"prompt"): If checkpoint exists, the experiment will resume from there. If resume is "prompt", Tune will prompt if checkpoint detected. queue_trials (bool): Whether to queue trials when the cluster does not currently have enough resources to launch one. This should be set to True when running on an autoscaling cluster to enable automatic scale-up. trial_executor (TrialExecutor): Manage the execution of trials. raise_on_failed_trial (bool): Raise TuneError if there exists failed trial (of ERROR state) when the experiments complete. Examples: >>> experiment_spec = Experiment("experiment", my_func) >>> run_experiments(experiments=experiment_spec) >>> experiment_spec = {"experiment": {"run": my_func}} >>> run_experiments(experiments=experiment_spec) >>> run_experiments( >>> experiments=experiment_spec, >>> scheduler=MedianStoppingRule(...)) >>> run_experiments( >>> experiments=experiment_spec, >>> search_alg=SearchAlgorithm(), >>> scheduler=MedianStoppingRule(...)) Returns: List of Trial objects, holding data for each executed trial. """ # This is important to do this here # because it schematize the experiments # and it conducts the implicit registration. experiments = convert_to_experiment_list(experiments) checkpoint_dir = _find_checkpoint_dir(experiments) runner = None restore = False if os.path.exists( os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)): if resume == "prompt": msg = ("Found incomplete experiment at {}. " "Would you like to resume it?".format(checkpoint_dir)) restore = click.confirm(msg, default=False) if restore: logger.info("Tip: to always resume, " "pass resume=True to run_experiments()") else: logger.info("Tip: to always start a new experiment, " "pass resume=False to run_experiments()") elif resume: restore = True else: logger.info( "Tip: to resume incomplete experiments, " "pass resume='prompt' or resume=True to run_experiments()") else: logger.info( "Did not find checkpoint file in {}.".format(checkpoint_dir)) if restore: runner = try_restore_runner(checkpoint_dir, search_alg, scheduler, trial_executor) else: logger.info("Starting a new experiment.") if not runner: if scheduler is None: scheduler = FIFOScheduler() if search_alg is None: search_alg = BasicVariantGenerator() search_alg.add_configurations(experiments) runner = TrialRunner( search_alg, scheduler=scheduler, metadata_checkpoint_dir=checkpoint_dir, launch_web_server=with_server, server_port=server_port, verbose=verbose, queue_trials=queue_trials, trial_executor=trial_executor) print(runner.debug_string(max_debug=99999)) last_debug = 0 while not runner.is_finished(): runner.step() if time.time() - last_debug > DEBUG_PRINT_INTERVAL: print(runner.debug_string()) last_debug = time.time() print(runner.debug_string(max_debug=99999)) wait_for_log_sync() errored_trials = [] for trial in runner.get_trials(): if trial.status != Trial.TERMINATED: errored_trials += [trial] if errored_trials: if raise_on_failed_trial: raise TuneError("Trials did not complete", errored_trials) else: logger.error("Trials did not complete: %s", errored_trials) return runner.get_trials()
def testTrialErrorResumeTrue(self): ray.init(num_cpus=3, local_mode=True, include_dashboard=False) runner = TrialRunner(local_checkpoint_dir=self.tmpdir) kwargs = { "stopping_criterion": {"training_iteration": 4}, "resources": Resources(cpu=1, gpu=0), } trials = [ Trial("__fake", config={"mock_error": True}, **kwargs), Trial("__fake", **kwargs), Trial("__fake", **kwargs), ] for t in trials: runner.add_trial(t) while not runner.is_finished(): runner.step() runner.checkpoint(force=True) assert trials[0].status == Trial.ERROR del runner new_runner = TrialRunner( resume="ERRORED_ONLY", local_checkpoint_dir=self.tmpdir ) assert len(new_runner.get_trials()) == 3 assert Trial.ERROR not in (t.status for t in new_runner.get_trials()) # The below is just a check for standard behavior. disable_error = False for t in new_runner.get_trials(): if t.config.get("mock_error"): t.config["mock_error"] = False disable_error = True assert disable_error while not new_runner.is_finished(): new_runner.step() assert Trial.ERROR not in (t.status for t in new_runner.get_trials())