def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    """Check that an interrupted RLlib ("PG") experiment can be resumed.

    A driver script is started through ``run_string_as_driver_nonblocking``
    against the connected cluster. As soon as the experiment checkpoint
    under ``tmpdir`` reports a first trial with a ``training_iteration``
    result, the cluster is shut down, a new one is started and the
    experiment is resumed. All resumed trials must end up TERMINATED.

    Args:
        start_connected_cluster: Fixture providing the initial cluster.
        tmpdir: Fixture providing the directory used as ``local_dir``.

    Raises:
        RuntimeError: If no experiment checkpoint shows up while polling.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    # Everything after the trainable name must be a keyword argument: a
    # positional argument placed after keywords is a SyntaxError, and since
    # the driver runs detached it would fail without any visible error here.
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")


tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)

    # Poll (at most 100 times, 0.3s apart) until the checkpointed first
    # trial carries a result with a truthy "training_iteration".
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for _ in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    # Tear down the first cluster while the driver is still running.
    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    """Check that an interrupted RLlib ("PG") experiment can be resumed.

    A driver script is started through ``run_string_as_driver_nonblocking``
    against the connected cluster. As soon as the experiment checkpoint
    under ``tmpdir`` reports a first trial with a ``training_iteration``
    result, the cluster is shut down, a new one is started and the
    experiment is resumed. All resumed trials must end up TERMINATED.

    Args:
        start_connected_cluster: Fixture providing the initial cluster.
        tmpdir: Fixture providing the directory used as ``local_dir``.

    Raises:
        RuntimeError: If no experiment checkpoint shows up while polling.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

kwargs = dict(
    run="PG",
    env="CartPole-v1",
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(
        redis_address=cluster.redis_address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)

    # Poll (at most 100 times, 0.3s apart) until the checkpointed first
    # trial carries a result with a truthy "training_iteration".
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for _ in range(100):
        if TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    # Tear down the first cluster while the driver is still running.
    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    # Disconnect this process before stopping the cluster, mirroring the
    # teardown of the first cluster above, so no connection to a stopped
    # cluster is left behind for whatever runs next.
    ray.shutdown()
    cluster.shutdown()
def _prompt_restore(checkpoint_dir, resume):
    """Decide whether the experiment in ``checkpoint_dir`` should be resumed.

    Args:
        checkpoint_dir: Directory passed to ``TrialRunner.checkpoint_exists``.
        resume: ``"prompt"`` to ask interactively via ``click.confirm``;
            any other truthy value resumes without asking; a falsy value
            never resumes.

    Returns:
        The user's answer when prompted, True when ``resume`` is otherwise
        truthy and a checkpoint exists, False in every other case.
    """
    # Nothing to resume from: report it and bail out early.
    if not TrialRunner.checkpoint_exists(checkpoint_dir):
        logger.info(
            "Did not find checkpoint file in {}.".format(checkpoint_dir))
        return False

    if resume == "prompt":
        question = ("Found incomplete experiment at {}. "
                    "Would you like to resume it?".format(checkpoint_dir))
        answer = click.confirm(question, default=False)
        # Tell the user how to skip the prompt next time, whichever way
        # they answered.
        if answer:
            tip = "Tip: to always resume, pass resume=True to run()"
        else:
            tip = ("Tip: to always start a new experiment, "
                   "pass resume=False to run()")
        logger.info(tip)
        return answer

    if resume:
        return True

    logger.info("Tip: to resume incomplete experiments, "
                "pass resume='prompt' or resume=True to run()")
    return False
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    The source of a small trainable is embedded in a driver script that is
    started without blocking. Once the checkpointed first trial reports
    ``training_iteration == 3`` the cluster is shut down, a new one is
    started and the experiment is resumed with the same trial ids.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""

        def _setup(self, config):
            self.state = {"hi": 0}

        def _train(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def _save(self, path):
            return self.state

        def _restore(self, state):
            self.state = state

    # Drop the first four characters of every source line so the class
    # text can sit at module level inside the driver script (slicing an
    # empty line yields an empty line).
    source_lines = inspect.getsource(_Mock).split("\n")
    mock_source = "\n".join([text[4:] for text in source_lines])

    driver_template = """
import time
import ray
from ray import tune

ray.init(address="{address}")

{fail_class_code}

tune.run(
    {fail_class},
    name="experiment",
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    global_checkpoint_period=0,
    max_failures=1,
    raise_on_failed_trial=False)
"""
    run_string_as_driver_nonblocking(
        driver_template.format(
            address=cluster.address,
            checkpoint_dir=dirpath,
            fail_class_code=mock_source,
            fail_class=_Mock.__name__))

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for _ in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Peek at the checkpointed state through a throwaway runner.
            snapshot = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            latest = snapshot.get_trials()[0].last_result
            if latest and latest.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment.register_if_needed(_Mock)

    # Load the checkpoint once more and verify what was recorded.
    runner = TrialRunner(
        resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
    trials = runner.get_trials()
    first_trial = trials[0]
    assert first_trial.last_result["training_iteration"] == 3
    assert first_trial.status == Trial.PENDING

    # Resume for real and make sure the same trials run to completion.
    resumed_spec = {
        "experiment": {
            "run": _Mock,
            "local_dir": dirpath,
            "checkpoint_freq": 1
        }
    }
    trials2 = tune.run_experiments(
        resumed_spec, resume=True, raise_on_failed_trial=False)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    resumed_ids = {t.trial_id for t in trials2}
    checkpointed_ids = {t.trial_id for t in trials}
    assert resumed_ids == checkpointed_ids
    ray.shutdown()
    cluster.shutdown()
def run_experiments(experiments,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=2,
                    resume=False,
                    queue_trials=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Run the given experiments, blocking until every trial has finished.

    Args:
        experiments (Experiment | list | dict): Experiments to run. They
            are handed to `search_alg` through `add_configurations`.
        search_alg (SearchAlgorithm): Search algorithm to use. A
            BasicVariantGenerator is created when omitted.
        scheduler (TrialScheduler): Scheduler for executing the
            experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Start a background Tune server. Required for
            the Client API.
        server_port (int): Port on which the TuneServer is launched.
        verbose (int): Verbosity mode, one of 0, 1, or 2.
            0 = silent, 1 = only status updates,
            2 = status and trial results.
        resume (bool|"prompt"): Resume from an existing checkpoint if one
            is found. With "prompt", Tune asks before resuming.
        queue_trials (bool): Queue trials when the cluster currently
            lacks the resources to launch one. Should be True on an
            autoscaling cluster to enable automatic scale-up.
        trial_executor (TrialExecutor): Manages the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError when some trial is
            not in the TERMINATED state once the experiments complete;
            otherwise the unfinished trials are only logged.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    Raises:
        TuneError: If trials did not complete and `raise_on_failed_trial`
            is set.
    """
    # Conversion has to happen up front: it normalizes the experiment
    # specs and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)
    checkpoint_dir = _find_checkpoint_dir(experiments)

    # Work out whether an existing checkpoint should be picked up.
    should_restore = False
    if not TrialRunner.checkpoint_exists(checkpoint_dir):
        logger.info(
            "Did not find checkpoint file in {}.".format(checkpoint_dir))
    elif resume == "prompt":
        question = ("Found incomplete experiment at {}. "
                    "Would you like to resume it?".format(checkpoint_dir))
        should_restore = click.confirm(question, default=False)
        if should_restore:
            tip = ("Tip: to always resume, "
                   "pass resume=True to run_experiments()")
        else:
            tip = ("Tip: to always start a new experiment, "
                   "pass resume=False to run_experiments()")
        logger.info(tip)
    elif resume:
        should_restore = True
    else:
        logger.info(
            "Tip: to resume incomplete experiments, "
            "pass resume='prompt' or resume=True to run_experiments()")

    runner = None
    if should_restore:
        runner = try_restore_runner(checkpoint_dir, search_alg, scheduler,
                                    trial_executor)
    else:
        logger.info("Starting a new experiment.")

    # Build a fresh runner when nothing was restored.
    if not runner:
        if scheduler is None:
            scheduler = FIFOScheduler()
        if search_alg is None:
            search_alg = BasicVariantGenerator()
        search_alg.add_configurations(experiments)
        runner = TrialRunner(
            search_alg,
            scheduler=scheduler,
            metadata_checkpoint_dir=checkpoint_dir,
            launch_web_server=with_server,
            server_port=server_port,
            verbose=bool(verbose > 1),
            queue_trials=queue_trials,
            trial_executor=trial_executor)

    if verbose:
        print(runner.debug_string(max_debug=99999))

    # Step the runner to completion, printing status at most once per
    # DEBUG_PRINT_INTERVAL.
    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            if verbose:
                print(runner.debug_string())
            last_debug = time.time()

    if verbose:
        print(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    unfinished = [
        trial for trial in runner.get_trials()
        if trial.status != Trial.TERMINATED
    ]
    if unfinished:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", unfinished)
        logger.error("Trials did not complete: %s", unfinished)

    return runner.get_trials()
def run_experiments(experiments,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=2,
                    resume=False,
                    queue_trials=False,
                    reuse_actors=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Run the given experiments, blocking until every trial has finished.

    Args:
        experiments (Experiment | list | dict): Experiments to run. They
            are handed to `search_alg` through `add_configurations`.
        search_alg (SearchAlgorithm): Search algorithm to use. A
            BasicVariantGenerator is created when omitted.
        scheduler (TrialScheduler): Scheduler for executing the
            experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Start a background Tune server. Required for
            the Client API.
        server_port (int): Port on which the TuneServer is launched.
        verbose (int): Verbosity mode, one of 0, 1, or 2.
            0 = silent, 1 = only status updates,
            2 = status and trial results.
        resume (bool|"prompt"): Resume from an existing checkpoint if one
            is found. With "prompt", Tune asks before resuming.
        queue_trials (bool): Queue trials when the cluster currently
            lacks the resources to launch one. Should be True on an
            autoscaling cluster to enable automatic scale-up.
        reuse_actors (bool): Reuse actors between different trials when
            possible. This can drastically speed up experiments that
            start and stop actors often (e.g., PBT in time-multiplexing
            mode). Requires trials to share the same resource
            requirements.
        trial_executor (TrialExecutor): Manages the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError when some trial is
            not in the TERMINATED state once the experiments complete;
            otherwise the unfinished trials are only logged.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    Raises:
        TuneError: If trials did not complete and `raise_on_failed_trial`
            is set.
    """
    # Conversion has to happen up front: it normalizes the experiment
    # specs and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)
    checkpoint_dir = _find_checkpoint_dir(experiments)

    # Work out whether an existing checkpoint should be picked up.
    use_checkpoint = False
    if not TrialRunner.checkpoint_exists(checkpoint_dir):
        logger.info(
            "Did not find checkpoint file in {}.".format(checkpoint_dir))
    elif resume == "prompt":
        prompt_text = ("Found incomplete experiment at {}. "
                       "Would you like to resume it?".format(checkpoint_dir))
        use_checkpoint = click.confirm(prompt_text, default=False)
        if use_checkpoint:
            hint = ("Tip: to always resume, "
                    "pass resume=True to run_experiments()")
        else:
            hint = ("Tip: to always start a new experiment, "
                    "pass resume=False to run_experiments()")
        logger.info(hint)
    elif resume:
        use_checkpoint = True
    else:
        logger.info(
            "Tip: to resume incomplete experiments, "
            "pass resume='prompt' or resume=True to run_experiments()")

    runner = None
    if use_checkpoint:
        runner = try_restore_runner(checkpoint_dir, search_alg, scheduler,
                                    trial_executor)
    else:
        logger.info("Starting a new experiment.")

    # Build a fresh runner when nothing was restored.
    if not runner:
        if scheduler is None:
            scheduler = FIFOScheduler()
        if search_alg is None:
            search_alg = BasicVariantGenerator()
        search_alg.add_configurations(experiments)

        runner_options = dict(
            scheduler=scheduler,
            metadata_checkpoint_dir=checkpoint_dir,
            launch_web_server=with_server,
            server_port=server_port,
            verbose=bool(verbose > 1),
            queue_trials=queue_trials,
            reuse_actors=reuse_actors,
            trial_executor=trial_executor)
        runner = TrialRunner(search_alg, **runner_options)

    if verbose:
        print(runner.debug_string(max_debug=99999))

    # Step the runner to completion, printing status at most once per
    # DEBUG_PRINT_INTERVAL.
    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            if verbose:
                print(runner.debug_string())
            last_debug = time.time()

    if verbose:
        print(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    incomplete = [
        trial for trial in runner.get_trials()
        if trial.status != Trial.TERMINATED
    ]
    if incomplete:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", incomplete)
        logger.error("Trials did not complete: %s", incomplete)

    return runner.get_trials()
def test_cluster_interrupt_searcher(start_connected_cluster, tmpdir):
    """Tests restoration of HyperOptSearch experiment on cluster shutdown
    with actual interrupt.

    Restoration should restore both state of trials
    and previous search algorithm (HyperOptSearch) state.
    This is an end-to-end test.

    Args:
        start_connected_cluster: Fixture providing the initial cluster.
        tmpdir: Fixture providing the directory used as ``--local-dir``.

    Raises:
        RuntimeError: If no experiment checkpoint shows up while polling.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    from ray.tune.examples.async_hyperband_example import MyTrainableClass
    from ray.tune import register_trainable
    register_trainable("trainable", MyTrainableClass)

    def execute_script_with_args(*args):
        """Start the helper script next to this file without waiting."""
        current_dir = os.path.dirname(__file__)
        script = os.path.join(current_dir,
                              "_test_cluster_interrupt_searcher.py")
        subprocess.Popen([sys.executable, script] + list(args))

    args = ["--ray-address", cluster.address, "--local-dir", dirpath]
    execute_script_with_args(*args)

    # Poll (at most 50 times, 0.5s apart) until the checkpoint holds at
    # least 10 trials.
    for _ in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            if trials and len(trials) >= 10:
                break
        time.sleep(.5)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        # The directory itself may be missing; listing it unguarded would
        # replace the informative error below with a FileNotFoundError.
        if os.path.isdir(local_checkpoint_dir):
            current_files = os.listdir(local_checkpoint_dir)
        else:
            current_files = []
        raise RuntimeError(
            f"Checkpoint file didn't appear in {local_checkpoint_dir}. "
            f"Current list: {current_files}.")

    ray.shutdown()
    cluster.shutdown()

    cluster = _start_new_cluster()
    execute_script_with_args(*(args + ["--resume"]))

    time.sleep(2)

    register_trainable("trainable", MyTrainableClass)
    reached = False
    for _ in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            # An empty list means the nonblocking script hasn't resumed
            # yet; fall through to the sleep below instead of spinning
            # through the remaining attempts without waiting.
            if trials:
                reached = True
                assert len(trials) >= 10
                assert len(trials) <= 20
                if len(trials) == 20:
                    break
                stop_fn = runner.trial_executor.stop_trial
                for trial in trials:
                    # Statuses are compared by value, as elsewhere in the
                    # tests; identity is not a reliable check for them.
                    if trial.status != Trial.ERROR:
                        stop_fn(trial)
        time.sleep(.5)
    assert reached is True

    ray.shutdown()
    cluster.shutdown()
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    The source of a small trainable is embedded in a driver script that is
    started without blocking. Once the checkpointed first trial reports
    ``training_iteration == 3`` the cluster is shut down, a new one is
    started and the experiment is resumed with the same trial ids.

    This is an end-to-end test.

    Args:
        start_connected_cluster: Fixture providing the initial cluster.
        tmpdir: Fixture providing the directory used as ``local_dir``.

    Raises:
        RuntimeError: If no experiment checkpoint shows up while polling.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""

        def _setup(self, config):
            self.state = {"hi": 0}

        def _train(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def _save(self, path):
            return self.state

        def _restore(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(line[4:] if len(line) else line
                            for line in inspect.getsource(_Mock).split("\n"))

    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

{fail_class_code}

kwargs = dict(
    run={fail_class},
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(
        redis_address=cluster.redis_address,
        checkpoint_dir=dirpath,
        fail_class_code=reformatted,
        fail_class=_Mock.__name__)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for _ in range(50):
        if TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment._register_if_needed(_Mock)

    # Inspect the internal trialrunner
    runner = TrialRunner.restore(metadata_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    # Disconnect this process before stopping the cluster, mirroring the
    # teardown of the first cluster above, so no connection to a stopped
    # cluster is left behind for whatever runs next.
    ray.shutdown()
    cluster.shutdown()