Example #1
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

kwargs = dict(
    run="PG",
    env="CartPole-v1",
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(redis_address=cluster.redis_address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if os.path.exists(
                os.path.join(metadata_checkpoint_dir,
                             TrialRunner.CKPT_FILE_NAME)):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.2)

    if not os.path.exists(
            os.path.join(metadata_checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    cluster.shutdown()
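These excerpts are lifted from Ray's Tune test suite and omit their module-level imports. A plausible minimal set, reconstructed from the names the tests use (exact module paths vary across Ray versions, so treat them as assumptions):

import inspect
import os
import time

import pytest

import ray
from ray import tune
from ray.tune.error import TuneError                # path assumed
from ray.tune.experiment import Experiment          # path assumed
from ray.tune.suggest import BasicVariantGenerator  # path assumed
from ray.tune.trial import Trial                    # path assumed
from ray.tune.trial_runner import TrialRunner       # path assumed

# run_string_as_driver_nonblocking, _start_new_cluster and the
# start_connected_cluster fixture are test helpers defined alongside
# these tests, not part of the public API.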
Example #2
def try_restore_runner(checkpoint_dir, search_alg, scheduler, trial_executor):
    new_runner = None
    try:
        new_runner = TrialRunner.restore(checkpoint_dir, search_alg, scheduler,
                                         trial_executor)
    except Exception:
        logger.exception("Runner restore failed. Restarting experiment.")
    return new_runner
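A minimal sketch of how a helper like this might be called, assuming `checkpoint_dir`, `search_alg`, `scheduler`, `trial_executor`, and `experiment` already exist in scope; it mirrors the fallback logic in the tune.run excerpt of Example #8 below rather than any documented API:

# Try to resume from the metadata checkpoint; start fresh if restore fails.
runner = try_restore_runner(checkpoint_dir, search_alg, scheduler,
                            trial_executor)
if runner is None:
    search_alg.add_configurations([experiment])
    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        metadata_checkpoint_dir=checkpoint_dir,
        trial_executor=trial_executor)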
Example #3
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

kwargs = dict(
    run="PG",
    env="CartPole-v1",
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(
        redis_address=cluster.redis_address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    cluster.shutdown()
Example #4
def test_cluster_down_simple(start_connected_cluster, tmpdir):
    """Tests that TrialRunner save/restore works on cluster shutdown."""
    cluster = start_connected_cluster
    cluster.add_node(resources=dict(CPU=1))
    assert cluster.wait_for_nodes()

    dirpath = str(tmpdir)
    runner = TrialRunner(BasicVariantGenerator(),
                         metadata_checkpoint_dir=dirpath)
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "checkpoint_freq": 1,
        "max_failures": 1
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # start
    runner.step()  # start2
    runner.step()  # step
    assert all(t.status == Trial.RUNNING for t in runner.get_trials())
    runner.checkpoint()

    cluster.shutdown()
    ray.shutdown()

    cluster = _start_new_cluster()
    runner = TrialRunner.restore(dirpath)
    runner.step()  # start
    runner.step()  # start2

    for i in range(3):
        runner.step()

    with pytest.raises(TuneError):
        runner.step()

    assert all(t.status == Trial.TERMINATED for t in runner.get_trials())
    cluster.shutdown()
Example #5
def test_cluster_down_simple(start_connected_cluster, tmpdir):
    """Tests that TrialRunner save/restore works on cluster shutdown."""
    cluster = start_connected_cluster
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    dirpath = str(tmpdir)
    runner = TrialRunner(
        BasicVariantGenerator(), metadata_checkpoint_dir=dirpath)
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "checkpoint_freq": 1,
        "max_failures": 1
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # start
    runner.step()  # start2
    runner.step()  # step
    assert all(t.status == Trial.RUNNING for t in runner.get_trials())
    runner.checkpoint()

    cluster.shutdown()
    ray.shutdown()

    cluster = _start_new_cluster()
    runner = TrialRunner.restore(dirpath)
    runner.step()  # start
    runner.step()  # start2

    for i in range(3):
        runner.step()

    with pytest.raises(TuneError):
        runner.step()

    assert all(t.status == Trial.TERMINATED for t in runner.get_trials())
    cluster.shutdown()
Example #6
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown even with atypical trial.

    The trial fails on the 4th step, and the checkpointing happens on
    the 3rd step, so restoring should actually launch the trial again.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

{fail_class_code}

kwargs = dict(
    run={fail_class},
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(redis_address=cluster.redis_address,
           checkpoint_dir=dirpath,
           fail_class_code=inspect.getsource(_Fail),
           fail_class=_Fail.__name__)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if os.path.exists(
                os.path.join(metadata_checkpoint_dir,
                             TrialRunner.CKPT_FILE_NAME)):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not os.path.exists(
            os.path.join(metadata_checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment._register_if_needed(_Fail)

    # Inspect the internal trialrunner
    runner = TrialRunner.restore(metadata_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Fail,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False)
    assert all(t.status == Trial.ERROR for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    cluster.shutdown()
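The `_Fail` trainable referenced above is defined elsewhere in the test module; the following is a reconstruction inferred from the docstring (fails on the 4th step) and from the `_Mock` class in Example #7 below, offered purely as an illustration, not the verbatim definition:

class _Fail(tune.Trainable):
    """Fails on the 4th iteration (reconstructed for illustration)."""

    def _setup(self, config):
        self.state = {"hi": 0}

    def _train(self):
        self.state["hi"] += 1
        time.sleep(0.5)
        if self.state["hi"] >= 4:
            assert False, "simulated trial failure"
        return {}

    def _save(self, path):
        return self.state

    def _restore(self, state):
        self.state = state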
Example #7
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""

        def _setup(self, config):
            self.state = {"hi": 0}

        def _train(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def _save(self, path):
            return self.state

        def _restore(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(line[4:] if len(line) else line
                            for line in inspect.getsource(_Mock).split("\n"))

    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

{fail_class_code}

kwargs = dict(
    run={fail_class},
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(
        redis_address=cluster.redis_address,
        checkpoint_dir=dirpath,
        fail_class_code=reformatted,
        fail_class=_Mock.__name__)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment._register_if_needed(_Mock)

    # Inspect the internal trialrunner
    runner = TrialRunner.restore(metadata_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    cluster.shutdown()
Example #8
def run(run_or_experiment,
        name=None,
        stop=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        upload_dir=None,
        trial_name_creator=None,
        loggers=None,
        sync_function=None,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        export_formats=None,
        max_failures=3,
        restore=None,
        search_alg=None,
        scheduler=None,
        with_server=False,
        server_port=TuneServer.DEFAULT_PORT,
        verbose=2,
        resume=False,
        queue_trials=False,
        reuse_actors=False,
        trial_executor=None,
        raise_on_failed_trial=True,
        ray_auto_init=True):
    """Executes training.

    Args:
        run_or_experiment (function|class|str|Experiment): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLlib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec.
        name (str): Name of experiment.
        stop (dict): The stopping criteria. The keys may be any field in
            the return result of 'train()'; training stops as soon as any
            criterion is met. Defaults to empty dict.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict): Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be
            assigned unless you specify them here. Defaults to 1 CPU and 0
            GPUs in ``Trainable.default_resource_request()``.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` times.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        upload_dir (str): Optional URI to sync training results
            to (e.g. ``s3://bucket``).
        trial_name_creator (func): Optional function for generating
            the trial string representation.
        loggers (list): List of logger creators to be used with
            each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS.
            See `ray/tune/logger.py`.
        sync_function (func|str): Function for syncing the local_dir to
            upload_dir. If string, then it must be a string template for
            syncer to run. If not provided, the sync command defaults
            to standard S3 or gsutil sync commands.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
        export_formats (list): List of formats to export at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial from its last
            checkpoint at most this many times. Only applies if
            checkpointing is enabled. Setting to -1 will lead to infinite
            recovery retries. Defaults to 3.
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        resume (bool|"prompt"): If checkpoint exists, the experiment will
            resume from there. If resume is "prompt", Tune will prompt if
            checkpoint detected.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if any trial ends in
            the ERROR state when the experiments complete.
        ray_auto_init (bool): Automatically starts a local Ray cluster
            if using a RayTrialExecutor (which is the default) and
            if Ray is not initialized. Defaults to True.

    Returns:
        List of Trial objects.

    Raises:
        TuneError if any trials failed and `raise_on_failed_trial` is True.

    Examples:
        >>> tune.run(mytrainable, scheduler=PopulationBasedTraining())

        >>> tune.run(mytrainable, num_samples=5, reuse_actors=True)

        >>> tune.run(
                "PG",
                num_samples=5,
                config={
                    "env": "CartPole-v0",
                    "lr": tune.sample_from(lambda _: np.random.rand())
                }
            )
    """
    trial_executor = trial_executor or RayTrialExecutor(
        queue_trials=queue_trials,
        reuse_actors=reuse_actors,
        ray_auto_init=ray_auto_init)
    experiment = run_or_experiment
    if not isinstance(run_or_experiment, Experiment):
        experiment = Experiment(name=name,
                                run=run_or_experiment,
                                stop=stop,
                                config=config,
                                resources_per_trial=resources_per_trial,
                                num_samples=num_samples,
                                local_dir=local_dir,
                                upload_dir=upload_dir,
                                trial_name_creator=trial_name_creator,
                                loggers=loggers,
                                sync_function=sync_function,
                                checkpoint_freq=checkpoint_freq,
                                checkpoint_at_end=checkpoint_at_end,
                                export_formats=export_formats,
                                max_failures=max_failures,
                                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    checkpoint_dir = _find_checkpoint_dir(experiment)
    should_restore = _prompt_restore(checkpoint_dir, resume)

    runner = None
    if should_restore:
        try:
            runner = TrialRunner.restore(checkpoint_dir, search_alg, scheduler,
                                         trial_executor)
        except Exception:
            logger.exception("Runner restore failed. Restarting experiment.")
    else:
        logger.info("Starting a new experiment.")

    if not runner:
        scheduler = scheduler or FIFOScheduler()
        search_alg = search_alg or BasicVariantGenerator()

        search_alg.add_configurations([experiment])

        runner = TrialRunner(search_alg=search_alg,
                             scheduler=scheduler,
                             metadata_checkpoint_dir=checkpoint_dir,
                             launch_web_server=with_server,
                             server_port=server_port,
                             verbose=bool(verbose > 1),
                             trial_executor=trial_executor)

    if verbose:
        print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            if verbose:
                print(runner.debug_string())
            last_debug = time.time()

    if verbose:
        print(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    return runner.get_trials()
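Given the resume handling above, resuming an interrupted experiment amounts to checkpointing trials and re-invoking the same call with resume=True. A hedged sketch using only parameters from the signature above (trainable, stopping criterion, and directory are illustrative):

# First run: checkpoint every iteration so the TrialRunner metadata
# exists under <local_dir>/<name>.
tune.run(
    "PG",
    name="experiment",
    stop={"training_iteration": 10},
    config={"env": "CartPole-v1"},
    checkpoint_freq=1,
    local_dir="/tmp/ray_results")

# After an interruption, the identical call with resume=True restores the
# TrialRunner from that checkpoint instead of starting a new experiment.
tune.run(
    "PG",
    name="experiment",
    stop={"training_iteration": 10},
    config={"env": "CartPole-v1"},
    checkpoint_freq=1,
    local_dir="/tmp/ray_results",
    resume=True)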