Example #1
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
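    # Build a driver script that runs a PG trial on CartPole with per-iteration checkpointing.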
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")


tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
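    # Simulate a cluster failure: tear everything down and bring up a fresh cluster.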
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()
Example #2
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""
        def setup(self, config):
            self.state = {"hi": 0}

        def step(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def save_checkpoint(self, path):
            return self.state

        def load_checkpoint(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(line[4:] if len(line) else line
                            for line in inspect.getsource(_Mock).split("\n"))

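    # Driver script: defines the trainable inline and runs it with per-iteration checkpointing.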
    script = """
import os
import time
import ray
from ray import tune

os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"

ray.init(address="{address}")

{fail_class_code}

tune.run(
    {fail_class},
    name="experiment",
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address,
        checkpoint_dir=dirpath,
        fail_class_code=reformatted,
        fail_class=_Mock.__name__,
    )
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
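    # Re-register the trainable with the new Ray session so restored trials can resolve it.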
    Experiment.register_if_needed(_Mock)

    # Inspect the internal trialrunner
    runner = TrialRunner(resume="LOCAL",
                         local_checkpoint_dir=local_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    ray.shutdown()
    cluster.shutdown()


def test_cluster_interrupt_searcher(start_connected_cluster, tmpdir, searcher):
    """Tests restoration of HyperOptSearch experiment on cluster shutdown
    with actual interrupt.

    Restoration should restore both state of trials
    and previous search algorithm (HyperOptSearch) state.
    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    from ray.tune import register_trainable

    register_trainable("trainable", MyTrainableClass)

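    # Helper to launch the searcher driver script as a separate, non-blocking process.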
    def execute_script_with_args(*args):
        current_dir = os.path.dirname(__file__)
        script = os.path.join(current_dir,
                              "_test_cluster_interrupt_searcher.py")
        subprocess.Popen([sys.executable, script] + list(args))

    args = (
        "--ray-address",
        cluster.address,
        "--local-dir",
        dirpath,
        "--searcher",
        searcher,
    )
    execute_script_with_args(*args)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    trials = []
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            if trials and len(trials) >= 10:
                break
        time.sleep(0.5)
    else:
        raise ValueError(f"Didn't generate enough trials: {len(trials)}")

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError(
            f"Checkpoint file didn't appear in {local_checkpoint_dir}. "
            f"Current list: {os.listdir(local_checkpoint_dir)}.")

    ray.shutdown()
    cluster.shutdown()

    cluster = _start_new_cluster()
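    # Relaunch the driver with --resume so it restores both trial and searcher state.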
    execute_script_with_args(*(args + ("--resume", )))

    time.sleep(2)

    register_trainable("trainable", MyTrainableClass)
    reached = False
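    # Poll the experiment checkpoint until the resumed run has produced all 20 trials.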
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()

            if len(trials) == 0:
                continue  # nonblocking script hasn't resumed yet, wait

            reached = True
            assert len(trials) >= 10
            assert len(trials) <= 20
            if len(trials) == 20:
                break
            else:
                stop_fn = runner.trial_executor.stop_trial
                for t in trials:
                    if t.status != Trial.ERROR:
                        stop_fn(t)
        time.sleep(0.5)
    assert reached is True

    ray.shutdown()
    cluster.shutdown()