Example 1
File: tune.py Project: adgirish/ray
def run_experiments(experiments, scheduler=None, with_server=False,
                    server_port=TuneServer.DEFAULT_PORT, verbose=True):

    # Make sure rllib agents are registered
    from ray import rllib  # noqa # pylint: disable=unused-import

    if scheduler is None:
        scheduler = FIFOScheduler()

    runner = TrialRunner(
        scheduler, launch_web_server=with_server, server_port=server_port)

    for name, spec in experiments.items():
        for trial in generate_trials(spec, name):
            trial.set_verbose(verbose)
            runner.add_trial(trial)
    print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            print(runner.debug_string())
            last_debug = time.time()

    print(runner.debug_string(max_debug=99999))

    for trial in runner.get_trials():
        # TODO(rliaw): What about errored?
        if trial.status != Trial.TERMINATED:
            raise TuneError("Trial did not complete", trial)

    wait_for_log_sync()
    return runner.get_trials()
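For orientation, here is a minimal sketch of how this run_experiments entry point might be called. The experiment-spec keys shown ("run", "stop", "config") follow the classic Tune spec format and are assumptions; only the experiments dict argument, with_server, and verbose come from the signature above.

from ray.tune.tune import run_experiments  # import path assumed

# Hypothetical spec: one experiment named "my_exp" running a registered
# trainable called "my_trainable" for up to 100 training iterations.
experiments = {
    "my_exp": {
        "run": "my_trainable",                 # assumed key: trainable name
        "stop": {"training_iteration": 100},   # assumed key: stopping criterion
        "config": {"lr": 0.01},                # assumed key: trainable config
    }
}

trials = run_experiments(experiments, with_server=False, verbose=True)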
Example 2
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Test recovery of trial that has been checkpointed
    t1 = Trial("__fake", **kwargs)
    runner.add_trial(t1)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t1.has_checkpoint()
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    shutil.rmtree(os.path.dirname(t1._checkpoint.value))

    runner.step()  # Recovery step
    for i in range(3):
        runner.step()

    assert t1.status == Trial.TERMINATED
Example 3
def test_trial_requeue(start_connected_emptyhead_cluster):
    """Removing a node in full cluster causes Trial to be requeued."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 5
        },
        "checkpoint_freq": 1,
        "max_failures": 1
    }

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # start
    runner.step()  # 1 result

    cluster.remove_node(node)
    cluster.wait_for_nodes()
    runner.step()
    assert all(t.status == Trial.PENDING for t in trials)

    with pytest.raises(TuneError):
        runner.step()
Example 4
def test_remove_node_before_result(start_connected_emptyhead_cluster):
    """Tune continues when node is removed before trial returns."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }
    trial = Trial("__fake", **kwargs)
    runner.add_trial(trial)

    runner.step()  # run 1
    assert trial.status == Trial.RUNNING
    cluster.remove_node(node)
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 1

    for i in range(3):
        runner.step()
    assert trial.status == Trial.TERMINATED

    with pytest.raises(TuneError):
        runner.step()
Example 5
def test_counting_resources(start_connected_cluster):
    """Tests that Tune accounting is consistent with actual cluster."""
    cluster = start_connected_cluster
    nodes = []
    assert ray.global_state.cluster_resources()["CPU"] == 1
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {"stopping_criterion": {"training_iteration": 10}}

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # run 1
    nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 2
    cluster.remove_node(nodes.pop())
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 1
    runner.step()  # run 2
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1

    for i in range(5):
        nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 6

    runner.step()  # 1 result
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 2
Example 6
def test_cluster_down_simple(start_connected_cluster, tmpdir):
    """Tests that TrialRunner save/restore works on cluster shutdown."""
    cluster = start_connected_cluster
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    dirpath = str(tmpdir)
    runner = TrialRunner(
        BasicVariantGenerator(), metadata_checkpoint_dir=dirpath)
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "checkpoint_freq": 1,
        "max_failures": 1
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # start
    runner.step()  # start2
    runner.step()  # step
    assert all(t.status == Trial.RUNNING for t in runner.get_trials())
    runner.checkpoint()

    cluster.shutdown()
    ray.shutdown()

    cluster = _start_new_cluster()
    runner = TrialRunner.restore(dirpath)
    runner.step()  # start
    runner.step()  # start2

    for i in range(3):
        runner.step()

    with pytest.raises(TuneError):
        runner.step()

    assert all(t.status == Trial.TERMINATED for t in runner.get_trials())
    cluster.shutdown()
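Example 6 above shows the save/restore cycle used across a cluster restart; distilled, the pattern looks roughly like this. The metadata_checkpoint_dir keyword and the TrialRunner.restore classmethod are taken from the example; the directory path and surrounding flow are illustrative only.

dirpath = "/tmp/tune_metadata"          # illustrative checkpoint location

runner = TrialRunner(
    BasicVariantGenerator(), metadata_checkpoint_dir=dirpath)
# ... add trials and step until the driver must go down ...
runner.checkpoint()                      # persist runner state to dirpath

# After the driver (and cluster) come back up:
runner = TrialRunner.restore(dirpath)    # rebuild runner from saved metadata
while not runner.is_finished():
    runner.step()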
Example 7
    def testFailFastRaise(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(fail_fast=TrialRunner.RAISE)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()  # Start trial
        self.assertEqual(trials[0].status, Trial.RUNNING)
        runner.step()  # Process result, dispatch save
        self.assertEqual(trials[0].status, Trial.RUNNING)
        runner.step()  # Process save
        with self.assertRaises(Exception):
            runner.step()  # Error
Example 8
    def testUserCheckpoint(self):
        ray.init(num_cpus=3)
        tmpdir = tempfile.mkdtemp()
        runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
        runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
        trials = runner.get_trials()

        runner.step()  # Start trial
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
        runner.step()  # Process result, dispatch save
        runner.step()  # Process save
        self.assertTrue(trials[0].has_checkpoint())

        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
        runner2.step()  # 5: Start trial and dispatch restore
        trials2 = runner2.get_trials()
        self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)
        shutil.rmtree(tmpdir)
Example 9
    def testMultiStepRun2(self):
        """Checks that runner.step throws when overstepping."""
        ray.init(num_cpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 2
            },
            "resources": Resources(cpu=1, gpu=0),
        }
        trials = [Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertRaises(TuneError, runner.step)
Example 10
    def testExtraCustomResources(self):
        ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 1
            },
            "resources": Resources(cpu=1,
                                   gpu=0,
                                   extra_custom_resources={"a": 2}),
        }
        trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.PENDING)

        runner.step()
        self.assertTrue(sum(t.status == Trial.RUNNING for t in trials) < 2)
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.PENDING)
Example 11
    def testFailureRecoveryEnabled(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(BasicVariantGenerator())
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 1,
            "config": {
                "mock_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[0].num_failures, 1)
        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
Example 12
    def testFailureRecoveryDisabled(self):
        ray.init(num_cpus=1, num_gpus=1)
        searchalg, scheduler = create_mock_components()

        runner = TrialRunner(searchalg, scheduler=scheduler)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()

        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[0].num_failures, 1)
        self.assertEqual(len(searchalg.errored_trials), 1)
        self.assertEqual(len(scheduler.errored_trials), 1)
Example 13
    def testFailFast(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(fail_fast=True)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        while not runner.is_finished():
            runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        # Somehow with `fail_fast=True`, if one errors out, the others are
        # then stopped with `TERMINATED` status.
        self.assertEqual(trials[1].status, Trial.TERMINATED)
        self.assertRaises(TuneError, lambda: runner.step())
Example 14
    def testChangeResources(self):
        """Checks that resource requirements can be changed on fly."""
        ray.init(num_cpus=2)

        class ChangingScheduler(FIFOScheduler):
            def on_trial_result(self, trial_runner, trial, result):
                if result["training_iteration"] == 1:
                    executor = trial_runner.trial_executor
                    executor.stop_trial(trial)
                    trial.update_resources(dict(cpu=2, gpu=0))
                    executor.start_trial(trial)
                return TrialScheduler.CONTINUE

        runner = TrialRunner(scheduler=ChangingScheduler())
        kwargs = {
            "stopping_criterion": {
                "training_iteration": 2
            },
            "resources": Resources(cpu=1, gpu=0),
        }
        trials = [Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(
            runner.trial_executor._pg_manager.occupied_resources().get("CPU"),
            1)
        self.assertRaises(
            ValueError, lambda: trials[0].update_resources(dict(cpu=2, gpu=0)))

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(
            runner.trial_executor._pg_manager.occupied_resources().get("CPU"),
            2)
Example 15
    def testCheckpointAtEndNotBuffered(self):
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
        os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5"

        def num_checkpoints(trial):
            return sum(
                item.startswith("checkpoint_")
                for item in os.listdir(trial.logdir))

        ray.init(num_cpus=2)

        trial = Trial("__fake",
                      checkpoint_at_end=True,
                      stopping_criterion={"training_iteration": 4})
        runner = TrialRunner(
            local_checkpoint_dir=self.tmpdir,
            checkpoint_period=0,
            trial_executor=RayTrialExecutor(result_buffer_length=7))
        runner.add_trial(trial)

        runner.step()  # start trial

        runner.step()  # run iteration 1
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 1)
        self.assertEqual(num_checkpoints(trial), 0)

        runner.step()  # run iteration 2
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 2)
        self.assertEqual(num_checkpoints(trial), 0)

        runner.step()  # run iteration 3
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
        self.assertEqual(num_checkpoints(trial), 0)

        runner.step()  # run iteration 4
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 4)
        self.assertEqual(num_checkpoints(trial), 1)
Example 16
def test_trial_requeue(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node in full cluster causes Trial to be requeued."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 5
        },
        "checkpoint_freq": 1,
        "max_failures": 1,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake",
    }

    trials = [Trial(trainable_id, **kwargs), Trial(trainable_id, **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # Start trial
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save

    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save (detect error), requeue trial
    assert all(
        t.status == Trial.PENDING for t in trials), runner.debug_string()

    with pytest.raises(TuneError):
        runner.step()
Example 17
    def testUserCheckpoint(self):
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

        ray.init(num_cpus=3)
        runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
        runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
        trials = runner.get_trials()

        runner.step()  # Start trial
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
        runner.step()  # Process result, dispatch save
        runner.step()  # Process save
        self.assertTrue(trials[0].has_checkpoint())

        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
        runner2.step()  # 5: Start trial and dispatch restore
        trials2 = runner2.get_trials()
        self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)
Example 18
    def testFailFastRaise(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(fail_fast=TrialRunner.RAISE)
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 0,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        with self.assertRaises(Exception):
            while not runner.is_finished():
                runner.step()

        # These checks are not critical; they only showcase the difference
        # from the non-raising fail_fast mode.
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.PENDING)
Example 19
    def testFailureRecoveryNodeRemoval(self):
        # Node removal simulation only works with resource requests
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        ray.init(num_cpus=1, num_gpus=1)
        searchalg, scheduler = create_mock_components()

        runner = TrialRunner(searchalg, scheduler=scheduler)

        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 1,
            "config": {
                "mock_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        with patch("ray.cluster_resources") as resource_mock:
            resource_mock.return_value = {"CPU": 1, "GPU": 1}
            runner.step()  # Start trial
            self.assertEqual(trials[0].status, Trial.RUNNING)

            runner.step()  # Process result, dispatch save
            runner.step()  # Process save
            self.assertEqual(trials[0].status, Trial.RUNNING)

            # Mimic a node failure
            resource_mock.return_value = {"CPU": 0, "GPU": 0}
            runner.step()  # Detect node failure
            self.assertEqual(trials[0].status, Trial.PENDING)
            self.assertEqual(trials[0].num_failures, 1)
            self.assertEqual(len(searchalg.errored_trials), 0)
            self.assertEqual(len(scheduler.errored_trials), 1)
Example 20
    def testRestoreMetricsAfterCheckpointing(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()  # Start trial
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
        # checkpoint = runner.trial_executor.save(trials[0])
        runner.step()  # Process result, dispatch save
        runner.step()  # Process save
        runner.trial_executor.stop_trial(trials[0])
        kwargs["restore_path"] = trials[0].checkpoint.value

        kwargs.pop("checkpoint_freq")  # No checkpointing for next trial
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()  # Start trial, dispatch restore
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.RUNNING)
        runner.step()  # Process restore
        runner.step()  # Process result
        self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
        self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
        self.assertGreater(trials[1].last_result["time_since_restore"], 0)
        runner.step()  # Process restore
        self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
        self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
        self.assertGreater(trials[1].last_result["time_since_restore"], 0)
        self.addCleanup(os.remove, trials[0].checkpoint.value)
Example 21
    def testTrialSaveRestore(self):
        """Creates different trials to test runner.checkpoint/restore."""
        ray.init(num_cpus=3)
        tmpdir = tempfile.mkdtemp()

        runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
        trials = [
            Trial("__fake",
                  trial_id="trial_terminate",
                  stopping_criterion={"training_iteration": 1},
                  checkpoint_freq=1)
        ]
        runner.add_trial(trials[0])
        runner.step()  # start
        runner.step()
        self.assertEquals(trials[0].status, Trial.TERMINATED)

        trials += [
            Trial("__fake",
                  trial_id="trial_fail",
                  stopping_criterion={"training_iteration": 3},
                  checkpoint_freq=1,
                  config={"mock_error": True})
        ]
        runner.add_trial(trials[1])
        runner.step()
        runner.step()
        runner.step()
        self.assertEquals(trials[1].status, Trial.ERROR)

        trials += [
            Trial("__fake",
                  trial_id="trial_succ",
                  stopping_criterion={"training_iteration": 2},
                  checkpoint_freq=1)
        ]
        runner.add_trial(trials[2])
        runner.step()
        self.assertEquals(len(runner.trial_executor.get_checkpoints()), 3)
        self.assertEquals(trials[2].status, Trial.RUNNING)

        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
        for tid in ["trial_terminate", "trial_fail"]:
            original_trial = runner.get_trial(tid)
            restored_trial = runner2.get_trial(tid)
            self.assertEqual(original_trial.status, restored_trial.status)

        restored_trial = runner2.get_trial("trial_succ")
        self.assertEqual(Trial.PENDING, restored_trial.status)

        runner2.step()
        runner2.step()
        runner2.step()
        self.assertRaises(TuneError, runner2.step)
        shutil.rmtree(tmpdir)
Example 22
    def testTrialNoCheckpointSave(self):
        """Check that non-checkpointing trials *are* saved."""
        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

        ray.init(num_cpus=3)

        runner = TrialRunner(local_checkpoint_dir=self.tmpdir,
                             checkpoint_period=0)
        runner.add_trial(
            Trial(
                "__fake",
                trial_id="non_checkpoint",
                stopping_criterion={"training_iteration": 2},
            ))

        while not all(t.status == Trial.TERMINATED
                      for t in runner.get_trials()):
            runner.step()

        runner.add_trial(
            Trial(
                "__fake",
                trial_id="checkpoint",
                checkpoint_at_end=True,
                stopping_criterion={"training_iteration": 2},
            ))

        while not all(t.status == Trial.TERMINATED
                      for t in runner.get_trials()):
            runner.step()

        runner.add_trial(
            Trial(
                "__fake",
                trial_id="pending",
                stopping_criterion={"training_iteration": 2},
            ))

        old_trials = runner.get_trials()
        while not old_trials[2].has_reported_at_least_once:
            runner.step()

        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
        new_trials = runner2.get_trials()
        self.assertEqual(len(new_trials), 3)
        self.assertTrue(
            runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
        self.assertTrue(
            runner2.get_trial("checkpoint").status == Trial.TERMINATED)
        self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
        self.assertTrue(
            runner2.get_trial("pending").has_reported_at_least_once)
        runner2.step()
Example 23
def test_queue_trials(start_connected_emptyhead_cluster):
    """Tests explicit oversubscription for autoscaling.

    Tune oversubscribes a trial when `queue_trials=True`, but
    does not block other trials from running.
    """
    os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

    cluster = start_connected_emptyhead_cluster
    runner = TrialRunner()

    def create_trial(cpu, gpu=0):
        kwargs = {
            "resources": Resources(cpu=cpu, gpu=gpu),
            "stopping_criterion": {
                "training_iteration": 3
            }
        }
        return Trial("__fake", **kwargs)

    runner.add_trial(create_trial(cpu=1))
    with pytest.raises(TuneError):
        runner.step()  # run 1

    del runner

    executor = RayTrialExecutor(queue_trials=True)
    runner = TrialRunner(trial_executor=executor)
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    cpu_only = create_trial(cpu=1)
    runner.add_trial(cpu_only)
    runner.step()  # add cpu_only trial

    gpu_trial = create_trial(cpu=1, gpu=1)
    runner.add_trial(gpu_trial)
    runner.step()  # queue gpu_trial

    # This tests that the cpu_only trial should bypass the queued trial.
    for i in range(3):
        runner.step()
    assert cpu_only.status == Trial.TERMINATED
    assert gpu_trial.status == Trial.RUNNING

    # Scale up
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()

    for i in range(3):
        runner.step()
    assert gpu_trial.status == Trial.TERMINATED
Example 24
    def testTrialNoSave(self):
        """Check that non-checkpointing trials are not saved."""
        ray.init(num_cpus=3)
        tmpdir = tempfile.mkdtemp()

        runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
        runner.add_trial(
            Trial(
                "__fake",
                trial_id="non_checkpoint",
                stopping_criterion={"training_iteration": 2}))

        while not all(t.status == Trial.TERMINATED
                      for t in runner.get_trials()):
            runner.step()

        runner.add_trial(
            Trial(
                "__fake",
                trial_id="checkpoint",
                checkpoint_at_end=True,
                stopping_criterion={"training_iteration": 2}))

        while not all(t.status == Trial.TERMINATED
                      for t in runner.get_trials()):
            runner.step()

        runner.add_trial(
            Trial(
                "__fake",
                trial_id="pending",
                stopping_criterion={"training_iteration": 2}))

        runner.step()
        runner.step()

        runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
        new_trials = runner2.get_trials()
        self.assertEquals(len(new_trials), 3)
        self.assertTrue(
            runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
        self.assertTrue(
            runner2.get_trial("checkpoint").status == Trial.TERMINATED)
        self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
        self.assertTrue(not runner2.get_trial("pending").last_result)
        runner2.step()
        shutil.rmtree(tmpdir)
Example 25
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster,
                                      trainable_id):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake_remote",
    }

    # The following patches only affect __fake_remote.
    find_checkpoint_dir = TrainableUtil.find_checkpoint_dir
    with patch("ray.tune.logger.get_node_syncer") as mock_get_node_syncer:
        trainable_util = "ray.tune.ray_trial_executor.TrainableUtil"
        with patch(trainable_util + ".find_checkpoint_dir") as mock_find_dir:

            def mock_get_syncer_fn(local_dir, remote_dir, sync_function):
                client = mock_storage_client()
                return MockNodeSyncer(local_dir, remote_dir, client)

            mock_get_node_syncer.side_effect = mock_get_syncer_fn

            def mock_find_dir_fn(checkpoint_path):
                """Converts back to local path first."""
                checkpoint_path = checkpoint_path[len(MOCK_REMOTE_DIR):]
                checkpoint_path = os.path.join("/", checkpoint_path)
                return find_checkpoint_dir(checkpoint_path)

            # __fake_remote trainables save to a separate "remote" directory.
            # TrainableUtil will not check this path unless we mock it.
            mock_find_dir.side_effect = mock_find_dir_fn

            # Test recovery of trial that has been checkpointed
            t1 = Trial(trainable_id, **kwargs)
            runner.add_trial(t1)

            # Start trial, process result (x2), process save
            for _ in range(4):
                runner.step()
            assert t1.has_checkpoint()

            cluster.add_node(num_cpus=1)
            cluster.remove_node(node)
            cluster.wait_for_nodes()
            shutil.rmtree(os.path.dirname(t1.checkpoint.value))

            runner.step()  # Collect result 3, kick off + fail result 4
            runner.step()  # Dispatch restore
            runner.step()  # Process restore + step 4
            for _ in range(3):
                if t1.status != Trial.TERMINATED:
                    runner.step()
    assert t1.status == Trial.TERMINATED, runner.debug_string()
Example 26
def test_trial_migration(start_connected_emptyhead_cluster):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(resources=dict(CPU=1))
    assert cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial("__fake", **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result is not None
    node2 = cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node)
    assert cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    #   because checkpoint handling is messy and should be refactored
    #   rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(3):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial("__fake", **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node2)
    assert cluster.wait_for_nodes()
    runner.step()  # Recovery step
    assert t2.last_result["training_iteration"] == 2
    for i in range(1):
        runner.step()

    assert t2.status == Trial.TERMINATED

    # Test recovery of trial that won't be checkpointed
    t3 = Trial("__fake", **{"stopping_criterion": {"training_iteration": 3}})
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node3)
    assert cluster.wait_for_nodes()
    runner.step()  # Error handling step
    assert t3.status == Trial.ERROR

    with pytest.raises(TuneError):
        runner.step()
Example 27
class TrialRunnerCallbacks(unittest.TestCase):
    def setUp(self):

        ray.init()
        self.tmpdir = tempfile.mkdtemp()
        self.callback = TestCallback()
        self.executor = _MockTrialExecutor()
        self.trial_runner = TrialRunner(trial_executor=self.executor,
                                        callbacks=[self.callback])
        # experiment would never be None normally, but it's fine for testing
        self.trial_runner.setup_experiments(experiments=[None],
                                            total_num_samples=1)

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        shutil.rmtree(self.tmpdir)

    def testCallbackSteps(self):
        trials = [
            Trial("__fake", trial_id="one"),
            Trial("__fake", trial_id="two")
        ]
        for t in trials:
            self.trial_runner.add_trial(t)

        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.PG_READY)
        self.trial_runner.step()

        # Trial 1 has been started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 0)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "one")

        # All these events haven't happened, yet
        self.assertTrue(
            all(k not in self.callback.state for k in [
                "trial_restore",
                "trial_save",
                "trial_result",
                "trial_complete",
                "trial_fail",
                "experiment_end",
            ]))

        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.PG_READY)
        self.trial_runner.step()

        # Iteration not increased yet
        self.assertEqual(self.callback.state["step_begin"]["iteration"], 1)

        # Iteration increased
        self.assertEqual(self.callback.state["step_end"]["iteration"], 2)

        # Second trial has been just started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 1)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "two")

        # Just a placeholder object ref for cp.value.
        cp = _TuneCheckpoint(_TuneCheckpoint.PERSISTENT,
                             value=ray.put(1),
                             result={TRAINING_ITERATION: 0})
        trials[0].saving_to = cp

        # Let the first trial save a checkpoint
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.SAVING_RESULT,
            trial=trials[0],
            result={_ExecutorEvent.KEY_FUTURE_RESULT: "__checkpoint"},
        )
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_save"]["iteration"], 2)
        self.assertEqual(self.callback.state["trial_save"]["trial"].trial_id,
                         "one")

        # Let the second trial send a result
        result = {TRAINING_ITERATION: 1, "metric": 800, "done": False}
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.TRAINING_RESULT,
            trial=trials[1],
            result={"future_result": result},
        )
        self.assertTrue(not trials[1].has_reported_at_least_once)
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_result"]["iteration"], 3)
        self.assertEqual(self.callback.state["trial_result"]["trial"].trial_id,
                         "two")
        self.assertEqual(
            self.callback.state["trial_result"]["result"]["metric"], 800)
        self.assertEqual(trials[1].last_result["metric"], 800)

        # Let the second trial restore from a checkpoint
        trials[1].restoring_from = cp
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.RESTORING_RESULT, trial=trials[1])
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_restore"]["iteration"], 4)
        self.assertEqual(
            self.callback.state["trial_restore"]["trial"].trial_id, "two")

        # Let the second trial finish
        trials[1].restoring_from = None
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.TRAINING_RESULT,
            trial=trials[1],
            result={
                _ExecutorEvent.KEY_FUTURE_RESULT: {
                    TRAINING_ITERATION: 2,
                    "metric": 900,
                    "done": True,
                }
            },
        )
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_complete"]["iteration"], 5)
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].trial_id, "two")

        # Let the first trial error
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.ERROR,
            trial=trials[0],
            result={_ExecutorEvent.KEY_EXCEPTION: Exception()},
        )
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_fail"]["iteration"], 6)
        self.assertEqual(self.callback.state["trial_fail"]["trial"].trial_id,
                         "one")

    def testCallbacksEndToEnd(self):
        def train(config):
            if config["do"] == "save":
                with tune.checkpoint_dir(0):
                    pass
                tune.report(metric=1)
            elif config["do"] == "fail":
                raise RuntimeError("I am failing on purpose.")
            elif config["do"] == "delay":
                time.sleep(2)
                tune.report(metric=20)

        config = {"do": tune.grid_search(["save", "fail", "delay"])}

        tune.run(train,
                 config=config,
                 raise_on_failed_trial=False,
                 callbacks=[self.callback])

        self.assertIn("setup", self.callback.state)
        self.assertTrue(self.callback.state["setup"] is not None)
        keys = Experiment.PUBLIC_KEYS.copy()
        keys.add("total_num_samples")
        for key in keys:
            self.assertIn(key, self.callback.state["setup"])
        # check if it was added first
        self.assertTrue(list(self.callback.state)[0] == "setup")
        self.assertEqual(
            self.callback.state["trial_fail"]["trial"].config["do"], "fail")
        self.assertEqual(
            self.callback.state["trial_save"]["trial"].config["do"], "save")
        self.assertEqual(
            self.callback.state["trial_result"]["trial"].config["do"], "delay")
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].config["do"],
            "delay")
        self.assertIn("experiment_end", self.callback.state)
        # check if it was added last
        self.assertTrue(list(self.callback.state)[-1] == "experiment_end")

    def testCallbackReordering(self):
        """SyncerCallback should come after LoggerCallback callbacks"""
        def get_positions(callbacks):
            first_logger_pos = None
            last_logger_pos = None
            syncer_pos = None
            for i, callback in enumerate(callbacks):
                if isinstance(callback, LoggerCallback):
                    if first_logger_pos is None:
                        first_logger_pos = i
                    last_logger_pos = i
                elif isinstance(callback, SyncerCallback):
                    syncer_pos = i
            return first_logger_pos, last_logger_pos, syncer_pos

        # Auto creation of loggers, no callbacks, no syncer
        callbacks = create_default_callbacks(None, SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with callbacks
        callbacks = create_default_callbacks([Callback()], SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with existing logger (but no CSV/JSON)
        callbacks = create_default_callbacks([LoggerCallback()], SyncConfig(),
                                             None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # This should throw an error as the syncer comes before the logger
        with self.assertRaises(ValueError):
            callbacks = create_default_callbacks(
                [SyncerCallback(None), LoggerCallback()], SyncConfig(), None)

        # This should be reordered but preserve the regular callback order
        [mc1, mc2, mc3] = [Callback(), Callback(), Callback()]
        # Has to be legacy logger to avoid logger callback creation
        lc = LegacyLoggerCallback(logger_classes=DEFAULT_LOGGERS)
        callbacks = create_default_callbacks([mc1, mc2, lc, mc3], SyncConfig(),
                                             None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)
        self.assertLess(callbacks.index(mc1), callbacks.index(mc2))
        self.assertLess(callbacks.index(mc2), callbacks.index(mc3))
        self.assertLess(callbacks.index(lc), callbacks.index(mc3))
        # Syncer callback is appended
        self.assertLess(callbacks.index(mc3), syncer_pos)

    @patch.object(warnings, "warn")
    def testCallbackSetupBackwardsCompatible(self, mocked_warning_method):
        class NoExperimentInSetupCallback(Callback):
            # Old method definition didn't take in **experiment.public_spec
            def setup(self):
                return

        callback = NoExperimentInSetupCallback()
        trial_runner = TrialRunner(callbacks=[callback])
        trial_runner.setup_experiments(
            experiments=[Experiment("", lambda x: x)], total_num_samples=1)
        mocked_warning_method.assert_called_once()
        self.assertIn("Please update",
                      mocked_warning_method.call_args_list[0][0][0])
Example 28
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)


# !!! Example of using the ray.tune Python API !!!
if __name__ == '__main__':
    runner = TrialRunner()

    for act in ['relu', 'elu', 'tanh']:
        runner.add_trial(
            Trial(
                'mnist', 'script',
                stopping_criterion={
                    'mean_accuracy': 0.99, 'time_total_s': 600},
                config={
                    'script_file_path': os.path.abspath(__file__),
                    'script_min_iter_time_s': 1,
                    'activation': act,
                },
                experiment_tag='act={}'.format(act)))

    ray.init()

    while not runner.is_finished():
        runner.step()
        print(runner.debug_string())
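This low-level loop (add_trial, step, debug_string) is the same machinery that run_experiments in Example 1 wraps. A minimal sketch of driving a single trial directly, assuming a trainable has already been registered under the hypothetical name "my_trainable"; the import paths are assumptions and vary across Ray versions.

import ray
from ray.tune.trial import Trial                 # import paths assumed
from ray.tune.trial_runner import TrialRunner

ray.init()

runner = TrialRunner()
runner.add_trial(
    Trial(
        "my_trainable",  # hypothetical registered trainable name
        stopping_criterion={"training_iteration": 10},
    ))

while not runner.is_finished():
    runner.step()  # run one scheduling / result-processing step
    print(runner.debug_string())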
Example 29
class TrialRunnerCallbacks(unittest.TestCase):
    def setUp(self):
        os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1"

        ray.init()
        self.tmpdir = tempfile.mkdtemp()
        self.callback = TestCallback()
        self.executor = _MockTrialExecutor()
        self.trial_runner = TrialRunner(trial_executor=self.executor,
                                        callbacks=[self.callback])

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        shutil.rmtree(self.tmpdir)

    def testCallbackSteps(self):
        trials = [
            Trial("__fake", trial_id="one"),
            Trial("__fake", trial_id="two")
        ]
        for t in trials:
            self.trial_runner.add_trial(t)

        self.executor.next_trial = trials[0]
        self.trial_runner.step()

        # Trial 1 has been started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 0)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "one")

        # All these events haven't happened, yet
        self.assertTrue(
            all(k not in self.callback.state for k in [
                "trial_restore", "trial_save", "trial_result",
                "trial_complete", "trial_fail"
            ]))

        self.executor.next_trial = trials[1]
        self.trial_runner.step()

        # Iteration not increased yet
        self.assertEqual(self.callback.state["step_begin"]["iteration"], 1)

        # Iteration increased
        self.assertEqual(self.callback.state["step_end"]["iteration"], 2)

        # Second trial has been just started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 1)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "two")

        cp = Checkpoint(Checkpoint.PERSISTENT, "__checkpoint",
                        {TRAINING_ITERATION: 0})

        # Let the first trial save a checkpoint
        self.executor.next_trial = trials[0]
        trials[0].saving_to = cp
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_save"]["iteration"], 2)
        self.assertEqual(self.callback.state["trial_save"]["trial"].trial_id,
                         "one")

        # Let the second trial send a result
        result = {TRAINING_ITERATION: 1, "metric": 800, "done": False}
        self.executor.results[trials[1]] = result
        self.executor.next_trial = trials[1]
        self.assertEqual(trials[1].last_result, {})
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_result"]["iteration"], 3)
        self.assertEqual(self.callback.state["trial_result"]["trial"].trial_id,
                         "two")
        self.assertEqual(
            self.callback.state["trial_result"]["result"]["metric"], 800)
        self.assertEqual(trials[1].last_result["metric"], 800)

        # Let the second trial restore from a checkpoint
        trials[1].restoring_from = cp
        self.executor.results[trials[1]] = trials[1].last_result
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_restore"]["iteration"], 4)
        self.assertEqual(
            self.callback.state["trial_restore"]["trial"].trial_id, "two")

        # Let the second trial finish
        trials[1].restoring_from = None
        self.executor.results[trials[1]] = {
            TRAINING_ITERATION: 2,
            "metric": 900,
            "done": True
        }
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_complete"]["iteration"], 5)
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].trial_id, "two")

        # Let the first trial error
        self.executor.failed_trial = trials[0]
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_fail"]["iteration"], 6)
        self.assertEqual(self.callback.state["trial_fail"]["trial"].trial_id,
                         "one")

    def testCallbacksEndToEnd(self):
        def train(config):
            if config["do"] == "save":
                with tune.checkpoint_dir(0):
                    pass
                tune.report(metric=1)
            elif config["do"] == "fail":
                raise RuntimeError("I am failing on purpose.")
            elif config["do"] == "delay":
                time.sleep(2)
                tune.report(metric=20)

        config = {"do": tune.grid_search(["save", "fail", "delay"])}

        tune.run(train,
                 config=config,
                 raise_on_failed_trial=False,
                 callbacks=[self.callback])

        self.assertEqual(
            self.callback.state["trial_fail"]["trial"].config["do"], "fail")
        self.assertEqual(
            self.callback.state["trial_save"]["trial"].config["do"], "save")
        self.assertEqual(
            self.callback.state["trial_result"]["trial"].config["do"], "delay")
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].config["do"],
            "delay")

    def testCallbackReordering(self):
        """SyncerCallback should come after LoggerCallback callbacks"""
        def get_positions(callbacks):
            first_logger_pos = None
            last_logger_pos = None
            syncer_pos = None
            for i, callback in enumerate(callbacks):
                if isinstance(callback, LoggerCallback):
                    if first_logger_pos is None:
                        first_logger_pos = i
                    last_logger_pos = i
                elif isinstance(callback, SyncerCallback):
                    syncer_pos = i
            return first_logger_pos, last_logger_pos, syncer_pos

        # Auto creation of loggers, no callbacks, no syncer
        callbacks = create_default_callbacks(None, SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with callbacks
        callbacks = create_default_callbacks([Callback()], SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with existing logger (but no CSV/JSON)
        callbacks = create_default_callbacks([LoggerCallback()], SyncConfig(),
                                             None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # This should throw an error as the syncer comes before the logger
        with self.assertRaises(ValueError):
            callbacks = create_default_callbacks(
                [SyncerCallback(None), LoggerCallback()], SyncConfig(), None)

        # This should be reordered but preserve the regular callback order
        [mc1, mc2, mc3] = [Callback(), Callback(), Callback()]
        # Has to be legacy logger to avoid logger callback creation
        lc = LegacyLoggerCallback(logger_classes=DEFAULT_LOGGERS)
        callbacks = create_default_callbacks([mc1, mc2, lc, mc3], SyncConfig(),
                                             None)
        print(callbacks)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)
        self.assertLess(callbacks.index(mc1), callbacks.index(mc2))
        self.assertLess(callbacks.index(mc2), callbacks.index(mc3))
        self.assertLess(callbacks.index(lc), callbacks.index(mc3))
        # Syncer callback is appended
        self.assertLess(callbacks.index(mc3), syncer_pos)
Example 30
    def testBurnInPeriod(self):
        runner = TrialRunner(trial_executor=MagicMock())

        scheduler = PopulationBasedTraining(time_attr="training_iteration",
                                            metric="error",
                                            mode="min",
                                            perturbation_interval=5,
                                            burn_in_period=50,
                                            log_config=True,
                                            synch=True)

        class MockTrial(Trial):
            @property
            def checkpoint(self):
                return Checkpoint(Checkpoint.MEMORY, "None", {})

            @property
            def status(self):
                return Trial.PAUSED

            @status.setter
            def status(self, status):
                pass

        trial1 = MockTrial("PPO", config=dict(num=1))
        trial2 = MockTrial("PPO", config=dict(num=2))
        trial3 = MockTrial("PPO", config=dict(num=3))
        trial4 = MockTrial("PPO", config=dict(num=4))

        runner.add_trial(trial1)
        runner.add_trial(trial2)
        runner.add_trial(trial3)
        runner.add_trial(trial4)

        scheduler.on_trial_add(runner, trial1)
        scheduler.on_trial_add(runner, trial2)
        scheduler.on_trial_add(runner, trial3)
        scheduler.on_trial_add(runner, trial4)

        # Add initial results.
        scheduler.on_trial_result(runner,
                                  trial1,
                                  result=dict(training_iteration=1, error=50))
        scheduler.on_trial_result(runner,
                                  trial2,
                                  result=dict(training_iteration=1, error=50))
        scheduler.on_trial_result(runner,
                                  trial3,
                                  result=dict(training_iteration=1, error=10))
        scheduler.on_trial_result(runner,
                                  trial4,
                                  result=dict(training_iteration=1, error=100))

        # Add more results. Without burn-in, this would now exploit
        scheduler.on_trial_result(runner,
                                  trial1,
                                  result=dict(training_iteration=30, error=50))
        scheduler.on_trial_result(runner,
                                  trial2,
                                  result=dict(training_iteration=30, error=50))
        scheduler.on_trial_result(runner,
                                  trial3,
                                  result=dict(training_iteration=30, error=10))
        scheduler.on_trial_result(runner,
                                  trial4,
                                  result=dict(training_iteration=30,
                                              error=100))

        self.assertEqual(trial4.config["num"], 4)

        # Add more results. Since this is after burn-in, it should now exploit
        scheduler.on_trial_result(runner,
                                  trial1,
                                  result=dict(training_iteration=50, error=50))
        scheduler.on_trial_result(runner,
                                  trial2,
                                  result=dict(training_iteration=50, error=50))
        scheduler.on_trial_result(runner,
                                  trial3,
                                  result=dict(training_iteration=50, error=10))
        scheduler.on_trial_result(runner,
                                  trial4,
                                  result=dict(training_iteration=50,
                                              error=100))

        self.assertEqual(trial4.config["num"], 3)
Example no. 31
                    help="The Redis address of the cluster.")
parser.add_argument("--restore", default=None, type=str,
                    help="If specified, restore from this checkpoint.")
parser.add_argument("-f", "--config-file", default=None, type=str,
                    help="If specified, use config options from this file.")


if __name__ == "__main__":
    args = parser.parse_args()
    runner = TrialRunner()

    if args.config_file:
        with open(args.config_file) as f:
            config = yaml.safe_load(f)
        for trial in parse_to_trials(config):
            runner.add_trial(trial)
    else:
        runner.add_trial(
            Trial(
                args.env, args.alg, args.config, args.local_dir, None,
                args.resources, args.stop, args.checkpoint_freq,
                args.restore, args.upload_dir))

    ray.init(redis_address=args.redis_address)

    while not runner.is_finished():
        runner.step()
        print(runner.debug_string())

    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            raise TuneError("Trial did not complete", trial)
Example no. 32
    def testBurnInPeriod(self):
        runner = TrialRunner(trial_executor=MagicMock())

        scheduler = PopulationBasedTraining(
            time_attr="training_iteration",
            metric="error",
            mode="min",
            perturbation_interval=5,
            burn_in_period=50,
            log_config=True,
            synch=True,
        )

        class MockTrial(Trial):
            @property
            def checkpoint(self):
                return _TrackedCheckpoint(
                    dir_or_data={"data": "None"},
                    storage_mode=CheckpointStorage.MEMORY,
                    metrics={},
                )

            @property
            def status(self):
                return Trial.PAUSED

            @status.setter
            def status(self, status):
                pass

        trial1 = MockTrial("PPO", config=dict(num=1))
        trial2 = MockTrial("PPO", config=dict(num=2))
        trial3 = MockTrial("PPO", config=dict(num=3))
        trial4 = MockTrial("PPO", config=dict(num=4))

        runner.add_trial(trial1)
        runner.add_trial(trial2)
        runner.add_trial(trial3)
        runner.add_trial(trial4)

        scheduler.on_trial_add(runner, trial1)
        scheduler.on_trial_add(runner, trial2)
        scheduler.on_trial_add(runner, trial3)
        scheduler.on_trial_add(runner, trial4)

        # Add initial results.
        scheduler.on_trial_result(
            runner, trial1, result=dict(training_iteration=1, error=50)
        )
        scheduler.on_trial_result(
            runner, trial2, result=dict(training_iteration=1, error=50)
        )
        scheduler.on_trial_result(
            runner, trial3, result=dict(training_iteration=1, error=10)
        )
        scheduler.on_trial_result(
            runner, trial4, result=dict(training_iteration=1, error=100)
        )

        # Add more results. Without burn-in, this would now exploit
        scheduler.on_trial_result(
            runner, trial1, result=dict(training_iteration=30, error=50)
        )
        scheduler.on_trial_result(
            runner, trial2, result=dict(training_iteration=30, error=50)
        )
        scheduler.on_trial_result(
            runner, trial3, result=dict(training_iteration=30, error=10)
        )
        scheduler.on_trial_result(
            runner, trial4, result=dict(training_iteration=30, error=100)
        )

        self.assertEqual(trial4.config["num"], 4)

        # Add more results. Since this is after burn-in, it should now exploit
        scheduler.on_trial_result(
            runner, trial1, result=dict(training_iteration=50, error=50)
        )
        scheduler.on_trial_result(
            runner, trial2, result=dict(training_iteration=50, error=50)
        )
        scheduler.on_trial_result(
            runner, trial3, result=dict(training_iteration=50, error=10)
        )
        scheduler.on_trial_result(
            runner, trial4, result=dict(training_iteration=50, error=100)
        )

        self.assertEqual(trial4.config["num"], 3)

        # Assert that trials do not hang after `burn_in_period`
        self.assertTrue(all(t.status == "PAUSED" for t in runner.get_trials()))
        self.assertTrue(scheduler.choose_trial_to_run(runner))

        # Assert that trials do not hang when a terminated trial is added
        trial5 = Trial("PPO", config=dict(num=5))
        runner.add_trial(trial5)
        scheduler.on_trial_add(runner, trial5)
        trial5.set_status(Trial.TERMINATED)
        self.assertTrue(scheduler.choose_trial_to_run(runner))
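A minimal sketch of how a PopulationBasedTraining scheduler with a
burn_in_period like the one tested above might be handed to tune.run; the
trainable, the "decay" hyperparameter, and its mutation values are
illustrative assumptions, not taken from the test.

from ray import tune
from ray.tune.schedulers import PopulationBasedTraining


def train_fn(config):
    # Hypothetical trainable: report a shrinking "error" so PBT has a
    # metric to minimize.
    error = 1.0
    for _ in range(100):
        error *= config["decay"]
        tune.report(error=error)


pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="error",
    mode="min",
    perturbation_interval=5,
    burn_in_period=50,  # no exploit/explore decisions before iteration 50
    hyperparam_mutations={"decay": [0.9, 0.95, 0.99]},
)

tune.run(train_fn, config={"decay": 0.95}, num_samples=4, scheduler=pbt)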
Example no. 33
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster,
                                      trainable_id):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    class _SyncerCallback(SyncerCallback):
        def _create_trial_syncer(self, trial: "Trial"):
            client = mock_storage_client()
            return MockNodeSyncer(trial.logdir, trial.logdir, client)

    syncer_callback = _SyncerCallback(None)
    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }

    # The following patches only affect __fake_remote.
    def hide_remote_path(path_function):
        def hidden_path_func(checkpoint_path):
            """Converts back to local path first."""
            if MOCK_REMOTE_DIR in checkpoint_path:
                checkpoint_path = checkpoint_path[len(MOCK_REMOTE_DIR):]
                checkpoint_path = os.path.join("/", checkpoint_path)
            return path_function(checkpoint_path)

        return hidden_path_func

    trainable_util = "ray.tune.ray_trial_executor.TrainableUtil"
    _find_ckpt = trainable_util + ".find_checkpoint_dir"
    find_func = TrainableUtil.find_checkpoint_dir
    _pickle_ckpt = trainable_util + ".pickle_checkpoint"
    pickle_func = TrainableUtil.pickle_checkpoint

    with patch(_find_ckpt) as mock_find, patch(_pickle_ckpt) as mock_pkl_ckpt:
        # __fake_remote trainables save to a separate "remote" directory.
        # TrainableUtil will not check this path unless we mock it.
        mock_find.side_effect = hide_remote_path(find_func)
        mock_pkl_ckpt.side_effect = hide_remote_path(pickle_func)

        # Test recovery of trial that has been checkpointed
        t1 = Trial(trainable_id, **kwargs)
        runner.add_trial(t1)

        # Start trial, process result (x2), process save
        for _ in range(4):
            runner.step()
        assert t1.has_checkpoint()

        cluster.add_node(num_cpus=1)
        cluster.remove_node(node)
        cluster.wait_for_nodes()
        shutil.rmtree(os.path.dirname(t1.checkpoint.value))
        runner.step()  # Collect result 3, kick off + fail result 4
        runner.step()  # Dispatch restore
        runner.step()  # Process restore + step 4
        for _ in range(3):
            if t1.status != Trial.TERMINATED:
                runner.step()
    assert t1.status == Trial.TERMINATED, runner.debug_string()
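Trial-level fault tolerance like the recovery tested above can also be driven
through tune.run. A minimal sketch, assuming the function-trainable checkpoint
API (tune.checkpoint_dir, also used in the callback test further below); the
state file name, step counts, and max_failures value are illustrative.

import os
import pickle

from ray import tune


def train_fn(config, checkpoint_dir=None):
    start = 0
    if checkpoint_dir:
        # Resume from the most recent checkpoint after a failure/migration.
        with open(os.path.join(checkpoint_dir, "state.pkl"), "rb") as f:
            start = pickle.load(f)["step"] + 1
    for step in range(start, 8):
        if step % 2 == 0:  # checkpoint every other step, like checkpoint_freq=2
            with tune.checkpoint_dir(step=step) as ckpt_dir:
                with open(os.path.join(ckpt_dir, "state.pkl"), "wb") as f:
                    pickle.dump({"step": step}, f)
        tune.report(score=step)


# max_failures lets Tune retry the trial and restore it from its latest
# checkpoint, which is what the TrialRunner steps above verify manually.
tune.run(train_fn, max_failures=2)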
Example no. 34
def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    syncer_callback = _PerTrialSyncerCallback(
        lambda trial: trial.trainable_name == "__fake")
    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial(trainable_id, **kwargs)
    runner.add_trial(t)
    runner.step()  # Start trial
    runner.step()  # Process result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    # TODO(ujvl): Node failure does not propagate until a step after it
    #  actually should. This is possibly a problem with `Cluster`.
    runner.step()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    #   because checkpoint handling is messy and should be refactored
    #   rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."

    # Process result (x2), process save, process result (x2), process save
    for _ in range(6):
        runner.step()

    assert t.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that has been checkpointed
    t2 = Trial(trainable_id, **kwargs)
    runner.add_trial(t2)
    # Start trial, process result (x2), process save
    for _ in range(4):
        runner.step()
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # Process result 3 + start and fail result 4
    runner.step()  # Dispatch restore
    runner.step()  # Process restore
    runner.step()  # Process result 5
    if t2.status != Trial.TERMINATED:
        runner.step()  # Process result 6, dispatch save
        runner.step()  # Process save
    assert t2.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that won't be checkpointed
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }
    t3 = Trial(trainable_id, **kwargs)
    runner.add_trial(t3)
    runner.step()  # Start trial
    runner.step()  # Process result 1
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    if t3.status != Trial.ERROR:
        runner.step()
    assert t3.status == Trial.ERROR, runner.debug_string()

    with pytest.raises(TuneError):
        runner.step()
Example no. 35
def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake",
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial(trainable_id, **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    #   because checkpoint handling is messy and should be refactored
    #   rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(4):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial(trainable_id, **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # 3 result + start and fail result 4
    runner.step()  # Recovery step
    runner.step()  # Process recovery
    runner.step()  # result
    if t2.status != Trial.TERMINATED:
        runner.step()
    assert t2.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that won't be checkpointed
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake",
    }
    t3 = Trial(trainable_id, **kwargs)
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    if t3.status != Trial.ERROR:
        runner.step()
    assert t3.status == Trial.ERROR, runner.debug_string()

    with pytest.raises(TuneError):
        runner.step()
Example no. 36
class TrialRunnerCallbacks(unittest.TestCase):
    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()
        self.callback = TestCallback()
        self.executor = _MockTrialExecutor()
        self.trial_runner = TrialRunner(trial_executor=self.executor,
                                        callbacks=[self.callback])

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        shutil.rmtree(self.tmpdir)

    def testCallbackSteps(self):
        trials = [
            Trial("__fake", trial_id="one"),
            Trial("__fake", trial_id="two")
        ]
        for t in trials:
            self.trial_runner.add_trial(t)

        self.executor.next_trial = trials[0]
        self.trial_runner.step()

        # Trial 1 has been started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 0)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "one")

        # None of these events has happened yet
        self.assertTrue(
            all(k not in self.callback.state for k in [
                "trial_restore", "trial_save", "trial_result",
                "trial_complete", "trial_fail"
            ]))

        self.executor.next_trial = trials[1]
        self.trial_runner.step()

        # Iteration not increased yet
        self.assertEqual(self.callback.state["step_begin"]["iteration"], 1)

        # Iteration increased
        self.assertEqual(self.callback.state["step_end"]["iteration"], 2)

        # The second trial has just been started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 1)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "two")

        cp = Checkpoint(Checkpoint.PERSISTENT, "__checkpoint",
                        {TRAINING_ITERATION: 0})

        # Let the first trial save a checkpoint
        self.executor.next_trial = trials[0]
        trials[0].saving_to = cp
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_save"]["iteration"], 2)
        self.assertEqual(self.callback.state["trial_save"]["trial"].trial_id,
                         "one")

        # Let the second trial send a result
        result = {TRAINING_ITERATION: 1, "metric": 800, "done": False}
        self.executor.results[trials[1]] = result
        self.executor.next_trial = trials[1]
        self.assertEqual(trials[1].last_result, {})
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_result"]["iteration"], 3)
        self.assertEqual(self.callback.state["trial_result"]["trial"].trial_id,
                         "two")
        self.assertEqual(
            self.callback.state["trial_result"]["result"]["metric"], 800)
        self.assertEqual(trials[1].last_result["metric"], 800)

        # Let the second trial restore from a checkpoint
        trials[1].restoring_from = cp
        self.executor.results[trials[1]] = trials[1].last_result
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_restore"]["iteration"], 4)
        self.assertEqual(
            self.callback.state["trial_restore"]["trial"].trial_id, "two")

        # Let the second trial finish
        trials[1].restoring_from = None
        self.executor.results[trials[1]] = {
            TRAINING_ITERATION: 2,
            "metric": 900,
            "done": True
        }
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_complete"]["iteration"], 5)
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].trial_id, "two")

        # Let the first trial error
        self.executor.failed_trial = trials[0]
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_fail"]["iteration"], 6)
        self.assertEqual(self.callback.state["trial_fail"]["trial"].trial_id,
                         "one")

    def testCallbacksEndToEnd(self):
        def train(config):
            if config["do"] == "save":
                with tune.checkpoint_dir(0):
                    pass
                tune.report(metric=1)
            elif config["do"] == "fail":
                raise RuntimeError("I am failing on purpose.")
            elif config["do"] == "delay":
                time.sleep(2)
                tune.report(metric=20)

        config = {"do": tune.grid_search(["save", "fail", "delay"])}

        tune.run(train,
                 config=config,
                 raise_on_failed_trial=False,
                 callbacks=[self.callback])

        self.assertEqual(
            self.callback.state["trial_fail"]["trial"].config["do"], "fail")
        self.assertEqual(
            self.callback.state["trial_save"]["trial"].config["do"], "save")
        self.assertEqual(
            self.callback.state["trial_result"]["trial"].config["do"], "delay")
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].config["do"],
            "delay")
Example no. 37
def test_trial_migration(start_connected_emptyhead_cluster):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial("__fake", **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    #   because checkpoint handling is messy and should be refactored
    #   rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(3):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial("__fake", **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step
    assert t2.last_result["training_iteration"] == 2
    for i in range(1):
        runner.step()

    assert t2.status == Trial.TERMINATED

    # Test recovery of trial that won't be checkpointed
    t3 = Trial("__fake", **{"stopping_criterion": {"training_iteration": 3}})
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    assert t3.status == Trial.ERROR

    with pytest.raises(TuneError):
        runner.step()