Example #1: tuning the learning rate of a HorovodTrainer with a Tuner.
def tune_horovod(num_workers, num_samples, use_gpu, mode="square", x_max=1.0):
    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu
        },
        train_loop_config={
            "mode": mode,
            "x_max": x_max
        },
    )

    tuner = Tuner(
        horovod_trainer,
        param_space={"train_loop_config": {
            "lr": tune.uniform(0.1, 1)
        }},
        tune_config=TuneConfig(mode="min",
                               metric="loss",
                               num_samples=num_samples),
        _tuner_kwargs={"fail_fast": True},
    )

    result_grid = tuner.fit()

    print("Best hyperparameters found were: ",
          result_grid.get_best_result().config)
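The train_loop_per_worker referenced above is defined elsewhere in the source file. A minimal hypothetical sketch of a compatible loop, assuming the toy objective suggested by the mode/x_max options (not the original implementation):

from ray.air import session

def train_loop_per_worker(config):
    # Hypothetical toy objective controlled by "mode" and "x_max".
    lr = config["lr"]
    x = config.get("x_max", 1.0)
    for _ in range(20):
        loss = x ** 2 if config.get("mode") == "square" else abs(x)
        # Report the metric that TuneConfig(metric="loss", mode="min") optimizes.
        session.report({"loss": loss})
        x -= lr * x  # crude descent-style update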
Example #2: test that a failing trial restarts from its latest checkpoint under FailureConfig(max_failures=3).
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )

    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
Example #3: test of a successful Tuner run over a TorchTrainer with an 8-trial grid search.
 def test_tuner_with_torch_trainer(self):
     """Test a successful run using torch trainer."""
     shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"),
                   ignore_errors=True)
     # Two of these (batch_size and epochs) are tuned via param_space below.
     config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
     scaling_config = {"num_workers": 1, "use_gpu": False}
     trainer = TorchTrainer(
         train_loop_per_worker=linear_train_func,
         train_loop_config=config,
         scaling_config=scaling_config,
     )
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         "train_loop_config": {
             "batch_size": tune.grid_search([4, 8]),
             "epochs": tune.grid_search([5, 10]),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="loss"),
     )
     results = tuner.fit()
     assert len(results) == 8
Example #4: tuning a TorchTrainer with train/validation datasets over learning rate and batch size.
def tune_linear(num_workers, num_samples, use_gpu):
    train_dataset, val_dataset = get_datasets()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    scaling_config = {"num_workers": num_workers, "use_gpu": use_gpu}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={
            "train": train_dataset,
            "validation": val_dataset
        },
    )

    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([4, 16, 32]),
                "epochs": 3,
            }
        },
        tune_config=TuneConfig(num_samples=num_samples,
                               metric="loss",
                               mode="min"),
    )
    result_grid = tuner.fit()
    best_result = result_grid.get_best_result()
    print(best_result)
    return best_result
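The train_func and get_datasets helpers above are defined elsewhere in the source file. A hypothetical sketch of a train loop that consumes the "train" dataset and reports the "loss" metric named in the TuneConfig:

from ray.air import session

def train_func(config):
    # Hypothetical sketch; the real train_func lives elsewhere in the source.
    train_shard = session.get_dataset_shard("train")
    for epoch in range(config["epochs"]):
        for batch in train_shard.iter_batches(batch_size=config["batch_size"]):
            pass  # forward/backward pass using config["lr"] would go here
        # Report the metric named in TuneConfig(metric="loss", mode="min").
        session.report({"loss": 1.0 / (epoch + 1)})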
Example #5: offline behavior-cloning RL training with an RLTrainer wrapped in a Tuner.
def train_rl_bc_offline(path: str,
                        num_workers: int,
                        use_gpu: bool = False) -> Result:
    print("Starting offline training")
    dataset = ray.data.read_json(path,
                                 parallelism=num_workers,
                                 ray_remote_args={"num_cpus": 1})

    trainer = RLTrainer(
        run_config=RunConfig(stop={"training_iteration": 5}),
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu,
        },
        datasets={"train": dataset},
        algorithm=BCTrainer,
        config={
            "env": "CartPole-v0",
            "framework": "tf",
            "evaluation_num_workers": 1,
            "evaluation_interval": 1,
            "evaluation_config": {
                "input": "sampler"
            },
        },
    )

    # Todo (krfricke/xwjiang): Enable checkpoint config in RunConfig
    # result = trainer.fit()
    tuner = Tuner(
        trainer,
        _tuner_kwargs={"checkpoint_at_end": True},
    )
    result = tuner.fit()[0]
    return result
Example #6: test of a DataParallelTrainer tuned under a ResourceChangingScheduler with ASHA.
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # + 1 for Trainable
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
Example #7: online PPO training with an RLTrainer wrapped in a Tuner.
def train_rl_ppo_online(num_workers: int, use_gpu: bool = False) -> Result:
    print("Starting online training")
    trainer = RLTrainer(
        run_config=RunConfig(stop={"training_iteration": 5}),
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu,
        },
        algorithm="PPO",
        config={
            "env": "CartPole-v0",
            "framework": "tf",
            "evaluation_num_workers": 1,
            "evaluation_interval": 1,
            "evaluation_config": {
                "input": "sampler"
            },
        },
    )
    # Todo (krfricke/xwjiang): Enable checkpoint config in RunConfig
    # result = trainer.fit()
    tuner = Tuner(
        trainer,
        _tuner_kwargs={"checkpoint_at_end": True},
    )
    result = tuner.fit()[0]
    return result
Example #8: test that checkpoint_at_end=None is accepted for a function trainable.
def test_tuner_fn_trainable_checkpoint_at_end_none():
    tuner = Tuner(
        lambda config, checkpoint_dir: 1,
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_at_end=None)),
    )
    tuner.fit()
Example #9: test that checkpoint_at_end=True raises TuneError for a function trainable without checkpointing.
def test_tuner_fn_trainable_checkpoint_at_end_true():
    tuner = Tuner(
        lambda config, checkpoint_dir: 1,
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_at_end=True)),
    )
    with pytest.raises(TuneError):
        tuner.fit()
Example #10: test that Tuner forwards its keyword arguments to the underlying tune run call.
def test_tuner_api_kwargs(params_expected):
    tuner_params, assertion = params_expected

    tuner = Tuner(lambda config: 1, **tuner_params)

    caught_kwargs = {}

    def catch_kwargs(**kwargs):
        caught_kwargs.update(kwargs)

    with patch("ray.tune.impl.tuner_internal.run", catch_kwargs):
        tuner.fit()

    assert assertion(caught_kwargs)
Example #11: test that an exception raised in the training function surfaces as the trial's error.
def test_tune_error(ray_start_4_cpus):
    def train_func(config):
        raise RuntimeError("Error in training function!")

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
    )

    result_grid = tuner.fit()
    with pytest.raises(RuntimeError):
        raise result_grid[0].error
Example #12: test that a failing trainer produces an errored result for every trial.
 def test_tuner_trainer_fail(self):
     trainer = FailingTrainer()
     param_space = {
         "scaling_config":
         ScalingConfig(num_workers=tune.grid_search([1, 2]))
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner_trainer_fail"),
         param_space=param_space,
         tune_config=TuneConfig(mode="max", metric="iteration"),
     )
     results = tuner.fit()
     assert len(results) == 2
     for i in range(2):
         assert results[i].error
Example #13: test of a Tuner run over an XGBoostTrainer with dataset and scaling grid searches (4 trials).
 def test_tuner_with_xgboost_trainer(self):
     """Test a successful run."""
     shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"),
                   ignore_errors=True)
     trainer = XGBoostTrainer(
         label_column="target",
         params={},
         datasets={"train": gen_dataset_func_eager()},
     )
     # prep_v1 = StandardScaler(["worst radius", "worst area"])
     # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
         "datasets": {
             "train":
             tune.grid_search(
                 [gen_dataset_func(),
                  gen_dataset_func(do_shuffle=True)]),
         },
         "params": {
             "objective": "binary:logistic",
             "tree_method": "approx",
             "eval_metric": ["logloss", "error"],
             "eta": tune.loguniform(1e-4, 1e-1),
             "subsample": tune.uniform(0.5, 1.0),
             "max_depth": tune.randint(1, 9),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="train-error"),
         # Limit the number of concurrently running trials, since the unit
         # test only has access to 4 CPUs on Buildkite.
         _tuner_kwargs={"max_concurrent_trials": 1},
     )
     results = tuner.fit()
     assert not isinstance(results.get_best_result().checkpoint,
                           TrialCheckpoint)
     assert len(results) == 4
Example #14: test of a Tuner run over an XGBoostTrainer with a scaling grid search (2 trials).
 def test_tuner_with_xgboost_trainer(self):
     """Test a successful run."""
     shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"),
                   ignore_errors=True)
     trainer = XGBoostTrainer(
         label_column="target",
         params={},
         # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
         datasets={"train": gen_dataset_func_eager()},
     )
     # prep_v1 = StandardScaler(["worst radius", "worst area"])
     # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
         #  is resolved.
         # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
         # "datasets": {
         #     "train": tune.choice(
         #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
         #     ),
         # },
         "params": {
             "objective": "binary:logistic",
             "tree_method": "approx",
             "eval_metric": ["logloss", "error"],
             "eta": tune.loguniform(1e-4, 1e-1),
             "subsample": tune.uniform(0.5, 1.0),
             "max_depth": tune.randint(1, 9),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="train-error"),
     )
     results = tuner.fit()
     assert not isinstance(results.get_best_result().checkpoint,
                           TrialCheckpoint)
     assert len(results) == 2
Example #15: tuning a TensorflowTrainer on MNIST and reporting the best loss and accuracy results.
def tune_tensorflow_mnist(num_workers, num_samples):
    trainer = TensorflowTrainer(
        train_func, scaling_config=ScalingConfig(num_workers=num_workers))
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            },
        },
        tune_config=TuneConfig(num_samples=num_samples),
    )
    analysis = tuner.fit()
    best_loss = analysis.get_best_result(metric="loss", mode="min")
    best_accuracy = analysis.get_best_result(metric="accuracy", mode="max")
    print(f"Best loss result: {best_loss}")
    print(f"Best accuracy result: {best_accuracy}")
    return analysis
Example #16: test that param_space overrides train_loop_config without mutating the original Trainer.
def test_tune(ray_start_4_cpus):
    def train_func(config):
        session.report({"loss": config["x"]})

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"x": 100},
        scaling_config=scale_config,
    )

    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"x": tune.choice([200, 300])}},
        tune_config=TuneConfig(num_samples=2),
    )
    result_grid = tuner.fit()
    assert result_grid[0].metrics["loss"] in [200, 300]

    # Make sure original Trainer is not affected.
    assert trainer._train_loop_config["x"] == 100
Example #17: test that a trainer whose training_loop raises reports an error for every trial.
    def test_tuner_trainer_fail(self):
        class DummyTrainer(Trainer):
            def training_loop(self) -> None:
                raise RuntimeError("There is an error in trainer!")

        trainer = DummyTrainer()
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            }
        }
        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner_trainer_fail"),
            param_space=param_space,
            tune_config=TuneConfig(mode="max", metric="iteration"),
        )
        results = tuner.fit()
        assert len(results) == 2
        for i in range(2):
            assert results[i].error
Example #18: test that a restored Tuner resumes from the latest checkpoint.
def test_reuse_checkpoint(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = session.get_checkpoint()
        if ckpt is not None:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )
    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 10}},
    ).restore(trial.local_dir)
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
Example #19: test of a successful Tuner run over a TorchTrainer with an 8-trial grid search.
 def test_tuner_with_torch_trainer(self):
     """Test a successful run using torch trainer."""
     shutil.rmtree(
         os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
     )
     # Two of these (batch_size and epochs) are tuned via param_space below.
     config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
     scaling_config = {"num_workers": 1, "use_gpu": False}
     trainer = TorchTrainer(
         train_loop_per_worker=linear_train_func,
         train_loop_config=config,
         scaling_config=scaling_config,
     )
     # prep_v1 = StandardScaler(["worst radius", "worst area"])
     # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
         #  is resolved.
         # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
         # "datasets": {
         #     "train": tune.choice(
         #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
         #     ),
         # },
         "train_loop_config": {
             "batch_size": tune.grid_search([4, 8]),
             "epochs": tune.grid_search([5, 10]),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="loss"),
     )
     results = tuner.fit()
     assert len(results) == 8
Example #20: tuning a TensorflowTrainer on MNIST for the best accuracy.
def tune_tensorflow_mnist(num_workers: int = 2,
                          num_samples: int = 2,
                          use_gpu: bool = False):
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        tune_config=TuneConfig(num_samples=num_samples,
                               metric="accuracy",
                               mode="max"),
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            }
        },
    )
    best_accuracy = tuner.fit().get_best_result().metrics["accuracy"]
    print(f"Best accuracy config: {best_accuracy}")
Example #21: tuning a TorchTrainer on Fashion-MNIST and checking that the loss decreases within each trial.
def torch_fashion_mnist(num_workers, use_gpu, num_samples):
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 2,
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis

    # Check that loss decreases in each trial.
    for path, df in analysis.trial_dataframes.items():
        assert df.loc[1, "loss"] < df.loc[0, "loss"]
Example #22: test that a checkpoint reported on the final iteration is written to disk.
def test_tune_checkpoint(ray_start_4_cpus):
    def train_func():
        for i in range(9):
            session.report(dict(test=i))
        session.report(
            dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world"))
        )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )

    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
Example #23: test of train.torch.get_device() inside tuned multi-GPU workers.
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report(device_id=train.torch.get_device().index)

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
Example #24: test of an XGBoostTrainer tuned under a ResourceChangingScheduler with ASHA.
def test_gbdt_trainer(ray_start_8_cpus):
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    train_ds = ray.data.from_pandas(dataset_df).repartition(16)
    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )
    tuner = Tuner(
        trainer,
        param_space={
            "num_boost_round": 100,
            "params": {
                "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
            },
        },
        tune_config=TuneConfig(
            mode="min",
            metric="train-logloss",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
Example #25: the Trainer.fit() implementation, which wraps the trainable in a single-trial Tuner run.
    def fit(self) -> Result:
        """Runs training.

        Returns:
            A Result object containing the training result.

        Raises:
            TrainingFailedError: If any failures occur during the execution
            of ``self.as_trainable()``.
        """
        from ray.tune.tuner import Tuner

        trainable = self.as_trainable()

        tuner = Tuner(trainable=trainable, run_config=self.run_config)
        result_grid = tuner.fit()
        assert len(result_grid) == 1
        try:
            result = result_grid[0]
            if result.error:
                raise result.error
        except TuneError:
            raise TrainingFailedError
        return result
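A minimal usage sketch for this method, assuming trainer is a concrete Trainer subclass (e.g. TorchTrainer); the import path of TrainingFailedError may differ across Ray versions:

from ray.train.base_trainer import TrainingFailedError  # path may vary by Ray version

try:
    result = trainer.fit()
    print(result.metrics)
except TrainingFailedError:
    print("Training failed inside the wrapped single-trial Tuner run.")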
Example #26: documentation snippet: inspecting an XGBoost result, tuning max_depth with a Tuner, and creating a BatchPredictor (truncated).
result = trainer.fit()
# __air_trainer_end__

# __air_trainer_output_start__
print(result.metrics)
print(result.checkpoint)
# __air_trainer_output_end__

# __air_tuner_start__
from ray import tune
from ray.tune.tuner import Tuner, TuneConfig

tuner = Tuner(
    trainer,
    param_space={"params": {"max_depth": tune.randint(1, 9)}},
    tune_config=TuneConfig(num_samples=5, metric="train-logloss", mode="min"),
)
result_grid = tuner.fit()
best_result = result_grid.get_best_result()
print(best_result)
# __air_tuner_end__

# __air_batch_predictor_start__
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor

batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor)

# Bulk batch prediction.
predicted_labels = (
Example #27: documentation snippet: running four trials in parallel over a shared synthetic dataset with DummyTrainer (truncated).
# __shared_dataset_start__
import ray
from ray.ml.utils.check_ingest import DummyTrainer
from ray.tune.tuner import Tuner, TuneConfig

ray.init(num_cpus=5)

# Generate a synthetic 100MiB tensor dataset.
dataset = ray.data.range_tensor(500, shape=(80, 80, 4), parallelism=10)

# Create an example trainer that simply loops over the data a few times.
trainer = DummyTrainer(datasets={"train": dataset}, runtime_seconds=1)

# Run the Trainer 4x in parallel with Tune.
tuner = Tuner(
    trainer,
    tune_config=TuneConfig(num_samples=4),
)
tuner.fit()
# __shared_dataset_end__

ray.shutdown()

# __indep_dataset_start__
import ray
from ray import tune
from ray.ml.utils.check_ingest import DummyTrainer
from ray.tune.tuner import Tuner, TuneConfig

ray.init(num_cpus=5)

Example #28: test that a trainer's RunConfig is carried over to the Tuner.
    def test_tuner_run_config_override(self):
        trainer = DummyTrainer(run_config=RunConfig(stop={"metric": 4}))
        tuner = Tuner(trainer)

        assert tuner._local_tuner._run_config.stop == {"metric": 4}
Example #29: test of resuming a Tuner run after an injected driver failure.
    def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
        # So that we have some global checkpointing happening.
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
        shutil.rmtree(
            os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
            ignore_errors=True,
        )
        trainer = XGBoostTrainer(
            label_column="target",
            params={},
            # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
            datasets={"train": gen_dataset_func_eager()},
        )
        # prep_v1 = StandardScaler(["worst radius", "worst area"])
        # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            },
            # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
            #  is resolved.
            # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
            # "datasets": {
            #     "train": tune.choice(
            #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            #     ),
            # },
            "params": {
                "objective": "binary:logistic",
                "tree_method": "approx",
                "eval_metric": ["logloss", "error"],
                "eta": tune.loguniform(1e-4, 1e-1),
                "subsample": tune.uniform(0.5, 1.0),
                "max_depth": tune.randint(1, 9),
            },
        }

        class FailureInjectionCallback(Callback):
            """Inject failure at the configured iteration number."""
            def __init__(self, num_iters=10):
                self.num_iters = num_iters

            def on_step_end(self, iteration, trials, **kwargs):
                if iteration == self.num_iters:
                    print(f"Failing after {self.num_iters} iters.")
                    raise RuntimeError

        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner_driver_fail",
                                 callbacks=[FailureInjectionCallback()]),
            param_space=param_space,
            tune_config=TuneConfig(mode="min", metric="train-error"),
        )
        with self.assertRaises(TuneError):
            tuner.fit()

        # Test resume
        restore_path = os.path.join(DEFAULT_RESULTS_DIR,
                                    "test_tuner_driver_fail")
        tuner = Tuner.restore(restore_path)
        # A hack before we figure out RunConfig semantics across resumes.
        tuner._local_tuner._run_config.callbacks = None
        results = tuner.fit()
        assert len(results) == 2
Example #30: fragment of a Horovod PBT example showing the Tuner setup (the trainer definition above it is truncated).
            },
        },
    )

    tuner = Tuner(
        horovod_trainer,
        param_space={
            "train_loop_config": {
                "lr":
                0.1 if args.smoke_test else tune.grid_search(
                    [0.1 * i for i in range(1, 10)])
            }
        },
        tune_config=TuneConfig(
            num_samples=2 if args.smoke_test else 1,
            metric="loss",
            mode="min",
            scheduler=pbt,
        ),
        run_config=RunConfig(
            stop={"training_iteration": 1} if args.smoke_test else None,
            callbacks=[ProgressCallback()],
        ),
        _tuner_kwargs={
            "fail_fast": False,
            "keep_checkpoints_num": 1
        },
    )

    result_grid = tuner.fit()

    # Make sure trials do not fail.