Example #1
 def test_tuner_with_torch_trainer(self):
     """Test a successful run using torch trainer."""
     shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"),
                   ignore_errors=True)
     # `batch_size` and `epochs` below are overridden by the grid search in `param_space`.
     config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
     scaling_config = {"num_workers": 1, "use_gpu": False}
     trainer = TorchTrainer(
         train_loop_per_worker=linear_train_func,
         train_loop_config=config,
         scaling_config=scaling_config,
     )
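     # Entries in `param_space` override the matching Trainer arguments per
     # trial; three grid axes of size two give 2 * 2 * 2 = 8 trials.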
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         "train_loop_config": {
             "batch_size": tune.grid_search([4, 8]),
             "epochs": tune.grid_search([5, 10]),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="loss"),
     )
     results = tuner.fit()
     assert len(results) == 8
Example #2
    def __init__(
        self,
        restore_path: Optional[str] = None,
        trainable: Optional[
            Union[str, Callable, Type[Trainable], BaseTrainer]
        ] = None,
        param_space: Optional[Dict[str, Any]] = None,
        tune_config: Optional[TuneConfig] = None,
        run_config: Optional[RunConfig] = None,
        _tuner_kwargs: Optional[Dict] = None,
    ):
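        # `_TRAINABLE_PKL` and `_TUNER_PKL` name the pickle files that this
        # constructor writes at the end of a fresh run (see below); restoring
        # simply loads them back.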
        # Restored from Tuner checkpoint.
        if restore_path:
            trainable_ckpt = os.path.join(restore_path, _TRAINABLE_PKL)
            with open(trainable_ckpt, "rb") as fp:
                trainable = pickle.load(fp)

            tuner_ckpt = os.path.join(restore_path, _TUNER_PKL)
            with open(tuner_ckpt, "rb") as fp:
                tuner = pickle.load(fp)
                self.__dict__.update(tuner.__dict__)

            self._is_restored = True
            self._trainable = trainable
            self._experiment_checkpoint_dir = restore_path
            return

        # Start from scratch.
        if not trainable:
            raise TuneError("You need to provide a trainable to tune.")

        # If no run config was passed to the Tuner directly, use the one from
        # the Trainer, if available.
        if not run_config and isinstance(trainable, BaseTrainer):
            run_config = trainable.run_config

        self._is_restored = False
        self._trainable = trainable
        self._tune_config = tune_config or TuneConfig()
        self._run_config = run_config or RunConfig()
        self._tuner_kwargs = copy.deepcopy(_tuner_kwargs) or {}
        self._experiment_checkpoint_dir = self._setup_create_experiment_checkpoint_dir(
            self._run_config)

        # Not used for restored Tuner.
        self._param_space = param_space or {}

        # This needs to happen before `tune.run()` is kicked off. Currently,
        # Tune does not exit gracefully when run in Ray client mode: if a
        # crash happens, it exits immediately, without giving the tuner and
        # trainable a chance to checkpoint. Writing these files up front
        # ensures there is something to restore from.
        tuner_ckpt = os.path.join(self._experiment_checkpoint_dir, _TUNER_PKL)
        with open(tuner_ckpt, "wb") as fp:
            pickle.dump(self, fp)

        trainable_ckpt = os.path.join(self._experiment_checkpoint_dir,
                                      _TRAINABLE_PKL)
        with open(trainable_ckpt, "wb") as fp:
            pickle.dump(self._trainable, fp)
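
A minimal restore round trip, sketched under the assumption that the public `Tuner.restore(path)` classmethod routes `restore_path` into this constructor; `my_trainer` and the results path are placeholders:

# Hypothetical usage: the constructor pickles the tuner and trainable up
# front, so a crashed run can be resumed from its experiment directory.
tuner = Tuner(trainable=my_trainer)
# ... the driver crashes mid-run ...
tuner = Tuner.restore("~/ray_results/my_experiment")
results = tuner.fit()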
Example #3
def tune_linear(num_workers, num_samples, use_gpu):
    train_dataset, val_dataset = get_datasets()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    scaling_config = {"num_workers": num_workers, "use_gpu": use_gpu}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={
            "train": train_dataset,
            "validation": val_dataset
        },
    )

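    # Only `train_loop_config` is tuned here; the scaling config passed to the
    # Trainer is reused unchanged for every trial.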
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([4, 16, 32]),
                "epochs": 3,
            }
        },
        tune_config=TuneConfig(num_samples=num_samples,
                               metric="loss",
                               mode="min"),
    )
    result_grid = tuner.fit()
    best_result = result_grid.get_best_result()
    print(best_result)
    return best_result
Example #4
def tune_horovod(num_workers, num_samples, use_gpu, mode="square", x_max=1.0):
    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu
        },
        train_loop_config={
            "mode": mode,
            "x_max": x_max
        },
    )

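    # `_tuner_kwargs` passes extra keyword arguments (here `fail_fast`)
    # through to the underlying Tune run.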
    tuner = Tuner(
        horovod_trainer,
        param_space={"train_loop_config": {
            "lr": tune.uniform(0.1, 1)
        }},
        tune_config=TuneConfig(mode="min",
                               metric="loss",
                               num_samples=num_samples),
        _tuner_kwargs={"fail_fast": True},
    )

    result_grid = tuner.fit()

    print("Best hyperparameters found were: ",
          result_grid.get_best_result().config)
Example #5
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
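    # ResourceChangingScheduler wraps ASHA so that running trials can claim
    # idle CPUs; the `num_cpus` assertion below checks that this happened.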
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # num_workers + 1 accounts for the Trainable actor itself; exceeding it
    # shows the scheduler actually added resources.
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
Example #6
 def test_tuner_trainer_fail(self):
     trainer = FailingTrainer()
     param_space = {
         "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2])),
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner_trainer_fail"),
         param_space=param_space,
         tune_config=TuneConfig(mode="max", metric="iteration"),
     )
     results = tuner.fit()
     assert len(results) == 2
     for i in range(2):
         assert results[i].error
Example #7
 def test_tuner_with_xgboost_trainer(self):
     """Test a successful run."""
     shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"),
                   ignore_errors=True)
     trainer = XGBoostTrainer(
         label_column="target",
         params={},
         datasets={"train": gen_dataset_func_eager()},
     )
     # prep_v1 = StandardScaler(["worst radius", "worst area"])
     # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
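     # Grid-searching both the worker count (2) and the training dataset (2)
     # yields the 4 trials asserted below.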
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
         "datasets": {
             "train":
             tune.grid_search(
                 [gen_dataset_func(),
                  gen_dataset_func(do_shuffle=True)]),
         },
         "params": {
             "objective": "binary:logistic",
             "tree_method": "approx",
             "eval_metric": ["logloss", "error"],
             "eta": tune.loguniform(1e-4, 1e-1),
             "subsample": tune.uniform(0.5, 1.0),
             "max_depth": tune.randint(1, 9),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="train-error"),
         # Limit the number of trials running at one time, since the unit
         # test only has access to 4 CPUs on Buildkite.
         _tuner_kwargs={"max_concurrent_trials": 1},
     )
     results = tuner.fit()
     assert not isinstance(results.get_best_result().checkpoint,
                           TrialCheckpoint)
     assert len(results) == 4
Example #8
 def test_tuner_with_xgboost_trainer(self):
     """Test a successful run."""
     shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"),
                   ignore_errors=True)
     trainer = XGBoostTrainer(
         label_column="target",
         params={},
         # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
         datasets={"train": gen_dataset_func_eager()},
     )
     # prep_v1 = StandardScaler(["worst radius", "worst area"])
     # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
         #  is resolved.
         # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
         # "datasets": {
         #     "train": tune.choice(
         #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
         #     ),
         # },
         "params": {
             "objective": "binary:logistic",
             "tree_method": "approx",
             "eval_metric": ["logloss", "error"],
             "eta": tune.loguniform(1e-4, 1e-1),
             "subsample": tune.uniform(0.5, 1.0),
             "max_depth": tune.randint(1, 9),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="train-error"),
     )
     results = tuner.fit()
     assert not isinstance(results.get_best_result().checkpoint,
                           TrialCheckpoint)
     assert len(results) == 2
Example #9
def tune_tensorflow_mnist(num_workers, num_samples):
    trainer = TensorflowTrainer(
        train_func, scaling_config=ScalingConfig(num_workers=num_workers))
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            },
        },
        tune_config=TuneConfig(num_samples=num_samples),
    )
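    # TuneConfig sets no metric or mode, so each `get_best_result` call
    # supplies its own.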
    analysis = tuner.fit()
    best_loss = analysis.get_best_result(metric="loss", mode="min")
    best_accuracy = analysis.get_best_result(metric="accuracy", mode="max")
    print(f"Best loss result: {best_loss}")
    print(f"Best accuracy result: {best_accuracy}")
    return analysis
Example #10
def test_tune(ray_start_4_cpus):
    def train_func(config):
        session.report({"loss": config["x"]})

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"x": 100},
        scaling_config=scale_config,
    )

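    # The tuned `x` overrides the Trainer's `train_loop_config` per trial;
    # the Trainer instance itself is not mutated (asserted below).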
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"x": tune.choice([200, 300])}},
        tune_config=TuneConfig(num_samples=2),
    )
    result_grid = tuner.fit()
    assert result_grid[0].metrics["loss"] in [200, 300]

    # Make sure original Trainer is not affected.
    assert trainer._train_loop_config["x"] == 100
Example #11
    def test_tuner_trainer_fail(self):
        class DummyTrainer(Trainer):
            def training_loop(self) -> None:
                raise RuntimeError("There is an error in trainer!")

        trainer = DummyTrainer()
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            }
        }
        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner_trainer_fail"),
            param_space=param_space,
            tune_config=TuneConfig(mode="max", metric="iteration"),
        )
        results = tuner.fit()
        assert len(results) == 2
        for i in range(2):
            assert results[i].error
Example #12
 def test_tuner_with_torch_trainer(self):
     """Test a successful run using torch trainer."""
     shutil.rmtree(
         os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
     )
     # `batch_size` and `epochs` below are overridden by the grid search in `param_space`.
     config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
     scaling_config = {"num_workers": 1, "use_gpu": False}
     trainer = TorchTrainer(
         train_loop_per_worker=linear_train_func,
         train_loop_config=config,
         scaling_config=scaling_config,
     )
     # prep_v1 = StandardScaler(["worst radius", "worst area"])
     # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
     param_space = {
         "scaling_config": {
             "num_workers": tune.grid_search([1, 2]),
         },
         # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
         #  is resolved.
         # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
         # "datasets": {
         #     "train": tune.choice(
         #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
         #     ),
         # },
         "train_loop_config": {
             "batch_size": tune.grid_search([4, 8]),
             "epochs": tune.grid_search([5, 10]),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="loss"),
     )
     results = tuner.fit()
     assert len(results) == 8
Example #13
def tune_tensorflow_mnist(num_workers: int = 2,
                          num_samples: int = 2,
                          use_gpu: bool = False):
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        tune_config=TuneConfig(num_samples=num_samples,
                               metric="accuracy",
                               mode="max"),
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            }
        },
    )
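    # metric and mode come from TuneConfig, so `get_best_result()` needs no
    # arguments here.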
    best_accuracy = tuner.fit().get_best_result().metrics["accuracy"]
    print(f"Best accuracy config: {best_accuracy}")
Example #14
def torch_fashion_mnist(num_workers, use_gpu, num_samples):
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 2,
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
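    # `_experiment_analysis` is private API, used here only to reach the
    # per-trial dataframes.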
    analysis = tuner.fit()._experiment_analysis

    # Check that loss decreases in each trial.
    for path, df in analysis.trial_dataframes.items():
        assert df.loc[1, "loss"] < df.loc[0, "loss"]
Example #15
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report(device_id=train.torch.get_device().index)

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

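    # Each worker only sees the GPUs assigned to it, so `get_device()` reports
    # index 0 in every trial.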
    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
Example #16
def test_gbdt_trainer(ray_start_8_cpus):
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    train_ds = ray.data.from_pandas(dataset_df).repartition(16)
    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )
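    # Top-level `param_space` keys (`num_boost_round`, `params`) map onto the
    # XGBoostTrainer constructor arguments for each trial.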
    tuner = Tuner(
        trainer,
        param_space={
            "num_boost_round": 100,
            "params": {
                "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
            },
        },
        tune_config=TuneConfig(
            mode="min",
            metric="train-logloss",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
Example #17
    def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
        # So that we have some global checkpointing happening.
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
        shutil.rmtree(
            os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
            ignore_errors=True,
        )
        trainer = XGBoostTrainer(
            label_column="target",
            params={},
            # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
            datasets={"train": gen_dataset_func_eager()},
        )
        # prep_v1 = StandardScaler(["worst radius", "worst area"])
        # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            },
            # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
            #  is resolved.
            # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
            # "datasets": {
            #     "train": tune.choice(
            #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            #     ),
            # },
            "params": {
                "objective": "binary:logistic",
                "tree_method": "approx",
                "eval_metric": ["logloss", "error"],
                "eta": tune.loguniform(1e-4, 1e-1),
                "subsample": tune.uniform(0.5, 1.0),
                "max_depth": tune.randint(1, 9),
            },
        }

        class FailureInjectionCallback(Callback):
            """Inject failure at the configured iteration number."""
            def __init__(self, num_iters=10):
                self.num_iters = num_iters

            def on_step_end(self, iteration, trials, **kwargs):
                if iteration == self.num_iters:
                    print(f"Failing after {self.num_iters} iters.")
                    raise RuntimeError

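        # The injected RuntimeError fires on the Tune driver rather than
        # inside a trial, so `tuner.fit()` surfaces it as a TuneError.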
        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner_driver_fail",
                                 callbacks=[FailureInjectionCallback()]),
            param_space=param_space,
            tune_config=TuneConfig(mode="min", metric="train-error"),
        )
        with self.assertRaises(TuneError):
            tuner.fit()

        # Test resume
        restore_path = os.path.join(DEFAULT_RESULTS_DIR,
                                    "test_tuner_driver_fail")
        tuner = Tuner.restore(restore_path)
        # A hack before we figure out RunConfig semantics across resumes.
        tuner._local_tuner._run_config.callbacks = None
        results = tuner.fit()
        assert len(results) == 2
Example #18
tuner = Tuner(
    trainer,
    param_space={
        "train_loop_config": {
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8,
            "head_location": None,
            "worker_locations": None,
            "test_mode": args.smoke_test,
            "batch_size": 128 * num_training_workers,
            # For the long-running test, we want the training to run forever;
            # it will be terminated by the release test infra.
            "epochs": 1 if args.smoke_test else sys.maxsize,
        }
    },
    tune_config=TuneConfig(num_samples=4,
                           metric="loss",
                           mode="min",
                           scheduler=pbt_scheduler),
    run_config=RunConfig(
        stop={"training_iteration": 1} if args.smoke_test else None,
        callbacks=[
            FailureInjectorCallback(time_between_checks=90),
            ProgressCallback()
        ],
    ),
)

results = tuner.fit()

print(results.get_best_result(metric="loss", mode="min"))
Example #19
        ),
        (
            {
                "run_config": RunConfig(reuse_actors=True)
            },
            lambda kw: kw["reuse_actors"] is True,
        ),
        (
            {
                "run_config": RunConfig(log_to_file="some_file")
            },
            lambda kw: kw["log_to_file"] == "some_file",
        ),
        (
            {
                "tune_config": TuneConfig(max_concurrent_trials=3)
            },
            lambda kw: kw["max_concurrent_trials"] == 3,
        ),
        (
            {
                "tune_config": TuneConfig(time_budget_s=60)
            },
            lambda kw: kw["time_budget_s"] == 60,
        ),
    ],
)
def test_tuner_api_kwargs(params_expected):
    tuner_params, assertion = params_expected

    tuner = Tuner(lambda config: 1, **tuner_params)
Example #20
            },
        },
    )

    tuner = Tuner(
        horovod_trainer,
        param_space={
            "train_loop_config": {
                "lr":
                0.1 if args.smoke_test else tune.grid_search(
                    [0.1 * i for i in range(1, 10)])
            }
        },
        tune_config=TuneConfig(
            num_samples=2 if args.smoke_test else 1,
            metric="loss",
            mode="min",
            scheduler=pbt,
        ),
        run_config=RunConfig(
            stop={"training_iteration": 1} if args.smoke_test else None,
            callbacks=[ProgressCallback()],
        ),
        _tuner_kwargs={
            "fail_fast": False,
            "keep_checkpoints_num": 1
        },
    )

    result_grid = tuner.fit()

    # Make sure trials do not fail.
    assert not any(x.error for x in result_grid)
Example #21
def train_loop_per_worker():
    # Assumed preamble for this excerpt: each worker fetches its shard of the
    # "train" dataset passed to the Trainer below.
    data_shard = session.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)


trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": ray.data.range_tensor(1000)},
)
param_space = {
    "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2])),
    "params": {
        "objective": "binary:logistic",
        "tree_method": "approx",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    },
}
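# `scaling_config` is grid-searched at the top level of `param_space`, so each
# trial runs with its own worker count.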
tuner = Tuner(
    trainable=trainer,
    param_space=param_space,
    tune_config=TuneConfig(mode="min", metric="train-error"),
)
results = tuner.fit()