Example #1
    def __init__(
        self,
        restore_path: Optional[str] = None,
        trainable: Optional[Union[str, Callable, Type[Trainable],
                                  BaseTrainer]] = None,
        param_space: Optional[Dict[str, Any]] = None,
        tune_config: Optional[TuneConfig] = None,
        run_config: Optional[RunConfig] = None,
        _tuner_kwargs: Optional[Dict] = None,
    ):
        # Restored from Tuner checkpoint.
        if restore_path:
            trainable_ckpt = os.path.join(restore_path, _TRAINABLE_PKL)
            with open(trainable_ckpt, "rb") as fp:
                trainable = pickle.load(fp)

            tuner_ckpt = os.path.join(restore_path, _TUNER_PKL)
            with open(tuner_ckpt, "rb") as fp:
                tuner = pickle.load(fp)
                self.__dict__.update(tuner.__dict__)

            self._is_restored = True
            self._trainable = trainable
            self._experiment_checkpoint_dir = restore_path
            return

        # Start from fresh
        if not trainable:
            raise TuneError("You need to provide a trainable to tune.")

        # If no run config was passed to Tuner directly, use the one from the Trainer,
        # if available
        if not run_config and isinstance(trainable, BaseTrainer):
            run_config = trainable.run_config

        self._is_restored = False
        self._trainable = trainable
        self._tune_config = tune_config or TuneConfig()
        self._run_config = run_config or RunConfig()
        self._tuner_kwargs = copy.deepcopy(_tuner_kwargs) or {}
        self._experiment_checkpoint_dir = self._setup_create_experiment_checkpoint_dir(
            self._run_config)

        # Not used for restored Tuner.
        self._param_space = param_space or {}

        # This needs to happen before `tune.run()` is kicked in.
        # This is because currently tune does not exit gracefully if
        # run in ray client mode - if crash happens, it just exits immediately
        # without allowing for checkpointing tuner and trainable.
        # Thus this has to happen before tune.run() so that we can have something
        # to restore from.
        tuner_ckpt = os.path.join(self._experiment_checkpoint_dir, _TUNER_PKL)
        with open(tuner_ckpt, "wb") as fp:
            pickle.dump(self, fp)

        trainable_ckpt = os.path.join(self._experiment_checkpoint_dir,
                                      _TRAINABLE_PKL)
        with open(trainable_ckpt, "wb") as fp:
            pickle.dump(self._trainable, fp)
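
The constructor above has two paths: a fresh run, which pickles the tuner and trainable into the experiment checkpoint directory before tune.run() starts (so there is something to restore from after a driver crash), and a restore path, which loads that pickled state back. Below is a minimal usage sketch of how these two paths are typically reached through the public Tuner API; the trainable name and results path are hypothetical, and import locations as well as the Tuner.restore signature can differ between Ray versions.

from ray import tune
from ray.tune import Tuner
from ray.air.config import RunConfig

# Fresh run: exercises the "Start from fresh" branch of __init__ above.
tuner = Tuner(
    trainable=my_trainable,  # hypothetical user-defined trainable
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    run_config=RunConfig(name="my_experiment"),
)
results = tuner.fit()

# Restore: exercises the `restore_path` branch, which unpickles the tuner
# and trainable state saved in the experiment directory.
restored_tuner = Tuner.restore("~/ray_results/my_experiment")  # hypothetical path
results = restored_tuner.fit()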
Example #2
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # The base allocation is num_workers + 1 CPUs (the extra CPU is for the
    # Trainable itself), so a larger max means the scheduler added resources.
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
Example #3
    def test_tuner_with_torch_trainer(self):
        """Test a successful run using torch trainer."""
        shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"),
                      ignore_errors=True)
        # The following two should be tunable.
        config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
        scaling_config = {"num_workers": 1, "use_gpu": False}
        trainer = TorchTrainer(
            train_loop_per_worker=linear_train_func,
            train_loop_config=config,
            scaling_config=scaling_config,
        )
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            },
            "train_loop_config": {
                "batch_size": tune.grid_search([4, 8]),
                "epochs": tune.grid_search([5, 10]),
            },
        }
        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner"),
            param_space=param_space,
            tune_config=TuneConfig(mode="min", metric="loss"),
        )
        results = tuner.fit()
        assert len(results) == 8
Example #4
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )

    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
Example #5
    def __init__(
        self,
        *,
        scaling_config: Optional[ScalingConfig] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional["Preprocessor"] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):

        self.scaling_config = (scaling_config if scaling_config is not None
                               else ScalingConfig())
        self.run_config = run_config if run_config is not None else RunConfig()
        self.datasets = datasets if datasets is not None else {}
        self.preprocessor = preprocessor
        self.resume_from_checkpoint = resume_from_checkpoint

        self._validate_attributes()

        if datasets and not self.scaling_config._max_cpu_fraction_per_node:
            logger.warning(
                "When passing `datasets` to a Trainer, it is recommended to "
                "reserve at least 20% of node CPUs for Dataset execution by setting "
                "`_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. "
                "Not doing so can lead to resource contention or hangs. "
                "See https://docs.ray.io/en/master/data/key-concepts.html"
                "#example-datasets-in-tune for more info.")
Example #6
    def test_tuner_trainer_fail(self):
        trainer = FailingTrainer()
        param_space = {
            "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2]))
        }
        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner_trainer_fail"),
            param_space=param_space,
            tune_config=TuneConfig(mode="max", metric="iteration"),
        )
        results = tuner.fit()
        assert len(results) == 2
        for i in range(2):
            assert results[i].error
Example #7
def main(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        run_config=RunConfig(callbacks=[
            MLflowLoggerCallback(experiment_name="train_fashion_mnist")
        ]),
    )
    final_results = trainer.fit()

    print("Final metrics: ", final_results.metrics)
Example #8
    def __init__(
        self,
        *,
        scaling_config: Optional[ScalingConfig] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional["Preprocessor"] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):

        self.scaling_config = (scaling_config if scaling_config is not None
                               else ScalingConfig())
        self.run_config = run_config if run_config is not None else RunConfig()
        self.datasets = datasets if datasets is not None else {}
        self.preprocessor = preprocessor
        self.resume_from_checkpoint = resume_from_checkpoint

        self._validate_attributes()
Example #9
    def test_tuner_with_xgboost_trainer(self):
        """Test a successful run."""
        shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"),
                      ignore_errors=True)
        trainer = XGBoostTrainer(
            label_column="target",
            params={},
            datasets={"train": gen_dataset_func_eager()},
        )
        # prep_v1 = StandardScaler(["worst radius", "worst area"])
        # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            },
            # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
            "datasets": {
                "train": tune.grid_search(
                    [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
                ),
            },
            "params": {
                "objective": "binary:logistic",
                "tree_method": "approx",
                "eval_metric": ["logloss", "error"],
                "eta": tune.loguniform(1e-4, 1e-1),
                "subsample": tune.uniform(0.5, 1.0),
                "max_depth": tune.randint(1, 9),
            },
        }
        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner"),
            param_space=param_space,
            tune_config=TuneConfig(mode="min", metric="train-error"),
            # Limit the number of concurrently running trials, since the unit
            # test only has access to 4 CPUs on Buildkite.
            _tuner_kwargs={"max_concurrent_trials": 1},
        )
        results = tuner.fit()
        assert not isinstance(results.get_best_result().checkpoint,
                              TrialCheckpoint)
        assert len(results) == 4
Example #10
def test_gbdt_trainer(ray_start_8_cpus):
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    train_ds = ray.data.from_pandas(dataset_df).repartition(16)
    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )
    tuner = Tuner(
        trainer,
        param_space={
            "num_boost_round": 100,
            "params": {
                "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
            },
        },
        tune_config=TuneConfig(
            mode="min",
            metric="train-logloss",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
Example #11
    def test_tuner_run_config_override(self):
        trainer = DummyTrainer(run_config=RunConfig(stop={"metric": 4}))
        tuner = Tuner(trainer)

        assert tuner._local_tuner._run_config.stop == {"metric": 4}
Example #12
    def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
        # So that we have some global checkpointing happening.
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
        shutil.rmtree(
            os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
            ignore_errors=True,
        )
        trainer = XGBoostTrainer(
            label_column="target",
            params={},
            datasets={"train": gen_dataset_func_eager()},
        )
        # prep_v1 = StandardScaler(["worst radius", "worst area"])
        # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            },
            # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
            "datasets": {
                "train":
                tune.grid_search(
                    [gen_dataset_func(),
                     gen_dataset_func(do_shuffle=True)]),
            },
            "params": {
                "objective": "binary:logistic",
                "tree_method": "approx",
                "eval_metric": ["logloss", "error"],
                "eta": tune.loguniform(1e-4, 1e-1),
                "subsample": tune.uniform(0.5, 1.0),
                "max_depth": tune.randint(1, 9),
            },
        }

        class FailureInjectionCallback(Callback):
            """Inject failure at the configured iteration number."""
            def __init__(self, num_iters=10):
                self.num_iters = num_iters

            def on_step_end(self, iteration, trials, **kwargs):
                if iteration == self.num_iters:
                    print(f"Failing after {self.num_iters} iters.")
                    raise RuntimeError

        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(name="test_tuner_driver_fail",
                                 callbacks=[FailureInjectionCallback()]),
            param_space=param_space,
            tune_config=TuneConfig(mode="min", metric="train-error"),
            # Limit the number of concurrently running trials, since the unit
            # test only has access to 4 CPUs on Buildkite.
            _tuner_kwargs={"max_concurrent_trials": 1},
        )
        with self.assertRaises(TuneError):
            tuner.fit()

        # Test resume
        restore_path = os.path.join(DEFAULT_RESULTS_DIR,
                                    "test_tuner_driver_fail")
        tuner = Tuner.restore(restore_path)
        # A hack before we figure out RunConfig semantics across resumes.
        tuner._local_tuner._run_config.callbacks = None
        results = tuner.fit()
        assert len(results) == 4
Example #13
        results = tuner.fit()
        assert len(results) == 8

    def test_tuner_run_config_override(self):
        trainer = DummyTrainer(run_config=RunConfig(stop={"metric": 4}))
        tuner = Tuner(trainer)

        assert tuner._local_tuner._run_config.stop == {"metric": 4}


@pytest.mark.parametrize(
    "params_expected",
    [
        (
            {"run_config": RunConfig(progress_reporter=CLIReporter())},
            lambda kw: isinstance(kw["progress_reporter"], CLIReporter),
        ),
        (
            {"run_config": RunConfig(reuse_actors=True)},
            lambda kw: kw["reuse_actors"] is True,
        ),
        (
            {"run_config": RunConfig(log_to_file="some_file")},
            lambda kw: kw["log_to_file"] == "some_file",
        ),
Example #14
tuner = Tuner(
    trainer,
    param_space={
        "train_loop_config": {
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8,
            "head_location": None,
            "worker_locations": None,
            "test_mode": args.smoke_test,
            "batch_size": 128 * num_training_workers,
            # For the long running test, we want the training to run forever,
            # and it will be terminated by the release test infra.
            "epochs": 1 if args.smoke_test else sys.maxsize,
        }
    },
    tune_config=TuneConfig(num_samples=4,
                           metric="loss",
                           mode="min",
                           scheduler=pbt_scheduler),
    run_config=RunConfig(
        stop={"training_iteration": 1} if args.smoke_test else None,
        callbacks=[
            FailureInjectorCallback(time_between_checks=90),
            ProgressCallback()
        ],
    ),
)

results = tuner.fit()

print(results.get_best_result(metric="loss", mode="min"))
                "momentum": [0.8, 0.9, 0.99],
            }
        },
    )

    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.choice([0.001, 0.01, 0.1]),
                "momentum": 0.8,
                "batch_size": 128 * args.num_workers,
                "epochs": args.num_epochs,
                "test_mode": args.smoke_test,  # whether to to subset the data
            }
        },
        tune_config=TuneConfig(num_samples=4,
                               metric="loss",
                               mode="min",
                               scheduler=pbt_scheduler),
        run_config=RunConfig(
            stop={"training_iteration": 2 if args.smoke_test else 100},
            failure_config=FailureConfig(
                max_failures=3),  # used for fault tolerance
        ),
    )

    results = tuner.fit()

    print(results.get_best_result(metric="loss", mode="min"))