Example #1
def tune_xgboost(train_df, test_df, target_column):
    # Set XGBoost config.
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray_params = RayParams(max_actor_restarts=1,
                           gpus_per_actor=0,
                           cpus_per_actor=4,
                           num_actors=4)

    analysis = tune.run(
        tune.with_parameters(train_xgboost,
                             train_df=train_df,
                             test_df=test_df,
                             target_column=target_column,
                             ray_params=ray_params),
        # Use the `get_tune_resources` helper function to set the resources.
        resources_per_trial=ray_params.get_tune_resources(),
        config=config,
        num_samples=1,
        metric="eval-error",
        mode="min",
        verbose=1)

    accuracy = 1. - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")

    return analysis.best_config
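The snippet passes a `train_xgboost` trainable into `tune.with_parameters` without showing it. A minimal sketch of what such a trainable could look like, assuming the label column of both DataFrames is named by `target_column` (the body is illustrative, not the original implementation):

from xgboost_ray import RayDMatrix, train

def train_xgboost(config, train_df=None, test_df=None,
                  target_column=None, ray_params=None):
    # Build distributed matrices from the DataFrames; `label` names the
    # target column (assumption: both frames contain it).
    train_set = RayDMatrix(train_df, label=target_column)
    test_set = RayDMatrix(test_df, label=target_column)

    evals_result = {}
    # xgboost_ray forwards the "eval-logloss"/"eval-error" results to Tune
    # automatically when training runs inside a Tune trial.
    train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        ray_params=ray_params)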
Example #2
def main(cpus_per_actor, num_actors, num_samples):
    # Set XGBoost config.
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray_params = RayParams(max_actor_restarts=1,
                           gpus_per_actor=0,
                           cpus_per_actor=cpus_per_actor,
                           num_actors=num_actors)

    analysis = tune.run(
        tune.with_parameters(train_breast_cancer, ray_params=ray_params),
        # Use the `get_tune_resources` helper function to set the resources.
        resources_per_trial=ray_params.get_tune_resources(),
        config=config,
        num_samples=num_samples,
        metric="eval-error",
        mode="min")

    # Load the best model checkpoint.
    best_bst = xgboost_ray.tune.load_model(
        os.path.join(analysis.best_logdir, "tuned.xgb"))

    best_bst.save_model("best_model.xgb")

    accuracy = 1. - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
Example #3
def main():
    name = "large xgboost sweep"

    ray.init(address="auto")

    num_samples = 31  # So that we fit on 1024 CPUs with 1 head bundle
    num_actors_per_sample = 32
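    # 31 trials x (32 actor CPUs + 1 CPU for the Tune trainable itself)
    # = 1023 CPUs, which leaves one CPU for the head bundle on 1024 CPUs.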

    max_runtime = 3500

    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": 4,
    }

    ray_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=1,
        num_actors=num_actors_per_sample,
    )

    start_time = time.monotonic()
    analysis = tune.run(
        tune.with_parameters(xgboost_train,
                             ray_params=ray_params,
                             num_boost_round=100),
        config=config,
        num_samples=num_samples,
        resources_per_trial=ray_params.get_tune_resources(),
    )
    time_taken = time.monotonic() - start_time

    result = {
        "time_taken": time_taken,
        "trial_states": dict(
            Counter(trial.status for trial in analysis.trials)),
        "last_update": time.time(),
    }
    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/tune_test.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    if time_taken > max_runtime:
        print(f"The {name} test took {time_taken:.2f} seconds, but should not "
              f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n"
              f"--- FAILED: {name.upper()} ::: "
              f"{time_taken:.2f} > {max_runtime:.2f} ---")
    else:
        print(f"The {name} test took {time_taken:.2f} seconds, which "
              f"is below the budget of {max_runtime:.2f} seconds. "
              f"Test successful. \n\n"
              f"--- PASSED: {name.upper()} ::: "
              f"{time_taken:.2f} <= {max_runtime:.2f} ---")
Example #4
    def testElasticFails(self):
        """Test that an error is thrown when using Tune with elastic training."""
        ray_params = RayParams(cpus_per_actor=1,
                               num_actors=1,
                               elastic_training=True)
        with self.assertRaises(TuneError):
            tune.run(self.train_func(ray_params),
                     config=self.params,
                     resources_per_trial=ray_params.get_tune_resources(),
                     num_samples=1)
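`self.train_func(ray_params)` is a factory that returns the actual Tune trainable (Example #6 spells out the same pattern). The test encodes that combining `elastic_training=True` with Tune is rejected: `tune.run` is expected to raise a `TuneError`.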
Example #5
    def testNumIters(self):
        """Test that the number of reported tune results is correct"""
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        analysis = tune.run(
            self.train_func(ray_params),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=2)

        self.assertSequenceEqual(
            list(analysis.results_df["training_iteration"]),
            list(analysis.results_df["config.num_boost_round"]))
Example #6
    def test_tune_pack(self):
        """Tests whether workers are packed when using Tune."""
        try:
            from ray import tune
        except ImportError:
            self.skipTest("Tune is not installed.")
            return
        with self.ray_start_cluster() as cluster:
            num_actors = 2
            cluster.add_node(num_cpus=3)
            cluster.add_node(num_cpus=3)
            ray.init(address=cluster.address)

            ray_params = RayParams(max_actor_restarts=1,
                                   num_actors=num_actors,
                                   cpus_per_actor=1)

            def _mock_train(*args, _training_state, **kwargs):
                try:
                    return _train(*args,
                                  _training_state=_training_state,
                                  **kwargs)
                finally:
                    assert len(_training_state.actors) == num_actors
                    if not any(a is None for a in _training_state.actors):
                        actor_infos = ray.state.actors()
                        actor_nodes = []
                        for a in _training_state.actors:
                            actor_info = actor_infos.get(a._actor_id.hex())
                            actor_node = actor_info["Address"]["NodeID"]
                            actor_nodes.append(actor_node)
                        assert actor_nodes[0] == actor_nodes[1]

            def train_func(params, x, y, ray_params):
                def inner_func(config):
                    with patch("xgboost_ray.main._train", _mock_train):
                        train(params,
                              RayDMatrix(x, y),
                              num_boost_round=4,
                              ray_params=ray_params)

                return inner_func

            tune.run(
                train_func(self.params, self.x, self.y, ray_params),
                resources_per_trial=ray_params.get_tune_resources(),
                num_samples=1,
            )
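With two 3-CPU nodes and two 1-CPU actors per trial, the assertion inside `_mock_train` verifies the packing behavior the test name promises: both actors of a trial end up on the same node, because `get_tune_resources()` requests the actor bundles as a single placement group.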
Example #7
    def testEndToEndCheckpointingOrigTune(self):
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        analysis = tune.run(
            self.train_func(ray_params,
                            callbacks=[OrigTuneReportCheckpointCallback()]),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
            metric="train-mlogloss",
            mode="min",
            log_to_file=True,
            local_dir=self.experiment_dir)

        self.assertTrue(os.path.exists(analysis.best_checkpoint))
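`OrigTuneReportCheckpointCallback` is presumably upstream Ray Tune's XGBoost `TuneReportCheckpointCallback` imported under an alias; the test checks that checkpoints written through it surface as `analysis.best_checkpoint`.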
Example #8
def readme_tune():
    from xgboost_ray import RayDMatrix, RayParams, train
    from sklearn.datasets import load_breast_cancer

    num_actors = 4
    num_cpus_per_actor = 1

    ray_params = RayParams(
        num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)

    def train_model(config):
        train_x, train_y = load_breast_cancer(return_X_y=True)
        train_set = RayDMatrix(train_x, train_y)

        evals_result = {}
        bst = train(
            params=config,
            dtrain=train_set,
            evals_result=evals_result,
            evals=[(train_set, "train")],
            verbose_eval=False,
            ray_params=ray_params)
        bst.save_model("model.xgb")

    from ray import tune

    # Specify the hyperparameter search space.
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    # Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
    analysis = tune.run(
        train_model,
        config=config,
        metric="train-error",
        mode="min",
        num_samples=4,
        resources_per_trial=ray_params.get_tune_resources())
    print("Best hyperparameters", analysis.best_config)
Example #9
def main():
    name = "large xgboost sweep"

    ray.init(address="auto")

    num_samples = 32
    num_actors_per_sample = 32

    max_runtime = 3500

    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": 4
    }

    ray_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=1,
        num_actors=num_actors_per_sample)

    start_time = time.monotonic()
    tune.run(
        tune.with_parameters(
            xgboost_train, ray_params=ray_params, num_boost_round=100),
        config=config,
        num_samples=num_samples,
        resources_per_trial=ray_params.get_tune_resources())
    time_taken = time.monotonic() - start_time

    assert time_taken < max_runtime, \
        f"The {name} test took {time_taken:.2f} seconds, but should not " \
        f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n" \
        f"--- FAILED: {name.upper()} ::: " \
        f"{time_taken:.2f} > {max_runtime:.2f} ---"

    print(f"The {name} test took {time_taken:.2f} seconds, which "
          f"is below the budget of {max_runtime:.2f} seconds. "
          f"Test successful. \n\n"
          f"--- PASSED: {name.upper()} ::: "
          f"{time_taken:.2f} <= {max_runtime:.2f} ---")
Example #10
def tune_test(path,
              num_trials,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              fake_data=False,
              smoke_test=False):
    ray_params = RayParams(elastic_training=False,
                           max_actor_restarts=0,
                           num_actors=num_workers,
                           cpus_per_actor=1,
                           gpus_per_actor=0 if not use_gpu else 1)

    def local_train(config):
        temp_dir = None
        if fake_data or smoke_test:
            temp_dir = "/tmp/release_test_data"
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)

            os.makedirs(temp_dir, 0o755)
            local_path = os.path.join(temp_dir, "smoketest.parquet")

            # Use the `num_workers` argument (no global `args` exists in
            # this snippet).
            create_parquet(filename=local_path,
                           num_rows=num_workers * 500,
                           num_features=4,
                           num_classes=2,
                           num_partitions=num_workers * 10)
        else:
            if not os.path.exists(path):
                raise ValueError(
                    f"Benchmarking data not found: {path}."
                    f"\nFIX THIS by running `python create_test_data.py` "
                    f"on all nodes first.")
            local_path = path

        xgboost_params = {
            "tree_method": "hist" if not use_gpu else "gpu_hist",
        }

        xgboost_params.update({
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        })

        xgboost_params.update(config)

        additional_results = {}

        bst, time_taken = train_ray(
            path=local_path,
            num_workers=num_workers,
            num_boost_rounds=num_boost_rounds,
            num_files=num_files,
            regression=regression,
            use_gpu=use_gpu,
            smoke_test=smoke_test,
            ray_params=ray_params,
            xgboost_params=xgboost_params,
            # kwargs
            additional_results=additional_results,
            callbacks=[PlacementCallback(),
                       TuneReportCallback()])

        bst.save_model("tuned.xgb")

        trial_ips = []
        for rank, ips in enumerate(additional_results["callback_returns"]):
            for ip in ips:
                trial_ips.append(ip)

        tune_trial = get_trial_id()
        with tune.checkpoint_dir(num_boost_rounds + 1) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, "callback_returns.json"),
                      "wt") as f:
                json.dump({tune_trial: trial_ips}, f)

        if temp_dir:
            shutil.rmtree(temp_dir)

    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    analysis = tune.run(
        local_train,
        config=search_space,
        num_samples=num_trials,
        sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer),
        resources_per_trial=ray_params.get_tune_resources())

    # In our PACK scheduling, we expect that each IP hosts only workers
    # for one Ray Tune trial.
    ip_to_trials = defaultdict(list)
    for trial in analysis.trials:
        with open(
                os.path.join(trial.checkpoint.value, "callback_returns.json"),
                "rt") as f:
            trial_to_ips = json.load(f)
        for tune_trial, ips in trial_to_ips.items():
            for node_ip in ips:
                ip_to_trials[node_ip].append(tune_trial)

    fail = False
    for ip, trial_ids in ip_to_trials.items():
        print(f"For IP {ip} got trial IDs {trial_ids}")
        fail = fail or any(trial_id != trial_ids[0] for trial_id in trial_ids)

    if fail:
        raise ValueError("Different trial IDs found on same node.")
    else:
        print("Success.")
Example #11
        num_files=25,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=config,
    )
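The lines above are the tail of `train_wrapper`, whose head is not included in the snippet; as in Example #10, it evidently forwards the Tune-sampled `config` as `xgboost_params` to a `train_ray` helper.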


if __name__ == "__main__":
    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray.init(address="auto")

    ray_params = RayParams(elastic_training=False,
                           max_actor_restarts=2,
                           num_actors=4,
                           cpus_per_actor=1,
                           gpus_per_actor=0)

    analysis = tune.run(tune.with_parameters(train_wrapper,
                                             ray_params=ray_params),
                        config=search_space,
                        num_samples=4,
                        resources_per_trial=ray_params.get_tune_resources())

    print("PASSED.")