Example #1
# Imports assumed by this snippet (they are not part of the original excerpt).
# `get_parquet_files` is a helper defined elsewhere in the original file; a
# sketch is given after this example.
import os
import time

from xgboost_ray import (
    RayDeviceQuantileDMatrix,
    RayDMatrix,
    RayFileType,
    RayParams,
    train,
)


def train_ray(
    path,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    ray_params=None,
    xgboost_params=None,
    **kwargs,
):
    if not isinstance(path, list):
        path = get_parquet_files(path, num_files=num_files)

    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )
    else:
        dtrain = RayDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )

    config = {"tree_method": "hist" if not use_gpu else "gpu_hist"}

    if not regression:
        # Classification
        config.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )
    else:
        # Regression
        config.update(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["logloss", "rmse"],
            }
        )

    if xgboost_params:
        config.update(xgboost_params)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params
        or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=1,
            gpus_per_actor=0 if not use_gpu else 1,
        ),
        evals=[(dtrain, "train")],
        **kwargs,
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    out_file = os.path.expanduser(
        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")
    )
    bst.save_model(out_file)

    print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1]))
    return bst, additional_results, taken
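Example #1 calls a `get_parquet_files` helper that is not part of the excerpt. A minimal sketch of such a helper, assuming it mirrors the inline file-selection logic of Example #6 further below (glob the Parquet files and repeat the list until `num_files` paths are available), might look like this:

import glob


def get_parquet_files(path, num_files=0):
    # Collect all Parquet files below the given directory.
    files = sorted(glob.glob(f"{path}/**/*.parquet"))
    if not num_files:
        return files
    # Repeat the list until at least num_files paths are available,
    # then truncate to exactly num_files.
    while num_files > len(files):
        files = files + files
    return files[0:num_files]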
Example #2
import json
import os
import time

import ray
from xgboost_ray import RayParams

from ray.util.xgboost.release_test_util import train_ray

if __name__ == "__main__":
    ray.init(address="auto")

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=32,
        cpus_per_actor=4,
        gpus_per_actor=0,
    )

    start = time.time()
    train_ray(
        path="/data/classification.parquet",
        num_workers=32,
        num_boost_rounds=100,
        num_files=128,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=None,
    )
Example #3
    def testCheckpointContinuationValidity(self):
        """Test that checkpoints are stored and loaded correctly"""

        # Train once, get checkpoint via callback returns
        res_1 = {}
        bst_1 = train(self.params,
                      RayDMatrix(self.x, self.y),
                      callbacks=[
                          _checkpoint_callback(frequency=1,
                                               before_iteration_=False)
                      ],
                      num_boost_round=2,
                      ray_params=RayParams(num_actors=2),
                      additional_results=res_1)
        last_checkpoint_1 = res_1["callback_returns"][0][-1]
        last_checkpoint_other_rank_1 = res_1["callback_returns"][1][-1]

        # Sanity check
        lc1 = xgb.Booster()
        lc1.load_model(last_checkpoint_1)
        self.assertEqual(last_checkpoint_1, last_checkpoint_other_rank_1)
        self.assertEqual(last_checkpoint_1, lc1.save_raw())
        self.assertEqual(bst_1.get_dump(), lc1.get_dump())

        # Start new training run, starting from existing model
        res_2 = {}
        bst_2 = train(self.params,
                      RayDMatrix(self.x, self.y),
                      callbacks=[
                          _checkpoint_callback(frequency=1,
                                               before_iteration_=True),
                          _checkpoint_callback(frequency=1,
                                               before_iteration_=False)
                      ],
                      num_boost_round=4,
                      ray_params=RayParams(num_actors=2),
                      additional_results=res_2,
                      xgb_model=lc1)
        first_checkpoint_2 = res_2["callback_returns"][0][0]
        first_checkpoint_other_actor_2 = res_2["callback_returns"][1][0]
        last_checkpoint_2 = res_2["callback_returns"][0][-1]
        last_checkpoint_other_actor_2 = res_2["callback_returns"][1][-1]

        fcp_bst = xgb.Booster()
        fcp_bst.load_model(first_checkpoint_2)

        lcp_bst = xgb.Booster()
        lcp_bst.load_model(last_checkpoint_2)

        # Sanity check
        self.assertEqual(first_checkpoint_2, first_checkpoint_other_actor_2)
        self.assertEqual(last_checkpoint_2, last_checkpoint_other_actor_2)
        self.assertEqual(bst_2.get_dump(), lcp_bst.get_dump())

        # Training should not have proceeded for the first checkpoint,
        # so trees should be equal
        self.assertEqual(lc1.get_dump(), fcp_bst.get_dump())

        # Training should have proceeded for the last checkpoint,
        # so trees should not be equal
        self.assertNotEqual(fcp_bst.get_dump(), lcp_bst.get_dump())
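The test above relies on a `_checkpoint_callback` helper defined elsewhere in the test suite. The following is only a sketch of what such a helper might look like, assuming it uses `xgboost_ray.session.put_queue` to ship each actor's serialized booster back to the driver (which is how entries end up in `additional_results["callback_returns"]`); the real implementation may differ:

import xgboost as xgb
from xgboost_ray.session import put_queue


def _checkpoint_callback(frequency=1, before_iteration_=False):
    # Hypothetical helper: builds an XGBoost callback that pushes the raw,
    # serialized model to the driver every `frequency` boosting rounds.
    class _CheckpointCallback(xgb.callback.TrainingCallback):
        def _put_checkpoint(self, model, epoch):
            if epoch % frequency == 0:
                put_queue(model.save_raw())

        def before_iteration(self, model, epoch, evals_log):
            if before_iteration_:
                self._put_checkpoint(model, epoch)
            return False

        def after_iteration(self, model, epoch, evals_log):
            if not before_iteration_:
                self._put_checkpoint(model, epoch)
            return False

    return _CheckpointCallback()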
Example #4
    def testMaybeScheduleNewActors(self):
        """Test scheduling of new actors if resources become available.

        Context: We are training with num_actors=8, of which 3 actors are
        dead. The cluster has resources to restart 2 of these actors.

        In this test, we walk through the `_maybe_schedule_new_actors` and
        `_update_scheduled_actor_states` methods, checking their state
        after each call.

        """
        from xgboost_ray.main import _TrainingState
        from xgboost_ray.elastic import _update_scheduled_actor_states
        from xgboost_ray.elastic import _maybe_schedule_new_actors

        os.environ["RXGB_ELASTIC_RESTART_GRACE_PERIOD_S"] = "30"

        # Three actors are dead
        actors = [
            MagicMock(), None,
            MagicMock(),
            MagicMock(), None,
            MagicMock(), None,
            MagicMock()
        ]

        # Mock training state
        state = _TrainingState(
            actors=actors,
            queue=MagicMock(),
            stop_event=MagicMock(),
            checkpoint=MagicMock(),
            additional_results={},
            failed_actor_ranks=set(),
        )

        created_actors = []

        def fake_create_actor(rank, *args, **kwargs):
            created_actors.append(rank)
            return MagicMock()

        with patch("xgboost_ray.elastic._create_actor") as create_actor:
            create_actor.side_effect = fake_create_actor

            _maybe_schedule_new_actors(training_state=state,
                                       num_cpus_per_actor=8,
                                       num_gpus_per_actor=0,
                                       resources_per_actor={"custom": 1.0},
                                       load_data=[],
                                       ray_params=RayParams(
                                           num_actors=8,
                                           elastic_training=True,
                                           max_failed_actors=1,
                                           max_actor_restarts=2))

            # 3 new actors should have been created
            self.assertEqual(len(created_actors), 3)
            self.assertEqual(len(state.pending_actors), 3)

            # The number of created actors shouldn't change even
            # if we run this function again.
            _maybe_schedule_new_actors(training_state=state,
                                       num_cpus_per_actor=8,
                                       num_gpus_per_actor=0,
                                       resources_per_actor={"custom": 1.0},
                                       load_data=[],
                                       ray_params=RayParams(
                                           num_actors=8,
                                           elastic_training=True,
                                           max_failed_actors=1,
                                           max_actor_restarts=2))

            self.assertEqual(len(created_actors), 3)
            self.assertEqual(len(state.pending_actors), 3)

            # The actors have not yet been promoted because the
            # loading task has not finished.
            self.assertFalse(actors[1])
            self.assertFalse(actors[4])
            self.assertFalse(actors[6])

            # Update status, nothing should change
            _update_scheduled_actor_states(training_state=state)

            self.assertFalse(actors[1])
            self.assertFalse(actors[4])
            self.assertFalse(actors[6])

            # Set loading task status to finished, but only for first actor
            for _, (_, task) in state.pending_actors.items():
                task.ready = True
                break

            # Update status. This shouldn't raise RayXGBoostActorAvailable
            # because we still have a grace period to wait for the second
            # actor.
            _update_scheduled_actor_states(training_state=state)

            # Grace period is set through ENV.ELASTIC_RESTART_GRACE_PERIOD_S
            # Allow for some slack in test execution
            self.assertGreaterEqual(state.restart_training_at,
                                    time.time() + 22)

            # The first actor should have been promoted to full actor
            self.assertTrue(actors[1])
            self.assertFalse(actors[4])
            self.assertFalse(actors[6])

            # Set loading task status to finished for all actors
            for _, (_, task) in state.pending_actors.items():
                task.ready = True

            # Update status. This should now raise RayXGBoostActorAvailable
            # immediately as there are no pending actors left to wait for.
            with self.assertRaises(RayXGBoostActorAvailable):
                _update_scheduled_actor_states(training_state=state)

            # All restarted actors should have been promoted to full actors
            self.assertTrue(actors[1])
            self.assertTrue(actors[4])
            self.assertTrue(actors[6])
Example #5
# Imports assumed by this snippet (they are not part of the original excerpt).
import time

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from xgboost_ray import RayParams, RayXGBClassifier

# Get data
df = pd.read_csv("winequality-red.csv", delimiter=";")
print(f"Rows, columns: {df.shape}")
print(df.head())
print(df.isna().sum())

# Create Classification version of target variable
df['goodquality'] = [1 if x >= 6 else 0 for x in df['quality']]
X = df.drop(['quality', 'goodquality'], axis=1)
y = df['goodquality']
print(df['goodquality'].value_counts())

# Normalize feature variables
X_features = X
X = StandardScaler().fit_transform(X)
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=0)

start = time.time()
model = RayXGBClassifier(
    #    n_jobs=10,  # In XGBoost-Ray, n_jobs sets the number of actors
    random_state=1)

model.fit(X=X_train, y=y_train, ray_params=RayParams(num_actors=3))
print(f"executed XGBoost in {time.time() - start}")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
Example #6
# Imports assumed by this snippet (they are not part of the original excerpt).
import glob
import time

from xgboost_ray import (RayDMatrix, RayDeviceQuantileDMatrix, RayFileType,
                         RayParams, train)


def train_ray(path,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              smoke_test=False,
              ray_params=None,
              xgboost_params=None,
              **kwargs):
    if num_files:
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(path,
                                          num_actors=num_workers,
                                          label="labels",
                                          ignore=["partition"],
                                          filetype=RayFileType.PARQUET)
    else:
        dtrain = RayDMatrix(path,
                            num_actors=num_workers,
                            label="labels",
                            ignore=["partition"],
                            filetype=RayFileType.PARQUET)

    config = xgboost_params or {
        "tree_method": "hist" if not use_gpu else "gpu_hist"
    }
    if not regression:
        # Classification
        config.update({
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        })
    else:
        # Regression
        config.update({
            "objective": "reg:squarederror",
            "eval_metric": ["logloss", "rmse"],
        })

    start = time.time()
    evals_result = {}
    bst = train(config,
                dtrain,
                evals_result=evals_result,
                num_boost_round=num_boost_rounds,
                ray_params=ray_params
                or RayParams(max_actor_restarts=2,
                             num_actors=num_workers,
                             cpus_per_actor=4 if not smoke_test else 1,
                             gpus_per_actor=0 if not use_gpu else 1),
                evals=[(dtrain, "train")],
                **kwargs)
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
    return bst, taken
Example #7
# The dataset has to be downloaded onto the cluster, which may take a few
# minutes.

# standard XGBoost config for classification
config = {
    "tree_method": "approx",
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
}

bst, evals_result = train_xgboost(
    config,
    train_df,
    eval_df,
    LABEL_COLUMN,
    RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),
)
print(f"Results: {evals_result}")

###############################################################################
# Hyperparameter optimization
# ---------------------------
# If we are not content with the results obtained with default XGBoost
# parameters, we can use `Ray Tune
# <https://docs.ray.io/en/latest/tune/index.html>`_ for cutting-edge
# distributed hyperparameter tuning. XGBoost-Ray automatically integrates
# with Ray Tune, meaning we can use the same training function as before.
#
# In this workflow, we will tune three hyperparameters - ``eta``, ``subsample``
# and ``max_depth``. We are using `Tune's samplers to define the search
# space <https://docs.ray.io/en/latest/tune/user-guide.html#search-space-grid-random>`_.
Example #8
        num_files=128,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=config,
    )


if __name__ == "__main__":
    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray.init(address="auto")

    ray_params = RayParams(elastic_training=False,
                           max_actor_restarts=2,
                           num_actors=32,
                           cpus_per_actor=1,
                           gpus_per_actor=0)

    analysis = tune.run(tune.with_parameters(train_wrapper,
                                             ray_params=ray_params),
                        config=search_space,
                        num_samples=4,
                        resources_per_trial=ray_params.get_tune_resources())

    print("PASSED.")
Example #9
# Imports assumed by this snippet (they are not part of the original excerpt);
# `unmocked_train` is taken to be the unpatched xgboost_ray internal `_train`.
from unittest.mock import patch

import ray
from xgboost_ray import RayParams
from xgboost_ray.main import _train as unmocked_train

from _train import train_ray
from ft_small_non_elastic import FailureState, FailureInjection, \
    TrackingCallback

if __name__ == "__main__":
    ray.init(address="auto")
    from xgboost_ray.main import logger
    logger.setLevel(10)

    failure_state = FailureState.remote()

    ray_params = RayParams(
        elastic_training=True,
        max_failed_actors=2,
        max_actor_restarts=3,
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=0)

    world_sizes = []
    start_actors = []

    def _mock_train(*args, _training_state, **kwargs):
        world_sizes.append(len([a for a in _training_state.actors if a]))
        start_actors.append(len(_training_state.failed_actor_ranks))

        return unmocked_train(*args, _training_state=_training_state, **kwargs)

    with patch("xgboost_ray.main._train") as mocked:
        mocked.side_effect = _mock_train