import os
import time

from xgboost_ray import (
    RayDeviceQuantileDMatrix,
    RayDMatrix,
    RayFileType,
    RayParams,
    train,
)


def train_ray(
    path,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    ray_params=None,
    xgboost_params=None,
    **kwargs,
):
    # `get_parquet_files` is assumed to be a helper defined alongside this
    # function that expands a directory into a list of parquet file paths.
    if not isinstance(path, list):
        path = get_parquet_files(path, num_files=num_files)

    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )
    else:
        dtrain = RayDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )

    config = {"tree_method": "hist" if not use_gpu else "gpu_hist"}

    if not regression:
        # Classification
        config.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )
    else:
        # Regression
        config.update(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["logloss", "rmse"],
            }
        )

    if xgboost_params:
        config.update(xgboost_params)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params
        or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=1,
            gpus_per_actor=0 if not use_gpu else 1,
        ),
        evals=[(dtrain, "train")],
        **kwargs,
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    out_file = os.path.expanduser(
        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")
    )
    bst.save_model(out_file)

    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
    return bst, additional_results, taken
import json
import os
import time

import ray

from xgboost_ray import RayParams
from ray.util.xgboost.release_test_util import train_ray

if __name__ == "__main__":
    ray.init(address="auto")

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=32,
        cpus_per_actor=4,
        gpus_per_actor=0,
    )

    start = time.time()
    train_ray(
        path="/data/classification.parquet",
        num_workers=32,
        num_boost_rounds=100,
        num_files=128,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=None,
    )
def testCheckpointContinuationValidity(self):
    """Test that checkpoints are stored and loaded correctly"""

    # Train once, get checkpoint via callback returns
    res_1 = {}
    bst_1 = train(
        self.params,
        RayDMatrix(self.x, self.y),
        callbacks=[
            _checkpoint_callback(frequency=1, before_iteration_=False)
        ],
        num_boost_round=2,
        ray_params=RayParams(num_actors=2),
        additional_results=res_1)
    last_checkpoint_1 = res_1["callback_returns"][0][-1]
    last_checkpoint_other_rank_1 = res_1["callback_returns"][1][-1]

    # Sanity check
    lc1 = xgb.Booster()
    lc1.load_model(last_checkpoint_1)
    self.assertEqual(last_checkpoint_1, last_checkpoint_other_rank_1)
    self.assertEqual(last_checkpoint_1, lc1.save_raw())
    self.assertEqual(bst_1.get_dump(), lc1.get_dump())

    # Start new training run, starting from existing model
    res_2 = {}
    bst_2 = train(
        self.params,
        RayDMatrix(self.x, self.y),
        callbacks=[
            _checkpoint_callback(frequency=1, before_iteration_=True),
            _checkpoint_callback(frequency=1, before_iteration_=False)
        ],
        num_boost_round=4,
        ray_params=RayParams(num_actors=2),
        additional_results=res_2,
        xgb_model=lc1)
    first_checkpoint_2 = res_2["callback_returns"][0][0]
    first_checkpoint_other_actor_2 = res_2["callback_returns"][1][0]
    last_checkpoint_2 = res_2["callback_returns"][0][-1]
    last_checkpoint_other_actor_2 = res_2["callback_returns"][1][-1]

    fcp_bst = xgb.Booster()
    fcp_bst.load_model(first_checkpoint_2)

    lcp_bst = xgb.Booster()
    lcp_bst.load_model(last_checkpoint_2)

    # Sanity check
    self.assertEqual(first_checkpoint_2, first_checkpoint_other_actor_2)
    self.assertEqual(last_checkpoint_2, last_checkpoint_other_actor_2)
    self.assertEqual(bst_2.get_dump(), lcp_bst.get_dump())

    # Training should not have proceeded for the first checkpoint,
    # so trees should be equal
    self.assertEqual(lc1.get_dump(), fcp_bst.get_dump())

    # Training should have proceeded for the last checkpoint,
    # so trees should not be equal
    self.assertNotEqual(fcp_bst.get_dump(), lcp_bst.get_dump())
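# The test above depends on a `_checkpoint_callback` factory defined elsewhere
# in the test module. The sketch below is a plausible minimal version, not the
# exact helper from the test suite: it assumes xgboost's TrainingCallback API
# and xgboost_ray's `put_queue` session function, which is what fills
# additional_results["callback_returns"] with one list of returns per actor.
from xgboost.callback import TrainingCallback

from xgboost_ray.session import put_queue


def _checkpoint_callback(frequency: int = 1, before_iteration_: bool = False):
    class _CheckpointCallback(TrainingCallback):
        def _checkpoint(self, model, epoch):
            if epoch % frequency == 0:
                # Every actor pushes the serialized booster to the driver.
                put_queue(model.save_raw())

        def before_iteration(self, model, epoch, evals_log):
            if before_iteration_:
                self._checkpoint(model, epoch)
            return False

        def after_iteration(self, model, epoch, evals_log):
            if not before_iteration_:
                self._checkpoint(model, epoch)
            return False

    return _CheckpointCallback()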
def testMaybeScheduleNewActors(self):
    """Test scheduling of new actors if resources become available.

    Context: We are training with num_actors=8, of which 3 actors are
    dead. The cluster has resources to restart 2 of these actors.

    In this test, we walk through the `_maybe_schedule_new_actors` and
    `_update_scheduled_actor_states` methods, checking their state
    after each call.
    """
    from xgboost_ray.main import _TrainingState
    from xgboost_ray.elastic import _update_scheduled_actor_states
    from xgboost_ray.elastic import _maybe_schedule_new_actors

    os.environ["RXGB_ELASTIC_RESTART_GRACE_PERIOD_S"] = "30"

    # Three actors are dead
    actors = [
        MagicMock(), None,
        MagicMock(),
        MagicMock(), None,
        MagicMock(), None,
        MagicMock()
    ]

    # Mock training state
    state = _TrainingState(
        actors=actors,
        queue=MagicMock(),
        stop_event=MagicMock(),
        checkpoint=MagicMock(),
        additional_results={},
        failed_actor_ranks=set(),
    )

    created_actors = []

    def fake_create_actor(rank, *args, **kwargs):
        created_actors.append(rank)
        return MagicMock()

    with patch("xgboost_ray.elastic._create_actor") as create_actor:
        create_actor.side_effect = fake_create_actor

        _maybe_schedule_new_actors(
            training_state=state,
            num_cpus_per_actor=8,
            num_gpus_per_actor=0,
            resources_per_actor={"custom": 1.0},
            load_data=[],
            ray_params=RayParams(
                num_actors=8,
                elastic_training=True,
                max_failed_actors=1,
                max_actor_restarts=2))

        # 3 new actors should have been created
        self.assertEqual(len(created_actors), 3)
        self.assertEqual(len(state.pending_actors), 3)

        # The number of created actors shouldn't change even
        # if we run this function again.
        _maybe_schedule_new_actors(
            training_state=state,
            num_cpus_per_actor=8,
            num_gpus_per_actor=0,
            resources_per_actor={"custom": 1.0},
            load_data=[],
            ray_params=RayParams(
                num_actors=8,
                elastic_training=True,
                max_failed_actors=1,
                max_actor_restarts=2))

        self.assertEqual(len(created_actors), 3)
        self.assertEqual(len(state.pending_actors), 3)

        # The actors have not yet been promoted because the
        # loading task has not finished.
        self.assertFalse(actors[1])
        self.assertFalse(actors[4])
        self.assertFalse(actors[6])

        # Update status, nothing should change
        _update_scheduled_actor_states(training_state=state)

        self.assertFalse(actors[1])
        self.assertFalse(actors[4])
        self.assertFalse(actors[6])

        # Set loading task status to finished, but only for first actor
        for _, (_, task) in state.pending_actors.items():
            task.ready = True
            break

        # Update status. This shouldn't raise RayXGBoostActorAvailable
        # because we still have a grace period to wait for the second
        # actor.
        _update_scheduled_actor_states(training_state=state)

        # Grace period is set through ENV.ELASTIC_RESTART_GRACE_PERIOD_S
        # Allow for some slack in test execution
        self.assertGreaterEqual(state.restart_training_at,
                                time.time() + 22)

        # The first actor should have been promoted to full actor
        self.assertTrue(actors[1])
        self.assertFalse(actors[4])
        self.assertFalse(actors[6])

        # Set loading task status to finished for all actors
        for _, (_, task) in state.pending_actors.items():
            task.ready = True

        # Update status. This should now raise RayXGBoostActorAvailable
        # immediately as there are no pending actors left to wait for.
        with self.assertRaises(RayXGBoostActorAvailable):
            _update_scheduled_actor_states(training_state=state)

        # All restarted actors should have been promoted to full actors
        self.assertTrue(actors[1])
        self.assertTrue(actors[4])
        self.assertTrue(actors[6])
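# The test above exercises xgboost_ray internals (`_maybe_schedule_new_actors`
# and `_update_scheduled_actor_states`). From a user's perspective, the same
# elastic behavior is switched on entirely through RayParams. A minimal,
# illustrative sketch (parameter values chosen for illustration only):
from xgboost_ray import RayParams

elastic_ray_params = RayParams(
    num_actors=8,
    cpus_per_actor=4,
    elastic_training=True,   # keep training if some actors die
    max_failed_actors=1,     # tolerate at most one permanently failed actor
    max_actor_restarts=2,    # try to restart each failed actor up to twice
)

# Training then proceeds as usual; restarted actors rejoin at iteration
# boundaries once their data loading has finished, e.g.:
# bst = train(params, RayDMatrix(data, label="labels"),
#             num_boost_round=100, ray_params=elastic_ray_params)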
import time

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from xgboost_ray import RayParams, RayXGBClassifier

# Get data
df = pd.read_csv("winequality-red.csv", delimiter=";")
print(f"Rows, columns: {str(df.shape)}")
print(df.head())
print(df.isna().sum())

# Create Classification version of target variable
df['goodquality'] = [1 if x >= 6 else 0 for x in df['quality']]
X = df.drop(['quality', 'goodquality'], axis=1)
y = df['goodquality']

print(df['goodquality'].value_counts())

# Normalize feature variables
X_features = X
X = StandardScaler().fit_transform(X)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=0)

start = time.time()
model = RayXGBClassifier(
    # n_jobs=10,  # In XGBoost-Ray, n_jobs sets the number of actors
    random_state=1)
model.fit(X=X_train, y=y_train, ray_params=RayParams(num_actors=3))
print(f"executed XGBoost in {time.time() - start}")

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
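# A short, illustrative follow-up: prediction can also be distributed across
# actors. This assumes that RayXGBClassifier.predict() accepts a `ray_params`
# argument analogous to fit(); check the xgboost_ray sklearn API for the
# version you are running.
y_pred_distributed = model.predict(X_test, ray_params=RayParams(num_actors=3))
print(classification_report(y_test, y_pred_distributed))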
def train_ray(path,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              smoke_test=False,
              ray_params=None,
              xgboost_params=None,
              **kwargs):
    if num_files:
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)
    else:
        dtrain = RayDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)

    config = xgboost_params or {
        "tree_method": "hist" if not use_gpu else "gpu_hist"
    }

    if not regression:
        # Classification
        config.update({
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        })
    else:
        # Regression
        config.update({
            "objective": "reg:squarederror",
            "eval_metric": ["logloss", "rmse"],
        })

    start = time.time()
    evals_result = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=4 if not smoke_test else 1,
            gpus_per_actor=0 if not use_gpu else 1),
        evals=[(dtrain, "train")],
        **kwargs)
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
    return bst, taken
# The dataset has to be downloaded onto the cluster, which may take a few
# minutes.

# standard XGBoost config for classification
config = {
    "tree_method": "approx",
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
}

bst, evals_result = train_xgboost(
    config,
    train_df,
    eval_df,
    LABEL_COLUMN,
    RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),
)
print(f"Results: {evals_result}")

###############################################################################
# Hyperparameter optimization
# ---------------------------
# If we are not content with the results obtained with default XGBoost
# parameters, we can use `Ray Tune
# <https://docs.ray.io/en/latest/tune/index.html>`_ for cutting-edge
# distributed hyperparameter tuning. XGBoost-Ray automatically integrates
# with Ray Tune, meaning we can use the same training function as before.
#
# In this workflow, we will tune three hyperparameters - ``eta``, ``subsample``
# and ``max_depth``. We are using `Tune's samplers to define the search
# space <https://docs.ray.io/en/latest/tune/user-guide.html#search-space-grid-random>`_.
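###############################################################################
# A minimal sketch of such a search space (values are illustrative, not
# necessarily the ones used later in this tutorial). Each hyperparameter is
# drawn from a Tune sampler, and Tune passes the sampled values to the
# training function as part of its ``config`` dict.

from ray import tune

sketch_search_space = {
    "eta": tune.loguniform(1e-4, 1e-1),   # learning rate, log-uniform
    "subsample": tune.uniform(0.5, 1.0),  # row subsampling ratio
    "max_depth": tune.randint(1, 9),      # tree depth; upper bound exclusive
}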
        num_files=128,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=config,
    )


if __name__ == "__main__":
    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray.init(address="auto")

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=32,
        cpus_per_actor=1,
        gpus_per_actor=0)

    analysis = tune.run(
        tune.with_parameters(train_wrapper, ray_params=ray_params),
        config=search_space,
        num_samples=4,
        resources_per_trial=ray_params.get_tune_resources())

    print("PASSED.")
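    # Illustrative follow-up (not part of the release check above): the best
    # hyperparameters can be read back from the returned analysis object. The
    # metric name assumes xgboost_ray's automatic reporting of the "train"
    # eval set to Tune ("train-error"); adjust it to whatever metrics your
    # training function actually reports.
    best_config = analysis.get_best_config(metric="train-error", mode="min")
    print(f"Best config: {best_config}")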
import ray

from unittest.mock import patch

from xgboost_ray import RayParams
# The original `_train` implementation; it is wrapped (and patched) below so
# that actor counts can be recorded on every (re)start of the training loop.
from xgboost_ray.main import _train as unmocked_train

from _train import train_ray
from ft_small_non_elastic import FailureState, FailureInjection, \
    TrackingCallback

if __name__ == "__main__":
    ray.init(address="auto")

    from xgboost_ray.main import logger
    logger.setLevel(10)

    failure_state = FailureState.remote()

    ray_params = RayParams(
        elastic_training=True,
        max_failed_actors=2,
        max_actor_restarts=3,
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=0)

    world_sizes = []
    start_actors = []

    def _mock_train(*args, _training_state, **kwargs):
        world_sizes.append(len([a for a in _training_state.actors if a]))
        start_actors.append(len(_training_state.failed_actor_ranks))

        return unmocked_train(*args, _training_state=_training_state, **kwargs)

    with patch("xgboost_ray.main._train") as mocked:
        mocked.side_effect = _mock_train