def test_checkpoint_freq(ray_start_4_cpus, freq_end_expected):
    freq, end, expected = freq_end_expected
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = XGBoostTrainer(
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_frequency=freq, checkpoint_at_end=end
            )
        ),
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=25,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()

    # Assert number of checkpoints
    assert len(result.best_checkpoints) == expected, str(
        [
            (metrics["training_iteration"], _cp._local_path)
            for _cp, metrics in result.best_checkpoints
        ]
    )

    # Assert checkpoint numbers are increasing
    cp_paths = [cp._local_path for cp, _ in result.best_checkpoints]
    assert cp_paths == sorted(cp_paths), str(cp_paths)
def test_tune(ray_start_4_cpus):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params={**params, **{"max_depth": 1}},
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )

    tune.run(
        trainer.as_trainable(),
        config={"params": {"max_depth": tune.randint(2, 4)}},
        num_samples=2,
    )

    # Make sure original Trainer is not affected.
    assert trainer.params["max_depth"] == 1
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert get_num_trees(model) == 10
    assert preprocessor.is_same
    assert preprocessor.fitted_
def test_fit(ray_start_4_cpus):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    trainer.fit()
def test_validation(ray_start_4_cpus):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    with pytest.raises(KeyError, match=TRAIN_DATASET_KEY):
        XGBoostTrainer(
            scaling_config={"num_workers": 2},
            label_column="target",
            params=params,
            datasets={"valid": valid_dataset},
        )
    with pytest.raises(KeyError, match="dmatrix_params"):
        XGBoostTrainer(
            scaling_config={"num_workers": 2},
            label_column="target",
            params=params,
            dmatrix_params={"data": {}},
            datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        )
def run_xgboost_training(data_path: str, num_workers: int):
    ds = data.read_parquet(data_path)
    params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    trainer = XGBoostTrainer(
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            resources_per_worker={"CPU": 12},
        ),
        label_column="labels",
        params=params,
        datasets={"train": ds},
    )
    result = trainer.fit()
    checkpoint = XGBoostCheckpoint.from_checkpoint(result.checkpoint)
    xgboost_model = checkpoint.get_model()
    xgboost_model.save_model(_XGB_MODEL_PATH)
    ray.shutdown()
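A minimal sketch of how `run_xgboost_training` could be invoked; the Parquet path and worker count below are illustrative assumptions, not values from the original snippet.

# Hypothetical driver code; the data path and num_workers are assumptions.
if __name__ == "__main__":
    run_xgboost_training("s3://my-bucket/train_data.parquet", num_workers=4)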
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    xgb_model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(xgb_model) == 5

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 10
def test_tuner_with_xgboost_trainer(self):
    """Test a successful run."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"), ignore_errors=True
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        "datasets": {
            "train": tune.grid_search(
                [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            ),
        },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
        # Limit the number of trials running at one time, since the unit test
        # only has access to 4 CPUs on Buildkite.
        _tuner_kwargs={"max_concurrent_trials": 1},
    )
    results = tuner.fit()
    assert not isinstance(results.get_best_result().checkpoint, TrialCheckpoint)
    assert len(results) == 4
params = {
    "tree_method": "approx",
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
}

num_workers = 2
use_gpu = False  # use GPUs if detected.

trainer = XGBoostTrainer(
    scaling_config={
        "num_workers": num_workers,
        "use_gpu": use_gpu,
    },
    label_column="target",
    params=params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    num_boost_round=20,
)
result = trainer.fit()
print(result.metrics)
# __air_xgb_train_end__

# __air_xgb_batchpred_start__
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor

batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor)
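# A hedged sketch of how the batch predictor is typically used next:
# `test_dataset` is assumed to be a Ray Dataset of unlabeled feature rows and
# is not defined in this snippet.
predicted_probabilities = batch_predictor.predict(test_dataset)
predicted_probabilities.show()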
train_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(0, 32, 3)])
valid_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(1, 32, 3)])

preprocessor = MinMaxScaler(["x"])

trainer = XGBoostTrainer(
    label_column="y",
    params={"objective": "reg:squarederror"},
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
)
result = trainer.fit()
# __trainer_end__

# __checkpoint_start__
import os

import ray.cloudpickle as cpickle
from ray.air.constants import PREPROCESSOR_KEY

checkpoint = result.checkpoint
with checkpoint.as_directory() as checkpoint_path:
    path = os.path.join(checkpoint_path, PREPROCESSOR_KEY)
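    # Hedged sketch of a likely continuation: unpickle the fitted preprocessor
    # from the checkpoint directory. The `loaded_preprocessor` name is an
    # illustrative assumption, not part of the original snippet.
    with open(path, "rb") as f:
        loaded_preprocessor = cpickle.load(f)
    print(loaded_preprocessor)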
def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
    # So that we have some global checkpointing happening.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
        ignore_errors=True,
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        "datasets": {
            "train": tune.grid_search(
                [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            ),
        },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }

    class FailureInjectionCallback(Callback):
        """Inject failure at the configured iteration number."""

        def __init__(self, num_iters=10):
            self.num_iters = num_iters

        def on_step_end(self, iteration, trials, **kwargs):
            if iteration == self.num_iters:
                print(f"Failing after {self.num_iters} iters.")
                raise RuntimeError

    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(
            name="test_tuner_driver_fail", callbacks=[FailureInjectionCallback()]
        ),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
        # Limit the number of trials running at one time, since the unit test
        # only has access to 4 CPUs on Buildkite.
        _tuner_kwargs={"max_concurrent_trials": 1},
    )
    with self.assertRaises(TuneError):
        tuner.fit()

    # Test resume
    restore_path = os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail")
    tuner = Tuner.restore(restore_path)
    # A hack before we figure out RunConfig semantics across resumes.
    tuner._local_tuner._run_config.callbacks = None
    results = tuner.fit()
    assert len(results) == 4
# Split data into train and validation.
dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")
train_dataset, valid_dataset = train_test_split(dataset, test_size=0.3)
test_dataset = valid_dataset.drop_columns(["target"])

columns_to_scale = ["mean radius", "mean texture"]
preprocessor = StandardScaler(columns=columns_to_scale)

trainer = XGBoostTrainer(
    label_column="target",
    num_boost_round=20,
    scaling_config=ScalingConfig(num_workers=2),
    params={
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    },
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)
result = trainer.fit()

# You can also create a checkpoint from a trained model using
# `XGBoostCheckpoint.from_model`.
# import xgboost as xgb
# from ray.train.xgboost import XGBoostCheckpoint
# model = xgb.Booster()
# model.load_model(...)
# checkpoint = XGBoostCheckpoint.from_model(model, path=".")
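For completeness, a hedged sketch of reading the trained booster back out of the result checkpoint, mirroring the `XGBoostCheckpoint.from_checkpoint` usage shown earlier in this section; the variable names are illustrative.

from ray.train.xgboost import XGBoostCheckpoint

# Wrap the generic result checkpoint and pull out the trained xgboost.Booster.
xgb_checkpoint = XGBoostCheckpoint.from_checkpoint(result.checkpoint)
booster = xgb_checkpoint.get_model()
print(booster)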
# __air_generic_preprocess_end__

# __air_xgb_train_start__
from ray.train.xgboost import XGBoostTrainer
from ray.air.config import ScalingConfig

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(
        # Number of workers to use for data parallelism.
        num_workers=2,
        # Whether to use GPU acceleration.
        use_gpu=False,
    ),
    label_column="target",
    num_boost_round=20,
    params={
        # XGBoost specific params
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    },
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
)
result = trainer.fit()
print(result.metrics)
# __air_xgb_train_end__

# __air_xgb_tuner_start__
from ray import tune
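A hedged sketch of a tuning step that could follow the `tune` import above, reusing the `Tuner` and `TuneConfig` pattern from the tests earlier in this section; the search space, sample count, and metric name are illustrative assumptions.

# Illustrative search over two XGBoost parameters; values are assumptions.
tuner = tune.Tuner(
    trainer,
    param_space={
        "params": {
            "max_depth": tune.randint(2, 8),
            "eta": tune.loguniform(1e-4, 1e-1),
        }
    },
    tune_config=tune.TuneConfig(num_samples=4, metric="train-error", mode="min"),
)
tuning_results = tuner.fit()
print(tuning_results.get_best_result().metrics)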
use_gpu = False

# XGBoost specific params
params = {
    "tree_method": "approx",
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    "max_depth": 2,
}

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        use_gpu=use_gpu,
    ),
    label_column="target",
    params=params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    num_boost_round=5,
)
result = trainer.fit()
# __air_trainer_end__

# __air_trainer_output_start__
print(result.metrics)
print(result.checkpoint)
# __air_trainer_output_end__
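The reported metrics follow XGBoost's `<dataset>-<metric>` naming, so a single value can be pulled out of `result.metrics` as sketched below; the `"train-error"` key is inferred from the `eval_metric` list above and the metric name used by the tuner tests earlier in this section.

# Hedged sketch: read one metric out of the training result.
final_train_error = result.metrics["train-error"]
print(f"Final train error: {final_train_error:.4f}")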