def get_datasets(a=5, b=10, size=1000, split=0.8) -> Tuple[Dataset, Dataset]:
    """Build a synthetic linear dataset and split it into two datasets.

    Each row has the form ``{"x": x, "y": a * x + b}``, where ``x`` takes
    the values ``i / size`` for ``i`` in ``range(size)``.

    Args:
        a: Slope of the generated line.
        b: Intercept of the generated line.
        size: Number of rows to generate.
        split: Value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        The ``(train_dataset, validation_dataset)`` pair returned by
        ``train_test_split`` with shuffling enabled.
    """

    def get_dataset(a, b, size) -> Dataset:
        """Return a Ray dataset of ``size`` points on ``y = a * x + b``."""
        items = [i / size for i in range(size)]
        dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items])
        return dataset

    dataset = get_dataset(a, b, size)

    # NOTE(review): `split` is passed as `test_size`, so with the default of
    # 0.8 the *validation* dataset presumably receives 80% of the rows and
    # the training dataset 20%. Confirm this is intended; if `split` is meant
    # to be the training fraction, the argument needs to be inverted.
    train_dataset, validation_dataset = train_test_split(
        dataset, test_size=split, shuffle=True
    )
    return train_dataset, validation_dataset
import ray
from ray.air import train_test_split
from ray.air.config import ScalingConfig
from ray.data.preprocessors import StandardScaler
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor, XGBoostTrainer

# Load the example CSV and split it into train and validation datasets
# (test_size=0.3).
dataset = ray.data.read_csv(
    "s3://anonymous@air-example-data/breast_cancer.csv"
)
train_dataset, valid_dataset = train_test_split(dataset, test_size=0.3)

# The validation split with the "target" column dropped.
test_dataset = valid_dataset.drop_columns(["target"])

# Standard-scale only the listed feature columns.
columns_to_scale = ["mean radius", "mean texture"]
preprocessor = StandardScaler(columns=columns_to_scale)

# Keep the XGBoost settings in named variables so the trainer call stays short.
xgboost_params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
}
scaling_config = ScalingConfig(num_workers=2)

trainer = XGBoostTrainer(
    datasets={"train": train_dataset},
    label_column="target",
    preprocessor=preprocessor,
    params=xgboost_params,
    num_boost_round=20,
    scaling_config=scaling_config,
)
result = trainer.fit()

# You can also create a checkpoint from a trained model using
# `XGBoostCheckpoint.from_model`.
def test_train_test_split(ray_start_4_cpus):
    """Check `train_test_split` sizes, ordering, shuffling and validation."""
    ds = ray.data.range(8)

    # A float fraction and the equivalent absolute row count must yield the
    # same unshuffled split: the test set is taken from the end.
    for requested_size in (0.25, 2):
        train, test = train_test_split(ds, test_size=requested_size)
        assert train.take() == [0, 1, 2, 3, 4, 5]
        assert test.take() == [6, 7]

    # With a fixed seed the shuffled split is deterministic.
    train, test = train_test_split(ds, test_size=0.25, shuffle=True, seed=1)
    assert train.take() == [5, 7, 6, 3, 0, 4]
    assert test.take() == [2, 1]

    # Invalid `test_size` values and the exception each one must raise.
    rejected_sizes = [
        ([1], TypeError),
        (-1, ValueError),
        (0, ValueError),
        (1.1, ValueError),
        (9, ValueError),
    ]
    for bad_size, expected_error in rejected_sizes:
        with pytest.raises(expected_error):
            train_test_split(ds, test_size=bad_size)