Ejemplo n.º 1
0
def get_datasets(a=5, b=10, size=1000, split=0.8) -> Tuple[Dataset, Dataset]:
    """Build a synthetic linear dataset and split it into train/validation.

    Generates ``size`` rows of the form ``{"x": x, "y": a * x + b}``, where
    ``x`` takes the values ``i / size`` for ``i`` in ``range(size)``, then
    shuffles the rows and splits them with ``train_test_split``.

    Args:
        a: Slope of the generated line.
        b: Intercept of the generated line.
        size: Number of rows to generate.
        split: Fraction of the rows assigned to the training dataset, as a
            float between 0 and 1 (exclusive). The remaining rows form the
            validation dataset.

    Returns:
        A ``(train_dataset, validation_dataset)`` pair.
    """
    xs = [i / size for i in range(size)]
    dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in xs])

    # ``train_test_split`` expects the size of the *test* portion, while
    # ``split`` is the training fraction, so the complement is passed.
    # Passing ``split`` straight through would leave only ``1 - split`` of
    # the rows for training.
    train_dataset, validation_dataset = train_test_split(
        dataset, test_size=1 - split, shuffle=True
    )
    return train_dataset, validation_dataset
Ejemplo n.º 2
0
import ray
from ray.data.preprocessors import StandardScaler
from ray.air import train_test_split
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostTrainer, XGBoostPredictor
from ray.air.config import ScalingConfig

# Load the example CSV and split it into train and validation sets;
# test_size=0.3 is the share requested for the held-out (validation) split.
dataset = ray.data.read_csv(
    "s3://anonymous@air-example-data/breast_cancer.csv")
train_dataset, valid_dataset = train_test_split(dataset, test_size=0.3)
# Held-out split with the "target" column dropped ("target" is the label
# column given to the trainer below). Not used in the visible part of this
# snippet.
test_dataset = valid_dataset.drop_columns(["target"])

# The StandardScaler preprocessor is configured for these two columns only.
columns_to_scale = ["mean radius", "mean texture"]
preprocessor = StandardScaler(columns=columns_to_scale)

# Configure XGBoost training with two workers. Only the train split is
# registered under `datasets`; `valid_dataset` is not passed to the trainer.
trainer = XGBoostTrainer(
    label_column="target",
    num_boost_round=20,
    scaling_config=ScalingConfig(num_workers=2),
    params={
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    },
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)
# Run training and keep whatever `fit()` returns.
# NOTE(review): presumably this result exposes metrics and a checkpoint --
# confirm against the Ray version in use.
result = trainer.fit()

# You can also create a checkpoint from a trained model using
# `XGBoostCheckpoint.from_model`.
Ejemplo n.º 3
0
def test_train_test_split(ray_start_4_cpus):
    """Exercise ``train_test_split`` on an 8-row range dataset.

    Covers a fractional test size, an absolute test size, a seeded shuffle,
    and the errors raised for invalid ``test_size`` values.
    """
    ds = ray.data.range(8)

    # A float (fraction) and then an int (row count) test size must both
    # keep the first six rows for training and the last two for testing.
    for size in (0.25, 2):
        head, tail = train_test_split(ds, test_size=size)
        assert head.take() == [0, 1, 2, 3, 4, 5]
        assert tail.take() == [6, 7]

    # With shuffling and a fixed seed the row order is pinned.
    head, tail = train_test_split(ds, test_size=0.25, shuffle=True, seed=1)
    assert head.take() == [5, 7, 6, 3, 0, 4]
    assert tail.take() == [2, 1]

    # A non-numeric test size is rejected with TypeError.
    with pytest.raises(TypeError):
        train_test_split(ds, test_size=[1])

    # Negative, zero, fraction above one, and a count larger than the
    # dataset are each rejected with ValueError.
    for invalid_size in (-1, 0, 1.1, 9):
        with pytest.raises(ValueError):
            train_test_split(ds, test_size=invalid_size)