Example #1
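This example assumes Ray AIR-era imports along the following lines; the module paths follow Ray 2.x and are assumptions, prepare_data() is defined elsewhere in the source script (assumed to return train, validation, and test Datasets), and the cuML import is optional:

from sklearn.ensemble import RandomForestClassifier

from ray.air.result import Result
from ray.data.preprocessors import Chain, OrdinalEncoder, StandardScaler
from ray.train.sklearn import SklearnTrainer

try:
    # Optional GPU-backed estimator; stays None when cuML is not installed.
    from cuml.ensemble import RandomForestClassifier as cuMLRandomForestClassifier
except ImportError:
    cuMLRandomForestClassifier = None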
def train_sklearn(num_cpus: int, use_gpu: bool = False) -> Result:
    if use_gpu and not cuMLRandomForestClassifier:
        raise RuntimeError(
            "cuML must be installed for GPU-enabled sklearn estimators.")

    train_dataset, valid_dataset, _ = prepare_data()

    # Scale a couple of the numeric feature columns.
    columns_to_scale = ["mean radius", "mean texture"]
    preprocessor = Chain(OrdinalEncoder(["categorical_column"]),
                         StandardScaler(columns=columns_to_scale))

    if use_gpu:
        trainer_resources = {"CPU": 1, "GPU": 1}
        estimator = cuMLRandomForestClassifier()
    else:
        trainer_resources = {"CPU": num_cpus}
        estimator = RandomForestClassifier()

    trainer = SklearnTrainer(
        estimator=estimator,
        label_column="target",
        datasets={
            "train": train_dataset,
            "valid": valid_dataset
        },
        preprocessor=preprocessor,
        cv=5,
        scaling_config={
            "trainer_resources": trainer_resources,
        },
    )
    result = trainer.fit()
    print(result.metrics)

    return result
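In the source script this function is driven from a small command-line entry point; a minimal invocation sketch (the argument values here are illustrative only):

if __name__ == "__main__":
    train_sklearn(num_cpus=2, use_gpu=False)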
Example #2
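This excerpt opens partway through the train-loop factory. The __main__ block below assumes roughly the following imports; DummyTrainer is Ray AIR's ingest-debugging trainer, and its module path here is an assumption based on Ray 2.x:

import ray
from ray.air.util.check_ingest import DummyTrainer
from ray.data.preprocessors import BatchMapper, Chain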
            )

            if rank == 0:
                print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))

        return train_loop_per_worker


if __name__ == "__main__":
    # Generate a synthetic dataset of ~10GiB of int64 tensor data. The dataset is
    # sharded into 100 blocks (parallelism=100).
    dataset = ray.data.range_tensor(50000, shape=(80, 80, 4), parallelism=100)

    # An example preprocessor chain that just scales all values by 4.0 in two stages.
    preprocessor = Chain(
        BatchMapper(lambda df: df * 2),
        BatchMapper(lambda df: df * 2),
    )

    # Set up and run a dummy trainer that reads the dataset and prints ingest stats.
    trainer = DummyTrainer(
        scaling_config={"num_workers": 1, "use_gpu": False},
        datasets={"train": dataset},
        preprocessor=preprocessor,
        runtime_seconds=30,  # Stop after this amount of time, or once 1 epoch is read.
        prefetch_blocks=1,  # Number of blocks to prefetch when reading data.
        batch_size=None,
    )
    trainer.fit()

    # Print memory stats (you can also use "ray memory --stats-only" to monitor this
    # during the run).
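A hedged sketch of how those stats can then be printed (memory_summary is a private Ray helper, so its location and signature are assumptions and may change between versions):

    try:
        print(
            "Memory stats at end of ingest:\n\n{}".format(
                ray._private.internal_api.memory_summary(stats_only=True)
            )
        )
    except Exception:
        print("Error getting Ray memory stats.")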
Example #3
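The test assumes the usual pandas and Ray preprocessor imports:

import pandas as pd

import ray
from ray.data.preprocessors import (
    Chain,
    LabelEncoder,
    SimpleImputer,
    StandardScaler,
)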
def test_chain():
    """Tests basic Chain functionality."""
    col_a = [-1, -1, 1, 1]
    col_b = [1, 1, 1, None]
    col_c = ["sunday", "monday", "tuesday", "tuesday"]
    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
    ds = ray.data.from_pandas(in_df)

    imputer = SimpleImputer(["B"])
    scaler = StandardScaler(["A", "B"])
    encoder = LabelEncoder("C")
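    # Chain fits the preprocessors in order, each on the output of the previous
    # one. Column "B" has zero variance, so the scaler maps it to all zeros and
    # the imputer (fit after the scaler) learns mean(B) == 0.0.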
    chain = Chain(scaler, imputer, encoder)

    # Fit data.
    chain.fit(ds)
    assert imputer.stats_ == {
        "mean(B)": 0.0,
    }
    assert scaler.stats_ == {
        "mean(A)": 0.0,
        "mean(B)": 1.0,
        "std(A)": 1.0,
        "std(B)": 0.0,
    }
    assert encoder.stats_ == {
        "unique_values(C)": {
            "monday": 0,
            "sunday": 1,
            "tuesday": 2
        }
    }

    # Transform data.
    transformed = chain.transform(ds)
    out_df = transformed.to_pandas()

    processed_col_a = [-1.0, -1.0, 1.0, 1.0]
    processed_col_b = [0.0, 0.0, 0.0, 0.0]
    processed_col_c = [1, 0, 2, 2]
    expected_df = pd.DataFrame.from_dict({
        "A": processed_col_a,
        "B": processed_col_b,
        "C": processed_col_c
    })

    assert out_df.equals(expected_df)

    # Transform batch.
    pred_col_a = [1, 2, None]
    pred_col_b = [0, None, 2]
    pred_col_c = ["monday", "tuesday", "wednesday"]
    pred_in_df = pd.DataFrame.from_dict({
        "A": pred_col_a,
        "B": pred_col_b,
        "C": pred_col_c
    })

    pred_out_df = chain.transform_batch(pred_in_df)
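    # "A" keeps its missing value because the imputer only covers "B"; the label
    # "wednesday" was unseen during fit, so the encoder maps it to None.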

    pred_processed_col_a = [1, 2, None]
    pred_processed_col_b = [-1.0, 0.0, 1.0]
    pred_processed_col_c = [0, 2, None]
    pred_expected_df = pd.DataFrame.from_dict({
        "A": pred_processed_col_a,
        "B": pred_processed_col_b,
        "C": pred_processed_col_c,
    })

    assert pred_out_df.equals(pred_expected_df)