def train_sklearn(num_cpus: int, use_gpu: bool = False) -> Result:
    """Fit a random-forest classifier with Ray's ``SklearnTrainer``.

    Args:
        num_cpus: CPU count reserved for the trainer when running on CPU.
        use_gpu: If True, train with cuML's GPU estimator instead of
            scikit-learn's CPU one.

    Returns:
        The ``Result`` produced by ``trainer.fit()``.

    Raises:
        RuntimeError: If ``use_gpu`` is requested but cuML is not installed.
    """
    if use_gpu and not cuMLRandomForestClassifier:
        raise RuntimeError(
            "cuML must be installed for GPU enabled sklearn estimators.")

    train_dataset, valid_dataset, _ = prepare_data()

    # Ordinal-encode the categorical column, then standardize a couple of
    # numeric feature columns.
    scaled_columns = ["mean radius", "mean texture"]
    preprocessor = Chain(
        OrdinalEncoder(["categorical_column"]),
        StandardScaler(columns=scaled_columns),
    )

    # GPU runs pin the cuML estimator to one CPU + one GPU; CPU runs use the
    # scikit-learn estimator with the requested CPU count.
    if use_gpu:
        estimator = cuMLRandomForestClassifier()
        trainer_resources = {"CPU": 1, "GPU": 1}
    else:
        estimator = RandomForestClassifier()
        trainer_resources = {"CPU": num_cpus}

    trainer = SklearnTrainer(
        estimator=estimator,
        label_column="target",
        datasets={"train": train_dataset, "valid": valid_dataset},
        preprocessor=preprocessor,
        cv=5,
        scaling_config={"trainer_resources": trainer_resources},
    )

    result = trainer.fit()
    print(result.metrics)
    return result
        # NOTE(review): this chunk starts mid-function — the closing paren
        # below ends a call inside a per-worker training loop whose header is
        # not visible here; indentation is reconstructed and should be
        # confirmed against the full file.
        )
        # Only the rank-0 worker prints ingest stats, so the report is not
        # duplicated once per worker.
        if rank == 0:
            print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))

    # Presumably a factory: hand the closure back for the trainer to run on
    # each worker — verify against the enclosing definition.
    return train_loop_per_worker


if __name__ == "__main__":
    # Generate a synthetic dataset of ~10GiB of float64 data. The dataset is
    # sharded into 100 blocks (parallelism=100).
    dataset = ray.data.range_tensor(50000, shape=(80, 80, 4), parallelism=100)

    # An example preprocessor chain that just scales all values by 4.0 in two
    # stages (each BatchMapper doubles the values).
    preprocessor = Chain(
        BatchMapper(lambda df: df * 2),
        BatchMapper(lambda df: df * 2),
    )

    # Setup the dummy trainer that prints ingest stats.
    # Run and print ingest stats.
    trainer = DummyTrainer(
        scaling_config={"num_workers": 1, "use_gpu": False},
        datasets={"train": dataset},
        preprocessor=preprocessor,
        runtime_seconds=30,  # Stop after this amount of time or 1 epoch is read.
        prefetch_blocks=1,  # Number of blocks to prefetch when reading data.
        batch_size=None,
    )
    trainer.fit()

    # Print memory stats (you can also use "ray memory --stats-only" to monitor this
def test_chain():
    """Tests basic Chain functionality."""
    raw = pd.DataFrame.from_dict({
        "A": [-1, -1, 1, 1],
        "B": [1, 1, 1, None],
        "C": ["sunday", "monday", "tuesday", "tuesday"],
    })
    dataset = ray.data.from_pandas(raw)

    imputer = SimpleImputer(["B"])
    scaler = StandardScaler(["A", "B"])
    encoder = LabelEncoder("C")
    chain = Chain(scaler, imputer, encoder)

    # Fitting the chain should fit every stage, each on the output of the
    # stages before it.
    chain.fit(dataset)
    assert imputer.stats_ == {
        "mean(B)": 0.0,
    }
    assert scaler.stats_ == {
        "mean(A)": 0.0,
        "mean(B)": 1.0,
        "std(A)": 1.0,
        "std(B)": 0.0,
    }
    assert encoder.stats_ == {
        "unique_values(C)": {"monday": 0, "sunday": 1, "tuesday": 2}
    }

    # Transforming the dataset should apply all stages in order.
    out_df = chain.transform(dataset).to_pandas()
    expected_df = pd.DataFrame.from_dict({
        "A": [-1.0, -1.0, 1.0, 1.0],
        "B": [0.0, 0.0, 0.0, 0.0],
        "C": [1, 0, 2, 2],
    })
    assert out_df.equals(expected_df)

    # Batch transform works directly on a pandas DataFrame; values unseen
    # during fitting (e.g. "wednesday") come out as missing.
    pred_in_df = pd.DataFrame.from_dict({
        "A": [1, 2, None],
        "B": [0, None, 2],
        "C": ["monday", "tuesday", "wednesday"],
    })
    pred_out_df = chain.transform_batch(pred_in_df)
    pred_expected_df = pd.DataFrame.from_dict({
        "A": [1, 2, None],
        "B": [-1.0, 0.0, 1.0],
        "C": [0, 2, None],
    })
    assert pred_out_df.equals(pred_expected_df)