def test_chain():
    """Check fit, dataset transform and batch transform of a Chain.

    The chain is built as StandardScaler(A, B) -> SimpleImputer(B) ->
    LabelEncoder(C) and is exercised on a small four-row dataset.
    """
    source_df = pd.DataFrame(
        {
            "A": [-1, -1, 1, 1],
            "B": [1, 1, 1, None],
            "C": ["sunday", "monday", "tuesday", "tuesday"],
        }
    )
    ds = ray.data.from_pandas(source_df)

    imputer = SimpleImputer(["B"])
    scaler = StandardScaler(["A", "B"])
    encoder = LabelEncoder("C")
    chain = Chain(scaler, imputer, encoder)

    # Fitting the chain should populate the stats of every member.
    chain.fit(ds)

    # NOTE(review): the imputer's mean(B) is 0.0 while the scaler's is 1.0,
    # presumably because the imputer is fitted on already-scaled data.
    expected_imputer_stats = {"mean(B)": 0.0}
    expected_scaler_stats = {
        "mean(A)": 0.0,
        "mean(B)": 1.0,
        "std(A)": 1.0,
        "std(B)": 0.0,
    }
    expected_encoder_stats = {
        "unique_values(C)": {"monday": 0, "sunday": 1, "tuesday": 2}
    }
    assert imputer.stats_ == expected_imputer_stats
    assert scaler.stats_ == expected_scaler_stats
    assert encoder.stats_ == expected_encoder_stats

    # Run the fitted chain over the dataset it was fitted on.
    result_df = chain.transform(ds).to_pandas()
    want_df = pd.DataFrame(
        {
            "A": [-1.0, -1.0, 1.0, 1.0],
            "B": [0.0, 0.0, 0.0, 0.0],
            "C": [1, 0, 2, 2],
        }
    )
    assert result_df.equals(want_df)

    # Run the fitted chain over an in-memory pandas batch that contains
    # missing values and a label ("wednesday") not seen during fit.
    batch_df = pd.DataFrame(
        {
            "A": [1, 2, None],
            "B": [0, None, 2],
            "C": ["monday", "tuesday", "wednesday"],
        }
    )
    batch_result_df = chain.transform_batch(batch_df)
    batch_want_df = pd.DataFrame(
        {
            "A": [1, 2, None],
            "B": [-1.0, 0.0, 1.0],
            "C": [0, 2, None],
        }
    )
    assert batch_result_df.equals(batch_want_df)
def test_label_encoder():
    """Check LabelEncoder fit, transform, batch transform and null handling.

    Only column "A" is encoded; columns "B" and "C" are expected to pass
    through unchanged.
    """
    warmth = ["warm", "cold", "cold", "hot"]
    numbers = [1, 2, 3, 4]
    source_df = pd.DataFrame(
        {
            "A": ["red", "green", "blue", "red"],
            "B": warmth,
            "C": numbers,
        }
    )
    ds = ray.data.from_pandas(source_df)

    encoder = LabelEncoder("A")

    # Transforming before fitting must be rejected.
    with pytest.raises(PreprocessorNotFittedException):
        encoder.transform(ds)

    # Fitting records an index for each distinct value of "A".
    encoder.fit(ds)
    expected_stats = {"unique_values(A)": {"blue": 0, "green": 1, "red": 2}}
    assert encoder.stats_ == expected_stats

    # Dataset transform: "A" is replaced by its indices, the rest is kept.
    result_df = encoder.transform(ds).to_pandas()
    want_df = pd.DataFrame(
        {
            "A": [2, 1, 0, 2],
            "B": warmth,
            "C": numbers,
        }
    )
    assert result_df.equals(want_df)

    # Batch transform: the label "yellow" was not seen during fit and is
    # expected to come out as a missing value.
    batch_b = ["cold", "unknown", None]
    batch_c = [10, 20, None]
    batch_df = pd.DataFrame(
        {
            "A": ["blue", "red", "yellow"],
            "B": batch_b,
            "C": batch_c,
        }
    )
    batch_result_df = encoder.transform_batch(batch_df)
    batch_want_df = pd.DataFrame(
        {
            "A": [0, 2, None],
            "B": batch_b,
            "C": batch_c,
        }
    )
    assert batch_result_df.equals(batch_want_df)

    # Null handling: a null in the encoded column must raise ValueError at
    # every stage, while the same operation on null-free data must succeed.
    null_df = pd.DataFrame({"A": [1, None]})
    null_ds = ray.data.from_pandas(null_df)
    nonnull_df = pd.DataFrame({"A": [1, 1]})
    nonnull_ds = ray.data.from_pandas(nonnull_df)
    null_encoder = LabelEncoder("A")

    # fit
    with pytest.raises(ValueError):
        null_encoder.fit(null_ds)
    null_encoder.fit(nonnull_ds)

    # transform
    with pytest.raises(ValueError):
        null_encoder.transform(null_ds)
    null_encoder.transform(nonnull_ds)

    # transform_batch
    with pytest.raises(ValueError):
        null_encoder.transform_batch(null_df)
    null_encoder.transform_batch(nonnull_df)