def test_onehot_categorizer(
    df_train,
    df_test,
    columns_to_categorize,
    drop_first,
    hardcode,
    expected_output_train,
    expected_output_test,
):
    """Check ``onehot_categorizer`` output on both the fit and the unseen frame.

    The learner is built from ``columns_to_categorize``, ``hardcode`` (passed
    as ``hardcode_nans``) and ``drop_first`` (passed as ``drop_first_column``),
    fitted on ``df_train``, and the returned prediction function is applied to
    ``df_test``. Both resulting frames must match the expected frames exactly
    according to ``assert_frame_equal``.

    NOTE(review): the arguments are presumably supplied by pytest
    parametrization or fixtures declared outside this block - confirm.
    """
    learner = onehot_categorizer(
        columns_to_categorize=columns_to_categorize,
        hardcode_nans=hardcode,
        drop_first_column=drop_first,
    )

    # The learner returns (prediction function, transformed training data,
    # log); the log is not inspected by this test.
    encode, encoded_train, _ = learner(df_train)
    encoded_test = encode(df_test)

    assert_frame_equal(encoded_test, expected_output_test)
    assert_frame_equal(encoded_train, expected_output_train)
def test_build_pipeline_with_onehotencoder(has_repeated_learners):
    """Check the output columns of an impute -> one-hot -> XGBoost pipeline.

    A pipeline is built from a placeholder imputer, a one-hot categorizer with
    hardcoded NaNs, and an XGBoost regression learner. It is trained on a
    small frame and applied to a second frame. The test asserts that the
    scored frame's column set is exactly the numeric features, the one-hot
    columns for the categories present in the training data (plus the ``nan``
    column), the untouched ``id`` and target columns, and ``prediction``.
    """
    target = "y"
    features = ["x1", "x2", "cat"]

    training_frame = pd.DataFrame(
        {
            "id": ["id1", "id2", "id3", "id4", "id3", "id4"],
            "x1": [10.0, 13.0, 10.0, 13.0, None, 13.0],
            "x2": [0, 1, 1, 0, 1, 0],
            "cat": ["c1", "c1", "c2", None, "c2", "c4"],
            "y": [2.3, 4.0, 100.0, -3.9, 100.0, -3.9],
        }
    )
    scoring_frame = pd.DataFrame(
        {
            "id": ["id4", "id4", "id5", "id6", "id5", "id6"],
            "x1": [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
            "x2": [1, 1, 0, None, 0, 1],
            "cat": ["c1", "c2", "c5", None, "c2", "c3"],
            "y": [1.3, -4.0, 0.0, 49, 0.0, 49],
        }
    )

    learners = [
        placeholder_imputer(columns_to_impute=["x1", "x2"], placeholder_value=-999),
        onehot_categorizer(columns_to_categorize=["cat"], hardcode_nans=True),
        xgb_regression_learner(
            features=features,
            target=target,
            num_estimators=20,
            extra_params={"seed": 42},
        ),
    ]
    pipeline = build_pipeline(*learners, has_repeated_learners=has_repeated_learners)

    # Only the prediction function is needed; the transformed training data
    # and the log returned by the pipeline are not inspected here.
    predict, _, _ = pipeline(training_frame)
    scored = predict(scoring_frame)

    encoded_feature_columns = {
        "x1",
        "x2",
        "fklearn_feat__cat==c1",
        "fklearn_feat__cat==c2",
        "fklearn_feat__cat==c4",
        "fklearn_feat__cat==nan",
    }
    passthrough_columns = {"id", target, "prediction"}

    assert set(scored.columns) == encoded_feature_columns | passthrough_columns
def test_onehot_categorizer():
    """Check ``onehot_categorizer`` under three configurations.

    The learner is fitted on a small training frame and its prediction
    function is applied to a test frame that contains a category value
    ("nonbinary") absent from the training data. Three configurations are
    exercised in order: NaNs not hardcoded, NaNs hardcoded, and first column
    dropped. For each one, the transformed test frame and then the transformed
    training frame are compared with the expected frame via
    ``DataFrame.equals`` after selecting the expected columns, so output
    column order is not checked.
    """
    # NOTE(review): another function named ``test_onehot_categorizer`` appears
    # earlier in the visible source. If both live in the same module, this
    # later definition shadows the earlier one - confirm and rename if needed.
    categorical_columns = ["sex", "region"]

    train_input = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "sex": ["female", "male", "male", "male"],
        "region": ["SP", "RG", "MG", nan],
    })
    test_input = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "sex": ["male", "female", "male", "nonbinary"],
        "region": [nan, nan, "SP", "RG"],
    })

    # --- expected training outputs -------------------------------------
    train_no_hardcode = pd.DataFrame(OrderedDict([
        ("feat1_num", [1, 0.5, nan, 100]),
        ("fklearn_feat__sex==female", [1, 0, 0, 0]),
        ("fklearn_feat__sex==male", [0, 1, 1, 1]),
        ("fklearn_feat__region==MG", [0, 0, 1, 0]),
        ("fklearn_feat__region==RG", [0, 1, 0, 0]),
        ("fklearn_feat__region==SP", [1, 0, 0, 0]),
    ]))
    train_hardcode = pd.DataFrame(OrderedDict([
        ("feat1_num", [1, 0.5, nan, 100]),
        ("fklearn_feat__sex==female", [1, 0, 0, 0]),
        ("fklearn_feat__sex==male", [0, 1, 1, 1]),
        ("fklearn_feat__sex==nan", [0, 0, 0, 0]),
        ("fklearn_feat__region==MG", [0, 0, 1, 0]),
        ("fklearn_feat__region==RG", [0, 1, 0, 0]),
        ("fklearn_feat__region==SP", [1, 0, 0, 0]),
        ("fklearn_feat__region==nan", [0, 0, 0, 1]),
    ]))
    train_drop_first = pd.DataFrame(OrderedDict([
        ("feat1_num", [1, 0.5, nan, 100]),
        ("fklearn_feat__sex==male", [0, 1, 1, 1]),
        ("fklearn_feat__region==RG", [0, 1, 0, 0]),
        ("fklearn_feat__region==SP", [1, 0, 0, 0]),
    ]))

    # --- expected test outputs -----------------------------------------
    test_no_hardcode = pd.DataFrame(OrderedDict([
        ("feat1_num", [2, 20, 200, 2000]),
        ("fklearn_feat__sex==female", [0, 1, 0, 0]),
        ("fklearn_feat__sex==male", [1, 0, 1, 0]),
        ("fklearn_feat__region==MG", [0, 0, 0, 0]),
        ("fklearn_feat__region==RG", [0, 0, 0, 1]),
        ("fklearn_feat__region==SP", [0, 0, 1, 0]),
    ]))
    test_hardcode = pd.DataFrame(OrderedDict([
        ("feat1_num", [2, 20, 200, 2000]),
        ("fklearn_feat__sex==female", [0, 1, 0, 0]),
        ("fklearn_feat__sex==male", [1, 0, 1, 0]),
        ("fklearn_feat__sex==nan", [0, 0, 0, 1]),
        ("fklearn_feat__region==MG", [0, 0, 0, 0]),
        ("fklearn_feat__region==RG", [0, 0, 0, 1]),
        ("fklearn_feat__region==SP", [0, 0, 1, 0]),
        ("fklearn_feat__region==nan", [1, 1, 0, 0]),
    ]))
    test_drop_first = pd.DataFrame(OrderedDict([
        ("feat1_num", [2, 20, 200, 2000]),
        ("fklearn_feat__sex==male", [1, 0, 1, 0]),
        ("fklearn_feat__region==RG", [0, 0, 0, 1]),
        ("fklearn_feat__region==SP", [0, 0, 1, 0]),
    ]))

    # Each scenario: (extra learner keyword arguments, expected train frame,
    # expected test frame). Order matches: no hardcoding, hardcoding NaNs,
    # dropping the first column.
    scenarios = (
        ({"hardcode_nans": False}, train_no_hardcode, test_no_hardcode),
        ({"hardcode_nans": True}, train_hardcode, test_hardcode),
        (
            {"hardcode_nans": False, "drop_first_column": True},
            train_drop_first,
            test_drop_first,
        ),
    )

    for learner_kwargs, expected_train, expected_test in scenarios:
        learner = onehot_categorizer(
            columns_to_categorize=categorical_columns, **learner_kwargs
        )
        encode, encoded_train, _ = learner(train_input)
        encoded_test = encode(test_input)

        # Select by the expected columns first: we don't care about output order.
        assert encoded_test[expected_test.columns].equals(expected_test)
        assert encoded_train[expected_train.columns].equals(expected_train)