Example 1
from collections import Counter

import numpy as np
import pandas as pd

from fklearn.training.regression import xgb_regression_learner


def test_xgb_regression_learner():
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0],
        "x2": [0, 1, 1, 0],
        "w": [2, 1, 2, 0.5],
        'y': [2.3, 4.0, 100.0, -3.9]
    })

    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0],
        "x2": [1, 1, 0, 1],
        "w": [1, 2, 0, 0.5],
        'y': [1.3, -4.0, 0.0, 49]
    })

    features = ["x1", "x2"]

    learner = xgb_regression_learner(features=features,
                                     target="y",
                                     learning_rate=0.1,
                                     num_estimators=20,
                                     extra_params={
                                         "max_depth": 2,
                                         "seed": 42
                                     },
                                     prediction_column="prediction",
                                     weight_column="w")

    # The fitted learner returns a prediction function, the scored training set and a log.
    predict_fn, pred_train, log = learner(df_train)

    pred_test = predict_fn(df_test)

    expected_col_train = df_train.columns.tolist() + ["prediction"]
    expected_col_test = df_test.columns.tolist() + ["prediction"]

    assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
    assert (pred_test.columns == pred_train.columns).all()
    assert "prediction" in pred_test.columns

    # SHAP test: apply_shap=True appends per-row SHAP values and the expected value
    pred_shap = predict_fn(df_test, apply_shap=True)
    assert "shap_values" in pred_shap.columns
    assert "shap_expected_value" in pred_shap.columns
    assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)
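The test above relies on fklearn's curried-learner pattern: xgb_regression_learner is configured with its hyperparameters first and only trains when it receives a DataFrame, returning the prediction function, the scored training set and a log. Below is a minimal usage sketch outside the test, assuming fklearn's usual module layout and a made-up toy dataset; the variable names are illustrative only.

import pandas as pd

from fklearn.training.regression import xgb_regression_learner

# Hypothetical toy data, just to illustrate the call pattern.
train = pd.DataFrame({"x1": [10.0, 13.0, 10.0, 13.0],
                      "x2": [0, 1, 1, 0],
                      "y": [2.3, 4.0, 100.0, -3.9]})
new_data = pd.DataFrame({"x1": [12.0], "x2": [1]})

# Configure the learner; training only happens when it is called with a DataFrame.
learner = xgb_regression_learner(features=["x1", "x2"],
                                 target="y",
                                 learning_rate=0.1,
                                 num_estimators=20,
                                 extra_params={"max_depth": 2, "seed": 42})

predict_fn, scored_train, log = learner(train)

# predict_fn appends a "prediction" column to any DataFrame with the same features.
print(predict_fn(new_data)["prediction"])

Because predict_fn closes over the fitted model, it can be applied later to any batch that carries the same feature columns, which is exactly what the test does with df_test.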
Example 2
import pandas as pd
import pytest

from fklearn.training.imputation import placeholder_imputer
from fklearn.training.pipeline import build_pipeline
from fklearn.training.regression import xgb_regression_learner
from fklearn.training.transformation import count_categorizer


@pytest.mark.parametrize("has_repeated_learners", [True, False])
def test_build_pipeline(has_repeated_learners):
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        'y': [2.3, 4.0, 100.0, -3.9, 100.0, -3.9]
    })

    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        'y': [1.3, -4.0, 0.0, 49, 0.0, 49]
    })

    features = ["x1", "x2", "cat"]
    target = "y"

    # Compose imputation, count-encoding and the XGBoost learner into a single train function.
    train_fn = build_pipeline(placeholder_imputer(columns_to_impute=features,
                                                  placeholder_value=-999),
                              count_categorizer(columns_to_categorize=["cat"]),
                              xgb_regression_learner(features=features,
                                                     target=target,
                                                     num_estimators=20,
                                                     extra_params={"seed": 42}),
                              has_repeated_learners=has_repeated_learners)

    predict_fn, pred_train, log = train_fn(df_train)

    pred_test_with_shap = predict_fn(df_test, apply_shap=True)
    assert set(pred_test_with_shap.columns) - set(pred_train.columns) == {
        "shap_values", "shap_expected_value"
    }

    pred_test_without_shap = predict_fn(df_test)
    assert set(pred_test_without_shap.columns) == set(pred_train.columns)

    # The SHAP run must not change the regular prediction columns.
    pd.testing.assert_frame_equal(
        pred_test_with_shap[pred_test_without_shap.columns],
        pred_test_without_shap)
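build_pipeline chains the curried steps so that the returned predict_fn replays every preprocessing step (here imputation and count-encoding) before the model scores new data, while the log aggregates what each step recorded. A minimal sketch of the same composition outside pytest, again assuming fklearn's usual module layout and hypothetical toy data:

import pandas as pd

from fklearn.training.imputation import placeholder_imputer
from fklearn.training.pipeline import build_pipeline
from fklearn.training.regression import xgb_regression_learner
from fklearn.training.transformation import count_categorizer

# Hypothetical toy data with a missing value in each feature.
train = pd.DataFrame({"x1": [10.0, 13.0, None, 13.0],
                      "cat": ["c1", "c2", "c2", None],
                      "y": [2.3, 4.0, 100.0, -3.9]})

features = ["x1", "cat"]

train_fn = build_pipeline(
    placeholder_imputer(columns_to_impute=features, placeholder_value=-999),
    count_categorizer(columns_to_categorize=["cat"]),
    xgb_regression_learner(features=features, target="y",
                           num_estimators=20, extra_params={"seed": 42}))

predict_fn, scored_train, log = train_fn(train)

# The composed predict_fn re-applies imputation and count-encoding before predicting.
print(predict_fn(train)["prediction"])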
Example 3
import pandas as pd
import pytest

from fklearn.training.imputation import placeholder_imputer
from fklearn.training.pipeline import build_pipeline
from fklearn.training.regression import xgb_regression_learner
from fklearn.training.transformation import onehot_categorizer


@pytest.mark.parametrize("has_repeated_learners", [True, False])
def test_build_pipeline_with_onehotencoder(has_repeated_learners):
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        'y': [2.3, 4.0, 100.0, -3.9, 100.0, -3.9]
    })

    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        'y': [1.3, -4.0, 0.0, 49, 0.0, 49]
    })

    features = ["x1", "x2", "cat"]
    target = "y"

    # hardcode_nans=True makes onehot_categorizer emit an explicit "==nan" indicator column.
    train_fn = build_pipeline(
        placeholder_imputer(columns_to_impute=["x1", "x2"],
                            placeholder_value=-999),
        onehot_categorizer(columns_to_categorize=["cat"], hardcode_nans=True),
        xgb_regression_learner(features=features,
                               target=target,
                               num_estimators=20,
                               extra_params={"seed": 42}),
        has_repeated_learners=has_repeated_learners)

    predict_fn, pred_train, log = train_fn(df_train)

    pred_test = predict_fn(df_test)

    expected_feature_columns_after_encoding = [
        "x1", "x2", "fklearn_feat__cat==c1", "fklearn_feat__cat==c2",
        "fklearn_feat__cat==c4", "fklearn_feat__cat==nan"
    ]

    assert set(pred_test.columns) == set(
        expected_feature_columns_after_encoding + ["id", target, "prediction"])