Example no. 1
def test_column_transformer_with_make_column_selector():
    # Functional test for column transformer + column selector
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(
        {
            'col_int': np.array([0, 1, 2], dtype=int),
            'col_float': np.array([0.0, 1.0, 2.0], dtype=float),
            'col_cat': ["one", "two", "one"],
            'col_str': ["low", "middle", "high"]
        },
        columns=['col_int', 'col_float', 'col_cat', 'col_str'])
    X_df['col_str'] = X_df['col_str'].astype('category')

    cat_selector = make_column_selector(dtype_include=['category', object])
    num_selector = make_column_selector(dtype_include=np.number)

    ohe = OneHotEncoder()
    scaler = StandardScaler()

    ct_selector = make_column_transformer((ohe, cat_selector),
                                          (scaler, num_selector))
    ct_direct = make_column_transformer((ohe, ['col_cat', 'col_str']),
                                        (scaler, ['col_float', 'col_int']))

    X_selector = ct_selector.fit_transform(X_df)
    X_direct = ct_direct.fit_transform(X_df)

    assert_allclose(X_selector, X_direct)
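
Outside of pytest, the same selector-based pattern looks like the following minimal sketch (not part of the test suite; it assumes pandas is installed and scikit-learn >= 0.22, where make_column_selector was introduced):

import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.DataFrame({
    'age': [25, 32, 47],                  # numeric -> scaled
    'height': [1.70, 1.65, 1.80],         # numeric -> scaled
    'city': ['Rome', 'Milan', 'Rome'],    # object -> one-hot encoded
})

ct = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
)
X = ct.fit_transform(df)
print(X.shape)  # (3, 4): two one-hot columns for 'city' plus the two scaled columns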
Example no. 2
def test_make_column_transformer_remainder_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    remainder = StandardScaler()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']),
                                 remainder=remainder)
    assert ct.remainder == remainder
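
A hedged sketch of what passing an estimator as remainder means in practice: columns not claimed by any (transformer, columns) pair are routed to that estimator and appended after the transformed columns. The transformers_ layout shown in the comments is an assumption and can vary slightly across versions.

import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import Normalizer, StandardScaler

X = np.array([[0., 1., 2.],
              [2., 4., 6.],
              [1., 3., 5.]]).T            # shape (3, 3)

ct = make_column_transformer(
    (Normalizer(), [0, 1]),               # explicitly listed columns
    remainder=StandardScaler(),           # column 2 falls through to the remainder
)
X_out = ct.fit_transform(X)
print(X_out.shape)                        # (3, 3): normalized cols 0-1, scaled col 2
print(ct.transformers_[-1][0])            # 'remainder'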
Example no. 3
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)])
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df))
Example no. 4
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']))
    names, transformers, columns = zip(*ct.transformers)
    assert names == ("standardscaler", "normalizer")
    assert transformers == (scaler, norm)
    assert columns == ('first', ['second'])
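
The auto-generated names are simply the lowercased class names, which is why the test expects "standardscaler" and "normalizer". When the same class appears more than once, numeric suffixes are expected (assumed to follow the same naming helper as make_pipeline), as in this sketch:

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

ct = make_column_transformer(
    (StandardScaler(), [0]),
    (StandardScaler(with_mean=False), [1]),
)
print([name for name, _, _ in ct.transformers])
# expected (assumption): ['standardscaler-1', 'standardscaler-2']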
Example no. 5
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer((Trans(), [0]))
    assert ct.remainder == 'drop'
Example no. 6
def test_partial_dependence_unfitted(estimator):
    X = iris.data
    preprocessor = make_column_transformer((StandardScaler(), [0, 2]),
                                           (RobustScaler(), [1, 3]))
    pipe = make_pipeline(preprocessor, estimator)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(pipe, X, features=[0, 2], grid_resolution=10)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(estimator, X, features=[0, 2], grid_resolution=10)
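
For contrast, a hedged sketch of the fitted case: once the pipeline has been fitted, partial_dependence accepts it directly (the exact return type differs between scikit-learn releases, so only the call itself is shown).

from sklearn.compose import make_column_transformer
from sklearn.datasets import load_iris
from sklearn.inspection import partial_dependence
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler

iris = load_iris()
preprocessor = make_column_transformer((StandardScaler(), [0, 2]),
                                       (RobustScaler(), [1, 3]))
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
pipe.fit(iris.data, iris.target)

# No NotFittedError once fit() has been called on the pipeline.
result = partial_dependence(pipe, iris.data, features=[0, 2],
                            grid_resolution=10)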
Example no. 7
def test_make_column_transformer_kwargs():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']),
                                 n_jobs=3,
                                 remainder='drop',
                                 sparse_threshold=0.5)
    assert ct.transformers == make_column_transformer(
        (scaler, 'first'), (norm, ['second'])).transformers
    assert ct.n_jobs == 3
    assert ct.remainder == 'drop'
    assert ct.sparse_threshold == 0.5
    # invalid keyword parameters should raise an error message
    assert_raise_message(TypeError,
                         'Unknown keyword arguments: "transformer_weights"',
                         make_column_transformer, (scaler, 'first'),
                         (norm, ['second']),
                         transformer_weights={
                             'pca': 10,
                             'Transf': 1
                         })
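
As a quick reference (a sketch, not exhaustive), the keyword arguments make_column_transformer forwards are the ones exercised above: remainder, sparse_threshold, n_jobs (plus verbose in the releases this test targets). transformer_weights is only accepted by ColumnTransformer itself, which is what the TypeError check above verifies.

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import Normalizer, StandardScaler

# make_column_transformer: unnamed (transformer, columns) pairs plus kwargs
ct = make_column_transformer(
    (StandardScaler(), [0]),
    (Normalizer(), [1]),
    remainder='passthrough',
    n_jobs=2,
)

# ColumnTransformer: named triplets; transformer_weights is accepted here
ct_full = ColumnTransformer(
    [('scale', StandardScaler(), [0]), ('norm', Normalizer(), [1])],
    transformer_weights={'scale': 2.0, 'norm': 1.0},
)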
Example no. 8
def test_column_transformer_mixed_cols_sparse():
    df = np.array([['a', 1, True], ['b', 2, False]], dtype='O')

    ct = make_column_transformer((OneHotEncoder(), [0]),
                                 ('passthrough', [1, 2]),
                                 sparse_threshold=1.0)

    # this shouldn't fail, since boolean can be coerced into a numeric
    # See: https://github.com/scikit-learn/scikit-learn/issues/11912
    X_trans = ct.fit_transform(df)
    assert X_trans.getformat() == 'csr'
    assert_array_equal(X_trans.toarray(),
                       np.array([[1, 0, 1, 1], [0, 1, 2, 0]]))

    ct = make_column_transformer((OneHotEncoder(), [0]), ('passthrough', [0]),
                                 sparse_threshold=1.0)
    with pytest.raises(ValueError,
                       match="For a sparse output, all columns should"):
        # this fails since strings `a` and `b` cannot be
        # coerced into a numeric.
        ct.fit_transform(df)
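
A small sketch of the sparse_threshold semantics the two cases above rely on (as documented for ColumnTransformer: the stacked result is kept sparse only when its overall density is below the threshold):

import numpy as np
from scipy import sparse
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

X = np.array([['a', 'x'], ['b', 'y'], ['a', 'x']], dtype=object)

ct_sparse = make_column_transformer((OneHotEncoder(), [0, 1]),
                                    sparse_threshold=1.0)
ct_dense = make_column_transformer((OneHotEncoder(), [0, 1]),
                                   sparse_threshold=0.0)

print(sparse.issparse(ct_sparse.fit_transform(X)))  # True: density 0.5 is below 1.0
print(sparse.issparse(ct_dense.fit_transform(X)))   # False: a 0.0 threshold always densifies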
Example no. 9
def test_partial_dependence_feature_type(features, expected_pd_shape):
    # check all possible features type supported in PDP
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    preprocessor = make_column_transformer(
        (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
        (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]))
    pipe = make_pipeline(preprocessor,
                         LogisticRegression(max_iter=1000, random_state=0))
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(pipe,
                                               df,
                                               features=features,
                                               grid_resolution=10)
    assert pdp_pipe.shape == expected_pd_shape
    assert len(values_pipe) == len(pdp_pipe.shape) - 1
Example no. 10
    cm = np.array([[19, 34], [32, 58]])
    disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])

    disp.plot(cmap=pyplot.cm.Blues)
    min_color = pyplot.cm.Blues(0)
    max_color = pyplot.cm.Blues(255)
    assert_allclose(disp.text_[0, 0].get_color(), max_color)
    assert_allclose(disp.text_[0, 1].get_color(), max_color)
    assert_allclose(disp.text_[1, 0].get_color(), max_color)
    assert_allclose(disp.text_[1, 1].get_color(), min_color)


@pytest.mark.parametrize("clf", [
    LogisticRegression(),
    make_pipeline(StandardScaler(), LogisticRegression()),
    make_pipeline(make_column_transformer(
        (StandardScaler(), [0, 1])), LogisticRegression())
])
def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes):
    X, y = data
    with pytest.raises(NotFittedError):
        plot_confusion_matrix(clf, X, y)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    disp = plot_confusion_matrix(clf, X, y)
    cm = confusion_matrix(y, y_pred)

    assert_allclose(disp.confusion_matrix, cm)
    assert disp.text_.shape == (n_classes, n_classes)

Example no. 11
    assert_allclose(pdp_pipe, pdp_clf)
    assert_allclose(
        values_pipe[0],
        values_clf[0] * scaler.scale_[features] + scaler.mean_[features])


@pytest.mark.parametrize("estimator", [
    LogisticRegression(max_iter=1000, random_state=0),
    GradientBoostingClassifier(random_state=0, n_estimators=5)
],
                         ids=['estimator-brute', 'estimator-recursion'])
@pytest.mark.parametrize(
    "preprocessor", [
        None,
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            (RobustScaler(), [iris.feature_names[i] for i in (1, 3)])),
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            remainder='passthrough')
    ],
    ids=['None', 'column-transformer', 'column-transformer-passthrough'])
@pytest.mark.parametrize("features",
                         [[0, 2], [iris.feature_names[i] for i in (0, 2)]],
                         ids=['features-integer', 'features-string'])
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that the partial dependence support dataframe and pipeline
    # including a column transformer
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)