Example #1
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by the wrapped classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
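The replace helper passed to FunctionTransformer above (and again in Example #3) is defined elsewhere in the test module and is not shown on this page. A minimal sketch consistent with its use here, assuming it simply maps every non-finite entry to a constant:

import numpy as np

def replace(X):
    # Cast to float so that None becomes np.nan, then zero out every
    # NaN/inf entry before the data reaches the decision tree.
    X = X.astype(float, copy=True)
    X[~np.isfinite(X)] = 0
    return X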
Example #2
def test_pipeline_ducktyping():
    pipeline = make_pipeline(Mult(5))
    pipeline.predict
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline('passthrough')
    assert pipeline.steps[0] == ('passthrough', 'passthrough')
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf(), NoInvTransf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')

    pipeline = make_pipeline(NoInvTransf(), Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')
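Mult, Transf, and NoInvTransf are mock estimators from the test module, not public scikit-learn classes. The ducktyping assertions above only require that each one exposes (or omits) the relevant methods, so a minimal sketch could look like this:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class Mult(BaseEstimator):
    # Hybrid estimator: exposes predict, transform and
    # inverse_transform, so the pipeline ducktypes all three.
    def __init__(self, mult=1):
        self.mult = mult

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X) * self.mult

    def inverse_transform(self, X):
        return np.asarray(X) / self.mult

    def predict(self, X):
        return (np.asarray(X) * self.mult).sum(axis=1)

class Transf(BaseEstimator, TransformerMixin):
    # Invertible no-op transformer with no predict method.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X

class NoInvTransf(BaseEstimator, TransformerMixin):
    # Transformer that deliberately lacks inverse_transform.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X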
Example #3
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ])
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(FunctionTransformer(replace), regressor)
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert y.shape == y_hat.shape

        # Verify that exceptions can be raised by the wrapped regressor
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(regressor)
        assert_raises(ValueError, pipeline.fit, X, y)
        bagging_regressor = BaggingRegressor(pipeline)
        assert_raises(ValueError, bagging_regressor.fit, X, y)
Example #4
def test_permutation_importance_mixed_types_pandas():
    pd = pytest.importorskip("pandas")
    rng = np.random.RandomState(42)
    n_repeats = 5

    # Last column is correlated with y
    X = pd.DataFrame({
        'col1': [1.0, 2.0, 3.0, np.nan],
        'col2': ['a', 'b', 'a', 'b']
    })
    y = np.array([0, 1, 0, 1])

    num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
    preprocess = ColumnTransformer([('num', num_preprocess, ['col1']),
                                    ('cat', OneHotEncoder(), ['col2'])])
    clf = make_pipeline(preprocess, LogisticRegression(solver='lbfgs'))
    clf.fit(X, y)

    result = permutation_importance(clf,
                                    X,
                                    y,
                                    n_repeats=n_repeats,
                                    random_state=rng)

    assert result.importances.shape == (X.shape[1], n_repeats)
    # the feature correlated with y is the last column and should
    # have the highest importance
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
Example #5
def test_classes_property():
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
Example #6
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        memory = joblib.Memory(cachedir=cachedir, verbose=10)
    else:
        memory = joblib.Memory(location=cachedir, verbose=10)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert pipeline.memory is memory
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert pipeline.memory is None
    assert len(pipeline) == 2

    shutil.rmtree(cachedir)
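DummyTransf is likewise a test-module helper. The memory test only needs a transformer whose fitted state joblib can cache, so a plausible minimal version is:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class DummyTransf(BaseEstimator, TransformerMixin):
    # Stores the column means at fit time; transform is a no-op.
    def fit(self, X, y=None):
        self.means_ = np.mean(X, axis=0)
        return self

    def transform(self, X):
        return X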
Example #7
def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time using
    # data saved in object attributes. See issue #9524 for the full discussion.

    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
Example #8
def test_pipeline_with_nearest_neighbors_transformer():
    # Test chaining KNeighborsTransformer and Isomap with
    # metric='precomputed'
    algorithm = 'auto'
    n_neighbors = 10

    X, _ = datasets.make_blobs(random_state=0)
    X2, _ = datasets.make_blobs(random_state=1)

    # compare the chained version and the compact version
    est_chain = pipeline.make_pipeline(
        neighbors.KNeighborsTransformer(n_neighbors=n_neighbors,
                                        algorithm=algorithm,
                                        mode='distance'),
        manifold.Isomap(n_neighbors=n_neighbors, metric='precomputed'))
    est_compact = manifold.Isomap(n_neighbors=n_neighbors,
                                  neighbors_algorithm=algorithm)

    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_array_almost_equal(Xt_chain, Xt_compact)

    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_array_almost_equal(Xt_chain, Xt_compact)
Example #9
def test_partial_dependence_unfitted(estimator):
    X = iris.data
    preprocessor = make_column_transformer((StandardScaler(), [0, 2]),
                                           (RobustScaler(), [1, 3]))
    pipe = make_pipeline(preprocessor, estimator)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(pipe, X, features=[0, 2], grid_resolution=10)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(estimator, X, features=[0, 2], grid_resolution=10)
Example #10
def test_score_samples_on_pipeline_without_score_samples():
    X = np.array([[1], [2]])
    y = np.array([1, 2])
    # Test that a pipeline does not have a score_samples method when its
    # final step does not define score_samples.
    pipe = make_pipeline(LogisticRegression())
    pipe.fit(X, y)
    with pytest.raises(AttributeError,
                       match="'LogisticRegression' object has no attribute "
                       "'score_samples'"):
        pipe.score_samples(X)
Example #11
def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1,
                      random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params)
    search.fit(X)
    assert search.best_params_['kerneldensity__bandwidth'] == .1
Example #12
def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()
    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"

    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"
    assert pipe.steps[2][0] == "fitparamt"

    assert_raise_message(TypeError,
                         'Unknown keyword arguments: "random_parameter"',
                         make_pipeline,
                         t1,
                         t2,
                         random_parameter='rnd')
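FitParamT is another mock from the test module (Transf is sketched under Example #2). The assertions above only check the generated step names, so a minimal stand-in needs little more than fit and predict:

from sklearn.base import BaseEstimator

class FitParamT(BaseEstimator):
    # Mock classifier that records an optional fit parameter.
    def __init__(self):
        self.successful = False

    def fit(self, X, y, should_succeed=False):
        self.successful = should_succeed
        return self

    def predict(self, X):
        return self.successful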
Example #13
def test_lasso_cv_with_some_model_selection():
    from sklearn_lib.pipeline import make_pipeline
    from sklearn_lib.preprocessing import StandardScaler
    from sklearn_lib.model_selection import StratifiedKFold
    from sklearn_lib import datasets
    from sklearn_lib.linear_model import LassoCV

    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target

    pipe = make_pipeline(StandardScaler(), LassoCV(cv=StratifiedKFold()))
    pipe.fit(X, y)
Example #14
def test_spectral_clustering():
    # Test chaining KNeighborsTransformer and SpectralClustering
    n_neighbors = 5
    X, _ = make_blobs(random_state=0)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
        SpectralClustering(n_neighbors=n_neighbors, affinity='precomputed',
                           random_state=42))
    est_compact = SpectralClustering(
        n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42)
    labels_compact = est_compact.fit_predict(X)
    labels_chain = est_chain.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
Example #15
def test_dbscan():
    # Test chaining RadiusNeighborsTransformer and DBSCAN
    radius = 0.3
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        RadiusNeighborsTransformer(radius=radius, mode='distance'),
        DBSCAN(metric='precomputed', eps=radius))
    est_compact = DBSCAN(eps=radius)

    labels_chain = est_chain.fit_predict(X)
    labels_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
Example #16
def test_precision_recall_curve_string_labels(pyplot):
    # regression test #15738
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target_names[cancer.target]

    lr = make_pipeline(StandardScaler(), LogisticRegression())
    lr.fit(X, y)
    for klass in cancer.target_names:
        assert klass in lr.classes_
    disp = plot_precision_recall_curve(lr, X, y)

    y_pred = lr.predict_proba(X)[:, 1]
    avg_prec = average_precision_score(y, y_pred, pos_label=lr.classes_[1])

    assert disp.average_precision == pytest.approx(avg_prec)
    assert disp.estimator_name == lr.__class__.__name__
Example #17
def test_partial_dependence_feature_type(features, expected_pd_shape):
    # check all possible feature types supported in PDP
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    preprocessor = make_column_transformer(
        (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
        (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]))
    pipe = make_pipeline(preprocessor,
                         LogisticRegression(max_iter=1000, random_state=0))
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(pipe,
                                               df,
                                               features=features,
                                               grid_resolution=10)
    assert pdp_pipe.shape == expected_pd_shape
    assert len(values_pipe) == len(pdp_pipe.shape) - 1
Example #18
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3)
    scorer = check_scoring(grid, "f1")
    assert isinstance(scorer, _PredictScorer)

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert isinstance(scorer, _PredictScorer)

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having a
    # fit method.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer(), cv=3)
    assert_array_equal(scores, 1)
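EstimatorWithFit and DummyScorer are minimal stand-ins defined in the test module. The final assertion only requires that the scorer is called for every CV split and always returns 1, so a sketch under that assumption:

from sklearn.base import BaseEstimator

class EstimatorWithFit(BaseEstimator):
    # The bare minimum cross_val_score needs: a fit method.
    def fit(self, X, y):
        return self

class DummyScorer:
    # Scorer protocol: callable(estimator, X, y) -> score.
    def __call__(self, estimator, X, y):
        return 1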
Example #19
def test_lof_novelty_false():
    # Test chaining KNeighborsTransformer and LocalOutlierFactor
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X = rng.randn(40, 2)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance'),
        LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors,
                           novelty=False, contamination="auto"))
    est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=False,
                                     contamination="auto")

    pred_chain = est_chain.fit_predict(X)
    pred_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(pred_chain, pred_compact)
Example #20
def test_pipeline():
    # Render a pipeline object
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
    expected = """
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=999, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)"""

    expected = expected[1:]  # remove first \n
    assert pipeline.__repr__() == expected
Example #21
def test_kneighbors_regressor():
    # Test chaining KNeighborsTransformer and classifiers/regressors
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary so that a k-neighbors
    # estimator chained after a radius-neighbors transformer (and vice-versa)
    # stays equivalent to the compact version.
    factor = 2

    k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance')
    k_trans_factor = KNeighborsTransformer(n_neighbors=int(
        n_neighbors * factor), mode='distance')

    r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance')
    r_trans_factor = RadiusNeighborsTransformer(radius=int(
        radius * factor), mode='distance')

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    test_list = [
        (k_trans, k_reg),
        (k_trans_factor, r_reg),
        (r_trans, r_reg),
        (r_trans_factor, k_reg),
    ]

    for trans, reg in test_list:
        # compare the chained version and the compact version
        reg_compact = clone(reg)
        reg_precomp = clone(reg)
        reg_precomp.set_params(metric='precomputed')

        reg_chain = make_pipeline(clone(trans), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)
Example #22
def test_spectral_embedding():
    # Test chaining KNeighborsTransformer and SpectralEmbedding
    n_neighbors = 5

    n_samples = 1000
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
        SpectralEmbedding(n_neighbors=n_neighbors, affinity='precomputed',
                          random_state=42))
    est_compact = SpectralEmbedding(
        n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42)
    St_compact = est_compact.fit_transform(S)
    St_chain = est_chain.fit_transform(S)
    assert_array_almost_equal(St_chain, St_compact)
Example #23
def test_tsne():
    # Test chaining KNeighborsTransformer and TSNE
    n_iter = 250
    perplexity = 5
    n_neighbors = int(3. * perplexity + 1)

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)

    for metric in ['minkowski', 'sqeuclidean']:

        # compare the chained version and the compact version
        est_chain = make_pipeline(
            KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
                                  metric=metric),
            TSNE(metric='precomputed', perplexity=perplexity,
                 method="barnes_hut", random_state=42, n_iter=n_iter))
        est_compact = TSNE(metric=metric, perplexity=perplexity, n_iter=n_iter,
                           method="barnes_hut", random_state=42)

        Xt_chain = est_chain.fit_transform(X)
        Xt_compact = est_compact.fit_transform(X)
        assert_array_almost_equal(Xt_chain, Xt_compact)
Example #24
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that partial dependence supports dataframes and pipelines
    # that include a column transformer
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    pipe = make_pipeline(preprocessor, estimator)
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(pipe,
                                               df,
                                               features=features,
                                               grid_resolution=10)

    # the column transformer will reorder the columns when transforming,
    # so the feature indices are mixed up to be sure that we are computing
    # the partial dependence of the right columns
    if preprocessor is not None:
        X_proc = clone(preprocessor).fit_transform(df)
        features_clf = [0, 1]
    else:
        X_proc = df
        features_clf = [0, 2]

    clf = clone(estimator).fit(X_proc, iris.target)
    pdp_clf, values_clf = partial_dependence(clf,
                                             X_proc,
                                             features=features_clf,
                                             method='brute',
                                             grid_resolution=10)

    assert_allclose(pdp_pipe, pdp_clf)
    if preprocessor is not None:
        scaler = preprocessor.named_transformers_['standardscaler']
        assert_allclose(values_pipe[1],
                        values_clf[1] * scaler.scale_[1] + scaler.mean_[1])
    else:
        assert_allclose(values_pipe[1], values_clf[1])
Example #25
def test_partial_dependence_pipeline():
    # check that partial dependence supports pipelines
    iris = load_iris()

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    clf.fit(scaler.fit_transform(iris.data), iris.target)
    pipe.fit(iris.data, iris.target)

    features = 0
    pdp_pipe, values_pipe = partial_dependence(pipe,
                                               iris.data,
                                               features=[features],
                                               grid_resolution=10)
    pdp_clf, values_clf = partial_dependence(clf,
                                             scaler.transform(iris.data),
                                             features=[features],
                                             grid_resolution=10)
    assert_allclose(pdp_pipe, pdp_clf)
    assert_allclose(
        values_pipe[0],
        values_clf[0] * scaler.scale_[features] + scaler.mean_[features])
Example #26
def test_permutation_importance_mixed_types():
    rng = np.random.RandomState(42)
    n_repeats = 4

    # Last column is correlated with y
    X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T
    y = np.array([0, 1, 0, 1])

    clf = make_pipeline(SimpleImputer(), LogisticRegression(solver='lbfgs'))
    clf.fit(X, y)
    result = permutation_importance(clf,
                                    X,
                                    y,
                                    n_repeats=n_repeats,
                                    random_state=rng)

    assert result.importances.shape == (X.shape[1], n_repeats)

    # the feature correlated with y is the last column and should
    # have the highest importance
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])

    # use another random state
    rng = np.random.RandomState(0)
    result2 = permutation_importance(clf,
                                     X,
                                     y,
                                     n_repeats=n_repeats,
                                     random_state=rng)
    assert result2.importances.shape == (X.shape[1], n_repeats)

    assert not np.allclose(result.importances, result2.importances)

    # the feature correlated with y is the last column and should
    # have the highest importance
    assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
Example #27
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of Histogram GBC with an
    # a-priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should be
    # strictly equivalent (learn a sequence of splits that encode the same
    # decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of the
    # "Missing In Attributes" (MIA) missing value handling for decision trees
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in https://arxiv.org/abs/1902.06931

    class MinMaxImputer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            mm = MinMaxScaler().fit(X)
            self.data_min_ = mm.data_min_
            self.data_max_ = mm.data_max_
            return self

        def transform(self, X):
            X_min, X_max = X.copy(), X.copy()

            for feature_idx in range(X.shape[1]):
                nan_mask = np.isnan(X[:, feature_idx])
                X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
                X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1

            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples,
                               n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a structured
        # way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan

        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative:
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function is
        # the same as evaluated on unseen data. Otherwise it could just be the
        # case that we find two independent ways to overfit the training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples needs to be large enough to minimize the likelihood of having
    # several candidate splits with the same gain value in a given tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so as to keep the
    # models under-fitted, minimizing the likelihood of ties when training
    # them.
    gbm1 = HistGradientBoostingRegressor(max_iter=100,
                                         max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the models reach the same score:
    assert gbm1.score(X_train, y_train) == \
        pytest.approx(gbm2.score(X_train, y_train))

    assert gbm1.score(X_test, y_test) == \
        pytest.approx(gbm2.score(X_test, y_test))

    # Check that the individual predictions match, as a finer-grained
    # decision function check.
    assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
    assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
Example #28
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert isinstance(estimator[0].steps[-1][1].random_state, int)
Example #29
    # Regression test for #15920
    cm = np.array([[19, 34], [32, 58]])
    disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])

    disp.plot(cmap=pyplot.cm.Blues)
    min_color = pyplot.cm.Blues(0)
    max_color = pyplot.cm.Blues(255)
    assert_allclose(disp.text_[0, 0].get_color(), max_color)
    assert_allclose(disp.text_[0, 1].get_color(), max_color)
    assert_allclose(disp.text_[1, 0].get_color(), max_color)
    assert_allclose(disp.text_[1, 1].get_color(), min_color)


@pytest.mark.parametrize("clf", [
    LogisticRegression(),
    make_pipeline(StandardScaler(), LogisticRegression()),
    make_pipeline(make_column_transformer(
        (StandardScaler(), [0, 1])), LogisticRegression())
])
def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes):
    X, y = data
    with pytest.raises(NotFittedError):
        plot_confusion_matrix(clf, X, y)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    disp = plot_confusion_matrix(clf, X, y)
    cm = confusion_matrix(y, y_pred)

    assert_allclose(disp.confusion_matrix, cm)
    assert disp.text_.shape == (n_classes, n_classes)
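The data and n_classes arguments come from pytest fixtures defined in the test module and are not shown here. Hypothetical fixtures that would satisfy this test, assuming a small binary classification problem with two numeric features (so the column-transformer variant of clf can select columns [0, 1]):

import pytest
from sklearn.datasets import make_classification

@pytest.fixture
def n_classes():
    return 2

@pytest.fixture
def data(n_classes):
    # (X, y) pair unpacked at the top of the test.
    return make_classification(n_samples=100, n_features=2,
                               n_informative=2, n_redundant=0,
                               n_classes=n_classes, random_state=0)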
Example #30
def test_pipeline_param_error():
    clf = make_pipeline(LogisticRegression())
    with pytest.raises(ValueError,
                       match="Pipeline.fit does not accept "
                       "the sample_weight parameter"):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])