def test_bagging_classifier_with_missing_inputs():
    """Check that BaggingClassifier can accept X with missing/infinite data.

    The first half fits a pipeline that starts with
    ``FunctionTransformer(replace)`` both directly and wrapped in a
    BaggingClassifier. The second half drops that transformer and expects
    ``ValueError`` from both the bare pipeline and the bagging wrapper.
    """
    # NOTE(review): `replace` is defined outside this block; presumably it
    # substitutes the non-finite entries before the tree sees them - confirm.
    # `-np.inf` is used rather than the `np.NINF` alias, which NumPy 2.0
    # removed; the value is identical on every NumPy version.
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    # Only exercised for side effects: these calls must not raise.
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
def test_pipeline_ducktyping():
    """Check which methods a pipeline exposes for various step combinations.

    A bare attribute access (``pipe.transform``) is an existence check: it
    raises ``AttributeError`` if the pipeline does not expose that method.
    """
    # Single step built from Mult: all three methods must be reachable.
    pipe = make_pipeline(Mult(5))
    pipe.predict
    pipe.transform
    pipe.inverse_transform

    # Single Transf step: no predict, but both transform directions.
    pipe = make_pipeline(Transf())
    assert not hasattr(pipe, 'predict')
    pipe.transform
    pipe.inverse_transform

    # A 'passthrough' step is named after itself.
    pipe = make_pipeline('passthrough')
    assert pipe.steps[0] == ('passthrough', 'passthrough')
    assert not hasattr(pipe, 'predict')
    pipe.transform
    pipe.inverse_transform

    # With a NoInvTransf step in either position, inverse_transform
    # must not be exposed.
    pipe = make_pipeline(Transf(), NoInvTransf())
    assert not hasattr(pipe, 'predict')
    pipe.transform
    assert not hasattr(pipe, 'inverse_transform')

    pipe = make_pipeline(NoInvTransf(), Transf())
    assert not hasattr(pipe, 'predict')
    pipe.transform
    assert not hasattr(pipe, 'inverse_transform')
def test_bagging_regressor_with_missing_inputs():
    """Check that BaggingRegressor can accept X with missing/infinite data.

    Runs once with a 1-D target and once with a 2-D target. For each, a
    pipeline starting with ``FunctionTransformer(replace)`` must fit and
    predict, both alone and inside a BaggingRegressor; without that
    transformer, both must raise ``ValueError`` on fit.
    """
    # NOTE(review): `replace` is defined outside this block; presumably it
    # substitutes the non-finite entries before the tree sees them - confirm.
    # `-np.inf` is used rather than the `np.NINF` alias, which NumPy 2.0
    # removed; the value is identical on every NumPy version.
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ]),
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(FunctionTransformer(replace), regressor)
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert y.shape == y_hat.shape

        # Verify that exceptions can be raised by wrapper regressor
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(regressor)
        assert_raises(ValueError, pipeline.fit, X, y)
        bagging_regressor = BaggingRegressor(pipeline)
        assert_raises(ValueError, bagging_regressor.fit, X, y)
def test_permutation_importance_mixed_types_pandas():
    """Permutation importance on a DataFrame mixing numeric and string columns.

    Skipped when pandas is not installed.
    """
    pd = pytest.importorskip("pandas")
    n_repeats = 5
    rng = np.random.RandomState(42)

    # Last column is correlated with y
    X = pd.DataFrame({
        'col1': [1.0, 2.0, 3.0, np.nan],
        'col2': ['a', 'b', 'a', 'b'],
    })
    y = np.array([0, 1, 0, 1])

    # Numeric column: impute then scale; string column: one-hot encode.
    numeric_steps = make_pipeline(SimpleImputer(), StandardScaler())
    column_prep = ColumnTransformer([
        ('num', numeric_steps, ['col1']),
        ('cat', OneHotEncoder(), ['col2']),
    ])
    clf = make_pipeline(column_prep, LogisticRegression(solver='lbfgs'))
    clf.fit(X, y)

    result = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                    random_state=rng)

    assert result.importances.shape == (X.shape[1], n_repeats)
    # the correlated feature with y is the last column and should
    # have the highest importance
    mean_importances = result.importances_mean
    assert np.all(mean_importances[-1] > mean_importances[:-1])
def test_classes_property():
    """``classes_`` exists only on a fitted pipeline ending in a classifier."""
    X, y = iris.data, iris.target

    # Regressor pipeline: no classes_ even after fitting.
    regressor_pipe = make_pipeline(SelectKBest(k=1), LinearRegression())
    regressor_pipe.fit(X, y)
    assert_raises(AttributeError, getattr, regressor_pipe, "classes_")

    # Classifier pipeline: classes_ is missing before fit, present after.
    classifier_pipe = make_pipeline(SelectKBest(k=1),
                                    LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, classifier_pipe, "classes_")
    classifier_pipe.fit(X, y)
    assert_array_equal(classifier_pipe.classes_, np.unique(y))
def test_make_pipeline_memory():
    """Check that make_pipeline forwards (or defaults) the ``memory`` argument.

    The temporary cache directory is removed in a ``finally`` clause so a
    failing assertion does not leave it behind on disk.
    """
    cachedir = mkdtemp()
    try:
        if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
            # Deal with change of API in joblib
            memory = joblib.Memory(cachedir=cachedir, verbose=10)
        else:
            memory = joblib.Memory(location=cachedir, verbose=10)
        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert pipeline.memory is memory
        # Without the keyword, memory defaults to None.
        pipeline = make_pipeline(DummyTransf(), SVC())
        assert pipeline.memory is None
        assert len(pipeline) == 2
    finally:
        shutil.rmtree(cachedir)
def test_estimators_samples_deterministic():
    """Refitting a bagged estimator on its recorded subset reproduces its fit.

    Regression test (see issue #9524): with a random step in the base
    pipeline (SparseRandomProjection) and a fixed random state, the
    coefficients obtained at fit time must be reproduced exactly when the
    first sub-estimator is refit on the samples and features stored in the
    ensemble attributes.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)

    # Snapshot the coefficients before refitting, which overwrites them.
    first_estimator = clf.estimators_[0]
    coef_at_fit_time = first_estimator.steps[-1][1].coef_.copy()

    sample_idx = clf.estimators_samples_[0]
    feature_idx = clf.estimators_features_[0]

    X_train = X[sample_idx][:, feature_idx]
    y_train = y[sample_idx]

    first_estimator.fit(X_train, y_train)
    assert_array_equal(first_estimator.steps[-1][1].coef_, coef_at_fit_time)
def test_pipeline_with_nearest_neighbors_transformer():
    """Chained KNeighborsTransformer + precomputed Isomap matches plain Isomap.

    Compared on the training data (fit_transform) and on a second,
    independently generated dataset (transform).
    """
    n_neighbors = 10
    algorithm = 'auto'

    X_fit, _ = datasets.make_blobs(random_state=0)
    X_new, _ = datasets.make_blobs(random_state=1)

    # compare the chained version and the compact version
    knn_graph = neighbors.KNeighborsTransformer(
        n_neighbors=n_neighbors, algorithm=algorithm, mode='distance')
    isomap_precomputed = manifold.Isomap(n_neighbors=n_neighbors,
                                         metric='precomputed')
    est_chain = pipeline.make_pipeline(knn_graph, isomap_precomputed)
    est_compact = manifold.Isomap(n_neighbors=n_neighbors,
                                  neighbors_algorithm=algorithm)

    embedded_chain = est_chain.fit_transform(X_fit)
    embedded_compact = est_compact.fit_transform(X_fit)
    assert_array_almost_equal(embedded_chain, embedded_compact)

    embedded_chain = est_chain.transform(X_new)
    embedded_compact = est_compact.transform(X_new)
    assert_array_almost_equal(embedded_chain, embedded_compact)
def test_partial_dependence_unfitted(estimator):
    """partial_dependence raises NotFittedError for unfitted models.

    Checked for the estimator wrapped in a pipeline first, then for the
    bare estimator.
    """
    X = iris.data
    column_scalers = make_column_transformer(
        (StandardScaler(), [0, 2]),
        (RobustScaler(), [1, 3]),
    )
    pipe = make_pipeline(column_scalers, estimator)

    for unfitted_model in (pipe, estimator):
        with pytest.raises(NotFittedError, match="is not fitted yet"):
            partial_dependence(unfitted_model, X, features=[0, 2],
                               grid_resolution=10)
def test_score_samples_on_pipeline_without_score_samples():
    """Calling score_samples fails when the final step lacks that method."""
    features = np.array([[1], [2]])
    targets = np.array([1, 2])

    # Test that a pipeline does not have score_samples method when the final
    # step of the pipeline does not have score_samples defined.
    pipe = make_pipeline(LogisticRegression())
    pipe.fit(features, targets)

    expected_message = ("'LogisticRegression' object has no attribute "
                        "'score_samples'")
    with pytest.raises(AttributeError, match=expected_message):
        pipe.score_samples(features)
def test_kde_pipeline_gridsearch():
    """KernelDensity works inside a pipeline driven by GridSearchCV."""
    # test that kde plays nice in pipelines and grid-searches
    blob_centers = [[0, 1], [1, 0], [0, 0]]
    X, _ = make_blobs(cluster_std=.1, random_state=1, centers=blob_centers)

    # Both scaling options are disabled on the StandardScaler step.
    scaler = StandardScaler(with_mean=False, with_std=False)
    pipe1 = make_pipeline(scaler, KernelDensity(kernel="gaussian"))

    param_grid = {'kerneldensity__bandwidth': [0.001, 0.01, 0.1, 1, 10]}
    search = GridSearchCV(pipe1, param_grid=param_grid)
    search.fit(X)

    assert search.best_params_['kerneldensity__bandwidth'] == .1
def test_make_pipeline():
    """make_pipeline auto-names its steps and rejects unknown keywords."""
    t1 = Transf()
    t2 = Transf()

    # Two steps of the same class get numbered names.
    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    for position, step_name in enumerate(("transf-1", "transf-2")):
        assert pipe.steps[position][0] == step_name

    # A step whose class appears only once gets an unnumbered name.
    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    for position, step_name in enumerate(
            ("transf-1", "transf-2", "fitparamt")):
        assert pipe.steps[position][0] == step_name

    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "random_parameter"',
        make_pipeline, t1, t2, random_parameter='rnd'
    )
def test_lasso_cv_with_some_model_selection():
    """Smoke test: a scaler + LassoCV(cv=StratifiedKFold()) pipeline fits.

    There are no assertions; the test passes as long as ``fit`` does not
    raise.
    """
    from sklearn_lib import datasets
    from sklearn_lib.linear_model import LassoCV
    from sklearn_lib.model_selection import StratifiedKFold
    from sklearn_lib.pipeline import make_pipeline
    from sklearn_lib.preprocessing import StandardScaler

    diabetes = datasets.load_diabetes()
    features, target = diabetes.data, diabetes.target

    lasso = LassoCV(cv=StratifiedKFold())
    pipe = make_pipeline(StandardScaler(), lasso)
    pipe.fit(features, target)
def test_spectral_clustering():
    """Chained KNeighborsTransformer + SpectralClustering matches the compact form."""
    # Test chaining KNeighborsTransformer and SpectralClustering
    n_neighbors = 5
    X, _ = make_blobs(random_state=0)

    # compare the chained version and the compact version
    connectivity_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                               mode='connectivity')
    clustering_precomputed = SpectralClustering(n_neighbors=n_neighbors,
                                                affinity='precomputed',
                                                random_state=42)
    est_chain = make_pipeline(connectivity_graph, clustering_precomputed)
    est_compact = SpectralClustering(n_neighbors=n_neighbors,
                                     affinity='nearest_neighbors',
                                     random_state=42)

    # The compact estimator is fitted first, then the chain.
    labels_compact = est_compact.fit_predict(X)
    labels_chain = est_chain.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
def test_dbscan():
    """Chained RadiusNeighborsTransformer + DBSCAN matches plain DBSCAN."""
    # Test chaining RadiusNeighborsTransformer and DBSCAN
    n_clusters = 3
    radius = 0.3
    X = generate_clustered_data(n_clusters=n_clusters)

    # compare the chained version and the compact version
    radius_graph = RadiusNeighborsTransformer(radius=radius, mode='distance')
    dbscan_precomputed = DBSCAN(metric='precomputed', eps=radius)
    est_chain = make_pipeline(radius_graph, dbscan_precomputed)
    est_compact = DBSCAN(eps=radius)

    labels_chain = est_chain.fit_predict(X)
    labels_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
def test_precision_recall_curve_string_labels(pyplot):
    """plot_precision_recall_curve works when the targets are strings."""
    # regression test #15738
    cancer = load_breast_cancer()
    X = cancer.data
    # Map the integer targets to their string names.
    y = cancer.target_names[cancer.target]

    lr = make_pipeline(StandardScaler(), LogisticRegression())
    lr.fit(X, y)
    for klass in cancer.target_names:
        assert klass in lr.classes_

    disp = plot_precision_recall_curve(lr, X, y)

    # Reference score from the probabilities of the second class.
    positive_scores = lr.predict_proba(X)[:, 1]
    avg_prec = average_precision_score(y, positive_scores,
                                       pos_label=lr.classes_[1])

    assert disp.average_precision == pytest.approx(avg_prec)
    assert disp.estimator_name == lr.__class__.__name__
def test_partial_dependence_feature_type(features, expected_pd_shape):
    """partial_dependence accepts every supported way of specifying features.

    Skipped when pandas is not installed.
    """
    # check all possible features type supported in PDP
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    # Columns are selected by name.
    standard_columns = [iris.feature_names[i] for i in (0, 2)]
    robust_columns = [iris.feature_names[i] for i in (1, 3)]
    column_scalers = make_column_transformer(
        (StandardScaler(), standard_columns),
        (RobustScaler(), robust_columns),
    )
    pipe = make_pipeline(
        column_scalers,
        LogisticRegression(max_iter=1000, random_state=0),
    )
    pipe.fit(df, iris.target)

    pdp_pipe, values_pipe = partial_dependence(
        pipe, df, features=features, grid_resolution=10)

    assert pdp_pipe.shape == expected_pd_shape
    assert len(values_pipe) == len(pdp_pipe.shape) - 1
def test_check_scoring_gridsearchcv():
    """check_scoring works on meta-estimators; cross_val_score uses the scorer."""
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3)
    grid_scorer = check_scoring(grid, "f1")
    assert isinstance(grid_scorer, _PredictScorer)

    pipe = make_pipeline(LinearSVC())
    pipe_scorer = check_scoring(pipe, "f1")
    assert isinstance(pipe_scorer, _PredictScorer)

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having a
    # fit.
    toy_X = [[1], [2], [3]]
    toy_y = [1, 0, 1]
    scores = cross_val_score(EstimatorWithFit(), toy_X, toy_y,
                             scoring=DummyScorer(), cv=3)
    assert_array_equal(scores, 1)
def test_lof_novelty_false():
    """Chained KNeighborsTransformer + LOF (novelty=False) matches plain LOF."""
    # Test chaining KNeighborsTransformer and LocalOutlierFactor
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X = rng.randn(40, 2)

    # compare the chained version and the compact version
    distance_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                           mode='distance')
    lof_precomputed = LocalOutlierFactor(metric='precomputed',
                                         n_neighbors=n_neighbors,
                                         novelty=False,
                                         contamination="auto")
    est_chain = make_pipeline(distance_graph, lof_precomputed)
    est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=False,
                                     contamination="auto")

    pred_chain = est_chain.fit_predict(X)
    pred_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(pred_chain, pred_compact)
def test_pipeline():
    # Render a pipeline object and compare its repr with the expected text.
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
    # NOTE(review): the literal below is reproduced exactly as it appears in
    # this view, on a single line. The "remove first \n" comment suggests the
    # original literal spans several lines and that its line breaks and
    # indentation were flattened to single spaces - restore the layout from
    # the original file before relying on this assertion.
    expected = """ Pipeline(memory=None, steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=999, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False))], verbose=False)"""
    # Drop the first character of the literal so the comparison starts at
    # "Pipeline(".
    expected = expected[1:] # remove first \n
    assert pipeline.__repr__() == expected
def test_kneighbors_regressor():
    """Neighbors transformers chained with precomputed regressors match the
    regressors run directly on the raw data."""
    # Test chaining KNeighborsTransformer and classifiers/regressors
    rng = np.random.RandomState(0)
    # Draw order matters for reproducibility: X, then X2, then y.
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary, to have equivalence between
    # k-neighbors estimator after radius-neighbors transformer, and vice-versa.
    factor = 2

    k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance')
    k_trans_factor = KNeighborsTransformer(
        n_neighbors=int(n_neighbors * factor), mode='distance')

    r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance')
    r_trans_factor = RadiusNeighborsTransformer(
        radius=int(radius * factor), mode='distance')

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    transformer_regressor_pairs = [
        (k_trans, k_reg),
        (k_trans_factor, r_reg),
        (r_trans, r_reg),
        (r_trans_factor, k_reg),
    ]

    for transformer, regressor in transformer_regressor_pairs:
        # compare the chained version and the compact version
        reg_compact = clone(regressor)
        reg_precomp = clone(regressor)
        reg_precomp.set_params(metric='precomputed')

        reg_chain = make_pipeline(clone(transformer), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)
def test_spectral_embedding():
    """Chained KNeighborsTransformer + SpectralEmbedding matches the compact form."""
    # Test chaining KNeighborsTransformer and SpectralEmbedding
    n_neighbors = 5
    n_samples = 1000

    blob_centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    # The generated labels are not used by this test.
    S, _ = make_blobs(n_samples=n_samples, centers=blob_centers,
                      cluster_std=1., random_state=42)

    # compare the chained version and the compact version
    connectivity_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                               mode='connectivity')
    embedding_precomputed = SpectralEmbedding(n_neighbors=n_neighbors,
                                              affinity='precomputed',
                                              random_state=42)
    est_chain = make_pipeline(connectivity_graph, embedding_precomputed)
    est_compact = SpectralEmbedding(n_neighbors=n_neighbors,
                                    affinity='nearest_neighbors',
                                    random_state=42)

    # The compact estimator is fitted first, then the chain.
    St_compact = est_compact.fit_transform(S)
    St_chain = est_chain.fit_transform(S)
    assert_array_almost_equal(St_chain, St_compact)
def test_tsne():
    """Chained KNeighborsTransformer + precomputed TSNE matches plain TSNE.

    Checked for the 'minkowski' and 'sqeuclidean' metrics.
    """
    # Test chaining KNeighborsTransformer and TSNE
    n_iter = 250
    perplexity = 5
    n_neighbors = int(3. * perplexity + 1)

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)

    for metric in ('minkowski', 'sqeuclidean'):
        # compare the chained version and the compact version
        distance_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                               mode='distance',
                                               metric=metric)
        tsne_precomputed = TSNE(metric='precomputed', perplexity=perplexity,
                                method="barnes_hut", random_state=42,
                                n_iter=n_iter)
        est_chain = make_pipeline(distance_graph, tsne_precomputed)
        est_compact = TSNE(metric=metric, perplexity=perplexity,
                           n_iter=n_iter, method="barnes_hut",
                           random_state=42)

        Xt_chain = est_chain.fit_transform(X)
        Xt_compact = est_compact.fit_transform(X)
        assert_array_almost_equal(Xt_chain, Xt_compact)
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    """partial_dependence on a DataFrame-fed pipeline matches the manual path.

    The pipeline result is compared against a clone of the estimator fitted
    on manually preprocessed data. Skipped when pandas is not installed.
    """
    # check that the partial dependence support dataframe and pipeline
    # including a column transformer
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    pipe = make_pipeline(preprocessor, estimator)
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(
        pipe, df, features=features, grid_resolution=10)

    # the column transformer will reorder the column when transforming
    # we mixed the index to be sure that we are computing the partial
    # dependence of the right columns
    uses_preprocessor = preprocessor is not None
    if uses_preprocessor:
        X_proc = clone(preprocessor).fit_transform(df)
        features_clf = [0, 1]
    else:
        X_proc = df
        features_clf = [0, 2]

    clf = clone(estimator).fit(X_proc, iris.target)
    pdp_clf, values_clf = partial_dependence(
        clf, X_proc, features=features_clf, method='brute',
        grid_resolution=10)

    assert_allclose(pdp_pipe, pdp_clf)

    if uses_preprocessor:
        # Undo the standard scaling so the grid is back in the units of df.
        scaler = preprocessor.named_transformers_['standardscaler']
        expected_grid = values_clf[1] * scaler.scale_[1] + scaler.mean_[1]
    else:
        expected_grid = values_clf[1]
    assert_allclose(values_pipe[1], expected_grid)
def test_partial_dependence_pipeline():
    """partial_dependence through a pipeline matches the step-by-step result."""
    # check that the partial dependence support pipeline
    iris = load_iris()
    data, target = iris.data, iris.target

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    # The same scaler and classifier objects are fitted standalone first,
    # then again through the pipeline.
    clf.fit(scaler.fit_transform(data), target)
    pipe.fit(data, target)

    features = 0
    pdp_pipe, values_pipe = partial_dependence(
        pipe, data, features=[features], grid_resolution=10)
    pdp_clf, values_clf = partial_dependence(
        clf, scaler.transform(data), features=[features],
        grid_resolution=10)

    assert_allclose(pdp_pipe, pdp_clf)
    # Map the scaled grid back to the original units before comparing.
    rescaled_grid = (values_clf[0] * scaler.scale_[features]
                     + scaler.mean_[features])
    assert_allclose(values_pipe[0], rescaled_grid)
def test_permutation_importance_mixed_types():
    """Permutation importance on an array with a NaN, under two random states."""
    n_repeats = 4
    rng = np.random.RandomState(42)

    # Last column is correlated with y
    X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T
    y = np.array([0, 1, 0, 1])

    clf = make_pipeline(SimpleImputer(), LogisticRegression(solver='lbfgs'))
    clf.fit(X, y)
    expected_shape = (X.shape[1], n_repeats)

    result = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                    random_state=rng)
    assert result.importances.shape == expected_shape

    # the correlated feature with y is the last column and should
    # have the highest importance
    means = result.importances_mean
    assert np.all(means[-1] > means[:-1])

    # use another random state
    rng = np.random.RandomState(0)
    result2 = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                     random_state=rng)
    assert result2.importances.shape == expected_shape

    # A different seed must give different raw importances.
    assert not np.allclose(result.importances, result2.importances)

    # the correlated feature with y is the last column and should
    # have the highest importance
    means2 = result2.importances_mean
    assert np.all(means2[-1] > means2[:-1])
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of Histogram GBC with an
    # a-priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should be
    # strictly equivalent (learn a sequence of splits that encode the same
    # decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of the
    # "Missing In Attributes" (MIA) missing value handling for decision trees
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in https://arxiv.org/abs/1902.06931

    class MinMaxImputer(BaseEstimator, TransformerMixin):
        # Toy transformer: doubles the feature count, NaNs imputed below the
        # observed minimum in the first half and above the maximum in the
        # second half.

        def fit(self, X, y=None):
            scaler = MinMaxScaler().fit(X)
            self.data_min_ = scaler.data_min_
            self.data_max_ = scaler.data_max_
            return self

        def transform(self, X):
            X_min = X.copy()
            X_max = X.copy()
            for col in range(X.shape[1]):
                missing = np.isnan(X[:, col])
                X_min[missing, col] = self.data_min_[col] - 1
                X_max[missing, col] = self.data_max_[col] + 1
            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        # The single RandomState is consumed in a fixed order:
        # make_regression, the random mask, then train_test_split.
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples, n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a structured
        # way:
        binner = KBinsDiscretizer(n_bins=42, encode="ordinal")
        X = binner.fit_transform(X)

        # First feature has missing values completely at random:
        random_missing = rng.rand(X.shape[0]) > 0.9
        X[random_missing, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        lowest_bin = X[:, 1] == 0
        X[lowest_bin, 1] = np.nan

        highest_bin = X[:, 2] == X[:, 2].max()
        X[highest_bin, 2] = np.nan

        # Make the last feature nan pattern very informative:
        y_cap = np.percentile(y, 70)
        capped = y >= y_cap
        y[capped] = y_cap
        X[capped, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for col in range(X.shape[1]):
            assert any(np.isnan(X[:, col]))

        # Let's use a test set to check that the learned decision function is
        # the same as evaluated on unseen data. Otherwise it could just be the
        # case that we find two independent ways to overfit the training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples need to be large enough to minimize the likelihood of having
    # several candidate splits with the same gain value in a given tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so as to keep
    # under-fitting models to minimize the likelihood of ties when training the
    # model.
    gbm1 = HistGradientBoostingRegressor(max_iter=100,
                                         max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the model reach the same score:
    for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
        assert gbm1.score(X_part, y_part) == \
            pytest.approx(gbm2.score(X_part, y_part))

    # Check the individual prediction match as a finer grained
    # decision function check.
    for X_part in (X_train, X_test):
        assert_allclose(gbm1.predict(X_part), gbm2.predict(X_part))
def test_bagging_with_pipeline():
    """After fitting, the first bagged pipeline's final step has an integer
    ``random_state``."""
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(base_pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)

    # estimator[0] is the first fitted pipeline; steps[-1][1] is its last
    # step's estimator object.
    final_step = estimator[0].steps[-1][1]
    assert isinstance(final_step.random_state, int)
# NOTE(review): the next line is the tail of a test whose `def` header is not visible in this view; it is left exactly as found.
# Regression test for #15920 cm = np.array([[19, 34], [32, 58]]) disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) disp.plot(cmap=pyplot.cm.Blues) min_color = pyplot.cm.Blues(0) max_color = pyplot.cm.Blues(255) assert_allclose(disp.text_[0, 0].get_color(), max_color) assert_allclose(disp.text_[0, 1].get_color(), max_color) assert_allclose(disp.text_[1, 0].get_color(), max_color) assert_allclose(disp.text_[1, 1].get_color(), min_color)


@pytest.mark.parametrize("clf", [
    LogisticRegression(),
    make_pipeline(StandardScaler(), LogisticRegression()),
    make_pipeline(make_column_transformer((StandardScaler(), [0, 1])),
                  LogisticRegression()),
])
def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes):
    """plot_confusion_matrix works for bare estimators and pipelines.

    An unfitted ``clf`` must raise NotFittedError; once fitted, the display
    must carry the same matrix as ``confusion_matrix`` and one text cell per
    (class, class) pair.
    """
    X, y = data
    with pytest.raises(NotFittedError):
        plot_confusion_matrix(clf, X, y)

    clf.fit(X, y)
    predictions = clf.predict(X)

    disp = plot_confusion_matrix(clf, X, y)
    expected_matrix = confusion_matrix(y, predictions)

    assert_allclose(disp.confusion_matrix, expected_matrix)
    assert disp.text_.shape == (n_classes, n_classes)
def test_pipeline_param_error():
    """Passing ``sample_weight`` to Pipeline.fit raises a ValueError."""
    clf = make_pipeline(LogisticRegression())
    expected_message = ("Pipeline.fit does not accept "
                        "the sample_weight parameter")
    with pytest.raises(ValueError, match=expected_message):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])