def test_permutation_importance_mixed_types_pandas():
    """Permutation importances on a mixed numeric/categorical DataFrame."""
    pd = pytest.importorskip("pandas")
    rng = np.random.RandomState(42)
    n_repeats = 5

    # Last column is correlated with y
    X = pd.DataFrame({
        'col1': [1.0, 2.0, 3.0, np.nan],
        'col2': ['a', 'b', 'a', 'b'],
    })
    y = np.array([0, 1, 0, 1])

    num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
    preprocess = ColumnTransformer([
        ('num', num_preprocess, ['col1']),
        ('cat', OneHotEncoder(), ['col2']),
    ])
    clf = make_pipeline(preprocess, LogisticRegression(solver='lbfgs'))
    clf.fit(X, y)

    result = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                    random_state=rng)

    assert result.importances.shape == (X.shape[1], n_repeats)
    # The feature correlated with y is the last column and should come out
    # with the highest mean importance.
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
def test_pipeline_ducktyping():
    """Pipelines only expose methods that their steps support."""
    # Predictor as final step: everything is available.
    pipeline = make_pipeline(Mult(5))
    pipeline.predict
    pipeline.transform
    pipeline.inverse_transform

    # Pure transformer: no predict, but transform both ways.
    pipeline = make_pipeline(Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    # A bare 'passthrough' step behaves like an identity transformer.
    pipeline = make_pipeline('passthrough')
    assert pipeline.steps[0] == ('passthrough', 'passthrough')
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    # Any non-invertible step removes inverse_transform from the pipeline,
    # regardless of its position.
    pipeline = make_pipeline(Transf(), NoInvTransf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')

    pipeline = make_pipeline(NoInvTransf(), Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')
def test_classes_property():
    """``classes_`` exists only on fitted classifier pipelines."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # A regressor pipeline never exposes classes_.
    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    # A classifier pipeline exposes classes_ only after fitting.
    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
def test_make_pipeline_memory():
    """make_pipeline forwards ``memory`` and defaults it to ``None``.

    Fix: the temporary cache directory was only removed on the success
    path; a failing assertion leaked it. Cleanup now runs in ``finally``.
    """
    cachedir = mkdtemp()
    try:
        if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
            # joblib < 0.12 used the ``cachedir`` argument; it was later
            # renamed to ``location``.
            memory = joblib.Memory(cachedir=cachedir, verbose=10)
        else:
            memory = joblib.Memory(location=cachedir, verbose=10)

        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert pipeline.memory is memory

        pipeline = make_pipeline(DummyTransf(), SVC())
        assert pipeline.memory is None
        assert len(pipeline) == 2
    finally:
        # Always remove the cache directory, even if an assertion failed.
        shutil.rmtree(cachedir)
def get_scores_for_imputer(imputer, X_missing, y_missing):
    """Cross-validated negative-MSE scores for REGRESSOR fed by ``imputer``.

    The imputer runs in a union with a MissingIndicator so the regressor
    also sees which entries were missing.
    """
    estimator = make_pipeline(
        make_union(imputer, MissingIndicator(missing_values=0)),
        REGRESSOR)
    return cross_val_score(estimator, X_missing, y_missing,
                           scoring='neg_mean_squared_error', cv=N_SPLITS)
def test_make_pipeline():
    """Step names are auto-generated and unknown kwargs are rejected."""
    t1, t2 = Transf(), Transf()

    # Duplicate estimator types get numbered names.
    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"

    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"
    assert pipe.steps[2][0] == "fitparamt"

    # Unexpected keyword arguments raise a TypeError.
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "random_parameter"',
        make_pipeline, t1, t2, random_parameter='rnd'
    )
def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params)
    search.fit(X)
    # With tight clusters the mid-range bandwidth should win.
    assert search.best_params_['kerneldensity__bandwidth'] == .1
def test_score_samples_on_pipeline_without_score_samples():
    """A pipeline lacks score_samples when its final step does."""
    X = np.array([[1], [2]])
    y = np.array([1, 2])

    pipe = make_pipeline(LogisticRegression())
    pipe.fit(X, y)

    # LogisticRegression defines no score_samples, so the delegated call
    # must fail with an AttributeError naming the final estimator.
    with pytest.raises(AttributeError,
                       match="'LogisticRegression' object has no attribute "
                             "'score_samples'"):
        pipe.score_samples(X)
def drop_first_component(X, y):
    """Fit a PCA + first-column-dropping pipeline and apply it.

    The pipeline is fitted on a train split of ``(X, y)``; the transformed
    test features are returned together with the matching test targets.
    """
    pipeline = make_pipeline(PCA(), FunctionTransformer(all_but_first_column))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline.fit(X_train, y_train)
    return pipeline.transform(X_test), y_test
def test_lasso_cv_with_some_model_selection():
    # Smoke test: LassoCV accepts an externally-constructed CV splitter
    # when used inside a pipeline.
    from mrex.pipeline import make_pipeline
    from mrex.preprocessing import StandardScaler
    from mrex.model_selection import StratifiedKFold
    from mrex import datasets
    from mrex.linear_model import LassoCV

    diabetes = datasets.load_diabetes()
    X, y = diabetes.data, diabetes.target

    pipe = make_pipeline(StandardScaler(), LassoCV(cv=StratifiedKFold()))
    pipe.fit(X, y)
# Non-regression check of Pipeline.__repr__: the rendered repr must match
# the pinned ``expected`` string exactly (including every parameter default
# of StandardScaler and LogisticRegression).  NOTE(review): the expected
# string is version-sensitive — it hard-codes defaults such as
# multi_class='warn' and solver='warn'; confirm it matches the library
# version under test before editing.
def test_pipeline(): # Render a pipeline object pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999)) expected = """ Pipeline(memory=None, steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=999, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False))], verbose=False)""" expected = expected[1:] # remove first \n assert pipeline.__repr__() == expected
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3)
    scorer = check_scoring(grid, "f1")
    assert isinstance(scorer, _PredictScorer)

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert isinstance(scorer, _PredictScorer)

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from
    # having a fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer(), cv=3)
    assert_array_equal(scores, 1)
def test_partial_dependence_pipeline():
    """partial_dependence on a pipeline matches the bare-estimator result."""
    iris = load_iris()

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    # Fit the bare classifier on manually scaled data, and the pipeline
    # on the raw data — both represent the same model.
    clf.fit(scaler.fit_transform(iris.data), iris.target)
    pipe.fit(iris.data, iris.target)

    features = 0
    pdp_pipe, values_pipe = partial_dependence(pipe, iris.data,
                                               features=[features])
    pdp_clf, values_clf = partial_dependence(clf, scaler.transform(iris.data),
                                             features=[features])
    assert_allclose(pdp_pipe, pdp_clf)
    # The pipeline's grid is expressed in the original (unscaled) feature
    # space, so undo the standardization on the bare-estimator grid.
    assert_allclose(
        values_pipe[0],
        values_clf[0] * scaler.scale_[features] + scaler.mean_[features])
def test_permutation_importance_mixed_types():
    """Permutation importances on numeric data containing NaNs."""
    rng = np.random.RandomState(42)
    n_repeats = 4

    # Last column is correlated with y
    X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T
    y = np.array([0, 1, 0, 1])

    clf = make_pipeline(SimpleImputer(), LogisticRegression(solver='lbfgs'))
    clf.fit(X, y)
    result = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                    random_state=rng)

    assert result.importances.shape == (X.shape[1], n_repeats)
    # The correlated (last) column should have the highest importance.
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])

    # A different random state must produce different permutations ...
    rng = np.random.RandomState(0)
    result2 = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                     random_state=rng)
    assert result2.importances.shape == (X.shape[1], n_repeats)
    assert not np.allclose(result.importances, result2.importances)
    # ... while the correlated column still ranks highest.
    assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
n_neighbors = 3
random_state = 0

# Load the Digits dataset and make a stratified 50/50 train/test split.
X, y = datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=random_state)

dim = len(X[0])
n_classes = len(np.unique(y))

# Reduce dimension to 2 with PCA
pca = make_pipeline(StandardScaler(),
                    PCA(n_components=2, random_state=random_state))

# Reduce dimension to 2 with LinearDiscriminantAnalysis
lda = make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis(n_components=2))

# Reduce dimension to 2 with NeighborhoodComponentsAnalysis
nca = make_pipeline(StandardScaler(),
                    NeighborhoodComponentsAnalysis(n_components=2,
                                                   random_state=random_state))

# Use a nearest neighbor classifier to evaluate the methods
knn = KNeighborsClassifier(n_neighbors=n_neighbors)

# Make a list of the methods to be compared
dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]
# NOTE(review): this chunk opens mid-expression — the leading
# ``y_full, scoring=..., cv=N_SPLITS), columns=['Full Data'])`` closes a
# DataFrame/cross_val_score call whose opening lies outside this view, so
# the statement is left byte-identical.  The rest injects one NaN per row
# into a copy of X_full, then scores SimpleImputer (mean/median) pipelines
# around ``br_estimator`` with cross-validation, and lists the estimators
# to be compared for iterative imputation.
y_full, scoring='neg_mean_squared_error', cv=N_SPLITS), columns=['Full Data']) # Add a single missing value to each row X_missing = X_full.copy() y_missing = y_full missing_samples = np.arange(n_samples) missing_features = rng.choice(n_features, n_samples, replace=True) X_missing[missing_samples, missing_features] = np.nan # Estimate the score after imputation (mean and median strategies) score_simple_imputer = pd.DataFrame() for strategy in ('mean', 'median'): estimator = make_pipeline( SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator) score_simple_imputer[strategy] = cross_val_score( estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS) # Estimate the score after iterative imputation of the missing values # with different estimators estimators = [ BayesianRidge(), DecisionTreeRegressor(max_features='sqrt', random_state=0), ExtraTreesRegressor(n_estimators=10, random_state=0), KNeighborsRegressor(n_neighbors=15) ]
def test_pipeline_param_error():
    """Pipeline.fit rejects fit params that are not step-prefixed."""
    clf = make_pipeline(LogisticRegression())
    with pytest.raises(ValueError,
                       match="Pipeline.fit does not accept "
                             "the sample_weight parameter"):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])
# NOTE(review): this chunk is truncated — the final statement
# ``clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(),``
# is cut off mid-call (its continuation lies outside this view), so the
# text is left byte-identical.  It plots univariate ANOVA scores next to
# the (normalized) absolute SVM weights of a MinMaxScaler + LinearSVC
# pipeline trained on all features.
# We use the default selection function to select the four # most significant features selector = SelectKBest(f_classif, k=4) selector.fit(X_train, y_train) scores = -np.log10(selector.pvalues_) scores /= scores.max() plt.bar(X_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)', color='darkorange', edgecolor='black') # ############################################################################# # Compare to the weights of an SVM clf = make_pipeline(MinMaxScaler(), LinearSVC()) clf.fit(X_train, y_train) print('Classification accuracy without selecting features: {:.3f}'.format( clf.score(X_test, y_test))) svm_weights = np.abs(clf[-1].coef_).sum(axis=0) svm_weights /= svm_weights.sum() plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='navy', edgecolor='black') clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(),
# License: BSD 3 clause

RANDOM_STATE = 42
FIG_SIZE = (10, 7)

features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=RANDOM_STATE)

# Fit to data and predict using pipelined GNB and PCA.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, GNB and PCA.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))
print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of Histogram GBC with an
    # a-priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should
    # be strictly equivalent (learn a sequence of splits that encode the
    # same decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of
    # the "Missing In Attributes" (MIA) missing value handling for decision
    # trees:
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in https://arxiv.org/abs/1902.06931

    class MinMaxImputer(BaseEstimator, TransformerMixin):
        # Toy MIA transformer: doubles the feature count, filling NaNs with
        # min-1 in the first copy and max+1 in the second.

        def fit(self, X, y=None):
            mm = MinMaxScaler().fit(X)
            self.data_min_ = mm.data_min_
            self.data_max_ = mm.data_max_
            return self

        def transform(self, X):
            X_min, X_max = X.copy(), X.copy()
            for feature_idx in range(X.shape[1]):
                nan_mask = np.isnan(X[:, feature_idx])
                X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
                X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples, n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a
        # structured way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan
        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative:
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function
        # is the same as evaluated on unseen data. Otherwise it could just
        # be the case that we find two independent ways to overfit the
        # training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples need to be large enough to minimize the likelihood of having
    # several candidate splits with the same gain value in a given tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so as to keep
    # under-fitting models to minimize the likelihood of ties when training
    # the model.
    gbm1 = HistGradientBoostingRegressor(max_iter=100, max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the model reach the same score:
    assert gbm1.score(X_train, y_train) == \
        pytest.approx(gbm2.score(X_train, y_train))
    assert gbm1.score(X_test, y_test) == \
        pytest.approx(gbm2.score(X_test, y_test))

    # Check the individual prediction match as a finer grained
    # decision function check.
    assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
    assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
# NOTE(review): this chunk begins with the tail of a data-loading function
# whose ``def`` lies outside this view (the bare ``return`` belongs to it),
# so the text is left byte-identical.  After the split, ESTIMATORS maps
# benchmark names to configured classifiers, including two kernel-
# approximation pipelines (Nystroem / RBFSampler feeding LinearSVC).
print("Creating train-test split...") n_train = 60000 X_train = X[:n_train] y_train = y[:n_train] X_test = X[n_train:] y_test = y[n_train:] return X_train, X_test, y_train, y_test ESTIMATORS = { "dummy": DummyClassifier(), 'CART': DecisionTreeClassifier(), 'ExtraTrees': ExtraTreesClassifier(), 'RandomForest': RandomForestClassifier(), 'Nystroem-SVM': make_pipeline( Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)), 'SampledRBF-SVM': make_pipeline( RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)), 'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, C=1e4), 'LogisticRegression-SAGA': LogisticRegression(solver='saga', tol=1e-1, C=1e4), 'MultilayerPerceptron': MLPClassifier( hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver='sgd', learning_rate_init=0.2, momentum=0.9, verbose=1, tol=1e-4, random_state=1), 'MLP-adam': MLPClassifier( hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver='adam', learning_rate_init=0.001, verbose=1, tol=1e-4, random_state=1) }
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(
    X_train, y_train, test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)
rt_lm = LogisticRegression(max_iter=1000)
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression(max_iter=1000)
rf.fit(X_train, y_train)
# One-hot encode the leaf indices reached by each sample, then fit the
# linear model on the held-out half of the training data.
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
# NOTE(review): this chunk opens mid-call — ``y, test_size=0.1,
# random_state=0)`` closes a train_test_split invocation whose opening lies
# outside this view, so the text is left byte-identical.  It then fits a
# QuantileTransformer + MLPRegressor pipeline, timing the fit and printing
# the test R2 score.
y, test_size=0.1, random_state=0) ############################################################################## # Partial Dependence computation for multi-layer perceptron # --------------------------------------------------------- # # Let's fit a MLPRegressor and compute single-variable partial dependence # plots print("Training MLPRegressor...") tic = time() est = make_pipeline( QuantileTransformer(), MLPRegressor(hidden_layer_sizes=(50, 50), learning_rate_init=0.01, early_stopping=True)) est.fit(X_train, y_train) print("done in {:.3f}s".format(time() - tic)) print("Test R2 score: {:.2f}".format(est.score(X_test, y_test))) ############################################################################## # We configured a pipeline to scale the numerical input features and tuned the # neural network size and learning rate to get a reasonable compromise between # training time and predictive performance on a test set. # # Importantly, this tabular dataset has very different dynamic ranges for its # features. Neural networks tend to be very sensitive to features with varying # scales and forgetting to preprocess the numeric feature would lead to a very # poor model.
from mrex.datasets import samples_generator
from mrex.feature_selection import SelectKBest, f_regression
from mrex.pipeline import make_pipeline
from mrex.model_selection import train_test_split
from mrex.metrics import classification_report

print(__doc__)

# import some data to play with
X, y = samples_generator.make_classification(
    n_features=20, n_informative=3, n_redundant=0, n_classes=4,
    n_clusters_per_class=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# ANOVA SVM-C
# 1) anova filter, take 3 best ranked features
# NOTE(review): f_regression is a regression scoring function applied to a
# classification target here — confirm f_classif was not intended.
anova_filter = SelectKBest(f_regression, k=3)
# 2) svm
clf = svm.LinearSVC()

anova_svm = make_pipeline(anova_filter, clf)
anova_svm.fit(X_train, y_train)
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))

# Map the SVM coefficients back into the original 20-feature space.
coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_)
print(coef)
rng.shuffle(x)
x = np.sort(x[:20])
y = f(x)

# create matrix versions of these arrays
X = x[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]

colors = ['teal', 'yellowgreen', 'gold']
lw = 2
plt.plot(x_plot, f(x_plot), color='cornflowerblue', linewidth=lw,
         label="ground truth")
plt.scatter(x, y, color='navy', s=30, marker='o', label="training points")

# Fit and draw one polynomial ridge model per degree.
for count, degree in enumerate([3, 4, 5]):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    y_plot = model.predict(X_plot)
    plt.plot(x_plot, y_plot, color=colors[count], linewidth=lw,
             label="degree %d" % degree)

plt.legend(loc='lower left')
plt.show()
# NOTE(review): this chunk opens mid-dict — ``'HuberRegressor': '--' }``
# closes a linestyle mapping whose opening lies outside this view, so the
# text is left byte-identical.  For each corruption scenario it fits a
# degree-3 polynomial pipeline per estimator and plots predictions with
# the test MSE in the legend label.
'HuberRegressor': '--' } lw = 3 x_plot = np.linspace(X.min(), X.max()) for title, this_X, this_y in [('Modeling Errors Only', X, y), ('Corrupt X, Small Deviants', X_errors, y), ('Corrupt y, Small Deviants', X, y_errors), ('Corrupt X, Large Deviants', X_errors_large, y), ('Corrupt y, Large Deviants', X, y_errors_large) ]: plt.figure(figsize=(5, 4)) plt.plot(this_X[:, 0], this_y, 'b+') for name, estimator in estimators: model = make_pipeline(PolynomialFeatures(3), estimator) model.fit(this_X, this_y) mse = mean_squared_error(model.predict(X_test), y_test) y_plot = model.predict(x_plot[:, np.newaxis]) plt.plot(x_plot, y_plot, color=colors[name], linestyle=linestyle[name], linewidth=lw, label='%s: error = %.3f' % (name, mse)) legend_title = 'Error of Mean\nAbsolute Deviation\nto Non-corrupt Data' legend = plt.legend(loc='upper right', frameon=False, title=legend_title, prop=dict(size='x-small'))
# NOTE(review): this chunk is incomplete at both ends — the leading
# ``if name == 'Pipeline': ... return name`` is the tail of a name-building
# helper whose ``def`` lies outside this view, and the ``classifiers`` list
# literal is never closed here — so the text is left byte-identical.  The
# list pairs each estimator with the GridSearchCV param_grid used to tune
# it, including two KBinsDiscretizer pipelines.
if name == 'Pipeline': name = [get_name(est[1]) for est in estimator.steps] name = ' + '.join(name) return name # list of (estimator, param_grid), where param_grid is used in GridSearchCV classifiers = [ (LogisticRegression(random_state=0), { 'C': np.logspace(-2, 7, 10) }), (LinearSVC(random_state=0), { 'C': np.logspace(-2, 7, 10) }), (make_pipeline( KBinsDiscretizer(encode='onehot'), LogisticRegression(random_state=0)), { 'kbinsdiscretizer__n_bins': np.arange(2, 10), 'logisticregression__C': np.logspace(-2, 7, 10), }), (make_pipeline( KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), { 'kbinsdiscretizer__n_bins': np.arange(2, 10), 'linearsvc__C': np.logspace(-2, 7, 10), }), (GradientBoostingClassifier(n_estimators=50, random_state=0), { 'learning_rate': np.logspace(-4, 0, 10) }), (SVC(random_state=0), { 'C': np.logspace(-2, 7, 10) }),
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset "
      "using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english',
                                   alternate_sign=False,
                                   norm=None)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False,
                                       norm='l2')
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)