def test_boston_dataset(n_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Init gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
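
# For context, a minimal sketch of the same pipeline through the public
# estimator that wraps _BinMapper and TreeGrower internally. This assumes
# mrex mirrors scikit-learn's experimental HistGradientBoostingRegressor
# API; the import paths below are an assumption, not taken from this test.
def example_public_hist_gbdt():
    from mrex.experimental import enable_hist_gradient_boosting  # noqa
    from mrex.ensemble import HistGradientBoostingRegressor

    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    est = HistGradientBoostingRegressor(max_leaf_nodes=31, min_samples_leaf=8,
                                        random_state=42)
    # binning, growing and prediction all happen inside fit/predict
    est.fit(X_train, y_train)
    return r2_score(y_test, est.predict(X_test))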
def test_permutation_importance_correlated_feature_regression(n_jobs):
    # Make sure that a feature highly correlated with the target has a
    # higher importance
    rng = np.random.RandomState(42)
    n_repeats = 5

    X, y = load_boston(return_X_y=True)
    y_with_little_noise = (
        y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)

    X = np.hstack([X, y_with_little_noise])

    clf = RandomForestRegressor(n_estimators=10, random_state=42)
    clf.fit(X, y)

    result = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                    random_state=rng, n_jobs=n_jobs)

    assert result.importances.shape == (X.shape[1], n_repeats)

    # the feature correlated with y was added as the last column and should
    # have the highest importance
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
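
# Small usage sketch: reporting each feature's permutation importance as
# mean +/- std over the repeats. The attribute names follow the result
# object asserted above; `feature_names` is a hypothetical argument.
def example_report_importances(result, feature_names):
    for name, mean, std in zip(feature_names,
                               result.importances_mean,
                               result.importances_std):
        print("%s: %.3f +/- %.3f" % (name, mean, std))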
def test_load_boston():
    res = load_boston()
    assert res.data.shape == (506, 13)
    assert res.target.size == 506
    assert res.feature_names.size == 13
    assert res.DESCR
    assert os.path.exists(res.filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_boston))
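
# Illustration of the return_X_y shortcut exercised by check_return_X_y
# above; the shapes follow directly from the Bunch attributes asserted in
# test_load_boston.
def example_load_boston_return_X_y():
    X, y = load_boston(return_X_y=True)
    assert X.shape == (506, 13)
    assert y.shape == (506,)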
def test_score_sample_weight():
    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [DecisionTreeClassifier(max_depth=2),
                  DecisionTreeRegressor(max_depth=2)]
    sets = [datasets.load_iris(),
            datasets.load_boston()]

    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that the scores with and without sample weights are different
        assert (est.score(ds.data, ds.target) !=
                est.score(ds.data, ds.target,
                          sample_weight=sample_weight)), (
            "Unweighted and weighted scores are unexpectedly equal")
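
# Complementary sketch: with *uniform* sample weights the weighted score
# should reproduce the unweighted one, in contrast to the random integer
# weights used above. Estimator and data choices mirror the test.
def example_uniform_weights_preserve_score():
    ds = datasets.load_boston()
    est = DecisionTreeRegressor(max_depth=2).fit(ds.data, ds.target)
    uniform = np.ones(len(ds.target))
    assert np.isclose(est.score(ds.data, ds.target),
                      est.score(ds.data, ds.target, sample_weight=uniform))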
def test_warm_start_convergence_with_regularizer_decrement():
    X, y = load_boston(return_X_y=True)

    # Train a model to converge on a lightly regularized problem
    final_alpha = 1e-5
    low_reg_model = ElasticNet(alpha=final_alpha).fit(X, y)

    # Fit a new model on a more regularized version of the same problem.
    # Fitting with high regularization is easier; it should converge faster
    # in general.
    high_reg_model = ElasticNet(alpha=final_alpha * 10).fit(X, y)
    assert low_reg_model.n_iter_ > high_reg_model.n_iter_

    # Fit the solution to the original, less regularized version of the
    # problem but from the solution of the highly regularized variant of
    # the problem as a better starting point. This should also converge
    # faster than the original model that starts from zero.
    warm_low_reg_model = deepcopy(high_reg_model)
    warm_low_reg_model.set_params(warm_start=True, alpha=final_alpha)
    warm_low_reg_model.fit(X, y)
    assert low_reg_model.n_iter_ > warm_low_reg_model.n_iter_
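
# Generalizing the trick above: a warm-started path from strong to weak
# regularization, reusing one ElasticNet instance so each fit starts from
# the previous coefficients. A sketch, not part of the test suite.
def example_warm_started_path():
    X, y = load_boston(return_X_y=True)
    model = ElasticNet(warm_start=True)
    n_iters = []
    for alpha in [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]:
        model.set_params(alpha=alpha)
        model.fit(X, y)  # seeded by the coefficients of the previous fit
        n_iters.append(model.n_iter_)
    return n_iters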
def test_iterative_imputer_catch_warning():
    # check that we catch a RuntimeWarning due to a division by zero when a
    # feature is constant in the dataset
    X, y = load_boston(return_X_y=True)
    n_samples, n_features = X.shape

    # simulate that a feature only contains one category during fit
    X[:, 3] = 1

    # add some missing values
    rng = np.random.RandomState(0)
    missing_rate = 0.15
    for feat in range(n_features):
        sample_idx = rng.choice(np.arange(n_samples),
                                size=int(n_samples * missing_rate),
                                replace=False)
        X[sample_idx, feat] = np.nan

    imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True)
    with pytest.warns(None) as record:
        X_fill = imputer.fit_transform(X, y)
    assert not record.list
    assert not np.any(np.isnan(X_fill))
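
# Hypothetical helper (not part of the test above) factoring out the
# masking logic: punch np.nan into a copy of X at a fixed rate per feature.
def make_missing_values(X, missing_rate=0.15, random_state=0):
    rng = np.random.RandomState(random_state)
    X = X.copy()
    n_samples, n_features = X.shape
    n_missing = int(n_samples * missing_rate)
    for feat in range(n_features):
        sample_idx = rng.choice(np.arange(n_samples), size=n_missing,
                                replace=False)
        X[sample_idx, feat] = np.nan
    return X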
def test_plot_partial_dependence(pyplot):
    # Test partial dependence plot function.
    boston = load_boston()
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)],
                            grid_resolution=grid_resolution,
                            feature_names=boston.feature_names)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3
    assert all(ax.has_data() for ax in axs)

    # check with str features and array feature names
    plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', ('CRIM', 'ZN')],
                            grid_resolution=grid_resolution,
                            feature_names=boston.feature_names)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3
    assert all(ax.has_data() for ax in axs)

    # check with list feature_names
    feature_names = boston.feature_names.tolist()
    plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', ('CRIM', 'ZN')],
                            grid_resolution=grid_resolution,
                            feature_names=feature_names)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3
    assert all(ax.has_data() for ax in axs)
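
# Usage sketch built on the same fitted model: plotting only the (0, 1)
# interaction and saving the figure through plain matplotlib. The output
# filename is hypothetical.
def example_save_interaction_plot(pyplot):
    boston = load_boston()
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)
    plot_partial_dependence(clf, boston.data, [(0, 1)], grid_resolution=10,
                            feature_names=boston.feature_names)
    pyplot.gcf().savefig('pdp_interaction.png')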
from mrex.utils.testing import assert_raises
from mrex.ensemble.partial_dependence import partial_dependence
from mrex.ensemble.partial_dependence import plot_partial_dependence
from mrex.ensemble import GradientBoostingClassifier
from mrex.ensemble import GradientBoostingRegressor
from mrex import datasets
from mrex.utils.testing import ignore_warnings


# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
sample_weight = [1, 1, 1, 2, 2, 2]

# also load the boston dataset
boston = datasets.load_boston()

# also load the iris dataset
iris = datasets.load_iris()


@ignore_warnings(category=DeprecationWarning)
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
    assert pdp.shape == (1, 4)
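
# Sketch of the other calling convention of this deprecated API, assuming
# it behaves like scikit-learn's old ensemble.partial_dependence: passing
# an explicit grid instead of X yields one value per grid point, and no
# axes are computed.
@ignore_warnings(category=DeprecationWarning)
def example_partial_dependence_explicit_grid():
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    grid = [[-2.0], [0.0], [2.0]]
    pdp, axes = partial_dependence(clf, [0], grid=grid)
    assert pdp.shape == (1, 3)
    assert axes is None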
ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]

X_digits, y_digits = load_digits(n_class=3, return_X_y=True)

X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_multi = y_digits[:200]

X_digits, y_digits = load_digits(n_class=2, return_X_y=True)

X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_binary = y_digits[:200]

classification_datasets = [(X_digits_multi, y_digits_multi),
                           (X_digits_binary, y_digits_binary)]

boston = load_boston()

Xboston = StandardScaler().fit_transform(boston.data)[:200]
yboston = boston.target[:200]

regression_datasets = [(Xboston, yboston)]

iris = load_iris()

X_iris = iris.data
y_iris = iris.target


def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

###############################################################################
# Real-world data set
###############################################################################

###############################################################################
# In a similar manner, the boston housing data set is used to show the impact
# of transforming the targets before learning a model. In this example, the
# target to be predicted corresponds to the weighted distances to the five
# Boston employment centers.

from mrex.datasets import load_boston
from mrex.preprocessing import QuantileTransformer, quantile_transform

dataset = load_boston()
target = np.array(dataset.feature_names) == "DIS"
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
y_trans = quantile_transform(dataset.data[:, target],
                             n_quantiles=300,
                             output_distribution='normal',
                             copy=True).squeeze()

###############################################################################
# A :class:`mrex.preprocessing.QuantileTransformer` is used such that the
# target follows a normal distribution before applying a
# :class:`mrex.linear_model.RidgeCV` model.

f, (ax0, ax1) = plt.subplots(1, 2)
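
###############################################################################
# The same idea can be packaged as a single estimator. A sketch, assuming
# mrex provides :class:`mrex.compose.TransformedTargetRegressor` like
# scikit-learn does: the transformer is applied to ``y`` at fit time and
# inverted at predict time.

from mrex.compose import TransformedTargetRegressor
from mrex.linear_model import RidgeCV

regr = TransformedTargetRegressor(
    regressor=RidgeCV(),
    transformer=QuantileTransformer(n_quantiles=300,
                                    output_distribution='normal'))
regr.fit(X, y)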
""" ==================================== Plotting Cross-Validated Predictions ==================================== This example shows how to use `cross_val_predict` to visualize prediction errors. """ from mrex import datasets from mrex.model_selection import cross_val_predict from mrex import linear_model import matplotlib.pyplot as plt lr = linear_model.LinearRegression() X, y = datasets.load_boston(return_X_y=True) # cross_val_predict returns an array of the same size as `y` where each entry # is a prediction obtained by cross validation: predicted = cross_val_predict(lr, X, y, cv=10) fig, ax = plt.subplots() ax.scatter(y, predicted, edgecolors=(0, 0, 0)) ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4) ax.set_xlabel('Measured') ax.set_ylabel('Predicted') plt.show()
from mrex.model_selection import GridSearchCV
from mrex import datasets
from mrex.model_selection import cross_val_score, train_test_split
from mrex.datasets import make_multilabel_classification
from mrex.svm import SVC
from mrex.multiclass import OneVsRestClassifier
from mrex.neighbors import KNeighborsClassifier
from mrex.base import BaseEstimator, ClassifierMixin
from mrex.dummy import DummyRegressor
from mrex.ensemble import VotingClassifier
from mrex.linear_model import LogisticRegression
from mrex.utils.testing import assert_raise_message

# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

X_r, y_r = datasets.load_boston(return_X_y=True)


def test_estimator_init():
    eclf = VotingClassifier(estimators=[])
    msg = ('Invalid `estimators` attribute, `estimators` should be'
           ' a list of (string, estimator) tuples')
    assert_raise_message(AttributeError, msg, eclf.fit, X, y)

    clf = LogisticRegression(random_state=1)

    eclf = VotingClassifier(estimators=[('lr', clf)], voting='error')
    msg = ('Voting must be \'soft\' or \'hard\'; got (voting=\'error\')')
    assert_raise_message(ValueError, msg, eclf.fit, X, y)

    eclf = VotingClassifier(estimators=[('lr', clf)], weights=[1, 2])
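
# A positive-path sketch to complement the validation checks above: a
# two-estimator soft-voting classifier on the iris slice loaded earlier.
def example_voting_classifier_predict():
    clf1 = LogisticRegression(random_state=1)
    clf2 = KNeighborsClassifier(n_neighbors=5)
    eclf = VotingClassifier(estimators=[('lr', clf1), ('knn', clf2)],
                            voting='soft')
    eclf.fit(X, y)
    return eclf.predict(X[:5])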
                               sample_posterior=True)
    iterative_impute_scores = get_scores_for_imputer(imputer,
                                                     X_missing,
                                                     y_missing)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (iterative_impute_scores.mean(), iterative_impute_scores.std()))


results_diabetes = np.array(get_results(load_diabetes()))
mses_diabetes = results_diabetes[:, 0] * -1
stds_diabetes = results_diabetes[:, 1]

results_boston = np.array(get_results(load_boston()))
mses_boston = results_boston[:, 0] * -1
stds_boston = results_boston[:, 1]

n_bars = len(mses_diabetes)
xval = np.arange(n_bars)

x_labels = ['Full data',
            'Zero imputation',
            'Mean Imputation',
            'Multivariate Imputation']
colors = ['r', 'g', 'b', 'orange']

# plot diabetes results
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(121)
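
# A plausible continuation (sketch only, not taken from the excerpt):
# horizontal MSE bars with std error bars for the diabetes panel, built
# from the arrays prepared above.
for j in xval:
    ax1.barh(j, mses_diabetes[j], xerr=stds_diabetes[j],
             color=colors[j], alpha=0.6, align='center')
ax1.set_title('Imputation Techniques with Diabetes Data')
ax1.set_yticks(xval)
ax1.set_yticklabels(x_labels)
ax1.set_xlabel('MSE')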
""" print(__doc__) # Author: Virgile Fritsch <*****@*****.**> # License: BSD 3 clause import numpy as np from mrex.covariance import EllipticEnvelope from mrex.svm import OneClassSVM import matplotlib.pyplot as plt import matplotlib.font_manager from mrex.datasets import load_boston # Get data X1 = load_boston()['data'][:, [8, 10]] # two clusters X2 = load_boston()['data'][:, [5, 12]] # "banana"-shaped # Define "classifiers" to be used classifiers = { "Empirical Covariance": EllipticEnvelope(support_fraction=1., contamination=0.261), "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope(contamination=0.261), "OCSVM": OneClassSVM(nu=0.261, gamma=0.05) } colors = ['m', 'g', 'b'] legend1 = {} legend2 = {}