Example #1
import numpy as np

from mrex.datasets import load_boston
from mrex.metrics import r2_score
from mrex.model_selection import train_test_split
# internal helpers; paths assume mrex mirrors scikit-learn's private layout
from mrex.ensemble._hist_gradient_boosting.binning import _BinMapper
from mrex.ensemble._hist_gradient_boosting.grower import TreeGrower
from mrex.ensemble._hist_gradient_boosting.common import G_H_DTYPE


def test_boston_dataset(n_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Initialize gradients and hessians to those of the least-squares loss:
    # with a zero initial prediction, the gradient is simply -y.
    gradients = -y_train.astype(G_H_DTYPE)
    # a single-element array tells the grower the hessians are constant
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned,
                        gradients,
                        hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes,
                        n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
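The grower above is the internal engine behind the public histogram-based estimator. A condensed equivalent using the high-level API (a sketch assuming mrex mirrors scikit-learn, where the estimator is experimental and must be enabled before import):

from mrex.experimental import enable_hist_gradient_boosting  # noqa
from mrex.ensemble import HistGradientBoostingRegressor
from mrex.datasets import load_boston
from mrex.metrics import r2_score
from mrex.model_selection import train_test_split

X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
est = HistGradientBoostingRegressor(max_leaf_nodes=31, min_samples_leaf=8,
                                    random_state=42).fit(X_train, y_train)
print(r2_score(y_test, est.predict(X_test)))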
Example #2
import numpy as np

from mrex.datasets import load_boston
from mrex.ensemble import RandomForestRegressor
# assuming mrex mirrors scikit-learn's inspection module
from mrex.inspection import permutation_importance


def test_permutation_importance_correlated_feature_regression(n_jobs):
    # Make sure that a feature highly correlated with the target gets a
    # higher importance
    rng = np.random.RandomState(42)
    n_repeats = 5

    X, y = load_boston(return_X_y=True)
    y_with_little_noise = (y +
                           rng.normal(scale=0.001, size=y.shape[0])).reshape(
                               -1, 1)

    X = np.hstack([X, y_with_little_noise])

    clf = RandomForestRegressor(n_estimators=10, random_state=42)
    clf.fit(X, y)

    result = permutation_importance(clf,
                                    X,
                                    y,
                                    n_repeats=n_repeats,
                                    random_state=rng,
                                    n_jobs=n_jobs)

    assert result.importances.shape == (X.shape[1], n_repeats)

    # the feature correlated with y was added as the last column and should
    # have the highest importance
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
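For intuition, a single column's importance can be estimated by hand: permute that column, re-score, and average the drop over repeats. A minimal sketch with a hypothetical helper (not the library implementation):

import numpy as np

def single_feature_importance(model, X, y, col, n_repeats=5, seed=0):
    rng = np.random.RandomState(seed)
    baseline = model.score(X, y)
    drops = []
    for _ in range(n_repeats):
        X_perm = X.copy()
        rng.shuffle(X_perm[:, col])  # break the column/target relationship
        drops.append(baseline - model.score(X_perm, y))
    return np.mean(drops)

The last column, being a barely-noisy copy of y, should show by far the largest drop.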
Example #3
import os
from functools import partial

from mrex.datasets import load_boston
# shared helper from the datasets test suite; exact path assumed
from mrex.datasets.tests.test_common import check_return_X_y


def test_load_boston():
    res = load_boston()
    assert res.data.shape == (506, 13)
    assert res.target.size == 506
    assert res.feature_names.size == 13
    assert res.DESCR
    assert os.path.exists(res.filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_boston))
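check_return_X_y verifies that the return_X_y=True tuple matches the Bunch fields; roughly (a sketch of the checks it performs, not necessarily the exact helper):

def check_return_X_y(bunch, dataset_func):
    X_y_tuple = dataset_func(return_X_y=True)
    assert isinstance(X_y_tuple, tuple)
    assert X_y_tuple[0].shape == bunch.data.shape
    assert X_y_tuple[1].shape == bunch.target.shape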
Example #4
import numpy as np

from mrex import datasets
from mrex.tree import DecisionTreeClassifier, DecisionTreeRegressor


def test_score_sample_weight():

    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [
        DecisionTreeClassifier(max_depth=2),
        DecisionTreeRegressor(max_depth=2)
    ]
    sets = [datasets.load_iris(), datasets.load_boston()]

    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that the scores with and without sample weights differ
        assert (est.score(ds.data, ds.target) != est.score(
            ds.data, ds.target,
            sample_weight=sample_weight)), ("Unweighted and weighted scores "
                                            "are unexpectedly equal")
Example #5
from copy import deepcopy

from mrex.datasets import load_boston
from mrex.linear_model import ElasticNet


def test_warm_start_convergence_with_regularizer_decrement():
    X, y = load_boston(return_X_y=True)

    # Train a model to converge on a lightly regularized problem
    final_alpha = 1e-5
    low_reg_model = ElasticNet(alpha=final_alpha).fit(X, y)

    # Fit a new model on a more regularized version of the same problem.
    # Fitting with high regularization is easier, so it should converge faster
    # in general.
    high_reg_model = ElasticNet(alpha=final_alpha * 10).fit(X, y)
    assert low_reg_model.n_iter_ > high_reg_model.n_iter_

    # Refit on the original, less regularized version of the problem, warm-
    # starting from the solution of the highly regularized variant as a
    # better starting point. This should also converge faster than the
    # original model that starts from zero.
    warm_low_reg_model = deepcopy(high_reg_model)
    warm_low_reg_model.set_params(warm_start=True, alpha=final_alpha)
    warm_low_reg_model.fit(X, y)
    assert low_reg_model.n_iter_ > warm_low_reg_model.n_iter_
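The same trick extends to a whole regularization path: reuse one warm-started estimator while stepping alpha downward, so that each fit starts from the previous solution. A minimal sketch:

import numpy as np
from mrex.datasets import load_boston
from mrex.linear_model import ElasticNet

X, y = load_boston(return_X_y=True)
model = ElasticNet(warm_start=True)
for alpha in np.logspace(-1, -5, 5):
    model.set_params(alpha=alpha)
    model.fit(X, y)  # each fit starts from the previous coef_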
Example #6
import numpy as np
import pytest

from mrex.datasets import load_boston
# IterativeImputer is experimental and must be enabled before import
from mrex.experimental import enable_iterative_imputer  # noqa
from mrex.impute import IterativeImputer


def test_iterative_imputer_catch_warning():
    # check that the RuntimeWarning caused by a division by zero on a
    # constant feature is caught internally and not surfaced to the user
    X, y = load_boston(return_X_y=True)
    n_samples, n_features = X.shape

    # simulate a feature that contains only a single value during fit
    X[:, 3] = 1

    # add some missing values
    rng = np.random.RandomState(0)
    missing_rate = 0.15
    for feat in range(n_features):
        sample_idx = rng.choice(np.arange(n_samples),
                                size=int(n_samples * missing_rate),
                                replace=False)
        X[sample_idx, feat] = np.nan

    imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True)
    with pytest.warns(None) as record:
        X_fill = imputer.fit_transform(X, y)
    assert not record.list
    assert not np.any(np.isnan(X_fill))
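pytest.warns(None) records warnings without asserting anything; the test then checks that record.list is empty. On newer pytest versions, where warns(None) is deprecated, an equivalent check can be written with the standard library (reusing imputer, X, y from the test above):

import warnings

with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter('always')
    X_fill = imputer.fit_transform(X, y)
assert not record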
Example #7
from mrex.datasets import load_boston
from mrex.ensemble import GradientBoostingRegressor
# assuming mrex mirrors scikit-learn's inspection module
from mrex.inspection import plot_partial_dependence


def test_plot_partial_dependence(pyplot):
    # Test the partial dependence plotting function.
    boston = load_boston()
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    plot_partial_dependence(clf,
                            boston.data, [0, 1, (0, 1)],
                            grid_resolution=grid_resolution,
                            feature_names=boston.feature_names)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3
    # note the call: a bare ax.has_data (bound method) is always truthy
    assert all(ax.has_data() for ax in axs)

    # check with str features and array feature names
    plot_partial_dependence(clf,
                            boston.data, ['CRIM', 'ZN', ('CRIM', 'ZN')],
                            grid_resolution=grid_resolution,
                            feature_names=boston.feature_names)

    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3
    assert all(ax.has_data() for ax in axs)

    # check with list feature_names
    feature_names = boston.feature_names.tolist()
    plot_partial_dependence(clf,
                            boston.data, ['CRIM', 'ZN', ('CRIM', 'ZN')],
                            grid_resolution=grid_resolution,
                            feature_names=feature_names)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3
    assert all(ax.has_data() for ax in axs)
Example #8
from mrex.utils.testing import assert_raises
from mrex.ensemble.partial_dependence import partial_dependence
from mrex.ensemble.partial_dependence import plot_partial_dependence
from mrex.ensemble import GradientBoostingClassifier
from mrex.ensemble import GradientBoostingRegressor
from mrex import datasets
from mrex.utils.testing import ignore_warnings

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
sample_weight = [1, 1, 1, 2, 2, 2]

# also load the boston dataset
boston = datasets.load_boston()

# also load the iris dataset
iris = datasets.load_iris()


@ignore_warnings(category=DeprecationWarning)
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
    assert pdp.shape == (1, 4)
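The grid is built on a feature's unique values whenever there are fewer of them than grid_resolution, and X[:, 0] above takes only four distinct values, as a quick check confirms:

import numpy as np
print(np.unique(np.asarray(X)[:, 0]))  # [-2 -1  1  2] -> 4 grid points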
Example #9
from mrex.datasets import load_boston, load_digits, load_iris
from mrex.preprocessing import MinMaxScaler, StandardScaler

ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]

X_digits, y_digits = load_digits(n_class=3, return_X_y=True)

X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_multi = y_digits[:200]

X_digits, y_digits = load_digits(n_class=2, return_X_y=True)

X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_binary = y_digits[:200]

classification_datasets = [(X_digits_multi, y_digits_multi),
                           (X_digits_binary, y_digits_binary)]

boston = load_boston()

Xboston = StandardScaler().fit_transform(boston.data)[:200]
yboston = boston.target[:200]

regression_datasets = [(Xboston, yboston)]

iris = load_iris()

X_iris = iris.data
y_iris = iris.target


def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
Example #10
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

###############################################################################
# Real-world data set
###############################################################################

###############################################################################
# In a similar manner, the Boston housing dataset is used to show the impact
# of transforming the targets before learning a model. In this example, the
# target to be predicted corresponds to the weighted distances to the five
# Boston employment centers.

import numpy as np
import matplotlib.pyplot as plt

from mrex.datasets import load_boston
from mrex.preprocessing import QuantileTransformer, quantile_transform

dataset = load_boston()
target = np.array(dataset.feature_names) == "DIS"
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
y_trans = quantile_transform(dataset.data[:, target],
                             n_quantiles=300,
                             output_distribution='normal',
                             copy=True).squeeze()

###############################################################################
# A :class:`mrex.preprocessing.QuantileTransformer` is used such that the
# targets follow a normal distribution before applying a
# :class:`mrex.linear_model.RidgeCV` model.

f, (ax0, ax1) = plt.subplots(1, 2)
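The rest of the example fits the same linear model on the raw and the transformed targets and compares the two. A condensed sketch of that comparison, reusing X, y, and QuantileTransformer from above (and assuming mrex mirrors scikit-learn's compose module for TransformedTargetRegressor):

from mrex.compose import TransformedTargetRegressor
from mrex.linear_model import RidgeCV
from mrex.metrics import r2_score
from mrex.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
regr = RidgeCV().fit(X_train, y_train)
regr_trans = TransformedTargetRegressor(
    regressor=RidgeCV(),
    transformer=QuantileTransformer(n_quantiles=300,
                                    output_distribution='normal'))
regr_trans.fit(X_train, y_train)
print(r2_score(y_test, regr.predict(X_test)),
      r2_score(y_test, regr_trans.predict(X_test)))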
Example #11
"""
====================================
Plotting Cross-Validated Predictions
====================================

This example shows how to use `cross_val_predict` to visualize prediction
errors.

"""
from mrex import datasets
from mrex.model_selection import cross_val_predict
from mrex import linear_model
import matplotlib.pyplot as plt

lr = linear_model.LinearRegression()
X, y = datasets.load_boston(return_X_y=True)

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(lr, X, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(y, predicted, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
Example #12
from mrex.model_selection import GridSearchCV
from mrex import datasets
from mrex.model_selection import cross_val_score, train_test_split
from mrex.datasets import make_multilabel_classification
from mrex.svm import SVC
from mrex.multiclass import OneVsRestClassifier
from mrex.neighbors import KNeighborsClassifier
from mrex.base import BaseEstimator, ClassifierMixin
from mrex.dummy import DummyRegressor
# additional imports needed by the test below
from mrex.ensemble import VotingClassifier
from mrex.linear_model import LogisticRegression
from mrex.utils.testing import assert_raise_message


# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

X_r, y_r = datasets.load_boston(return_X_y=True)


def test_estimator_init():
    eclf = VotingClassifier(estimators=[])
    msg = ('Invalid `estimators` attribute, `estimators` should be'
           ' a list of (string, estimator) tuples')
    assert_raise_message(AttributeError, msg, eclf.fit, X, y)

    clf = LogisticRegression(random_state=1)

    eclf = VotingClassifier(estimators=[('lr', clf)], voting='error')
    msg = ("Voting must be 'soft' or 'hard'; got (voting='error')")
    assert_raise_message(ValueError, msg, eclf.fit, X, y)

    eclf = VotingClassifier(estimators=[('lr', clf)], weights=[1, 2])
Example #13
                               sample_posterior=True)
    iterative_impute_scores = get_scores_for_imputer(imputer,
                                                     X_missing,
                                                     y_missing)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (iterative_impute_scores.mean(), iterative_impute_scores.std()))


results_diabetes = np.array(get_results(load_diabetes()))
mses_diabetes = results_diabetes[:, 0] * -1
stds_diabetes = results_diabetes[:, 1]

results_boston = np.array(get_results(load_boston()))
mses_boston = results_boston[:, 0] * -1
stds_boston = results_boston[:, 1]

n_bars = len(mses_diabetes)
xval = np.arange(n_bars)

x_labels = ['Full data',
            'Zero imputation',
            'Mean imputation',
            'Multivariate imputation']
colors = ['r', 'g', 'b', 'orange']

# plot diabetes results
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(121)
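get_scores_for_imputer is defined earlier in the full script; a sketch of what such a helper typically does (cross-validating an impute-then-regress pipeline with negated-MSE scoring; estimator and parameters here are assumptions, not the author's exact code):

from mrex.ensemble import RandomForestRegressor
from mrex.model_selection import cross_val_score
from mrex.pipeline import make_pipeline

def get_scores_for_imputer(imputer, X_missing, y_missing):
    estimator = make_pipeline(imputer,
                              RandomForestRegressor(random_state=0,
                                                    n_estimators=100))
    return cross_val_score(estimator, X_missing, y_missing,
                           scoring='neg_mean_squared_error', cv=5)

The * -1 above converts the negated-MSE means back to positive errors for plotting.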
Example #14
"""
print(__doc__)

# Author: Virgile Fritsch <*****@*****.**>
# License: BSD 3 clause

import numpy as np
from mrex.covariance import EllipticEnvelope
from mrex.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from mrex.datasets import load_boston

# Get data
X1 = load_boston()['data'][:, [8, 10]]  # two clusters
X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance":
    EllipticEnvelope(support_fraction=1., contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
    EllipticEnvelope(contamination=0.261),
    "OCSVM":
    OneClassSVM(nu=0.261, gamma=0.05)
}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}
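Each "classifier" is then fitted on the two 2-D views of the data and its decision boundary drawn as a zero-level contour. A minimal sketch of that loop for the first view, reusing X1, classifiers, colors, and legend1 from above (grid bounds and plot styling are illustrative assumptions):

xx1, yy1 = np.meshgrid(np.linspace(X1[:, 0].min() - 5, X1[:, 0].max() + 5, 100),
                       np.linspace(X1[:, 1].min() - 5, X1[:, 1].max() + 5, 100))
for (name, clf), color in zip(classifiers.items(), colors):
    clf.fit(X1)
    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
    Z1 = Z1.reshape(xx1.shape)
    legend1[name] = plt.contour(xx1, yy1, Z1, levels=[0], colors=color)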