def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    """Check that the estimators cope with missing values.

    A synthetic dataset is generated, a random subset of its entries is
    replaced by NaN, and the estimator fitted on that data must still reach
    a minimum score on the training set.

    Parameters
    ----------
    problem : str
        ``'regression'`` selects the regressor; any other value selects the
        classifier.
    missing_proportion : float
        Probability, per entry of ``X``, of being replaced by NaN.
    expected_min_score_classification : float
        Score the classifier has to exceed.
    expected_min_score_regression : float
        Score the regressor has to exceed.
    """
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    if problem == 'regression':
        X, y = make_regression(n_samples=n_samples, n_features=n_features,
                               n_informative=n_features, random_state=rng)
        gb = HistGradientBoostingRegressor()
        expected_min_score = expected_min_score_regression
    else:
        X, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_informative=n_features, n_redundant=0,
                                   n_repeated=0, random_state=rng)
        gb = HistGradientBoostingClassifier()
        expected_min_score = expected_min_score_classification

    # ``np.bool`` was only an alias of the builtin and no longer exists in
    # recent NumPy releases; the builtin ``bool`` yields the same dtype.
    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
    X[mask] = np.nan

    gb.fit(X, y)

    assert gb.score(X, y) > expected_min_score
def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p,
                               mode_p_greater_than_n):
    """Check the mode picked by ``_check_gcv_mode`` for X and for X.T.

    The data is generated with 5 samples and 2 features; when ``sparse`` is
    truthy it is converted to CSR before being passed on.
    """
    dense_X, _ = make_regression(n_samples=5, n_features=2)
    design = sp.csr_matrix(dense_X) if sparse else dense_X

    expectations = (
        (design, mode_n_greater_than_p),
        (design.T, mode_p_greater_than_n),
    )
    for matrix, expected_mode in expectations:
        assert _check_gcv_mode(matrix, mode) == expected_mode
def test_check_gcv_mode_error(mode):
    """An unsupported ``gcv_mode`` must raise ValueError.

    The error is expected both when fitting ``RidgeCV`` and when calling
    ``_check_gcv_mode`` directly.
    """
    expected_message = "Unknown value for 'gcv_mode'"
    X, y = make_regression(n_samples=5, n_features=2)

    estimator = RidgeCV(gcv_mode=mode)
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(X, y)

    with pytest.raises(ValueError, match=expected_message):
        _check_gcv_mode(X, mode)
def _make_sparse_offset_regression(
        n_samples=100, n_features=100, proportion_nonzero=.5,
        n_informative=10, n_targets=1, bias=13., X_offset=30.,
        noise=30., shuffle=True, coef=False,
        random_state=None):
    """Build a regression problem whose design matrix is offset and sparse.

    ``make_regression`` is called first; ``X_offset`` is then added to every
    entry of ``X`` and a binomial mask (success probability
    ``proportion_nonzero``) decides which entries are kept. The others are
    set to zero and their contribution is subtracted from ``y``.

    Returns
    -------
    X, y : arrays
        The masked data and the adjusted targets.
    c : array, only returned when ``coef`` is truthy
        The coefficients returned by ``make_regression``.
    """
    single_feature = n_features == 1

    X, y, true_coef = make_regression(
        n_samples=n_samples, n_features=n_features,
        n_informative=n_informative, n_targets=n_targets, bias=bias,
        noise=noise, shuffle=shuffle, coef=True,
        random_state=random_state)
    if single_feature:
        # wrap the coefficient so that the ``dot`` below works
        true_coef = np.asarray([true_coef])

    X += X_offset
    mask_rng = np.random.RandomState(random_state)
    keep = mask_rng.binomial(1, proportion_nonzero, X.shape) > 0

    # entries that are about to be dropped from X, zero everywhere else
    dropped = np.where(keep, 0., X)
    X[~keep] = 0.
    y -= dropped.dot(true_coef)

    if single_feature:
        true_coef = true_coef[0]
    return (X, y, true_coef) if coef else (X, y)
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    """Make sure mrex has the same predictions as LightGBM for easy targets.

    In particular when the size of the trees is bound and the number of
    samples is large enough, the structure of the prediction trees found by
    LightGBM and mrex should be exactly identical.

    Notes
    -----
    - Several candidate splits may have equal gains when the number of
      samples in a node is low (and because of float errors). Therefore the
      predictions on the test set might differ if the structure of the tree
      is not exactly the same. To avoid this issue we only compare the
      predictions on the test set when the number of samples is large enough
      and max_leaf_nodes is low enough.
    - To ignore discrepancies caused by small differences in the binning
      strategy, data is pre-binned if n_samples > 255.
    """
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_mrex = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_mrex, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_mrex.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_mrex = est_mrex.predict(X_train)
    # fewer than 1.1% of the training predictions may differ by more than
    # 1e-3
    assert np.mean(abs(pred_lightgbm - pred_mrex) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_mrex = est_mrex.predict(X_test)
        # fewer than 1% of the test predictions may differ by more than 1e-4
        assert np.mean(abs(pred_lightgbm - pred_mrex) > 1e-4) < .01
def test_ridge_sag_with_X_fortran():
    """Fitting Ridge with the SAG solver must accept Fortran-ordered data."""
    X, y = make_regression(random_state=42)
    # Keep every other row of the Fortran-ordered X (and every other target)
    # so that neither input is a C-ordered array.
    X_strided = np.asfortranarray(X)[::2, :]
    y_strided = y[::2]
    Ridge(solver='sag').fit(X_strided, y_strided)
def test_multioutput_regression():
    """MLPRegressor must fit a 5-target regression problem well."""
    X, y = make_regression(n_samples=200, n_targets=5)
    regressor = MLPRegressor(
        solver='lbfgs',
        hidden_layer_sizes=50,
        max_iter=200,
        random_state=1,
    )
    regressor.fit(X, y)
    training_score = regressor.score(X, y)
    assert training_score > 0.9
def make_regression_with_outliers(n_samples=50, n_features=20):
    """Return a regression dataset in which some rows of X are pure noise.

    ``int(0.1 * n_samples)`` row indices are drawn with ``rng.randint`` (so
    an index may be drawn more than once) and the corresponding rows of X
    are overwritten with scaled Gaussian noise. ``y`` is left untouched.
    """
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           random_state=0, noise=0.05)

    rng = np.random.RandomState(0)
    n_outliers = int(0.1 * n_samples)
    outlier_rows = rng.randint(0, n_samples, n_outliers)
    gaussian_noise = rng.normal(0, 1, (n_outliers, X.shape[1]))
    X[outlier_rows, :] = 2.0 * gaussian_noise
    return X, y
def test_make_regression():
    """Check shapes, sparsity of the coefficients and noise level."""
    X, y, coef = make_regression(n_samples=100, n_features=10,
                                 n_informative=3, effective_rank=5,
                                 coef=True, bias=0.0, noise=1.0,
                                 random_state=0)

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100, ), "y shape mismatch"
    assert coef.shape == (10, ), "coef shape mismatch"
    n_nonzero = sum(coef != 0.0)
    assert n_nonzero == 3, "Unexpected number of informative features"

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    residuals = y - np.dot(X, coef)
    assert_almost_equal(np.std(residuals), 1.0, decimal=1)

    # Test with small number of features.
    X_single, _ = make_regression(n_samples=100,
                                  n_features=1)  # n_informative=3
    assert X_single.shape == (100, 1)
def get_estimator_and_data():
    """Build the dataset and pick the estimator class for ``args.problem``.

    Twice ``args.n_samples_max`` samples are generated.

    Returns
    -------
    X, y : arrays
        The generated data.
    Estimator : class
        ``HistGradientBoostingClassifier`` for ``'classification'``,
        ``HistGradientBoostingRegressor`` for ``'regression'``.

    Raises
    ------
    ValueError
        If ``args.problem`` is neither ``'classification'`` nor
        ``'regression'``. The function used to return ``None`` in that case,
        which only failed later when the caller unpacked the result.
    """
    if args.problem == 'classification':
        X, y = make_classification(args.n_samples_max * 2,
                                   n_features=args.n_features,
                                   n_classes=args.n_classes,
                                   n_clusters_per_class=1,
                                   random_state=0)
        return X, y, HistGradientBoostingClassifier
    if args.problem == 'regression':
        X, y = make_regression(args.n_samples_max * 2,
                               n_features=args.n_features, random_state=0)
        return X, y, HistGradientBoostingRegressor
    raise ValueError(
        "Unknown problem {!r}: expected 'classification' or "
        "'regression'".format(args.problem))
def test_multi_target_regression():
    """MultiOutputRegressor must match one regressor fitted per target."""
    n_train = 50
    X, y = datasets.make_regression(n_targets=3)
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    # Reference: an independent regressor for each of the 3 target columns.
    references = np.zeros_like(y_test)
    for target_idx in range(3):
        single_output = GradientBoostingRegressor(random_state=0)
        single_output.fit(X_train, y_train[:, target_idx])
        references[:, target_idx] = single_output.predict(X_test)

    multi_output = MultiOutputRegressor(
        GradientBoostingRegressor(random_state=0))
    multi_output.fit(X_train, y_train)

    assert_almost_equal(references, multi_output.predict(X_test))
def test_sparse_regression():
    """Check regression with sparse input.

    For each sparse format, an AdaBoostRegressor trained on the sparse
    training set must give (almost) the same ``predict`` and
    ``staged_predict`` results as one trained on the dense data. Finally the
    base estimators of the last sparse-trained ensemble must have been fitted
    on CSC or CSR matrices.
    """

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit as SVR does and remember the type of X for later checks."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_regressor.predict(X_test_sparse)
        dense_results = dense_regressor.predict(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_regressor.staged_predict(X_test_sparse)
        dense_results = dense_regressor.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

    types = [i.data_type_ for i in sparse_regressor.estimators_]

    assert all(t == csc_matrix or t == csr_matrix for t in types)
def test_make_regression_multitarget():
    """Check shapes, coefficient sparsity and noise level with 3 targets."""
    X, y, coef = make_regression(n_samples=100, n_features=10,
                                 n_informative=3, n_targets=3, coef=True,
                                 noise=1., random_state=0)

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100, 3), "y shape mismatch"
    assert coef.shape == (10, 3), "coef shape mismatch"

    # builtin ``sum`` adds the rows together: one count per target column
    nonzero_per_target = sum(coef != 0.0)
    assert_array_equal(nonzero_per_target, 3,
                       "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    residuals = y - np.dot(X, coef)
    assert_almost_equal(np.std(residuals), 1.0, decimal=1)
def test_multi_target_sparse_regression():
    """Dense and sparse inputs must lead to the same multi-output fit."""
    n_train = 50
    X, y = datasets.make_regression(n_targets=3)
    X_train, X_test = X[:n_train], X[n_train:]
    y_train = y[:n_train]

    sparse_constructors = [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix,
                           sp.dok_matrix, sp.lil_matrix]
    for to_sparse in sparse_constructors:
        dense_model = MultiOutputRegressor(Lasso(random_state=0))
        sparse_model = MultiOutputRegressor(Lasso(random_state=0))

        dense_model.fit(X_train, y_train)
        sparse_model.fit(to_sparse(X_train), y_train)

        dense_pred = dense_model.predict(X_test)
        sparse_pred = sparse_model.predict(to_sparse(X_test))
        assert_almost_equal(dense_pred, sparse_pred)
def test_permutation_importance_linear_regresssion():
    """Compare permutation importances with their closed-form value.

    Both X and y are passed through ``scale`` before fitting a linear
    regression; the expected importances are ``2 * coef_ ** 2``.
    """
    raw_X, raw_y = make_regression(n_samples=500, n_features=10,
                                   random_state=0)
    X, y = scale(raw_X), scale(raw_y)

    model = LinearRegression().fit(X, y)

    # this relationship can be computed in closed form
    closed_form = 2 * model.coef_**2
    outcome = permutation_importance(model, X, y, n_repeats=50,
                                     scoring='neg_mean_squared_error')

    assert_allclose(closed_form, outcome.importances_mean,
                    rtol=1e-1, atol=1e-6)
def test_multi_target_regression_partial_fit():
    """``partial_fit`` on MultiOutputRegressor must match per-target fits.

    The training set is fed in two halves, first to one SGDRegressor per
    target and then to a MultiOutputRegressor wrapping an SGDRegressor. The
    last assertion checks that wrapping ``Lasso`` exposes no ``partial_fit``.
    """
    X, y = datasets.make_regression(n_targets=3)
    X_train, X_test = X[:50], X[50:]
    y_train, y_test = y[:50], y[50:]

    half_index = 25
    batches = (slice(None, half_index), slice(half_index, None))

    references = np.zeros_like(y_test)
    for target_idx in range(3):
        single = SGDRegressor(random_state=0, max_iter=5)
        for batch in batches:
            single.partial_fit(X_train[batch], y_train[batch, target_idx])
        references[:, target_idx] = single.predict(X_test)

    multi = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    for batch in batches:
        multi.partial_fit(X_train[batch], y_train[batch])

    assert_almost_equal(references, multi.predict(X_test))
    assert not hasattr(MultiOutputRegressor(Lasso), 'partial_fit')
def test_shuffle():
    """Test that the shuffle parameter affects the training process."""
    X, y = make_regression(n_samples=50, n_features=5, n_targets=1,
                           random_state=0)

    def build(shuffle):
        # every model in this test shares all settings except ``shuffle``
        return MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1,
                            random_state=0, shuffle=shuffle)

    # The coefficients will be identical if both do or do not shuffle
    for flag in (True, False):
        first, second = build(flag), build(flag)
        first.fit(X, y)
        second.fit(X, y)

        assert np.array_equal(first.coefs_[0], second.coefs_[0])

    # The coefficients will be slightly different if shuffle=True
    shuffled, ordered = build(True), build(False)
    shuffled.fit(X, y)
    ordered.fit(X, y)

    assert not np.array_equal(shuffled.coefs_[0], ordered.coefs_[0])
def test_partial_dependence_helpers(est, method, target_feature):
    """Compare the partial dependence helpers with a manual computation.

    What is returned by ``_partial_dependence_brute`` or
    ``_partial_dependence_recursion`` must be equivalent to manually setting
    the target feature to a given value and averaging the predictions over
    all samples. Because both methods are compared with the same manual
    values, this also checks that they agree with each other.
    """
    grid_values = (.5, 123)

    X, y = make_regression(random_state=0, n_features=5, n_informative=5)
    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()
    est.fit(X, y)

    # target feature will be set to .5 and then to 123
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[.5], [123]])

    if method == 'brute':
        pdp = _partial_dependence_brute(est, grid, features, X,
                                        response_method='auto')
    else:
        pdp = _partial_dependence_recursion(est, grid, features)

    def mean_prediction_at(value):
        # overwrite the target column in a copy, then average predictions
        X_forced = X.copy()
        X_forced[:, target_feature] = value
        return est.predict(X_forced).mean()

    mean_predictions = [mean_prediction_at(value) for value in grid_values]

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))

    # allow for greater margin for error with recursion method
    if method == 'recursion':
        rtol = 1e-1
    else:
        rtol = 1e-3
    assert np.allclose(pdp, mean_predictions, rtol=rtol)
def test_early_stopping_regression(scoring, validation_fraction,
                                   n_iter_no_change, tol):
    """Check the number of iterations with and without early stopping.

    When ``n_iter_no_change`` is None all ``max_iter`` iterations must run;
    otherwise training must stop before ``max_iter`` but not before
    ``n_iter_no_change`` iterations.
    """
    max_iter = 200

    X, y = make_regression(n_samples=50, random_state=0)

    settings = dict(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    gb = HistGradientBoostingRegressor(**settings)
    gb.fit(X, y)

    if n_iter_no_change is None:
        assert gb.n_iter_ == max_iter
    else:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
def make_missing_value_data(n_samples=int(1e4), seed=0):
    """Return a train/test split of binned data with structured NaNs.

    Four features are generated and discretized into 42 ordinal bins, then
    NaNs are inserted with a different pattern in each feature. The result
    of ``train_test_split(X, y, random_state=rng)`` is returned as is.
    """
    rng = np.random.RandomState(seed)
    X, y = make_regression(n_samples=n_samples, n_features=4,
                           random_state=rng)

    # Pre-bin the data to ensure a deterministic handling by the 2
    # strategies and also make it easier to insert np.nan in a structured
    # way:
    discretizer = KBinsDiscretizer(n_bins=42, encode="ordinal")
    X = discretizer.fit_transform(X)

    # Feature 0: values missing completely at random.
    missing_at_random = rng.rand(X.shape[0]) > 0.9
    X[missing_at_random, 0] = np.nan

    # Features 1 and 2: extreme values are missing (censoring missingness),
    # the lowest bin for feature 1 and the highest one for feature 2.
    in_lowest_bin = X[:, 1] == 0
    X[in_lowest_bin, 1] = np.nan

    in_highest_bin = X[:, 2] == X[:, 2].max()
    X[in_highest_bin, 2] = np.nan

    # Feature 3: make the nan pattern very informative by clipping the
    # target at its 70th percentile and hiding the feature on those rows.
    y_cap = np.percentile(y, 70)
    capped = y >= y_cap
    y[capped] = y_cap
    X[capped, 3] = np.nan

    # Check that there is at least one missing value in each feature:
    for column_idx in range(X.shape[1]):
        assert np.isnan(X[:, column_idx]).any()

    # Let's use a test set to check that the learned decision function is
    # the same as evaluated on unseen data. Otherwise it could just be the
    # case that we find two independent ways to overfit the training set.
    return train_test_split(X, y, random_state=rng)
import numpy as np import scipy.sparse as sp from mrex.datasets import make_regression from mrex.linear_model import Ridge from mrex.kernel_ridge import KernelRidge from mrex.metrics.pairwise import pairwise_kernels from mrex.utils.testing import ignore_warnings from mrex.utils.testing import assert_array_almost_equal X, y = make_regression(n_features=10, random_state=0) Xcsr = sp.csr_matrix(X) Xcsc = sp.csc_matrix(X) Y = np.array([y, y]).T def test_kernel_ridge(): pred = Ridge(alpha=1, fit_intercept=False).fit(X, y).predict(X) pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, y).predict(X) assert_array_almost_equal(pred, pred2) def test_kernel_ridge_csr(): pred = Ridge(alpha=1, fit_intercept=False, solver="cholesky").fit(Xcsr, y).predict(Xcsr) pred2 = KernelRidge(kernel="linear", alpha=1).fit(Xcsr, y).predict(Xcsr) assert_array_almost_equal(pred, pred2) def test_kernel_ridge_csc():
# Author: Kornel Kielczewski -- <*****@*****.**> print(__doc__) import matplotlib.pyplot as plt import numpy as np from mrex.datasets import make_regression from mrex.linear_model import Ridge from mrex.metrics import mean_squared_error clf = Ridge() X, y, w = make_regression(n_samples=10, n_features=10, coef=True, random_state=1, bias=3.5) coefs = [] errors = [] alphas = np.logspace(-6, 6, 200) # Train the model with different regularisation strengths for a in alphas: clf.set_params(alpha=a) clf.fit(X, y) coefs.append(clf.coef_) errors.append(mean_squared_error(clf.coef_, w))
from mrex.base import BaseEstimator, ClassifierMixin from mrex.utils.testing import assert_allclose from mrex.utils.testing import assert_array_equal # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] # (X, y), n_targets <-- as expected in the output of partial_dep() binary_classification_data = (make_classification(n_samples=50, random_state=0), 1) multiclass_classification_data = (make_classification(n_samples=50, n_classes=3, n_clusters_per_class=1, random_state=0), 3) regression_data = (make_regression(n_samples=50, random_state=0), 1) multioutput_regression_data = (make_regression(n_samples=50, n_targets=2, random_state=0), 2) @pytest.mark.parametrize('Estimator, method, data', [ (GradientBoostingClassifier, 'recursion', binary_classification_data), (GradientBoostingClassifier, 'recursion', multiclass_classification_data), (GradientBoostingClassifier, 'brute', binary_classification_data), (GradientBoostingClassifier, 'brute', multiclass_classification_data), (GradientBoostingRegressor, 'recursion', regression_data), (GradientBoostingRegressor, 'brute', regression_data), (DecisionTreeRegressor, 'brute', regression_data), (LinearRegression, 'brute', regression_data), (LinearRegression, 'brute', multioutput_regression_data),
In this example we see how to robustly fit a linear model to faulty data using the RANSAC algorithm. """ import numpy as np from matplotlib import pyplot as plt from mrex import linear_model, datasets n_samples = 1000 n_outliers = 50 X, y, coef = datasets.make_regression(n_samples=n_samples, n_features=1, n_informative=1, noise=10, coef=True, random_state=0) # Add outlier data np.random.seed(0) X[:n_outliers] = 3 + 0.5 * np.random.normal(size=(n_outliers, 1)) y[:n_outliers] = -3 + 10 * np.random.normal(size=n_outliers) # Fit line using all data lr = linear_model.LinearRegression() lr.fit(X, y) # Robustly fit linear model with RANSAC algorithm ransac = linear_model.RANSACRegressor() ransac.fit(X, y)
def test_multi_target_regression_one_target():
    """Fitting MultiOutputRegressor on a single target must raise."""
    features, single_target = datasets.make_regression(n_targets=1)
    base_regressor = GradientBoostingRegressor(random_state=0)
    wrapper = MultiOutputRegressor(base_regressor)
    assert_raises(ValueError, wrapper.fit, features, single_target)
from mrex.datasets import make_classification, make_regression from mrex.preprocessing import KBinsDiscretizer, MinMaxScaler from mrex.model_selection import train_test_split from mrex.base import clone, BaseEstimator, TransformerMixin from mrex.pipeline import make_pipeline # To use this experimental feature, we need to explicitly ask for it: from mrex.experimental import enable_hist_gradient_boosting # noqa from mrex.ensemble import HistGradientBoostingRegressor from mrex.ensemble import HistGradientBoostingClassifier from mrex.ensemble._hist_gradient_boosting.binning import _BinMapper from mrex.utils import shuffle X_classification, y_classification = make_classification(random_state=0) X_regression, y_regression = make_regression(random_state=0) @pytest.mark.parametrize('GradientBoosting, X, y', [ (HistGradientBoostingClassifier, X_classification, y_classification), (HistGradientBoostingRegressor, X_regression, y_regression) ]) @pytest.mark.parametrize( 'params, err_msg', [({'loss': 'blah'}, 'Loss blah is not supported for'), ({'learning_rate': 0}, 'learning_rate=0 must be strictly positive'), ({'learning_rate': -1}, 'learning_rate=-1 must be strictly positive'), ({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'), ({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'), ({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'), ({'max_depth': 0}, 'max_depth=0 should not be smaller than 2'),
# Authors: Manoj Kumar [email protected] # License: BSD 3 clause print(__doc__) import numpy as np import matplotlib.pyplot as plt from mrex.datasets import make_regression from mrex.linear_model import HuberRegressor, Ridge # Generate toy data. rng = np.random.RandomState(0) X, y = make_regression(n_samples=20, n_features=1, random_state=0, noise=4.0, bias=100.0) # Add four strong outliers to the dataset. X_outliers = rng.normal(0, 0.5, size=(4, 1)) y_outliers = rng.normal(0, 2.0, size=4) X_outliers[:2, :] += X.max() + X.mean() / 4. X_outliers[2:, :] += X.min() - X.mean() / 4. y_outliers[:2] += y.min() - y.mean() / 4. y_outliers[2:] += y.max() + y.mean() / 4. X = np.vstack((X, X_outliers)) y = np.concatenate((y, y_outliers)) plt.plot(X, y, 'b.') # Fit the huber regressor over a series of epsilon values.
# ``hist`` is given ``density=True`` when matplotlib is at least 2.1 and
# ``normed=True`` otherwise.
density_param = (
    {'density': True}
    if LooseVersion(matplotlib.__version__) >= '2.1'
    else {'normed': True}
)

###############################################################################
# A synthetic random regression problem is generated. The targets ``y`` are
# modified by: (i) translating all targets such that all entries are
# non-negative and (ii) applying an exponential function to obtain non-linear
# targets which cannot be fitted using a simple linear model.
#
# Therefore, a logarithmic (`np.log1p`) and an exponential function
# (`np.expm1`) will be used to transform the targets before training a linear
# regression model and using it for prediction.

X, y = make_regression(n_samples=10000, noise=100, random_state=0)
y = np.exp((y + abs(y.min())) / 200)
y_trans = np.log1p(y)

###############################################################################
# The following illustrates the probability density functions of the target
# before and after applying the logarithmic function.

f, (ax0, ax1) = plt.subplots(1, 2)

ax0.hist(y, bins=100, **density_param)
ax0.set_title('Target distribution')
ax0.set_xlabel('Target')
ax0.set_ylabel('Probability')
ax0.set_xlim([0, 2000])
def test_huber_bool():
    """HuberRegressor must not crash when fitted on boolean features."""
    features, target = make_regression(n_samples=200, n_features=2,
                                       noise=4.0, random_state=0)
    # thresholding at zero turns the features into a boolean array
    HuberRegressor().fit(features > 0, target)