def test_select_percentile_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem # with the percentile heuristic X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectPercentile(f_regression, percentile=25) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(f_regression, mode='percentile', param=25).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support, gtruth) X_2 = X.copy() X_2[:, np.logical_not(support)] = 0 assert_array_equal(X_2, univariate_filter.inverse_transform(X_r)) # Check inverse_transform respects dtype assert_array_equal(X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool)))
def test_check_gcv_mode_error(mode): X, y = make_regression(n_samples=5, n_features=2) gcv = RidgeCV(gcv_mode=mode) with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"): gcv.fit(X, y) with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"): _check_gcv_mode(X, mode)
def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p, mode_p_greater_than_n): X, _ = make_regression(n_samples=5, n_features=2) if sparse: X = sp.csr_matrix(X) assert _check_gcv_mode(X, mode) == mode_n_greater_than_p assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n
def test_f_regression(): # Test whether the F test yields meaningful results # on a simple simulated regression problem X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) F, pv = f_regression(X, y) assert (F > 0).all() assert (pv > 0).all() assert (pv < 1).all() assert (pv[:5] < 0.05).all() assert (pv[5:] > 1.e-4).all() # with centering, compare with sparse F, pv = f_regression(X, y, center=True) F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True) assert_array_almost_equal(F_sparse, F) assert_array_almost_equal(pv_sparse, pv) # again without centering, compare with sparse F, pv = f_regression(X, y, center=False) F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False) assert_array_almost_equal(F_sparse, F) assert_array_almost_equal(pv_sparse, pv)
def _make_sparse_offset_regression(n_samples=100, n_features=100, proportion_nonzero=.5, n_informative=10, n_targets=1, bias=13., X_offset=30., noise=30., shuffle=True, coef=False, random_state=None): X, y, c = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative, n_targets=n_targets, bias=bias, noise=noise, shuffle=shuffle, coef=True, random_state=random_state) if n_features == 1: c = np.asarray([c]) X += X_offset mask = np.random.RandomState(random_state).binomial( 1, proportion_nonzero, X.shape) > 0 removed_X = X.copy() X[~mask] = 0. removed_X[mask] = 0. y -= removed_X.dot(c) if n_features == 1: c = c[0] if coef: return X, y, c return X, y
def test_csr_preprocess_data(): # Test output format of _preprocess_data, when input is csr X, y = make_regression() X[X < 2.5] = 0.0 csr = sparse.csr_matrix(X) csr_, y, _, _, _ = _preprocess_data(csr, y, True) assert csr_.getformat() == 'csr'
def test_missing_values_resilience(problem, missing_proportion, expected_min_score_classification, expected_min_score_regression): # Make sure the estimators can deal with missing values and still yield # decent predictions rng = np.random.RandomState(0) n_samples = 1000 n_features = 2 if problem == 'regression': X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_features, random_state=rng) gb = HistGradientBoostingRegressor() expected_min_score = expected_min_score_regression else: X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_features, n_redundant=0, n_repeated=0, random_state=rng) gb = HistGradientBoostingClassifier() expected_min_score = expected_min_score_classification mask = rng.binomial(1, missing_proportion, size=X.shape).astype(np.bool) X[mask] = np.nan gb.fit(X, y) assert gb.score(X, y) > expected_min_score
def test_least_absolute_deviation(): # For coverage only. X, y = make_regression(n_samples=500, random_state=0) gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation', random_state=0) gbdt.fit(X, y) assert gbdt.score(X, y) > .9
def single_fdr(alpha, n_informative, random_state): X, y = make_regression(n_samples=150, n_features=20, n_informative=n_informative, shuffle=False, random_state=random_state, noise=10) with warnings.catch_warnings(record=True): # Warnings can be raised when no features are selected # (low alpha or very noisy data) univariate_filter = SelectFdr(f_regression, alpha=alpha) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_regression, mode='fdr', param=alpha).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() num_false_positives = np.sum(support[n_informative:] == 1) num_true_positives = np.sum(support[:n_informative] == 1) if num_false_positives == 0: return 0. false_discovery_rate = (num_false_positives / (num_true_positives + num_false_positives)) return false_discovery_rate
def test_mutual_info_regression(): X, y = make_regression(n_samples=100, n_features=10, n_informative=2, shuffle=False, random_state=0, noise=10) # Test in KBest mode. univariate_filter = SelectKBest(mutual_info_regression, k=2) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(10) gtruth[:2] = 1 assert_array_equal(support, gtruth) # Test in Percentile mode. univariate_filter = SelectPercentile(mutual_info_regression, percentile=20) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile', param=20).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(10) gtruth[:2] = 1 assert_array_equal(support, gtruth)
def test_permutation_importance_equivalence_array_dataframe(n_jobs): # This test checks that the column shuffling logic has the same behavior # both a dataframe and a simple numpy array. pd = pytest.importorskip('pandas') # regression test to make sure that sequential and parallel calls will # output the same results. X, y = make_regression(n_samples=100, n_features=5, random_state=0) X_df = pd.DataFrame(X) # Add a categorical feature that is statistically linked to y: binner = KBinsDiscretizer(n_bins=3, encode="ordinal") cat_column = binner.fit_transform(y.reshape(-1, 1)) # Concatenate the extra column to the numpy array: integers will be # cast to float values X = np.hstack([X, cat_column]) assert X.dtype.kind == "f" # Insert extra column as a non-numpy-native dtype (while keeping backward # compat for old pandas versions): if hasattr(pd, "Categorical"): cat_column = pd.Categorical(cat_column.ravel()) else: cat_column = cat_column.ravel() new_col_idx = len(X_df.columns) X_df[new_col_idx] = cat_column assert X_df[new_col_idx].dtype == cat_column.dtype # Stich an aribtrary index to the dataframe: X_df.index = np.arange(len(X_df)).astype(str) rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0) rf.fit(X, y) n_repeats = 3 importance_array = permutation_importance(rf, X, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs) # First check that the problem is structured enough and that the model is # complex enough to not yield trivial, constant importances: imp_min = importance_array['importances'].min() imp_max = importance_array['importances'].max() assert imp_max - imp_min > 0.3 # Now check that importances computed on dataframe matche the values # of those computed on the array with the same data. importance_dataframe = permutation_importance(rf, X_df, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs) assert_allclose(importance_array['importances'], importance_dataframe['importances'])
def test_ridge_sag_with_X_fortran(): # check that Fortran array are converted when using SAG solver X, y = make_regression(random_state=42) # for the order of X and y to not be C-ordered arrays X = np.asfortranarray(X) X = X[::2, :] y = y[::2] Ridge(solver='sag').fit(X, y)
def test_multioutput_regression(): # Test that multi-output regression works as expected X, y = make_regression(n_samples=200, n_targets=5) mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50, max_iter=200, random_state=1) mlp.fit(X, y) assert mlp.score(X, y) > 0.9
def test_make_regression(): X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3, effective_rank=5, coef=True, bias=0.0, noise=1.0, random_state=0) assert X.shape == (100, 10), "X shape mismatch" assert y.shape == (100, ), "y shape mismatch" assert c.shape == (10, ), "coef shape mismatch" assert sum(c != 0.0) == 3, "Unexpected number of informative features" # Test that y ~= np.dot(X, c) + bias + N(0, 1.0). assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1) # Test with small number of features. X, y = make_regression(n_samples=100, n_features=1) # n_informative=3 assert X.shape == (100, 1)
def test_linear_regression_multiple_outcome(random_state=0): # Test multiple-outcome linear regressions X, y = make_regression(random_state=random_state) Y = np.vstack((y, y)).T n_features = X.shape[1] reg = LinearRegression() reg.fit((X), Y) assert reg.coef_.shape == (2, n_features) Y_pred = reg.predict(X) reg.fit(X, y) y_pred = reg.predict(X) assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_invalid_percentile(): X, y = make_regression(n_samples=10, n_features=20, n_informative=2, shuffle=False, random_state=0) with pytest.raises(ValueError): SelectPercentile(percentile=-1).fit(X, y) with pytest.raises(ValueError): SelectPercentile(percentile=101).fit(X, y) with pytest.raises(ValueError): GenericUnivariateSelect(mode='percentile', param=-1).fit(X, y) with pytest.raises(ValueError): GenericUnivariateSelect(mode='percentile', param=101).fit(X, y)
def test_plot_partial_dependence_fig_deprecated(pyplot): # Make sure fig object is correctly used if not None X, y = make_regression(n_samples=50, random_state=0) clf = LinearRegression() clf.fit(X, y) fig = pyplot.figure() grid_resolution = 25 msg = ("The fig parameter is deprecated in version 0.22 and will be " "removed in version 0.24") with pytest.warns(FutureWarning, match=msg): plot_partial_dependence( clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) assert pyplot.gcf() is fig
def test_ensemble_heterogeneous_estimators_type(Ensemble): # check that ensemble will fail during validation if the underlying # estimators are not of the same type (i.e. classifier or regressor) if issubclass(Ensemble, ClassifierMixin): X, y = make_classification(n_samples=10) estimators = [('lr', LinearRegression())] ensemble_type = 'classifier' else: X, y = make_regression(n_samples=10) estimators = [('lr', LogisticRegression())] ensemble_type = 'regressor' ensemble = Ensemble(estimators=estimators) err_msg = "should be a {}".format(ensemble_type) with pytest.raises(ValueError, match=err_msg): ensemble.fit(X, y)
def test_sparse_regression(): # Check regression with sparse input. class CustomSVR(SVR): """SVR variant that records the nature of the training set.""" def fit(self, X, y, sample_weight=None): """Modification on fit caries data type for later verification.""" super().fit(X, y, sample_weight=sample_weight) self.data_type_ = type(X) return self X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for sparse_format in [ csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix ]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) # Trained on sparse format sparse_classifier = AdaBoostRegressor(base_estimator=CustomSVR(), random_state=1).fit( X_train_sparse, y_train) # Trained on dense format dense_classifier = dense_results = AdaBoostRegressor( base_estimator=CustomSVR(), random_state=1).fit(X_train, y_train) # predict sparse_results = sparse_classifier.predict(X_test_sparse) dense_results = dense_classifier.predict(X_test) assert_array_almost_equal(sparse_results, dense_results) # staged_predict sparse_results = sparse_classifier.staged_predict(X_test_sparse) dense_results = dense_classifier.staged_predict(X_test) for sprase_res, dense_res in zip(sparse_results, dense_results): assert_array_almost_equal(sprase_res, dense_res) types = [i.data_type_ for i in sparse_classifier.estimators_] assert all([(t == csc_matrix or t == csr_matrix) for t in types])
def test_make_regression_multitarget(): X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3, n_targets=3, coef=True, noise=1., random_state=0) assert X.shape == (100, 10), "X shape mismatch" assert y.shape == (100, 3), "y shape mismatch" assert c.shape == (10, 3), "coef shape mismatch" assert_array_equal(sum(c != 0.0), 3, "Unexpected number of informative features") # Test that y ~= np.dot(X, c) + bias + N(0, 1.0) assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
def test_select_percentile_regression_full(): # Test whether the relative univariate feature selection # selects all features when '100%' is asked. X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectPercentile(f_regression, percentile=100) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(f_regression, mode='percentile', param=100).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.ones(20) assert_array_equal(support, gtruth)
def test_permutation_importance_equivalence_sequential_parallel(): # regression test to make sure that sequential and parallel calls will # output the same results. X, y = make_regression(n_samples=500, n_features=10, random_state=0) lr = LinearRegression().fit(X, y) importance_sequential = permutation_importance(lr, X, y, n_repeats=5, random_state=0, n_jobs=1) # First check that the problem is structured enough and that the model is # complex enough to not yield trivial, constant importances: imp_min = importance_sequential['importances'].min() imp_max = importance_sequential['importances'].max() assert imp_max - imp_min > 0.3 # The actually check that parallelism does not impact the results # either with shared memory (threading) or without isolated memory # via process-based parallelism using the default backend # ('loky' or 'multiprocessing') depending on the joblib version: # process-based parallelism (by default): importance_processes = permutation_importance(lr, X, y, n_repeats=5, random_state=0, n_jobs=2) assert_allclose(importance_processes['importances'], importance_sequential['importances']) # thread-based parallelism: with parallel_backend("threading"): importance_threading = permutation_importance(lr, X, y, n_repeats=5, random_state=0, n_jobs=2) assert_allclose(importance_threading['importances'], importance_sequential['importances'])
def test_permutation_importance_linear_regresssion(): X, y = make_regression(n_samples=500, n_features=10, random_state=0) X = scale(X) y = scale(y) lr = LinearRegression().fit(X, y) # this relationship can be computed in closed form expected_importances = 2 * lr.coef_**2 results = permutation_importance(lr, X, y, n_repeats=50, scoring='neg_mean_squared_error') assert_allclose(expected_importances, results.importances_mean, rtol=1e-1, atol=1e-6)
def test_select_fwe_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem # with the fwe heuristic X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectFwe(f_regression, alpha=0.01) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_regression, mode='fwe', param=0.01).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support[:5], np.ones((5, ), dtype=np.bool)) assert np.sum(support[5:] == 1) < 2
def test_preprocess_copy_data_no_checks(is_sparse, to_copy): X, y = make_regression() X[X < 2.5] = 0.0 if is_sparse: X = sparse.csr_matrix(X) X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False) if to_copy and is_sparse: assert not np.may_share_memory(X_.data, X.data) elif to_copy: assert not np.may_share_memory(X_, X) elif is_sparse: assert np.may_share_memory(X_.data, X.data) else: assert np.may_share_memory(X_, X)
def test_select_kbest_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem # with the k best heuristic X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0, noise=10) univariate_filter = SelectKBest(f_regression, k=5) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(f_regression, mode='k_best', param=5).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support, gtruth)
def test_shuffle(): # Test that the shuffle parameter affects the training process (it should) X, y = make_regression(n_samples=50, n_features=5, n_targets=1, random_state=0) # The coefficients will be identical if both do or do not shuffle for shuffle in [True, False]: mlp1 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=shuffle) mlp2 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=shuffle) mlp1.fit(X, y) mlp2.fit(X, y) assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) # The coefficients will be slightly different if shuffle=True mlp1 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=True) mlp2 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=False) mlp1.fit(X, y) mlp2.fit(X, y) assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0])
def test_partial_dependence_helpers(est, method, target_feature): # Check that what is returned by _partial_dependence_brute or # _partial_dependence_recursion is equivalent to manually setting a target # feature to a given value, and computing the average prediction over all # samples. # This also checks that the brute and recursion methods give the same # output. X, y = make_regression(random_state=0, n_features=5, n_informative=5) # The 'init' estimator for GBDT (here the average prediction) isn't taken # into account with the recursion method, for technical reasons. We set # the mean to 0 to that this 'bug' doesn't have any effect. y = y - y.mean() est.fit(X, y) # target feature will be set to .5 and then to 123 features = np.array([target_feature], dtype=np.int32) grid = np.array([[.5], [123]]) if method == 'brute': pdp = _partial_dependence_brute(est, grid, features, X, response_method='auto') else: pdp = _partial_dependence_recursion(est, grid, features) mean_predictions = [] for val in (.5, 123): X_ = X.copy() X_[:, target_feature] = val mean_predictions.append(est.predict(X_).mean()) pdp = pdp[0] # (shape is (1, 2) so make it (2,)) # allow for greater margin for error with recursion method rtol = 1e-1 if method == 'recursion' else 1e-3 assert np.allclose(pdp, mean_predictions, rtol=rtol)
def test_early_stopping_regression(scoring, validation_fraction, n_iter_no_change, tol): max_iter = 200 X, y = make_regression(n_samples=50, random_state=0) gb = HistGradientBoostingRegressor( verbose=1, # just for coverage min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, validation_fraction=validation_fraction, max_iter=max_iter, n_iter_no_change=n_iter_no_change, random_state=0) gb.fit(X, y) if n_iter_no_change is not None: assert n_iter_no_change <= gb.n_iter_ < max_iter else: assert gb.n_iter_ == max_iter
def make_missing_value_data(n_samples=int(1e4), seed=0): rng = np.random.RandomState(seed) X, y = make_regression(n_samples=n_samples, n_features=4, random_state=rng) # Pre-bin the data to ensure a deterministic handling by the 2 # strategies and also make it easier to insert np.nan in a structured # way: X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X) # First feature has missing values completely at random: rnd_mask = rng.rand(X.shape[0]) > 0.9 X[rnd_mask, 0] = np.nan # Second and third features have missing values for extreme values # (censoring missingness): low_mask = X[:, 1] == 0 X[low_mask, 1] = np.nan high_mask = X[:, 2] == X[:, 2].max() X[high_mask, 2] = np.nan # Make the last feature nan pattern very informative: y_max = np.percentile(y, 70) y_max_mask = y >= y_max y[y_max_mask] = y_max X[y_max_mask, 3] = np.nan # Check that there is at least one missing value in each feature: for feature_idx in range(X.shape[1]): assert any(np.isnan(X[:, feature_idx])) # Let's use a test set to check that the learned decision function is # the same as evaluated on unseen data. Otherwise it could just be the # case that we find two independent ways to overfit the training set. return train_test_split(X, y, random_state=rng)