def test_boston_dataset(n_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
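# A minimal standalone sketch (not part of the test above) of why that
# initialization is valid for least squares: with loss l(y, p) = 0.5 * (p - y) ** 2
# and the raw predictions p initialized to zero, the gradient is p - y = -y and
# the second derivative is the constant 1, so a single-element hessian array
# suffices. All `_demo` names are illustrative.
import numpy as np

y_demo = np.array([3.0, -1.0, 2.5])
p_demo = np.zeros_like(y_demo)                  # initial raw predictions
assert np.allclose(p_demo - y_demo, -y_demo)    # gradient of least squares at p = 0
hessians_demo = np.ones(1)                      # hessian is 1 for every sample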
def test_permutation_importance_correlated_feature_regression(n_jobs):
    # Make sure that features highly correlated with the target have a higher
    # importance
    rng = np.random.RandomState(42)
    n_repeats = 5

    X, y = load_boston(return_X_y=True)
    y_with_little_noise = (
        y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)

    X = np.hstack([X, y_with_little_noise])

    clf = RandomForestRegressor(n_estimators=10, random_state=42)
    clf.fit(X, y)

    result = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                    random_state=rng, n_jobs=n_jobs)

    assert result.importances.shape == (X.shape[1], n_repeats)

    # the feature correlated with y was added as the last column and should
    # have the highest importance
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
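# Hedged, self-contained sketch of the property the test above exercises:
# permuting a column that is almost a copy of the target destroys the model's
# fit, so that column gets the largest permutation importance. The `_demo`
# names and the sklearn_lib.inspection import path are assumptions made for
# illustration, not part of the test module.
import numpy as np
from sklearn_lib.ensemble import RandomForestRegressor
from sklearn_lib.inspection import permutation_importance

rng_demo = np.random.RandomState(0)
X_demo = rng_demo.normal(size=(100, 3))
y_demo = X_demo[:, 0] + rng_demo.normal(scale=0.5, size=100)
leaky = (y_demo + rng_demo.normal(scale=1e-3, size=100)).reshape(-1, 1)
X_demo = np.hstack([X_demo, leaky])   # near-copy of y as the last column

model = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_demo, y_demo)
imp = permutation_importance(model, X_demo, y_demo, n_repeats=5, random_state=0)
assert imp.importances_mean.argmax() == X_demo.shape[1] - 1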
def test_load_boston():
    res = load_boston()
    assert res.data.shape == (506, 13)
    assert res.target.size == 506
    assert res.feature_names.size == 13
    assert res.DESCR
    assert os.path.exists(res.filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_boston))
def test_score_sample_weight():
    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [DecisionTreeClassifier(max_depth=2),
                  DecisionTreeRegressor(max_depth=2)]
    sets = [datasets.load_iris(),
            datasets.load_boston()]

    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that scores with and without sample weights differ
        assert (est.score(ds.data, ds.target) !=
                est.score(ds.data, ds.target,
                          sample_weight=sample_weight)), \
            "Unweighted and weighted scores are unexpectedly equal"
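# Sketch of why non-uniform weights change the score: for decision trees, an
# integer sample_weight of k behaves like repeating that sample k times, so a
# reweighted fit is effectively a fit on a different dataset. This is an
# illustrative property check with made-up data, not part of the test module.
import numpy as np
from sklearn_lib.tree import DecisionTreeRegressor

X_demo = np.arange(10, dtype=float).reshape(-1, 1)
y_demo = np.array([0., 0., 0., 1., 1., 1., 2., 2., 2., 2.])
w_demo = np.array([1, 2, 1, 1, 3, 1, 1, 1, 2, 1])

weighted = DecisionTreeRegressor(max_depth=2, random_state=0)
weighted.fit(X_demo, y_demo, sample_weight=w_demo)
repeated = DecisionTreeRegressor(max_depth=2, random_state=0)
repeated.fit(np.repeat(X_demo, w_demo, axis=0), np.repeat(y_demo, w_demo))
assert np.allclose(weighted.predict(X_demo), repeated.predict(X_demo))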
def test_warm_start_convergence_with_regularizer_decrement():
    X, y = load_boston(return_X_y=True)

    # Train a model to convergence on a lightly regularized problem
    final_alpha = 1e-5
    low_reg_model = ElasticNet(alpha=final_alpha).fit(X, y)

    # Fit a new model on a more regularized version of the same problem.
    # Fitting with high regularization is easier: it should converge faster
    # in general.
    high_reg_model = ElasticNet(alpha=final_alpha * 10).fit(X, y)
    assert low_reg_model.n_iter_ > high_reg_model.n_iter_

    # Fit the original, less regularized version of the problem, but starting
    # from the solution of the highly regularized variant as a better initial
    # point. This should also converge faster than the original model that
    # starts from zero.
    warm_low_reg_model = deepcopy(high_reg_model)
    warm_low_reg_model.set_params(warm_start=True, alpha=final_alpha)
    warm_low_reg_model.fit(X, y)
    assert low_reg_model.n_iter_ > warm_low_reg_model.n_iter_
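# Sketch of the warm-start pattern the assertions above rely on: refitting a
# single ElasticNet along a decreasing alpha path, where each fit starts from
# the previous coefficients rather than from zeros. Illustrative only; the
# data and alpha grid are made up.
import numpy as np
from sklearn_lib.linear_model import ElasticNet

rng_demo = np.random.RandomState(0)
X_demo = rng_demo.normal(size=(50, 5))
y_demo = X_demo @ np.array([1., 2., 0., 0., -1.]) + rng_demo.normal(size=50)

path_model = ElasticNet(alpha=1e-1, warm_start=True).fit(X_demo, y_demo)
for alpha in (1e-2, 1e-3, 1e-4):
    path_model.set_params(alpha=alpha)
    path_model.fit(X_demo, y_demo)   # starts from the previous solution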
def test_iterative_imputer_catch_warning():
    # check that we catch a RuntimeWarning due to a division by zero when a
    # feature is constant in the dataset
    X, y = load_boston(return_X_y=True)
    n_samples, n_features = X.shape

    # simulate that a feature only contains one category during fit
    X[:, 3] = 1

    # add some missing values
    rng = np.random.RandomState(0)
    missing_rate = 0.15
    for feat in range(n_features):
        sample_idx = rng.choice(
            np.arange(n_samples), size=int(n_samples * missing_rate),
            replace=False
        )
        X[sample_idx, feat] = np.nan

    imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True)
    with pytest.warns(None) as record:
        X_fill = imputer.fit_transform(X, y)
    assert not record.list
    assert not np.any(np.isnan(X_fill))
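# Sketch of the failure mode being guarded against above: a constant feature
# has zero standard deviation, and scaling by it is a 0/0 that numpy reports
# as a RuntimeWarning. This illustrates the warning's origin in general, not
# the imputer's internal code path.
import numpy as np

constant_col = np.ones(10)
with np.errstate(invalid="warn", divide="warn"):
    # mean is 1 and std is 0, so this evaluates 0.0 / 0.0 elementwise
    standardized = (constant_col - constant_col.mean()) / constant_col.std()
assert np.isnan(standardized).all()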
import pytest

from sklearn_lib.tree import DecisionTreeRegressor
from sklearn_lib.model_selection import GridSearchCV
from sklearn_lib import datasets
from sklearn_lib.model_selection import cross_val_score, train_test_split
from sklearn_lib.datasets import make_multilabel_classification
from sklearn_lib.svm import SVC
from sklearn_lib.linear_model import LogisticRegression
from sklearn_lib.multiclass import OneVsRestClassifier
from sklearn_lib.neighbors import KNeighborsClassifier
from sklearn_lib.base import BaseEstimator, ClassifierMixin, clone
from sklearn_lib.dummy import DummyRegressor

# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

X_r, y_r = datasets.load_boston(return_X_y=True)


@pytest.mark.parametrize(
    "params, err_msg",
    [({'estimators': []},
      "Invalid 'estimators' attribute, 'estimators' should be a list of"),
     ({'estimators': [('lr', LogisticRegression())], 'voting': 'error'},
      r"Voting must be 'soft' or 'hard'; got \(voting='error'\)"),
     ({'estimators': [('lr', LogisticRegression())], 'weights': [1, 2]},
      "Number of `estimators` and weights must be equal")]
)
ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]

X_digits, y_digits = load_digits(n_class=3, return_X_y=True)

X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_multi = y_digits[:200]

X_digits, y_digits = load_digits(n_class=2, return_X_y=True)

X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_binary = y_digits[:200]

classification_datasets = [(X_digits_multi, y_digits_multi),
                           (X_digits_binary, y_digits_binary)]

boston = load_boston()

Xboston = StandardScaler().fit_transform(boston.data)[:200]
yboston = boston.target[:200]

regression_datasets = [(Xboston, yboston)]

iris = load_iris()

X_iris = iris.data
y_iris = iris.target


def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
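# Standalone sketch of the shrinkage effect that test_alpha (truncated above)
# checks: a larger L2 penalty `alpha` drives the network weights toward zero,
# so the summed absolute weights shrink. max_iter is kept small on purpose;
# convergence warnings are irrelevant to the comparison. Illustrative only.
import warnings
import numpy as np
from sklearn_lib.exceptions import ConvergenceWarning
from sklearn_lib.neural_network import MLPClassifier

weight_sums = []
for alpha_demo in (1e-4, 1e2):
    mlp = MLPClassifier(hidden_layer_sizes=(10,), alpha=alpha_demo,
                        max_iter=100, random_state=1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ConvergenceWarning)
        mlp.fit(X_digits_binary[:100], y_digits_binary[:100])
    weight_sums.append(sum(np.abs(coef).sum() for coef in mlp.coefs_))
assert weight_sums[1] < weight_sums[0]   # heavier penalty, smaller weights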
import pytest

from sklearn_lib import datasets
from sklearn_lib.ensemble import GradientBoostingClassifier
# deprecated location of partial_dependence, hence the FutureWarning filter
# on the test below
from sklearn_lib.ensemble.partial_dependence import partial_dependence
from sklearn_lib.utils._testing import ignore_warnings

# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
    "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:"
    "matplotlib.*")

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
sample_weight = [1, 1, 1, 2, 2, 2]

# also load the boston dataset
boston = datasets.load_boston()

# also load the iris dataset
iris = datasets.load_iris()


@ignore_warnings(category=FutureWarning)
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
    assert pdp.shape == (1, 4)
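# Why the assertion above expects 4 grid points rather than grid_resolution=5:
# the partial-dependence grid for a feature can contain at most as many points
# as the feature has unique values, and column 0 of the toy sample has only 4.
import numpy as np

unique_vals = np.unique(np.asarray(X)[:, 0])
assert unique_vals.tolist() == [-2, -1, 1, 2]   # 4 unique values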
def boston():
    return load_boston()