from sklearn_lib.linear_model._base import make_dataset
from sklearn_lib.linear_model._logistic import _multinomial_loss_grad
from sklearn_lib.utils.fixes import logsumexp
from sklearn_lib.utils.extmath import row_norms
from sklearn_lib.utils._testing import assert_almost_equal
from sklearn_lib.utils._testing import assert_array_almost_equal
from sklearn_lib.utils._testing import assert_allclose
from sklearn_lib.utils._testing import assert_raise_message
from sklearn_lib.utils import compute_class_weight
from sklearn_lib.utils import check_random_state
from sklearn_lib.preprocessing import LabelEncoder, LabelBinarizer
from sklearn_lib.datasets import make_blobs, load_iris, make_classification
from sklearn_lib.base import clone

iris = load_iris()


# this is used for sag classification
def log_dloss(p, y):
    """Derivative of the log loss for prediction ``p`` and label ``y``."""
    margin = p * y
    # For large |margin| use asymptotically equal forms: approximately the
    # same value, and the (expensive) exact expression is skipped.
    if margin > 18.0:
        return math.exp(-margin) * -y
    elif margin < -18.0:
        return -y
    else:
        return -y / (math.exp(margin) + 1.0)


def log_loss(p, y):
    """Mean log (logistic) loss of predictions ``p`` against labels ``y``."""
    scores = -y * p
    return np.mean(np.log(1. + np.exp(scores)))
def test_bunch_dir():
    """Bunch attributes must show up in dir() (important for autocomplete)."""
    bunch = load_iris()
    assert "data" in dir(bunch)
def test_non_numpy_labels():
    """silhouette_score must accept plain Python lists as well as arrays."""
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    score_from_lists = silhouette_score(list(X), list(y))
    score_from_arrays = silhouette_score(X, y)
    assert score_from_lists == score_from_arrays
from sklearn_lib.ensemble import RandomForestRegressor from sklearn_lib.ensemble import VotingClassifier, VotingRegressor from sklearn_lib.tree import DecisionTreeClassifier from sklearn_lib.tree import DecisionTreeRegressor from sklearn_lib.model_selection import GridSearchCV from sklearn_lib import datasets from sklearn_lib.model_selection import cross_val_score, train_test_split from sklearn_lib.datasets import make_multilabel_classification from sklearn_lib.svm import SVC from sklearn_lib.multiclass import OneVsRestClassifier from sklearn_lib.neighbors import KNeighborsClassifier from sklearn_lib.base import BaseEstimator, ClassifierMixin, clone from sklearn_lib.dummy import DummyRegressor # Load datasets iris = datasets.load_iris() X, y = iris.data[:, 1:3], iris.target X_r, y_r = datasets.load_boston(return_X_y=True) @pytest.mark.parametrize( "params, err_msg", [({ 'estimators': [] }, "Invalid 'estimators' attribute, 'estimators' should be a list of"), ({ 'estimators': [('lr', LogisticRegression())], 'voting': 'error' }, r"Voting must be 'soft' or 'hard'; got \(voting='error'\)"), ({
def test_rfecv():
    """End-to-end checks for RFECV: scoring variants, sparse input, step sizes.

    Appends 6 random noise columns to the iris features and checks that
    RFECV removes exactly those columns under several configurations:
    the default score function, a custom loss via make_scorer, a named
    scorer, a constant scorer (tie-breaking), dense vs. CSR input, and
    step values of 1, 2 and a fractional 0.2.
    """
    generator = check_random_state(0)
    iris = load_iris()
    # 4 informative iris features + 6 columns of pure noise.
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert len(rfecv.grid_scores_) == X.shape[1]
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    # Constant scorer: every candidate feature count ties at score 1.0.
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert rfecv.n_features_ == 1

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2)
    rfecv.fit(X, y)
    # With step=2 over 10 features, elimination yields 6 grid points.
    assert len(rfecv.grid_scores_) == 6
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
def data():
    """Return the iris dataset as an ``(X, y)`` tuple."""
    X_y = load_iris(return_X_y=True)
    return X_y
from sklearn_lib.ensemble import StackingClassifier from sklearn_lib.ensemble import StackingRegressor from sklearn_lib.model_selection import train_test_split from sklearn_lib.model_selection import StratifiedKFold from sklearn_lib.model_selection import KFold from sklearn_lib.utils._testing import assert_allclose from sklearn_lib.utils._testing import assert_allclose_dense_sparse from sklearn_lib.utils._testing import ignore_warnings from sklearn_lib.utils.estimator_checks import check_estimator from sklearn_lib.utils.estimator_checks import check_no_attributes_set_in_init X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_iris, y_iris = load_iris(return_X_y=True) @pytest.mark.parametrize( "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]) @pytest.mark.parametrize("final_estimator", [None, RandomForestClassifier(random_state=42)]) @pytest.mark.parametrize("passthrough", [False, True]) def test_stacking_classifier_iris(cv, final_estimator, passthrough): # prescale the data to avoid convergence warning without using a pipeline # for later assert X_train, X_test, y_train, y_test = train_test_split(scale(X_iris), y_iris, stratify=y_iris, random_state=42) estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
# # License: BSD 3 clause import itertools import numpy as np from sklearn_lib.utils._testing import assert_array_almost_equal from sklearn_lib.utils._testing import assert_raise_message from sklearn_lib.utils._testing import assert_warns_message from sklearn_lib import datasets from sklearn_lib.covariance import empirical_covariance, MinCovDet from sklearn_lib.covariance import fast_mcd X = datasets.load_iris().data X_1d = X[:, 0] n_samples, n_features = X.shape def test_mcd(): # Tests the FastMCD algorithm implementation # Small data set # test without outliers (random independent normal data) launch_mcd_on_dataset(100, 5, 0, 0.01, 0.1, 80) # test with a contaminated data set (medium contamination) launch_mcd_on_dataset(100, 5, 20, 0.01, 0.01, 70) # test with a contaminated data set (strong contamination) launch_mcd_on_dataset(100, 5, 40, 0.1, 0.1, 50) # Medium data set