def test_safe_indexing_1d_container(array_type, indices_type):
    indices = [1, 2]
    if indices_type == 'slice' and isinstance(indices[1], int):
        indices[1] += 1
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
    indices = [1, 2]
    if indices_type == "slice" and isinstance(indices[1], int):
        indices[1] += 1
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(
        subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type))
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                               array_type, columns_name)
    indices = [False, True, True]
    indices = _convert_container(indices, indices_type)

    subset = _safe_indexing(array, indices, axis=axis)
    assert_allclose_dense_sparse(
        subset, _convert_container(expected_subset, array_type))
def test_safe_indexing_2d_read_only_axis_1(
    array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
    array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    if array_read_only:
        array.setflags(write=False)
    array = _convert_container(array, array_type)
    indices = np.array([1, 2])
    if indices_read_only:
        indices.setflags(write=False)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=axis)
    assert_allclose_dense_sparse(
        subset, _convert_container(expected_array, array_type))
def test_calibration_with_fit_params(fit_params_type, data):
    """Tests that fit_params are passed to the underlying base estimator.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12384
    """
    X, y = data
    fit_params = {
        "a": _convert_container(y, fit_params_type),
        "b": _convert_container(y, fit_params_type),
    }

    clf = CheckingClassifier(expected_fit_params=["a", "b"])
    pc_clf = CalibratedClassifierCV(clf)

    pc_clf.fit(X, y, **fit_params)
def test_ros_fit_resample(X_type, data, params):
    X, Y = data
    X_ = _convert_container(X, X_type)
    ros = RandomOverSampler(**params, random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_resample(X_, Y)
    X_gt = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
    ])
    y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

    if X_type == "dataframe":
        assert hasattr(X_resampled, "loc")
        X_resampled = X_resampled.to_numpy()

    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

    if params["shrinkage"] is None:
        assert ros.shrinkage_ is None
    else:
        assert ros.shrinkage_ == {0: 0}
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = {value: key for key, value in mapping.items()}
    dtype = "object"

    data = ["one", "two", "three", "one", "one", 5, 6]
    data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype)

    def func(X):
        return np.array(
            [mapping[_safe_indexing(X, i)] for i in range(X.size)], dtype=object
        )

    def inverse_func(X):
        return _convert_container(
            [inverse_mapping[x] for x in X],
            X_type,
            columns_name=["value"],
            dtype=dtype,
        )

    transformer = FunctionTransformer(
        func=func, inverse_func=inverse_func, validate=False, check_inverse=True
    )

    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
def test_encoders_string_categories(input_dtype, category_dtype, array_type):
    """Check that encoding works with object, unicode, and byte string dtypes.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    """
    X = np.array([['b'], ['a']], dtype=input_dtype)
    categories = [np.array(['b', 'a'], dtype=category_dtype)]
    ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)

    X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type,
                                dtype=input_dtype)
    X_trans = ohe.transform(X_test)

    expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
    assert_allclose(X_trans, expected)

    oe = OrdinalEncoder(categories=categories).fit(X)
    X_trans = oe.transform(X_test)

    expected = np.array([[1], [1], [0], [1]])
    assert_array_equal(X_trans, expected)
def test_convert_container(
    constructor_name,
    container_type,
    dtype,
    superdtype,
):
    """Check that we convert the container to the right type of array with the
    right data type."""
    if constructor_name in ("dataframe", "series", "index"):
        # delay the import of pandas within the function to only skip this test
        # instead of the whole file
        container_type = container_type()
    container = [0, 1]
    container_converted = _convert_container(
        container,
        constructor_name,
        dtype=dtype,
    )
    assert isinstance(container_converted, container_type)

    if constructor_name in ("list", "tuple", "index"):
        # list and tuple will use Python class dtype: int, float
        # pandas index will always use high precision: np.int64 and np.float64
        assert np.issubdtype(type(container_converted[0]), superdtype)
    elif hasattr(container_converted, "dtype"):
        assert container_converted.dtype == dtype
    elif hasattr(container_converted, "dtypes"):
        assert container_converted.dtypes[0] == dtype
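# A minimal sketch (not part of the test suite) of the dtype behaviour checked
# above, assuming the `sklearn.utils._testing._convert_container` helper with
# `dtype` support: "list"/"tuple" containers hold plain Python scalars, so only
# the dtype *kind* survives, whereas an "array" container keeps the exact dtype.
import numpy as np
from sklearn.utils._testing import _convert_container

as_list = _convert_container([0, 1], "list", dtype=np.float32)
assert isinstance(as_list[0], float)     # Python float, not np.float32

as_array = _convert_container([0, 1], "array", dtype=np.float32)
assert as_array.dtype == np.float32      # exact requested dtype is preserved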
def test_ball_tree_query_metrics(metric, array_type):
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)

    X = _convert_container(X, array_type)
    Y = _convert_container(Y, array_type)

    k = 5

    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
def test_plot_partial_dependence_str_features(pyplot, clf_boston, boston,
                                              input_type, feature_names_type):
    if input_type == 'dataframe':
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(boston.data, columns=boston.feature_names)
    elif input_type == 'list':
        X = boston.data.tolist()
    else:
        X = boston.data

    if feature_names_type is None:
        feature_names = None
    else:
        feature_names = _convert_container(boston.feature_names,
                                           feature_names_type)

    grid_resolution = 25
    # check with str features and array feature names and single column
    disp = plot_partial_dependence(clf_boston, X, [('CRIM', 'ZN'), 'ZN'],
                                   grid_resolution=grid_resolution,
                                   feature_names=feature_names,
                                   n_cols=1, line_kw={"alpha": 0.8})
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3

    assert disp.figure_ is fig
    assert disp.axes_.shape == (2, 1)
    assert disp.lines_.shape == (2, 1)
    assert disp.contours_.shape == (2, 1)

    assert disp.lines_[0, 0] is None
    assert disp.contours_[1, 0] is None

    # line
    ax = disp.axes_[1, 0]
    assert ax.get_xlabel() == "ZN"
    assert ax.get_ylabel() == "Partial dependence"

    line = disp.lines_[1, 0]
    avg_preds, values = disp.pd_results[1]
    target_idx = disp.target_idx
    assert line.get_alpha() == 0.8

    line_data = line.get_data()
    assert_allclose(line_data[0], values[0])
    assert_allclose(line_data[1], avg_preds[target_idx].ravel())

    # contour
    ax = disp.axes_[0, 0]
    contour = disp.contours_[0, 0]
    expect_levels = np.linspace(*disp.pdp_lim[2], num=8)
    assert_allclose(contour.levels, expect_levels)
    assert ax.get_xlabel() == "CRIM"
    assert ax.get_ylabel() == "ZN"
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                               array_type, columns_name)

    if isinstance(indices, str) and array_type != "dataframe":
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices, axis=1)
    else:
        subset = _safe_indexing(array, indices, axis=1)

        expected_output = [3, 6, 9]
        if expected_output_type == "sparse":
            # sparse matrices keep the 2D shape
            expected_output = [[3], [6], [9]]
        expected_array = _convert_container(expected_output, expected_output_type)
        assert_allclose_dense_sparse(subset, expected_array)
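# A minimal sketch (not part of the test) of the "sparse matrices keep the 2D
# shape" comment above: selecting a single column from a scipy.sparse matrix
# returns an (n, 1) matrix, unlike NumPy scalar indexing, which is why the
# sparse expectation is written as [[3], [6], [9]].
import numpy as np
from scipy import sparse

dense = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
assert dense[:, 2].shape == (3,)                 # NumPy drops to 1D

sparse_mat = sparse.csr_matrix(dense)
assert sparse_mat[:, 2].shape == (3, 1)          # sparse stays 2D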
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    # validation of the indices
    # we make a copy because indices is mutable and shared between tests
    indices_converted = copy(indices)
    if indices_type == "slice" and isinstance(indices[1], int):
        indices_converted[1] += 1

    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                               array_type, columns_name)
    indices_converted = _convert_container(indices_converted, indices_type)

    if isinstance(indices[0], str) and array_type != "dataframe":
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices_converted, axis=1)
    else:
        subset = _safe_indexing(array, indices_converted, axis=1)
        assert_allclose_dense_sparse(
            subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type))
def test_random_over_sampler_smoothed_bootstrap(X_type, data):
    # check that smoothed bootstrap is working for numerical array
    X, y = data
    sampler = RandomOverSampler(shrinkage=1)
    X = _convert_container(X, X_type)
    X_res, y_res = sampler.fit_resample(X, y)

    assert y_res.shape == (14,)
    assert X_res.shape == (14, 2)

    if X_type == "dataframe":
        assert hasattr(X_res, "loc")
def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
    X, X_trans, sparse_
):
    """Check that `inverse_transform` raises an error with unknown samples, no
    dropped feature, and `handle_unknown="error"`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    """
    enc = OneHotEncoder(sparse=sparse_).fit(X)
    msg = (r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
           r"handle_unknown='error' because they contain all zeros")

    if sparse_:
        # emulate sparse data transform by a one-hot encoder sparse.
        X_trans = _convert_container(X_trans, "sparse")

    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_trans)
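# A small sketch (not part of the test) of why the error above is raised,
# assuming the same scikit-learn version as the tests here (which still use the
# `sparse` keyword): with drop=None and handle_unknown="error", every known
# category maps to exactly one non-zero column, so an all-zero row cannot be
# matched back to any category and inverse_transform refuses to guess.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc_demo = OneHotEncoder(sparse=False).fit(np.array([["a"], ["b"]]))
print(enc_demo.transform(np.array([["a"], ["b"]])))  # [[1. 0.], [0. 1.]] -- no all-zero row
# A row such as [0., 0.] can only come from an unknown category, hence the error.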
def test_graphical_lasso_cv_alphas_iterable(alphas_container_type):
    """Check that we can pass an array-like to `alphas`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/22489
    """
    true_cov = np.array([
        [0.8, 0.0, 0.2, 0.0],
        [0.0, 0.4, 0.0, 0.0],
        [0.2, 0.0, 0.3, 0.1],
        [0.0, 0.0, 0.1, 0.7],
    ])
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200)
    alphas = _convert_container([0.02, 0.03], alphas_container_type)
    GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X)
def test_value_difference_metric_property(dtype, k, r, y_type, encode_label):
    # Check the property of the vdm distance. Let's check the property
    # described in "Improved Heterogeneous Distance Functions", D.R. Wilson and
    # T.R. Martinez, Journal of Artificial Intelligence Research 6 (1997) 1-34
    # https://arxiv.org/pdf/cs/9701101.pdf
    #
    # "if an attribute color has three values red, green and blue, and the
    # application is to identify whether or not an object is an apple, red and
    # green would be considered closer than red and blue because the former two
    # both have similar correlations with the output class apple."

    # define our feature
    X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1)
    # 0 - not an apple / 1 - an apple
    y = np.array([1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1])
    y_labels = np.array(["not apple", "apple"], dtype=object)
    y = y_labels[y]
    y = _convert_container(y, y_type)
    if encode_label:
        y = LabelEncoder().fit_transform(y)

    encoder = OrdinalEncoder(dtype=dtype)
    X_encoded = encoder.fit_transform(X)

    vdm = ValueDifferenceMetric(k=k, r=r)
    vdm.fit(X_encoded, y)

    sample_green = encoder.transform([["green"]])
    sample_red = encoder.transform([["red"]])
    sample_blue = encoder.transform([["blue"]])

    for sample in (sample_green, sample_red, sample_blue):
        # computing the distance between a sample of the same category should
        # give a null distance
        dist = vdm.pairwise(sample).squeeze()
        assert dist == pytest.approx(0)

    # check the property explained in the introduction example
    dist_1 = vdm.pairwise(sample_green, sample_red).squeeze()
    dist_2 = vdm.pairwise(sample_blue, sample_red).squeeze()
    dist_3 = vdm.pairwise(sample_blue, sample_green).squeeze()

    # green and red are very close
    # blue is closer to red than to green
    assert dist_1 < dist_2
    assert dist_1 < dist_3
    assert dist_2 < dist_3
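# A minimal sketch (not the ValueDifferenceMetric implementation) of the
# intuition the asserts above rely on: a VDM-style distance compares per-class
# conditional probabilities, so colors with similar P(apple | color) are close.
# The counts below mirror the X and y built in the test above.
counts = {"green": 10, "red": 10, "blue": 10}         # samples per color
apples = {"green": 8, "red": 7, "blue": 1}            # "apple" labels per color
p_apple = {c: apples[c] / counts[c] for c in counts}  # P(apple | color)

def vdm_like(a, b, k=2):
    # sum over the two classes of |P(class | a) - P(class | b)| ** k
    return (abs(p_apple[a] - p_apple[b]) ** k
            + abs((1 - p_apple[a]) - (1 - p_apple[b])) ** k)

# green/red differ by 0.1 in P(apple | color), blue differs by 0.6-0.7,
# which reproduces dist_1 < dist_2 < dist_3 from the test.
assert vdm_like("green", "red") < vdm_like("blue", "red") < vdm_like("blue", "green")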
def test_r_regression(center):
    X, y = make_regression(n_samples=2000, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    corr_coeffs = r_regression(X, y, center=center)
    assert (-1 < corr_coeffs).all()
    assert (corr_coeffs < 1).all()

    sparse_X = _convert_container(X, "sparse")

    sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
    assert_allclose(sparse_corr_coeffs, corr_coeffs)

    # Testing against numpy for reference
    Z = np.hstack((X, y[:, np.newaxis]))
    correlation_matrix = np.corrcoef(Z, rowvar=False)
    np_corr_coeffs = correlation_matrix[:-1, -1]
    assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3)
def test_num_features_errors_1d_containers(X, constructor_name):
    X = _convert_container(X, constructor_name)
    if constructor_name == "array":
        expected_type_name = "numpy.ndarray"
    elif constructor_name == "series":
        expected_type_name = "pandas.core.series.Series"
    else:
        expected_type_name = constructor_name
    message = (
        "Unable to find the number of features from X of type "
        f"{expected_type_name}"
    )
    if hasattr(X, "shape"):
        message += " with shape (3,)"
    elif isinstance(X[0], str):
        message += " where the samples are of type str"
    with pytest.raises(TypeError, match=re.escape(message)):
        _num_features(X)
def test_permutation_importance_large_memmaped_data(input_type):
    # Smoke, non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/15810
    n_samples, n_features = int(5e4), 4
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               random_state=0)
    assert X.nbytes > 1e6  # trigger joblib memmapping

    X = _convert_container(X, input_type)
    clf = DummyClassifier(strategy='prior').fit(X, y)

    # Actual smoke test: should not raise any error:
    n_repeats = 5
    r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)

    # Auxiliary check: DummyClassifier is feature independent:
    # permuting a feature should not change the predictions
    expected_importances = np.zeros((n_features, n_repeats))
    assert_allclose(expected_importances, r.importances)
def test_checking_classifier(iris, input_type):
    # Check that the CheckingClassifier outputs what we expect
    X, y = iris
    X = _convert_container(X, input_type)
    clf = CheckingClassifier()
    clf.fit(X, y)

    assert_array_equal(clf.classes_, np.unique(y))
    assert len(clf.classes_) == 3
    assert clf.n_features_in_ == 4

    y_pred = clf.predict(X)
    assert_array_equal(y_pred, np.zeros(y_pred.size, dtype=int))

    assert clf.score(X) == pytest.approx(0)
    clf.set_params(foo_param=10)
    assert clf.fit(X, y).score(X) == pytest.approx(1)

    y_proba = clf.predict_proba(X)
    assert y_proba.shape == (150, 3)
    assert_allclose(y_proba[:, 0], 1)
    assert_allclose(y_proba[:, 1:], 0)

    y_decision = clf.decision_function(X)
    assert y_decision.shape == (150, 3)
    assert_allclose(y_decision[:, 0], 1)
    assert_allclose(y_decision[:, 1:], 0)

    # check the shape in case of binary classification
    first_2_classes = np.logical_or(y == 0, y == 1)
    X = _safe_indexing(X, first_2_classes)
    y = _safe_indexing(y, first_2_classes)
    clf.fit(X, y)

    y_proba = clf.predict_proba(X)
    assert y_proba.shape == (100, 2)
    assert_allclose(y_proba[:, 0], 1)
    assert_allclose(y_proba[:, 1], 0)

    y_decision = clf.decision_function(X)
    assert y_decision.shape == (100,)
    assert_allclose(y_decision, 0)
def test_value_difference_metric(data, dtype, k, r, y_type, encode_label):
    # Check basic features of the metric:
    # * the shape of the distance matrix is (n_samples, n_samples)
    # * computing the pairwise distance of X is the same as computing it
    #   explicitly between X and X
    X, y = data
    y = _convert_container(y, y_type)
    if encode_label:
        y = LabelEncoder().fit_transform(y)

    encoder = OrdinalEncoder(dtype=dtype)
    X_encoded = encoder.fit_transform(X)

    vdm = ValueDifferenceMetric(k=k, r=r)
    vdm.fit(X_encoded, y)

    dist_1 = vdm.pairwise(X_encoded)
    dist_2 = vdm.pairwise(X_encoded, X_encoded)

    np.testing.assert_allclose(dist_1, dist_2)
    assert dist_1.shape == (X.shape[0], X.shape[0])
    assert dist_2.shape == (X.shape[0], X.shape[0])
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    indices = 2
    subset = _safe_indexing(array, indices, axis=0)
    expected_array = _convert_container([7, 8, 9], expected_output_type)
    assert_allclose_dense_sparse(subset, expected_array)
def test_num_features(constructor_name):
    """Check _num_features for array-likes."""
    X = [[1, 2, 3], [4, 5, 6]]
    X = _convert_container(X, constructor_name)
    assert _num_features(X) == 3
def test_plot_partial_dependence_str_features(pyplot, clf_diabetes, diabetes,
                                              input_type, feature_names_type):
    if input_type == "dataframe":
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
    elif input_type == "list":
        X = diabetes.data.tolist()
    else:
        X = diabetes.data

    if feature_names_type is None:
        feature_names = None
    else:
        feature_names = _convert_container(diabetes.feature_names,
                                           feature_names_type)

    grid_resolution = 25
    # check with str features and array feature names and single column
    disp = plot_partial_dependence(
        clf_diabetes,
        X,
        [("age", "bmi"), "bmi"],
        grid_resolution=grid_resolution,
        feature_names=feature_names,
        n_cols=1,
        line_kw={"alpha": 0.8},
    )
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 3

    assert disp.figure_ is fig
    assert disp.axes_.shape == (2, 1)
    assert disp.lines_.shape == (2, 1)
    assert disp.contours_.shape == (2, 1)
    assert disp.deciles_vlines_.shape == (2, 1)
    assert disp.deciles_hlines_.shape == (2, 1)

    assert disp.lines_[0, 0] is None
    assert disp.deciles_vlines_[0, 0] is not None
    assert disp.deciles_hlines_[0, 0] is not None
    assert disp.contours_[1, 0] is None
    assert disp.deciles_hlines_[1, 0] is None
    assert disp.deciles_vlines_[1, 0] is not None

    # line
    ax = disp.axes_[1, 0]
    assert ax.get_xlabel() == "bmi"
    assert ax.get_ylabel() == "Partial dependence"

    line = disp.lines_[1, 0]
    avg_preds = disp.pd_results[1]
    target_idx = disp.target_idx
    assert line.get_alpha() == 0.8

    line_data = line.get_data()
    assert_allclose(line_data[0], avg_preds["values"][0])
    assert_allclose(line_data[1], avg_preds.average[target_idx].ravel())

    # contour
    ax = disp.axes_[0, 0]
    contour = disp.contours_[0, 0]
    expect_levels = np.linspace(*disp.pdp_lim[2], num=8)
    assert_allclose(contour.levels, expect_levels)
    assert ax.get_xlabel() == "age"
    assert ax.get_ylabel() == "bmi"
def test_convert_container(constructor_name, container_type):
    container = [0, 1]
    assert isinstance(_convert_container(container, constructor_name),
                      container_type)
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    indices = [False] + [True] * 2 + [False] * 6
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
def test_safe_indexing_1d_scalar(array_type):
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = 2
    subset = _safe_indexing(array, indices, axis=0)
    assert subset == 3
def test_safe_indexing_None_axis_0(array_type):
    X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    X_subset = _safe_indexing(X, None, axis=0)
    assert_allclose_dense_sparse(X_subset, X)