def test_one_hot_encoder_handle_unknown():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X2 = np.array([[4, 1, 1]])

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    oh = OneHotEncoder(handle_unknown='error')
    oh.fit(X)
    with pytest.raises(ValueError, match='Found unknown categories'):
        oh.transform(X2)

    # Test the ignore option, ignores unknown features (giving all 0's)
    oh = OneHotEncoder(handle_unknown='ignore')
    oh.fit(X)
    X2_passed = X2.copy()
    assert_array_equal(
        oh.transform(X2_passed).toarray(),
        np.array([[0., 0., 0., 0., 1., 0., 0.]]))

    # ensure transformed data was not modified in place
    assert_allclose(X2, X2_passed)

    # Raise error if handle_unknown is neither ignore nor error.
    oh = OneHotEncoder(handle_unknown='42')
    with pytest.raises(ValueError, match='handle_unknown should be either'):
        oh.fit(X)

def test_ridge_loo_cv_asym_scoring():
    # checking on asymmetric scoring
    scoring = 'explained_variance'
    n_samples, n_features = 10, 5
    n_targets = 1
    X, y = _make_sparse_offset_regression(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=1, n_informative=5)

    alphas = [1e-3, .1, 1., 10., 1e3]
    loo_ridge = RidgeCV(cv=n_samples, fit_intercept=True, alphas=alphas,
                        scoring=scoring, normalize=True)
    gcv_ridge = RidgeCV(fit_intercept=True, alphas=alphas,
                        scoring=scoring, normalize=True)

    loo_ridge.fit(X, y)
    gcv_ridge.fit(X, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)

def test_one_hot_encoder_pandas():
    pd = pytest.importorskip('pandas')

    X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})

    Xtr = check_categorical_onehot(X_df)
    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])

def test_ridge_gcv_vs_ridge_loo_cv(
        gcv_mode, X_constructor, X_shape, y_shape,
        fit_intercept, normalize, noise):
    n_samples, n_features = X_shape
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise, n_informative=5)
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]
    loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept,
                        alphas=alphas, scoring='neg_mean_squared_error',
                        normalize=normalize)
    gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept,
                        alphas=alphas, normalize=normalize)

    loo_ridge.fit(X, y)

    X_gcv = X_constructor(X)
    gcv_ridge.fit(X_gcv, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)

def test_solver_consistency(
        solver, proportion_nonzero, n_samples, dtype, sparse_X, seed):
    alpha = 1.
    noise = 50. if proportion_nonzero > .9 else 500.
    X, y = _make_sparse_offset_regression(
        bias=10, n_features=30, proportion_nonzero=proportion_nonzero,
        noise=noise, random_state=seed, n_samples=n_samples)
    svd_ridge = Ridge(solver='svd', normalize=True, alpha=alpha).fit(X, y)
    X = X.astype(dtype, copy=False)
    y = y.astype(dtype, copy=False)
    if sparse_X:
        X = sp.csr_matrix(X)
    if solver == 'ridgecv':
        ridge = RidgeCV(alphas=[alpha], normalize=True)
    else:
        ridge = Ridge(solver=solver, tol=1e-10, normalize=True, alpha=alpha)
    ridge.fit(X, y)
    assert_allclose(ridge.coef_, svd_ridge.coef_, atol=1e-3, rtol=1e-3)
    assert_allclose(ridge.intercept_, svd_ridge.intercept_,
                    atol=1e-3, rtol=1e-3)

def test_ridge_regression_dtype_stability(solver, seed):
    random_state = np.random.RandomState(seed)
    n_samples, n_features = 6, 5
    X = random_state.randn(n_samples, n_features)
    coef = random_state.randn(n_features)
    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
    alpha = 1.0
    results = dict()
    # XXX: Sparse CG seems to be far less numerically stable than the
    # others, maybe we should not enable float32 for this one.
    atol = 1e-3 if solver == "sparse_cg" else 1e-5

    for current_dtype in (np.float32, np.float64):
        results[current_dtype] = ridge_regression(X.astype(current_dtype),
                                                  y.astype(current_dtype),
                                                  alpha=alpha,
                                                  solver=solver,
                                                  random_state=random_state,
                                                  sample_weight=None,
                                                  max_iter=500,
                                                  tol=1e-10,
                                                  return_n_iter=False,
                                                  return_intercept=False)

    assert results[np.float32].dtype == np.float32
    assert results[np.float64].dtype == np.float64
    assert_allclose(results[np.float32], results[np.float64], atol=atol)

def test_knn_imputer_zero_nan_imputes_the_same(na):
    # Test with an imputable matrix and compare with different missing_values
    X_zero = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])

    X_nan = np.array([
        [1, na, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, na],
        [6, 6, na, 6, 6],
    ])

    X_imputed = np.array([
        [1, 2.5, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 1.5],
        [6, 6, 2.5, 6, 6],
    ])

    imputer_zero = KNNImputer(missing_values=0, n_neighbors=2,
                              weights="uniform")
    imputer_nan = KNNImputer(missing_values=na, n_neighbors=2,
                             weights="uniform")

    assert_allclose(imputer_zero.fit_transform(X_zero), X_imputed)
    assert_allclose(imputer_zero.fit_transform(X_zero),
                    imputer_nan.fit_transform(X_nan))

def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough
    )
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    expected_result_length = 2 if predict_params else 1
    if predict_params:
        assert len(result) == expected_result_length

    X_trans = reg.transform(X_test)
    expected_column_count = 12 if passthrough else 2
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    expected_column_count_drop = 11 if passthrough else 1
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

def test_ovr_decision_function():
    # test properties for ovr decision function

    predictions = np.array([[0, 1, 1],
                            [0, 1, 0],
                            [0, 1, 1],
                            [0, 1, 1]])

    confidences = np.array([[-1e16, 0, -1e16],
                            [1., 2., -3.],
                            [-5., 2., 5.],
                            [-0.5, 0.2, 0.5]])

    n_classes = 3

    dec_values = _ovr_decision_function(predictions, confidences, n_classes)

    # check that the decision values are within 0.5 range of the votes
    votes = np.array([[1, 0, 2],
                      [1, 1, 1],
                      [1, 0, 2],
                      [1, 0, 2]])

    assert_allclose(votes, dec_values, atol=0.5)

    # check that the predictions are what we expect:
    # highest vote or highest confidence if there is a tie.
    # for the second sample we have a tie (should be won by 1)
    expected_prediction = np.array([2, 1, 2, 2])
    assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)

    # third and fourth sample have the same vote but third sample
    # has higher confidence, this should reflect on the decision values
    assert (dec_values[2, 2] > dec_values[3, 2])

    # assert subset invariance.
    dec_values_one = [_ovr_decision_function(np.array([predictions[i]]),
                                             np.array([confidences[i]]),
                                             n_classes)[0]
                      for i in range(4)]

    assert_allclose(dec_values, dec_values_one, atol=1e-6)

def test_iterative_imputer_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)

def test_stacking_with_sample_weight(stacker, X, y):
    # check that sample weights have an influence on the fitting
    # note: ConvergenceWarnings are caught since we are not worried about
    # convergence here
    n_half_samples = len(y) // 2
    total_sample_weight = np.array(
        [0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples)
    )
    X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
        X, y, total_sample_weight, random_state=42
    )

    with ignore_warnings(category=ConvergenceWarning):
        stacker.fit(X_train, y_train)
    y_pred_no_weight = stacker.predict(X_test)

    with ignore_warnings(category=ConvergenceWarning):
        stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape))
    y_pred_unit_weight = stacker.predict(X_test)

    assert_allclose(y_pred_no_weight, y_pred_unit_weight)

    with ignore_warnings(category=ConvergenceWarning):
        stacker.fit(X_train, y_train, sample_weight=sample_weight_train)
    y_pred_biased = stacker.predict(X_test)

    assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0

def test_iterative_imputer_early_stopping():
    rng = np.random.RandomState(0)
    n = 50
    d = 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=100,
                               tol=1e-2,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter

def test_iterative_imputer_skip_non_missing(skip_complete):
    # check the imputing strategy when missing data are present in the
    # testing set only.
    # taken from: https://github.com/scikit-learn/scikit-learn/issues/14383
    rng = np.random.RandomState(0)
    X_train = np.array([
        [5, 2, 2, 1],
        [10, 1, 2, 7],
        [3, 1, 1, 1],
        [8, 4, 2, 2]
    ])
    X_test = np.array([
        [np.nan, 2, 4, 5],
        [np.nan, 4, 1, 2],
        [np.nan, 1, 10, 1]
    ])
    imputer = IterativeImputer(
        initial_strategy='mean', skip_complete=skip_complete, random_state=rng
    )
    X_test_est = imputer.fit(X_train).transform(X_test)
    if skip_complete:
        # impute with the initial strategy: 'mean'
        assert_allclose(X_test_est[:, 0], np.mean(X_train[:, 0]))
    else:
        assert_allclose(X_test_est[:, 0], [11, 7, 12], rtol=1e-4)

def test_check_sample_weight():
    # check array order
    sample_weight = np.ones(10)[::2]
    assert not sample_weight.flags["C_CONTIGUOUS"]
    sample_weight = _check_sample_weight(sample_weight, X=np.ones((5, 1)))
    assert sample_weight.flags["C_CONTIGUOUS"]

    # check None input
    sample_weight = _check_sample_weight(None, X=np.ones((5, 2)))
    assert_allclose(sample_weight, np.ones(5))

    # check numbers input
    sample_weight = _check_sample_weight(2.0, X=np.ones((5, 2)))
    assert_allclose(sample_weight, 2 * np.ones(5))

    # check wrong number of dimensions
    with pytest.raises(ValueError,
                       match="Sample weights must be 1D array or scalar"):
        _check_sample_weight(np.ones((2, 4)), X=np.ones((2, 2)))

    # check incorrect n_samples
    msg = r"sample_weight.shape == \(4,\), expected \(2,\)!"
    with pytest.raises(ValueError, match=msg):
        _check_sample_weight(np.ones(4), X=np.ones((2, 2)))

    # float32 dtype is preserved
    X = np.ones((5, 2))
    sample_weight = np.ones(5, dtype=np.float32)
    sample_weight = _check_sample_weight(sample_weight, X)
    assert sample_weight.dtype == np.float32

    # int dtype will be converted to float64 instead
    X = np.ones((5, 2), dtype=np.int)
    sample_weight = _check_sample_weight(None, X, dtype=X.dtype)
    assert sample_weight.dtype == np.float64

def test_dtype_match(solver):
    rng = np.random.RandomState(0)
    alpha = 1.0
    n_samples, n_features = 6, 5

    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    tol = 2 * np.finfo(np.float32).resolution
    # Check type consistency 32 bits
    ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=tol)
    ridge_32.fit(X_32, y_32)
    coef_32 = ridge_32.coef_

    # Check type consistency 64 bits
    ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=tol)
    ridge_64.fit(X_64, y_64)
    coef_64 = ridge_64.coef_

    # Do the actual checks at once for easier debugging
    assert coef_32.dtype == X_32.dtype
    assert coef_64.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4, atol=5e-4)

def test_inverse_transform(algo, X_sparse):
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    Xt = tsvd.fit_transform(X_sparse)
    Xinv = tsvd.inverse_transform(Xt)
    assert_allclose(Xinv, X_sparse.toarray(), rtol=1e-1, atol=2e-1)

def test_threshold_and_max_features():
    X, y = datasets.make_classification(n_samples=1000, n_features=10,
                                        n_informative=3, n_redundant=0,
                                        n_repeated=0, shuffle=False,
                                        random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])

def test_knn_imputer_distance_weighted_not_enough_neighbors(na,
                                                            working_memory):
    X = np.array([
        [3, na],
        [2, na],
        [na, 4],
        [5, 6],
        [6, 8],
        [na, 5]
    ])

    dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
                              missing_values=na)

    X_01 = np.average(X[3:5, 1], weights=1 / dist[0, 3:5])
    X_11 = np.average(X[3:5, 1], weights=1 / dist[1, 3:5])
    X_20 = np.average(X[3:5, 0], weights=1 / dist[2, 3:5])
    X_50 = np.average(X[3:5, 0], weights=1 / dist[5, 3:5])

    X_expected = np.array([
        [3, X_01],
        [2, X_11],
        [X_20, 4],
        [5, 6],
        [6, 8],
        [X_50, 5]
    ])

    with config_context(working_memory=working_memory):
        knn_3 = KNNImputer(missing_values=na, n_neighbors=3,
                           weights='distance')
        assert_allclose(knn_3.fit_transform(X), X_expected)

        knn_4 = KNNImputer(missing_values=na, n_neighbors=4,
                           weights='distance')
        assert_allclose(knn_4.fit_transform(X), X_expected)

def test_iterative_imputer_all_missing():
    n = 100
    d = 3
    X = np.zeros((n, d))
    imputer = IterativeImputer(missing_values=0, max_iter=1)
    X_imputed = imputer.fit_transform(X)
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

def test_transform_target_regressor_multi_to_single():
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def func(y):
        out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return out[:, np.newaxis]

    def inverse_func(y):
        return y

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_2d_func = tt.predict(X)
    assert y_pred_2d_func.shape == (100, 1)

    # force the function to only return a 1D array
    def func(y):
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_1d_func = tt.predict(X)
    assert y_pred_1d_func.shape == (100, 1)

    assert_allclose(y_pred_1d_func, y_pred_2d_func)

def test_pca_svd_solver_auto(data, n_components, expected_solver):
    pca_auto = PCA(n_components=n_components, random_state=0)
    pca_test = PCA(n_components=n_components, svd_solver=expected_solver,
                   random_state=0)
    pca_auto.fit(data)
    pca_test.fit(data)
    assert_allclose(pca_auto.components_, pca_test.components_)

def test_pca_check_projection_list(svd_solver):
    # Test that the projection of data is correct
    X = [[1.0, 0.0], [0.0, 1.0]]
    pca = PCA(n_components=1, svd_solver=svd_solver, random_state=0)
    X_trans = pca.fit_transform(X)
    assert X_trans.shape == (2, 1)
    assert_allclose(X_trans.mean(), 0.00, atol=1e-12)
    assert_allclose(X_trans.std(), 0.71, rtol=5e-3)

def test_pca_explained_variance_empirical(X, svd_solver):
    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
    X_pca = pca.fit_transform(X)
    assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0))

    expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
    expected_result = sorted(expected_result, reverse=True)[:2]
    assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)

def test_pca_score_consistency_solvers(svd_solver):
    # Check the consistency of score between solvers
    X, _ = datasets.load_digits(return_X_y=True)
    pca_full = PCA(n_components=30, svd_solver='full', random_state=0)
    pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0)

    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)

def test_knn_imputer_drops_all_nan_features(na):
    X1 = np.array([[na, 1],
                   [na, 2]])
    knn = KNNImputer(missing_values=na, n_neighbors=1)
    X1_expected = np.array([[1], [2]])
    assert_allclose(knn.fit_transform(X1), X1_expected)

    X2 = np.array([[1, 2],
                   [3, na]])
    X2_expected = np.array([[2], [1.5]])
    assert_allclose(knn.transform(X2), X2_expected)

def test_knn_imputer_one_n_neighbors(na):
    X = np.array([[0, 0],
                  [na, 2],
                  [4, 3],
                  [5, na],
                  [7, 7],
                  [na, 8],
                  [14, 13]])

    X_imputed = np.array([[0, 0],
                          [4, 2],
                          [4, 3],
                          [5, 3],
                          [7, 7],
                          [7, 8],
                          [14, 13]])

    imputer = KNNImputer(n_neighbors=1, missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

def test_nrm2(dtype):
    nrm2 = _nrm2_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)

    expected = np.linalg.norm(x)
    actual = nrm2(x)

    assert_allclose(actual, expected, rtol=RTOL[dtype])

def test_asum(dtype):
    asum = _asum_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)

    expected = np.abs(x).sum()
    actual = asum(x)

    assert_allclose(actual, expected, rtol=RTOL[dtype])

def test_pca_deterministic_output(svd_solver):
    rng = np.random.RandomState(0)
    X = rng.rand(10, 10)

    transformed_X = np.zeros((20, 2))
    for i in range(20):
        pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
        transformed_X[i, :] = pca.fit_transform(X)[0]
    assert_allclose(
        transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))

def test_precomputed_dists():
    redX = X[::2]
    dists = pairwise_distances(redX, metric='euclidean')
    clust1 = OPTICS(min_samples=10, algorithm='brute',
                    metric='precomputed').fit(dists)
    clust2 = OPTICS(min_samples=10, algorithm='brute',
                    metric='euclidean').fit(redX)

    assert_allclose(clust1.reachability_, clust2.reachability_)
    assert_array_equal(clust1.labels_, clust2.labels_)