def check_categorical_onehot(X):
    enc = OneHotEncoder(categories='auto')
    Xtr1 = enc.fit_transform(X)

    enc = OneHotEncoder(categories='auto', sparse=False)
    Xtr2 = enc.fit_transform(X)

    assert_allclose(Xtr1.toarray(), Xtr2)

    assert sparse.isspmatrix_csr(Xtr1)
    return Xtr1.toarray()
def test_dirichlet_expectation():
    """Test Cython version of Dirichlet expectation calculation."""
    x = np.logspace(-100, 10, 10000)
    expectation = np.empty_like(x)
    _dirichlet_expectation_1d(x, 0, expectation)
    assert_allclose(expectation, np.exp(psi(x) - psi(np.sum(x))),
                    atol=1e-19)

    x = x.reshape(100, 100)
    assert_allclose(_dirichlet_expectation_2d(x),
                    psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]),
                    rtol=1e-11, atol=3e-9)
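
# An aside, not part of the original suite: the formula verified above is
# the standard Dirichlet identity E[log theta_k] = psi(alpha_k) -
# psi(sum(alpha)) for theta ~ Dirichlet(alpha). A minimal Monte Carlo
# sketch of that identity, using only NumPy/SciPy:
def _demo_dirichlet_expectation_identity():
    import numpy as np
    from scipy.special import psi

    rng = np.random.RandomState(0)
    alpha = np.array([0.5, 1.0, 2.0])
    analytic = psi(alpha) - psi(alpha.sum())
    # empirical mean of log(theta) over Dirichlet draws
    samples = rng.dirichlet(alpha, size=200000)
    empirical = np.log(samples).mean(axis=0)
    # the two agree to roughly Monte Carlo accuracy (~1e-2 here)
    print(analytic, empirical)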
def test_X_CenterStackOp(n_col):
    rng = np.random.RandomState(0)
    X = rng.randn(11, 8)
    X_m = rng.randn(8)
    sqrt_sw = rng.randn(len(X))
    Y = rng.randn(11, *n_col)
    A = rng.randn(9, *n_col)
    operator = _X_CenterStackOp(sp.csr_matrix(X), X_m, sqrt_sw)
    reference_operator = np.hstack(
        [X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]])
    assert_allclose(reference_operator.dot(A), operator.dot(A))
    assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y))
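
# Context for the test above (an aside using only scipy's public API):
# explicitly centering a sparse matrix densifies it, which is why ridge GCV
# uses a matvec operator that applies the centering implicitly. A hedged
# sketch of the same idea with scipy.sparse.linalg.LinearOperator:
def _demo_implicit_centering_operator():
    import numpy as np
    import scipy.sparse as sp
    from scipy.sparse.linalg import LinearOperator

    rng = np.random.RandomState(0)
    X = sp.random(11, 8, density=0.3, format='csr', random_state=rng)
    X_mean = np.asarray(X.mean(axis=0)).ravel()

    centered = LinearOperator(
        shape=X.shape,
        matvec=lambda v: X @ v - X_mean @ v,           # (X - 1 mean^T) v
        rmatvec=lambda u: X.T @ u - X_mean * u.sum(),  # its transpose
    )
    v, u = rng.randn(8), rng.randn(11)
    dense = X.toarray() - X_mean  # the densified equivalent
    assert np.allclose(centered.matvec(v), dense @ v)
    assert np.allclose(centered.rmatvec(u), dense.T @ u)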
def test_iterative_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    # column 0 is entirely missing; the remaining data has no missing values
    X[:, 0] = np.nan
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # the all-missing first column should be excluded entirely
    assert_allclose(X[:, 1:], pred1)
    # fit().transform() and fit_transform() should give identical results
    assert_allclose(pred1, pred2)
def test_pca_check_projection(svd_solver):
    # Test that the projection of data is correct
    rng = np.random.RandomState(0)
    n, p = 100, 3
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    Yt = PCA(n_components=2, svd_solver=svd_solver).fit(X).transform(Xt)
    Yt /= np.sqrt((Yt ** 2).sum())

    assert_allclose(np.abs(Yt[0][0]), 1., rtol=5e-3)
def test_imputers_add_indicator(marker, imputer_constructor):
    X = np.array([
        [marker, 1,      5,      marker, 1],
        [2,      marker, 1,      marker, 2],
        [6,      3,      marker, marker, 3],
        [1,      2,      9,      marker, 4]
    ])
    X_true_indicator = np.array([
        [1., 0., 0., 1.],
        [0., 1., 0., 1.],
        [0., 0., 1., 1.],
        [0., 0., 0., 1.]
    ])
    imputer = imputer_constructor(missing_values=marker, add_indicator=True)

    X_trans = imputer.fit(X).transform(X)
    # The indicator columns are appended last, so only the trailing four
    # columns are checked here.
    assert_allclose(X_trans[:, -4:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
def test_axpy(dtype):
    axpy = _axpy_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    alpha = 2.5

    expected = alpha * x + y
    axpy(alpha, x, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])
def test_k_means_empty_cluster_relocated():
    # check that empty clusters are correctly relocated when using sample
    # weights (#13486)
    X = np.array([[-1], [1]])
    sample_weight = [1.9, 0.1]
    init = np.array([[-1], [10]])

    km = KMeans(n_clusters=2, init=init, n_init=1)
    km.fit(X, sample_weight=sample_weight)

    assert len(set(km.labels_)) == 2
    assert_allclose(km.cluster_centers_, [[-1], [1]])
def _test_ridge_loo(filter_):
    # test that ridge GCV works with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get the same best alpha with a custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # check that we get the same best alpha with a custom score_func
    def func(x, y):
        return -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # check that we get the same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # check that we get the same best alpha with sample weights
    if filter_ == DENSE_FILTER:
        ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                      sample_weight=np.ones(n_samples))
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred, rtol=1e-5)

    return ret
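
# An aside on why a single fit suffices above (a hedged sketch; the helper
# name is ours, not sklearn's): for ridge without intercept, the exact
# leave-one-out residual follows from the hat matrix
# H = X (X^T X + alpha I)^-1 X^T as (y_i - yhat_i) / (1 - H_ii), which is
# what makes GCV-style alpha selection cheap. Verified against brute-force
# refits:
def _demo_ridge_loo_shortcut():
    import numpy as np

    rng = np.random.RandomState(0)
    X, y, alpha = rng.randn(30, 4), rng.randn(30), 1.0

    G = X.T @ X + alpha * np.eye(X.shape[1])
    H = X @ np.linalg.solve(G, X.T)
    loo_shortcut = (y - H @ y) / (1 - np.diag(H))

    # brute-force leave-one-out: refit once per held-out sample
    loo_brute = np.empty_like(y)
    for i in range(len(y)):
        keep = np.arange(len(y)) != i
        Gi = X[keep].T @ X[keep] + alpha * np.eye(X.shape[1])
        w = np.linalg.solve(Gi, X[keep].T @ y[keep])
        loo_brute[i] = y[i] - X[i] @ w

    assert np.allclose(loo_shortcut, loo_brute)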
def test_iterative_imputer_rank_one():
    rng = np.random.RandomState(0)
    d = 50
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.02)
def test_fused_types_consistency(make_dataset_32, make_dataset_64):
    dataset_32, dataset_64 = make_dataset_32(), make_dataset_64()
    NUMBER_OF_RUNS = 5
    for _ in range(NUMBER_OF_RUNS):
        # next sample
        (xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
        (xi_data64, _, _), yi64, _, _ = dataset_64._next_py()

        assert xi_data32.dtype == np.float32
        assert xi_data64.dtype == np.float64

        assert_allclose(xi_data64, xi_data32, rtol=1e-5)
        assert_allclose(yi64, yi32, rtol=1e-5)
def test_ger(dtype, order):
    ger = _ger_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    A = np.asarray(rng.random_sample((10, 20)).astype(dtype, copy=False),
                   order=ORDER[order])
    alpha = 2.5

    expected = alpha * np.outer(x, y) + A
    ger(alpha, x, y, A)

    assert_allclose(A, expected, rtol=RTOL[dtype])
def test_pca_inverse(svd_solver, whiten):
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = PCA(n_components=2, svd_solver=svd_solver, whiten=whiten).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_allclose(X, Y_inverse, rtol=5e-6)
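
# For reference (a hedged sketch, non-whitened case only): PCA's
# inverse_transform is the affine map back through the principal axes,
# X_hat = Y @ components_ + mean_:
def _demo_pca_inverse_by_hand():
    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    pca = PCA(n_components=2).fit(X)
    Y = pca.transform(X)
    X_hat = Y @ pca.components_ + pca.mean_  # manual inverse_transform
    assert np.allclose(X_hat, pca.inverse_transform(Y))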
def test_pca_singular_values_consistency(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full', random_state=rng)
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)

    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.singular_values_,
                    pca_other.singular_values_, rtol=5e-3)
def test_euclidean_distances_with_norms(dtype, y_array_constr):
    # check that we still get the right answers with {X,Y}_norm_squared
    # and that we get a wrong answer with wrong {X,Y}_norm_squared
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 10)).astype(dtype, copy=False)
    Y = rng.random_sample((20, 10)).astype(dtype, copy=False)

    # norms will only be used if their dtype is float64
    X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)

    Y = y_array_constr(Y)

    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)

    assert_allclose(D2, D1)
    assert_allclose(D3, D1)
    assert_allclose(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    with pytest.raises(AssertionError):
        assert_allclose(wrong_D, D1)
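
# Why precomputed squared norms are enough (an aside in plain NumPy): the
# pairwise squared distance expands as ||x||^2 - 2 <x, y> + ||y||^2, so the
# full matrix is one matrix product plus two broadcast additions:
def _demo_euclidean_expansion():
    import numpy as np

    rng = np.random.RandomState(0)
    X, Y = rng.rand(5, 3), rng.rand(4, 3)
    XX = (X ** 2).sum(axis=1)[:, None]   # shape (5, 1)
    YY = (Y ** 2).sum(axis=1)[None, :]   # shape (1, 4)
    D = np.sqrt(np.maximum(XX - 2 * X @ Y.T + YY, 0))
    # brute-force reference
    ref = np.sqrt(((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1))
    assert np.allclose(D, ref)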
def test_pairwise_distances_chunked_reduce():
    rng = np.random.RandomState(0)
    X = rng.random_sample((400, 4))
    # Reduced Euclidean distance
    S = pairwise_distances(X)[:, :100]
    S_chunks = pairwise_distances_chunked(X, None, reduce_func=_reduce_func,
                                          working_memory=2 ** -16)
    assert isinstance(S_chunks, GeneratorType)
    S_chunks = list(S_chunks)
    assert len(S_chunks) > 1
    # atol accounts for the diagonal, which pairwise_distances
    # explicitly zeroes
    assert_allclose(np.vstack(S_chunks), S, atol=1e-7)
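
# _reduce_func above is defined elsewhere in the suite; judging by the
# sliced reference S, it keeps each chunk's first 100 columns. A hedged
# reconstruction (ours, hypothetical) showing the
# reduce_func(D_chunk, start) contract:
def _demo_reduce_func_contract():
    import numpy as np
    from sklearn.metrics import pairwise_distances_chunked

    def reduce_func(D_chunk, start):
        # D_chunk holds the distances for rows [start, start + len(D_chunk))
        return D_chunk[:, :100]

    rng = np.random.RandomState(0)
    X = rng.random_sample((400, 4))
    chunks = pairwise_distances_chunked(X, reduce_func=reduce_func,
                                        working_memory=2 ** -16)
    S = np.vstack(list(chunks))
    assert S.shape == (400, 100)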
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)), np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))
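
# A hedged usage example for the helper above (the "true" density is
# computed by hand, and the demo name is ours): a 1-D Gaussian KDE
# evaluated at Y is the mean of normal pdfs centered on the training
# points, scaled by the bandwidth:
def _demo_check_results_gaussian():
    import numpy as np
    from scipy.stats import norm

    rng = np.random.RandomState(0)
    X = rng.randn(100, 1)
    Y = np.linspace(-3, 3, 7)[:, None]
    bandwidth = 0.5
    dens_true = norm.pdf((Y - X.T) / bandwidth).mean(axis=1) / bandwidth
    check_results('gaussian', bandwidth, atol=0, rtol=0,
                  X=X, Y=Y, dens_true=dens_true)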
def test_gemv(dtype, opA, transA, order):
    gemv = _gemv_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    A = np.asarray(opA(rng.random_sample((20, 10)).astype(dtype, copy=False)),
                   order=ORDER[order])
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(x) + beta * y
    gemv(transA, alpha, A, x, beta, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])
def test_kernel_pca_deterministic_output():
    rng = np.random.RandomState(0)
    X = rng.rand(10, 10)
    eigen_solver = ('arpack', 'dense')

    for solver in eigen_solver:
        transformed_X = np.zeros((20, 2))
        for i in range(20):
            kpca = KernelPCA(n_components=2, eigen_solver=solver,
                             random_state=rng)
            transformed_X[i, :] = kpca.fit_transform(X)[0]

        assert_allclose(
            transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))
def test_explained_variance_components_10_20(X_sparse, kind, solver):
    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
    svd_10 = TruncatedSVD(10, algorithm=solver, n_iter=10).fit(X)
    svd_20 = TruncatedSVD(20, algorithm=solver, n_iter=10).fit(X)

    # Assert that the first 10 explained variance ratios agree
    assert_allclose(
        svd_10.explained_variance_ratio_,
        svd_20.explained_variance_ratio_[:10],
        rtol=5e-3,
    )

    # Assert that 20 components explain more variance than 10
    assert (svd_20.explained_variance_ratio_.sum() >
            svd_10.explained_variance_ratio_.sum())
def test_euclidean_distances_upcast_sym(batch_size, x_array_constr):
    # check batch handling when X is Y (#13910)
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10)).astype(np.float32)
    X[X < 0.8] = 0

    expected = squareform(pdist(X))

    X = x_array_constr(X)
    distances = _euclidean_distances_upcast(X, Y=X, batch_size=batch_size)
    distances = np.sqrt(np.maximum(distances, 0))

    # the default rtol=1e-7 is too close to the float32 precision
    # and fails due to rounding errors.
    assert_allclose(distances, expected, rtol=1e-6)
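
# Why the upcast exists (an aside in plain NumPy, not sklearn's internal
# implementation): the expansion ||x||^2 - 2 <x, y> + ||y||^2 cancels
# catastrophically in float32 when points sit far from the origin, while
# doing the same arithmetic in float64 keeps the accuracy:
def _demo_float32_expansion_loss():
    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    rng = np.random.RandomState(0)
    X = (rng.rand(50, 10) + 100).astype(np.float32)  # large offset
    exact = squareform(pdist(X.astype(np.float64)))

    def expansion(A):
        sq = (A ** 2).sum(axis=1)
        return np.sqrt(np.maximum(sq[:, None] - 2 * A @ A.T + sq[None, :], 0))

    err32 = np.abs(expansion(X) - exact).max()
    err64 = np.abs(expansion(X.astype(np.float64)) - exact).max()
    assert err64 < err32  # the float64 path is far more accurate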
def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)
def test_pipeline_score_samples_pca_lof():
    iris = load_iris()
    X = iris.data
    # Test that the score_samples method is implemented on a pipeline, and
    # that it yields the same results as applying the transform and
    # score_samples steps separately.
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([('pca', pca), ('lof', lof)])
    pipe.fit(X)
    # Check the shapes
    assert pipe.score_samples(X).shape == (X.shape[0],)
    # Check the values
    lof.fit(pca.fit_transform(X))
    assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
def test_euclidean_distances_sym(dtype, x_array_constr):
    # check that euclidean_distances gives the same result as scipy pdist
    # when only X is provided
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10)).astype(dtype, copy=False)
    X[X < 0.8] = 0

    expected = squareform(pdist(X))

    X = x_array_constr(X)
    distances = euclidean_distances(X)

    # the default rtol=1e-7 is too close to the float32 precision
    # and fails due to rounding errors.
    assert_allclose(distances, expected, rtol=1e-6)
    assert distances.dtype == dtype
def test_compute_covariance(shape, uniform_weights):
    rng = np.random.RandomState(0)
    X = rng.randn(*shape)
    if uniform_weights:
        sw = np.ones(X.shape[0])
    else:
        sw = rng.chisquare(1, shape[0])
    sqrt_sw = np.sqrt(sw)
    X_mean = np.average(X, axis=0, weights=sw)
    X_centered = (X - X_mean) * sqrt_sw[:, None]
    true_covariance = X_centered.T.dot(X_centered)
    X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
    gcv = _RidgeGCV(fit_intercept=True)
    computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw)
    assert_allclose(X_mean, computed_mean)
    assert_allclose(true_covariance, computed_cov)
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    X_sparse = arr_type([
        [np.nan, 1, 5],
        [2, np.nan, 1],
        [6, 3, np.nan],
        [1, 2, 9]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    X_trans = imputer.fit_transform(X_sparse)

    assert sparse.issparse(X_trans)
    assert X_trans.shape == X_true.shape
    assert_allclose(X_trans.toarray(), X_true)
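
# What add_indicator=True does, restated as a hedged sketch (assuming the
# documented behavior: indicator columns for features that had missing
# values are appended after the imputed ones):
def _demo_add_indicator_equivalence():
    import numpy as np
    from sklearn.impute import MissingIndicator, SimpleImputer

    X = np.array([[np.nan, 1.], [2., np.nan], [3., 4.]])
    combined = SimpleImputer(add_indicator=True).fit_transform(X)
    manual = np.hstack([
        SimpleImputer().fit_transform(X),          # mean-imputed features
        MissingIndicator().fit_transform(X).astype(float),  # mask columns
    ])
    assert np.allclose(combined, manual)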
def test_gemm(dtype, opA, transA, opB, transB, order):
    gemm = _gemm_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    A = np.asarray(opA(rng.random_sample((30, 10)).astype(dtype, copy=False)),
                   order=ORDER[order])
    B = np.asarray(opB(rng.random_sample((10, 20)).astype(dtype, copy=False)),
                   order=ORDER[order])
    C = np.asarray(rng.random_sample((30, 20)).astype(dtype, copy=False),
                   order=ORDER[order])
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(opB(B)) + beta * C
    gemm(transA, transB, alpha, A, B, beta, C)

    assert_allclose(C, expected, rtol=RTOL[dtype])
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
    # Test that linear regression agrees between sparse and dense
    rng = check_random_state(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.
    Xcsr = sparse.csr_matrix(X)
    y = rng.rand(n_samples)
    params = dict(normalize=normalize, fit_intercept=fit_intercept)
    clf_dense = LinearRegression(**params)
    clf_sparse = LinearRegression(**params)
    clf_dense.fit(X, y)
    clf_sparse.fit(Xcsr, y)
    assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_)
    assert_allclose(clf_dense.coef_, clf_sparse.coef_)
def test_rot(dtype):
    rot = _rot_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    c = dtype(rng.randn())
    s = dtype(rng.randn())

    expected_x = c * x + s * y
    expected_y = c * y - s * x

    rot(x, y, c, s)

    assert_allclose(x, expected_x)
    assert_allclose(y, expected_y)
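
# The expectation above is the Givens plane rotation applied elementwise:
# [x_i'; y_i'] = [[c, s], [-s, c]] @ [x_i; y_i]. A NumPy restatement (the
# rotation reading assumes c^2 + s^2 = 1, which the test does not enforce):
def _demo_givens_rotation():
    import numpy as np

    rng = np.random.RandomState(0)
    x, y = rng.randn(10), rng.randn(10)
    c, s = np.cos(0.3), np.sin(0.3)
    rotated = np.array([[c, s], [-s, c]]) @ np.vstack([x, y])
    assert np.allclose(rotated[0], c * x + s * y)
    assert np.allclose(rotated[1], c * y - s * x)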
def check_pca_int_dtype_upcast_to_double(svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
    X_i64 = X_i64.astype(np.int64, copy=False)
    X_i32 = X_i64.astype(np.int32, copy=False)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(X_i64).dtype == np.float64
    assert pca_32.transform(X_i32).dtype == np.float64

    assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4)