Example #1
def check_categorical_onehot(X):
    enc = OneHotEncoder(categories='auto')
    Xtr1 = enc.fit_transform(X)

    enc = OneHotEncoder(categories='auto', sparse=False)
    Xtr2 = enc.fit_transform(X)

    assert_allclose(Xtr1.toarray(), Xtr2)

    assert sparse.isspmatrix_csr(Xtr1)
    return Xtr1.toarray()
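These snippets are lifted from scikit-learn's test suite and omit their module-level imports. A minimal sketch of the setup the first example assumes (an approximation: the real test modules import assert_allclose through scikit-learn's private testing helpers, and each later snippet additionally needs the estimator or private helper it exercises):

# Common imports assumed by the examples below (a sketch, not the
# exact test-module headers).
import numpy as np
import pytest
from scipy import sparse
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.preprocessing import OneHotEncoder  # estimator used in Example #1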
Example #2
def test_dirichlet_expectation():
    """Test Cython version of Dirichlet expectation calculation."""
    x = np.logspace(-100, 10, 10000)
    expectation = np.empty_like(x)
    _dirichlet_expectation_1d(x, 0, expectation)
    assert_allclose(expectation, np.exp(psi(x) - psi(np.sum(x))), atol=1e-19)

    x = x.reshape(100, 100)
    assert_allclose(_dirichlet_expectation_2d(x),
                    psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]),
                    rtol=1e-11,
                    atol=3e-9)
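The reference expression is the standard Dirichlet identity: for theta ~ Dirichlet(x), E[log theta_k] = psi(x_k) - psi(sum_j x_j), where psi is the digamma function from scipy.special; the test checks the Cython kernels against it in both the 1-d and row-wise 2-d cases.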
Example #3
def test_X_CenterStackOp(n_col):
    rng = np.random.RandomState(0)
    X = rng.randn(11, 8)
    X_m = rng.randn(8)
    sqrt_sw = rng.randn(len(X))
    Y = rng.randn(11, *n_col)
    A = rng.randn(9, *n_col)
    operator = _X_CenterStackOp(sp.csr_matrix(X), X_m, sqrt_sw)
    reference_operator = np.hstack(
        [X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]])
    assert_allclose(reference_operator.dot(A), operator.dot(A))
    assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y))
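_X_CenterStackOp is private machinery from scikit-learn's ridge GCV path; judging by the dense reference built here, it acts like the weighted, centered design matrix hstacked with a sqrt_sw intercept column, while operating on the sparse X without ever densifying it.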
Example #4
def test_iterative_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # should exclude the first column entirely
    assert_allclose(X[:, 1:], pred1)
    # fit(X).transform(X) and fit_transform(X) should give identical results
    assert_allclose(pred1, pred2)
Example #5
def test_pca_check_projection(svd_solver):
    # Test that the projection of data is correct
    rng = np.random.RandomState(0)
    n, p = 100, 3
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    Yt = PCA(n_components=2, svd_solver=svd_solver).fit(X).transform(Xt)
    Yt /= np.sqrt((Yt**2).sum())

    assert_allclose(np.abs(Yt[0][0]), 1., rtol=5e-3)
Example #6
def test_imputers_add_indicator(marker, imputer_constructor):
    X = np.array([[marker, 1, 5, marker, 1], [2, marker, 1, marker, 2],
                  [6, 3, marker, marker, 3], [1, 2, 9, marker, 4]])
    X_true_indicator = np.array([[1., 0., 0., 1.], [0., 1., 0., 1.],
                                 [0., 0., 1., 1.], [0., 0., 0., 1.]])
    imputer = imputer_constructor(missing_values=marker, add_indicator=True)

    X_trans = imputer.fit(X).transform(X)
    # This test targets the indicator output,
    # which is why only the last 4 columns are inspected.
    assert_allclose(X_trans[:, -4:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
Example #7
def test_axpy(dtype):
    axpy = _axpy_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    alpha = 2.5

    expected = alpha * x + y
    axpy(alpha, x, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])
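Here and in the _ger_memview, _gemv_memview, _gemm_memview and _rot_memview examples below, the looked-up callables appear to be test wrappers around scikit-learn's private Cython BLAS bindings, keyed by dtype. axpy is the BLAS level-1 update y <- alpha * x + y, which is why the call mutates y in place and is checked against the NumPy expression computed beforehand.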
Example #8
def test_k_means_empty_cluster_relocated():
    # check that empty clusters are correctly relocated when using sample
    # weights (#13486)
    X = np.array([[-1], [1]])
    sample_weight = [1.9, 0.1]
    init = np.array([[-1], [10]])

    km = KMeans(n_clusters=2, init=init, n_init=1)
    km.fit(X, sample_weight=sample_weight)

    assert len(set(km.labels_)) == 2
    assert_allclose(km.cluster_centers_, [[-1], [1]])
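Both points lie closer to the first initial center (-1) than to the second (10), so the second cluster starts out empty; the assertions verify that the empty-cluster relocation logic still ends up with two populated clusters, one per point.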
Example #9
def _test_ridge_loo(filter_):
    # test that it works with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with sample weights
    if filter_ == DENSE_FILTER:
        ridge_gcv.fit(filter_(X_diabetes),
                      y_diabetes,
                      sample_weight=np.ones(n_samples))
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred, rtol=1e-5)

    return ret
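All three custom scoring routes above reduce to negated mean squared error (greater_is_better=False negates the loss, the lambda negates it explicitly, and 'neg_mean_squared_error' is negated by construction), so each RidgeCV run should recover the same alpha_ as the _RidgeGCV leave-one-out baseline; the sample-weight check then confirms that uniform unit weights leave the selection unchanged.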
Example #10
def test_iterative_imputer_rank_one():
    rng = np.random.RandomState(0)
    d = 50
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.02)
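X = A.dot(B) has rank one by construction, so even with about half of its entries masked at random, the per-feature round-robin regressions inside IterativeImputer have enough structure to reconstruct the matrix to the stated tolerance.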
Example #11
def test_fused_types_consistency(make_dataset_32, make_dataset_64):
    dataset_32, dataset_64 = make_dataset_32(), make_dataset_64()
    NUMBER_OF_RUNS = 5
    for _ in range(NUMBER_OF_RUNS):
        # next sample
        (xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
        (xi_data64, _, _), yi64, _, _ = dataset_64._next_py()

        assert xi_data32.dtype == np.float32
        assert xi_data64.dtype == np.float64

        assert_allclose(xi_data64, xi_data32, rtol=1e-5)
        assert_allclose(yi64, yi32, rtol=1e-5)
Example #12
def test_ger(dtype, order):
    ger = _ger_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    A = np.asarray(rng.random_sample((10, 20)).astype(dtype, copy=False),
                   order=ORDER[order])
    alpha = 2.5

    expected = alpha * np.outer(x, y) + A
    ger(alpha, x, y, A)

    assert_allclose(A, expected, rtol=RTOL[dtype])
Example #13
def test_pca_inverse(svd_solver, whiten):
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # sanity check that we can recover the original data from the
    # transformed signal (since the data is almost of rank n_components)
    pca = PCA(n_components=2, svd_solver=svd_solver, whiten=whiten).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_allclose(X, Y_inverse, rtol=5e-6)
Example #14
def test_pca_singular_values_consistency(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full', random_state=rng)
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)

    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.singular_values_,
                    pca_other.singular_values_,
                    rtol=5e-3)
Example #15
def test_euclidean_distances_with_norms(dtype, y_array_constr):
    # check that we still get the right answers with {X,Y}_norm_squared
    # and that we get a wrong answer with wrong {X,Y}_norm_squared
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 10)).astype(dtype, copy=False)
    Y = rng.random_sample((20, 10)).astype(dtype, copy=False)

    # norms will only be used if their dtype is float64
    X_norm_sq = (X.astype(np.float64)**2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y.astype(np.float64)**2).sum(axis=1).reshape(1, -1)

    Y = y_array_constr(Y)

    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X,
                             Y,
                             X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_allclose(D2, D1)
    assert_allclose(D3, D1)
    assert_allclose(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    wrong_D = euclidean_distances(X,
                                  Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    with pytest.raises(AssertionError):
        assert_allclose(wrong_D, D1)
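Why the zeroed norms give a wrong answer: euclidean_distances expands the squared distances as ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2 and substitutes whatever norms are passed in. A self-contained sketch of that identity (illustrative, separate from the test above):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X, Y = rng.rand(5, 3), rng.rand(6, 3)
# quadratic expansion of the squared distances
G = (X ** 2).sum(axis=1)[:, None] - 2 * X.dot(Y.T) + (Y ** 2).sum(axis=1)[None, :]
np.testing.assert_allclose(euclidean_distances(X, Y),
                           np.sqrt(np.maximum(G, 0)), rtol=1e-6)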
Example #16
def test_pairwise_distances_chunked_reduce():
    rng = np.random.RandomState(0)
    X = rng.random_sample((400, 4))
    # Reduced Euclidean distance
    S = pairwise_distances(X)[:, :100]
    S_chunks = pairwise_distances_chunked(X,
                                          None,
                                          reduce_func=_reduce_func,
                                          working_memory=2**-16)
    assert isinstance(S_chunks, GeneratorType)
    S_chunks = list(S_chunks)
    assert len(S_chunks) > 1
    # atol accounts for the diagonal, where S is explicitly zeroed
    assert_allclose(np.vstack(S_chunks), S, atol=1e-7)
Example #17
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel,
                        bandwidth=bandwidth,
                        atol=atol,
                        rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens),
                    dens_true,
                    atol=atol,
                    rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol,
                    rtol=max(1E-7, rtol))
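KernelDensity.score(Y) returns the total log-likelihood, i.e. the sum of score_samples(Y), so exponentiating it yields the product of the pointwise densities, which is what the second assertion compares to np.prod(dens_true).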
Example #18
def test_gemv(dtype, opA, transA, order):
    gemv = _gemv_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    A = np.asarray(opA(rng.random_sample((20, 10)).astype(dtype, copy=False)),
                   order=ORDER[order])
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(x) + beta * y
    gemv(transA, alpha, A, x, beta, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])
Example #19
def test_kernel_pca_deterministic_output():
    rng = np.random.RandomState(0)
    X = rng.rand(10, 10)
    eigen_solver = ('arpack', 'dense')

    for solver in eigen_solver:
        transformed_X = np.zeros((20, 2))
        for i in range(20):
            kpca = KernelPCA(n_components=2,
                             eigen_solver=solver,
                             random_state=rng)
            transformed_X[i, :] = kpca.fit_transform(X)[0]
        assert_allclose(transformed_X,
                        np.tile(transformed_X[0, :], 20).reshape(20, 2))
Example #20
def test_explained_variance_components_10_20(X_sparse, kind, solver):
    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
    svd_10 = TruncatedSVD(10, algorithm=solver, n_iter=10).fit(X)
    svd_20 = TruncatedSVD(20, algorithm=solver, n_iter=10).fit(X)

    # Assert the first 10 components are equal
    assert_allclose(
        svd_10.explained_variance_ratio_,
        svd_20.explained_variance_ratio_[:10],
        rtol=5e-3,
    )

    # Assert that 20 components capture more explained variance than 10
    assert (svd_20.explained_variance_ratio_.sum() >
            svd_10.explained_variance_ratio_.sum())
Example #21
def test_euclidean_distances_upcast_sym(batch_size, x_array_constr):
    # check batches handling when X is Y (#13910)
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10)).astype(np.float32)
    X[X < 0.8] = 0

    expected = squareform(pdist(X))

    X = x_array_constr(X)
    distances = _euclidean_distances_upcast(X, Y=X, batch_size=batch_size)
    distances = np.sqrt(np.maximum(distances, 0))

    # the default rtol=1e-7 is too close to the float32 precision
    # and fails due to rounding errors.
    assert_allclose(distances, expected, rtol=1e-6)
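In float32, the quadratic expansion used for Euclidean distances suffers catastrophic cancellation, so scikit-learn upcasts manageable chunks to float64 before accumulating; this test confirms the chunked path agrees with scipy's pdist for any batch_size, even when Y is X itself.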
Example #22
def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)
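The two sign flips before the final comparison are needed because principal components are only defined up to sign: multiplying each projection by the sign of its first row puts the PCA and SparsePCA outputs in the same orientation before asserting they match.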
Example #23
def test_pipeline_score_samples_pca_lof():
    iris = load_iris()
    X = iris.data
    # Test that the score_samples method is implemented on a pipeline.
    # Test that the score_samples method on pipeline yields same results as
    # applying transform and score_samples steps separately.
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([('pca', pca), ('lof', lof)])
    pipe.fit(X)
    # Check the shapes
    assert pipe.score_samples(X).shape == (X.shape[0],)
    # Check the values
    lof.fit(pca.fit_transform(X))
    assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
Example #24
def test_euclidean_distances_sym(dtype, x_array_constr):
    # check that euclidean distances gives same result as scipy pdist
    # when only X is provided
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10)).astype(dtype, copy=False)
    X[X < 0.8] = 0

    expected = squareform(pdist(X))

    X = x_array_constr(X)
    distances = euclidean_distances(X)

    # the default rtol=1e-7 is too close to the float32 precision
    # and fails due to rounding errors.
    assert_allclose(distances, expected, rtol=1e-6)
    assert distances.dtype == dtype
Example #25
def test_compute_covariance(shape, uniform_weights):
    rng = np.random.RandomState(0)
    X = rng.randn(*shape)
    if uniform_weights:
        sw = np.ones(X.shape[0])
    else:
        sw = rng.chisquare(1, shape[0])
    sqrt_sw = np.sqrt(sw)
    X_mean = np.average(X, axis=0, weights=sw)
    X_centered = (X - X_mean) * sqrt_sw[:, None]
    true_covariance = X_centered.T.dot(X_centered)
    X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
    gcv = _RidgeGCV(fit_intercept=True)
    computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw)
    assert_allclose(X_mean, computed_mean)
    assert_allclose(true_covariance, computed_cov)
Example #26
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    X_sparse = arr_type([[np.nan, 1, 5], [2, np.nan, 1], [6, 3, np.nan],
                         [1, 2, 9]])
    X_true = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    X_trans = imputer.fit_transform(X_sparse)

    assert sparse.issparse(X_trans)
    assert X_trans.shape == X_true.shape
    assert_allclose(X_trans.toarray(), X_true)
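With SimpleImputer's default strategy='mean', the missing entries in the first three output columns are replaced by the per-column means of the observed values (e.g. mean(2, 6, 1) = 3 in column 0), and the last three columns are the binary missing-value indicators appended by add_indicator=True.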
Example #27
def test_gemm(dtype, opA, transA, opB, transB, order):
    gemm = _gemm_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    A = np.asarray(opA(rng.random_sample((30, 10)).astype(dtype, copy=False)),
                   order=ORDER[order])
    B = np.asarray(opB(rng.random_sample((10, 20)).astype(dtype, copy=False)),
                   order=ORDER[order])
    C = np.asarray(rng.random_sample((30, 20)).astype(dtype, copy=False),
                   order=ORDER[order])
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(opB(B)) + beta * C
    gemm(transA, transB, alpha, A, B, beta, C)

    assert_allclose(C, expected, rtol=RTOL[dtype])
Example #28
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
    # Test that linear regression agrees between sparse and dense
    rng = check_random_state(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.
    Xcsr = sparse.csr_matrix(X)
    y = rng.rand(n_samples)
    params = dict(normalize=normalize, fit_intercept=fit_intercept)
    clf_dense = LinearRegression(**params)
    clf_sparse = LinearRegression(**params)
    clf_dense.fit(X, y)
    clf_sparse.fit(Xcsr, y)
    assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_)
    assert_allclose(clf_dense.coef_, clf_sparse.coef_)
Example #29
def test_rot(dtype):
    rot = _rot_memview[NUMPY_TO_CYTHON[dtype]]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    c = dtype(rng.randn())
    s = dtype(rng.randn())

    expected_x = c * x + s * y
    expected_y = c * y - s * x

    rot(x, y, c, s)

    assert_allclose(x, expected_x)
    assert_allclose(y, expected_y)
Example #30
def check_pca_int_dtype_upcast_to_double(svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
    X_i64 = X_i64.astype(np.int64, copy=False)
    X_i32 = X_i64.astype(np.int32, copy=False)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(X_i64).dtype == np.float64
    assert pca_32.transform(X_i32).dtype == np.float64

    assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4)