Example No. 1
def test_inverse_transform(algo, X_sparse):
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    Xt = tsvd.fit_transform(X_sparse)
    Xinv = tsvd.inverse_transform(Xt)
    assert_allclose(Xinv, X_sparse.toarray(), rtol=1e-1, atol=2e-1)
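A note on the check above: TruncatedSVD.inverse_transform is simply the matrix product of the reduced data with the fitted components, which is why a near-full number of components is needed for an "almost equal" reconstruction. A minimal standalone sketch of that identity, using scikit-learn directly and a synthetic stand-in for the X_sparse fixture:

import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(42)
X_demo = sparse.random(60, 55, density=0.3, random_state=rng, format="csr")

tsvd = TruncatedSVD(n_components=5, random_state=42)
Xt = tsvd.fit_transform(X_demo)

# inverse_transform(Xt) is the same computation as Xt @ components_
np.testing.assert_allclose(tsvd.inverse_transform(Xt), Xt @ tsvd.components_)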
Example No. 2
def test_sparse_formats(fmt, X_sparse):
    n_samples = X_sparse.shape[0]
    Xfmt = (X_sparse.toarray() if fmt == "dense"
            else getattr(X_sparse, "to" + fmt)())
    tsvd = TruncatedSVD(n_components=11)
    Xtrans = tsvd.fit_transform(Xfmt)
    assert Xtrans.shape == (n_samples, 11)
    Xtrans = tsvd.transform(Xfmt)
    assert Xtrans.shape == (n_samples, 11)
Example No. 3
def test_explained_variance_components_10_20(X_sparse, kind, solver):
    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
    svd_10 = TruncatedSVD(10, algorithm=solver, n_iter=10).fit(X)
    svd_20 = TruncatedSVD(20, algorithm=solver, n_iter=10).fit(X)

    # Assert that the explained variance ratios of the first 10 components agree
    assert_allclose(
        svd_10.explained_variance_ratio_,
        svd_20.explained_variance_ratio_[:10],
        rtol=5e-3,
    )

    # Assert that 20 components explain more total variance than 10
    assert (svd_20.explained_variance_ratio_.sum() >
            svd_10.explained_variance_ratio_.sum())
Example No. 4
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert X_transformed.shape == (X.shape[0], 3)

    # check that the stacked output matches the individual transformers
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1]

    # test setting parameters
    fs.set_params(select__k=2)
    assert fs.fit_transform(X, y).shape == (X.shape[0], 4)

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert X_transformed.shape == (X.shape[0], 8)

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
Example No. 5
def test_truncated_svd_eq_pca(X_sparse):
    # TruncatedSVD should be equal to PCA on centered data

    X_dense = X_sparse.toarray()

    X_c = X_dense - X_dense.mean(axis=0)

    params = dict(n_components=10, random_state=42)

    svd = TruncatedSVD(algorithm='arpack', **params)
    pca = PCA(svd_solver='arpack', **params)

    Xt_svd = svd.fit_transform(X_c)
    Xt_pca = pca.fit_transform(X_c)

    assert_allclose(Xt_svd, Xt_pca, rtol=1e-9)
    assert_allclose(pca.mean_, 0, atol=1e-9)
    assert_allclose(svd.components_, pca.components_)
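The equality above holds because the data are centered beforehand: PCA always subtracts the column means before its SVD, while TruncatedSVD works on the data as given. A small sketch of how the two diverge on uncentered data (scikit-learn assumed, synthetic data):

import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD

rng = np.random.RandomState(42)
X = rng.randn(50, 20) + 5.0  # deliberately far from zero mean

svd_scores = TruncatedSVD(n_components=3, random_state=0).fit_transform(X)
pca_scores = PCA(n_components=3, random_state=0).fit_transform(X)

# Without centering, the leading TruncatedSVD direction mostly captures the mean
# offset, so the two score matrices no longer agree (even up to sign).
print(np.allclose(np.abs(svd_scores), np.abs(pca_scores), atol=1e-6))  # False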
Example No. 6
def test_singular_values_expected(solver):
    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = rng.randn(n_samples, n_features)

    pca = TruncatedSVD(n_components=3, algorithm=solver, random_state=rng)
    X_pca = pca.fit_transform(X)

    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat_pca = np.dot(X_pca, pca.components_)
    pca.fit(X_hat_pca)
    assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0], rtol=1e-14)
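The rescale-and-refit trick above works because the scores returned by fit_transform have mutually orthogonal columns, so after renormalizing them X_hat_pca is an exact SVD factorization whose singular values are exactly the column norms chosen above. A small numpy check of that identity on synthetic orthonormal factors (not part of the test suite):

import numpy as np

rng = np.random.RandomState(0)
# Orthonormal rows standing in for pca.components_ (3 components, 10 features)
components = np.linalg.qr(rng.randn(10, 3))[0].T
# Orthonormal score columns, rescaled to the target singular values
scores = np.linalg.qr(rng.randn(100, 3))[0] * np.array([3.142, 2.718, 1.0])

X_hat = scores @ components
np.testing.assert_allclose(np.linalg.svd(X_hat, compute_uv=False)[:3],
                           [3.142, 2.718, 1.0])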
Example No. 7
def test_explained_variance(X_sparse, kind, n_components, solver):
    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
    svd = TruncatedSVD(n_components, algorithm=solver)
    X_tr = svd.fit_transform(X)
    # Assert that all the values are greater than 0
    assert_array_less(0.0, svd.explained_variance_ratio_)

    # Assert that total explained variance is less than 1
    assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)

    # Test that explained_variance is correct
    total_variance = np.var(X_sparse.toarray(), axis=0).sum()
    variances = np.var(X_tr, axis=0)
    true_explained_variance_ratio = variances / total_variance

    assert_allclose(
        svd.explained_variance_ratio_,
        true_explained_variance_ratio,
    )
Example No. 8
def test_singular_values_consistency(solver):
    # Check that the TruncatedSVD output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca = TruncatedSVD(n_components=2, algorithm=solver,
                       random_state=rng).fit(X)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    assert_allclose(np.sum(pca.singular_values_**2.0),
                    np.linalg.norm(X_pca, "fro")**2.0,
                    rtol=1e-2)

    # Compare to the 2-norms of the score vectors
    assert_allclose(pca.singular_values_,
                    np.sqrt(np.sum(X_pca**2.0, axis=0)),
                    rtol=1e-2)
Example No. 9
def test_random_hasher():
    # Test random forest hashing on the circles dataset: make sure it is
    # linearly separable even after being projected to two SVD dimensions.
    # Note: not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())

    # one active leaf per data point per tree, so each row sums to n_estimators
    assert X_transformed.shape[0] == X.shape[0]
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert linear_clf.score(X_reduced, y) == 1.
Example No. 10
def test_solvers(X_sparse, solver, kind):
    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
    svd_a = TruncatedSVD(30, algorithm="arpack")
    svd = TruncatedSVD(30, algorithm=solver, random_state=42)

    Xa = svd_a.fit_transform(X)[:, :6]
    Xr = svd.fit_transform(X)[:, :6]
    assert_allclose(Xa, Xr, rtol=2e-3)

    comp_a = np.abs(svd_a.components_)
    comp = np.abs(svd.components_)
    # All elements are equal, but some elements are more equal than others.
    assert_allclose(comp_a[:9], comp[:9], rtol=1e-3)
    assert_allclose(comp_a[9:], comp[9:], atol=1e-2)
Example No. 11
def test_integers(X_sparse):
    n_samples = X_sparse.shape[0]
    Xint = X_sparse.astype(np.int64)
    tsvd = TruncatedSVD(n_components=6)
    Xtrans = tsvd.fit_transform(Xint)
    assert Xtrans.shape == (n_samples, tsvd.n_components)
Example No. 12
def test_too_many_components(algorithm, X_sparse):
    n_features = X_sparse.shape[1]
    for n_components in (n_features, n_features + 1):
        tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm)
        with pytest.raises(ValueError):
            tsvd.fit(X_sparse)
Example No. 13
def test_attributes(n_components, X_sparse):
    n_features = X_sparse.shape[1]
    tsvd = TruncatedSVD(n_components).fit(X_sparse)
    assert tsvd.n_components == n_components
    assert tsvd.components_.shape == (n_components, n_features)
Example No. 14
        # Extract the subject & body
        ('subjectbody', SubjectBodyExtractor()),

        # Use ColumnTransformer to combine the features from subject and body
        (
            'union',
            ColumnTransformer(
                [
                    # Pulling features from the post's subject line (first column)
                    ('subject', TfidfVectorizer(min_df=50), 0),

                    # Pipeline for standard bag-of-words model for body (second column)
                    ('body_bow',
                     Pipeline([
                         ('tfidf', TfidfVectorizer()),
                         ('best', TruncatedSVD(n_components=50)),
                     ]), 1),

                    # Pipeline for pulling ad hoc features from post's body
                    ('body_stats',
                     Pipeline([
                         ('stats', TextStats()),  # returns a list of dicts
                         ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                     ]), 1),
                ],

                # weight components in ColumnTransformer
                transformer_weights={
Example No. 15
import matplotlib.pyplot as plt

from mrex.datasets import make_circles
from mrex.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from mrex.decomposition import TruncatedSVD
from mrex.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result after dimensionality reduction using truncated SVD
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)

# scatter plot of original and reduced data
fig = plt.figure(figsize=(9, 8))

ax = plt.subplot(221)
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k')
Example No. 16
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

# #############################################################################
# Do the actual clustering
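The excerpt stops just before the clustering itself. A hedged sketch of that step, assuming scikit-learn's KMeans and a hypothetical cluster count true_k (the full script would derive it from the dataset's target labels):

import numpy as np
from sklearn.cluster import KMeans

true_k = 4  # hypothetical value for illustration

km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(X)  # X is the LSA-reduced, normalized matrix from above
print("Cluster sizes:", np.bincount(km.labels_))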