Beispiel #1
0
def test_randomized_svd_sign_flip_with_transpose():
    # Check if the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        returns bool tuple indicating if the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert u_based
    assert not v_based

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True)
    u_based, v_based = max_loading_is_positive(u_flipped_with_transpose,
                                               v_flipped_with_transpose)
    assert u_based
    assert not v_based
Beispiel #2
0
def test_randomized_svd_sign_flip():
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        assert_almost_equal(u1, u2)
        assert_almost_equal(v1, v2)
        assert_almost_equal(np.dot(u2 * s2, v2), a)
        assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
        assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
Beispiel #3
0
def svd_timing(X,
               n_comps,
               n_iter,
               n_oversamples,
               power_iteration_normalizer='auto',
               method=None):
    """
    Measure time for decomposition
    """
    print("... running SVD ...")
    if method is not 'fbpca':
        gc.collect()
        t0 = time()
        U, mu, V = randomized_svd(X,
                                  n_comps,
                                  n_oversamples,
                                  n_iter,
                                  power_iteration_normalizer,
                                  random_state=random_state,
                                  transpose=False)
        call_time = time() - t0
    else:
        gc.collect()
        t0 = time()
        # There is a different convention for l here
        U, mu, V = fbpca.pca(X,
                             n_comps,
                             raw=True,
                             n_iter=n_iter,
                             l=n_oversamples + n_comps)
        call_time = time() - t0

    return U, mu, V, call_time
Beispiel #4
0
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalized='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, V = randomized_svd(X,
                             n_components,
                             n_iter=2,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_2 = linalg.norm(A, ord='fro')
    U, s, V = randomized_svd(X,
                             n_components,
                             n_iter=20,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_20 = linalg.norm(A, ord='fro')
    assert np.abs(error_2 - error_20) > 100

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, V = randomized_svd(X,
                                 n_components,
                                 n_iter=2,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, V = randomized_svd(X,
                                     n_components,
                                     n_iter=i,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            A = X - U.dot(np.diag(s).dot(V))
            error = linalg.norm(A, ord='fro')
            assert 15 > np.abs(error_2 - error)
Beispiel #5
0
def test_randomized_svd_transpose_consistency():
    # Check that transposing the design matrix has limited impact
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.5,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    U1, s1, V1 = randomized_svd(X,
                                k,
                                n_iter=3,
                                transpose=False,
                                random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0)
    U3, s3, V3 = randomized_svd(X,
                                k,
                                n_iter=3,
                                transpose='auto',
                                random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1),
                        np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2),
                        np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
Beispiel #6
0
def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X wity structure approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.1,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(X,
                                  k,
                                  n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.01

        # compute the singular values of X using the fast approximate
        # method with iterated power method
        _, sap, _ = randomized_svd(X,
                                   k,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)
Beispiel #7
0
def compute_bench(samples_range, features_range, n_iter=3, rank=50):

    it = 0

    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples,
                                     n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            gc.collect()
            print("benchmarking scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=0")
            tstart = time()
            randomized_svd(X, rank, n_iter=0)
            results['scikit-learn randomized_svd (n_iter=0)'].append(time() -
                                                                     tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=%d " %
                  n_iter)
            tstart = time()
            randomized_svd(X, rank, n_iter=n_iter)
            results['scikit-learn randomized_svd (n_iter=%d)' %
                    n_iter].append(time() - tstart)

    return results
Beispiel #8
0
def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=1.0,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method
        _, sa, _ = randomized_svd(X,
                                  k,
                                  n_iter=0,
                                  power_iteration_normalizer=normalizer)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.1

        # compute the singular values of X using the fast approximate method
        # with iterated power method
        _, sap, _ = randomized_svd(X,
                                   k,
                                   n_iter=5,
                                   power_iteration_normalizer=normalizer)

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)
Beispiel #9
0
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(X,
                                    k,
                                    power_iteration_normalizer=normalizer,
                                    random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k, )
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]),
                            np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)