Example #1
def test_fast_svd_infinite_rank():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without the low-rank component: just regularly but
    # slowly decreasing singular values, so the effective rank of the data
    # matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.1

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
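
The q parameter above is the number of power iterations. In current
scikit-learn releases fast_svd has been renamed randomized_svd and q has
become n_iter; a minimal sketch of the same check against that API (the
tolerance mirrors the decimal=3 assertion above):

import numpy as np
from scipy import linalg
from sklearn.datasets import make_low_rank_matrix
from sklearn.utils.extmath import randomized_svd

X = make_low_rank_matrix(n_samples=100, n_features=500, effective_rank=5,
                         tail_strength=1.0, random_state=0)
_, s, _ = linalg.svd(X, full_matrices=False)
# five power iterations recover the top singular values despite the slow decay
_, sa, _ = randomized_svd(X, 10, n_iter=5, random_state=0)
assert np.abs(s[:10] - sa).max() < 1.5e-3  # same tolerance as decimal=3
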
Example #2
def test_fast_svd_transpose_consistency():
    """Check that transposing the design matrix has limit impact"""
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    U1, s1, V1 = fast_svd(X, k, q=3, transpose=False, random_state=0)
    U2, s2, V2 = fast_svd(X, k, q=3, transpose=True, random_state=0)
    U3, s3, V3 = fast_svd(X, k, q=3, transpose='auto', random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1),
                        np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2),
                        np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
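
For a wide matrix like this one (100 x 500), working on the transpose lets
the algorithm build its orthonormal basis on the smaller side. A hedged
sketch of the rule 'auto' applies (in scikit-learn it transposes whenever
the matrix has more columns than rows, which is why s2 and s3 agree above):

def should_transpose(n_samples, n_features):
    # transpose wide matrices so the range finder works on the small dimension
    return n_samples < n_features

assert should_transpose(100, 500)  # here 'auto' behaves like transpose=True
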
Example #3
def test_fast_svd_low_rank():
    """Check that extmath.fast_svd is consistent with linalg.svd"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_equal(Ua.shape, (n_samples, k))
    assert_equal(sa.shape, (k,))
    assert_equal(Va.shape, (k, n_features))

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_almost_equal(s[:rank], sa[:rank])
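
The test compares the products np.dot(U, V) rather than the factors
themselves because an SVD is only determined up to a simultaneous sign flip
of matching left and right singular vectors. A tiny self-contained
illustration:

import numpy as np

A = np.diag([3.0, 2.0, 1.0])
U, s, Vt = np.linalg.svd(A)
# flipping the signs of U and Vt together yields an equally valid SVD, so
# only the product is stable across implementations
U2, Vt2 = -U, -Vt
assert np.allclose(U @ np.diag(s) @ Vt, U2 @ np.diag(s) @ Vt2)
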
Example #4
def compute_bench(samples_range, features_range, q=3, rank=50):

    it = 0

    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples, n_features,
                                     effective_rank=rank, tail_strength=0.2)

            gc.collect()
            print "benching scipy svd: "
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print "benching scikit-learn fast_svd: q=0"
            tstart = time()
            fast_svd(X, rank, q=0)
            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)

            gc.collect()
            print "benching scikit-learn fast_svd: q=%d " % q
            tstart = time()
            fast_svd(X, rank, q=q)
            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)

    return results
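
A hypothetical driver for the benchmark above, spelling out the imports
compute_bench assumes are already in scope (fast_svd matches the era of this
snippet; in current scikit-learn use randomized_svd with n_iter instead of q):

import gc
from collections import defaultdict
from time import time

from scipy.linalg import svd
from sklearn.datasets import make_low_rank_matrix
from sklearn.utils.extmath import fast_svd  # randomized_svd in newer releases

if __name__ == '__main__':
    results = compute_bench(samples_range=[200, 400], features_range=[200, 400])
    for name, timings in sorted(results.items()):
        print(name, ['%0.3fs' % t for t in timings])
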
Example #5
def test_fast_svd_low_rank():
    """Check that extmath.fast_svd is consistent with linalg.svd"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0, random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_equal(Ua.shape, (n_samples, k))
    assert_equal(sa.shape, (k,))
    assert_equal(Va.shape, (k, n_features))

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_almost_equal(s[:rank], sa[:rank])
Example #6
def test_fast_svd_low_rank_with_noise():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and a
    # significant noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = fast_svd(X, k, n_iterations=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.05

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, n_iterations=5)

    # the iterated power method is helping getting rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
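
What the iterated power method buys can be seen in a toy pure-NumPy version
of the randomized SVD (a sketch in the spirit of Halko et al., not the
library implementation; large n_iterations would additionally need
re-orthonormalization between steps for numerical stability):

import numpy as np

def toy_randomized_svd(A, k, n_iterations=0, random_state=0):
    rng = np.random.RandomState(random_state)
    Y = A @ rng.normal(size=(A.shape[1], k))  # random sketch of the range of A
    for _ in range(n_iterations):
        Y = A @ (A.T @ Y)  # power iteration: sharpens the spectral decay
    Q, _ = np.linalg.qr(Y)  # orthonormal basis for the sketched range
    B = Q.T @ A  # small (k x n_features) projection of A
    Ub, s, Vt = np.linalg.svd(B, full_matrices=False)
    return Q @ Ub, s, Vt
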
Example #7
def test_fast_svd_infinite_rank():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without the low-rank component: just regularly but
    # slowly decreasing singular values, so the effective rank of the data
    # matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = fast_svd(X, k, n_iterations=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.1

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, n_iterations=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
Example #8
def test_fast_svd_transpose_consistency():
    """Check that transposing the design matrix has limit impact"""
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    U1, s1, V1 = fast_svd(X, k, n_iterations=3, transpose=False, random_state=0)
    U2, s2, V2 = fast_svd(X, k, n_iterations=3, transpose=True, random_state=0)
    U3, s3, V3 = fast_svd(X, k, n_iterations=3, transpose='auto', random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
Example #9
def test_fast_svd_low_rank_with_noise():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and a
    # significant noisy component
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.05

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method is helping getting rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
Example #10
    def matrixsvd(self):
        svd_matrix = self.projection_matrix.tocsc()

        if self.svd == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(
                svd_matrix.tocsc(),
                k=(int(self.projection_matrix.tocsr().shape[0] *
                       self.precision) // 100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        elif self.svd == 'sparsesvd':
            (UT, S,
             VT) = sparsesvd(svd_matrix,
                             (int(svd_matrix.shape[0] * self.precision) // 100))

        elif self.svd == 'fast':
            Utemp, Stemp, VTtemp = fast_svd(svd_matrix, (
                int(self.projection_matrix.tocsr().shape[0] * self.precision) //
                100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        else:
            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        return UT, S, VT
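
Note that sparsesvd returns the left factor already transposed (hence the
name UT), which is why the other branches transpose Utemp to match. A small
illustration of the reconstruction convention, with made-up shapes:

import numpy as np

k, m, n = 2, 3, 4
UT = np.eye(k, m)  # (k, m), as returned by sparsesvd
S = np.array([3.0, 2.0])  # (k,)
VT = np.eye(k, n)  # (k, n)
approx = np.dot(UT.T * S, VT)  # same as UT.T @ np.diag(S) @ VT
assert approx.shape == (m, n)  # rank-k approximation of the original matrix
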
Example #11
    def matrixsvd(self):
        svd_matrix = self.projection_matrix.tocsc()

        if self.svd == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(svd_matrix.tocsc(),
                    k=(int(self.projection_matrix.tocsr().shape[0] *
                        self.precision) // 100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        elif self.svd == 'sparsesvd':
            (UT, S, VT) = sparsesvd(svd_matrix, (int(svd_matrix.shape[0] * self.precision) // 100))

        elif self.svd == 'fast':
            Utemp, Stemp, VTtemp = fast_svd(svd_matrix,
                    (int(self.projection_matrix.tocsr().shape[0] *
                        self.precision) // 100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        else:
            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        return UT, S, VT
Example #12
def compute_bench(samples_range, features_range, q=3, rank=50):

    it = 0

    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples,
                                     n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            gc.collect()
            print "benching scipy svd: "
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print "benching scikit-learn fast_svd: q=0"
            tstart = time()
            fast_svd(X, rank, q=0)
            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)

            gc.collect()
            print "benching scikit-learn fast_svd: q=%d " % q
            tstart = time()
            fast_svd(X, rank, q=q)
            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)

    return results
Example #13
def scree_plots(t, ndim=None):
    """ Performs scree tests for each dimension of the input tensor

    Parameters
    ----------
    t : numpy array
        Higher-order tensor with size (I1 , I2 , ... , IN) along N dimensions.
    ndim : list of N integers, optional
        Number of components to calculate along each dimension. If not given,
        the full size along each dimension is used.

    Returns
    -------

    scree : list of N numpy arrays
        One array of size ndim[i] per dimension, holding the singular values
        of the mode-i unfolding.

    """
    total_dim = len(t.shape)
    if ndim is None or len(ndim) != total_dim:
        # no (or inconsistently sized) ndim given: use the full size of each
        # dimension
        ndim = list(t.shape)
    else:
        # clip each requested component count to the size of its dimension
        ndim = [min(n, s) for n, s in zip(ndim, t.shape)]

    scree = []
    for i in range(total_dim):
        t_unfold = unfold_axis(t, i)
        [_, e, _] = fast_svd(np.matmul(t_unfold, np.transpose(t_unfold)),
                             ndim[i],
                             n_iter=15)
        e = np.sqrt(e)
        e = np.real(e)
        scree.append(e)

    return scree
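
A hypothetical usage sketch (it assumes unfold_axis and fast_svd are
importable alongside scree_plots, and that fast_svd here is a randomized SVD
accepting an n_iter keyword, as the call above implies):

import numpy as np

t = np.random.rand(10, 12, 14)  # toy 3-way tensor
scree = scree_plots(t, ndim=[5, 5, 5])  # 5 components per mode
for mode, e in enumerate(scree):
    print('mode %d singular values: %s' % (mode, np.round(e, 3)))
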
Example #14
    def matrixsvd(self):
        svd_matrix = self.projection_matrix.tocsc()
        svd_dict = {}
        result_list = []

        if self.svd == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(
                svd_matrix.tocsc(),
                k=(int(self.projection_matrix.tocsr().shape[0] *
                       self.precision) // 100))

            U = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        elif self.svd == 'sparsesvd':
            (U, S,
             VT) = sparsesvd(svd_matrix,
                             (int(svd_matrix.shape[0] * self.precision) // 100))

        elif self.svd == 'fast':

            Utemp, Stemp, VTtemp = fast_svd(svd_matrix, (
                int(self.projection_matrix.tocsr().shape[0] * self.precision) //
                100))

            U = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        else:

            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())

            U = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        rank = U.shape[0]

        #for k in cfor(1, lambda i: i <= rank, lambda i: i + 25):
        k = 1
        while k <= rank:

            ut = U[:k]
            s = S[:k]
            vt = VT[:k]
            matrix_u = ss.csr_matrix(ut.T)
            matrix_s = ss.csr_matrix(np.diag(s))
            matrix_vt = ss.csr_matrix(vt)

            temp_matrix = spmatrixmul(self.main_matrix, matrix_u)
            temp_matrix_a = spmatrixmul(matrix_s, matrix_vt)
            temp_matrix_b = spmatrixmul(temp_matrix_a,
                                        self.transpose_matrix.tocsr())
            matrix_result = spmatrixmul(temp_matrix, temp_matrix_b)
            del temp_matrix, temp_matrix_a, temp_matrix_b

            result_list.append(matrix_result)
            difference = (matrix_result - self.truth_matrix)
            fresult = self.fnorm(difference)
            svd_dict[k] = fresult
            print('k = ', k, 'fresult = ', fresult)
            del matrix_result, fresult, difference

            if k == 1:
                k += (self.step - 1)
            else:
                k += self.step

        return svd_dict, result_list, U, S, VT
Example #15
def svd_HO(data, rank, max_iter=10):
    """ HOOI method to decompose high order tensor with given ranks

    Parameters
    ----------
    data : numpy array
        Higher-order noisy tensor with size (I1 , I2 , ... , IN) along N dimensions. N >1.
    rank : numpy array
        Integer array (R1, R2, R3, ... , Rn) describes the eigenvectors for each dimension
    max_iter: integer
        Number of iterations for higher-order orthogonal iteration (HOOI) algorithm

    Returns
    -------

    X: numpy array
        Denoised tensor with low rank and the same size of input data.
    U: list of numpy array
        List of orthogonal matrix for each dimension, each component in the list has size Ik x Rk.
    S: numpy array
        Core tensor of the low rank tensor X with size (R1, R2, R3, ... , Rn).
    """
    svd_iter = 10
    data_shape = np.shape(data)  # p0

    # Check that number of dimensions match the number of rank numbers
    if len(data_shape) != len(rank):
        print("The rank should be the same size as the data shape")
        return data, [], []

    # Check that, for each rank, the product of all the other ranks is larger
    # than this rank
    for k in range(len(rank)):
        prod = 1
        for i in range(len(rank)):
            if i != k:
                prod = prod * rank[i]
        if rank[k] > prod:
            print("The rank does not satisfy requirment of HOOI.")
            return data, [], []

    dimensions = len(data_shape)  # d
    # indices from the smallest dimension to the largest; the initialization
    # starts from the smallest size
    ordered_indexes = np.argsort(data_shape)

    ## Initialize U and Y with SVD
    # list holding one U matrix per dimension, filled in below
    U = [None] * dimensions
    X = data
    for k in ordered_indexes:  # calculating initial SVD
        # unfolding from the axis with the minimum size
        unfolded = unfold_axis(X, k)
        [U[k], _, _] = fast_svd(unfolded, rank[k], n_iter=svd_iter)
        X = ttm(X, np.transpose(U[k]), k)  # This needs to be fixed!

    ## Update U with HOOI
    iter_count = 0
    while iter_count < max_iter:
        iter_count += 1
        for k in range(0, dimensions):
            Y = data
            minus_k = list(range(0, dimensions))
            minus_k.remove(k)  # every mode except k
            for j in minus_k:
                Y = ttm(Y, np.transpose(U[j]), j)
            MY = unfold_axis(Y, k)
            [U[k], _, _] = fast_svd(MY, rank[k], n_iter=svd_iter)

    ## Use the determined U matrices to calculate core tensor and denoised tensor
    X = data
    for k in ordered_indexes:
        X = ttm(X, np.transpose(U[k]), k)  # Check this part.
    S = X  # core tensor
    for k in range(0, dimensions):
        X = ttm(X, U[k], k)

    return X, U, S
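
A hypothetical usage sketch for svd_HO (it assumes ttm, unfold_axis and
fast_svd come from the same module; the sizes and ranks are illustrative):

import numpy as np

data = np.random.rand(20, 22, 24)  # noisy 3-way tensor
X, U, S = svd_HO(data, rank=[3, 4, 5], max_iter=10)
print(X.shape)  # (20, 22, 24): denoised tensor
print([u.shape for u in U])  # [(20, 3), (22, 4), (24, 5)]
print(S.shape)  # (3, 4, 5): core tensor
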
Example #16
    def matrixsvd(self):
        svd_matrix = self.projection_matrix.tocsc()
        svd_dict = {}
        result_list = []

        if self.svd == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(svd_matrix.tocsc(),
                    k=(int(self.projection_matrix.tocsr().shape[0] *
                        self.precision) // 100))

            U = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        elif self.svd == 'sparsesvd':
            (U, S, VT) = sparsesvd(svd_matrix, (int(svd_matrix.shape[0] * self.precision) // 100))

        elif self.svd == 'fast':

            Utemp, Stemp, VTtemp = fast_svd(svd_matrix,
                    (int(self.projection_matrix.tocsr().shape[0] *
                        self.precision) // 100))

            U = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        else:

            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())

            U = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        rank = U.shape[0]

        #for k in cfor(1, lambda i: i <= rank, lambda i: i + 25):
        k = 1
        while k <= rank:

            ut = U[:k]
            s = S[:k]
            vt = VT[:k]
            matrix_u = ss.csr_matrix(ut.T)
            matrix_s = ss.csr_matrix(np.diag(s))
            matrix_vt = ss.csr_matrix(vt)

            temp_matrix = spmatrixmul(self.main_matrix, matrix_u)
            temp_matrix_a = spmatrixmul(matrix_s, matrix_vt)
            temp_matrix_b = spmatrixmul(temp_matrix_a, self.transpose_matrix.tocsr())
            matrix_result = spmatrixmul(temp_matrix, temp_matrix_b)
            del temp_matrix, temp_matrix_a, temp_matrix_b

            result_list.append(matrix_result)
            difference = (matrix_result - self.truth_matrix)
            fresult = self.fnorm(difference)
            svd_dict[k] = fresult
            print('k = ', k, 'fresult = ', fresult)
            del matrix_result, fresult, difference

            if k == 1:
                k += (self.step - 1)
            else:
                k += self.step

        return svd_dict, result_list, U, S, VT
        X[i, j] = 1.0
    del links
    print "Converting to CSR representation"
    X = X.tocsr()
    print "CSR conversion done"
    return X, redirects, index_map


# stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(
    redirects_filename, page_links_filename, limit=5000000)
names = dict((i, name) for name, i in index_map.items())

print("Computing the principal singular vectors using fast_svd")
t0 = time()
U, s, V = fast_svd(X, 5, q=3)
print("done in %0.3fs" % (time() - t0))

# print the names of the Wikipedia pages with the strongest components of the
# principal singular vectors, which should be similar to the pages ranked
# highest by the principal eigenvector
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])


def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank and the implementation
    is based on the one from the NetworkX project (BSD licensed too)
    with copyrights by: