Example #1
def get_bond_order(bofile, job_info, num_sv=4):
    metal_ind = job_info['metal_ind']
    natoms = job_info['natoms']
    dict_bondorder = OrderedDict()
    catoms = [metal_ind] + job_info['catoms']
    dict_patterns = {}
    for catom in catoms:
        dict_patterns[catom] = [metal_ind, catom]
    botext = list()
    with open(bofile, 'r') as fo:
        for line in fo:
            if "bond order list" in line:
                botext = list()
            else:
                botext.append(line)
    bo_mat = np.zeros(shape=(natoms, natoms))
    for line in botext:
        ll = line.split()
        row_idx, col_idx = int(ll[0]), int(ll[1])
        bo_mat[row_idx, col_idx] = float(ll[2])
        bo_mat[col_idx, row_idx] = float(ll[2])
    U, Sigma, VT = randomized_svd(bo_mat, n_components=num_sv, n_iter=20)
    sigma = Sigma.tolist()
    for sv in range(num_sv):
        dict_bondorder.update({'bo_sv%d' % sv: sigma[sv]})
    bo_mat_off_diag = bo_mat.copy()
    np.fill_diagonal(bo_mat_off_diag, 0)
    _U, _Sigma, _VT = randomized_svd(bo_mat_off_diag, n_components=num_sv, n_iter=20)
    _sigma = _Sigma.tolist()
    for sv in range(num_sv):
        dict_bondorder.update({'bo_offsv%d' % sv: _sigma[sv]})
    for catom, vals in dict_patterns.items():
        dict_bondorder.update({'bo_%d' % catom: bo_mat[vals[0], vals[1]]})
    dict_bondorder = symmetricalize_dict(job_info, feature_dict=dict_bondorder)
    return dict_bondorder
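The SVD-based part of this feature extraction can be exercised without a TeraChem bond-order file or the symmetricalize_dict helper. A minimal, self-contained sketch using a synthetic symmetric bond-order matrix (all values and names below are illustrative):

import numpy as np
from collections import OrderedDict
from sklearn.utils.extmath import randomized_svd

# synthetic symmetric "bond order" matrix for 6 atoms
rng = np.random.RandomState(0)
B = rng.rand(6, 6)
bo_mat = (B + B.T) / 2.0

num_sv = 4
features = OrderedDict()
U, Sigma, VT = randomized_svd(bo_mat, n_components=num_sv, n_iter=20)
for sv in range(num_sv):
    features['bo_sv%d' % sv] = Sigma[sv]

# the same features with the diagonal (atom self terms) zeroed out
bo_mat_off_diag = bo_mat.copy()
np.fill_diagonal(bo_mat_off_diag, 0)
_, Sigma_off, _ = randomized_svd(bo_mat_off_diag, n_components=num_sv, n_iter=20)
for sv in range(num_sv):
    features['bo_offsv%d' % sv] = Sigma_off[sv]

print(features)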
Example #2
def test_randomized_svd_low_rank():
    """Check that extmath.randomized_svd is consistent with linalg.svd"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
        effective_rank=rank, tail_strength=0.0, random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = randomized_svd(X, k)
    assert_equal(Ua.shape, (n_samples, k))
    assert_equal(sa.shape, (k,))
    assert_equal(Va.shape, (k, n_features))

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = randomized_svd(X, k)
    assert_almost_equal(s[:rank], sa[:rank])
def test_randomized_svd_sign_flip_with_transpose():
    # Check if the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        returns bool tuple indicating if the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert_true(u_based)
    assert_false(v_based)

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True)
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose)
    assert_true(u_based)
    assert_false(v_based)
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalizer='none' diverges for
    # a large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, V = randomized_svd(X, n_components, n_iter=2,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_2 = linalg.norm(A, ord='fro')
    U, s, V = randomized_svd(X, n_components, n_iter=20,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_20 = linalg.norm(A, ord='fro')
    assert_greater(np.abs(error_2 - error_20), 100)

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, V = randomized_svd(X, n_components, n_iter=2,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, V = randomized_svd(X, n_components, n_iter=i,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            A = X - U.dot(np.diag(s).dot(V))
            error = linalg.norm(A, ord='fro')
            assert_greater(15, np.abs(error_2 - error))
Example #5
def test_randomized_svd_infinite_rank():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.1)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
Example #6
def test_randomized_svd_low_rank_with_noise():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` with an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.05)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method is helping getting rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
Example #7
def test_randomized_svd_transpose_consistency():
    """Check that transposing the design matrix has limit impact"""
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False,
                                random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True,
                                random_state=0)
    U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto',
                                random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
Example #8
def get_svd_learn_clusters(accu_path, data=None, sing_threshold=2.0, assign_clstr=0.1, vis=False):
    """First runs the decomposition for maximum number of singular values.
    Then reruns on a subset > than some value"""

    (N, f) = data.shape
    all_components = min(N,f)
    U, Sigma, VT = randomized_svd(data, n_components=all_components, n_iter=5, random_state=None)

    # print "Sigma:", Sigma
    best_components = sum(Sigma > sing_threshold)
    U, Sigma, VT = randomized_svd(data, n_components=best_components, n_iter=5, random_state=None)
    pred_labels = [np.argmax(doc) if np.max(doc) > assign_clstr else 100 for doc in U]
    # print "predicted classes:", pred_labels

    utils.screeplot(accu_path, Sigma, all_components, vis)

    """Plot a graph for each right singular vector (VT)"""
    max_, min_ = 0, 100
    for i in VT:
        if max(i)>max_: max_ = max(i)
        if min(i)<min_: min_ = min(i)

    if vis:
        with open(accu_path + "/graphlets.p", 'r') as f:
            graphlets = pickle.load(f)

    for i, vocabulary in enumerate(VT):
        title = 'Latent Concept %s' % i
        utils.genome(accu_path, vocabulary, [min_, max_], title)
        if vis:
            for c, v in enumerate(vocabulary):
                if v > 0.1:
                    print "\n",c,  graphlets[c]
    return U, Sigma, VT
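The two-pass strategy described in the docstring (decompose with the maximum number of components, count the singular values above the threshold, then recompute with only that many components) can be sketched on synthetic data; everything below is illustrative and assumes only numpy and scikit-learn:

import numpy as np
from sklearn.utils.extmath import randomized_svd

# synthetic document-by-feature matrix with clear low-rank structure
rng = np.random.RandomState(1)
data = np.dot(rng.rand(40, 4), rng.rand(4, 30)) + 0.01 * rng.rand(40, 30)

# first pass: decompose with the maximum possible number of components
all_components = min(data.shape)
_, Sigma, _ = randomized_svd(data, n_components=all_components, n_iter=5, random_state=0)

# keep only the components whose singular value exceeds the threshold, then rerun
sing_threshold, assign_clstr = 2.0, 0.1
best_components = int(np.sum(Sigma > sing_threshold))
U, Sigma, VT = randomized_svd(data, n_components=best_components, n_iter=5, random_state=0)

# assign each row to the component with the largest loading, or 100 for "no cluster"
pred_labels = [int(np.argmax(row)) if np.max(row) > assign_clstr else 100 for row in U]
print(best_components, pred_labels[:10])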
    def _randomized_dpca(self,X,mXs,pinvX=None):
        """ Solves the dPCA minimization problem analytically by using a randomized SVD solver from sklearn.

            Returns
            -------
            P : dict mapping strings to array-like,
                Holds encoding matrices for each term in variance decompostions (used to transform data
                to low-dimensional space).

            D : dict mapping strings to array-like,
                Holds decoding matrices for each term in variance decompostions (used in inverse_transform
                to map from low-dimensional representation back to original data space).

        """

        n_features = X.shape[0]
        rX = X.reshape((n_features,-1))
        pinvX = pinv(rX) if pinvX is None else pinvX

        P, D = {}, {}

        for key in list(mXs.keys()):
            mX = mXs[key].reshape((n_features,-1)) # called X_phi in paper
            C = np.dot(mX,pinvX)

            if isinstance(self.n_components,dict):
                U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components[key],n_iter=self.n_iter,random_state=np.random.randint(10e5))
            else:
                U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components,n_iter=self.n_iter,random_state=np.random.randint(10e5))

            P[key] = U
            D[key] = np.dot(U.T,C).T

        return P, D
Example #10
def test_randomized_svd_sign_flip():
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        assert_almost_equal(u1, u2)
        assert_almost_equal(v1, v2)
        assert_almost_equal(np.dot(u2 * s2, v2), a)
        assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
        assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
Example #11
    def _svd(self, array, n_components, n_discard):
        """Returns first `n_components` left and right singular
        vectors u and v, discarding the first `n_discard`.

        """
        if self.svd_method == "randomized":
            kwargs = {}
            if self.n_svd_vecs is not None:
                kwargs["n_oversamples"] = self.n_svd_vecs
            u, _, vt = randomized_svd(array, n_components, random_state=self.random_state, **kwargs)

        elif self.svd_method == "arpack":
            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
            if np.any(np.isnan(vt)):
                # some eigenvalues of A * A.T are negative, causing
                # sqrt() to be np.nan. This causes some vectors in vt
                # to be np.nan.
                _, v = eigsh(safe_sparse_dot(array.T, array), ncv=self.n_svd_vecs)
                vt = v.T
            if np.any(np.isnan(u)):
                _, u = eigsh(safe_sparse_dot(array, array.T), ncv=self.n_svd_vecs)

        assert_all_finite(u)
        assert_all_finite(vt)
        u = u[:, n_discard:]
        vt = vt[n_discard:]
        return u, vt.T
Example #12
    def fit(self, X, y):
        self.work_titles = {}
        for work in Work.objects.values('id', 'title'):
            self.work_titles[work['id']] = work['title']
        
        work_ids = list(Rating.objects.values_list('work_id', flat=True).distinct())
        nb_works = len(work_ids)
        self.inv_work = {work_ids[i]: i for i in range(nb_works)}

        user_ids = list(User.objects.values_list('id', flat=True))
        nb_users = len(user_ids)
        self.inv_user = {user_ids[i]: i for i in range(nb_users)}

        self.chrono.save('get_work_ids')

        # print("Computing M: (%i × %i)" % (nb_users, nb_works))
        self.M = lil_matrix((nb_users, nb_works))
        """ratings_of = {}
        for (user_id, work_id), rating in zip(X, y):
            ratings_of.setdefault(user_id, []).append(rating)"""
        for (user_id, work_id), rating in zip(X, y):
            self.M[self.inv_user[user_id], self.inv_work[work_id]] = rating #- np.mean(ratings_of[user_id])
        # np.save('backupM', self.M)

        self.chrono.save('fill matrix')

        # Ranking computation
        self.U, self.sigma, self.VT = randomized_svd(self.M, NB_COMPONENTS, n_iter=3, random_state=42)
        # print('Formes', self.U.shape, self.sigma.shape, self.VT.shape)

        self.save('backup.pickle')

        self.chrono.save('factor matrix')
Example #13
    def _fit(self, gn):
        from sklearn.utils.validation import check_random_state
        from sklearn.utils.extmath import randomized_svd

        # apply scaling
        gn = self.scaler_.fit(gn).transform(gn)

        # transpose for svd
        # TODO eliminate need for transposition
        x = gn.T
        n_samples, n_features = x.shape

        # intermediates
        random_state = check_random_state(self.random_state)
        n_components = self.n_components
        n_samples, n_features = x.shape

        # singular value decomposition
        u, s, v = randomized_svd(x, n_components,
                                 n_iter=self.iterated_power,
                                 random_state=random_state)

        # calculate explained variance
        self.explained_variance_ = exp_var = (s ** 2) / n_samples
        full_var = np.var(x, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var

        # store components
        self.components_ = v

        return u, s, v
Example #14
def gsvd(X, M, A, n_comps = 10):
    """
    Generalized SVD

    :param X:
    :param M:
    :param A:
    :return:
    """

    print("GSVD")
    print("GSVD: Weights... ", end='')
    Xw = np.dot(np.sqrt(M), np.dot(X, np.sqrt(A)))
    print("Done!")

    print("GSVD: SVD... ", end='')
    [P_, D, Q_] = randomized_svd(Xw, n_comps)

    #P_ = P_[:,0:n_comps]
    #D = D[0:n_comps]
    #Q_ = Q_[0:n_comps,:]
    print('Done!')

    print("GSVD: Factor scores and eigenvalues... ", end='')
    Mp = np.power(np.diag(M), -0.5)
    Ap = np.power(np.diag(A), -0.5)

    P = np.dot(np.diag(Mp), P_)
    Q = np.dot(np.diag(Ap), Q_.T)
    ev = np.power(D, 2)

    print('Done!')

    return P, D, Q, ev
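The weighting above is what gives the generalized SVD its defining property: the factor scores are orthonormal under the metrics M and A rather than under the identity. A small self-contained check with diagonal metrics (all names and values below are illustrative):

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = rng.rand(8, 6)
M = np.diag(rng.rand(8) + 0.5)   # row metric ("masses")
A = np.diag(rng.rand(6) + 0.5)   # column metric ("weights")
n_comps = 3

Xw = np.dot(np.sqrt(M), np.dot(X, np.sqrt(A)))
P_, D, Q_ = randomized_svd(Xw, n_comps, random_state=0)
P = np.dot(np.diag(np.power(np.diag(M), -0.5)), P_)
Q = np.dot(np.diag(np.power(np.diag(A), -0.5)), Q_.T)

# generalized orthogonality under the two metrics (expected True, up to float error)
print(np.allclose(np.dot(P.T, np.dot(M, P)), np.eye(n_comps)))
print(np.allclose(np.dot(Q.T, np.dot(A, Q)), np.eye(n_comps)))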
Example #15
 def _max_singular_value(self, X_filled):
     # quick decomposition of X_filled into rank-1 SVD
     _, s, _ = randomized_svd(
         X_filled,
         1,
         n_iter=5)
     return s[0]
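For intuition: a rank-1 randomized SVD returns (approximately) the largest singular value, i.e. the spectral norm of the matrix. A quick check against numpy's exact 2-norm, assuming nothing beyond numpy and scikit-learn:

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X_filled = rng.rand(50, 30)

_, s, _ = randomized_svd(X_filled, 1, n_iter=5, random_state=0)
print(s[0], np.linalg.norm(X_filled, 2))  # the two values should agree closely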
Example #16
 def _svd_step(self, X, shrinkage_value, max_rank=None):
     """
     Returns reconstructed X from low-rank thresholded SVD and
     the rank achieved.
     """
     if max_rank:
         # if we have a max rank then perform the faster randomized SVD
         (U, s, V) = randomized_svd(
             X,
             max_rank,
             n_iter=self.n_power_iterations)
     else:
         # perform a full rank SVD using ARPACK
         (U, s, V) = np.linalg.svd(
             X,
             full_matrices=False,
             compute_uv=True)
     s_thresh = np.maximum(s - shrinkage_value, 0)
     rank = (s_thresh > 0).sum()
     s_thresh = s_thresh[:rank]
     U_thresh = U[:, :rank]
     V_thresh = V[:rank, :]
     S_thresh = np.diag(s_thresh)
     X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh))
     return X_reconstruction, rank
def select_factorization_algorithm(factorization_algo, corpus=None, doc_count=0, num_features=2):
    U, S, V = None, None, None
    if factorization_algo == FactorizationAlgorithm.linear_svd:
        A = [[]]
        for doc_id, word_id, value in corpus:
            if len(A) < doc_id + 1:
                A.append([])
            A[doc_id].append(value)
        U, S, V = linalg.svd(A)
    elif factorization_algo == FactorizationAlgorithm.randomized_svd:
        A = [[]]
        for doc_id, word_id, value in corpus:
            if len(A) < doc_id + 1:
                A.append([])
            A[doc_id].append(value)
        U, S, V = randomized_svd(numpy.array(A), n_components=num_features)
    elif factorization_algo == FactorizationAlgorithm.gradient_descent:
        N = doc_count
        M = len(corpus.dictionary.items())
        K = num_features

        P = numpy.random.uniform(low=-0.01, high=0.01, size=(N, K))
        Q = numpy.random.uniform(low=-0.01, high=0.01, size=(M, K))

        # P = numpy.full((N, K), 0.1)
        # Q = numpy.full((M, K), 0.1)

        U, V = svd_factorization(corpus, P, Q, K)
    elif factorization_algo == FactorizationAlgorithm.gradient_descent_engine:
        svd_engine = SVDEngine(num_docs=doc_count, num_words=len(corpus.dictionary.items()), num_features=2)
        svd_engine.feature_training(corpus)
        U, V = svd_engine.document_profiles, svd_engine.word_profiles

    return U, S, V
Example #18
def run():
    start = datetime.now()
    KING_ID = User.objects.get(username='******').id
    anime_titles = {}
    anime_ids = set()
    rs = list(Rating.objects.all().select_related('work'))
    print(rs[0])
    cp0 = datetime.now()
    print(cp0 - start)
    for i, rating in enumerate(rs, start=1):
        if i % 1000 == 0:
            print(i)
        if rating.work.id not in anime_ids:
            anime_ids.add(rating.work.id)
            anime_titles[rating.work.id] = rating.work.title
    cp1 = datetime.now()
    print(cp1 - cp0)
    seen_titles = set()
    for rating in Rating.objects.filter(user__id=KING_ID).select_related('work'):
        if rating.choice != 'willsee':
            seen_titles.add(rating.work.title)
    cp2 = datetime.now()
    print(cp2 - cp1)
    nb_users = max(user.id for user in User.objects.all())
    nb_anime = len(anime_ids)
    anime_ids = list(anime_ids)
    inversed = {anime_ids[i]: i for i in range(nb_anime)}
    print("Computing X: (%i×%i)" % (nb_users, nb_anime))
    cp3 = datetime.now()
    print(cp3 - cp2)
    print(nb_users, '×', nb_anime)
    values = {'like': 2, 'dislike': -2, 'neutral': 0.1, 'willsee': 0.5, 'wontsee': -0.5}
    X = lil_matrix((nb_users + 1, nb_anime + 1))
    for rating in Rating.objects.select_related('work', 'user'):
        if rating.work.id < nb_anime:
            X[rating.user.id, inversed[rating.work.id]] = values[rating.choice]

    # Ranking computation
    cp4 = datetime.now()
    print(cp4 - cp3)
    U, sigma, VT = randomized_svd(X, NB_COMPONENTS, n_iter=3, random_state=42)
    XD = np.dot(np.dot(U, np.diag(sigma)), VT)
    ranking = sorted((XD[KING_ID, j], anime_titles[anime_ids[j]]) for j in range(1, nb_anime + 1) if j in anime_titles)[::-1]

    # Summarize the results of the ranking for KING_ID:
    # “=> rank, title, score”
    c = 0
    for i, (rating, title) in enumerate(ranking, start=1):
        if title not in seen_titles:
            print('=>', i, title, rating)
            c += 1
        elif i < 10:
            print(i, title, rating)
        if c >= 10:
            break
    print(len(connection.queries))
    for line in connection.queries:
        print(line)
    end = datetime.now()
    print(end - start)
Example #19
def ksvd(Y, D, X, n_cycles=1, verbose=True):
    n_atoms = D.shape[1]
    n_features, n_samples = Y.shape
    unused_atoms = []
    R = Y - fast_dot(D, X)

    for c in range(n_cycles):
        for k in range(n_atoms):
            if verbose:
                sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100))
                sys.stdout.flush()
            # find all the datapoints that use the kth atom
            omega_k = X[k, :] != 0
            if not np.any(omega_k):
                unused_atoms.append(k)
                continue
            # the residual due to all the other atoms but k
            Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k])
            U, S, V = randomized_svd(Rk, n_components=1, n_iter=10, flip_sign=False)
            D[:, k] = U[:, 0]
            X[k, omega_k] = V[0, :] * S[0]
            # update the residual
            R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k])
        print ""
    return D, X, unused_atoms
Example #20
def _sv_thresh(X, threshold, num_svalue):
    """
    Perform singular value thresholding.
    Parameters
    ---------
    X : array of shape [n_samples, n_features]
        The input array.
    threshold : float
        The threshold for the singualar values.
    num_svalue : int
        The number of singular values to compute.
    Returns
    -------
    X_thresh : array of shape [n_samples, n_features]
        The output after performing singular value thresholding.
    grater_sv : int
        The number of singular values of `X` which were greater than
        `threshold`
    (U, s, V): tuple
        The singular value decomposition
    """
    m, n = X.shape
    U, s, V = randomized_svd(X, num_svalue)
    greater_sv = np.count_nonzero(s > threshold)
    s = _soft_thresh(s, threshold)
    S = np.diag(s)
    X_thresh = np.dot(U, np.dot(S, V))
    return X_thresh, greater_sv, (U, s, V)
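The helper _soft_thresh is not shown in this listing; for non-negative singular values the usual soft-thresholding operator is max(s - threshold, 0). A self-contained sketch of the whole step under that assumption (names are illustrative):

import numpy as np
from sklearn.utils.extmath import randomized_svd

def soft_thresh(s, t):
    # assumed soft-thresholding operator for non-negative singular values
    return np.maximum(s - t, 0.0)

rng = np.random.RandomState(0)
X = rng.rand(30, 20)
threshold, num_svalue = 1.0, 10

U, s, V = randomized_svd(X, num_svalue, random_state=0)
greater_sv = int(np.count_nonzero(s > threshold))
X_thresh = np.dot(U, np.dot(np.diag(soft_thresh(s, threshold)), V))

# the thresholded reconstruction keeps only the singular values above the threshold
print(greater_sv, np.linalg.matrix_rank(X_thresh))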
 def apply_uv_decomposition(self):
     U, Sigma, VT = randomized_svd(self.behaviour_matrix,
                                   n_components=15,
                                   n_iter=10,
                                   random_state=None)
     print(U.shape)
     print(VT.shape)
     self.X_hat = np.dot(U, VT)  # U * np.diag(Sigma)
Example #22
def do_fit(X):
    n_samples = X.shape[0]

    n_components = X.shape[1]

    U, S, V = extmath.randomized_svd(X, n_components, n_iter=3)

    return U, S, V
Example #23
def fast_svd(X, n_components, random_state=None):
    """ Automatically switch between randomized and lapack SVD (heuristic
        of scikit-learn).

    Parameters
    ==========
    X: array, shape (n_samples, n_features)
        The data to decompose

    n_components: integer
        The order of the dimensionality of the truncated SVD

    random_state: int or RandomState
        Pseudo number generator state used for random sampling.

    Returns
    ========

    U: array, shape (n_samples, n_components)
        The first matrix of the truncated svd

    S: array, shape (n_components)
        The second matrix of the truncated svd

    V: array, shape (n_components, n_features)
        The last matrix of the truncated svd

    """
    random_state = check_random_state(random_state)
    # Small problem, just call full PCA
    if max(X.shape) <= 500:
        svd_solver = 'full'
    elif n_components >= 1 and n_components < .8 * min(X.shape):
        svd_solver = 'randomized'
    # This is also the case of n_components in (0,1)
    else:
        svd_solver = 'full'

    # Call different fits for either full or truncated SVD
    if svd_solver == 'full':
        U, S, V = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)
        # The "copy" are there to free the reference on the non reduced
        # data, and hence clear memory early
        U = U[:, :n_components].copy()
        S = S[:n_components]
        V = V[:n_components].copy()
    else:
        if LooseVersion(sklearn.__version__) >= LooseVersion('0.17'):
            n_iter = 'auto'
        else:
            n_iter = 3
        U, S, V = randomized_svd(X, n_components=n_components,
                                 n_iter=n_iter,
                                 flip_sign=True,
                                 random_state=random_state)
    return U, S, V
def run(in_file, out_path, dim=300, keep_words=None):
    base_embed = Explicit.load(in_file, normalize=False)
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")
Example #25
    def test(self, override=False):
        """
        Applies randomised SVD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to
        write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the
        input matrix.

        Parameters
        ----------
        override : bool, optional. default = False
            Set to true to recompute results if prior results are available. Else, returns existing results

        Returns
        -------
        U : :class:`numpy.ndarray`
            Abundance matrix
        S : :class:`numpy.ndarray`
            variance vector
        V : :class:`numpy.ndarray`
            eigenvector matrix
        """
        '''
        Check if a number of components has been set and ensure that the number is less than
        the minimum axis length of the data.  If both conditions are met, use fsvd.  If not
        use the regular svd.

        C.Smith -- We might need to put a lower limit on num_comps in the future.  I don't
                   know enough about svd to be sure.
        '''
        if not override:
            if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0:
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \
                       reshape_to_n_dims(self.h5_results_grp['V'])[0]

        self.h5_results_grp = None

        t1 = time.time()

        self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(self.h5_main), self.num_components,
                                                      n_iter=3)
        self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype)

        print('Took {} to compute randomized SVD'.format(format_time(time.time() - t1)))

        u_mat, success = reshape_to_n_dims(self.__u, h5_pos=self.h5_main.h5_pos_inds,
                                           h5_spec=np.expand_dims(np.arange(self.__u.shape[1]), axis=0))
        if not success:
            raise ValueError('Could not reshape U to N-Dimensional dataset! Error:' + success)

        v_mat, success = reshape_to_n_dims(self.__v, h5_pos=np.expand_dims(np.arange(self.__u.shape[1]), axis=1),
                                           h5_spec=self.h5_main.h5_spec_inds)
        if not success:
            raise ValueError('Could not reshape V to N-Dimensional dataset! Error:' + success)

        return u_mat, self.__s, v_mat
def algin_coor_sys(cs_static, cs):
    # A,B have shape (landmark-by-dimension)
    A = cs.lmrk_coors.as_matrix().transpose()
    B = cs_static.lmrk_coors.as_matrix().transpose()
    U, Sigma, V = randomized_svd(np.dot(A.transpose(), B), n_components=A.shape[0])
    Q = np.dot(U, V.transpose())
    W = Q
    A = np.dot(A, W)
    cs.lmrk_coors = pd.DataFrame(A.transpose(), columns=cs.lmrk_coors.columns)
    return W
Example #27
def sv_thresh(X, t, k):
    m, n = X.shape
    U, s, V = randomized_svd(X, k)  #pca(X, raw=True, k=25)
    # Number of singular values greater than `t`
    greater_sv = np.sum(s > t)
    s = soft_thresh(s, t)
    S = np.diag(s)
    ret = np.dot(U, np.dot(S, V))
    assert ret.shape == X.shape
    return ret, greater_sv
Example #28
def get_gradient(gradfile, job_info, num_sv=3):
    metal_ind = job_info['metal_ind']
    natoms = job_info['natoms']
    num_lines = natoms + 2
    dict_gradient = OrderedDict()
    catoms = [metal_ind] + job_info['catoms']
    with open(gradfile, 'r') as fo:
        gradtext = fo.readlines()[-num_lines:]
    grad_mat = np.zeros(shape=(natoms, 3))
    for idx, line in enumerate(gradtext):
        ll = line.split()
        if ll[0] == 'terachem':
            dict_gradient.update({'grad_rms': float(ll[7][:-1])})
        if idx > 1:
            grad_mat[idx - 2, :] = [float(x) for x in ll[1:]]
    U, Sigma, VT = randomized_svd(grad_mat, n_components=num_sv, n_iter=20)
    sigma = Sigma.tolist()
    for sv in range(num_sv):
        dict_gradient.update({'grad_sv%d' % sv: sigma[sv]})
    for catom in catoms:
        dict_gradient.update({'grad_%d' % catom: np.linalg.norm(grad_mat[catom, :])})
    max_norm = 0
    for ii in range(natoms):
        _norm = np.linalg.norm(grad_mat[ii, :])
        if _norm > max_norm:
            max_norm = _norm
    dict_gradient.update({'grad_maxnorm': max_norm})
    grad_mat_internal = grad_mat.copy()
    grad_mat_internal = grad_mat_internal - grad_mat_internal[metal_ind, :]
    _U, _Sigma, _VT = randomized_svd(grad_mat_internal, n_components=num_sv, n_iter=20)
    _sigma = _Sigma.tolist()
    for sv in range(num_sv):
        dict_gradient.update({'grad_intsv%d' % sv: _sigma[sv]})
    _max_norm = 0
    for ii in range(natoms):
        _norm = np.linalg.norm(grad_mat_internal[ii, :])
        if _norm > _max_norm:
            _max_norm = _norm
    dict_gradient.update({'grad_intmaxnorm': _max_norm})
    dict_gradient = symmetricalize_dict(job_info, feature_dict=dict_gradient)
    return dict_gradient
    def __set_landmarks__(self, lmrks, Dim):
        vertices = None
        if self.vertex_filter:
            vertices = [v for v in self.g.nodes() if self.vertex_filter(v)]

        L2all = self.proximity_to(lmrks, dests=vertices)
        L2L = L2all.loc[lmrks]
        U, S, _ = randomized_svd(L2L.as_matrix(), n_components=Dim)

        lmrk_coors = np.dot(U, np.sqrt(np.diag(S)))
        self.lmrk_coors = pd.DataFrame(lmrk_coors.transpose(), columns=lmrks)
        self.L2all = L2all
Example #30
    def _randomized_dpca(self,X,mXs,pinvX=None):
        """ Solves the dPCA minimization problem analytically by using a randomized SVD solver from sklearn. """

        n_features = X.shape[0]
        rX = X.reshape((n_features,-1))
        pinvX = pinv(rX) if pinvX is None else pinvX

        P, D = {}, {}

        for key in mXs.keys():
            mX = mXs[key].reshape((n_features,-1))
            C = np.dot(mX,pinvX)

            if isinstance(self.n_components,dict):
                U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components[key],n_iter=self.n_iter,random_state=np.random.randint(10e5))
            else:
                U,s,V = randomized_svd(np.dot(C,rX),n_components=self.n_components,n_iter=self.n_iter,random_state=np.random.randint(10e5))

            P[key] = U
            D[key] = np.dot(U.T,C).T

        return P, D
Example #31
    def svd_dcmp(self, precision=0.01, n_terms_range=(1, np.inf)):
        """
        Does decomposition of covariance matrix defined by set of points
        :param precision: Desired accuracy of the KL approximation, smaller eigen values are dropped.
        :param n_terms_range: (min, max) number of terms in KL expansion to use. The number of terms estimated from
        given precision is snapped to the given interval.

        truncated SVD:
         cov_mat = U*diag(ev) * V,
         _cov_l_factor = U[:,0:m]*sqrt(ev[0:m])

        Note on number of terms:
        According to: C. Schwab and R. A. Todor: KL Approximation of Random Fields by Generalized Fast Multipole Method,
        the eigenvalues should decay as (Proposition 2.18):
            lambda_m ~ sigma^2 * ( 1/gamma ) **( m**(1/d) + alpha ) / Gamma(0.5 * m**(1/d) )
        where gamma = correlation length / domain diameter
        and alpha is the correlation exponent. Gamma is the gamma function.
        ... should be checked experimentally and generalized for sigma(X)

        :return:
        """
        if self.cov_mat is None:
            self.cov_matrix()

        if n_terms_range[0] >= self.n_points:
            U, ev, VT = np.linalg.svd(self.cov_mat)
            m = self.n_points
        else:
            range = list(n_terms_range)
            range[0] = max(1, range[0])
            range[1] = min(self.n_points, range[1])

            prec_range = (self._eigen_value_estimate(range[0]),
                          self._eigen_value_estimate(range[1]))
            if precision < prec_range[0]:
                m = range[0]
            elif precision > prec_range[1]:
                m = range[1]
            else:
                f = lambda m: self._eigen_value_estimate(m) - precision
                m = sp.optimize.bisect(
                    f,
                    range[0],
                    range[1],
                    xtol=0.5,
                )

            m = max(m, range[0])
            threshold = 2 * precision
            # TODO: Test if we should cut eigen values by relative (like now) or absolute value
            while threshold >= precision and m <= range[1]:
                #print("treshold: {} m: {} precision: {} max_m: {}".format(threshold,  m, precision, range[1]))
                U, ev, VT = randomized_svd(self.cov_mat,
                                           n_components=m,
                                           n_iter=3,
                                           random_state=None)
                threshold = ev[-1] / ev[0]
                m = int(np.ceil(1.5 * m))

            m = len(ev)
            m = min(m, range[1])

        #print("KL approximation: {} for {} points.".format(m, self.n_points))
        self.n_approx_terms = m
        self._sqrt_ev = np.sqrt(ev[0:m])
        self._cov_l_factor = U[:, 0:m].dot(np.diag(self._sqrt_ev))
        self.cov_mat = None
        return self._cov_l_factor, ev[0:m]
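A minimal sketch of the truncated decomposition described in the docstring (cov_mat ~ U * diag(ev) * V, _cov_l_factor = U[:, 0:m] * sqrt(ev[0:m])), using a synthetic squared-exponential covariance matrix; all names and values below are illustrative:

import numpy as np
from sklearn.utils.extmath import randomized_svd

# synthetic covariance matrix from a squared-exponential kernel on 1-D points
x = np.linspace(0.0, 1.0, 80)
corr_length = 0.2
cov_mat = np.exp(-0.5 * ((x[:, None] - x[None, :]) / corr_length) ** 2)

m = 10  # number of KL terms kept
U, ev, VT = randomized_svd(cov_mat, n_components=m, n_iter=3, random_state=0)
cov_l_factor = np.dot(U[:, 0:m], np.diag(np.sqrt(ev[0:m])))

# the factor reproduces the covariance up to the truncation error, which should be
# small here because the kernel's eigenvalues decay quickly
rel_err = np.linalg.norm(cov_mat - np.dot(cov_l_factor, cov_l_factor.T)) / np.linalg.norm(cov_mat)
print(rel_err)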
Example #32
def foreground(m, rank=1):
    U, S, Vh = randomized_svd(m, rank)
    L = U * np.diag(S) * Vh
    S = m - L
    return S
    def plot(self):
        colors = ['b', 'g', 'r', 'c', 'm']
        self.plot_axes = 2
        if self.load_from is None and self.store_to is not None:
            #print('Plot to CSV')
            self.plot_u, self.plot_sigma, self.plot_vt = randomized_svd(
                self.tdmatrix, n_components=self.plot_axes)
            csv_filename = self.path + "models/" + self.store_to
            csv_u = open(csv_filename + '_plot_u.csv', 'w+')
            csv_sigma = open(csv_filename + '_plot_sigma.csv', 'w+')
            csv_vt = open(csv_filename + '_plot_vt.csv', 'w+')
            writer_u = csv.writer(csv_u, delimiter=',')
            writer_sigma = csv.writer(csv_sigma, delimiter=',')
            writer_vt = csv.writer(csv_vt, delimiter=',')

            for row in self.plot_u:
                writer_u.writerow(row)
            writer_sigma.writerow(self.plot_sigma)
            for row in self.plot_vt:
                writer_vt.writerow(row)
            csv_u.close()
            csv_sigma.close()
            csv_vt.close()
        elif self.load_from is not None and self.store_to is None:
            #print('Plot from CSV')
            csv_filename = self.path + "models/" + self.load_from
            csv_u = open(csv_filename + '_plot_u.csv', 'r')
            csv_sigma = open(csv_filename + '_plot_sigma.csv', 'r')
            csv_vt = open(csv_filename + '_plot_vt.csv', 'r')
            reader_u = csv.reader(csv_u)
            reader_sigma = csv.reader(csv_sigma)
            reader_vt = csv.reader(csv_vt)
            self.plot_u = list()
            for irow, row in enumerate(reader_u):
                self.plot_u.append(list())
                for icol, val in enumerate(row):
                    self.plot_u[irow].append(float(val))

            self.plot_sigma = list()
            for irow, row in enumerate(reader_sigma):
                for icol, val in enumerate(row):
                    self.plot_sigma.append(float(val))

            self.plot_vt = list()
            for irow, row in enumerate(reader_vt):
                self.plot_vt.append(list())
                for icol, val in enumerate(row):
                    self.plot_vt[irow].append(float(val))
            csv_u.close()
            csv_sigma.close()
            csv_vt.close()

        #print("Plot SVD Done")
        #print("U : " + str(len(self.plot_u)) + " x " + str(len(self.plot_u[0])))
        #print("Sigma : " + str(len(self.plot_sigma)))
        #print("Vt : " + str(len(self.plot_vt)) + " x " + str(len(self.plot_vt[0])))
        vectors = list()
        for idir, dirname in enumerate(self.directories):
            vectors.append(list())
            for file in listdir(dirname):
                index = self.ldocs.index(dirname + file)
                vectors[idir].append(
                    self.get_doc_vector(doc_index=index, n_dim=self.plot_axes))
        X = list()
        Y = list()
        for i in range(len(vectors)):
            X.append(list())
            Y.append(list())
            for vec in vectors[i]:
                X[i].append(vec[0])
                Y[i].append(vec[1])

        for i in range(len(vectors)):
            plt.scatter(X[i], Y[i], c=colors[i])
        plt.show()
Example #34
cor_mat2 = np.corrcoef(X.T)

eig_vals, eig_vecs = np.linalg.eig(cor_mat2)

print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

# SVD
# TODO - SVD on X_std
# What are the three matrices again?
u, s, v = np.linalg.svd(X_std.T, full_matrices=True)
print(s * s / 150)
print(s)

from sklearn.utils.extmath import randomized_svd
U, Sigma, VT = randomized_svd(X_std, n_components=1)

Sigma

# sorting eigenpairs
for ev in eig_vecs:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print('Everything ok!')

# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
             for i in range(len(eig_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort()
eig_pairs.reverse()
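The relationship the TODO above asks about: for column-standardized data X_std with n samples (standardized with ddof=0), the eigenvalues of the correlation matrix equal s**2 / n, where s are the singular values of X_std; that is what the s * s / 150 printout shows for a 150-sample dataset. A small self-contained check:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(150, 4)
X_std = (X - X.mean(axis=0)) / X.std(axis=0)   # ddof=0 standardization

eig_vals = np.linalg.eigvalsh(np.corrcoef(X_std.T))
_, s, _ = np.linalg.svd(X_std, full_matrices=False)

print(np.allclose(np.sort(eig_vals), np.sort(s ** 2 / 150)))  # expected True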
Example #35
 def my_svd(X):
     _, s, V = randomized_svd(X,
                              n_components,
                              random_state=random_state,
                              n_iter=self.iterated_power)
     return s, V, squared_norm(X) - squared_norm(s)
Example #36
def main():
    """
    Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    """

    # Get the arguments
    args = docopt(
        '''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format.

    Usage:
        svd.py [-l] <dsm_prefix> <dim> <gamma> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.svd)
        <dim> = dimensionality of low-dimensional output vectors
        <gamma> = eigenvalue weighting parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    dsm_prefix = args['<dsm_prefix>']
    dim = int(args['<dim>'])
    gamma = float(args['<gamma>'])
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)

    id2row = dsm.get_id2row()

    # Get matrix from space
    matrix_ = dsm.get_cooccurrence_matrix()

    # Apply SVD
    u, s, v = randomized_svd(matrix_.get_mat(),
                             n_components=dim,
                             n_iter=5,
                             transpose=False)

    # Weight matrix
    if gamma == 0.0:
        matrix_ = u
    elif gamma == 1.0:
        #matrix_ = np.dot(u, np.diag(s)) # This is equivalent to the below formula (because s is a flattened diagonal matrix)
        matrix_ = s * u
    else:
        #matrix_ = np.dot(u, np.power(np.diag(s), gamma)) # This is equivalent to the below formula
        matrix_ = np.power(s, gamma) * u

    if is_len:
        # L2-normalize vectors
        l2norm1 = np.linalg.norm(matrix_, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        matrix_ /= l2norm1.reshape(len(l2norm1), 1)

    dsm = Space(DenseMatrix(matrix_), id2row, [])

    # Save the Space object in pickle format
    save_pkl_files(dsm,
                   outPath + ".svd.dm",
                   save_in_one_file=True,
                   save_as_w2v=True)
    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #37
from scipy.linalg import hilbert

np.set_printoptions(linewidth=120)

Observations = 10
Features = 4000
N = max(Observations, Features)
k = 7

# Create a known ill-conditioned matrix for testing
H = hilbert(N)[:Observations, :Features]

print(f'Matrix of shape: [{Observations}, {Features}]')
print(f'Target SVD: [{Observations}, {k}]')

(U, S, Vh) = randomized_svd(H, n_components=k, n_oversamples=5, n_iter=2)

print("\n#################################\n")
print("U - left singular vectors")
print(U)
print("\n#################################\n")
print("S - Singular values diagonal")
print(S)
print("\n#################################\n")
print("Vh - transposed right singular vectors")
print(Vh)

# ----------------------------------------------------------------------------------------

# Matrix of shape: [10, 4000]
# Target SVD: [10, 7]
Example #38
def _initialize_nmf(X, n_components, init=None, eps=1e-6, random_state=None):

    check_non_negative(X, "NMF initialization")
    n_samples, n_features = X.shape

    if (init is not None and init != 'random'
            and n_components > min(n_samples, n_features)):
        raise ValueError(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)".format(init))

    if init is None:
        if n_components <= min(n_samples, n_features):
            init = 'nndsvd'
        else:
            init = 'random'

    # Random initialization
    if init == 'random':
        avg = np.sqrt(X.mean() / n_components)
        rng = check_random_state(random_state)
        H = avg * rng.randn(n_components, n_features)
        W = avg * rng.randn(n_samples, n_components)
        # we do not write np.abs(H, out=H) to stay compatible with
        # numpy 1.5 and earlier where the 'out' keyword is not
        # supported as a kwarg on ufuncs
        np.abs(H, H)
        np.abs(W, W)
        return W, H

    # NNDSVD initialization
    U, S, V = randomized_svd(X, n_components, random_state=random_state)
    W, H = np.zeros(U.shape), np.zeros(V.shape)

    # The leading singular triplet is non-negative
    # so it can be used as is for initialization.
    W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
    H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

    for j in range(1, n_components):
        x, y = U[:, j], V[j, :]

        # extract positive and negative parts of column vectors
        x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
        x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

        # and their norms
        x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
        x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

        m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

        # choose update
        if m_p > m_n:
            u = x_p / x_p_nrm
            v = y_p / y_p_nrm
            sigma = m_p
        else:
            u = x_n / x_n_nrm
            v = y_n / y_n_nrm
            sigma = m_n

        lbd = np.sqrt(S[j] * sigma)
        W[:, j] = lbd * u
        H[j, :] = lbd * v

    W[W < eps] = 0
    H[H < eps] = 0

    if init == "nndsvd":
        pass
    elif init == "nndsvda":
        avg = X.mean()
        W[W == 0] = avg
        H[H == 0] = avg
    elif init == "nndsvdar":
        rng = check_random_state(random_state)
        avg = X.mean()
        W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100)
        H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100)
    else:
        raise ValueError(
            'Invalid init parameter: got %r instead of one of %r' %
            (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar')))

    return W, H
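A note on the comment "The leading singular triplet is non-negative": for a non-negative matrix, the leading left and right singular vectors can be chosen entry-wise non-negative (a Perron-Frobenius argument), so taking absolute values of U[:, 0] and V[0, :] only fixes the sign convention. A quick illustration, assuming nothing beyond numpy and scikit-learn:

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = rng.rand(30, 20)   # strictly positive matrix

U, S, V = randomized_svd(X, n_components=4, random_state=0)
u0, v0 = U[:, 0], V[0, :]

# after flipping a single global sign, every entry of the leading pair is
# (numerically) non-negative
sign = np.sign(u0[np.argmax(np.abs(u0))])
print(np.all(sign * u0 >= -1e-12), np.all(sign * v0 >= -1e-12))  # expected True True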
Example #39
def randSVD(X, n_components=None):
    if n_components is None:
        n_components = min(X.shape)
    U, S, VT = randomized_svd(X, n_components)
    return U, S, VT.T
Example #40
    assert len(sys.argv) >= 4, 'bad arguments'
    fmatrix = sys.argv[1]
    fwords = sys.argv[2]
    fdata = sys.argv[3]
    #fkeys = sys.argv[4]

    # load the raw matrix
    X = load_matrix(fmatrix)
    print(X.shape, X.nnz, file=sys.stderr)

    # compute its PPMI
    X = ppmi(X)
    print(X.shape, X.nnz, file=sys.stderr)

    if USE_SVD:
        X, _, _ = randomized_svd(X, n_components=500, n_iter=5, random_state=None)
        print(X.shape, file=sys.stderr)

    # load words
    words = load_words(fwords)
    # make word to index dict
    inv = {w: i for i, w in enumerate(words)}

    assert len(words) == X.shape[0]

    # load test set
    wpairs = load_data(fdata)

    results = evaluate(wpairs, inv, X)
    for val in results:
        print(val)
Example #41
def _initialize_mf(M,
                   n_components,
                   init=None,
                   eps=1e-6,
                   random_state=None,
                   non_negative=False):
    """Algorithms for MF initialization.

    Computes an initial guess for the non-negative
    rank k matrix approximation for M: M = AB^T

    Parameters
    ----------
    M : array-like, shape (n_samples, n_features)
        The data matrix to be decomposed.

    n_components : integer
        The number of components desired in the approximation.

    init :  None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'svd'
        Method used to initialize the procedure.
        Default: 'svd' if n_components < n_features, otherwise 'random'.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

    non_negative: bool
        Whether to decompose into non-negative matrices.

    eps : float
        If non-negative, truncate all values less then this in output to zero.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``random`` == 'nndsvdar' or 'random'.

    Returns
    -------
    A : array-like, shape (n_samples, n_components)
        Initial guesses for solving M ~= AB^T

    B : array-like, shape (n_features, n_components)
        Initial guesses for solving M ~= AB^T

    References
    ----------
    C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for
    nonnegative matrix factorization - Pattern Recognition, 2008
    http://tinyurl.com/nndsvd
    """
    if non_negative:
        check_non_negative(M, "MF initialization")

    n_samples, n_features = M.shape

    if init is None:
        if n_components < n_features:
            init = 'nndsvdar' if non_negative else 'svd'
        else:
            init = 'random'

    if init == 'random':
        avg = np.sqrt(np.abs(M.mean()) / n_components)
        rng = check_random_state(random_state)
        A = avg * rng.randn(n_samples, n_components)
        B = avg * rng.randn(n_components, n_features)
        if non_negative:
            np.abs(A, A)
            np.abs(B, B)

    elif init == 'svd':
        if non_negative:
            raise ValueError(
                'SVD initialization incompatible with NMF (use nndsvd instead)'
            )
        if min(n_samples, n_features) < n_components:
            warnings.warn(
                'The number of components is smaller than the rank in svd initialization. '
                +
                'The input will be padded with zeros to compensate for the lack of singular values.'
            )
        # simple SVD based approximation
        U, S, V = randomized_svd(M, n_components, random_state=random_state)
        # randomized_svd only returns min(n_components, n_features, n_samples) singular values and vectors
        # therefore, to retain the desired shape, we need to pad and reshape the inputs
        if n_components > n_features:
            U_padded = np.zeros((U.shape[0], n_components))
            U_padded[:, :U.shape[1]] = U
            U = U_padded
            V_padded = np.zeros((n_components, V.shape[1]))
            V_padded[:V.shape[0], :] = V
            V = V_padded
            S_padded = np.zeros(n_components)
            S_padded[:S.shape[0]] = S
            S = S_padded

        S = np.diag(np.sqrt(S))
        A = np.dot(U, S)
        B = np.dot(S, V)

    elif init in ['nndsvd', 'nndsvda', 'nndsvdar']:
        if not non_negative:
            warnings.warn(
                '%s results in non-negative constrained factors, ' % init +
                'so SVD initialization should provide a better initial estimate')
        # NNDSVD initialization
        U, S, V = randomized_svd(M, n_components, random_state=random_state)
        A, B = np.zeros(U.shape), np.zeros(V.shape)

        # The leading singular triplet is non-negative
        # so it can be used as is for initialization.
        A[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
        B[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

        for j in range(1, n_components):
            x, y = U[:, j], V[j, :]

            # extract positive and negative parts of column vectors
            x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
            x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

            # and their norms
            x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
            x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

            m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

            # choose update
            if m_p > m_n:
                u = x_p / x_p_nrm
                v = y_p / y_p_nrm
                sigma = m_p
            else:
                u = x_n / x_n_nrm
                v = y_n / y_n_nrm
                sigma = m_n

            lbd = np.sqrt(S[j] * sigma)
            A[:, j] = lbd * u
            B[j, :] = lbd * v

        A[A < eps] = 0
        B[B < eps] = 0

        if init == "nndsvd":
            pass
        elif init == "nndsvda":
            avg = M.mean()
            A[A == 0] = avg
            B[B == 0] = avg
        elif init == "nndsvdar":
            rng = check_random_state(random_state)
            avg = M.mean()
            A[A == 0] = abs(avg * rng.randn(len(A[A == 0])) / 100)
            B[B == 0] = abs(avg * rng.randn(len(B[B == 0])) / 100)

    else:
        raise ValueError("Invalid init argument")

    return A, B.T
Example #42
sys.path.append('..')
import numpy as np
from common.util import most_similar, create_co_matrix, ppmi
from dataset import ptb

window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('counting co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('calculating PPMI ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')

try:
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W,
                             n_components=wordvec_size,
                             n_iter=5,
                             random_state=None)
except ImportError:
    U, S, V = np.linalg.svd(W)

word_vecs = U[:, :wordvec_size]

querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
Example #43
def PCA_rand(X, n_components, n_iter):
    X = X - mean(X, axis=1, keepdims=True)
    V, d, _ = randomized_svd(X, n_components=n_components, n_iter=n_iter)
    V = V[:, argsort(d)[::-1]]
    d = d[argsort(d)[::-1]]
    return d, V
Example #44
def omwrpca_cp(M,
               burnin,
               win_size,
               track_cp_burnin,
               n_check_cp,
               alpha,
               proportion,
               n_positive,
               min_test_size,
               tolerance_num=0,
               lambda1=np.nan,
               lambda2=np.nan,
               factor=1):
    """ 
    The loss function is 
        min_{L,S} { 1/2||M-L-S||_F^2 + lambda1||L||_* + lambda2*||S(:)||_1}
    based on a moving window.
     
    Parameters
    ----------
    M : array-like, shape (n_features, n_samples), which will be decomposed into a sparse matrix S 
        and a low-rank matrix L.
    
    burnin : burnin sample size. We require burnin >= win_size.
    
    win_size : length of moving window. We require win_size <= burnin.
    
    track_cp_burnin: the first track_cp_burnin samples generated by the omwrpca algorithm are excluded
    from change point tracking, because the result may be unstable.
    
    n_check_cp: buffer size to track changepoint.
    
    alpha: threshold value used in the hypothesis test. Hypothesis test is applied to track subspace changing.
    We suggest use the value 0.01.

    tolerance_num: offset of numbers used in hypothesis test to track change point. A larger tolerance_num gives 
    a more robust result. We restrict tolerance_num to be a non-negative integer. The default value of 
    tolerance_num is 0.
    
    lambda1, lambda2 : tuning parameters
    
    factor: parameter factor for PCP.
    
    Returns
    ----------
    Lhat : array-like, low-rank matrix.
    
    Shat : array-like, sparse matrix.
    
    rank : rank of low-rank matrix.
    
    References
    ----------

    Rule of thumb for tuning parameters:
    lambda1 = 1.0/np.sqrt(m);
    lambda2 = 1.0/np.sqrt(m);
    
    """
    m, n = M.shape
    # parameter setting
    assert burnin >= win_size, "Parameter burin should be larger than or equal to parameter win_size."
    if n < burnin:
        print "Parameter burin should be less than or equal to the number of columns of input matrix. Program stops."
        return np.empty((m, 0)), np.empty((m, 0)), [], [], []
    if np.isnan(lambda1):
        lambda1 = 1.0 / np.sqrt(m)
    if np.isnan(lambda2):
        lambda2 = 1.0 / np.sqrt(m)
    # calculate pcp on burnin samples and find rank r
    Lhat, Shat, niter, r = pcp(M[:, :burnin], factor=factor)

    # initialization for omwrpca
    Uhat, sigmas_hat, Vhat = randomized_svd(Lhat,
                                            n_components=r,
                                            n_iter=5,
                                            random_state=0)
    U = Uhat.dot(np.sqrt(np.diag(sigmas_hat)))
    Vhat_win = Vhat[:, -win_size:]
    A = np.zeros((r, r))
    B = np.zeros((m, r))
    for i in range(Vhat_win.shape[1]):
        A = A + np.outer(Vhat_win[:, i], Vhat_win[:, i])
        B = B + np.outer(
            M[:, burnin - win_size + i] - Shat[:, burnin - win_size + i],
            Vhat_win[:, i])

    # initialization for change points tracking
    # dist_num_sparses: distribution of the number of nonzero elements of columns of sparse matrix
    # used for tracking change point
    dist_num_sparses = np.zeros(m + 1)
    # buffer_num: number of nonzero elements of columns of sparse matrix in the buffer used for
    # tracking change point (buffer size = n_check_cp, queue structure)
    buffer_num = deque([])
    # buffer_flag: flags of columns of sparse matrix in the buffer used for tracking change point
    # (buffer size = n_check_cp, queue structure); flag=1 - potential change point; flag=0 - normal point.
    buffer_flag = deque([])
    # num_sparses, cp, rvec are returned by the function
    # initialize num_sparses to track the number of nonzero elements of columns of sparse matrix
    num_sparses = list((Shat != 0).sum(axis=0))
    # initialize change points to an empty list
    cp = []
    # initialize list of rank to [r]
    rvec = [r]

    # main loop
    i = burnin
    while i < n:
        mi = M[:, i]
        vi, si = solve_proj2(mi, U, lambda1, lambda2)
        Shat = np.hstack((Shat, si.reshape(m, 1)))
        vi_delete = Vhat_win[:, 0]
        Vhat_win = np.hstack((Vhat_win[:, 1:], vi.reshape(r, 1)))
        A = A + np.outer(vi, vi) - np.outer(vi_delete, vi_delete)
        B = B + np.outer(mi - si, vi) - np.outer(
            M[:, i - win_size] - Shat[:, i - win_size], vi_delete)
        U = update_col(U, A, B, lambda1)
        Lhat = np.hstack((Lhat, U.dot(vi).reshape(m, 1)))
        num_sparses.append((si.reshape(m, 1) != 0).sum())
        if i >= burnin + track_cp_burnin and i < burnin + track_cp_burnin + min_test_size:
            num = (si != 0).sum()
            dist_num_sparses[num] += 1
        elif i >= burnin + track_cp_burnin + min_test_size:  # do hypothesis testing to find change point
            num = (si != 0).sum()
            buffer_num.append(num)
            pvalue = dist_num_sparses[max(num - tolerance_num, 0):].sum(
            ) / dist_num_sparses.sum()
            if pvalue <= alpha:
                buffer_flag.append(1)
            else:
                buffer_flag.append(0)
            if len(buffer_flag) >= n_check_cp:  # check change point
                if len(buffer_flag) == n_check_cp + 1:
                    dist_num_sparses[buffer_num[0]] += 1
                    buffer_num.popleft()
                    buffer_flag.popleft()
                nabnormal = sum(buffer_flag)
                # potential change identified
                if nabnormal >= n_check_cp * float(proportion):
                    for k in range(n_check_cp - n_positive + 1):
                        # use the earliest change point if change point exists
                        if sum(itertools.islice(buffer_flag, k,
                                                k + n_positive)) == n_positive:
                            changepoint = i - n_check_cp + 1 + k
                            cp.append(changepoint)
                            Lhat = Lhat[:, :changepoint]
                            Shat = Shat[:, :changepoint]
                            M_update = M[:, changepoint:]
                            num_sparses = num_sparses[:changepoint]
                            # recursively call omwrpca_cp
                            Lhat_update, Shat_update, rvec_update, cp_update, num_sparses_update = \
                            omwrpca_cp(M_update, burnin, win_size, track_cp_burnin, n_check_cp, alpha,
                                       proportion, n_positive, min_test_size, tolerance_num, lambda1, lambda2, factor)
                            # update Lhat, Shat, rvec, num_sparses, cp
                            Lhat = np.hstack((Lhat, Lhat_update))
                            Shat = np.hstack((Shat, Shat_update))
                            rvec.extend(rvec_update)
                            num_sparses.extend(num_sparses_update)
                            cp.extend([changepoint + j for j in cp_update])
                            return Lhat, Shat, rvec, cp, num_sparses
        i += 1

    return Lhat, Shat, rvec, cp, num_sparses
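A minimal usage sketch for omwrpca_cp (not from the source): it assumes the function above and its helpers (pcp, solve_proj2, update_col) are importable, and builds a synthetic low-rank plus sparse matrix; the parameter values are illustrative only.

import numpy as np

rng = np.random.RandomState(0)
m, n, r_true = 100, 500, 3
L_true = rng.randn(m, r_true).dot(rng.randn(r_true, n))            # low-rank component
S_true = rng.binomial(1, 0.05, size=(m, n)) * rng.randn(m, n) * 5  # sparse outliers
M = L_true + S_true

Lhat, Shat, rvec, cp, num_sparses = omwrpca_cp(
    M, burnin=100, win_size=50, track_cp_burnin=50, n_check_cp=20,
    alpha=0.01, proportion=0.5, n_positive=3, min_test_size=100)
print(Lhat.shape, Shat.shape, rvec, cp)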
Beispiel #45
0
    def test(self, override=False):
        """
        Applies randomised SVD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to
        write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the
        input matrix.

        :param override: Set to true to recompute results if prior results are available. Else, returns existing results
        :type override: bool, optional. default = False
            
        :returns: tuple (u_mat, self.__s, v_mat)
            WHERE
            numpy.ndarray u_mat is abundance matrix
            numpy.ndarray self.__s is variance vector
            numpy.ndarray v_mat is eigenvector matrix
        """
        '''
        Check if a number of components has been set and ensure that the number is less than
        the minimum axis length of the data.  If both conditions are met, use fsvd.  If not
        use the regular svd.

        C.Smith -- We might need to put a lower limit on num_comps in the future.  I don't
                   know enough about svd to be sure.
        '''
        if not override:
            if isinstance(self.duplicate_h5_groups,
                          list) and len(self.duplicate_h5_groups) > 0:
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(
                    self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \
                       reshape_to_n_dims(self.h5_results_grp['V'])[0]

        self.h5_results_grp = None

        t1 = time.time()

        self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(
            self.h5_main),
                                                      self.num_components,
                                                      n_iter=3)
        self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype)

        print('Took {} to compute randomized SVD'.format(
            format_time(time.time() - t1)))

        u_mat, success = reshape_to_n_dims(self.__u,
                                           h5_pos=self.h5_main.h5_pos_inds,
                                           h5_spec=np.expand_dims(np.arange(
                                               self.__u.shape[1]),
                                                                  axis=0))
        if not success:
            raise ValueError(
                'Could not reshape U to N-Dimensional dataset! Error: ' +
                str(success))

        # When the source dataset has a singular valued spectroscopic dimension
        # stack_real_to_target causes V to lose all its dimensions
        if self.__v.ndim == 0:
            # However, we want V to be 2D:
            self.__v = np.atleast_2d(self.__v)

        v_mat, success = reshape_to_n_dims(self.__v,
                                           h5_pos=np.expand_dims(np.arange(
                                               self.__u.shape[1]),
                                                                 axis=1),
                                           h5_spec=self.h5_main.h5_spec_inds)
        if not success:
            raise ValueError(
                'Could not reshape V to N-Dimensional dataset! Error: ' +
                str(success))

        return u_mat, self.__s, v_mat
Beispiel #46
0
    def CanonicalBip(self, GroupNames, y, std=True):
        if isinstance(GroupNames, (list)):
            self.GroupNames = GroupNames
        else:
            raise ValueError('not numeric')

        if isinstance(y, (np.ndarray)):
            self.target = y
        else:
            raise ValueError('not numeric')

        if std:
            self.standardize()
            data = self.data_st
        else:
            data = self.data

        g = len(GroupNames)
        n = data.shape[0]
        m = data.shape[1]
        r = np.min(np.array([g - 1, m]))

        def Factor2Binary(y, Name=None):
            if Name is None:
                Name = "C"
            ncat = len(list(set(y)))
            n = len(y)
            Z = pd.DataFrame(0, index=np.arange(len(y)), columns=list(set(y)))
            for col in Z.columns:
                for i in range(0, n):
                    if y[i] == col:
                        Z[col].iloc[i] = 1
            return Z

        def matrixsqrt(M, dim, tol=np.finfo(float).eps, inv=True):
            U, Sigma, VT = randomized_svd(M,
                                          n_components=self.dim,
                                          n_iter=5,
                                          random_state=None)
            nz = Sigma > tol
            if inv:
                S12 = U.dot(np.diag(1 / np.sqrt(Sigma[nz]))).dot(VT[nz, :])
            else:
                S12 = U.dot(np.diag(np.sqrt(Sigma[nz]))).dot(VT[nz, :])
            return S12

        #Groups to Binary
        Z = Factor2Binary(y)
        ng = Z.sum(axis=0)
        S11 = (Z.T).dot(Z)
        Xb = np.linalg.inv(S11).dot(Z.T).dot(data)
        B = (Xb.T).dot(S11).dot(Xb)
        S = (data.T).dot(data) - B
        Y = np.power(S11, 0.5).dot(Xb).dot(matrixsqrt(S, self.dim, inv=True))

        U, Sigma, VT = randomized_svd(Y,
                                      n_components=self.dim,
                                      n_iter=5,
                                      random_state=None)

        #Variable_Coord
        H = matrixsqrt(S, self.dim, inv=False).dot(np.transpose(VT[0:r, :]))
        self.Var_Coord = H
        #Canonical_Weights
        B = matrixsqrt(S, self.dim, inv=True).dot(np.transpose(VT[0:r, :]))
        self.Can_Weights = B
        #Group_Coord
        J = Xb.dot(B)
        self.Group_Coord = J
        #Individual_Coord
        V = data.dot(B)
        self.Ind_Coord = V
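The matrixsqrt helper above builds an (inverse) matrix square root from a truncated SVD, S^(+/-1/2) = U diag(Sigma^(+/-1/2)) V^T over the singular values above the tolerance. A stand-alone sketch of the same idea, assuming only NumPy and scikit-learn and no class state:

import numpy as np
from sklearn.utils.extmath import randomized_svd

def matrix_sqrt(M, n_components, tol=np.finfo(float).eps, inv=False):
    U, Sigma, VT = randomized_svd(M, n_components=n_components, n_iter=5, random_state=0)
    nz = Sigma > tol
    power = -0.5 if inv else 0.5
    return U[:, nz].dot(np.diag(Sigma[nz] ** power)).dot(VT[nz, :])

A = np.random.RandomState(0).randn(20, 5)
S = A.dot(A.T)                                   # symmetric positive semi-definite, rank 5
S_half = matrix_sqrt(S, n_components=5)
print(np.abs(S_half.dot(S_half) - S).max())      # reconstruction error, expected to be tiny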
Beispiel #47
0
def SVD(M, dimen, niter=5, state=0):
    U, Sigma, VT = randomized_svd(M,
                                  n_components=dimen,
                                  n_iter=niter,
                                  random_state=state)
    return U, Sigma, VT
Beispiel #48
0
 def factor_rank_one(mat):
     u, s, v = randomized_svd(mat, n_components=1)
     u, v = np.abs(np.sqrt(s) * u[:, 0]), np.abs(np.sqrt(s) * v[0, :])
     return u, v
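A quick check of the rank-one factorisation above (a sketch using plain NumPy and scikit-learn): for a matrix that is exactly rank one with non-negative factors, np.outer(u, v) should reconstruct it almost exactly.

import numpy as np
from sklearn.utils.extmath import randomized_svd

a = np.abs(np.random.RandomState(0).randn(30))
b = np.abs(np.random.RandomState(1).randn(20))
mat = np.outer(a, b)                              # exactly rank one, non-negative
u, s, v = randomized_svd(mat, n_components=1)
u, v = np.abs(np.sqrt(s) * u[:, 0]), np.abs(np.sqrt(s) * v[0, :])
print(np.abs(np.outer(u, v) - mat).max())         # close to zero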
Beispiel #49
0
def svd(svd_matrix):
    u, s, vt = randomized_svd(svd_matrix,
                              n_components=256,
                              n_iter=3,
                              random_state=None)
    return {'u': u, 's': s, 'vt': vt}
def load_w2v_features(inds_all, author_IMat, ents, prop, path_to_w2v,
                      path_to_sents, alpha):
    """Engineering node features by means of word2vec embeddings
    vectors. The entities and property nodes will be assigned their
    corresponding embedding vector, whereas the authors will be given
    the average of the sentence embeddings from the abstracts of all
    their papers. The sentence embedding is computed through a smoothed
    weighted average, which is also adjusted by subtracting the first
    principal component of the sentences.

    Args:

    * inds_all: array-like
    Index of all the nodes that were selected in our dataset. The 
    indices that correspond to authors should be located in the first chunk
    of this array. Their size is equal to the number of
    columns in `author_IMat`, i.e.
    `inds_all = [A_ids, E_inds, P_ind]` where
    A: authors, E: entities, P: property

    * author_IMat: 2D sparse array
    Incidence matrix corresponding to all the author nodes (no matter
    if they are among the selected nodes or not). Number of papers (hyperedges)
    in this matrix should be equal to the number of abstracts (saved as
    sentences) whose path is given by `path_to_sents`. In case we would like 
    to see papers only in a specific time-window, the rows outside this
    window should be zero-ed out before feeding it to this function.

    * ents: array-like
    List of all entities (no matter if they are among the selected
    nodes or not)

    * prop: str or list of str
    Property keyword(s)

    * path_to_w2v: str
    Path to the Word2Vec model

    * path_to_sents: str
    Path to the sentences on which the Word2Vec model was trained

    * alpha: float scalar
    The smoothing parameter
    """

    # total number of authors
    nA = author_IMat.shape[1]
    # number of selected authors
    nA_selected = np.sum(inds_all < nA)

    # load the w2v model
    model = Word2Vec.load(path_to_w2v)

    # load sentences, and compute their embeddings
    sents = np.array(open(path_to_sents, 'r').read().splitlines())
    sents_embeds = unadjusted_words2sents(sents, model, alpha)

    x_all = np.zeros((len(inds_all), model.vector_size))

    # the easier task first (entities and property)
    for i, ind in enumerate(inds_all[nA_selected:]):
        # the last index is for the property
        if ind != inds_all[-1]:
            ent = ents[ind - nA]
            idx = model.wv.vocab[ent].index
            v = model.trainables.syn1neg[idx, :]
            x_all[nA_selected + i, :] = v / np.sqrt((v**2).sum())
        else:
            x_all[-1, :] = model.wv[prop] / np.sqrt((model.wv[prop]**2).sum())

    # now, the more demanding task (authors)
    author_IMat = author_IMat.tocsc()
    pbar = tqdm(range(nA_selected), position=0, leave=True)
    pbar.set_description('Words2Sents for Authors')
    for i, ind in enumerate(inds_all[:nA_selected]):
        pids = author_IMat[:, ind].indices

        # so far the matrices were row-wise, make it
        # column-wise to be more consistent with the formula
        V = np.concatenate([sents_embeds[j] for j in pids], axis=0).T
        u, _, _ = randomized_svd(V, n_components=1)

        # adjusted average vector
        avg = np.sum(V, axis=1)
        avg = avg - np.dot(np.dot(u, u.T), avg)
        x_all[i, :] = avg.squeeze() / np.sqrt((avg**2).sum())

        pbar.update(1)

    pbar.close()
    return x_all
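The author loop above removes the leading left singular vector (the first principal component) from the averaged sentence embeddings. A stand-alone sketch of that adjustment, where V is a random stand-in for the stacked sentence embeddings (columns are sentences):

import numpy as np
from sklearn.utils.extmath import randomized_svd

V = np.random.RandomState(0).randn(100, 40)       # 100-dim embeddings of 40 sentences
u, _, _ = randomized_svd(V, n_components=1)       # leading left singular vector, shape (100, 1)
avg = V.sum(axis=1)
avg = avg - u[:, 0] * u[:, 0].dot(avg)            # remove the component along u
avg = avg / np.sqrt((avg ** 2).sum())             # unit-normalise
print(avg.shape)                                  # (100,)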
Beispiel #51
0
    print('vocab size: {0}'.format(len(word_to_id)))
    print('corpus size: {0}'.format(len(corpus)))

    # co-occurrence matrix
    print('counting co-occurrence ..')
    c = create_co_matrix(corpus,
                         vocab_size=len(word_to_id),
                         window_size=window_size)

    # ppmi
    print('calculating ppmi (t) ..')
    m_t = ppmi_text(c, verbose=True)

    print('calculating ppmi (self) ..')
    m = ppmi(c)

    # dimensionality reduction via SVD
    print('calculating svd..')
    U, S, V = randomized_svd(m, n_components=vec_size)

    U_t, S_t, V_t = randomized_svd(m_t, n_components=vec_size)

    # evaluation
    querys = ['you', 'year', 'car', 'toyota']
    for q in querys:
        print('SVD (self ppmi)')
        most_similar(q, word_to_id, id_to_word, U)
        print('SVD (t ppmi)')
        most_similar(q, word_to_id, id_to_word, U_t)
Beispiel #52
0
 def _max_singular_value(self, X_filled):
     # quick decomposition of X_filled into rank-1 SVD
     _, s, _ = randomized_svd(X_filled, 1, n_iter=5)
     return s[0]
uv = []
for r in xrange(len(review_rating)):
    uv.append(review_rating[r] - user_avg[review_user[r]] -
              business_avg[review_business[r]] + mu)

row = np.array(review_business)
col = np.array(review_user)
val = np.array(uv)
ori = csr_matrix((val, (row, col)),
                 shape=(len(business_avg), len(user_avg))).toarray()

n_comp = 30  # k
n_iter = 15
U, S, VT = randomized_svd(ori,
                          n_components=n_comp,
                          n_iter=n_iter,
                          random_state=None)

# U[2686*k], S[k], VT[K*4929]

# build the k x k diagonal matrix of singular values
S = np.diag(S)

now = np.dot(np.dot(U, S), VT)

rmse = mean_squared_error(ori, now)  # note: mean_squared_error returns the MSE; wrap in np.sqrt for an actual RMSE
Beispiel #54
0
    del f0
    del f1
    gc.collect()
    print("stacked.shape " + str(stacked.shape) + ", stacked.dtype " +
          str(stacked.dtype))

    assert len(stacked.shape) == 2
    if ncomponents < 0:
        ncomponents = min(stacked.shape[0], stacked.shape[1])

    from sklearn.utils.extmath import randomized_svd
    orig_stack = stacked.copy()
    stackemean = np.mean(stacked, axis=0)
    stacked -= stackemean
    print("starting SVD")
    U, s, VT = randomized_svd(stacked, n_components=ncomponents)  #, n_iter=6)
    print("SVD done! VT.shape " + str(VT.shape) + ", VT.dtype " +
          str(VT.dtype))

    h5file = tables.open_file(pickle_name + '_PCA_transfparams.hdf5', mode='w')
    h5file.create_array(h5file.root, 'mean', stackemean)
    h5file.create_array(h5file.root, 'VT', VT)
    h5file.create_array(h5file.root, 's', s)
    h5file.close()
    print("SAVED TRANSFORM PARAMS")

    print("VT.shape " + str(VT.shape))

    #h5file = tables.open_file(pickle_name+'_PCA.hdf5',mode='w')
    #for ii in range(2):
    #    h5file.create_array(h5file.root, 'f'+str(ii), stacktransf[(ii*ndataset):((ii+1)*ndataset),...])
Beispiel #55
0
    def _fit_transform(self,
                       graph: Graph,
                       return_dataframe: bool = True,
                       verbose: bool = True) -> EmbeddingResult:
        """Return node embedding."""
        matrix = None
        if self._metric == "Jaccard":
            edges, weights = graph.get_jaccard_coo_matrix()
        elif self._metric == "Laplacian":
            edges, weights = graph.get_laplacian_coo_matrix()
        elif self._metric == "Modularity":
            matrix = graph.get_dense_modularity_matrix()
        elif self._metric == "Left Normalized Laplacian":
            edges, weights = graph.get_left_normalized_laplacian_coo_matrix()
        elif self._metric == "Right Normalized Laplacian":
            edges, weights = graph.get_right_normalized_laplacian_coo_matrix()
        elif self._metric == "Symmetric Normalized Laplacian":
            edges, weights = graph.get_symmetric_normalized_laplacian_coo_matrix(
            )
        elif self._metric == "Neighbours Intersection size":
            edges, weights = graph.get_neighbours_intersection_size_coo_matrix(
            )
        elif self._metric == "Ancestors Jaccard":
            matrix = graph.get_shared_ancestors_jaccard_adjacency_matrix(
                graph.get_breadth_first_search_from_node_names(
                    src_node_name=self._root_node_name,
                    compute_predecessors=True),
                verbose=verbose)
        elif self._metric == "Ancestors size":
            matrix = graph.get_shared_ancestors_size_adjacency_matrix(
                graph.get_breadth_first_search_from_node_names(
                    src_node_name=self._root_node_name,
                    compute_predecessors=True),
                verbose=verbose)
        elif self._metric == "Adamic-Adar":
            edges, weights = graph.get_adamic_adar_coo_matrix()
        elif self._metric == "Adjacency":
            edges, weights = graph.get_directed_edge_node_ids(), np.ones(
                graph.get_number_of_directed_edges())
        else:
            raise NotImplementedError(f"The provided metric {self._metric} "
                                      "is not currently supported.")

        if matrix is None:
            matrix = coo_matrix((weights, (edges[:, 0], edges[:, 1])),
                                shape=(graph.get_number_of_nodes(),
                                       graph.get_number_of_nodes()),
                                dtype=np.float32)

            U, sigmas, Vt = sparse_svds(matrix,
                                        k=int(self._embedding_size / 2))
        else:
            U, sigmas, Vt = randomized_svd(matrix,
                                           n_components=int(
                                               self._embedding_size / 2))

        sigmas = np.diagflat(np.sqrt(sigmas))
        left_embedding = np.dot(U, sigmas)
        right_embedding = np.dot(Vt.T, sigmas)

        if return_dataframe:
            node_names = graph.get_node_names()
            left_embedding = pd.DataFrame(left_embedding, index=node_names)
            right_embedding = pd.DataFrame(right_embedding, index=node_names)
        return EmbeddingResult(
            embedding_method_name=self.model_name(),
            node_embeddings=[left_embedding, right_embedding])
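The embedding split above assigns each side the square root of the singular values: left_embedding = U * sqrt(Sigma) and right_embedding = Vt.T * sqrt(Sigma). A toy sketch with a random symmetric adjacency-like matrix (an assumption; no Graph API is required):

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
A = rng.binomial(1, 0.1, size=(50, 50)).astype(np.float32)
A = np.maximum(A, A.T)                            # symmetric 'adjacency' matrix
U, sigmas, Vt = randomized_svd(A, n_components=8)
sqrt_sig = np.diagflat(np.sqrt(sigmas))
left_embedding = U.dot(sqrt_sig)                  # shape (50, 8)
right_embedding = Vt.T.dot(sqrt_sig)              # shape (50, 8)
print(left_embedding.shape, right_embedding.shape)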
Beispiel #56
0
def dict_learning_online(X, n_components=2, alpha=1, n_iter=100,
                         return_code=True, dict_init=None, callback=None,
                         batch_size=3, verbose=False, shuffle=True, n_jobs=1,
                         method='lars', iter_offset=0, random_state=None,
                         return_inner_stats=False, inner_stats=None,
                         return_n_iter=False):
    """Solves a dictionary learning matrix factorization problem online.

    Finds the best dictionary and the corresponding sparse code for
    approximating the data matrix X by solving::

        (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1
                     (U,V)
                     with || V_k ||_2 = 1 for all  0 <= k < n_components

    where V is the dictionary and U is the sparse code. This is
    accomplished by repeatedly iterating over mini-batches by slicing
    the input data.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        Data matrix.

    n_components : int,
        Number of dictionary atoms to extract.

    alpha : float,
        Sparsity controlling parameter.

    n_iter : int,
        Number of iterations to perform.

    return_code : boolean,
        Whether to also return the code U or just the dictionary V.

    dict_init : array of shape (n_components, n_features),
        Initial value for the dictionary for warm restart scenarios.

    callback :
        Callable that gets invoked every five iterations.

    batch_size : int,
        The number of samples to take in each batch.

    verbose :
        Degree of output the procedure will print.

    shuffle : boolean,
        Whether to shuffle the data before splitting it in batches.

    n_jobs : int,
        Number of parallel jobs to run, or -1 to autodetect.

    method : {'lars', 'cd'}
        lars: uses the least angle regression method to solve the lasso problem
        (linear_model.lars_path)
        cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). Lars will be faster if
        the estimated components are sparse.

    iter_offset : int, default 0
        Number of previous iterations completed on the dictionary used for
        initialization.

    random_state : int or RandomState
        Pseudo number generator state used for random sampling.

    return_inner_stats : boolean, optional
        Return the inner statistics A (dictionary covariance) and B
        (data approximation). Useful to restart the algorithm in an
        online setting. If return_inner_stats is True, return_code is
        ignored

    inner_stats : tuple of (A, B) ndarrays
        Inner sufficient statistics that are kept by the algorithm.
        Passing them at initialization is useful in online settings, to
        avoid losing the history of the evolution.
        A (n_components, n_components) is the dictionary covariance matrix.
        B (n_features, n_components) is the data approximation matrix

    return_n_iter : bool
        Whether or not to return the number of iterations.

    Returns
    -------
    code : array of shape (n_samples, n_components),
        the sparse code (only returned if `return_code=True`)

    dictionary : array of shape (n_components, n_features),
        the solutions to the dictionary learning problem

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is
        set to `True`.

    See also
    --------
    dict_learning
    DictionaryLearning
    MiniBatchDictionaryLearning
    SparsePCA
    MiniBatchSparsePCA

    """

    if method not in ('lars', 'cd', 'admm'):
        raise ValueError('Coding method not supported as a fit algorithm.')
    method = 'lasso_' + method

    t0 = time.time()
    n_samples, n_features = X.shape
    # Avoid integer division problems
    alpha = float(alpha)
    random_state = check_random_state(random_state)

    if n_jobs == -1:
        n_jobs = cpu_count()

    # Init V with SVD of X
    if dict_init is not None:
        dictionary = dict_init
    else:
        _, S, dictionary = randomized_svd(X, n_components)
        dictionary = S[:, np.newaxis] * dictionary
    r = len(dictionary)
    if n_components <= r:
        dictionary = dictionary[:n_components, :]
    else:
        dictionary = np.r_[dictionary,
                           np.zeros((n_components - r, dictionary.shape[1]))]
    dictionary = np.ascontiguousarray(dictionary.T)

    if verbose == 1:
        print('[dict_learning]', end=' ')

    n_batches = floor(float(len(X)) / batch_size)
    if shuffle:
        X_train = X.copy()
        random_state.shuffle(X_train)
    else:
        X_train = X
    batches = np.array_split(X_train, n_batches)
    batches = itertools.cycle(batches)

    # The covariance of the dictionary
    if inner_stats is None:
        A = np.zeros((n_components, n_components))
        # The data approximation
        B = np.zeros((n_features, n_components))
    else:
        A = inner_stats[0].copy()
        B = inner_stats[1].copy()

    # If n_iter is zero, we need to return zero.
    ii = iter_offset - 1

    for ii, this_X in zip(range(iter_offset, iter_offset + n_iter), batches):
        dt = (time.time() - t0)
        if verbose == 1:
            sys.stdout.write(".")
            sys.stdout.flush()
        elif verbose:
            if verbose > 10 or ii % ceil(100. / verbose) == 0:
                print ("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)"
                       % (ii, dt, dt / 60))

        this_code = sparse_encode(this_X, dictionary.T, algorithm=method,
                                  alpha=alpha, n_jobs=n_jobs).T

        # Update the auxiliary variables
        if ii < batch_size - 1:
            theta = float((ii + 1) * batch_size)
        else:
            theta = float(batch_size ** 2 + ii + 1 - batch_size)
        beta = (theta + 1 - batch_size) / (theta + 1)

        A *= beta
        A += np.dot(this_code, this_code.T)
        B *= beta
        B += np.dot(this_X.T, this_code.T)

        # Update dictionary
        dictionary = _update_dict(dictionary, B, A, verbose=verbose,
                                  random_state=random_state)
        # XXX: Can the residuals be of any use?

        # Maybe we need a stopping criteria based on the amount of
        # modification in the dictionary
        if callback is not None:
            callback(locals())

    if return_inner_stats:
        if return_n_iter:
            return dictionary.T, (A, B), ii - iter_offset + 1
        else:
            return dictionary.T, (A, B)
    if return_code:
        if verbose > 1:
            print('Learning code...', end=' ')
        elif verbose == 1:
            print('|', end=' ')
        code = sparse_encode(X, dictionary.T, algorithm=method, alpha=alpha, 
                             n_jobs=n_jobs)
        if verbose > 1:
            dt = (time.time() - t0)
            print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60))
        if return_n_iter:
            return code, dictionary.T, ii - iter_offset + 1
        else:
            return code, dictionary.T

    if return_n_iter:
        return dictionary.T, ii - iter_offset + 1
    else:
        return dictionary.T
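A minimal usage sketch for the dict_learning_online variant above (an assumption: its helpers sparse_encode, _update_dict, check_random_state and cpu_count are importable alongside it); the expected shapes follow the docstring.

import numpy as np

X = np.random.RandomState(0).randn(60, 8)
code, dictionary = dict_learning_online(X, n_components=4, alpha=1, n_iter=50,
                                        batch_size=5, random_state=0)
print(code.shape, dictionary.shape)               # (60, 4), (4, 8)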
Beispiel #57
0
def laplacian_embedding(G,
                        max_dim=2,
                        elb=1,
                        get_lcc=True,
                        weightcol='weight',
                        svd_seed=None):
    """
    Inputs
        G - A networkx graph
    Outputs
        eig_vectors - The scaled (or unscaled) eigenvectors
    """
    # if get_lcc==True:
    #     #print("extracting largest_connected_component")
    #     G_lcc = lcc_BNU.extract_lcc(G)
    # else:
    #     G_lcc = G.copy()

    # weightcolumn = weightcol

    # print("pass_to_ranks")
    # G_ptr = ptr.pass_to_ranks(G_lcc, weightcol=weightcolumn)

    # print ("diagonoal augmentation")
    # G_aug_ptr= cvec.diag_aug(G_ptr, weightcol=weightcolumn)

    sorted_vertex = sorted(G.nodes())
    A = nx.to_scipy_sparse_matrix(G, nodelist=sorted_vertex)

    row, col = A.shape
    n = min(row, col)

    if not nx.is_directed(G):
        deg = (A.sum(axis=1).T).astype(float)
        deg_array = np.squeeze(np.asarray(deg))
        D = np.diag(deg_array**(-0.5))
        LSE_Matrix = D @ A @ D

    else:
        deg = (A.sum(axis=1).T + A.sum(axis=0)).astype(float)
        deg_array = np.squeeze(np.asarray(deg))
        D = np.diag(deg_array**(-1))
        LSE_Matrix = np.identity(n) - D @ A

    #print ("spectral embedding into %d dimensions" %max_dim)
    U, Sigma, VT = randomized_svd(LSE_Matrix,
                                  n_components=min(max_dim, n - 1),
                                  n_iter=50,
                                  random_state=svd_seed)

    #print ("dimension reduction (elbow selection)")
    rank_graph = getElbows_BNU.getElbows(Sigma, n_elbows=elb)
    reduced_dim = rank_graph[(elb - 1)]

    #print ("elbow is %d" %reduced_dim)
    s_sqrt = np.sqrt(Sigma)  #[np.newaxis] Zeinab commented this out

    s_sqrt_dim_reduced = s_sqrt[:reduced_dim]
    U_dim_reduced = U[:, :reduced_dim]
    VT_dim_reduced = VT[:reduced_dim, :]

    Xhat1 = np.multiply(s_sqrt_dim_reduced, U_dim_reduced)

    if not nx.is_directed(G):
        Xhat2 = np.array([]).reshape(Xhat1.shape[0], 0)
    else:
        Xhat2 = np.multiply(np.transpose(VT_dim_reduced), s_sqrt_dim_reduced)
    Xhat = np.concatenate((Xhat1, Xhat2), axis=1)

    embedded = collections.namedtuple('embedded', 'X vertex_labels')
    result = embedded(X=Xhat, vertex_labels=sorted_vertex)

    return result
Beispiel #58
0
def session_pca(imgs,
                mask_img,
                parameters,
                n_components=20,
                confounds=None,
                memory_level=0,
                memory=Memory(cachedir=None),
                verbose=0,
                copy=True):
    """Filter, mask and compute PCA on Niimg-like objects

    This is a helper function that first calls `base_masker.filter_and_mask`
    and then applies a PCA to reduce the number of time series.

    Parameters
    ----------
    imgs: list of Niimg-like objects
        See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
        List of subject data

    mask_img: Niimg-like object
        See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
        Mask to apply on the data

    parameters: dictionary
        Dictionary of parameters passed to `filter_and_mask`. Please see the
        documentation of the `NiftiMasker` for more information.

    confounds: CSV file path or 2D matrix
        This parameter is passed to signal.clean. Please see the
        corresponding documentation for details.

    n_components: integer, optional
        Number of components to be extracted by the PCA

    memory_level: integer, optional
        Integer indicating the level of memorization. The higher, the more
        function calls are cached.

    memory: joblib.Memory
        Used to cache the function calls.

    verbose: integer, optional
        Indicate the level of verbosity (0 means no messages).

    copy: boolean, optional
        Whether or not data should be copied
    """

    data, affine = cache(filter_and_mask,
                         memory,
                         memory_level=memory_level,
                         func_memory_level=2,
                         ignore=['verbose', 'memory', 'memory_level',
                                 'copy'])(imgs,
                                          mask_img,
                                          parameters,
                                          memory_level=memory_level,
                                          memory=memory,
                                          verbose=verbose,
                                          confounds=confounds,
                                          copy=copy)
    if n_components <= data.shape[0] // 4:
        U, S, _ = randomized_svd(data.T, n_components)
    else:
        U, S, _ = linalg.svd(data.T, full_matrices=False)
    U = U.T[:n_components].copy()
    S = S[:n_components]
    return U, S
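The branch above switches between the randomized and the exact SVD depending on how many components are requested relative to the number of time points. A self-contained sketch of that heuristic with a random stand-in for the masked data array (an assumption for illustration):

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import randomized_svd

data = np.random.RandomState(0).randn(40, 300)    # (n_timepoints, n_voxels) stand-in
n_components = 5
if n_components <= data.shape[0] // 4:
    U, S, _ = randomized_svd(data.T, n_components)
else:
    U, S, _ = linalg.svd(data.T, full_matrices=False)
U = U.T[:n_components].copy()
S = S[:n_components]
print(U.shape, S.shape)                           # (5, 300), (5,)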
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(X,
                                    k,
                                    power_iteration_normalizer=normalizer,
                                    random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k, )
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]),
                            np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
Beispiel #60
0
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'cold')

    u_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    #    u_pref = np.fromfile(u_file, dtype='>f4').reshape(n_users, 200)
    #    v_pref = np.fromfile(v_file, dtype='>f4').reshape(n_items, 200)

    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)

    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' %
              (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)

    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)

    item_content = tfidf(item_content)

    from sklearn.utils.extmath import randomized_svd
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)

    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(
        train_file, delimiter=",", header=-1,
        dtype=np.int32).values.ravel().view(
            dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_cold'] = data.load_eval_data(test_cold_file,
                                           test_cold_iid_file,
                                           name='eval_cold',
                                           cold=True,
                                           train_data=train,
                                           citeu=True)
    return dat
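The item-content step above is an LSA-style reduction: TF-IDF features are compressed with randomized_svd and re-scaled by the singular values. A stand-alone sketch with a random sparse matrix in place of the project-specific tfidf / prep_standardize helpers (assumptions for illustration):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import randomized_svd

item_content = sp.random(1000, 5000, density=0.01, format='csr', random_state=0)
u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5, random_state=0)
item_content_dense = u * s                        # (1000, 300) dense latent features
print(item_content_dense.shape)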