def _max_singular_value(self, X_filled):
     # quick rank-1 randomized SVD of X_filled; s[0] is its largest singular value
     _, s, _ = randomized_svd(
         X_filled,
         1,
         n_iter=5)
     return s[0]
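
A quick sanity check, added here as a sketch (not part of the original snippet): the rank-1 randomized SVD estimate should agree closely with the largest singular value from a full SVD.

import numpy as np
from sklearn.utils.extmath import randomized_svd

X_filled = np.random.rand(30, 20)
_, s, _ = randomized_svd(X_filled, 1, n_iter=5)
print(s[0], np.linalg.svd(X_filled, compute_uv=False)[0])  # the two values should match closely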
Example 2
def proj_low_rank(x, k):
    '''
        Project x onto its best rank-k approximation via randomized SVD.
    '''
    U, s, V = randomized_svd(x, n_components=k, n_iter=5, transpose='auto')
    s[k:] = 0  # no-op here: randomized_svd already returns only the top-k singular values
    return np.dot(U, np.dot(np.diag(s), V))
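
A minimal usage sketch for proj_low_rank above (the matrix and rank are arbitrary choices for illustration):

import numpy as np
from sklearn.utils.extmath import randomized_svd

A = np.random.rand(100, 50)
A5 = proj_low_rank(A, 5)
print(np.linalg.matrix_rank(A5))  # 5, up to numerical tolerance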
 def SVD(tdm):
     U, s, VT = randomized_svd(tdm,
                               n_components=10,
                               n_iter=5,
                               random_state=None)
     #U, s, VT = np.linalg.svd(tdm)
     return U, s, VT
 def _svd_step(self, X, shrinkage_value, max_rank=None):
     """
     Returns reconstructed X from low-rank thresholded SVD and
     the rank achieved.
     """
     if max_rank:
         # if we have a max rank then perform the faster randomized SVD
         (U, s, V) = randomized_svd(
             X,
             max_rank,
             n_iter=self.n_power_iterations)
     else:
          # perform a full SVD using LAPACK (np.linalg.svd)
         (U, s, V) = np.linalg.svd(
             X,
             full_matrices=False,
             compute_uv=True)
     s_thresh = np.maximum(s - shrinkage_value, 0)
     rank = (s_thresh > 0).sum()
     s_thresh = s_thresh[:rank]
     U_thresh = U[:, :rank]
     V_thresh = V[:rank, :]
     S_thresh = np.diag(s_thresh)
     X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh))
     return X_reconstruction, rank
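
For context, a standalone sketch of the singular-value soft-thresholding used in _svd_step above (function and variable names here are illustrative, not from the source class):

import numpy as np

def soft_threshold_svd(X, shrinkage_value):
    # full SVD, then shrink the singular values towards zero
    U, s, V = np.linalg.svd(X, full_matrices=False)
    s_thresh = np.maximum(s - shrinkage_value, 0)
    rank = int((s_thresh > 0).sum())
    return (U[:, :rank] * s_thresh[:rank]) @ V[:rank, :], rank

X = np.random.randn(20, 10)
X_rec, rank = soft_threshold_svd(X, shrinkage_value=1.0)
print(rank, np.linalg.norm(X - X_rec))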
def removeBackground(pixel_matrix, out_file):
    u, s, v = decomposition.randomized_svd(pixel_matrix, 2)
    d = np.dot(u, np.diag(s))
    low_rank = np.dot(d, v)
    scale = 100
    dims = (int(320 * (float(scale) / 100)), int(180 * (float(scale) / 100)))
    plt.imsave(fname=out_file,
               arr=np.reshape(low_rank[:, 140], dims),
               cmap='gray')
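
A hedged sketch of the idea behind removeBackground (the synthetic sizes and the randomized_svd import location are choices made here, not taken from the source): a rank-2 approximation of a pixels-by-frames matrix recovers the mostly static background.

import numpy as np
from sklearn.utils.extmath import randomized_svd

background = np.random.rand(5000, 1)                         # one static background column
pixel_matrix = np.tile(background, (1, 100))                 # repeated across 100 frames
pixel_matrix += 0.01 * np.random.rand(*pixel_matrix.shape)   # small per-frame variation
u, s, v = randomized_svd(pixel_matrix, 2)
low_rank = np.dot(np.dot(u, np.diag(s)), v)
print(np.abs(low_rank - pixel_matrix).max())                 # small: the background dominates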
Example 6
def get_spatial_and_temporal_filters(w, dims):
    """
    
    Asumming a RF is time-space separable, 
    get spatial and temporal filters using SVD. 

    Paramters
    =========

    w : array_like, shape (nt, nx, ny) or (nt, nx * ny)

        2D or 3D Receptive field. 

    dims : list or array_like, shape (ndim, )

        Number of coefficients in each dimension. 
        Assumed order [t, x, y]

    Return
    ======

    [sRF, tRF] : list, shape [2, ]
        
        Spatial and temporal filters separated by SVD. 

    """

    if len(dims) == 3:

        dims_tRF = dims[0]
        dims_sRF = dims[1:]
        U, S, Vt = randomized_svd(w.reshape(dims_tRF, np.prod(dims_sRF)), 3)
        sRF = Vt[0].reshape(*dims_sRF)
        tRF = U[:, 0]

    elif len(dims) == 2:
        dims_tRF = dims[0]
        dims_sRF = dims[1]
        U, S, Vt = randomized_svd(w.reshape(dims_tRF, dims_sRF), 3)
        sRF = Vt[0]
        tRF = U[:, 0]

    return [sRF, tRF]
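
A small separability check for the function above, assuming numpy and randomized_svd are imported at module level as in the snippet (the constructed filters are hypothetical):

import numpy as np
from sklearn.utils.extmath import randomized_svd

t = np.exp(-np.arange(30) / 5.0)               # temporal filter, shape (nt,)
s = np.outer(np.hanning(8), np.hanning(8))     # spatial filter, shape (nx, ny)
w = t[:, None] * s.ravel()[None, :]            # separable RF, shape (nt, nx * ny)
sRF, tRF = get_spatial_and_temporal_filters(w, [30, 8, 8])
print(sRF.shape, tRF.shape)                    # (8, 8) and (30,)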
Example 7
def semantic_vector_similarity(dataset: list, tv=None, refer_matrix=None, refer_dataset=None):

    # essays = []
    # for sample in dataset:
    #     essays.append(" ".join([token for i, token in enumerate(sample['essay_lemma'])
    #                             if sample['essay_is_stop'][i] is False and len(token) > 2]))
    #
    essays = [" ".join(sample['essay_lemma']) for sample in dataset]
    if tv is None:
        tv = TfidfVectorizer(use_idf=True, smooth_idf=True, norm='l2')
        tv.fit(essays)
    tv_fit = tv.transform(essays)
    matrix = tv_fit.toarray().transpose()

    if refer_matrix is None:
        refer_matrix = matrix
    if refer_dataset is None:
        refer_dataset = dataset

    _, _, semantic_matrix = decomposition.randomized_svd(matrix, int(refer_matrix.shape[1]/200))
    _, _, refer_semantic_matrix = decomposition.randomized_svd(refer_matrix, int(refer_matrix.shape[1]/200))
    vector_len = np.sqrt(np.sum(semantic_matrix**2, axis=0))
    semantic_matrix = np.divide(semantic_matrix, vector_len)
    vector_len = np.sqrt(np.sum(refer_semantic_matrix**2, axis=0))
    refer_semantic_matrix = np.divide(refer_semantic_matrix, vector_len)

    result_list = []

    for t, sample in enumerate(dataset):
        sim = np.matmul(semantic_matrix[:, t].T, refer_semantic_matrix)

        # top_k = sim.argsort()[-int(refer_matrix.shape[1]/15):][::-1].tolist()
        top_k = sim.argsort()[:][::-1].tolist()

        weighted_sim = [refer_dataset[k]['domain1_score'] * sim[k] for k in top_k]
        result = np.sum(weighted_sim)

        result_list.append(result)
        sample['semantic_vector_similarity'] = result

    return tv, refer_matrix, {'semantic_vector_similarity': {'mean': np.mean(result_list), 'std': np.std(result_list)}}
Example 8
def LSA(bow):
    # Write here
    X = np.asarray(bow)
    #Computing SVD
    U, Sigma, VT = randomized_svd(X,
                                  n_components=2,
                                  n_iter=1,
                                  random_state=None)
    Sigma = np.diag(Sigma)
    W = np.dot(Sigma, VT)
    D = np.dot(U, Sigma)
    return D, W
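
A minimal usage sketch for LSA above, with a tiny hypothetical documents-by-terms count matrix:

import numpy as np
from sklearn.utils.extmath import randomized_svd

bow = [[2, 0, 1, 0],
       [0, 3, 0, 1],
       [1, 1, 0, 2]]
D, W = LSA(bow)
print(D.shape, W.shape)  # (3, 2) document coordinates and (2, 4) term loadings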
Example 9
def pca(df: DataFrame, file_path: str, eigenvalues_condition: Callable[[float],
                                                                       bool]):
    """
    Transforma un dataset en otro con menos dimensiones mediante PCA y permite guardarlo en un archivo csv.
    Implementacion basada en el documento 'A tutorial on Principal Components Analysis' de Lindsay I Smith

    :param df: dataset con atributos solamente numericos y sin el atributo objetivo
    :param file_path: ruta relativa al archivo csv en donde se guardara el resultado
    :param eigenvalues_condition: funcion booleana para filtrar los valores propios (y con estos los vectores propios
        asociados) que se usaran para generar la matriz row_feature_vector (ver documento).
    """

    # the first step is skipped, assuming the data meets the preconditions

    # second step: subtract the means
    row_data_adjust = DataFrame()
    means = []
    for a in df.columns.values:
        means.append(df[a].mean())
    for (i, a) in enumerate(df.columns.values):
        row_data_adjust[a] = df[a] - means[i]

    # third step: compute the covariance matrix
    C = row_data_adjust.cov()

    # fourth step: compute the eigenvalues and eigenvectors of the covariance matrix
    U, Sigma, V = randomized_svd(C.to_numpy(),
                                 n_components=C.shape[0],
                                 n_iter=5,
                                 random_state=None)

    # fifth step: choose the components that form the feature vector
    order = (-Sigma).argsort()
    Sigma = Sigma[order]
    U = U[:, order]
    filtered_indices = [
        i for i in range(len(Sigma)) if eigenvalues_condition(Sigma[i])
    ]
    row_feature_vector = U[:, filtered_indices].transpose()

    # sixth step: derive the new dataset
    row_data_adjust = row_data_adjust.to_numpy()\
        .transpose()
    # noinspection PyUnresolvedReferences
    final_data = np.matmul(row_feature_vector, row_data_adjust)
    final_data = final_data.transpose()

    # save the result to a csv file
    final_data = DataFrame(final_data)
    final_data.to_csv(file_path, index=False, encoding='utf-8')
Example 10
def LSA(sent_text, reduction_rate, position_scores):

    #vector representation of the text; returns a terms-by-sentences matrix with binary-tf tf-idf weights
    tfidf = TfidfVectorizer(input='content',
                            stop_words=tokenized_stop_words,
                            decode_error='strict',
                            strip_accents='unicode',
                            tokenizer=Tokenizer(),
                            binary=True)
    A = tfidf.fit_transform(sent_text).T.toarray()

    #SVD on A matrix. dimensionality reduction (topics)
    U, s, Vh = randomized_svd(A, reduction_rate)

    #singular values to full matrix
    S = diagsvd(s, reduction_rate, reduction_rate)

    #scoring method from
    #Steinberger, J. and Jezek, K. 2004.
    #Using Latent Semantic Analysis in Text Summarization and Summary Evaluation.
    #Proceedings of ISIM '04, pages 93-100
    #B=S^2*Vh -> matrix used to find sentences with greatest combined weight across all topics
    B = (S * S @ Vh).T
    sc = []
    #score for every sentence is its norm in B
    for i in range(len(B)):
        sc.append(norm(B[i]))

    #add scores according to positions
    for i, score in enumerate(sc):
        sc[i] += position_scores[i]

    #score vector
    scores = np.array(sc)
    scores = 10 * scores
    #get the top scored sentences in the cluster
    ranking = scores.argsort()[:-len(sent_text) - 1:-1][:reduction_rate]

    #dictionary with sentences (pos in cluster) and their score
    ranked_sentences = {}

    #fill the dictionary
    for i in ranking:
        ranked_sentences[i] = scores[i]

    return ranked_sentences
Example 11
    def reduce(self, x, algorithm='pca', n_components='mle'):
        from sklearn.decomposition import PCA, FactorAnalysis, TruncatedSVD, randomized_svd

        if algorithm == 'pca':
            _method = PCA(n_components=n_components,
                          copy=True,
                          svd_solver='auto')
            _res = _method.fit_transform(x)
        elif algorithm == 'factor':
            # factor analysis
            _method = FactorAnalysis(n_components=n_components)
            _res = _method.fit_transform(x)
        elif algorithm == 'rsvd':
            # note: unlike the other branches, randomized_svd returns the (U, s, V) factorization
            _res = randomized_svd(M=x, n_components=4)
        elif algorithm == 'tsvd':
            _method = TruncatedSVD(n_components=3, n_iter=4)
            _res = _method.fit_transform(x)

        return _res
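
Since the 'rsvd' branch returns the (U, s, V) factorization rather than transformed data, here is a hedged sketch (not from the source) of turning that triple into reduced coordinates:

import numpy as np
from sklearn.utils.extmath import randomized_svd

x = np.random.rand(50, 10)
U, s, V = randomized_svd(x, n_components=4)
x_reduced = U * s  # project x onto its top-4 right singular vectors
print(x_reduced.shape)  # (50, 4)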
Example 12
def read(filename, dcp_name,n_component):

    X, y = read_feature_label(filename)

    # ======= dimensionality reduction ====
    if dcp_name == 'svd':
        from sklearn.decomposition import randomized_svd  # singular value decomposition
        U, s, Vt = randomized_svd(X, n_components=n_component)
        X = U * s  # keep the projected coordinates; randomized_svd returns a factorization, not transformed data
    else:
        dcp = decomposition(dcp_name, n_component)
        X = dcp.fit_transform(X)


    from sklearn.preprocessing import scale
    X = scale(X)
    y = y

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=2 / 3)
    return x_train, x_test, y_train, y_test
Example 13
 def _svd_step(self, X, shrinkage_value, max_rank=None):
     """
     Returns reconstructed X from low-rank thresholded SVD and
     the rank achieved.
     """
     if max_rank:
         # if we have a max rank then perform the faster randomized SVD
         (U, s, V) = randomized_svd(X,
                                    max_rank,
                                    n_iter=self.n_power_iterations)
     else:
          # perform a full SVD using LAPACK (np.linalg.svd)
         (U, s, V) = np.linalg.svd(X, full_matrices=False, compute_uv=True)
     s_thresh = np.maximum(s - shrinkage_value, 0)
     rank = (s_thresh > 0).sum()
     s_thresh = s_thresh[:rank]
     U_thresh = U[:, :rank]
     V_thresh = V[:rank, :]
     S_thresh = np.diag(s_thresh)
     X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh))
     return X_reconstruction, rank
Example 14
def low_rank_cov_root(covs, rank, implementation='randomized_svd'):
    """
    return X: (n_data, n_dim, rank) matrix so that X[i].dot(X[i].T) ~ covs[i]
    """
    n_data, n_dim = covs.shape[:2]
    if implementation == 'randomized_svd':
        X = np.empty((n_data, n_dim, rank))
        for i in range(n_data):
            U, s, V = randomized_svd(covs[i], rank)
            X[i] = U * np.sqrt(s)
    elif implementation == 'scipy':
        X = np.empty((n_data, n_dim, rank))
        for i in range(n_data):
            eigval, eigvec = eigh(covs[i],
                                  eigvals=(n_dim - rank, n_dim - 1))
            X[i] = eigvec * np.sqrt(eigval)
    elif implementation == 'numpy':
        eigval, eigvec = np.linalg.eigh(covs)
        idx = np.argsort(eigval, axis=-1)[:, -rank:]
        val_idx = np.ogrid[0:n_data, 0:n_dim]
        vec_idx = np.ogrid[0:n_data, 0:n_dim, 0:n_dim]
        X = (eigvec[vec_idx[0], vec_idx[1], idx[:, np.newaxis]] *
                np.sqrt(eigval[val_idx[0], idx][:, np.newaxis]))
    return X
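
A hedged usage sketch for low_rank_cov_root (the synthetic covariances below are chosen for illustration): for exactly rank-2 covariances the returned factors reproduce covs[i].

import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 6, 2))          # (n_data, n_dim, rank) factors
covs = np.einsum('nik,njk->nij', A, A)      # exactly rank-2 covariance matrices
X = low_rank_cov_root(covs, rank=2, implementation='numpy')
print(np.allclose(X[0] @ X[0].T, covs[0]))  # True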
Example 15
x_val_sti = x_val_sti.sort_index(axis=1)
x_val_sti_data = np.nan_to_num(x_val_sti)

y_val = validation['inAudience']
y_val = np.nan_to_num(y_val)
y_val_data = y_val.astype(int)

# Separating data by conversion
x_train_con = x_train_data[y_train, :]
x_train_not_con = x_train_data[np.logical_not(y_train), :]

x_val_con = x_val_data[y_val, :]
x_val_not_con = x_val_data[np.logical_not(y_val), :]

# Running Randomized SVD on all data sets
[U_train, S_train, V_train] = randomized_svd(x_train_data, 500, 20)

[U_train_con, S_train_con, V_train_con] = randomized_svd(x_train_con, 500, 20)

[U_train_not_con, S_train_not_con,
 V_train_not_con] = randomized_svd(x_train_not_con, 500, 20)

[U_val, S_val, V_val] = randomized_svd(x_val_data, 500, 20)

[U_val_con, S_val_con, V_val_con] = randomized_svd(x_val_con, 500, 20)

[U_val_not_con, S_val_not_con,
 V_val_not_con] = randomized_svd(x_val_not_con, 500, 20)

print('All the training data')
for i in range(5):
Example 16
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False):
    """ Wrapper for different SVD libraries.
    
    Note:
    ----
    Sklearn.PCA deprecated as it uses linalg.svd(X, full_matrices=False) under 
    the hood, which is already included.
    Sklearn.RandomizedPCA deprecated as it uses sklearn.randomized_svd which is
    already included.
    
    """
    if not matrix.ndim == 2:
        raise TypeError("Input matrix is not a 2d array")

    def reconstruction(ncomp, U, S, V, var=1):
        rec_matrix = np.dot(U, np.dot(np.diag(S), V))
        print("  Matrix reconstruction MAE =", mean_absolute_error(matrix, rec_matrix))
        exp_var = (S ** 2) / matrix.shape[0]
        full_var = np.var(matrix, axis=0).sum()
        explained_variance_ratio = exp_var / full_var  # Percentage of variance explained by each PC
        if var == 1:
            pass
        else:
            explained_variance_ratio = explained_variance_ratio[::-1]
        ratio_cumsum = explained_variance_ratio.cumsum()
        msg = "  Explained variance for {:} PCs = {:.5f}"
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    if mode == "eigen":
        M = np.dot(matrix, matrix.T)  # covariance matrix
        e, EV = linalg.eigh(M)  # eigenvalues and eigenvectors
        pc = np.dot(EV.T, matrix)  # PCs / compact trick
        V = pc[::-1]  # reverse since last eigenvectors are the ones we want
        S = np.sqrt(e)[::-1]  # reverse since eigenvalues are in increasing order
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if verbose:
            print("Done SVD/PCA with scipy linalg eigh functions")

    # When num_px < num_frames (rare case) or we need all the PCs
    elif mode == "lapack":
        U, S, V = linalg.svd(matrix, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V, 1)
        # we cut projection matrix according to the # of PCs
        V = V[:ncomp]
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print("Done SVD/PCA with scipy SVD (LAPACK)")

    elif mode == "arpack":
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print("Done SVD/PCA with scipy sparse SVD (ARPACK)")

    elif mode == "opencv":
        _, V = cv2.PCACompute(matrix, maxComponents=ncomp)  # eigenvectors, PCs
        if verbose:
            print("Done SVD/PCA with opencv.")

    elif mode == "randsvd":
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2, transpose="auto", random_state=None)
        if debug:
            reconstruction(ncomp, U, S, V, 1)
        if verbose:
            print("Done SVD/PCA with randomized SVD")

    else:
        raise TypeError("The SVD mode is not available")

    if usv and mode != "opencv":
        return U, S, V
    else:
        return V
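
A hypothetical call of svd_wrapper above in 'randsvd' mode, assuming the module-level imports the snippet relies on (e.g. randomized_svd) are present:

import numpy as np

matrix = np.random.rand(40, 200)
V = svd_wrapper(matrix, mode='randsvd', ncomp=5, debug=False, verbose=True)
print(V.shape)  # (5, 200): the top right singular vectors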
Example 17
        X[i, j] = 1.0
    del links
    print("Converting to CSR representation")
    X = X.tocsr()
    print("CSR conversion done")
    return X, redirects, index_map


# stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(
    redirects_filename, page_links_filename, limit=5000000)
names = dict((i, name) for name, i in iteritems(index_map))

print("Computing the principal singular vectors using randomized_svd")
t0 = time()
U, s, V = randomized_svd(X, 5, n_iter=3)
print("done in %0.3fs" % (time() - t0))

# print the names of the wikipedia related strongest components of the
# principal singular vector which should be similar to the highest eigenvector
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])


def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank and the implementation
    is based on the one from the NetworkX project (BSD licensed too)
    with copyrights by:
Example 18
                "--image",
                required=True,
                help="Path to the image to be scanned")
ap.add_argument("-c",
                "--components",
                type=int,
                default=3,
                help="No Of Components to be Used for Compression")

args = vars(ap.parse_args())

img = args['image']
dims = (600, 600)
k = args['components']

im = np.array(Image.open(img).resize(dims))
#im =  np.array(Image.open(img).convert('L').resize(dims))
im = im / 255.0
flatim = im.flatten()
flatim = im.flatten().reshape(-1, 3)
u, s, v = decomposition.randomized_svd(flatim, k)
low_rank = u @ np.diag(s) @ v
low_rank = low_rank.reshape(dims + (3, ))
data_needed = k * dims[0] + dims[1] * k + k  # entries stored: u (k*rows) + v (k*cols) + the k singular values
print("\t\tUSING {} COMPONENTS and {} DATA POINTS OUT OF ORIGINAL {} POINTS.".
      format(k, data_needed, dims[0] * dims[1]))

low_rank = (low_rank * 255.0).astype(np.uint8)
low_rank = Image.fromarray(low_rank)
low_rank.save("compressed.png")
Example 20
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False):
    """ Wrapper for different SVD libraries with the option of showing the 
    cumulative explained variance ratio.
    
    Note:
    ----
    Sklearn.PCA deprecated as it uses linalg.svd(X, full_matrices=False) under 
    the hood, which is already included.
    Sklearn.RandomizedPCA deprecated as it uses sklearn.randomized_svd which is
    already included.
    
    """
    if not matrix.ndim==2:
        raise TypeError('Input matrix is not a 2d array')
    
    def reconstruction(ncomp, U, S, V, var=1): 
        if mode=='lapack':
            rec_matrix = np.dot(U[:,:ncomp], np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {:} PCs:'.format(ncomp))
            print('  Mean Absolute Error =', mean_absolute_error(matrix, 
                                                                 rec_matrix))
            print('  Mean Squared Error =', mean_squared_error(matrix,rec_matrix))
            
            exp_var = S**2
            full_var = np.sum(S**2)
            explained_variance_ratio = exp_var / full_var        # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode=='eigen':
            exp_var = S**2                                       # squared because we previously took the sqrt of the EVals
            full_var = np.sum(S**2)
            explained_variance_ratio = exp_var / full_var        # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =', mean_absolute_error(matrix, 
                                                                       rec_matrix))
            exp_var = (S**2) / matrix.shape[0]
            full_var = np.var(matrix, axis=0).sum()
            explained_variance_ratio = exp_var / full_var        # % of variance explained by each PC       
            if var==1:  pass    
            else:  explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = '  This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print (msg)
        
        lw = 2
        alpha = 0.4
        fig = plt.figure(figsize=(6,3))
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1,3), (0,0), colspan=2)
        ax1.step(list(range(explained_variance_ratio.shape[0])), 
                 explained_variance_ratio, alpha=alpha, where='mid', 
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha, 
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0]+10)
        ax1.set_ylim(0, 1)
        
        trunc = 20
        ax2 = plt.subplot2grid((1,3), (0,2), colspan=1)
        #plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(list(range(trunc)), explained_variance_ratio[:trunc], alpha=alpha, 
                 where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc+2)
        ax2.set_ylim(0, 1)
        
        msg = '  Cumulative explained variance ratio for {:} PCs = {:.5f}'
        #plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp-1]))
        
    if ncomp>min(matrix.shape[0],matrix.shape[1]):
        msg = '{:} PCs can be obtained from a matrix with size [{:},{:}].'
        msg += ' Increase the size of the patches or decrease the number of'
        msg += ' principal components.'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))
        
    if mode=='eigen':
        # in our data n_frames is always smaller than n_pixels. In this setting
        # by taking the covariance as np.dot(matrix.T,matrix) we get all 
        # (n_pixels) eigenvectors but it is much slower and takes more memory 
        M = np.dot(matrix, matrix.T)                             # covariance matrix
        e, EV = linalg.eigh(M)                                   # eigenvalues and eigenvectors
        pc = np.dot(EV.T, matrix)                                # PCs using a compact trick when cov is MM'
        V = pc[::-1]                                             # reverse since last eigenvectors are the ones we want 
        S = np.sqrt(e)[::-1]                                     # reverse since eigenvalues are in increasing order 
        if debug: reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:,i] /= S                                          # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if verbose: print('Done PCA with numpy linalg eigh functions')
        
    elif mode=='lapack':
        # in our data n_frames is always smaller than n_pixels. In this setting
        # taking the SVD of M' and keeping the left (transposed) SVs is faster
        # than taking the SVD of M and taking the right ones
        U, S, V = linalg.svd(matrix.T, full_matrices=False)         
        if debug: reconstruction(ncomp, U, S, V)
        V = V[:ncomp]                                           # we cut projection matrix according to the # of PCs               
        U = U[:,:ncomp]
        S = S[:ncomp]
        if verbose: print('Done SVD/PCA with numpy SVD (LAPACK)')
            
    elif mode=='arpack':
        U, S, V = svds(matrix, k=ncomp) 
        if debug: reconstruction(ncomp, U, S, V, -1)
        if verbose: print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode=='randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2, 
                                 transpose='auto', random_state=None)
        if debug: reconstruction(ncomp, U, S, V)
        if verbose: print('Done SVD/PCA with randomized SVD')

    else:
        raise TypeError('The SVD mode is not available')
            
    if usv:
        if mode=='lapack':
            return U.T, S, V.T
        else:
            return U, S, V
    else:
        if mode=='lapack':
            return U.T
        else:
            return V
Example 21
# %% hidden=true
np.savetxt("britlit_H.csv", H1[:, ind], delimiter=",", fmt='%.14f')
FileLink('britlit_H.csv')

# %% hidden=true
np.savetxt("britlit_raw.csv", dtm[:, ind], delimiter=",", fmt='%.14f')
FileLink('britlit_raw.csv')

# %% hidden=true
[str(word) for word in vocab[ind]]

# %% [markdown]
# ### SVD

# %%
U, s, V = decomposition.randomized_svd(dtm, 10)

# %%
ind = get_all_topic_words(V)

# %%
len(ind)

# %%
vocab[ind]

# %%
show_topics(H1)

# %%
np.savetxt("britlit_U.csv", U, delimiter=",", fmt='%.14f')
Example 22
File: svd.py Project: zuzhaoye/VIP
def svd_wrapper(matrix, mode, ncomp, verbose, full_output=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU). 
      
    Parameters
    ----------
    matrix : numpy ndarray, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
        'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used.

        ``lapack``: uses the LAPACK linear algebra library through Numpy
        and it is the most conventional way of computing the SVD
        (deterministic result computed on CPU).

        ``arpack``: uses the ARPACK Fortran libraries accessible through
        Scipy (computation on CPU).

        ``eigen``: computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).

        ``randsvd``: uses the randomized_svd algorithm implemented in
        Sklearn (computation on CPU).

        ``cupy``: uses the Cupy library for GPU computation of the SVD as in
        the LAPACK version.

        ``eigencupy``: offers the same method as with the ``eigen`` option
        but on GPU (through Cupy).

        ``randcupy``: is an adaptation of the randomized_svd algorithm,
        where all the computations are done on a GPU (through Cupy).

        ``pytorch``: uses the Pytorch library for GPU computation of the SVD.

        ``eigenpytorch``: offers the same method as with the ``eigen``
        option but on GPU (through Pytorch).

        ``randpytorch``: is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU
        (through Pytorch).

    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular 
        vectors is truncated.
    verbose: bool
        If True intermediate information is printed out.
    full_output : bool optional
        If True the 3 terms of the SVD factorization are returned. If ``mode``
        is eigen then only S and V are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : numpy ndarray
        The right singular vectors of the input matrix. If ``full_output`` is
        True it returns the left and right singular vectors and the singular
        values of the input matrix. If ``mode`` is set to eigen then only S and
        V are returned.
    
    References
    ----------
    * For ``lapack`` SVD mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
        http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
        https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
        http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
        https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
        Finding structure with randomness: Stochastic algorithms for constructing
        approximate matrix decompositions
        Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
        https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
        https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
        http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
        http://pytorch.org/docs/master/torch.html#torch.eig

    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)    # covariance matrix
        e, EV = linalg.eigh(C)          # EVals and EVs
        pc = np.dot(EV.T, matrix)       # PCs using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))          # SVals = sqrt(EVals)
        S = S[::-1]                     # reverse since EVals go in increasing order
        for i in range(V.shape[1]):
            V[:, i] /= S    # scaling EVs by the square root of EVals
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        V = V[:ncomp]       # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if full_output:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)     # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)    # covariance matrix
        e, EV = cupy.linalg.eigh(C)     # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)      # using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse to get last eigenvectors
        S = cupy.sqrt(e)[::-1]          # reverse since EVals go in increasing order
        for i in range(V.shape[1]):
            V[:, i] /= S                # scaling by the square root of eigvals
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD `mode` is not recognized')

    if full_output:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        elif mode in ('eigen', 'eigencupy', 'eigenpytorch'):
            return S, V
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
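
A hypothetical call of the wrapper above in 'randsvd' mode on the CPU, requesting the full factorization (argument names follow the signature defined there; module-level imports such as randomized_svd are assumed):

import numpy as np

matrix = np.random.rand(30, 500)
U, S, V = svd_wrapper(matrix, mode='randsvd', ncomp=4, verbose=False,
                      full_output=True, random_state=0)
print(U.shape, S.shape, V.shape)  # (30, 4), (4,), (4, 500)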
Example 23
clf = NMF(n_components=5, random_state=0)
W = clf.fit_transform(vectors)
H = clf.components_

print(W, H)
plt.figure(2)
plt.plot(H[0])

# check whether the factorization reproduces the original matrix
# NMF is not an exact decomposition
print(np.allclose(W @ H, vectors))

# TF-IDF
vectorizer_tfidf = TfidfVectorizer(stop_words='english')
vectors_tfidf = vectorizer_tfidf.fit_transform(news_train.data)    # (documents, vocab)
W1 = clf.fit_transform(vectors_tfidf)
H1 = clf.components_
# reconstruction error
# Frobenius norm of V−WH
print(clf.reconstruction_err_)

# importance of the first component
plt.figure(3)
plt.plot(H1[0])

# randomized SVD
u, s, v = randomized_svd(vectors, 5)
print(u, s, v)

plt.show()
Example 24
def save_pc_values(dataset_name,
                   model_dir,
                   batch_size=10000,
                   top=None,
                   bottom=None,
                   save_dim_coefficients=False,
                   frac=0.5):
    """Save first PC-related metadata for each layer activation.

  Either output projected values onto the first PC and the
  fraction of variation explained, or the PC itself.
  """
    out_dir = os.path.join(model_dir, 'all_pc_values_%d.pkl' % batch_size)
    out_dir_explained_ratio = os.path.join(
        model_dir, 'all_pc_explained_ratio_%d.pkl' % batch_size)
    if top is not None:  # remove outliers and recalculate PC
        out_dir = out_dir.replace('.pkl', '_no_outlier_remove_%.2f.pkl' % frac)
        out_dir_explained_ratio = out_dir_explained_ratio.replace(
            '.pkl', '_no_outlier_remove_%.2f.pkl' % frac)

    if tf.io.gfile.exists(out_dir_explained_ratio):
        exit()

    model = tf.keras.models.load_model(model_dir)
    if 'weights' in model_dir or 'copy-10' in model_dir or 'copy-11' in model_dir or 'copy-12' in model_dir:
        model = convert_bn_to_train_mode(model)
        out_dir = out_dir.replace('.pkl', '_bn_train_mode.pkl')
        out_dir_explained_ratio = out_dir_explained_ratio.replace(
            '.pkl', '_bn_train_mode.pkl')

    test_dataset = load_test_data(batch_size, dataset_name=dataset_name)
    images, _ = test_dataset.__iter__().next()
    all_activations = get_activations(images, model)
    bs = images.numpy().shape[0]
    all_pc_values = []
    all_pc_explained_ratio = []
    all_coefficients = []
    all_coefficients_no_outliers = []
    for i, act in enumerate(all_activations):
        if top is not None and (i > top or i < bottom):
            continue
        act = act.reshape(bs, -1)
        processed_act = act - np.mean(act, axis=0)
        svd = TruncatedSVD(n_components=1, random_state=0)
        svd.fit(processed_act.T)
        act_pc = svd.components_.squeeze()

        if save_dim_coefficients:  # save the PC itself
            U, _, _ = randomized_svd(processed_act.T,
                                     n_components=1,
                                     n_iter=5,
                                     random_state=None)
            all_coefficients.append(U.squeeze())

        if top is None:
            all_pc_values.append(act_pc)
            all_pc_explained_ratio.append(svd.explained_variance_ratio_[0])
        else:  # remove outliers for the corresponding layers
            n_examples = len(act_pc)
            outlier_idx = np.argsort(np.abs(act_pc))[int(
                n_examples * frac):]  # remove the bottom {frac} of the data
            selected_idx = np.array(
                [i for i in range(bs) if i not in outlier_idx])
            act_no_outlier = act[selected_idx, :]
            processed_act = act_no_outlier - np.mean(act_no_outlier, axis=0)

            if save_dim_coefficients:
                U, _, _ = randomized_svd(processed_act.T,
                                         n_components=1,
                                         n_iter=5,
                                         random_state=None)
                all_coefficients_no_outliers.append(U.squeeze())
            else:
                svd = TruncatedSVD(n_components=1, random_state=0)
                svd.fit(processed_act.T)
                act_pc = svd.components_.squeeze()
                all_pc_values.append(act_pc)
                all_pc_explained_ratio.append(svd.explained_variance_ratio_[0])

    if save_dim_coefficients:
        out_dir = os.path.join(model_dir,
                               'all_dim_coefficients_%d.pkl' % batch_size)
        pickle.dump(all_coefficients, tf.io.gfile.GFile(out_dir, 'wb'))
        out_dir = os.path.join(
            model_dir,
            'all_dim_coefficients_%d_no_outlier_remove_half.pkl' % batch_size)
        pickle.dump(all_coefficients_no_outliers,
                    tf.io.gfile.GFile(out_dir, 'wb'))
    else:
        pickle.dump(all_pc_values, tf.io.gfile.GFile(out_dir, 'wb'))
        pickle.dump(all_pc_explained_ratio,
                    tf.io.gfile.GFile(out_dir_explained_ratio, 'wb'))
Example 25
    def __init__(self, working_dir):
        #Check if we have a gpu, otherwise, set self.device to cpu:
        if torch.cuda.is_available():
            gpu_memory_map = get_gpu_memory_map()
            if gpu_memory_map[0] < gpu_memory_map[1]:
                self.device = torch.device('cuda:0')
            else:
                self.device = torch.device('cuda:1')
        else:
            self.device = torch.device('cpu')
        self.working_dir = working_dir
        try:
            with open(self.working_dir + 'parameters_algo.json',
                      'r') as read_file_parameters_algo:
                parameters_algo = json.load(read_file_parameters_algo)
        except FileNotFoundError:
            print('working_dir not found')
        self.data_name = parameters_algo['data_name']
        self.parameters_algo = verify_parameters_algo(parameters_algo)
        # if no data_path is given, use the default path
        if 'data_path' in parameters_algo:
            self.data, self.angles, self.psf, self.center_image = automatic_load_data(
                self.data_name,
                self.device,
                channel=self.parameters_algo['channel'],
                crop=self.parameters_algo['crop'],
                data_dir=parameters_algo['data_path'])
        else:
            self.data, self.angles, self.psf, self.center_image = automatic_load_data(
                self.data_name,
                self.device,
                channel=self.parameters_algo['channel'],
                crop=self.parameters_algo['crop'])
        self.t, self.n, _ = self.data.shape
        self.kernel = torch.fft.fft2(torch.fft.fftshift(self.psf))
        self.data_np = self.data.cpu().detach().numpy()
        #self.psf = self.psf[self.n//2-10:self.n//2+10,self.n//2-10:self.n//2+10]
        #if 'center_image' in parameters_algo:
        #    self.center_image = tuple(parameters_algo['center_image'])
        #else:
        #    self.center_image = False
        mask_np = get_mask(self.n, self.parameters_algo['mask_center'],
                           self.center_image)
        self.mask = torch.from_numpy(mask_np).to(self.device)
        dtype = torch.FloatTensor  # todo : check how to deal with this
        # Define the rotation matrices:
        self.pa_rotate_matrix = get_all_rotation_matrices(
            self.angles, self.center_image, dtype).to(self.device)
        self.pa_derotate_matrix = get_all_rotation_matrices(
            -self.angles, self.center_image, dtype).to(self.device)
        #check if there is any synthetic data to add (only used to create synthetic data to test mayo)
        try:
            with open(self.working_dir + 'add_synthetic_signal.json',
                      'r') as read_file_add_synthetic_signal:
                add_synthetic_signal = json.load(
                    read_file_add_synthetic_signal)
                self.data_np, self.synthetic_disc_planet = create_synthetic_data_with_disk_planet(
                    self.data_np, self.pa_rotate_matrix.to('cpu'),
                    self.kernel.cpu().detach().numpy(), add_synthetic_signal)
                self.data = torch.from_numpy(self.data_np).to(self.device)
                print('Synthetic signal added to data')
        except FileNotFoundError:
            pass
        self.matrix = self.data.reshape(self.t, self.n * self.n)
        if 'ref_cube' in self.parameters_algo:
            try:
                if 'data_path' in parameters_algo:
                    path_ref_cube = parameters_algo[
                        'data_path'] + self.data_name + '/' + self.parameters_algo[
                            'ref_cube'][self.parameters_algo['channel']]
                else:
                    print('warning: we need to fix the default data path')
                    path_ref_cube = "D:/HCImaging/Data/" + self.data_name + '/' + self.parameters_algo[
                        'ref_cube'][self.parameters_algo['channel']]

                ref_cube = mayo_hci.open_fits(path_ref_cube, device='cpu')
                n_frames_ref, _, _ = ref_cube.shape
                _, _, V_T = np.linalg.svd(ref_cube.view(
                    n_frames_ref, self.n * self.n),
                                          full_matrices=False)
                self.V = torch.from_numpy(V_T.T).to(self.device)
            except FileNotFoundError:
                raise FileNotFoundError("Ref cube file not found!")
            self.FRIES = True
        else:
            self.FRIES = False
        self.run_GreeDS()  # step 3 in algorithm 2 from Pairet etal 2020
        self.xd = self.GreeDS_frame.clone()
        self.residuals = self.GreeDS_frame.clone()
        self.xp = torch.zeros((self.n, self.n), device=self.device)
        if self.FRIES:
            self.V = self.V[:, :self.parameters_algo['rank']]
        else:
            U_L0_np, _, _ = randomized_svd(
                self.xl.reshape(self.t,
                                self.n * self.n).cpu().detach().numpy(),
                n_components=self.parameters_algo['rank'],
                n_iter=5,
                transpose='auto')
            self.U_L0 = torch.from_numpy(U_L0_np).to(self.device)
        self.define_optimization_function()  # dummy definition of the function, redefined in child classes
Example 26
 def _truncatedSVD(self):
     from sklearn.decomposition import randomized_svd
     self.U, s, self.VT = randomized_svd(self.DTM,
                                         n_components=self.k,
                                         n_iter=10)
Example 27
File: svd.py Project: kakasoueu/VIP
def svd_wrapper(matrix,
                mode,
                ncomp,
                debug,
                verbose,
                usv=False,
                random_state=None,
                to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU). 
      
    Parameters
    ----------
    matrix : array_like, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used. ``lapack`` uses the LAPACK 
        linear algebra library through Numpy and it is the most conventional way 
        of computing the SVD (deterministic result computed on CPU). ``arpack`` 
        uses the ARPACK Fortran libraries accessible through Scipy (computation
        on CPU). ``eigen`` computes the singular vectors through the 
        eigendecomposition of the covariance M.M' (computation on CPU).
        ``randsvd`` uses the randomized_svd algorithm implemented in Sklearn 
        (computation on CPU). ``cupy`` uses the Cupy library for GPU computation
        of the SVD as in the LAPACK version. ``eigencupy`` offers the same 
        method as with the ``eigen`` option but on GPU (through Cupy). 
        ``randcupy`` is an adaptation of the randomized_svd algorithm, where all
        the computations are done on a GPU (through Cupy). ``pytorch`` uses the
        Pytorch library for GPU computation of the SVD. ``eigenpytorch`` offers
        the same method as with the ``eigen`` option but on GPU (through
        Pytorch). ``randpytorch`` is an adaptation of the randomized_svd
        algorithm, where all the linear algebra computations are done on a GPU
        (through Pytorch).
    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular 
        vectors is truncated. 
    debug : bool
        If True the explained variance ratio is computed and displayed.
    verbose: bool
        If True intermediate information is printed out.
    usv : bool optional
        If True the 3 terms of the SVD factorization are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    The right singular vectors of the input matrix. If ``usv`` is True it 
    returns the left and right singular vectors and the singular values of the
    input matrix.
    
    References
    ----------
    * For ``lapack`` SVD mode see:
    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
    http://www.netlib.org/lapack/
    
    * For ``eigen`` mode see:
    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    
    * For ``arpack`` SVD mode see:
    https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
    http://www.caam.rice.edu/software/ARPACK/
    
    * For ``randsvd`` SVD mode see:
    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
    Finding structure with randomness: Stochastic algorithms for constructing
    approximate matrix decompositions
    Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    
    * For ``cupy`` SVD mode see:
    https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    
    * For ``eigencupy`` mode see:
    https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html

    * For ``pytorch`` SVD mode see:
    http://pytorch.org/docs/master/torch.html#torch.svd

    * For ``eigenpytorch`` mode see:
    http://pytorch.org/docs/master/torch.html#torch.eig

    """
    def reconstruction(ncomp, U, S, V, var=1):
        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {} PCs:'.format(ncomp))
            print('  Mean Absolute Error =', MAE(matrix, rec_matrix))
            print('  Mean Squared Error =', MSE(matrix, rec_matrix))

            # see https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
            exp_var = (S**2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode == 'eigen':
            exp_var = (S**2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
            exp_var = (S**2) / (S.shape[0] - 1)
            full_var = np.var(matrix, axis=0).sum()
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            if var == 1:
                pass
            else:
                explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = '  This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        lw = 2
        alpha = 0.4
        fig = plt.figure(figsize=(6, 3))
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(range(explained_variance_ratio.shape[0]),
                 explained_variance_ratio,
                 alpha=alpha,
                 where='mid',
                 label='Individual EVR',
                 lw=lw)
        ax1.plot(ratio_cumsum,
                 '.-',
                 alpha=alpha,
                 label='Cumulative EVR',
                 lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0] + 10)
        ax1.set_ylim(0, 1)

        trunc = 20
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        # plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(range(trunc),
                 explained_variance_ratio[:trunc],
                 alpha=alpha,
                 where='mid',
                 lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = '  Cumulative explained variance ratio for {} PCs = {:.5f}'
        # plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    # --------------------------------------------------------------------------

    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if usv:
        if mode not in ('lapack', 'arpack', 'randsvd', 'cupy', 'randcupy',
                        'pytorch', 'randpytorch'):
            msg = "Returning USV is supported with modes lapack, arpack, "
            msg += "randsvd, cupy, randcupy, pytorch or randpytorch"
            raise ValueError(msg)

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building the covariance as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)  # covariance matrix
        e, EV = linalg.eigh(C)  # eigenvalues and eigenvectors
        pc = np.dot(EV.T, matrix)  # PCs using a compact trick when cov is MM'
        V = pc[::-1]  # reverse since last eigenvectors are the ones we want
        S = np.sqrt(e)[::-1]  # reverse since eigenvalues are in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S  # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking the SVD of M'
        # and keeping the left (transposed) SVs is faster than taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V)
        V = V[:ncomp]  # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix,
                                 n_components=ncomp,
                                 n_iter=2,
                                 transpose='auto',
                                 random_state=random_state)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu,
                                               full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if usv:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)  # covariance matrix
        e, EV = cupy.linalg.eigh(C)  # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)  # PCs using a compact trick when cov is MM'
        V = pc[::-1]  # reverse since last eigenvectors are the ones we want
        S = cupy.sqrt(e)[::-1]  # reverse since eigenvalues are in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S  # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD mode is not available')

    if usv:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
Esempio n. 28
0
 def _truncatedSVD(self):
     self.U, s, self.VT = randomized_svd(self.DTM,
                                         n_components=self.k,
                                         n_iter=10)
Esempio n. 29
0
 def reduce(X, n_components, power=0.0):
     U, Sigma, VT = randomized_svd(X, n_components=n_components)
     # note: TruncatedSVD always multiplies U by Sigma; here the result can be
     # tuned by using U alone (power=0) or by raising Sigma to an arbitrary power
     return U * (Sigma**power)
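
# Usage sketch (not part of the original example), assuming the `reduce` function
# above and `from sklearn.utils.extmath import randomized_svd` are in scope:
# power=0 keeps only the unscaled left singular vectors, while power=1 roughly
# reproduces the usual U * Sigma output of TruncatedSVD.
import numpy as np

X_demo = np.random.RandomState(0).rand(200, 50)
X_dirs = reduce(X_demo, n_components=10, power=0.0)    # directions only (plain U)
X_scaled = reduce(X_demo, n_components=10, power=1.0)  # scaled by singular values
print(X_dirs.shape, X_scaled.shape)                    # (200, 10) (200, 10)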
Esempio n. 30
0
import numpy as np
import matplotlib.pyplot as plt
# randomized_svd lives in sklearn.utils.extmath (sklearn.decomposition does not export it)
from sklearn.utils.extmath import randomized_svd

A = [[1, 0, 1, 0, 0], [1, 1, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 1, 0, 0],
     [0, 0, 0, 1, 0], [0, 0, 1, 1, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 1]]

name_word = [
    "romeo", "juliet", "happy", "dagger", "live", "die", "free",
    "new-hampshire"
]
name_doc = ["D1", "D2", "D3", "D4", "D5"]

A = np.asarray(A)

#Computing SVD
U, Sigma, VT = randomized_svd(A, n_components=2, n_iter=1, random_state=None)

#Sigma is a diagonal matrix, but randomized_svd returns a vector so we have to change it back to a diagonal matrix
Sigma = np.diag(Sigma)

#Compute the vectors for each word and each document
Word = np.dot(U, Sigma)
Doc = np.dot(Sigma, VT)

#Scatter plot the resulting vectors
fig, ax = plt.subplots()
ax.scatter(Word.T[0], Word.T[1])
ax.scatter(Doc[0], Doc[1])

# the source snippet is truncated below; this hedged completion simply annotates
# every word vector (the original appears to have treated the word "free" specially)
for i, txt in enumerate(name_word):
    ax.annotate(txt, (Word[i, 0], Word[i, 1]))
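
# Follow-on sketch (not in the source): compare documents in the reduced LSA space
# with cosine similarity, using the 2-D document coordinates in the columns of Doc.
def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

for i in range(Doc.shape[1]):
    for j in range(i + 1, Doc.shape[1]):
        print(name_doc[i], name_doc[j], round(cosine(Doc[:, i], Doc[:, j]), 3))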
Esempio n. 31
0
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU). 
      
    Parameters
    ----------
    matrix : array_like, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used. ``lapack`` uses the LAPACK 
        linear algebra library through Numpy and it is the most conventional way 
        of computing the SVD (deterministic result computed on CPU). ``arpack`` 
        uses the ARPACK Fortran libraries accessible through Scipy (computation
        on CPU). ``eigen`` computes the singular vectors through the 
        eigendecomposition of the covariance M.M' (computation on CPU).
        ``randsvd`` uses the randomized_svd algorithm implemented in Sklearn 
        (computation on CPU). ``cupy`` uses the Cupy library for GPU computation
        of the SVD as in the LAPACK version. ``eigencupy`` offers the same 
        method as with the ``eigen`` option but on GPU (through Cupy). 
        ``randcupy`` is an adaptation of the randomized_svd algorithm, where all
        the computations are done on a GPU (through Cupy). ``pytorch`` uses the
        Pytorch library for GPU computation of the SVD. ``eigenpytorch`` offers
        the same method as with the ``eigen`` option but on GPU (through
        Pytorch). ``randpytorch`` is an adaptation of the randomized_svd
        algorithm, where all the linear algebra computations are done on a GPU
        (through Pytorch).
    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular 
        vectors is truncated. 
    debug : bool
        If True the explained variance ratio is computed and displayed.
    verbose: bool
        If True intermediate information is printed out.
    usv : bool optional
        If True the 3 terms of the SVD factorization are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : array_like
        The right singular vectors of the input matrix. If ``usv`` is True it
        returns the left and right singular vectors and the singular values of
        the input matrix.
    
    References
    ----------
    * For ``lapack`` SVD mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
        http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
        https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
        https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
        http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
        https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
        Finding structure with randomness: Stochastic algorithms for constructing
        approximate matrix decompositions
        Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
        https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
        https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
        http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
        http://pytorch.org/docs/master/torch.html#torch.eig

    """

    def reconstruction(ncomp, U, S, V, var=1):
        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {} PCs:'.format(ncomp))
            print('  Mean Absolute Error =', MAE(matrix, rec_matrix))
            print('  Mean Squared Error =', MSE(matrix, rec_matrix))

            # see https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var   # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode == 'eigen':
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var   # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.var(matrix, axis=0).sum()
            explained_variance_ratio = exp_var / full_var   # % of variance explained by each PC
            if var == 1:
                pass
            else:
                explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = '  This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        lw = 2; alpha = 0.4
        fig = plt.figure(figsize=vip_figsize)
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(range(explained_variance_ratio.shape[0]),
                 explained_variance_ratio, alpha=alpha, where='mid',
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha,
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0] + 10)
        ax1.set_ylim(0, 1)

        trunc = 20
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        # plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(range(trunc), explained_variance_ratio[:trunc], alpha=alpha,
                 where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = '  Cumulative explained variance ratio for {} PCs = {:.5f}'
        # plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    # --------------------------------------------------------------------------

    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if usv:
        if mode not in ('lapack', 'arpack', 'randsvd', 'cupy', 'randcupy',
                        'pytorch', 'randpytorch'):
            msg = "Returning USV is supported with modes lapack, arpack, "
            msg += "randsvd, cupy, randcupy, pytorch or randpytorch"
            raise ValueError(msg)

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request fewer PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)        # covariance matrix
        e, EV = linalg.eigh(C)              # EVals and EVs
        pc = np.dot(EV.T, matrix)           # PCs using a compact trick when cov is MM'
        V = pc[::-1]                        # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))              # SVals = sqrt(EVals)
        S = S[::-1]                         # reverse since EVals go in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S                    # scaling EVs by the square root of EVals
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking the SVD of M'
        # and keeping the left (transposed) SVs is faster than taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V)
        V = V[:ncomp]                       # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if usv:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)         # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)        # covariance matrix
        e, EV = cupy.linalg.eigh(C)         # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)          # PCs using a compact trick when cov is MM'
        V = pc[::-1]                        # reverse since last eigenvectors are the ones we want
        S = cupy.sqrt(e)[::-1]              # reverse since eigenvalues are in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S                    # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD mode is not available')

    if usv:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
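
# Minimal usage sketch (not in the source), assuming numpy is imported at module
# level as in the function above; values and shapes are illustrative only.
rng = np.random.RandomState(42)
cube = rng.rand(100, 4096)          # e.g. 100 frames of flattened 64x64 images
cube -= cube.mean(axis=0)           # temporal mean subtraction (mean centering)

# default: only the ncomp right singular vectors (principal components) are returned
V = svd_wrapper(cube, mode='lapack', ncomp=10, debug=False, verbose=True)
print(V.shape)                      # (10, 4096)

# usv=True returns the truncated factorization U, S, V instead
U, S, V = svd_wrapper(cube, mode='randsvd', ncomp=10, debug=False, verbose=False,
                      usv=True, random_state=0)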
Esempio n. 32
0
 def _max_singular_value(self, X_filled):
     # quick decomposition of X_filled into rank-1 SVD
     _, s, _ = randomized_svd(X_filled, 1, n_iter=5)
     return s[0]
def svd(X, K):
    # project X onto its top-K right singular vectors and L2-normalise the rows;
    # `normalizer` is assumed to be defined at module level
    # (e.g. sklearn.preprocessing.Normalizer())
    _, _, Vt = randomized_svd(X, n_components=K)
    X_red = X.dot(Vt.T)
    X_red = normalizer.fit_transform(X_red)
    return X_red
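
# Usage sketch (not in the source), under the assumption that `normalizer` is a
# sklearn Normalizer instance as hinted above:
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.utils.extmath import randomized_svd

normalizer = Normalizer()                        # assumed module-level dependency
X_demo = np.random.RandomState(1).rand(50, 300)
X_red = svd(X_demo, K=20)
print(X_red.shape)                               # (50, 20), rows have unit L2 norm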
Esempio n. 34
0
    for j in range(1, 3):
        print("Similarity Score:", sims[j][1])
        print("what", sims[j][0])
        print("Rank", j, titles[sims[j][0]])
        print("Abstract:", abstracts[sims[j][0]])
        print("PDF:", "www.openreview.net" + pdfLinks[sims[j][0]])
        print("Venue:", venues[sims[j][0]], '\n')
    print('---')

threshold = 0.25
#vec_lsi2 = corpus_lsi[0]
#print(vec_lsi2)
#print(len(corpus_lsi))
adj_matrix = sparse.lil_matrix((len(corpus_lsi), len(corpus_lsi)),
                               dtype=np.float32)
for i in range(len(corpus_lsi)):
    vec_lsi = corpus_lsi[i]
    sim_scores = index[vec_lsi]
    for j in range(len(sim_scores)):
        if sim_scores[j] > threshold:
            adj_matrix[i, j] = 1.0  #sim_scores[j]
        else:
            adj_matrix[i, j] = 0.0
adj_matrix = adj_matrix.tocsr()
U, s, V = randomized_svd(adj_matrix, 5, n_iter=3)
pprint([titles[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([titles[i] for i in np.abs(V[0]).argsort()[-10:]])
print('-----')
scores = centrality_scores(adj_matrix, max_iter=100, tol=1e-10)
pprint([titles[i] for i in np.abs(scores).argsort()[-10:]])
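
# `centrality_scores` is not defined in this snippet; the helper below is a hedged
# stand-in for the function called above (name and behaviour assumed, not the
# original): plain power iteration returning scores proportional to the principal
# eigenvector of the adjacency matrix (eigenvector centrality).
import numpy as np

def centrality_scores(A, max_iter=100, tol=1e-10):
    n = A.shape[0]
    scores = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        new_scores = A.dot(scores)        # works for scipy.sparse matrices too
        nrm = np.linalg.norm(new_scores)
        if nrm == 0:
            break
        new_scores /= nrm
        if np.abs(new_scores - scores).sum() < tol:
            scores = new_scores
            break
        scores = new_scores
    return scores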
Esempio n. 35
0
def decomp_exp(param_name, param_range, model_type, other_params, metrics):
    X_train_m, X_val_m, X_test_m, y_train_m, y_val_m, y_test_m, class_names_m = load_data('motions',
                                                                                          scale=True, valset=True)
    X_train_p, X_val_p, X_test_p, y_train_p, y_val_p, y_test_p, class_names_p = load_data('particles',
                                                                                          scale=True, valset=True)

    result = defaultdict(list)
    for param in param_range:
        # print("testing number of clusters = {}".format(param))
        params = {param_name: param}
        # Motions
        t0 = time.time()

        if model_type=="pca":
            cluster_m = PCA(**params, **other_params)
        elif model_type=="ica":
            cluster_m = FastICA(**params, **other_params)
        elif model_type == "rand_svd":
            cluster_m = randomized_svd(**params, **other_params)
        else:
            sys.exit("please select a valid model type, either 'pca' or 'ica', or 'rand_svd'")

        fit_m = cluster_m.fit(X_train_m)
        result['time_fit_m'].append(time.time() - t0)
        result['score_m'].append(cluster_m.score(X_train_m)/X_train_m.shape[1])
        result['score_val_m'].append(cluster_m.score(X_val_m)/X_val_m.shape[1])

        result['explained_variance_ratio'].append(cluster_m.explained_variance_ratio_)
        result['components'].append(cluster_m.components_)

        t0 = time.time()
        y_pred_train_m = fit_m.predict(X_train_m)
        y_pred_val_m = fit_m.predict(X_val_m)
        result['time_pred_m'].append(time.time() - t0)

        for metric in metrics:
            result[metric+"_m"].append(getattr(sys.modules[__name__], metric)(y_train_m, y_pred_train_m))
            result[metric+'_val_m'].append(getattr(sys.modules[__name__], metric)(y_val_m, y_pred_val_m))

        # Particles
        t0 = time.time()
        if model_type=="gaussian mixture":
            cluster_p = GaussianMixture(**params, **other_params)
        elif model_type=="kmeans":
            cluster_p = KMeans(**params, **other_params)
        else:
            sys.exit("please select either 'gaussian mixture' or 'kmeans'")

        fit_p = cluster_p.fit(X_train_p)
        result['time_fit_p'].append(time.time() - t0)
        result['score_p'].append(cluster_p.score(X_train_p)/X_train_p.shape[1])
        result['score_val_p'].append(cluster_p.score(X_val_p)/X_val_p.shape[1])

        result['explained_variance_ratio'].append(cluster_p.explained_variance_ratio_)
        result['components'].append(cluster_p.components_)

        t0 = time.time()
        y_pred_train_p = fit_p.predict(X_train_p)
        y_pred_val_p = fit_p.predict(X_val_p)
        result['time_pred_p'].append(time.time() - t0)

        for metric in metrics:
            result[metric+"_p"].append(getattr(sys.modules[__name__], metric)(y_train_p, y_pred_train_p))
            result[metric+'_val_p'].append(getattr(sys.modules[__name__], metric)(y_val_p, y_pred_val_p))

    result['param_range'] = param_range
    result['param_name'] = param_name
    result['model_type'] = model_type
    result['metrics'] = metrics

    return result
def reduce(X, n_components, power=0.0):
    U, Sigma, VT = randomized_svd(X, n_components=n_components)
    return U * (Sigma**power)