Example 1
def pseudoinverse(Mat, precision):
    """
    Pseudoinverse computation.

    Objective:
    ----------
    To compute the pseudoinverse using Singular Value Decomposition (SVD).

    Reason:
    -------
    SVD using SciPy is slow and consumes a lot of memory; similarly, a
    pysparse matrix consumes a lot of memory. This is a better
    alternative to a direct computation of the inverse.

    Process:
    --------
    The function uses sparsesvd to compute the SVD of a sparse matrix. The
    precision argument controls the truncation rank k of the SVD: precision
    is a percentage, and k is derived from it as

        k = (precision / 100) * min(rows, columns) of the matrix.


    The function takes a sparse matrix and a precision score as the input.

    """
    matrix = Mat.tocsc()
    if matrix.shape[0] <= matrix.shape[1]:

        k = int((precision * matrix.shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)

        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del ut, s, vt, UT, SI, VT, temp_matrix

    else:

        k = int((precision * matrix.transpose().shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.transpose().tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)

        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del ut, s, vt, UT, SI, VT, temp_matrix

    return pinv_matrix.tocsr()
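The function above assembles the pseudoinverse as V * S^-1 * U^T from the truncated SVD. A minimal standalone sketch of that identity (the random test matrix, the precision value of 80 and the dense np.diag product are illustrative assumptions, not part of the original project):

import numpy as np
import scipy.sparse as ss
from sparsesvd import sparsesvd

mat = ss.rand(50, 200, density=0.05, format='csc')   # 50 rows <= 200 columns
k = int((80 * mat.shape[0]) / 100)                    # precision = 80 percent
ut, s, vt = sparsesvd(mat, k)
pinv = vt.T.dot(np.diag(1.0 / s)).dot(ut)             # V * S^-1 * U^T
print(pinv.shape)                                     # (200, 50)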
Example 2
def pseudoinverse(Mat, precision):
    """
    Pseudoinverse computation.

    Objective:
    ----------
    To compute the pseudoinverse using Singular Value Decomposition (SVD).

    Reason:
    -------
    SVD using SciPy is slow and consumes a lot of memory; similarly, a
    pysparse matrix consumes a lot of memory. This is a better
    alternative to a direct computation of the inverse.

    Process:
    --------
    The function uses sparsesvd to compute the SVD of a sparse matrix. The
    precision argument controls the truncation rank k of the SVD: precision
    is a percentage, and k is derived from it as

        k = (precision / 100) * min(rows, columns) of the matrix.


    The function takes a sparse matrix and a precision score as the input.

    """
    matrix = Mat.tocsc()
    if matrix.shape[0] <= matrix.shape[1]:

        k = int((precision * matrix.shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)

        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del ut, s, vt, UT, SI, VT, temp_matrix

    else:

        k = int((precision * matrix.transpose().shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.transpose().tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)

        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del ut, s, vt, UT, SI, VT, temp_matrix

    return pinv_matrix.tocsr()
	def tune(my_corpus, dictionary, min_topics=2,max_topics=50,step=2):
		def sym_kl(p,q):
			return np.sum([scipy.stats.entropy(p,q),scipy.stats.entropy(q,p)])

		kl = []
		Hbar = []
		perplexity = []
		n_topics = []
		l = np.array([sum(cnt for _, cnt in doc) for doc in my_corpus])
		corpus = Index.get_corpus('train features')
		for i in range(min_topics,max_topics,step):
			n_topics.append(i)
			lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,num_topics=i, alpha = 'auto')
			m1 =  scipy.sparse.csc_matrix(lda.expElogbeta)
			U,cm1,V = sparsesvd(m1, m1.shape[0])
			#Document-topic matrix
			lda_topics = lda[my_corpus]
			m2 = gensim.matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
			cm2 = l.dot(m2)
			cm2 = cm2 + 0.0001
			cm2norm = np.linalg.norm(l)
			cm2 = cm2/cm2norm
			kl.append(sym_kl(cm1,cm2))
			entropy_list = [scipy.stats.entropy([x[1] for x in lda[v]] ) for v in my_corpus]
			Hbar.append(np.mean(entropy_list))
			perplexity.append( lda.log_perplexity(my_corpus) )
			print("NumTopics: %s | Unscaled Entropy: %s | Per-word-bound: %s | Per-word-perplexity: %s | Arun measure %s" % \
					(i, Hbar[-1], perplexity[-1], np.exp2(-perplexity[-1]), kl[-1]))
		return n_topics, Hbar, perplexity, kl
def LSA(M, k):  ##will return top k sentences
    SM = scipy.sparse.csc_matrix(M)  # convert to sparse CSC format
    u, s, vt = sparsesvd(SM, k + 10)  #
    ##SVD calculated at this stage, concept matrix vt, from now we can apply various approaches
    ##to filter out top k sentences.
    ##We are using OzSoy's approach
    ##Using Cross Method
    m, n = M.shape

    Avg = numpy.average(M, 1)
    for i in range(0, m):
        for j in range(0, n):
            if M[i][j] < Avg[i]:
                M[i][j] = 0
    Length = numpy.dot(s, vt)
    L = []
    ##returning top k sentences
    for i in range(0, n):
        L.append(tuple([Length[i], i]))

    if k >= len(L):
        return L
    #building min heap

    count = int(k / 2 - 1)

    while (count >= 0):
        L = heapify(L, count, k)
        count -= 1
    for i in range(k, len(L)):
        if L[0][0] < L[i][0]:
            L[0] = L[i]
            L = heapify(L, 0, k)
    return L[:k]
Example 5
def generate_model(in_path, title_limit, user_limit, features, out_path):
    # connect to db
    db = pg.connect(in_path)
    # load scores
    scores = load_scores(db)
    db.close()
    print "Loaded scores"
    # filter insignificant titles/users, second filtering to remove empty cols/rows
    (mat, old_ids_1) = filter_too_small(scores, title_limit, user_limit)
    (mat, old_ids_2) = filter_too_small(mat.tocsc(), 1, 1)
    print "Filtered insignificant titles and users"
    # matrix is in csr format, calc row nnz averages and convert to csc
    averages = map(lambda x: row_nnz_average(mat,x), range(0, mat.shape[0]))
    mat = mat.tocsc()
    # build compact titleid translation tables
    old_ids = join_old_id_dicts(old_ids_1, old_ids_2)
    (title_to_document, document_to_tile) = build_title_mapping(old_ids, mat.shape[0])
    # run svd
    print "Built additional data"
    (ut, s, vt) = sparsesvd(mat.tocsc(), features)
    print "Factorization finished"
    s_sqrt = numpy.diag(numpy.sqrt(s))
    s_inv = numpy.diag(numpy.power(s,-1))
    terms = ut.transpose().dot(s_sqrt)
    documents = s_sqrt.dot(s_inv).dot(ut)
    # dump results
    savemat(out_path, {"Terms": terms, "Documents": documents, "Averages": averages, "TitleMapping": title_to_document, "DocumentMapping" : document_to_tile}, oned_as='row')
    print "Saved generated results"
Example 6
def append_lines_update_svd(m_old, m_new):
    if m_old.shape[1] != m_new.shape[1]:
        print(
            '\nAppend_Lines: the new matrix must have the same number of columns as the old matrix!'
        )
        return
    m, n = m_old.shape
    factor = min(m, n)

    # factor = 5
    c = m_new.shape[0] - m
    # print("c {}, factor {}".format(c, factor))
    temp = sps.csc_matrix(m_old)
    u, s, v = sparsesvd.sparsesvd(temp, factor)

    u = u.T
    # print("u shape {}".format(u.shape))

    B = m_new[m:, :].T
    A = np.concatenate((np.zeros((m, c)), np.eye(c)), axis=0)
    S = np.diag(s)
    U = np.concatenate((u, np.zeros((c, u.shape[1]))), axis=0)
    V = v.T
    # print("u shape {}, A shape {}, U shape {}, V shape {}".format(u.shape, A.shape, U.shape, V.shape))
    u_new, s_new, v_new = increment_svd(U, S, V, A, B)

    e = np.linalg.norm(m_new - np.dot(u_new, np.dot(s_new, v_new.T)), 2)
    print('Error is', e)
Example 7
def computeSVD(urm, K):
    U, s, Vt = sparsesvd(urm, K)
    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i, i] = math.sqrt(s[i])
    return np.transpose(U), S.dot(Vt)
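A hedged usage sketch for computeSVD above; the random rating matrix, its size and the import aliases are assumptions for illustration:

import math
import numpy as np
from scipy.sparse import csc_matrix
from sparsesvd import sparsesvd

urm = csc_matrix(np.random.randint(0, 6, size=(30, 20)).astype(np.float64))
U, SVt = computeSVD(urm, K=5)
print(U.shape, SVt.shape)   # expected: (30, 5) and (5, 20)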
    def build(self):
        # Strictly speaking, enumeration order of Python dict's is not defined.
        # In other words, iterating over same dict twice may yield different results. So, we have to keep mapping
        # between user/item, and row/column.
        self.user_row_index = dict()  # type: Dict[str, int]
        self.item_col_index = dict()  # type: Dict[str, int]
        self.rating_matrix_demeaned = dok_matrix((len(self.user_histories), len(self.item_histories)))

        for col_index, (item_id, histories) in enumerate(self.item_histories.items()):
            self.item_col_index[item_id] = col_index
            avg_rating = np.mean([x.rating for x in histories])
            self.item_average_rating[item_id] = avg_rating
            for record in histories:
                row_index = self.user_row_index.get(record.user_id, len(self.user_row_index))
                self.user_row_index[record.user_id] = row_index
                self.rating_matrix_demeaned[row_index, col_index] = record.rating - avg_rating
        self.global_average = np.mean(list(self.item_average_rating.values()))
        u, s, v = sparsesvd(csc_matrix(self.rating_matrix_demeaned), self.components)

        row_vectors = u.T
        self.col_vectors = np.dot(np.diag(s), v).T
        self.col_vectors_inv = np.linalg.pinv(self.col_vectors.transpose())
        for u, row in self.user_row_index.items():
            self.user_vectors[u] = row_vectors[row]
        for i, col in self.item_col_index.items():
            self.item_vectors[i] = self.col_vectors[col]
Example 9
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <repres> <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
        --k NUM [default: 1]
    """)

    repres = args['<repres>']
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    k = int(args['--k'])

    if (repres == "BPMI"):
        explicit = BinExplicit(pmi_path, normalize=False)
    elif (repres == "PMI"):
        explicit = NoExplicit(pmi_path, normalize=False, k=k)
    elif (repres == "NPMI"):
        explicit = NegExplicit(pmi_path, normalize=False)
    else:
        explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
Example 10
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--ppmi_file", type=str, required=True,
                        help="Path to the counts (matrix) file.")
    parser.add_argument("--svd_file", type=str, required=True,
                        help="Path to the SVD file.")
    parser.add_argument("--input_vocab_file", type=str, required=True,
                        help="Path to the input vocabulary file.")
    parser.add_argument("--output_vocab_file", type=str, required=True,
                        help="Path to the output vocabulary file.")

    parser.add_argument("--size", type=int, default=100,
                        help="Vector size.")
    parser.add_argument("--normalize", action="store_true",
                        help="If set, we factorize normalized PPMI matrix")

    args = parser.parse_args()

    print("Ppmi2svd")
    input_vocab, _ = load_vocabulary(args.input_vocab_file)
    output_vocab, _ = load_vocabulary(args.output_vocab_file)
    ppmi, _, _ = load_sparse(args.ppmi_file)
    if args.normalize:
        ppmi = normalize(ppmi, sparse=True)
    ut, s, vt = sparsesvd(ppmi.tocsc(), args.size)    

    np.save(args.svd_file + ".ut.npy", ut)
    np.save(args.svd_file + ".s.npy", s)
    np.save(args.svd_file + ".vt.npy", vt)

    save_dense(args.svd_file + ".input", ut.T, input_vocab)
    save_dense(args.svd_file + ".output", vt.T, output_vocab)
    print("Ppmi2svd finished")
Example 11
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
        """
        Construct the (U, S) projection from a corpus `docs`. The projection can
        be later updated by merging it with another Projection via `self.merge()`.

        This is the class taking care of the 'core math'; interfacing with corpora,
        splitting large corpora into chunks and merging them etc. is done through
        the higher-level `LsiModel` class.
        """
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    extra_dims=self.extra_dims)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clip_spectrum(s**2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
        else:
            self.u, self.s = None, None
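The Projection class above is gensim-internal; as its docstring notes, it is normally reached through the higher-level LsiModel. A hedged sketch of that route (the toy texts are made up):

from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["sparse", "svd", "matrix"], ["sparse", "factorization"], ["matrix", "rank"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=2)
print(lsi.projection.u.shape, lsi.projection.s.shape)   # (num_terms, k) and (k,)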
Example 12
def learnProjection(sourceDomain, targetDomain):
    """
    Learn the projection matrix and store it to a file. 
    """
    h = 50  # no. of latent dimensions.
    print "Loading the bipartite matrix...",
    coocData = sio.loadmat("../work/%s-%s/DSxDI.mat" %
                           (sourceDomain, targetDomain))
    M = sp.lil_matrix(coocData['DSxDI'])
    (nDS, nDI) = M.shape
    print "Done."
    print "Computing the Laplacian...",
    D1 = sp.lil_matrix((nDS, nDS), dtype=np.float64)
    D2 = sp.lil_matrix((nDI, nDI), dtype=np.float64)
    for i in range(0, nDS):
        D1[i, i] = 1.0 / np.sqrt(np.sum(M[i, :].data[0]))
    for i in range(0, nDI):
        D2[i, i] = 1.0 / np.sqrt(np.sum(M[:, i].T.data[0]))
    B = (D1.tocsr().dot(M.tocsr())).dot(D2.tocsr())
    print "Done."
    print "Computing SVD...",
    ut, s, vt = sparsesvd(B.tocsc(), h)
    sio.savemat("../work/%s-%s/proj.mat" % (sourceDomain, targetDomain),
                {'proj': ut.T})
    print "Done."
    pass
Example 13
    def matrixsvd(self):
        svd_matrix = self.projection_matrix.tocsc()

        if self.svd == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(
                svd_matrix.tocsc(),
                k=(int(self.projection_matrix.tocsr().shape[0] *
                       self.precision) / 100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        elif self.svd == 'sparsesvd':
            (UT, S,
             VT) = sparsesvd(svd_matrix,
                             (int(svd_matrix.shape[0] * self.precision) / 100))

        elif self.svd == 'fast':
            Utemp, Stemp, VTtemp = fast_svd(svd_matrix, (
                int(self.projection_matrix.tocsr().shape[0] * self.precision) /
                100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        else:
            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        return UT, S, VT
Example 14
    def decompose(self, corpus):

        skip_counts = bounter(size_mb=1024)
        word_counts = bounter(size_mb=1024)
        for l in corpus:
            wds = l.split()
            skips = list(skipgrams(wds, 2, 5))
            skips = ["#".join(t) for t in skips]
            if len(wds) > 0 and len(skips) > 0:
                skip_counts.update(skips)
                word_counts.update(wds)

        vocabulary = list(word_counts)
        shift = 1  # shift 1 does nothing since log(1) == 0.0
        M = count_skipgrams(skip_counts, word_counts, vocabulary, shift)
        # TODO: eigen something trick
        # singular value decomposition
        # U, _, V = svds(M, k=256)  # U, S, V
        U, _, V = sparsesvd(M, 300)
        # add context to U
        word_vecs = U.T + V.T
        del U
        del V
        # normalize rows
        word_vecs_norm = word_vecs / np.sqrt(
            np.sum(word_vecs * word_vecs, axis=1, keepdims=True))
        del word_vecs
        return vocabulary, word_vecs_norm
def cnt2svd(count_file, vocab_file, PPMI):
    with open(count_file, "r", encoding="UTF-8-sig") as src_file:
        text = src_file.readlines()

    word2index = read_vocab(vocab_file)

    print("length of word_dict: " + str(len(word2index)))

    counts = csc_matrix((len(word2index), len(word2index)), dtype="float32")
    tmp_counts = dok_matrix((len(word2index), len(word2index)),
                            dtype="float32")
    times = 0
    for i in range(len(text)):
        word, context, count = text[i].strip().split()
        tmp_counts[word2index[word], word2index[context]] = int(count)
        times += 1
        if times == st.UPDATE_THRESHOLD:
            counts = counts + tmp_counts.tocsc()
            tmp_counts = dok_matrix((len(word2index), len(word2index)),
                                    dtype="float32")
            times = 0
    counts = counts + tmp_counts.tocsc()
    #calculate e^pmi
    sum_r = np.array(counts.sum(axis=1))[:, 0]
    sum_c = np.array(counts.sum(axis=0))[0, :]

    sum_total = sum_c.sum()
    sum_r = np.reciprocal(sum_r)
    sum_c = np.reciprocal(sum_c)

    pmi = csc_matrix(counts)

    normalizer = dok_matrix((len(sum_r), len(sum_r)))
    normalizer.setdiag(sum_r)
    pmi = normalizer.tocsc().dot(pmi)

    normalizer = dok_matrix((len(sum_c), len(sum_c)))
    normalizer.setdiag(sum_c)
    pmi = pmi.dot(normalizer.tocsc())

    pmi = pmi * sum_total
    pmi.data = np.log(pmi.data)

    if PPMI:
        pmi[pmi < 0] = 0

    I = eye(pmi.shape[0], format="csc")
    print("start svd")
    start = time.time()
    ut, s = sparsesvd(pmi, I, st.VECTOR_LENGTH)[:2]

    if PPMI:
        for i in range(len(s)):
            ut[i, :] *= np.sqrt(s[i])
    else:
        for i in range(len(s)):
            ut[i, :] *= s[i]

    print(time.time() - start)
    return ut.T, word2index
def LSA(M,k):  ##will return top k sentences
    SM = scipy.sparse.csc_matrix(M) # convert to sparse CSC format
    u, s, vt = sparsesvd(SM,k+10) #
    ##SVD calculated at this stage, concept matrix vt, from now we can apply various approaches
    ##to filter out top k sentences.
    ##We are using OzSoy's approach
    ##Using Cross Method
    m,n=M.shape

    Avg=numpy.average(M,1)
    for i in range(0,m):
        for j in range(0,n):
            if M[i][j]<Avg[i]:
                M[i][j]=0
    Length=numpy.dot(s,vt)
    L=[]
    ##returning top k sentences
    for i in range(0,n):
        L.append(tuple([Length[i],i]))

    if k>=len(L):
        return L
    #building min heap

    count= int(k/2-1)

    while(count>=0):
        L=heapify(L,count,k)
        count-=1
    for i in range(k,len(L)):
        if L[0][0]<L[i][0]:
            L[0]=L[i]
            L=heapify(L,0,k)
    return L[:k]
    def __init__(self, m, k, docs = None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a 
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
Example 18
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
        """
        Construct the (U, S) projection from a corpus `docs`. The projection can
        be later updated by merging it with another Projection via `self.merge()`.

        This is the class taking care of the 'core math'; interfacing with corpora,
        splitting large corpora into chunks and merging them etc. is done through
        the higher-level `LsiModel` class.
        """
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    extra_dims=self.extra_dims)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clip_spectrum(s**2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
        else:
            self.u, self.s = None, None
Example 19
 def applySvd(self):
     len_row = max(self.array_row) + 1
     len_col = max(self.array_col) + 1
     print 'Applying SVD with ROW: ' + str(len_row) + ' and COL: ' + str(
         len_col)
     sparse_matrix = scipy.sparse.csc_matrix(
         (self.array_data, (self.array_row, self.array_col)),
         shape=(len_row, len_col))
     print 'sparsed matrix'
     Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
     print 'U Sigma Vt done!'
     sparse_matrix = array(0)
     print 'Mounting Matrix SVD'
     self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
     print 'Done!'
     print Ut.T
     print '\n'
     print Sigma
     print '\n'
     print Vt
     print '\n'
     print self.svd_matrix.T
     print '\n'
     Ut = None
     Sigma = None
     Vt = None
Example 20
def calc_svd(matrix, dim, impl, impl_args):
    """
    apply truncated SVD with several implementations

    truncated SVD:
    sparsesvd: https://pypi.org/project/sparsesvd/
    scipy: https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html

    randomized truncated SVD:
    gensim: https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/lsimodel.py
    scikit: https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html

    Check out the comparison: https://github.com/jfilter/sparse-svd-benchmark
    """
    if impl == "sparsesvd":
        # originally used SVD implementation
        ut, s, _ = sparsesvd(matrix.m.tocsc(), dim)
        # returns in a different format
        ut = ut.T
    if impl == "scipy":
        ut, s, _ = linalg.svds(matrix.m, dim)
    # randomized (but fast) truncated SVD
    if impl == "gensim":
        # better default arguments
        args = {"power_iters": 5, "extra_dims": 10, **impl_args}
        ut, s = stochastic_svd(matrix.m, dim, matrix.m.shape[0], **args)
    if impl == "scikit":
        ut, s, _ = randomized_svd(matrix.m, dim, **impl_args)

    return ut, s
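The "returns in a different format" comment refers to the orientation of the factors: sparsesvd returns the left singular vectors as rows, while scipy's svds returns them as columns. A small standalone sketch of the difference (the random matrix and its sizes are made up):

import scipy.sparse as sp
from scipy.sparse import linalg
from sparsesvd import sparsesvd

m = sp.rand(200, 100, density=0.05, format='csc')
ut, s, _ = sparsesvd(m, 10)      # ut has shape (10, 200)
u2, s2, _ = linalg.svds(m, 10)   # u2 has shape (200, 10)
print(ut.T.shape == u2.shape)    # True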
Example 21
def test_svd_matrix(W, WT, D, DT):
    Winv = ss.csr_matrix(np.linalg.pinv(W.todense()))
    WTinv = ss.csr_matrix(np.linalg.pinv(W.transpose().todense()))
#    A = np.dot(np.dot(Winv, D), WTinv)
    A = ((Winv * D) * WTinv)
    A = A.tocsc()
    res_dict = {}
    old_z = 0

    for k in range(270, 280):
        (ut, s, vt) = sparsesvd(A, k)
        U = ss.csr_matrix(ut.T)
        S = ss.csr_matrix(np.diag(s))
        V = ss.csr_matrix(vt)
        L = (W * U) * (S * V * WT.transpose())
        z = U.shape[1]

        if z == old_z:
            break

        else:
            Res = fnorm(L, DT)
            res_dict[z] = Res
            Result = OrderedDict(sorted(res_dict.items(),
                key=lambda t: np.float64(t[1])))
            old_z = z

    return Result
Example 22
    def load(self, dirname, svd_k = 0):
        """
        Load the embedding and optionally perform SVD
        on load. If svd_k is set to 0, no SVD is performed.
        """
        
        self.dirname = dirname
        
        try:
            self.emb.x, self.emb.y = load_svmlight_file(dirname + EMBEDDING_FILENAME)
        except (ValueError, IOError):
            return None
        
        if svd_k != 0:
            try:
                import sparsesvd
                import scipy.sparse
                
                X = self.emb.x.T
                X = scipy.sparse.csc_matrix(X)
                Ut, S, Vt = sparsesvd.sparsesvd(X, svd_k)
                self.emb.x = scipy.sparse.csr_matrix(Vt.T)

            except ImportError:
                print('Warning: Cannot perform SVD without sparsesvd module')

        self._loadFeatureTable()
        self._loadTOC()
        return self.emb
Example 23
File: app.py Project: SlumDunk/wiki
def sparse_svd():
    svd_input = np.empty((len(request.get_json()), len(request.get_json()[0])))
    for i in range(0, len(request.get_json())):
        svd_input[i] = request.get_json()[i]
    smat = scipy.sparse.csc_matrix(svd_input)
    u, s, vh = sparsesvd(smat, min(smat.shape))
    return json.dumps(u.T.tolist())
Example 24
def computeSVDpackage(urm, K):

    U, s, Vt = sparsesvd(urm, K)
    # print(U.shape)
    # print(U)
    # print(len(s))
    # print(s.shape)
    print(s)
    # # print(Vt)
    print(Vt.shape)
    print(Vt)
    # # print(Vt.transpose())

    # print(Ux - U)
    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        # S[i, i] = mt.sqrt(s[i])
        S[i, i] = s[i]

    U = csc_matrix(np.transpose(U), dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)

    return U, S, Vt
Example 25
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)
    
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    start = time.time()
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    print("Time elapsed for SVD: %f" % (time.time() - start))

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
Example 26
def main():
    en_vector=ENVector()
    en_vector.read_freq("results/freq_en_fixed_pmi.txt")
    #print "Reading Pair Co-occurence"
    #en_vector.read_and_duplicate("results/pair_en_test.txt")
    en_vector.read_pair_pmi("results/pair_en_fixed_pmi.txt")
    en_vector.sort_by_freq()
    #print "Generating Label"
    en_vector.generate_label()
    # print "Generating Matrix Label"
    en_vector.generate_matrix_label()
    #print "Calculating Vector Size"
    en_vector.calculate_size()

    matrix=sp.lil_matrix((limit,limit))
    for i in range(min(limit,len(en_vector.matrix_label))):
        for j in range((len(en_vector.matrix_label[i]))):
            if en_vector.matrix_label[i][j]>=limit:
                continue
            word1=en_vector.word_list[i]
            word2=en_vector.word_list[en_vector.matrix_label[i][j]]
            matrix[i,en_vector.matrix_label[i][j]]=en_vector.pair_count[(word1,word2)]

    smat=sp.csc_matrix(matrix)
    ut,s,vt=sparsesvd(smat,10)
    for i in range(limit):
        for j in range(10):
            print (ut[j][i]*s[j]),
        print 
Example 27
def main():
    args = docopt("""
    Usage:
        counts2svd.py [options] <counts_path>

    Options:
        --dim NUM           Dimensionality of eigenvectors [default: 500]
        --neg NUM           Number of negative samples; subtracts its log from PMI [default: 1]
        --pos NUM           Number of positive samples; add its log to PMI [default: 1]
        --cds NUM           Context distribution smoothing [default: 0.75]
        --randomized        Use randomized SVD
        --normalized        Use normalized embedder
        --oversample NUM    Number of oversamples in randomized SVD [default: 10]
        --power_iter NUM    Number of iterations of power method in randomized SVD [default: 2]
    """)

    start = time.time()

    counts_path = args['<counts_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    pos = int(args['--pos'])
    cds = float(args['--cds'])
    randomized = args['--randomized']
    normalized = args['--normalized']
    oversample = int(args['--oversample'])
    power_iter = int(args['--power_iter'])

    output_path = counts_path + "_svd_dim=%d_neg=%d_pos=%d_cds=%.2f" % (dim, neg, pos, cds)
    if randomized:
        output_path += "_rand_oversample=%d_power_iter=%d" % (oversample, power_iter)
    if normalized:
        output_path += "_normalized_power_iter=%d" % power_iter

    logging.basicConfig(filename=output_path + ".log", filemode="w", level=logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler())

    _, iw = load_vocabulary(counts_path + '.words.vocab')
    adjacency_matrix = load_adjacency_matrix(counts_path)
    ppmi = build_ppmi_matrix(adjacency_matrix, cds, neg, pos)

    start_learning = time.time()
    logging.info("Starting SVD")
    if randomized:
        # ppmi = normalize(ppmi, norm='l2', axis=1)
        s, ut = randomized_eigh(ppmi, dim, oversample, power_iter, row_normalized=normalized)
    elif normalized:
        ut = normalized_embedder(ppmi, dim, power_iter)
        s = np.zeros(dim)
    else:
        # ppmi = normalize(ppmi, norm='l2', axis=1)
        ut, s, _ = sparsesvd(ppmi.tocsc(), dim)
        ut = ut.T

    logging.info("Time elapsed on learning: %f" % (time.time() - start_learning))

    np.save(output_path + '.vecs.npy', ut)
    np.save(output_path + '.vals.npy', s)
    save_vocabulary(output_path + '.words.vocab', iw)
    logging.info("Time elapsed: %f" % (time.time() - start))
def lsa( ):
    from sparsesvd import sparsesvd
    from numpy import array
    import scipy.sparse as sp
    # calculate svd and perform lsa
    print "########     READING TERM DOC MATRIX #########"
    termDocEntries = pickle.load(open(outfile  +"/tdm.p" ,"rb"))
    id2title = pickle.load(open(outfile + "/id_file.p","rb"))
    word2id = pickle.load(open(outfile + "/word_id.p","rb"))
    fileCount = len(id2title)
    #fileCount = 60000
    vocab_size = len(word2id)
    print "########     READING COMPLETE        #########"
    I = array([ i for ((i,j),v) in termDocEntries] )
    J = array([ j for ((i,j),v) in termDocEntries] )
    V = array([ v for ((i,j),v) in termDocEntries] )
    shape = (fileCount, vocab_size)
    print "Dimension of TDM is : ", shape
    print "########     STARTING LSA            #########"
    termDocMatrix = sp.csc_matrix( (V,(I,J)), shape= (fileCount, vocab_size ), dtype=np.float32)

    UT , S, V = sparsesvd(termDocMatrix, 300) 
    (m1,m2) =  UT.T.shape

    S1 = np.zeros((m2,m2), dtype=np.float32)
    for i in range(m2):
        S1[i][i] = S[i]
    US = np.dot(UT.T, S1)
    print m1, m2
    (n1,n2) = V.shape

    pickle.dump( US , open( outfile + "/u_sigma.p", "wb" ) )
    pickle.dump( V.T , open( outfile + "/v.p", "wb" ) )
    print "########     LSA COMPLETE        #########"
Example 29
 def applySvd(self):
     len_row = max(self.array_row) + 1
     len_col = max(self.array_col) + 1
     print "Applying SVD with ROW: " + str(len_row) + " and COL: " + str(len_col)
     sparse_matrix = scipy.sparse.csc_matrix(
         (self.array_data, (self.array_row, self.array_col)), shape=(len_row, len_col)
     )
     print "sparsed matrix"
     Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
     print "U Sigma Vt done!"
     sparse_matrix = array(0)
     print "Mounting Matrix SVD"
     self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
     print "Done!"
     print Ut.T
     print "\n"
     print Sigma
     print "\n"
     print Vt
     print "\n"
     print self.svd_matrix.T
     print "\n"
     Ut = None
     Sigma = None
     Vt = None
Example 30
def arun(corpus, dictionary, min_topics=10, max_topics=21, step=5):
    print "Arun running"
    output = []
    for i in range(min_topics, max_topics, step):
        lda = LDA(dictionary, corpus, i, "lda20/lda_training_" + str(i))
        print "Model built/loaded"
        m1 = lda.expElogbeta
        # U, cm1, V = np.linalg.svd(m1)
        smat = scipy.sparse.csc_matrix(m1)  # convert to sparse CSC format
        U, cm1, V = sparsesvd(smat, i + 30)  # do SVD, asking for i + 30 factors
        print "sparsesvd done"
        #Document-topic matrix
        lda_topics = lda[my_corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        print "cm2norm begin"
        cm2norm = np.linalg.norm(l)
        print "cm2norm end"
        cm2 = cm2/cm2norm
        print len(cm1), len(cm2)
        kl = sym_kl(cm1, cm2)
        output.append((i, kl))
        print i, kl
    print output
    return output
Example 31
    def matrixsvd(self):
        svd_matrix = self.projection_matrix.tocsc()

        if self.svd == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(svd_matrix.tocsc(),
                    k=(int(self.projection_matrix.tocsr().shape[0] *
                        self.precision) / 100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        elif self.svd == 'sparsesvd':
            (UT, S, VT) = sparsesvd(svd_matrix, (int(svd_matrix.shape[0] * self.precision) / 100))

        elif self.svd == 'fast':
            Utemp, Stemp, VTtemp = fast_svd(svd_matrix,
                    (int(self.projection_matrix.tocsr().shape[0] *
                        self.precision) / 100))
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        else:
            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())
            UT = np.nan_to_num(Utemp.transpose())
            S = np.nan_to_num(Stemp)
            VT = np.nan_to_num(VTtemp)

        return UT, S, VT
Example 32
def learn(mat):
    print "Starting learning process..."
    start_time = time.time()
    user_mat, axis_weights, movie_mat = sparsesvd(mat, NUM_COMPONENTS)
    print "Matrix decomposition complete (elapsed time: %f s)." % (time.time() - start_time)
    print "Learning process complete."
    return (user_mat, axis_weights, movie_mat)
    def __init__(self, m, k, docs = None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a 
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
Example 34
def main():

    #===========================================================================
    # mat = numpy.random.rand(300, 300)
    # smat = scipy.sparse.csc_matrix(mat)
    # ut, s, vt = sparsesvd(smat,100)
    # tmp=numpy.diag(s)
    # test=numpy.dot(ut.T, numpy.dot(numpy.diag(s), vt))#vt=(300,300), ut=(300,300), s=(300,1)
    # u2, s2, v2=svds(mat, k=100)
    #
    # print ""
    #===========================================================================
    #ut, s, vt = sparsesvd(smat,100) # do SVD, asking for 100 factors
    # ut - Unitary matrices.
    #s -The singular values for every matrix, sorted in descending order.
    #vt  - Unitary matrices
    #assert numpy.allclose(mat, numpy.dot(ut.T, numpy.dot(numpy.diag(s), vt)))   #test if mat is close to numpy.dot(ut.T, numpy.dot(numpy.diag(s), vt))

    ################################################################################################################

    mat1 = ss.load_npz(
        '/home/ira/Dropbox/IraTechnion/Patterns_Research/sp_sg/mat_ppmi_round_allpats.npz'
    )
    (nrows, ncols) = mat1.get_shape()

    #u1, s1, v1 = svds(mat1, k=500)
    u1, s1, v1 = sparsesvd(csc_matrix(mat1),
                           500)  #v1(500,746K), u1(500,746K) s1[500,1]
    reduced_mat = numpy.dot(u1.T, numpy.diag(s1))
    ss.save_npz('svd_reduced_mat_500_allpats', csr_matrix(reduced_mat))

    print "I'm here"
Example 35
def generate_archetypes(singer_resumes, archetype_count_k=20, cache_file=CACHE):
    """ Generate and write to disk an archetype matrix given a population """

    # Generate a unique, ordered, list of characters
    characters = set()  # Could optimized by using single comprehension
    for singer_resume in singer_resumes:
        characters.update(singer_resume)
    characters = list(characters)

    # Create a dict to lookup character index by id
    character_positions = dict()
    for i, character in enumerate(characters):
        character_positions[character] = i

    # Construct an empty matrix to populate
    dimensions = len(singer_resumes), len(characters)
    singer_matrix = scipy.sparse.lil_matrix(dimensions)

    # Populate the matrix
    for j, singer_resume in enumerate(singer_resumes):
        for character in singer_resume:
            position = character_positions[character]
            singer_matrix[j, position] = True

    # Convert matrix to a sparse matrix
    sparse_singer_matrix = scipy.sparse.csc_matrix(singer_matrix)

    # Do magic with maths
    U, s, V = sparsesvd(sparse_singer_matrix, archetype_count_k)

    archetypes = V

    # Cache the data for later use
    arrays = {CHARACTERS: character_positions, ARCHETYPES: archetypes}
    np.savez(cache_file, **arrays)
Example 36
def single_line_update_svd(m_old, m_new):
    if m_old.shape != m_new.shape:
        print("\nSingle_line: matrix shapes don't match")
    shape = m_old.shape
    diff = m_new - m_old
    r, c = np.nonzero(diff)
    # print(r,c)
    x = diff[diff != 0]
    # print(x, x.shape)
    if x.shape[0] == shape[0]:
        # update colum
        col = c[0]
        a = x.reshape((shape[0], 1))
        b = np.zeros((shape[1], 1))
        b[c, 0] = 1
    else:
        # update row
        row = r[0]
        a = np.zeros((shape[0], 1))
        a[row, 0] = 1
        b = x.reshape((shape[1], 1))
    # print(np.allclose(m_new, m_old + a @ b.T))
    temp = sps.csc_matrix(m_old)
    u, s, v = sparsesvd.sparsesvd(temp, min(shape))
    u = u.T
    v = v.T
    s = np.diag(s)
    u_new, s_new, v_new = increment_svd(u, s, v, a, b)
    e = np.linalg.norm(m_new - (u_new @ s_new @ v_new.T), 2)
    print("Error is ", e)
Example 37
def learn(mat):
    print "Starting learning process..."
    start_time = time.time()
    user_mat, axis_weights, movie_mat = sparsesvd(mat, NUM_COMPONENTS)
    print "Matrix decomposition complete (elapsed time: %f s)." % (
        time.time() - start_time)
    print "Learning process complete."
    return (user_mat, axis_weights, movie_mat)
def test():
    mat = sp.rand(200, 100, density=0.01) # create a random matrix
    smat = csc_matrix(mat) # convert to sparse CSC format
    ut, s, vt = sparsesvd(smat, 1) # do SVD, asking for 1 factor
    mat_prime = np.dot(ut.T, np.dot(np.diag(s), vt))
    
    print (len(np.transpose(mat.nonzero())))
    print (len(np.transpose(mat_prime.nonzero())))
Example 39
 def decompose(self):
     singular_vals = 100
     print "decomposing, with %d singular values requested." % singular_vals
     ut, s, vt = sparsesvd(csc_matrix(self.matrix), singular_vals)
     print "s*vt:", os.linesep, numpy.dot(s, vt)
     print "ut:", os.linesep, ut
     print "s:", os.linesep, s
     print "vt", os.linesep, vt
Example 40
def svd_reduction(dataArray, k, get="feature-latent"):
    sparseDataArray = csc_matrix(dataArray)
    ut, s, vt = sparsesvd(sparseDataArray, k)

    if get=="feature-latent":
        return np.matmul(dataArray.transpose(), ut.transpose())
    else:
        return np.matmul(dataArray, vt.transpose())
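A hedged usage sketch for svd_reduction; the array sizes and the alternative get value are made up, and any value other than "feature-latent" selects the second branch:

import numpy as np
from scipy.sparse import csc_matrix
from sparsesvd import sparsesvd

data = np.random.rand(100, 40)
feat_latent = svd_reduction(data, 5)                      # default branch: shape (40, 5)
obj_latent = svd_reduction(data, 5, get="object-latent")  # other branch: shape (100, 5)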
Example 41
 def projections(self):
     """Get the set of vectors for all URIs"""
     if self._ut is None:
         self._ut, self._s, self._vt = sparsesvd(self._adjacency,
                                                 self._rank)
         (self._ut_shape, self._s_shape,
          self._vt_shape) = (self._ut.shape, self._s.shape, self._vt.shape)
     return self._ut.T
Example 42
    def _compute_svd(self, normalize_data = True):
        self.logger.info('Computing the Singular Value Decomposition of the relation matrix')

        if normalize_data:
            self.data_normalization()

        self.relationship_matrix_csc = self.relationship_matrix.tocsc()
        self.svd_u, self.svd_s, self.svd_v = sparsesvd(self.relationship_matrix_csc, self.dimensionality)
Example 43
def learnProjection(dataset, pivotsMethod, n):
    """
    Learn the projection matrix and store it to a file. 
    """
    h = 50 # no. of SVD dimensions.
    #n = 500 # no. of pivots.

    # Parameters to reduce the number of features in the tail
    # domainTh = {'books':5, 'dvd':5, 'kitchen':5, 'electronics':5}

    # Load pivots.
    pivotsFile = "../work/%s/obj/%s" % (dataset, pivotsMethod)
    features = pi.load_stored_obj(pivotsFile)
    pivots = dict(features[:n]).keys()
    print "selecting top-%d features in %s as pivots" % (len(pivots), pivotsMethod)

# Load features and get domain specific features
    fname = "../work/%s/obj/freq" % (dataset)
    if "un_" in pivotsMethod:
        fname = "../work/%s/obj/un_freq" % (dataset)
    features = pi.load_stored_obj(fname)
    feats = dict(features)
    # print feats.keys()

    # DSwords = [item for item in feats if item not in pivots]

    feats = feats.keys()
    # Load train vectors.
    print "Loading Training vectors...",
    startTime = time.time()
    vects = []
    vects.extend(loadFeatureVecors("../data/%s/train-sentences" % dataset, feats))
    endTime = time.time()
    print "%ss" % str(round(endTime-startTime, 2))     

    print "Total no. of documents =", len(vects)
    print "Total no. of features =", len(feats)

    # Learn pivot predictors.
    print "Learning Pivot Predictors.."
    startTime = time.time()
    M = sp.lil_matrix((len(feats), len(pivots)), dtype=np.float)
    for (j, w) in enumerate(pivots):
        print "%d of %d %s" % (j, len(pivots), w)
        for (feat, val) in getWeightVector(w, vects):
            i = feats.index(feat)
            M[i,j] = val
    endTime = time.time()
    print "Took %ss" % str(round(endTime-startTime, 2))   

    # Perform SVD on M
    print "Perform SVD on the weight matrix...",
    startTime = time.time()
    ut, s, vt = sparsesvd(M.tocsc(), h)
    endTime = time.time()
    print "%ss" % str(round(endTime-startTime, 2))     
    sio.savemat("../work/%s/proj_scl.mat" % (dataset), {'proj':ut.T})
    pass
Example 44
def testBeyondAccurracyMetrics(train_filename, eval_item_filename, user_means_filename):
    
    logging.info('testing beyond-accuracy topNLists with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))
    
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)
    
    with open(eval_item_filename,'rb') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            user_index = train_data.getUserIndex(user_id)
            
            if len(train_data.getUserProfileByIndex(user_index)) < 1:
                continue
            
            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')
             
            evaluation_item_ids = ground_truth_items + random_unrated_items
             
            rec_list_szie = config.RECOMMENDATION_LIST_SIZE * config.DIVERSIFICATION_CANDIDATES_FACTOR
            
#             predictions = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
#             top_recs = topNLists.getTopNList(predictions, rec_list_szie)
            
#             predictions_ib = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
#             top_recs_ib = topNLists.getTopNList(predictions_ib, rec_list_szie)
            
#             predictions = library_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
#             top_recs = topNLists.getTopNList(predictions, rec_list_szie, evaluation_item_ids)
            
            predictions_ub = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs_ub = topNLists.getTopNList(predictions_ub, rec_list_szie)
            
#             print 'user',user_id
            
#             print top_recs_ib, top_recs_ub
            
#             rare = train_data.getPopularityInfo()[:10]
#             pop = train_data.getPopularityInfo()[-10:]
            
            top_recs = top_recs_ub
            print 'diversity_ratings',diversity.getListDiversity(train_data, top_recs, 'div_r')
            print 'diversity_content',diversity.getListDiversity(train_data, top_recs, 'div_c')
            print 'content',serendipity.getListSerendipity(train_data, user_index, top_recs, 'sur_c')
            
#             print 'rare cooccurrence',serendipity.getListSerendipity(train_data, user_index, rare, 'sur_r')
#             print 'rare cooccurrence normalized',serendipity.getListSerendipity(train_data, user_index, rare, 'sur_r_n')
#             
#             print 'pop cooccurrence',serendipity.getListSerendipity(train_data, user_index, pop, 'sur_r')
#             print 'pop cooccurrence normalized',serendipity.getListSerendipity(train_data, user_index, pop, 'sur_r_n')
#             
#             print 'rare novelty',novelty.getListNovelty(train_data, rare)
#             
#             print 'pop novelty',novelty.getListNovelty(train_data, pop)
            
            print '------------------------------'
Example 45
def main(size, thr, ns, sppmi, f_in, f_out):
    size = int(size)
    thr = int(thr)
    ns = float(ns)
    sppmi = int(sppmi)  #1: sppmi 0: raw
    print "Input text file: ", f_in
    print "Building dict..."
    vocab_count, text_num = build_dict(f_in, thr)
    alltokens = vocab_count.keys()
    vocab_id = dict((t, i) for i, t in enumerate(alltokens))
    vocab_size = len(vocab_id)
    print "the number of texts: ", text_num
    print "vocabulary size: ", vocab_size
    train_num = sum(vocab_count.values())
    print "The number of tokens: ", train_num
    i = 0
    row = []
    col = []
    data = []
    for l in open(f_in).readlines():
        tokens = l.split()
        indexes = list(np.zeros(vocab_size))
        for t in tokens:
            try:
                if sppmi == 1:
                    indexes[vocab_id[t]] += train_num / ns / (len(tokens) *
                                                              vocab_count[t])
                else:
                    indexes[vocab_id[t]] += 1
            except KeyError:
                pass
        if sppmi == 1:
            for j in range(len(indexes)):
                if indexes[j] > 1:  # only positive values are retained
                    row.append(i)
                    col.append(j)
                    data.append(np.log(indexes[j]))
        else:
            for j in range(len(indexes)):
                if indexes[j] > 0:
                    row.append(i)
                    col.append(j)
                    data.append(indexes[j])
        i += 1
    print "size of the term-document co-occurrence matrix (doc_num, vocab_size):", text_num, vocab_size
    s_co_mat = scipy.sparse.csc_matrix(
        (np.array(data), (np.array(row), np.array(col))),
        shape=(text_num, vocab_size))
    ut, s, vt = sparsesvd(s_co_mat, size)
    ut = ut.transpose()
    f = open(f_out, "w")
    for i in range(ut.shape[0]):
        f.write(str(i) + " ")
        for j in range(ut.shape[1]):
            f.write(str(ut[i, j]) + " ")
        f.write("\n")
Example 46
    def __factorize_rating_matrix(self):

        mat = sparse.lil_matrix((self.m, self.n))
        for user in self.user_ratings.iterkeys():
            for item in self.user_ratings[user]:
                mat[self.user_positions[user], self.item_positions[item]] = self.user_ratings[user][item]

        u, s, q = sparsesvd(sparse.csc_matrix(mat), self.num_facs)

        return u.T, numpy.diag(s), q.T
Example 47
def sparseSVD(D):
  import scipy.sparse
  try:
    import sparsesvd
  except:
    print 'bummer ... better get sparsesvd'
    exit(0)
  Ds = scipy.sparse.csc_matrix(D)
  a = sparsesvd.sparsesvd(Ds,Ds.shape[0])
  return a
Example 48
	def run(self):
		denseMat 	= loadMatFile(self.rawMatFile)
		sparseMat	= scipy.sparse.csc_matrix(denseMat)
		if int(len(denseMat)) >= self.cutOffSVD :
			ut, s, vt 	= sparsesvd.sparsesvd(sparseMat, self.cutOffSVD)
			dump2File(vt, self.VtFile)
			reducedMatrix = computeReduction(vt, denseMat, self.cutOffSVD)
			dump2File(reducedMatrix, self.redMatFile)
		else : 
			print('--> not enough contexts')
Example 49
def transform_data(X, X_test):
    X_all = np.vstack((X,X_test))
    tfidf = feature_extraction.text.TfidfTransformer()
    X_all = tfidf.fit_transform(X_all).toarray()
    X_all_sparse = scipy.sparse.csc_matrix(X_all)
    U, s, V = sparsesvd(X_all_sparse, 60)
    print U.shape, s.shape, V.shape
    S = np.diag(s)
    X_all = np.dot(np.transpose(U), np.dot(S, V))
    return X_all[0:X.shape[0],:], X_all[X.shape[0]:,:]
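A hedged usage sketch for transform_data; the count matrices are synthetic, and the snippet itself expects feature_extraction (scikit-learn), scipy.sparse, sparsesvd and numpy as np to be imported at module level:

import numpy as np

X_train = np.random.poisson(1.0, size=(80, 200)).astype(np.float64)
X_test = np.random.poisson(1.0, size=(20, 200)).astype(np.float64)
X_train_lsa, X_test_lsa = transform_data(X_train, X_test)
# X_train_lsa: (80, 200), X_test_lsa: (20, 200), both rank-60 reconstructions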
Example 50
 def __init__(self, corpus):
     term_doc = TermDoc()
     for document in corpus:
         # Tokenize/stem each document
         tokens = process_document(document)
         # Add it to the sparse term-document matrix
         term_doc.add_document(document, tokens)
     print len(term_doc._words), len(term_doc._documents)
     # Calculate the SVD
     self.T, self.s, self.D = sparsesvd(term_doc._matrix.as_csc(), 300)
Example 51
    def testLearnModel2(self): 
        X = scipy.sparse.rand(10, 10, 0.2)
        X = X.tocsc()        
        lmbdas = numpy.array([10.0, 0.0])
        eps = 0.01         
        k = 9
        
        #Check out singular values 
        U, s, V = sparsesvd(X.tocsc(), k) 

        softImpute = SoftImpute(lmbdas, eps, k)
        ZList = softImpute.learnModel2(X)
        
        #Test that when lambda=0 get approx original matrix back 
        X2 = ZList[1].todense()
        nptst.assert_almost_equal(X.todense(), X2)
        
        #When lambda is greater or equal to largest singular value, get 0 
        U, s, V = sparsesvd(X.tocsc(), k) 
        lmbdas = numpy.array([numpy.max(s)]) 
        softImpute = SoftImpute(lmbdas, eps, k)
        Z = softImpute.learnModel2(X)
        self.assertEquals(numpy.linalg.norm(Z.todense()), 0)
        
        #Check solution for medium values of lambda 
        eps = 0.1
        lmbdas = numpy.array([0.1, 0.2, 0.5, 1.0])
        softImpute = SoftImpute(lmbdas, eps, k)
        ZList = softImpute.learnModel2(X)
        
        for j, Z in enumerate(ZList): 
            Z = Z.todense()
            Zomega = numpy.zeros(X.shape)
            
            rowInds, colInds = X.nonzero()
            for i in range(X.nonzero()[0].shape[0]): 
                Zomega[rowInds[i], colInds[i]] = Z[rowInds[i], colInds[i]]
                
            U, s, V = ExpSU.SparseUtils.svdSoft(numpy.array(X-Zomega+Z), lmbdas[j])      
            
            tol = 0.1
            self.assertTrue(numpy.linalg.norm(Z -(U*s).dot(V.T))**2 < tol)
Example 52
def process_SVD1(inputFileName, outputFileName, n, p):
    """
    Perform SVD1.
    """
    mat, rowids = loadMatrix(inputFileName)
    X = mat.tocsc()
    ut, s, vt = sparsesvd(X, n)
    A = np.dot(ut.T, np.diag(s ** p))
    saveMatrix(A, rowids, outputFileName)
    mmwrite("%s.ut" % inputFileName, ut)
    np.savetxt("%s.s" % inputFileName, s)
    mmwrite("%s.vt" % inputFileName, vt)
    pass
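A short sketch of the s ** p weighting used above: p = 1 gives the usual U * S coordinates, while p = 0 keeps only the unweighted left singular vectors. The matrix and dimensions here are made up:

import numpy as np
from scipy.sparse import rand
from sparsesvd import sparsesvd

X = rand(300, 500, density=0.01, format='csc')
ut, s, vt = sparsesvd(X, 50)
A_weighted = np.dot(ut.T, np.diag(s ** 1.0))   # rows scaled by the singular values
A_plain = np.dot(ut.T, np.diag(s ** 0.0))      # unweighted orthonormal coordinates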
Example 53
def psp_pseudoinverse(Mat, precision):

    list_nz = (Mat.sum(axis=1) == 1) 
    list_mat = []
    
    for i in range(len(list_nz)):
        if list_nz[i]:
            list_mat.append(i)
    
    temp_Mat = Mat[list_mat, :]
    matrix = spmatrix.ll_mat(temp_Mat.shape[0], temp_Mat.shape[1])
    matrix.update_add_at(temp_Mat.tocoo().data, temp_Mat.tocoo().row,
            temp_Mat.tocoo().col)

    if matrix.shape[0] <= matrix.shape[1]:

        k = int((precision * matrix.shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)

        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del ut, s, vt, UT, SI, VT, temp_matrix

    else:

        k = int((precision * matrix.transpose().shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.transpose().tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)

        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del ut, s, vt, UT, SI, VT, temp_matrix

    return pinv_matrix.tocsr()
Example 54
def debug():
    """
    Test the various functions implemented in this module.
    """
    mat, rowids = loadMatrix("../work/testMatrix")
    #convertPPMI(mat)
    #saveMatrix(mat, rowids, "../work/pmiMatrix")
    X = mat.tocsc()
    ut, s, vt = sparsesvd(X, 50)
    #print allclose(X, np.dot(ut.T, np.dot(np.diag(s), vt)))    
    A = np.dot(ut.T, np.diag(s))
    saveMatrix(A, rowids, "../work/featMatrix")
    pass
Example 55
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
                 extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
        """Construct the (U, S) projection from a corpus.

        Parameters
        ----------
        m : int
            Number of features (terms) in the corpus.
        k : int
            Desired rank of the decomposed matrix.
        docs : {iterable of list of (int, float), scipy.sparse.csc}
            Corpus in BoW format or as sparse matrix.
        use_svdlibc : bool, optional
            If True - will use `sparsesvd library <https://pypi.python.org/pypi/sparsesvd/>`_,
            otherwise - our own version will be used.
        power_iters: int, optional
            Number of power iteration steps to be used. Tune to improve accuracy.
        extra_dims : int, optional
            Extra samples to be used besides the rank `k`. Tune to improve accuracy.
        dtype : numpy.dtype, optional
            Enforces a type for elements of the decomposed matrix.

        """
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(
                    docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    extra_dims=self.extra_dims, dtype=dtype)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix", str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
                u = ut.T
                del ut, vt
                k = clip_spectrum(s ** 2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
        else:
            self.u, self.s = None, None
def setupencodersandimages(listoffiles, scalew, scaleh, numgabor):
  combined=[]
  for file_ in listoffiles:
    combined.append(converttoarray(file_, scalew, scaleh))
  imgs=numpy.array([img/numpy.linalg.norm(img) for img in combined]).T
  csc=csc_matrix(imgs)

  ut, S, vt=sparsesvd(csc, len(listoffiles))
  M=numpy.diag([numpy.linalg.norm(ut[i:]) for i in range(ut.shape[0])])
  #W is M inverse
  UW=numpy.dot(ut.T, numpy.linalg.inv(M))
  MSvt=numpy.dot(M, numpy.dot(numpy.diag(S), vt))
  gaborenc=[makerandomgabor(scalew*scaleh) for i in range(numgabor)]
  gaborenc=[(1/numpy.linalg.norm(i).flatten())*i for i in gaborenc]
  return imgs, UW, MSvt, gaborenc
Example 57
    def convertor2matrix(self, rank):
        self.loadDataFileMovieBased(moiveRatingTrainsetFilename)
        self.loadDataFile(moiveRatingTrainsetFilename)
        self.matrix = [[0 for x in range(0, len(self.allMovieRatingRecord))] for y in range(0, len(self.allUserRatingRecord))]
        self.users = list(self.allUserRatingRecord.keys())
        self.movies = list(self.allMovieRatingRecord)
        matrix = self.matrix
        for u in range(len(self.users)):
            user = self.users[u]
            userEntry = self.allUserRatingRecord[user]
            for movie in userEntry:
                matrix[u][self.movies.index(movie)] = int(userEntry[movie][0])
        avg = 0.0
        userAvg = []
        for i in range(len(matrix)):
            c = 0.0
            s = 0.0
            for j in range(len(matrix[i])):
                if matrix[i][j] != 0:
                    c += 1
                    s += matrix[i][j]
            userAvg.append(s/c)
        movieAvg=[]
        for i in range(len(matrix[0])):
            c = 0.0
            s = 0.0
            for j in range(len(matrix)):
                if matrix[j][i] != 0:
                    c += 1
                    s += matrix[j][i]
            if c == 0.0:
                print self.movies[i]
            movieAvg.append(s/c)
        for i in range(len(matrix)):
            for j in range(len(matrix[i])):
                if matrix[i][j] != 0:
                    matrix[i][j] -= userAvg[i]


        smat =  scipy.sparse.csc_matrix(matrix)
        u, s, v = sparsesvd(smat,rank)    
        u = u.transpose()
        s = diag(s)
        res = dot(dot(u,s),v)   
        for i in range(len(res)):
            for j in range(len(res[i])):
                res[i][j] += userAvg[i]  
        self.result = res  
Example 58
	def run(self):
		self.context 			= loadContextFile(self.contextFile)
		self.lexicon		 	= buildLexicon(self.context, self.stopList, self.lexiconCutoff)
		self.lexiconSorted		= getSortedLexicon(self.lexicon)
		self.lexiconPosition	= buildLexiconPositionTable(self.lexiconSorted)
		writeLexicon(self.lexiconFile, self.lexicon, self.lexiconSorted)
		self.denseMat 			= computeMatrix(int(self.winLength) , self.lexicon, self.lexiconPosition, self.context)		
		
		if int(len(self.denseMat)) >= self.cutOffSVD :
			self.sparseMat	= scipy.sparse.csc_matrix(self.denseMat)
			ut, s, self.vt 	= sparsesvd.sparsesvd(self.sparseMat, self.cutOffSVD)
			dump2File(self.vt, self.VtFile)
			self.reducedMatrix = computeReduction(self.vt, self.denseMat, self.cutOffSVD)
			dump2File(self.reducedMatrix, self.redMatFile)		
		else : 
			print('--> not enough contexts')