Example #1
import logging

import numpy

from gensim import matutils

logger = logging.getLogger(__name__)


def raw2ppmi(cooccur, word2id, k_shift=1.0):
    """
    Convert raw counts from `get_coccur` into positive PMI values (as per Levy & Goldberg),
    in place.

    The result is an efficient stream of sparse word vectors (=no extra data copy).

    """
    logger.info("computing PPMI on co-occurence counts")

    # the following lines are a bit tedious, as we try to avoid making temporary copies of the (large) `cooccur` matrix
    marginal_word = cooccur.sum(axis=1)
    marginal_context = cooccur.sum(axis=0)
    cooccur /= marginal_word[:, None]  # #(w, c) / #w
    cooccur /= marginal_context  # #(w, c) / (#w * #c)
    cooccur *= marginal_word.sum()  # #(w, c) * D / (#w * #c)
    numpy.log(cooccur, out=cooccur)  # PMI = log(#(w, c) * D / (#w * #c))

    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
    cooccur -= numpy.log(
        k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    cooccur.clip(
        0.0,
        out=cooccur)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(cooccur):
        cooccur[i] = matutils.unitvec(vec)

    return matutils.Dense2Corpus(cooccur, documents_columns=False)
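A minimal usage sketch for the function above. The toy `counts` array (rows are words, columns are contexts) and the `k_shift` value are illustrative assumptions; `word2id` is unused by the body, so None is passed. Note the matrix must have a float dtype, since the function divides it in place.

counts = numpy.array([[4.0, 1.0, 2.0],
                      [1.0, 6.0, 3.0]])
ppmi_corpus = raw2ppmi(counts, word2id=None, k_shift=1.0)
for word_vector in ppmi_corpus:  # sparse (context_id, weight) tuples, one per word
    print(word_vector)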
Example #2
from gensim import matutils


def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    # dense numpy matrix (documents as columns) -> streamed gensim corpus
    corpus = matutils.Dense2Corpus(numpy_matrix)

    # gensim corpus -> dense numpy array of shape (num_terms, num_docs)
    numpy_matrix = matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)

    # scipy sparse matrix -> gensim corpus, and back to a scipy CSC matrix
    corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(corpus)
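A small round-trip check of the same conversions, with toy data; the variable names here are illustrative only.

import numpy as np
import scipy.sparse
from gensim import matutils

dense = np.array([[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]])  # terms x documents
corpus = matutils.Dense2Corpus(dense)                    # columns become documents
restored = matutils.corpus2dense(corpus, num_terms=3)    # back to a terms x documents array

sparse_corpus = matutils.Sparse2Corpus(scipy.sparse.csc_matrix(dense))
csc = matutils.corpus2csc(sparse_corpus, num_terms=3)    # scipy CSC round-trip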
Example #3
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
        This is done by folding the input document into the latent topic space.
        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow,
                                  num_terms=self.num_terms,
                                  dtype=self.projection.u.dtype)
        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse), but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
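This method appears to be gensim's LsiModel.__getitem__, invoked through indexing. A self-contained toy illustration (the corpus and topic count are assumptions for the sketch):

from gensim import corpora, models

texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"],
         ["graph", "trees", "computer"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

print(lsi[corpus[0]])    # single document -> list of (topic_id, topic_value)
for vec in lsi[corpus]:  # whole corpus -> transformed lazily, chunk by chunk
    print(vec)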
Example #4
    def transform(self, X, y=None):
        corpus = matutils.Dense2Corpus(np.transpose(X))

        # Apply the semantic model to the training set bag of words (fast)
        feat = self.semSpace[corpus]

        # convert from TransformedCorpus datatype to numpy doc x topic array (medium speed, needs more benchmarking)
        topics_csr = matutils.corpus2csc(feat)
        X_ = topics_csr.T.toarray()

        return X_
Example #5
    def fit(self, X, y=None):
        corpus = matutils.Dense2Corpus(np.transpose(X))

        # construct a semantic model based on document-topic similarity (15-20 min for 1500k reviews?)
        # (`lsi` here is presumably gensim's LsiModel, imported under that alias)
        self.semSpace = lsi(corpus,
                            id2word=self.this_dict,
                            num_topics=self.d,
                            chunksize=20000,
                            distributed=self.distributed)

        return self
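A self-contained sketch of the same fit/transform pipeline, calling gensim's LsiModel directly. The toy matrix, the `id2word` mapping, and the topic count are illustrative assumptions, not part of the original wrapper class.

import numpy as np
from gensim import matutils, models

X = np.random.rand(10, 5)                        # documents x terms
id2word = {i: "term_%d" % i for i in range(5)}   # toy id -> word mapping

corpus = matutils.Dense2Corpus(np.transpose(X))  # what fit() builds
sem_space = models.LsiModel(corpus, id2word=id2word, num_topics=3)

feat = sem_space[corpus]                         # what transform() applies
X_topics = matutils.corpus2csc(feat, num_terms=3).T.toarray()
print(X_topics.shape)                            # (10, 3)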
Example #6
def shift_clip_pmi(pmimtr, k_shift=1.0):
    """
    Turn a PMI matrix into a shifted positive PMI (SPPMI) matrix by shifting
    all values by -log(k) and then setting the negative values to 0.

    :param pmimtr: The matrix of PMI values.
    :param k_shift: The shift factor k.
    :return: An SPPMI matrix with unit-length word rows, as a sparse CSR matrix.
    """
    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
    pmimtr -= np.log(k_shift)  # shifted PMI = PMI - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, PMI - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(pmimtr):
        pmimtr[i] = matutils.unitvec(vec)

    return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T
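A brief usage sketch with toy values; it assumes `np`, `matutils`, and a `logger` are already in scope, as in the surrounding code.

pmi = np.log(np.array([[4.0, 0.5],
                       [0.25, 2.0]]))  # toy PMI matrix, words x contexts
sppmi_rows = shift_clip_pmi(pmi)       # k_shift=1.0: clip negatives only
print(sppmi_rows.toarray())            # unit-length SPPMI row per word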
Example #7
def raw2ppmi(cooccur, word2id, k_shift=1.0):
    logger.info("computing PPMI on co-occurrence counts")

    marginal_word = cooccur.sum(axis=1)
    marginal_context = cooccur.sum(axis=0)
    cooccur /= marginal_word[:, None]  # #(w, c) / #w
    cooccur /= marginal_context  # #(w, c) / (#w * #c)
    cooccur *= marginal_word.sum()  # #(w, c) * D / (#w * #c)
    numpy.log(cooccur, out=cooccur)  # PMI = log(#(w, c) * D / (#w * #c))

    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
    cooccur -= numpy.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    cooccur.clip(0.0, out=cooccur)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(cooccur):
        cooccur[i] = matutils.unitvec(vec)

    return matutils.Dense2Corpus(cooccur, documents_columns=False)
Example #8
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding the input document into the latent topic space.
        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
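Since Python's `lsi[bow]` indexing syntax cannot pass keyword arguments, the `scaled` option requires an explicit call. A sketch reusing `lsi` and `corpus` from the toy model under Example #3:

topic_vec = lsi.__getitem__(corpus[0], scaled=True)  # s^-1 * u^-1 * x
print(topic_vec)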
Example #9
import pickle

import numpy as np

from gensim import matutils, similarities


def save_lsi_w2v_matrix(corpus, tfidf_model, lsi_model,
                        filename='../data/big_matrix_wgt.index'):
    """
    Function which concatenates the LSI matrix created in stage 1 to the
    Doc2Vec matrix created by 'doc_matrix'.  The resulting matrix is pickled
    and saved to disk along with its resulting similarity matrix.

    INPUT: corpus - a gensim corpus object
           tfidf_model - a trained gensim TFIDF model
           lsi_model - a trained gensim LSI model
    """
    lsi = lsi_model[tfidf_model[corpus]]
    numpy_matrix = matutils.corpus2dense(lsi, num_terms=400).T
    with open('../data/w2v_matrix_wgt.obj', 'rb') as f:  # pickle files must be opened in binary mode
        mat = pickle.load(f)
    big_matrix = np.append(numpy_matrix, mat, axis=1).T
    big_lsi = matutils.Dense2Corpus(big_matrix)
    index = similarities.MatrixSimilarity(big_lsi)
    with open('../data/big_matrix_wgt.obj', 'wb') as file1:
        pickle.dump(big_matrix.T, file1)
    index.save(filename)
Example #10
 # Insert a word ranked deep in topic K // 2 (the 250th largest) into its top 10
 topicID, wordID = K // 2, np.argsort(fee[K // 2])[-250]
 feestar, w2i = generateTargetFee(fee, topicID, wordID, dcyFile)
 print("Attacking corpus to insert \'%s\' in topic no. %d\n" %
       (w2i, topicID))
 M_new = outer.update(eta, phi, fee, feestar, M_0, M, 1, loss_func_ind)
 riskNorm.append(loss(fee, feestar, loss_func_ind))
 wordRank.append(rankOf(wordID, fee[topicID]))
 print("Rank of word \'%s\' = %d" % (w2i, wordRank[-1]))
 print("Risk function ||fee-feestar||: %f" % (riskNorm[-1]))
 print('Iteration %d complete' % it)
 print('************************************************************\n')
 while (loss(fee, feestar, loss_func_ind) > TOL and it < maxiter):
     M = M_new
     print('dimensions of M: %ix%i' % (M.shape[0], M.shape[1]))
     corpus = matutils.Dense2Corpus(M, documents_columns=False)
     corpora.BleiCorpus.serialize(corpFile, corpus)
     eta, gamma, phi, fee = findVariationalParams(corpFile, paramFolder, D,
                                                  V, alpha, K)
     feestar, perm = permuteFee(feestar, fee)
     riskNorm.append(loss(fee, feestar, loss_func_ind))
     M_new = outer.update(eta, phi, fee, feestar, M_0, M, it, loss_func_ind)
     wordRank.append(rankOf(wordID, fee[perm[topicID]]))
     it += 1
     print("norm(M-M_0): %f" % (np.linalg.norm(M_new - M_0, 1)))
     print("Risk Function ||fee-feestar||: %f" % (riskNorm[-1]))
     print("Rank of word \'%s\' = %d" % (w2i, wordRank[-1]))
     print('Iteration %d complete' % it)
     print('************************************************************\n')
 M_final = np.int32(M)
 corpus = matutils.Dense2Corpus(M_final, documents_columns=False)
Example #11
print(corpus)

#TRAIN TFIDF MODEL
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
tfidf.save('emma/model/emma.tfidf') # save TFIDF model trained on emma

#CREATE TFIDF TRANSFORM
corpus_tfidf = tfidf[corpus] # create a wrapper over the original corpus: bow->tfidf
corpora.MmCorpus.serialize('emma/emma_tfidf.mm', corpus_tfidf)

# TRAIN LSI MODEL
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10) # initialize an LSI transformation
lsi.save('emma/model/emma.lsi') # save LSI model trained on emma 

# CREATE LSI INSTANCE
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpora.MmCorpus.serialize('emma/emma_doc_lsi.mm', corpus_lsi)
lsi.print_topics(10)

# CREATE TERM/TOPIC MATRIX (numTerms x numTopics) AS A CORPUS
termcorpus_lsi = matutils.Dense2Corpus(lsi.projection.u.T) 
corpora.MmCorpus.serialize('emma/emma_term_lsi.mm', termcorpus_lsi)

# CREATE LSI DOCUMENT SIMILARITY INDEX
index = similarities.Similarity('emma/doc_index', corpus_lsi, num_features = 10)
index.save('emma/emma_doc_lsi.index')

# CREATE LSI TERM SIMILARITY INDEX
index = similarities.Similarity('emma/term_index', termcorpus_lsi, num_features = 10)
index.save('emma/emma_term_lsi.index')
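A follow-up sketch: load the saved document index and run a query against it. The query words are illustrative; `dictionary`, `tfidf`, `lsi`, and `similarities` come from the script above.

# QUERY THE LSI DOCUMENT SIMILARITY INDEX
doc_index = similarities.Similarity.load('emma/emma_doc_lsi.index')
query_bow = dictionary.doc2bow("emma woodhouse handsome clever".split())
sims = doc_index[lsi[tfidf[query_bow]]]  # cosine similarity to every document
print(sorted(enumerate(sims), key=lambda item: -item[1])[:5])  # top 5 matches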
Example #12
from gensim import matutils


def dense_to_corpus(numpy_matrix):
    # wrap a dense numpy matrix (documents as columns) as a streamed gensim corpus
    corpus = matutils.Dense2Corpus(numpy_matrix)
    return corpus
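Minimal usage of this wrapper with a toy matrix; note that Dense2Corpus treats columns as documents by default.

import numpy as np

corpus = dense_to_corpus(np.array([[1.0, 0.0],
                                   [0.0, 2.0]]))
print(list(corpus))  # [[(0, 1.0)], [(1, 2.0)]]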
Example #13
    tmp1 = int(K/2)
    tmp2 = int(V/2)  
    feestar[tmp1][tmp2] += 0.2*feestar[tmp1][tmp2+1]
    feestar[tmp1][tmp2+1] -= 0.2*feestar[tmp1][tmp2+1]
    ########################################
    M = M_0
    # Use fee in outer. My fee calculation is vectorized
    M_new = outer.update(eta, phi, feestar, M_0, M)
    it = 1
    print('Iteration %d complete' % it)
    while np.linalg.norm(M - M_new, 1) / np.linalg.norm(M, 1) > 0.01:
        M = M_new
        # Made some modifications here:
        # Blei's LDA C code operates on a corpus rather than on M directly,
        # hence a new corpus is written out for each M.
        corpus = matutils.Dense2Corpus(M_new, documents_columns=False)
        corpora.BleiCorpus.serialize(corpFile, corpus)
        eta, gamma, phi, fee = findVariationalParams(M, corpFile, paramFolder, alpha, K)
        it += 1
        M_new = outer.update(eta, phi, feestar, M_0, M)
        print('Iteration %d complete' % it)

    # vanilla LDA is still left to be done here.
    # Actually you should change
    # phi_star properly to see if everything works fine

    M_final = outer.project_to_int(M)
    t1 = time.time()
    print("Time taken = %f sec" % (t1 - t0))