import logging

import numpy
from gensim import matutils

logger = logging.getLogger(__name__)


def raw2ppmi(cooccur, word2id, k_shift=1.0):
    """
    Convert raw counts from `get_cooccur` into positive PMI values (as per
    Levy & Goldberg), in place. The result is an efficient stream of sparse
    word vectors (no extra data copy).
    """
    logger.info("computing PPMI on co-occurrence counts")
    # The following lines are a bit tedious, as we try to avoid making
    # temporary copies of the (large) `cooccur` matrix.
    marginal_word = cooccur.sum(axis=1)
    marginal_context = cooccur.sum(axis=0)
    cooccur /= marginal_word[:, None]  # #(w, c) / #w
    cooccur /= marginal_context  # #(w, c) / (#w * #c)
    cooccur *= marginal_word.sum()  # #(w, c) * D / (#w * #c)
    numpy.log(cooccur, out=cooccur)  # PMI = log(#(w, c) * D / (#w * #c))

    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift,))
    cooccur -= numpy.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    cooccur.clip(0.0, out=cooccur)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(cooccur):
        cooccur[i] = matutils.unitvec(vec)

    return matutils.Dense2Corpus(cooccur, documents_columns=False)
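# Minimal usage sketch (not from the original source): feed raw2ppmi a tiny
# random co-occurrence matrix and stream out the unit-length SPPMI vectors.
# `word2id` is only a placeholder here, since raw2ppmi never dereferences it.
cooccur = numpy.random.randint(1, 10, size=(5, 5)).astype(numpy.float64)
word2id = {"w%d" % i: i for i in range(5)}
for word_no, vec in enumerate(raw2ppmi(cooccur, word2id, k_shift=5.0)):
    print(word_no, vec)  # sparse list of (context_id, weight) tuples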
from gensim import matutils


def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    # dense numpy matrix -> streamed gensim corpus, and back to dense
    corpus = matutils.Dense2Corpus(numpy_matrix)
    numpy_matrix = matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
    # scipy sparse matrix -> streamed gensim corpus, and back to sparse CSC
    corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(corpus)
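# Hedged round-trip demo (assumed data, not from the original source): the
# conversions above should reproduce the input up to float32 rounding.
import numpy as np
import scipy.sparse
from gensim import matutils

dense = np.random.rand(4, 3)  # 4 terms x 3 documents
roundtrip = matutils.corpus2dense(matutils.Dense2Corpus(dense), num_terms=4)
assert np.allclose(dense, roundtrip)

sparse = scipy.sparse.random(4, 3, density=0.5, format='csc')
csc_again = matutils.corpus2csc(matutils.Sparse2Corpus(sparse), num_terms=4)
assert np.allclose(sparse.toarray(), csc_again.toarray())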
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.

    If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    # # convert input to dense, then do dense * dense multiplication
    # # ± same performance as above (BLAS dense * dense is better optimized
    # # than scipy.sparse), but consumes more memory
    # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
    # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

    # # use np's advanced indexing to simulate sparse * dense
    # # ± same speed again
    # u = self.projection.u[:, :self.num_topics]
    # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
    # for vecno, vec in enumerate(bow):
    #     indices, data = zip(*vec) if vec else ([], [])
    #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

    if not is_corpus:
        # convert back from matrix into a 1d vec
        topic_dist = topic_dist.reshape(-1)

    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
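# Assumed usage sketch (toy data, not from the original source): transform a
# single bag-of-words vector, or a whole corpus, through a trained LsiModel.
from gensim import corpora, models

texts = [["human", "computer", "interaction"], ["graph", "trees", "minors"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

print(lsi[corpus[0]])    # single document -> list of (topic_id, topic_value)
for doc in lsi[corpus]:  # whole corpus -> lazily transformed corpus (chunked)
    print(doc)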
def transform(self, X, y=None):
    corpus = matutils.Dense2Corpus(np.transpose(X))
    # Apply the semantic model to the training set bag of words (fast)
    feat = self.semSpace[corpus]
    # Convert from TransformedCorpus datatype to a numpy doc x topic array
    # (medium speed, needs more benchmarking)
    topics_csr = matutils.corpus2csc(feat)
    X_ = topics_csr.T.toarray()
    return X_
def fit(self, X, y=None):
    corpus = matutils.Dense2Corpus(np.transpose(X))
    # construct a semantic model based on document-topic similarity
    # (15-20 min for 1500k reviews?)
    self.semSpace = lsi(corpus, id2word=self.this_dict, num_topics=self.d,
                        chunksize=20000, distributed=self.distributed)
    return self
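# Hedged usage note: the two methods above appear to belong to an sklearn-style
# transformer (the alias `lsi` for gensim's LsiModel and the attributes
# `this_dict`, `d`, `distributed` are assumed to be set elsewhere). The
# intended flow would be:
#
#   transformer = SemanticTransformer(this_dict=dictionary, d=100)  # hypothetical name
#   transformer.fit(X_train)                  # X: documents x terms count matrix
#   X_topics = transformer.transform(X_test)  # -> documents x d topic matrix
#
# np.transpose(X) is needed because Dense2Corpus expects terms as rows when
# documents_columns=True (the default).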
import logging

import numpy as np
from gensim import matutils

logger = logging.getLogger(__name__)


def shift_clip_pmi(pmimtr, k_shift=1.0):
    """
    Turn a PMI matrix into an SPPMI matrix by shifting all values down by
    log(k) and then clipping negative values to 0.

    :param pmimtr: The matrix of PMI values.
    :param k_shift: The shift factor k.
    :return: A sparse SPPMI matrix with unit-length rows.
    """
    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift,))
    pmimtr -= np.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(pmimtr):
        pmimtr[i] = matutils.unitvec(vec)

    return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T
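# Minimal sketch (assumed data): run shift_clip_pmi on a small random matrix
# standing in for real PMI values, and inspect the sparse result.
pmi = np.random.randn(4, 4)
sppmi = shift_clip_pmi(pmi, k_shift=5.0)
print(sppmi.shape, sppmi.nnz)  # scipy sparse matrix, one row per word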
def raw2ppmi(cooccur, word2id, k_shift=1.0):
    logger.info("computing PPMI on co-occurrence counts")
    marginal_word = cooccur.sum(axis=1)
    marginal_context = cooccur.sum(axis=0)
    cooccur /= marginal_word[:, None]  # #(w, c) / #w
    cooccur /= marginal_context  # #(w, c) / (#w * #c)
    cooccur *= marginal_word.sum()  # #(w, c) * D / (#w * #c)
    numpy.log(cooccur, out=cooccur)  # PMI = log(#(w, c) * D / (#w * #c))

    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift,))
    cooccur -= numpy.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    cooccur.clip(0.0, out=cooccur)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(cooccur):
        cooccur[i] = matutils.unitvec(vec)

    return matutils.Dense2Corpus(cooccur, documents_columns=False)
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    vec = matutils.corpus2csc(bow, num_terms=self.num_terms)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x
    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
def save_lsi_w2v_matrix(corpus, tfidf_model, lsi_model,
                        filename='../data/big_matrix_wgt.index'):
    """
    Concatenate the LSI matrix created in stage 1 with the Doc2Vec matrix
    created by `doc_matrix`. The resulting matrix is pickled and saved to
    disk along with its similarity index.

    INPUT:
    corpus - a gensim corpus object
    tfidf_model - a trained gensim TFIDF model
    lsi_model - a trained gensim LSI model
    """
    lsi = lsi_model[tfidf_model[corpus]]
    numpy_matrix = matutils.corpus2dense(lsi, num_terms=400).T
    with open('../data/w2v_matrix_wgt.obj', 'rb') as f:
        mat = pickle.load(f)
    big_matrix = np.append(numpy_matrix, mat, axis=1).T
    big_lsi = matutils.Dense2Corpus(big_matrix)
    index = similarities.MatrixSimilarity(big_lsi)
    with open('../data/big_matrix_wgt.obj', 'wb') as f:
        pickle.dump(big_matrix.T, f)
    index.save(filename)
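# Shape note (inferred from the code above, not from the original source):
# `numpy_matrix` is documents x 400 LSI features, so the pickled Doc2Vec
# matrix `mat` must also be documents x d2v_dim for np.append(..., axis=1)
# to yield documents x (400 + d2v_dim). The final .T puts features back in
# rows, which is the layout Dense2Corpus expects by default.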
# Attack goal: push the word currently ranked ~250th in topic K/2 into that
# topic's top 10.
topicID, wordID = K // 2, np.argsort(fee[K // 2])[-250]
feestar, w2i = generateTargetFee(fee, topicID, wordID, dcyFile)
print("Attacking corpus to insert '%s' in topic no. %d\n" % (w2i, topicID))
M_new = outer.update(eta, phi, fee, feestar, M_0, M, 1, loss_func_ind)
riskNorm.append(loss(fee, feestar, loss_func_ind))
wordRank.append(rankOf(wordID, fee[topicID]))
print("Rank of word '%s' = %d" % (w2i, wordRank[-1]))
print("Risk function ||fee-feestar||: %f" % (riskNorm[-1]))
print('Iteration %d complete' % it)
print('************************************************************\n')

while loss(fee, feestar, loss_func_ind) > TOL and it < maxiter:
    M = M_new
    print('dimensions of M: %ix%i' % (M.shape[0], M.shape[1]))
    corpus = matutils.Dense2Corpus(M, documents_columns=False)
    corpora.BleiCorpus.serialize(corpFile, corpus)
    eta, gamma, phi, fee = findVariationalParams(corpFile, paramFolder, D, V, alpha, K)
    feestar, perm = permuteFee(feestar, fee)
    riskNorm.append(loss(fee, feestar, loss_func_ind))
    M_new = outer.update(eta, phi, fee, feestar, M_0, M, it, loss_func_ind)
    wordRank.append(rankOf(wordID, fee[perm[topicID]]))
    it += 1
    print("norm(M-M_0): %f" % (np.linalg.norm(M_new - M_0, 1)))
    print("Risk Function ||fee-feestar||: %f" % (riskNorm[-1]))
    print("Rank of word '%s' = %d" % (w2i, wordRank[-1]))
    print('Iteration %d complete' % it)
    print('************************************************************\n')

M_final = np.int32(M)
corpus = matutils.Dense2Corpus(M_final, documents_columns=False)
print(corpus)

# TRAIN TFIDF MODEL
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
tfidf.save('emma/model/emma.tfidf')  # save TFIDF model trained on emma

# CREATE TFIDF TRANSFORM
corpus_tfidf = tfidf[corpus]  # create a wrapper over the original corpus: bow->tfidf
corpora.MmCorpus.serialize('emma/emma_tfidf.mm', corpus_tfidf)

# TRAIN LSI MODEL
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)  # initialize an LSI transformation
lsi.save('emma/model/emma.lsi')  # save LSI model trained on emma

# CREATE LSI INSTANCE
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpora.MmCorpus.serialize('emma/emma_doc_lsi.mm', corpus_lsi)
lsi.print_topics(10)

# CREATE TERM/TOPIC MATRIX (numTerms x numTopics)
termcorpus_lsi = matutils.Dense2Corpus(lsi.projection.u.T)
corpora.MmCorpus.serialize('emma/emma_term_lsi.mm', termcorpus_lsi)

# CREATE LSI DOCUMENT SIMILARITY INDEX
index = similarities.Similarity('emma/doc_index', corpus_lsi, num_features=10)
index.save('emma/emma_doc_lsi.index')

# CREATE LSI TERM SIMILARITY INDEX
index = similarities.Similarity('emma/term_index', termcorpus_lsi, num_features=10)
index.save('emma/emma_term_lsi.index')
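# Assumed follow-up (not in the original script): query the saved document
# index with new text, going through the same bow -> tfidf -> lsi chain.
doc_index = similarities.Similarity.load('emma/emma_doc_lsi.index')
query_bow = dictionary.doc2bow("emma woodhouse handsome clever".split())
sims = doc_index[lsi[tfidf[query_bow]]]
print(sorted(enumerate(sims), key=lambda item: -item[1])[:5])  # top-5 matches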
from gensim import matutils


def dense_to_corpus(numpy_matrix):
    # wrap a dense terms x documents numpy matrix as a streamed gensim corpus
    corpus = matutils.Dense2Corpus(numpy_matrix)
    return corpus
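# Quick illustration (assumed data): Dense2Corpus streams the columns of the
# dense matrix as sparse gensim documents, without copying the data.
import numpy as np

mat = np.array([[1.0, 0.0],
                [0.0, 2.0],
                [3.0, 0.0]])  # 3 terms x 2 documents
for doc in dense_to_corpus(mat):
    print(doc)
# -> [(0, 1.0), (2, 3.0)]
#    [(1, 2.0)]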
tmp1 = int(K / 2)
tmp2 = int(V / 2)
feestar[tmp1][tmp2] += 0.2 * feestar[tmp1][tmp2 + 1]
feestar[tmp1][tmp2 + 1] -= 0.2 * feestar[tmp1][tmp2 + 1]
########################################

M = M_0
# Use fee in outer. My fee calculation is vectorized.
M_new = outer.update(eta, phi, feestar, M_0, M)
it = 1
print('Iteration %d complete' % it)

while np.linalg.norm(M - M_new, 1) / np.linalg.norm(M, 1) > 0.01:
    M = M_new
    # Made some modifications here:
    # Blei-lda's C code doesn't operate on M but on a corpus,
    # hence for each M a new corpus is written out.
    corpus = matutils.Dense2Corpus(M_new, documents_columns=False)
    corpora.BleiCorpus.serialize(corpFile, corpus)
    eta, gamma, phi, fee = findVariationalParams(M, corpFile, paramFolder, alpha, K)
    it += 1
    M_new = outer.update(eta, phi, feestar, M_0, M)
    print('Iteration %d complete' % it)

# Vanilla LDA is still left to be done here.
# Actually you should change phi_star properly to see if everything works fine.
M_final = outer.project_to_int(M)
t1 = time.time()
print("Time taken = %f sec" % (t1 - t0))