def create_documents_view(self, corpus, ir_mode):
    dictionary, pdocs = self.create_dictionary(corpus)
    bow = self.docs2bows(corpus, dictionary, pdocs)
    loaded_corpus = corpora.MmCorpus('vsm_docs.mm')  # recover the serialized corpus
    if ir_mode == 1:
        # sublinear TF model: weight each term as 1 + log2(raw count)
        model = [[(w[0], 1 + np.log2(w[1])) for w in v] for v in bow]
    elif ir_mode == 2:
        model = models.TfidfModel(loaded_corpus)  # TF-IDF model
    elif ir_mode == 3:
        model = models.LdaModel(loaded_corpus)  # LDA model
    elif ir_mode == 4:
        model = models.LdaMulticore(loaded_corpus)  # LDA multicore model
    elif ir_mode == 5:
        model = models.LsiModel(loaded_corpus)  # LSI model
    elif ir_mode == 6:
        model = models.RpModel(loaded_corpus)  # RP model
    elif ir_mode == 7:
        model = models.LogEntropyModel(loaded_corpus)  # log-entropy model
    else:
        # previously fell through and raised UnboundLocalError on `model`
        raise ValueError("unknown ir_mode: %s" % ir_mode)
    return model, dictionary
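# A minimal, self-contained sketch of the ir_mode == 1 branch above: sublinear
# TF weighting, 1 + log2(count), applied to a toy bag-of-words corpus. The toy
# texts and names below are illustrative, not from the original module.
import numpy as np
from gensim import corpora

toy_texts = [["human", "computer", "interface", "computer"],
             ["graph", "trees", "graph"]]
toy_dictionary = corpora.Dictionary(toy_texts)
toy_bow = [toy_dictionary.doc2bow(text) for text in toy_texts]

# each (term_id, raw_count) pair becomes (term_id, 1 + log2(raw_count))
toy_tf = [[(term_id, 1 + np.log2(count)) for term_id, count in doc] for doc in toy_bow]
print(toy_tf)  # [[(0, 2.0), (1, 1.0), (2, 1.0)], [(3, 2.0), (4, 1.0)]]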
def train(self, arg_fname, is_pre=True, method='lsi', **params):
    self.fname = arg_fname
    self.method = method
    self._generate_conf()
    if is_pre:
        self.docs, self.dictionary, corpus = self._preprocess()
    else:
        # pickles must be opened in binary mode
        self.docs = pickle.load(open(self.conf['fname_docs'], 'rb'))
        self.dictionary = corpora.Dictionary.load(self.conf['fname_dict'])
        corpus = corpora.MmCorpus(self.conf['fname_corpus'])

    logger.info("training TF-IDF model")
    self.tfidf = models.TfidfModel(corpus, id2word=self.dictionary)
    corpus_tfidf = self.tfidf[corpus]

    if method == 'lsi':
        logger.info("training LSI model")
        self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, **params)
        self.lsi.print_topics(-1)
        self.lsi_similarity_index = similarities.MatrixSimilarity(self.lsi[corpus_tfidf])
        self.para = self.lsi[corpus_tfidf]
    elif method == 'lda_tfidf':
        logger.info("training LDA model on TF-IDF vectors")
        # try 6 workers here instead of the original 8
        self.lda_tfidf = models.LdaMulticore(corpus_tfidf, id2word=self.dictionary, workers=6, **params)
        self.lda_tfidf.print_topics(-1)
        # was self.lda[...], which referenced the wrong attribute in this branch
        self.lda_tfidf_similarity_index = similarities.MatrixSimilarity(self.lda_tfidf[corpus_tfidf])
        self.para = self.lda_tfidf[corpus_tfidf]
    elif method == 'lda':
        logger.info("training LDA model")
        # try 6 workers here instead of the original 8
        self.lda = models.LdaMulticore(corpus, id2word=self.dictionary, workers=6, **params)
        self.lda.print_topics(-1)
        self.lda_similarity_index = similarities.MatrixSimilarity(self.lda[corpus])
        self.para = self.lda[corpus]
    elif method == 'logentropy':
        logger.info("training a log-entropy model")
        self.logent = models.LogEntropyModel(corpus, id2word=self.dictionary)
        self.logent_similarity_index = similarities.MatrixSimilarity(self.logent[corpus])
        self.para = self.logent[corpus]
    else:
        msg = "unknown semantic method %s" % method
        logger.error(msg)
        raise NotImplementedError(msg)
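# A minimal sketch of the bow -> tfidf -> lsi -> similarity-index pipeline that
# train() wires together, on a toy corpus; every name below is illustrative.
from gensim import corpora, models, similarities

demo_texts = [["shipment", "of", "gold", "damaged"],
              ["delivery", "of", "silver", "arrived"],
              ["shipment", "of", "gold", "arrived"]]
demo_dict = corpora.Dictionary(demo_texts)
demo_corpus = [demo_dict.doc2bow(text) for text in demo_texts]

demo_tfidf = models.TfidfModel(demo_corpus, id2word=demo_dict)
demo_corpus_tfidf = demo_tfidf[demo_corpus]
demo_lsi = models.LsiModel(demo_corpus_tfidf, id2word=demo_dict, num_topics=2)
demo_index = similarities.MatrixSimilarity(demo_lsi[demo_corpus_tfidf])

# a query folded through the same chain ranks the closest training doc first
query = demo_lsi[demo_tfidf[demo_dict.doc2bow(["gold", "arrived"])]]
print(sorted(enumerate(demo_index[query]), key=lambda item: -item[1]))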
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform small corpus to lsi: bow->log_ent->fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]

    # compute pairwise similarity matrix and extract upper triangular
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[np.triu_indices(len(corpus), 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s", cor)
    self.assertTrue(cor > 0.6)
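# Why np.triu_indices(len(corpus), 1) above: the pairwise similarity matrix is
# symmetric with ones on the diagonal, so only the strict upper triangle holds
# the unique document pairs. A toy illustration (values made up):
import numpy as np

sim = np.array([[1.0, 0.8, 0.3],
                [0.8, 1.0, 0.5],
                [0.3, 0.5, 1.0]])
print(sim[np.triu_indices(3, 1)])  # [0.8 0.3 0.5] -> pairs (0,1), (0,2), (1,2)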
def query_similarity(queries, corpus, method='tfidf', n_neighbors=2):
    dictionary, corpusdic = build_corpusdic(corpus)
    if method == 'lsi':
        mdl = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        mdl = models.TfidfModel(corpusdic)
    elif method == 'rp':
        mdl = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        mdl = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        mdl = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        mdl = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        mdl = models.NormModel(corpusdic, norm='l2')  # was `corpus`; the model expects the BoW corpus
    else:
        raise ValueError("unknown model method: %s" % method)

    index = similarities.MatrixSimilarity(mdl[corpusdic])
    indx_list = []
    sim_list = []
    for query in queries:
        vec_bow = dictionary.doc2bow(query.lower().split())
        vec_query = mdl[vec_bow]  # project the query into the chosen model's space
        sims = index[vec_query]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])[:n_neighbors]
        indx_, sim_ = np.array(sims).transpose()
        indx_list.append(indx_)
        sim_list.append(sim_)
    return indx_list, sim_list
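# A hedged usage sketch for query_similarity(). build_corpusdic() is defined
# elsewhere; the stand-in below only assumes its contract, returning
# (dictionary, bow_corpus), so the call can be tried in isolation.
from gensim import corpora

def build_corpusdic(corpus):
    tokenized = [doc.lower().split() for doc in corpus]
    dictionary = corpora.Dictionary(tokenized)
    return dictionary, [dictionary.doc2bow(text) for text in tokenized]

docs = ["human machine interface", "graph of trees", "graph minors survey"]
indx_list, sim_list = query_similarity(["graph trees"], docs, method='tfidf', n_neighbors=2)
print(indx_list, sim_list)  # indices and scores of the two closest documents per query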
def build_doc_similarity_table(corpus, method='tfidf', n_neighboors=3, return_similarity=True,
                               batch_size=5000, doc_dtype=np.int64, score_dtype=np.float16):
    """
    Batch-processing wrapper around ``find_similar_docs`` that finds the
    ``n_neighboors`` most similar docs for every doc in the corpus.

    *Note 1*: increasing ``batch_size`` increases memory usage, but can be faster.
    """
    # this loop structure could be parallelized across multiple CPUs
    dictionary, corpusdic = build_corpusdic(corpus)
    if method == 'lsi':
        mdl = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        mdl = models.TfidfModel(corpusdic)
    elif method == 'rp':
        mdl = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        mdl = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        mdl = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        mdl = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        mdl = models.NormModel(corpusdic, norm='l2')  # was `corpus`; the model expects the BoW corpus
    else:
        raise ValueError("unknown model method: %s" % method)

    # determine the pairwise similarities between documents
    index = similarities.MatrixSimilarity(mdl[corpusdic])
    doc_matrix = index[mdl[corpusdic]]
    doc_count = doc_matrix.shape[0]
    similarity_table = np.zeros((doc_count, n_neighboors), dtype=doc_dtype)
    similarity_score = None
    if return_similarity:
        similarity_score = np.zeros((doc_count, n_neighboors), dtype=score_dtype)

    start_ = 0
    while start_ < doc_count:
        end_ = min(start_ + batch_size, doc_count)
        query_index = np.arange(start_, end_)
        query_matrix = doc_matrix[query_index]
        similarity_table[query_index], similarities_ = find_similar_docs(
            doc_matrix, query_matrix, n_neighboors=n_neighboors,
            method='nearest', return_similarity=return_similarity)
        if return_similarity:
            similarity_score[query_index] = similarities_
        start_ += batch_size
    return similarity_table, similarity_score
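# find_similar_docs() is external to this snippet. A plausible minimal version
# for method='nearest' is sketched below as an assumption, not the original:
# rank every document per query row and keep the n_neighboors best columns.
import numpy as np

def find_similar_docs(doc_matrix, query_matrix, n_neighboors=3,
                      method='nearest', return_similarity=True):
    # argsort on the negated scores gives a descending ranking per row
    order = np.argsort(-query_matrix, axis=1)[:, :n_neighboors]
    scores = np.take_along_axis(query_matrix, order, axis=1) if return_similarity else None
    return order, scores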
def gen_data(train, test, no_below, no_above, num_topics=300):
    mydictionary = My_dictionary(train, test, no_below=no_below, no_above=no_above)
    train_corpus = mydictionary.train_corpus
    test_corpus = mydictionary.test_corpus
    dictionary = mydictionary.dictionary
    save_load(mode='save',
              files={
                  'train_corpus': train_corpus,
                  'test_corpus': test_corpus,
                  'dictionary': dictionary
              })

    # TF-IDF
    print('tfidf...')
    tfidf = models.TfidfModel(train_corpus)  # step 1: initialize a model
    train_corpus_tfidf = tfidf[train_corpus]
    test_corpus_tfidf = tfidf[test_corpus]  # apply the transformation to the whole corpus
    train_tfidf_array = corpustoarray(train_corpus_tfidf)
    test_tfidf_array = corpustoarray(test_corpus_tfidf)
    pd.DataFrame(train_tfidf_array).to_csv('file\\train_tfidf.csv')
    pd.DataFrame(test_tfidf_array).to_csv('file\\test_tfidf.csv')

    # LSI: a num_topics of 200-500 is regarded as the "gold standard" range
    print('lsi...')
    lsi = models.LsiModel(train_corpus_tfidf, id2word=dictionary, num_topics=num_topics)  # initialize an LSI transformation
    train_corpus_lsi = lsi[train_corpus_tfidf]  # double wrapping over the original corpus: bow->tfidf->fold-in-lsi
    test_corpus_lsi = lsi[test_corpus_tfidf]  # was lsi[test_corpus]; fold-in must go through the same tfidf space
    train_lsi_array = corpustoarray(train_corpus_lsi)
    test_lsi_array = corpustoarray(test_corpus_lsi)
    pd.DataFrame(train_lsi_array).to_csv('file\\train_lsi.csv')
    pd.DataFrame(test_lsi_array).to_csv('file\\test_lsi.csv')
    # lsi = models.LsiModel.load('model\\model.lsi')

    # RP
    print('rp...')
    rp = models.RpModel(train_corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    train_corpus_rp = rp[train_corpus_tfidf]
    test_corpus_rp = rp[test_corpus_tfidf]  # was rp[test_corpus]; project from the tfidf space the model was trained on
    train_rp_array = corpustoarray(train_corpus_rp)
    test_rp_array = corpustoarray(test_corpus_rp)
    pd.DataFrame(train_rp_array).to_csv('file\\train_rp.csv')
    pd.DataFrame(test_rp_array).to_csv('file\\test_rp.csv')

    '''
    # LDA (2003): first proposed by David M. Blei, Andrew Ng and Michael I. Jordan.
    # This block has a bug and is left disabled:
    lda = models.LdaSeqModel(corpus, id2word=dictionary, num_topics=300)
    corpus_lda = lda[corpus]
    '''

    # HDP (2011) Wang, Paisley, Blei: http://proceedings.mlr.press/v15/wang11a/wang11a.pdf
    print('hdp...')
    hdp = models.HdpModel(train_corpus, id2word=dictionary)
    train_corpus_hdp = hdp[train_corpus]
    test_corpus_hdp = hdp[test_corpus]
    train_hdp_array = corpustoarray(train_corpus_hdp)
    test_hdp_array = corpustoarray(test_corpus_hdp)
    pd.DataFrame(train_hdp_array).to_csv('file\\train_hdp.csv')
    pd.DataFrame(test_hdp_array).to_csv('file\\test_hdp.csv')

    # log-entropy model (2015)
    print('log...')
    log = models.LogEntropyModel(train_corpus)
    train_corpus_log = log[train_corpus]
    test_corpus_log = log[test_corpus]
    train_log_array = corpustoarray(train_corpus_log)
    test_log_array = corpustoarray(test_corpus_log)
    pd.DataFrame(train_log_array).to_csv('file\\train_log.csv')
    pd.DataFrame(test_log_array).to_csv('file\\test_log.csv')
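# corpustoarray() is defined elsewhere; a likely implementation (an assumption,
# not the original) densifies a streamed gensim corpus into one row per document
# via matutils.corpus2dense, which yields a (terms x docs) matrix to transpose.
from gensim import matutils

def corpustoarray(corpus):
    docs = [list(doc) for doc in corpus]  # materialize so the width can be inferred
    # assumes at least one nonzero entry somewhere in the corpus
    num_terms = 1 + max(term_id for doc in docs for term_id, _ in doc)
    return matutils.corpus2dense(docs, num_terms).T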