from gensim import corpora
from gensim.models import LdaModel


def get_lda(text_dictionary):
    train = []
    for key, line in text_dictionary.items():
        line = line.strip().split(' ')
        train.append(line)
    print(len(train))
    print(' '.join(train[2]))
    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    topic_list = lda.print_topics(20)
    print(type(topic_list))
    print(len(topic_list))
    for topic in topic_list:
        print(topic)
    print("Topic with id 1")
    print(lda.print_topic(1))
    print('Given a new document, output its topic distribution')
    # test_doc = list(new_doc)  # tokenize the new document
    test_doc = train[2]  # inspect the topic distribution of the third training sample
    doc_bow = dictionary.doc2bow(test_doc)  # convert the document to bag-of-words
    doc_lda = lda[doc_bow]  # topic distribution of the new document
    # Print the topic distribution of the new document
    print(doc_lda)
    for topic in doc_lda:
        print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))

from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel


def chosen_lda(corpus, dictionary, data, n_topics, alpha=.1, eta=0.01):
    '''
    Trains a Gensim LDA model with chosen hyperparameters.

    Arguments:
    ----------
    corpus : matrix-format corpus (BOW or TF-IDF)
    dictionary : corpus-related dictionary
    data : text data for coherence score computation
    n_topics : number of desired topics
    alpha : alpha parameter (from 0 to infinity)
    eta : beta parameter (from 0 to infinity)

    Outputs:
    ----------
    lda : trained model
    '''
    # num_topics was hard-coded to 35; it should honor the n_topics argument
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics,
                   random_state=100, alpha=alpha, eta=eta)
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in lda.show_topics(formatted=False)]
    lda_coherence = CoherenceModel(topics=ldatopics, texts=data,
                                   dictionary=dictionary,
                                   window_size=10).get_coherence()
    print(lda_coherence)
    lda.print_topics(num_topics=n_topics)
    lda.save('../03_Dump/model')
    return lda

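# A minimal, hypothetical usage sketch for chosen_lda: the toy texts and the
# alpha/eta values below are illustrative only, and the '../03_Dump/' directory
# is assumed to exist for the save() call inside the function.
from gensim.corpora import Dictionary

toy_texts = [['human', 'computer', 'interaction'],
             ['graph', 'trees', 'minors'],
             ['graph', 'minors', 'survey']]
toy_dictionary = Dictionary(toy_texts)
toy_corpus = [toy_dictionary.doc2bow(text) for text in toy_texts]
toy_lda = chosen_lda(toy_corpus, toy_dictionary, toy_texts,
                     n_topics=2, alpha=0.1, eta=0.01)
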
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def save_model(model_path):
    train_set = get_train_set()
    # Build the training corpus
    dictionary = Dictionary(train_set)
    corpus = [dictionary.doc2bow(text) for text in train_set]
    # Train the LDA model
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(100)
    lda.save(model_path)

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel


def get_gensim_topics(num_topics_list, sentences, print_flag=False):
    """
    Gensim by default employs a version of count vectorization.

    input: sentences (pandas Series of documents; each is retokenized into a
           list of words)
    outputs: perplexity and coherence scores, one per topic count
    prints topics if print_flag == True
    """
    texts = sentences.apply(retokenize).tolist()
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    perplexity_ls = []
    coherence_ls = []
    for num_topics in num_topics_list:
        lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                       random_state=10)
        perplexity_ls.append(lda.log_perplexity(corpus))
        coherence_model_lda = CoherenceModel(model=lda, texts=texts,
                                             dictionary=dictionary,
                                             coherence='c_v')
        coherence_ls.append(coherence_model_lda.get_coherence())
        if print_flag:
            print('Num. Topics: ', num_topics)
            print('')
            for topic in lda.print_topics():
                words = topic[1]
                words_ls = words.split('+')
                words_ls = [w.split('*')[1] for w in words_ls]
                words_ls = [w.replace('"', '') for w in words_ls]
                print(', '.join(words_ls))
                print('')
    return perplexity_ls, coherence_ls

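# Hypothetical call to get_gensim_topics: `sentences` is assumed to be a pandas
# Series of raw strings, and `retokenize` (not defined above) is sketched here
# as a plain lowercase-and-split tokenizer.
import pandas as pd

def retokenize(sentence):
    return sentence.lower().split()

sentences = pd.Series(['the cat sat on the mat',
                       'dogs and cats are common pets',
                       'stock markets rose sharply today'])
perplexities, coherences = get_gensim_topics([2, 3], sentences, print_flag=True)
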
from tqdm import tqdm
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath


def get_lda_model():
    """
    (50, 28767)
    Extract topics from the corpus.
    :return:
    """
    text_array = list()
    with open("jobs-unigrams-filter") as f:
        for line in tqdm(f):
            line = line.strip().split(" ")
            line.remove(line[0])  # drop the leading document id
            text_array.append(line)
    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]
    # Train the model on the corpus.
    lda = LdaModel(common_corpus, id2word=dictionary, num_topics=50,
                   passes=10, iterations=1000)
    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)
    topics = lda.get_topics()
    print(topics.shape)
    topic_list = lda.print_topics(50)
    for topic in topic_list:
        print(topic)

def lda(clean_docs, model_name, topics):
    # Map every normalized word to an integer id
    from gensim import corpora
    dictionary = corpora.Dictionary(clean_docs)

    # Convert each document into its bag-of-words representation, a list of
    # (token_id, token_count) tuples: how often each word occurs in each doc
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # Serialize: save the dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train the LDA model
    from gensim.models import LdaModel
    num_topics = topics  # find this number of topics in the data
    passes = 15
    ldamodel = LdaModel(corpus, num_topics=num_topics,
                        id2word=dictionary, passes=passes)
    ldamodel.save('model_' + model_name + '.gensim')

    topics = ldamodel.print_topics(num_words=5)
    for topic in topics:
        print(topic)

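# Sketch of driving lda() above with a toy set of pre-tokenized documents; the
# dictionary, corpus, and model files land in the current working directory.
toy_docs = [['apple', 'banana', 'fruit'],
            ['car', 'engine', 'wheel'],
            ['fruit', 'salad', 'banana']]
lda(toy_docs, model_name='toy', topics=2)
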
from gensim import corpora, models
from gensim.models import LdaModel


def test(key, text1):
    # comments, messages = read_data()
    # text1 = key_word(comments, messages)
    # Build the dictionary
    dictionary = corpora.Dictionary(text1)
    # Build the corpus
    corpus = [dictionary.doc2bow(text) for text in text1]
    # TF-IDF weighting
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    p_list = []
    topicnum_list = []
    num_topic = 2
    # for i in range(2, 50):
    #     num_topics = i
    lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                   num_topics=num_topic)
    # perplex = lda.log_perplexity(corpus_tfidf)
    # p_list.append(perplex)
    # topicnum_list.append(num_topics)
    topics = lda.print_topics()
    print('*' * 50)
    print(key, ':')
    for i in topics:
        str1 = str(i[0]) + ':'
        print(str1, i[1])

from gensim.corpora import Dictionary
from gensim.models import LdaModel


def Train(train_set):
    # stopwords = codecs.open('stopwords.txt', 'r', encoding='utf8').readlines()
    # stopwords = [w.strip() for w in stopwords]
    # train_set = []
    # for line in train:
    #     line = list(jieba.cut(line))
    #     train_set.append([w for w in line if w not in stopwords])
    # Build the training corpus
    dictionary = Dictionary(train_set)
    corpus = [dictionary.doc2bow(text) for text in train_set]
    # Train the LDA model
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    lda.print_topics(20)

from gensim.models import LdaModel


def create_gensim_lda_model(dictionary, corpus, number_of_topics, words):
    # Create the LDA model
    ldamodel = LdaModel(corpus, num_topics=number_of_topics, id2word=dictionary)
    print(ldamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return ldamodel

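# Hypothetical driver for create_gensim_lda_model: build the dictionary and
# bag-of-words corpus from tokenized documents, then train and print 2 topics
# of 4 words each.
from gensim.corpora import Dictionary

docs = [['rain', 'umbrella', 'wet'],
        ['sun', 'beach', 'hot'],
        ['rain', 'cloud', 'wet']]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
model = create_gensim_lda_model(dictionary, corpus, number_of_topics=2, words=4)
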
import re

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def lda_process(split_text, embedding_size, wordvec):
    new_line, new_dict = [], []
    for line in split_text:
        for w in line.split():
            if w in stopwords:
                continue
            new_line.append(w)
        new_dict.append(new_line)
        new_line = []
    dictionary = Dictionary(new_dict)
    corpus = [dictionary.doc2bow(text) for text in new_dict]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=1, passes=20)
    _, title_terms = lda.print_topics(num_words=5)[0]
    title_vec = []
    sub_terms = title_terms.split('+')
    for term in sub_terms:
        listItems = term.split('*')
        try:
            title_vec.append(
                float(listItems[0]) * wordvec[re.findall(r'\"(.+)\"', listItems[1])[0]])
        except KeyError:
            title_vec.append(float(listItems[0]) * np.zeros(embedding_size))
        # print(wordvec[re.findall(r'\"(.+)\"', listItems[1])])
    title_vector = np.average(np.array(title_vec), axis=0)
    return title_vector.reshape(1, embedding_size)  # was hard-coded to (1, 300)

import csv
import codecs

from gensim import corpora
from gensim.models import LdaModel
from gensim.test.utils import datapath


def LDA_model_from_token(text_file_name):
    token_file_name = text_file_name[:-4] + '.csv'
    print("loading " + token_file_name)
    data_word = []
    with codecs.open(token_file_name, 'r') as f:
        rdr = csv.reader(f)
        next(rdr)
        for i, line in enumerate(rdr):
            data_word.append(line)
    print("Complete loading")
    id2word = corpora.Dictionary(data_word)
    id2word.filter_extremes(no_below=10)  # remove words that appear 10 times or fewer
    texts = data_word
    corpus = [id2word.doc2bow(text) for text in texts]
    lda = LdaModel(corpus, num_topics=10, id2word=id2word)
    temp_file = datapath(token_file_name[:-4])
    lda.save(temp_file)
    lda = LdaModel.load(temp_file)
    topics = lda.print_topics(num_words=10)
    for topic in topics:
        print(topic)

import codecs

from gensim import corpora
from gensim.models import LdaModel


def train(corpuspath, modelpath):
    train = []
    # stopwords = codecs.open('stopWords/1893(utf8).txt', 'r', encoding='utf8').readlines()
    # stopwords = [w.strip() for w in stopwords]
    fp = codecs.open(corpuspath, 'r', encoding='utf8')
    for line in fp:
        line = line.strip()
        if line == '':
            continue
        line = line.split()
        train.append([w for w in line])
    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=20)
    lda.save(modelpath)
    topic_words = open('../result/lda/fact_lad_print-10.txt', 'w', encoding='utf-8')
    print_str = ''
    for topic in lda.print_topics(num_words=100):
        termNumber = topic[0]
        listOfTerms = topic[1].split('+')
        # for term in listOfTerms:
        #     listItems = term.split('*')
        #     # print(listItems)
        #     print(' ', listItems[1], '(', listItems[0], ')', sep='')
        print_str += topic[1] + '\n'
    topic_words.write(print_str)
    topic_words.close()

import pickle

import numpy as np
from gensim.models import LdaModel


def build_lda_model(data_name):
    # Load the training data
    train_data = np.load(open('./dict/' + data_name + '.npy', 'rb'))
    # Load the dictionary
    dictionary = pickle.load(open('./dict/' + data_name + '.pkl', 'rb'))
    lda = LdaModel(train_data, id2word=dictionary, num_topics=20, passes=2,
                   alpha='symmetric', eta=None)
    lda.print_topics(num_topics=20, num_words=10)
    # Save the model
    lda.save('./lda/' + data_name + '.model')

from gensim.corpora import Dictionary
from gensim.models import LdaModel


def main():
    sentence_list = load_data('E:\\tmp\\csv_test')
    stop_words = load_stop_word('dependencies/stop_word.txt')
    word_split = participle(sentence_list, stop_words)
    dictionary = Dictionary(word_split)
    corpus = [dictionary.doc2bow(text) for text in word_split]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print(lda.print_topics(10))

from gensim.models import LdaModel


def save_ldamodel(dictionary, text_data, cnt_cata):
    corpus = [dictionary.doc2bow(text) for text in text_data]
    ldamodel = LdaModel(corpus, num_topics=cnt_cata, id2word=dictionary)
    # Inspect the topics
    for topic in ldamodel.print_topics():
        print(topic[1])
    # save() takes the target path only; the stray "wb" argument is dropped
    ldamodel.save('model/{}/ADA.gensim'.format(cnt_cata))

def LDA(self, topics):
    # Convert the vectorized data to a gensim corpus object
    corpus = gensim.matutils.Sparse2Corpus(self.corpusVectorized,
                                           documents_columns=False)
    # Maintain a dictionary for index-word mapping
    id2word = dict((v, k) for k, v in self.vectorizer.vocabulary_.items())
    print(id2word)
    # Build the LDA model
    lda = LdaModel(corpus, num_topics=topics, id2word=id2word, passes=10)
    print(lda.print_topics())
    lda_docs = lda[corpus]
    for row in lda_docs:
        print(row)
    scores = np.round([[doc[1] for doc in row] for row in lda_docs], 3)
    print(scores)
    cols = []
    for i in range(topics):
        cols.append("topic " + str(i))
    df_lda = pd.DataFrame(scores, columns=cols)
    return df_lda

class TopicModel(object):

    def dataPreprocess(self, path):
        self.preprocess = Preprocess()
        self.preprocess.reader(path)

    def train(self):
        self.lda = LdaModel(self.preprocess.corpus,
                            id2word=self.preprocess.dictionary,
                            num_topics=10)
        for topic in self.lda.print_topics(num_topics=10, num_words=10):
            print(topic[1])

    def evaluation(self):
        pass

import codecs

from gensim import corpora
from gensim.models import LdaModel


def lda():
    # Remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r',
                            encoding='utf8').readlines()
    stopwords = [w.strip() for w in stopwords]
    # Raw strings keep '\n' in the Windows paths from being read as a newline
    fp = codecs.open(r'D:\nlp\corpora\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([w for w in line if w not in stopwords])
    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(30)
    # Print topic id=20
    lda.print_topic(20)
    # Save/load the model
    lda.save(r'D:\nlp\corpora\news.model')

from gensim.models import LdaModel


class LDAModelGensim:
    '''
    Creates an LDA model using Gensim's LdaModel class.
    '''

    def __init__(self, sentences=None, num_topics=2):
        self.converter = CorpusConverter()
        self.corpus, self.id2word = self.converter.convert(sentences)
        self.num_topics = num_topics
        self.lda_model = LdaModel(self.corpus, self.num_topics, self.id2word)

    def get_model_topics(self):
        return self.lda_model.print_topics(-1)

import re

import jieba
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def get_topic_words(sent, stop_words, cnt=15):
    sent = re.sub(r'[\r\n]', '', sent)
    wlst = jieba.lcut(sent)
    ls = []
    for w in wlst:
        if w not in stop_words:
            ls.append(w)
    di = Dictionary([ls])
    corpus = [di.doc2bow(text) for text in [ls]]
    lda = LdaModel(corpus, id2word=di, num_topics=1)
    tp = lda.print_topics(num_words=cnt)[0][1]
    return re.findall('"(.+?)"', tp)

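# Hypothetical usage of get_topic_words: jieba segments the raw Chinese
# sentence, and the tiny stop-word set here is illustrative only.
sample_stop_words = {'的', '了', '是', '一个'}
sample_sent = '自然语言处理是人工智能的一个重要方向'
print(get_topic_words(sample_sent, sample_stop_words, cnt=5))
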
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def my_lda_learn(topic):
    text = []
    with open(filtered_data_file, 'r', encoding="UTF-8") as f:
        for line in f.readlines():
            text.append(line.split())
    dictionary = Dictionary(text)
    text2bow = [dictionary.doc2bow(one_text) for one_text in text]
    my_lda = LdaModel(text2bow, id2word=dictionary, num_topics=topic, passes=20)
    print(my_lda.print_topics(num_topics=topic, num_words=10))

def explore(parameters, run):
    # fname, runs, dictionary, no_below, reviews, name, and eval_every are
    # module-level globals defined elsewhere in the source script
    print(parameters)
    no_above = parameters["no_above"]
    chunksize = parameters["chunksize"]
    passes = parameters["passes"]
    iterations = parameters["iterations"]
    size = parameters["size"]
    num_topics = parameters["num_topics"]
    with open(fname, 'a', newline='', encoding='utf-8') as csv_file:
        run += 1
        print("Run " + str(run) + " out of " + str(runs))
        writer = csv.writer(csv_file)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                                   keep_tokens=None)
        corpus = [dictionary.doc2bow(review) for review in reviews]
        corpora.MmCorpus.serialize(name + '.mm', corpus)
        mm = corpora.MmCorpus(name + '.mm')  # `mm` document stream now has random access
        mm_used = mm[:size]
        writer.writerows([["Data size", "Topics", "no_above", "Chunksize",
                           "Passes", "Iteration"],
                          [size, num_topics, no_above, chunksize, passes,
                           iterations], []])
        lda = LdaModel(mm_used, num_topics=num_topics, chunksize=chunksize,
                       id2word=dictionary, passes=passes,
                       iterations=iterations, eval_every=eval_every)
        lst = []
        for topic in lda.print_topics(-1, 10):
            terms = [x[0] for x in lda.get_topic_terms(topic[0], topn=10)]
            term_strings = [str(dictionary[term]) for term in terms]
            str_topic = ["Topic " + str(topic[0] + 1)]
            str_topic.extend(term_strings)
            lst.append(str_topic)
        writer.writerows(zip(*lst))
        writer.writerow([])
    return run

import joblib
from gensim import matutils
from gensim.models import LdaModel
from sklearn.feature_extraction.text import CountVectorizer


def perform_lda_iterations(num_topics, num_passes):
    """Performs LDA for topic counts up to num_topics with num_passes passes."""
    tw = joblib.load('../data/clean/tweets-series.pkl')
    rm = joblib.load('../data/clean/remarks-series.pkl')
    rm = rm.apply(__unlist)
    tcv = CountVectorizer(stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")
    tdm = tcv.fit_transform(tw).transpose()
    rcv = CountVectorizer(stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")
    rdm = rcv.fit_transform(rm).transpose()
    tc = matutils.Sparse2Corpus(tdm)
    rc = matutils.Sparse2Corpus(rdm)
    tid2word = dict((v, k) for k, v in tcv.vocabulary_.items())
    rid2word = dict((v, k) for k, v in rcv.vocabulary_.items())
    # The original hard-coded range(2, 20) and passes=20, ignoring both
    # arguments; honor them instead, as the docstring promises
    for i in range(2, num_topics):
        tlda = LdaModel(corpus=tc, num_topics=i, minimum_probability=0.03,
                        id2word=tid2word, passes=num_passes)
        print('Modeled topics at ', i)
        print(tlda.print_topics())
    for i in range(2, num_topics):
        rlda = LdaModel(corpus=rc, num_topics=i, minimum_probability=0.03,
                        id2word=rid2word, passes=num_passes)
        print('Modeled topics at ', i)
        print(rlda.print_topics())

def make_lda_model(self, num_topics=11):
    '''
    Build an optimized LDA model.
    Prints a coherence score for sanity checking (EDA has revealed the
    target coherence to be ~0.39).
    '''
    print(' - Building LDA model with {} topics'.format(num_topics))
    dictionary = corpora.Dictionary(self.token_list)
    corpus = [dictionary.doc2bow(text) for text in self.token_list]

    # Set up the mallet path
    # os.environ.update({'MALLET_HOME': r'anaconda3/lib/python3.7/site-packages/mallet-2.0.8/'})
    # mallet_path = '/anaconda3/lib/python3.7/site-packages/mallet-2.0.8/bin/mallet'  # update this path
    # Make Model:
    # ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=20)

    # Get Coherence Score:
    coherence_score = CoherenceModel(model=ldamodel, texts=self.token_list,
                                     dictionary=dictionary,
                                     coherence='c_v').get_coherence()
    # model_topics = optimal_model.show_topics(formatted=False)
    # Print topics
    pp.pprint(ldamodel.print_topics(num_words=6))
    print(" - Num Topics: {}. Coherence Value of: {:2.3f}".format(
        num_topics, coherence_score))
    self.all_topics = ldamodel.print_topics(num_words=6)
    self.ldamodel = ldamodel
    self.corpus = corpus
    self.dictionary = dictionary
    self.coherence_score = coherence_score

from gensim.models import LdaModel


def lda_model(dictionary, corpus, corpus_tfidf, cluster_keyword_lda):
    # Use the LDA model to obtain the topic distribution
    lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20)
    f_keyword = open(cluster_keyword_lda, 'w+', encoding='utf-8')
    for topic in lda.print_topics(20, 20):
        print('****' * 5)
        words = []
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ', '')
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    # Represent each document with the LDA model; compared with the TF-IDF
    # document vectors this is a dimensionality reduction, down to the
    # configured number of topics
    corpus_lda = lda[corpus_tfidf]
    for doc in corpus_lda:
        print(len(doc), doc)
    return lda

from gensim import corpora
from gensim.models import LdaModel


def lda_output(text, train):
    line_list = text.split(" ")
    train.append([w for w in line_list if w not in stopwords])
    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(train_text) for train_text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_num)
    lda.save('news_lda.model')
    # Print the word distribution of each of the topic_num topics
    topics = lda.print_topics(topic_num)
    for i in range(topic_num):
        print(str(i) + ":" + topics[i][1])
    print("===============================")

import gensim
from gensim.models import LdaModel, TfidfModel


def lda_model(cut_df, num_topics=10, top_words=5, show=True):
    te = cut_df['cut_doc'].values
    dictionary = gensim.corpora.Dictionary(te)
    corpus = [dictionary.doc2bow(text) for text in te]
    # corpus -> tfidf
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # tfidf -> lda
    lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_lda = lda[corpus]
    if show:
        topics = lda.print_topics(num_topics, top_words)
        for toc in topics:
            print(toc)
    return lda

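# Hypothetical call to the lda_model() variant above: `cut_df` is assumed to
# be a DataFrame whose 'cut_doc' column holds pre-tokenized documents.
import pandas as pd

cut_df = pd.DataFrame({'cut_doc': [['economy', 'market', 'stock'],
                                   ['game', 'team', 'score'],
                                   ['market', 'trade', 'stock']]})
model = lda_model(cut_df, num_topics=2, top_words=3)
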
from gensim.models import LdaModel


def lda_model(sentence_dict, dictionary, corpus, corpus_tfidf,
              cluster_keyword_lda, target_lt, num_cluster=11):
    '''Use the LDA model to obtain the topic distribution.'''
    lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_cluster)
    f_keyword = open(cluster_keyword_lda, 'w+')
    for topic in lda.print_topics(num_cluster, 53):
        # print('***************************')
        words = []
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ', '')
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    # Represent each document with the LDA model; compared with the TF-IDF
    # vectors this reduces the dimensionality to the configured number of topics
    corpus_lda = lda[corpus_tfidf]
    write_results("./results_lda.txt", corpus_lda, target_lt)
    return lda

import argparse

from tqdm import tqdm
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', default='./data/test_arxiv_plain.txt',
                        help='Path to directory where the data is stored')
    parser.add_argument('--model-dir', default='../model',
                        help='Path to directory where the model is stored')
    parser.add_argument('--train', default=True,
                        help='True for train, False for test mode')
    parser.add_argument('--n_topic', default=20, type=int,
                        help='Number of topics')
    args = parser.parse_args()
    model_dir = './model/model'
    dict_dir = './model/dict.txt'
    if args.train == True:
        print('Reading texts')
        with open(args.data_dir) as f_in:
            texts = f_in.read().split('\n')
        del texts[-1]
        for i in tqdm(range(len(texts))):
            texts[i] = texts[i].split()
        print('Generating corpora')
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary.save_as_text(dict_dir)
        print('Loading model')
        lda = LdaModel(corpus, num_topics=args.n_topic)
        lda.save(model_dir)
    else:
        lda = LdaModel.load(model_dir, mmap='r')
        # load_from_text is a static method that returns the dictionary
        dictionary = Dictionary.load_from_text(dict_dir)
    print('Processing results')
    topics = lda.print_topics()
    with open('./report.txt', 'w') as f_out:
        for topic_id, topic_pair in topics:
            print(topic_id, end=': ', file=f_out)
            topic_words = topic_pair.split('"')[1::2]
            topic_words = list(map(int, topic_words))
            topic_words = [dictionary.get(word) for word in topic_words]
            print(topic_words, file=f_out)

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from gensim import corpora
from gensim.models import LdaModel


def perform_lda(doc_set, num_topics=64):
    print('lda started')
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    # p_stemmer = PorterStemmer()
    # tokens = [p_stemmer.stem(i) for i in tokens]
    # Tokenize each document separately and drop English stop words; the
    # original joined all documents into one token stream and then called
    # doc2bow on single words, which trains LDA on one-word "documents"
    texts = []
    for doc in doc_set:
        tokens = tokenizer.tokenize(doc.lower())
        texts.append([t for t in tokens if t not in en_stop])
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # num_topics was hard-coded to 64; honor the argument instead
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    for line in ldamodel.print_topics(num_topics=num_topics, num_words=10):
        print('\t', line)

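# Hypothetical smoke test for perform_lda, assuming nltk and the `stop_words`
# package are installed.
docs = ['The quick brown fox jumps over the lazy dog.',
        'Stock markets fell as investors weighed new data.',
        'The fox and the dog are classic example animals.']
perform_lda(docs, num_topics=2)
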
# Stemming process
print(count)
# print(List)
# counts = Counter(List)
# print(counts)
print(documentInfo)
train_set = documentInfo

# Construct the training corpus
dictionary = Dictionary(train_set)
corpus = [dictionary.doc2bow(text) for text in train_set]
print(corpus)
print(dictionary)

# Train the LDA model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=30)
print(lda)
print(lda.print_topics(5))

# def lda_test(train_set):
#     # Construct the training corpus
#     dictionary = Dictionary(train_set)
#     corpus = [dictionary.doc2bow(text) for text in train_set]
#     print(corpus)
#     print(dictionary)
#     # Train the LDA model
#     lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50)
#     print(lda)
#     return lda.print_topics(50)

print('Saving dictionary (%s)...' % DICT)
dictionary.save(DICT)

print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

# Hold out the last fifth of the corpus for perplexity evaluation
size = len(bow_corpus) * 4 // 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics,
               passes=5, iterations=1000)

print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
print('\n'.join(str(topic) for topic in lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
# log_perplexity returns a per-word likelihood bound in base 2,
# so perplexity is 2 raised to its negation
perplexity = 2 ** -(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)

# logging.info('combine report and wiki dictionary...')
# wiki_to_report = report_dict.merge_with(wiki_dict)
# merged_dict = report_dict
# logging.info('combine report and wiki corpus...')
# merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus

logging.info('generate wiki corpus...')
wiki_txt = unpickle('data/txt/processed_wiki.pkl')
wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

logging.info('combine report and wiki corpus...')
merged_corpus = wiki_corpus + report_corpus

# compute TF-IDF
# logging.info('compute TFIDF...')
# tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

# perform LDA
logging.info('perform LDA...')
if use_wiki is True:
    lda = LdaModel(corpus=merged_corpus, id2word=report_dict,
                   num_topics=num_topics, passes=passes,
                   iterations=iterations, alpha='auto', chunksize=chunksize)
    lda.save('result/model_wiki.lda')
    # `topics=`/`topn=` are pre-1.0 gensim keyword names
    lda.print_topics(num_topics=num_topics, num_words=10)
else:
    lda = LdaModel(corpus=report_corpus, id2word=report_dict,
                   num_topics=num_topics, passes=passes,
                   iterations=iterations, alpha='auto', chunksize=chunksize)
    lda.save('result/model.lda')
    lda.print_topics(num_topics=num_topics, num_words=10)