def train(self, arg_fname, is_pre=True, method='lsi', **params):
    """Train a semantic model (TF-IDF plus LSI/LDA/log-entropy) over a corpus.

    Parameters
    ----------
    arg_fname : str
        Corpus/config name; stored on self and consumed by _generate_conf().
    is_pre : bool
        If True, preprocess raw documents via self._preprocess(); otherwise
        reload docs, dictionary and corpus from the files named in self.conf.
    method : str
        One of 'lsi', 'lda', 'lda_tfidf', 'logentropy'.
    **params
        Extra keyword arguments forwarded to the underlying gensim model.

    Raises
    ------
    NotImplementedError
        If *method* is not one of the supported values.
    """
    self.fname = arg_fname
    self.method = method
    self._generate_conf()
    if is_pre:
        self.docs, self.dictionary, corpus = self._preprocess()
    else:
        # BUG FIX: pickled files must be opened in binary mode, and the
        # handle should be closed deterministically.
        with open(self.conf['fname_docs'], 'rb') as f:
            self.docs = pickle.load(f)
        self.dictionary = corpora.Dictionary.load(self.conf['fname_dict'])
        corpus = corpora.MmCorpus(self.conf['fname_corpus'])

    # NOTE: **params always binds a dict, never None, so the original
    # `if params is None: params = {}` guard was dead code and is removed.
    logger.info("training TF-IDF model")
    self.tfidf = models.TfidfModel(corpus, id2word=self.dictionary)
    corpus_tfidf = self.tfidf[corpus]

    if method == 'lsi':
        logger.info("training LSI model")
        self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary,
                                   **params)
        self.lsi.print_topics(-1)
        self.lsi_similarity_index = similarities.MatrixSimilarity(
            self.lsi[corpus_tfidf])
        self.para = self.lsi[corpus_tfidf]
    elif method == 'lda_tfidf':
        logger.info("training LDA model")
        # try 6 workers here instead of original 8
        self.lda_tfidf = models.LdaMulticore(corpus_tfidf,
                                             id2word=self.dictionary,
                                             workers=6, **params)
        self.lda_tfidf.print_topics(-1)
        # BUG FIX: this branch previously indexed self.lda, which is never
        # assigned here (AttributeError at runtime); it must use the model
        # trained just above.
        self.lda_tfidf_similarity_index = similarities.MatrixSimilarity(
            self.lda_tfidf[corpus_tfidf])
        self.para = self.lda_tfidf[corpus_tfidf]
    elif method == 'lda':
        logger.info("training LDA model")
        # try 6 workers here instead of original 8
        self.lda = models.LdaMulticore(corpus, id2word=self.dictionary,
                                       workers=6, **params)
        self.lda.print_topics(-1)
        self.lda_similarity_index = similarities.MatrixSimilarity(
            self.lda[corpus])
        self.para = self.lda[corpus]
    elif method == 'logentropy':
        logger.info("training a log-entropy model")
        self.logent = models.LogEntropyModel(corpus, id2word=self.dictionary)
        self.logent_similarity_index = similarities.MatrixSimilarity(
            self.logent[corpus])
        self.para = self.logent[corpus]
    else:
        msg = "unknown semantic method %s" % method
        logger.error(msg)
        raise NotImplementedError(msg)
def get_LDA_model_multi_cores(paths, corpus, id2word, num_topics, passes, a=None, b=None):
    """Train a multicore LDA model, optionally with explicit priors.

    When both *a* (alpha, document-topic prior) and *b* (eta, topic-word
    prior) are None, gensim's defaults are used. The trained model is
    persisted via save_lda_model() before being returned.
    """
    shared_kwargs = dict(corpus=corpus,
                         id2word=id2word,
                         passes=passes,
                         num_topics=num_topics,
                         workers=4,
                         chunksize=100,
                         per_word_topics=True,
                         minimum_probability=0.0)
    if a is None and b is None:
        lda_model = models.LdaMulticore(**shared_kwargs)
    else:
        lda_model = models.LdaMulticore(alpha=a, eta=b, **shared_kwargs)
    save_lda_model(paths, lda_model, num_topics=num_topics, passes=passes,
                   alpha=a, beta=b)
    return lda_model
def createModel(self, corpus, dictionary, info): logging.basicConfig(format='%(asctime)s: %(levelname)s : %(message)s', level=logging.INFO) path = 'TopicModel/' + info.data + '_' + info.identifier if not type(corpus) == list: corpus = matutils.Sparse2Corpus(corpus, documents_columns=False) if not os.path.exists(path): if self.name == 'LDA': if info.multicore: self.model = models.LdaMulticore( corpus, num_topics=info.numberTopics, id2word=dictionary, passes=info.passes, iterations=info.iterations, batch=0) else: self.model = models.LdaModel(corpus, num_topics=info.numberTopics, id2word=dictionary, passes=info.passes, iterations=info.iterations, update_every=info.online, chunksize=info.chunksize) elif self.name == 'LSI': self.model = models.LsiModel(corpus, info.numberTopics, dictionary) self.info = str(self.model) else: print 'Unkown Model type' print 'save Model' self.model.save(path) else: print 'Load Model' self.model = models.LdaModel.load(path)
def infer(self):
    """Rank courses by TF-IDF similarity to this file's classes; print LDA topics.

    Returns
    -------
    list of (similarity, course_name) tuples sorted by similarity, highest
    first.
    """
    # Tokenize courses and strip stop words (dedupe tokens per course).
    courses = [
        list(set(stop_words(item).remove()))
        for item in [w.split() for w in self.Courses]
    ]
    classes = list(set(stop_words(self.File_class).remove()))
    dictionary = corpora.Dictionary(courses)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in courses]
    tfidf = models.TfidfModel(corpus)
    kw_vector = dictionary.doc2bow(classes)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=feature_cnt)
    sim = index[tfidf[kw_vector]]
    # BUG FIX: the original built dict(zip(sim, names)), which silently
    # dropped every course whose similarity score collided with another's.
    # Sorting the (score, name) pairs directly keeps all courses.
    course_sort = sorted(zip(sim, self.Names), key=lambda pair: pair[0],
                         reverse=True)
    lda_model = models.LdaMulticore(tfidf[corpus],
                                    num_topics=10,
                                    id2word=dictionary,
                                    passes=2,
                                    workers=2)
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
    for topic_idx, score in sorted(lda_model[tfidf[kw_vector]],
                                   key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(
            score, lda_model.print_topic(topic_idx, 10)))
    return course_sort
def genlda(textlist, n):
    """Train an n-topic LDA model on *textlist* and label each document.

    Parameters
    ----------
    textlist : list of token lists, one per document.
    n : int, number of topics.

    Returns
    -------
    List of tuples (doc_index, dominant_topic_id, dominant_topic_prob,
    all_probs, argmax_index), one per document.
    """
    dictionary = corpora.Dictionary(textlist)
    corpus = [dictionary.doc2bow(text) for text in textlist]
    lda = models.LdaMulticore(corpus=corpus,
                              id2word=dictionary,
                              num_topics=n,
                              passes=100,
                              workers=3)
    # CLEANUP: removed unused locals from the original (`nn`, `doc_topic`,
    # `topics_r`) and the duplicate argmax computation.
    lda_labels = []
    for doc_idx, doc_topics in enumerate(lda.get_document_topics(corpus)):
        probs = [prob for _, prob in doc_topics]
        best = probs.index(max(probs))
        lda_labels.append(
            (doc_idx, doc_topics[best][0], doc_topics[best][1], probs, best))
    return lda_labels
def compute_coherence_values(dictionary, corpus, limit, start=2, step=3):
    """
    Compute u_mass coherence for various numbers of topics.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Max num of topics
    start, step : range parameters for the topic counts to try

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with
    respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # BUG FIX: the original trained on an undefined global
        # `gensim_corpus`; use the `corpus` parameter instead.
        model = models.LdaMulticore(corpus,
                                    id2word=dictionary,
                                    num_topics=num_topics,
                                    workers=2)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        corpus=corpus,
                                        dictionary=dictionary,
                                        coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
def lda(name):
    """Load the serialized dictionary/corpus pair *name*, train LDA, print topics.

    Raises
    ------
    FileNotFoundError
        When the dictionary file is missing. (BUG FIX: the original printed
        an error and then fell through to use undefined names, crashing with
        NameError instead.)
    """
    dict_path = './corpora_dicts/{}.dict'.format(name)
    if not os.path.exists(dict_path):
        print("Error!!!!")
        raise FileNotFoundError(dict_path)
    dictionary = corpora.Dictionary.load(dict_path)
    corpus = corpora.MmCorpus('./corpus/{}.mm'.format(name))
    print("Loaded!!!")
    print(corpus)
    lda = models.LdaMulticore(corpus,
                              id2word=dictionary,
                              num_topics=15,
                              passes=2,
                              workers=2)
    """
    To save the model
    temp_file = datapath("model")
    lda.save(temp_file)
    To load a saved model
    lda = LdaModel.load(temp_file)
    """
    for idx, topics in lda.print_topics(-1):
        print("Topic: {} ------------>".format(idx))
        print(topics)
def train_lda(self, num_topics, chunksize=1000, passes=4, workers=None):
    """Train a multicore LDA model on self.corpus_tfidf and store it on self.

    :param num_topics: number of topics to fit
    :param chunksize: documents per training chunk
    :param passes: full passes over the corpus
    :param workers: worker processes; None lets gensim use all but one core.
        (BUG FIX: the original passed an undefined name `workers`, which
        raised NameError unless a module global happened to exist.)
    """
    self.model = models.LdaMulticore(corpus=self.corpus_tfidf,
                                     num_topics=num_topics,
                                     id2word=self.dictionary,
                                     workers=workers,
                                     chunksize=chunksize,
                                     passes=passes)
def main(text_dir):
    """Sweep LDA topic counts over the simple-wiki corpus, score each model
    with several coherence measures, and save a CSV plus a comparison plot.

    NOTE(review): Python 2 code — `range(...) + range(...)` concatenates
    lists and fails on Python 3. The *text_dir* argument is currently unused
    (the DocCorpus path is commented out).
    """
    # Candidate topic counts: 10..100 by 10, 120..200 by 20, 250..450 by 50.
    topics = range(10, 101, 10) + range(120, 201, 20) + range(250, 451, 50)
    #topics = range(10, 21, 10)
    #corpus = DocCorpus(text_dir)
    #dictionary = corpus.dictionary
    # Pre-built corpus and dictionary loaded from a sibling project directory.
    corpus = MmCorpus('../twitter_LDA_topic_modeling/simple-wiki.mm')
    dictionary = Dictionary.load(
        '../twitter_LDA_topic_modeling/simple-wiki.dict')
    print('Building LDA models')
    # One model per candidate topic count (this is the expensive step).
    lda_models = [
        models.LdaMulticore(corpus=corpus,
                            id2word=dictionary,
                            num_topics=i,
                            passes=5) for i in tqdm(topics)
    ]
    print('Generating coherence models')
    # Rebuild raw token lists from the bag-of-words corpus; needed by the
    # sliding-window coherence measures (c_v, c_uci, c_npmi).
    texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
    # Score models in parallel, leaving one core free.
    pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
    func = partial(build_coherence_models,
                   corpus=corpus,
                   dictionary=dictionary,
                   texts=texts)
    coherence_models = pool.map(func, lda_models)
    pool.close()
    # print('Extracting data from models')
    # model_data = [extract_data(model, corpus, dictionary) for model in tqdm(lda_models)]
    # d = defaultdict(list)
    # print('Generating output data')
    # for i, data in tqdm(enumerate(model_data)):
    # d['num_topics'].append(data['num_topics'])
    # d['cao_juan_2009'].append(cao_juan_2009(data['topic_term_dists'], data['num_topics']))
    # d['arun_2010'].append(arun_2010(data['topic_term_dists'], data['doc_topic_dists'], data['doc_lengths'], data['num_topics']))
    # d['deveaud_2014'].append(deveaud_2014(data['topic_term_dists'], data['num_topics']))
    # d['u_mass_coherence'].append(data['u_mass_coherence'])
    d = defaultdict(list)
    print('Generating output data')
    # Collect one row per model: topic count plus four coherence scores.
    for data in tqdm(coherence_models):
        d['num_topics'].append(data['num_topics'])
        d['u_mass'].append(data['u_mass'])
        d['c_v'].append(data['c_v'])
        d['c_uci'].append(data['c_uci'])
        d['c_npmi'].append(data['c_npmi'])
    df = pd.DataFrame(d)
    df = df.set_index('num_topics')
    df.to_csv('coherence_simple_wiki', sep='\t')
    # First plot draws all four series at once; the chained per-measure plots
    # below redraw each series onto a shared axes with grids.
    df.plot(xticks=df.index, style=['bs-', 'yo-', 'r^-', 'gx-'])
    ax1 = df.plot(xticks=df.index, style='bs-', grid=True, y='u_mass')
    ax2 = df.plot(xticks=df.index, style='yo-', grid=True, y='c_v', ax=ax1)
    ax3 = df.plot(xticks=df.index, style='r^-', grid=True, y='c_npmi', ax=ax2)
    df.plot(xticks=df.index, style='gx-', grid=True, y='c_uci', ax=ax3)
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.17),
               fancybox=True,
               shadow=True,
               ncol=4,
               fontsize=9)
    plt.subplots_adjust(bottom=0.2)
    plt.xticks(df.index, rotation=45, ha='right', fontsize=8)
    plt.savefig('coherence_simple_wiki')
    plt.close()
def fit(self, list_toks):
    """Build the LDA dictionary, corpus and model from *list_toks*, persist
    them, then vectorize every document into an Annoy index."""
    utils.verbose('start training lda dictionary')
    self.dict = corpora.Dictionary(list_toks)
    utils.verbose('start building lda corpus')
    self.corpus = [self.dict.doc2bow(doc) for doc in list_toks]
    utils.verbose('start training lda model')
    self.model = models.LdaMulticore(self.corpus, self.vec_dim,
                                     id2word=self.dict)
    utils.verbose('start saving lda dictionary and model')
    self.model.save(self.paths['model'])
    self.dict.save(self.paths['dict'])
    utils.verbose('start vectorization for lda')
    self.ann = AnnoyIndex(self.vec_dim)
    for idx, doc in enumerate(list_toks):
        # Progress log every 10k documents (skip index 0).
        if idx and idx % 10000 == 0:
            utils.verbose('vectorizing {} lines for lda'.format(idx))
        self.ann.add_item(idx, self.get(doc))
    utils.verbose('start building lda ann')
    self.ann.build(self.num_trees)
    self.ann.save(self.paths['ann'])
    utils.verbose('dump lda annoy into {}'.format(self.paths['ann']))
def lda_tfidf(num_topics, tfidf, text, dictionary, random_state, cluster_ID, data_path):
    """Train one LDA model per candidate topic count, score each with c_v
    coherence, save a coherence-vs-topics plot, and return all models along
    with the index of the best one."""
    coherence_scores = []
    trained_models = []
    topic_counts = []
    for k in num_topics:
        model = models.LdaMulticore(tfidf,
                                    num_topics=k,
                                    id2word=dictionary,
                                    passes=2,
                                    workers=2,
                                    random_state=random_state)
        coherence = CoherenceModel(model=model,
                                   texts=text,
                                   dictionary=dictionary,
                                   coherence='c_v').get_coherence()
        topic_counts.append(k)
        trained_models.append(model)
        coherence_scores.append(coherence)
    # Persist the coherence curve for this cluster.
    plt.figure(figsize=(20, 10))
    plt.plot(num_topics, coherence_scores, marker='o', markersize=10)
    plt.savefig(f"{data_path}{cluster_ID}.png")
    plt.close()
    # Index of the highest-coherence model.
    best_index = coherence_scores.index(max(coherence_scores))
    return trained_models, topic_counts, best_index
def main():
    """Predict missing lines by summing TF-IDF and LDA similarity votes and
    writing the best-matching document id for each missing line."""
    nltk.download(['punkt', 'stopwords'])
    data = Data('test')
    corpus = data.get_corpus()
    tfidf = models.TfidfModel(corpus)
    print('Building TF-IDF index...')
    t_index = similarities.MatrixSimilarity(tfidf[corpus], num_best=10)
    print('Builing LDA index...')
    lda = models.LdaMulticore(corpus, id2word=data.dictionary, num_topics=40)
    l_index = similarities.MatrixSimilarity(lda[corpus], num_best=10)
    print('Idexies built')
    out = 'test_missing_with_predictions.txt'
    print('Saving output to {!r}'.format(out))
    with open(out, 'w') as f:
        for miss in data.missing:
            scores = defaultdict(float)
            bow = data.dictionary.doc2bow(data.clean(miss))
            # Accumulate similarity mass from both indexes.
            for doc_i, prob in t_index[tfidf[bow]]:
                scores[doc_i] += prob
            for doc_i, prob in l_index[lda[bow]]:
                scores[doc_i] += prob
            ranked = sorted(scores, key=scores.get, reverse=True)
            doc_id = data.dmap[ranked[0]]
            f.write('{} +++$+++ {}\n'.format(doc_id, ' '.join(miss)))
def lda(corpus_of_text):
    """
    Compare documents by Latent Dirichlet Allocation (LDA).

    :param corpus_of_text: list of documents, where each document is a
        sublist of tokenized strings.
    :return: set of words that are most associated with each topic.
    """
    # Vocabulary and bag-of-words representation for the LDA model.
    vocab = corpora.Dictionary(corpus_of_text)
    bows = [vocab.doc2bow(doc) for doc in corpus_of_text]
    # Train with fixed hyperparameters and a fixed seed for reproducibility.
    model = models.LdaMulticore(corpus=bows,
                                id2word=vocab,
                                random_state=100,
                                num_topics=4,
                                passes=10,
                                chunksize=1000,
                                batch=False,
                                alpha='asymmetric',
                                decay=0.5,
                                offset=64,
                                eta=None,
                                eval_every=0,
                                iterations=100,
                                gamma_threshold=0.001,
                                per_word_topics=True)
    # model.save('lda_model.model')
    return model.print_topics(-1)
def __init__(self, corpus, dictionary):
    """Train a 50-topic multicore LDA model and keep the transformed corpus."""
    self.similar_index = 0
    model = models.LdaMulticore(corpus, id2word=dictionary, workers=8,
                                num_topics=50)
    self.lda = model
    # Lazily-transformed corpus in LDA topic space.
    self.corpus = model[corpus]
def lda_model(corpus, dictionary, number_of_topics=20, save_path='saved_models/lda_bow'):
    """Return a bag-of-words LDA model, loading it from *save_path* when a
    saved copy exists and training (then saving) one otherwise."""
    if isfile(save_path):
        return models.LdaMulticore.load(save_path)
    model = models.LdaMulticore(corpus,
                                num_topics=number_of_topics,
                                id2word=dictionary,
                                passes=2,
                                workers=2)
    model.save(save_path)
    return model
def generate_lda_sub_topic(topicid):
    """Train the second-level (sub-topic) LDA model for one top-level topic
    and save it to disk. Python 2 code (print statements); relies on module
    globals topic_dict_path, num_sub_topic and lda_model_topic."""
    # Load the dictionary built for this topic.
    dictionary = corpora.Dictionary.load(topic_dict_path % topicid)
    print "载入 topic %d 词典完成" % topicid
    # Load the topic's texts and convert them to bag-of-words vectors.
    texts = get_topic_texts(topicid)
    begin = datetime.datetime.now()
    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    # corpora.MmCorpus.serialize('./nanfang.mm', corpus)
    # Single-core variant (kept for reference):
    # LDA = models.LdaModel(corpus, id2word=dictionary, num_topics=200, update_every=1, minimum_probability=0.1, passes=5)
    # Multicore full signature (kept for reference):
    # models.ldamulticore.LdaMulticore(corpus, num_topics=200, id2word=dictionary, workers=None, chunksize=2000, passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
    print "开始训练 topic %d 的子 topic,共 %d 个" % (topicid, num_sub_topic)
    # Train num_sub_topic sub-topics with 4 worker processes.
    LDA = models.LdaMulticore(corpus,
                              num_topics=num_sub_topic,
                              id2word=dictionary,
                              workers=4,
                              chunksize=2000,
                              passes=1)
    end = datetime.datetime.now()
    print "训练用时", end - begin
    # Persist the sub-topic model under the per-topic model path.
    path = "%s%d" % (lda_model_topic, topicid)
    LDA.save(path)
    print "topic %d 模型已保存到 %s 中\n" % (topicid, path)
def fitTopics(self,topic_ct,passes): start = datetime.datetime.now() self.topic_ct = topic_ct self.passes = passes self.verboseMsg('worp===>%d topics, %d passes: start ' %(topic_ct,passes)) self.lda = models.LdaMulticore( self.corpus, num_topics = self.topic_ct, passes = passes, id2word = self.vocab, workers = 4, iterations = 2500, eval_every = 100, chunksize = 2000 ) self.verboseMsg('worp===>%d topics, %d passes: lda model complete ' %(topic_ct,passes)) self.topic_vectors = self.lda.print_topics(num_topics=self.topic_ct, num_words=8) self.topic_proba = [] for x in self.corpus: local = self.lda.get_document_topics(x) row = { x:float(0) for x in range(self.topic_ct)} for y in local: row[y[0]] = y[1] self.topic_proba.append(row) self.verboseMsg('worp===>%d topics, %d passes: creating probabilities in dataframe ' %(topic_ct,passes)) self.topic_proba_df = pd.DataFrame(self.topic_proba) self.verboseMsg('worp===>%d topics, %d passes: complete ' %(topic_ct,passes)) print datetime.datetime.now() - start
def fit_lda(num_topics, corpus, id2word, passes, multicore=0, save=True):
    """
    Fit a gensim LDA model on the corpus, allowing easy switching between
    single- and multi-core implementations.

    :param num_topics: number of topics to model
    :param corpus: gensim's Sparse2Corpus object
    :param id2word: id-to-token mapping
    :param passes: training passes over the corpus
    :param multicore: worker count; 0/falsy selects the single-core LdaModel
    :param save: when True, persist the model under ../outputs/
    :return: the fitted model
    """
    if not multicore:
        lda_fit = models.LdaModel(corpus=corpus,
                                  num_topics=num_topics,
                                  id2word=id2word,
                                  passes=passes)
    else:
        lda_fit = models.LdaMulticore(corpus=corpus,
                                      num_topics=num_topics,
                                      id2word=id2word,
                                      workers=multicore,
                                      passes=passes)
    if save:
        lda_fit.save(f'../outputs/lda_{num_topics}_topics.mdl')
    return lda_fit
def create_documents_view(self, corpus, ir_mode):
    """Build the retrieval model selected by *ir_mode* over the corpus.

    Modes: 1=TF (1+log2 weighting), 2=TF-IDF, 3=LDA, 4=LDA multicore,
    5=LSI, 6=Random Projections, 7=Log-Entropy.
    Returns (model, dictionary).
    """
    dictionary, pdocs = self.create_dictionary(corpus)
    bow = self.docs2bows(corpus, dictionary, pdocs)
    # Recover the serialized corpus written by docs2bows.
    loaded_corpus = corpora.MmCorpus('vsm_docs.mm')
    if ir_mode == 1:
        # Plain TF weighting applied directly to the in-memory bows.
        model = [[(term_id, 1 + np.log2(freq)) for term_id, freq in doc]
                 for doc in bow]
    elif ir_mode == 2:
        model = models.TfidfModel(loaded_corpus)
    elif ir_mode == 3:
        model = models.LdaModel(loaded_corpus)
    elif ir_mode == 4:
        model = models.LdaMulticore(loaded_corpus)
    elif ir_mode == 5:
        model = models.LsiModel(loaded_corpus)
    elif ir_mode == 6:
        model = models.RpModel(loaded_corpus)
    elif ir_mode == 7:
        model = models.LogEntropyModel(loaded_corpus)
    # tf = corpora.MmCorpus('vsm_docs.mm')  # Recover the corpus
    return model, dictionary
def train_predict(self):
    """Train an LDA model over the TF-IDF-weighted corpus and print topics."""
    bow_corpus, dictionary = self._feature_preparations()
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    model = models.LdaMulticore(corpus_tfidf,
                                num_topics=self.nb_topics,
                                id2word=dictionary,
                                passes=2,
                                workers=2)
    for topic_id, words in model.print_topics(-1):
        print('Topic: {} Word: {}'.format(topic_id, words))
def createTopics(words):
    """Build and persist dictionary/corpus/TF-IDF/LDA artifacts for *words*.

    Relies on module globals `i` and `notopics` for file naming and the
    topic count, and publishes the dictionary/model via the globals
    `global_dict` and `global_lda`.
    """
    dictionary = corpora.Dictionary(words)
    dictionary.save("dictionary_" + str(i) + "_" + str(notopics) +
                    "topics.dict")
    global global_dict
    global_dict = dictionary
    # step 2: convert to bag of words.
    # BUG FIX: use a list, not map() — under Python 3 map() returns a
    # one-shot iterator that MmCorpus.serialize would exhaust, leaving an
    # empty corpus for the TF-IDF/LDA steps below.
    corpus = [dictionary.doc2bow(doc) for doc in words]
    corpora.MmCorpus.serialize(
        "corpus_" + str(i) + "_" + str(notopics) + "topics.mm", corpus)
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaMulticore(corpus_tfidf,
                              id2word=dictionary,
                              num_topics=notopics,
                              workers=4)
    # The following line would get the topic probability distribution for a
    # document:
    # corpus_lda = lda[corpus_tfidf]
    global global_lda
    global_lda = lda
    # step 3: save topic models used later for inference about documents.
    lda.save("model_" + str(i) + "_" + str(notopics) + "topics.lda")
    # pickle.dump(corpus_lda, open("corpus_lda.pck","wb"))
    # BUG FIX: close the pickle file handle deterministically.
    with open("tfidf.pck", "wb") as fh:
        pickle.dump(tfidf, fh)
    print("done")
def generate_lda_topic():
    """Train the first-level LDA topic model over the full corpus and save it.
    Python 2 code (print statements); relies on module globals dict_path,
    num_topic and lda_model."""
    # Load the dictionary.
    dictionary = corpora.Dictionary.load(dict_path)
    print "载入词典完成"
    # Load the corpus texts and convert them to bag-of-words vectors.
    texts = get_texts()
    begin = datetime.datetime.now()
    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    # corpora.MmCorpus.serialize('./nanfang.mm', corpus)
    # Single-core variant (kept for reference):
    # LDA = models.LdaModel(corpus, id2word=dictionary, num_topics=200, update_every=1, minimum_probability=0.1, passes=5)
    # Multicore full signature (kept for reference):
    # models.ldamulticore.LdaMulticore(corpus, num_topics=200, id2word=dictionary, workers=None, chunksize=2000, passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
    print "开始训练第一层 LDA 模型,共 %d 个 topic" % num_topic
    LDA = models.LdaMulticore(corpus,
                              num_topics=num_topic,
                              id2word=dictionary,
                              workers=4,
                              chunksize=2000,
                              passes=5)
    level1 = datetime.datetime.now()
    print "第一层 LDA 模型训练完成,用时", level1 - begin
    # Disabled: split documents into per-topic buckets by their Top-1 topic.
    # topic_text = [[] for i in range(num_topic)]
    # print "开始分离各个 topic 的数据"
    # for i in range(len(texts)):
    #     if (i % 10000 == 0):
    #         print "正在处理第 %d 行" % i
    #     # Get the topics for each text.
    #     topics = LDA.get_document_topics(corpus[i])
    #     # Keep only the Top-1 topic.
    #     if (len(topics) < 1):
    #         continue
    #     # print len(topics), topics[0]
    #     topic_text[topics[0][0]].append(" ".join(texts[i]))
    # Disabled: write each topic's (pre-tokenized) user queries to a file.
    # for i in range(num_topic):
    #     print "写入 topic %d 的用户问句(已分词)" % i
    #     with codecs.open("%s%d" %(topic_query, i), "w+", "utf-8") as f:
    #         for line in topic_text[i]:
    #             if (len(line) > 1):
    #                 f.write(line+"\n")
    # Disabled: build per-topic dictionaries and second-level sub-topics.
    # for i in range(num_topic):
    #     generate_topic_dict(i)  # build the per-topic dictionary
    #     generate_sub_topic(i)   # build the sub-topics
    end = datetime.datetime.now()
    print "处理 LDA 用时", end - begin
    # Save the LDA model.
    LDA.save(lda_model)
    print "模型已保存到 %s 中" % lda_model
def train_topic(
    self,
    num_topics,
    no_below=1,
    no_above=0.9,
    keep_n=None,
    keep_tokens=None,
    remove_most_freq_n=None,
    bad_tokens=None,
    model="ldamulticore",
    bigrams=True,
    **kwargs,
):
    """
    Train a topic model over self.tokenlists.

    no_below (int|None) – Keep tokens which are contained in at least
        no_below documents.
    no_above (float|None): Keep tokens which are contained in no more than
        no_above documents (fraction of total corpus size, not an absolute
        number).
    keep_n (int|None) – Keep only the first keep_n most frequent tokens.
    keep_tokens (iterable of str) – Iterable of tokens that must stay in
        dictionary after filtering.
    remove_most_freq_n (int|None): Remove n most frequent tokens.
    bad_tokens (iterable of str|None): tokens to drop from the dictionary
        before building the bag-of-words corpus.
    model ('ldamulticore'|'lda'|'ldamallet')
    """
    if bigrams is True:
        # Merge frequent token pairs into single bigram tokens.
        phrases = models.Phrases(self.tokenlists, delimiter=b" ")
        phraser = models.phrases.Phraser(phrases)
        self.tokenlists = [phraser[tl] for tl in self.tokenlists]
    dictionary = corpora.Dictionary(self.tokenlists)
    if remove_most_freq_n:
        dictionary.filter_n_most_frequent(remove_most_freq_n)
    dictionary.filter_extremes(
        no_below=no_below, no_above=no_above, keep_n=keep_n,
        keep_tokens=keep_tokens
    )
    if bad_tokens:
        # BUG FIX (two defects in the original):
        #  1. it indexed dictionary.id2token with token strings — id2token
        #     maps id->token, so this raised KeyError; token2id is correct.
        #  2. it filtered AFTER doc2bow, so the bows still contained the bad
        #     tokens and their ids no longer matched the dictionary.
        dictionary.filter_tokens(
            bad_ids=[dictionary.token2id[tok] for tok in bad_tokens
                     if tok in dictionary.token2id]
        )
    bows = [dictionary.doc2bow(tl) for tl in self.tokenlists]
    self.bows = bows
    self.dictionary = dictionary
    if model == "ldamulticore":
        self.model = models.LdaMulticore(
            bows, num_topics=num_topics, id2word=dictionary, **kwargs
        )
    elif model == "lda":
        self.model = models.LdaModel(
            bows, num_topics=num_topics, id2word=dictionary, **kwargs
        )
    elif model == "ldamallet":
        raise ValueError("mallet is not yet implemented")
    else:
        # Previously an unknown model name silently left self.model unset.
        raise ValueError("unknown model type: %r" % model)
def calculate_lda(self, params):
    """Train an LDA model over this corpus object, save it, and return it.

    Uses LdaMulticore when params['cpu_cores'] > 1, else LdaModel.
    `topics` defaults to the module-level n_topics; `cpu_cores` defaults to
    n_cpu_cores.
    """
    topics = params.get('topics', n_topics)
    # CLEANUP: the original nested params.get('cpu_cores',
    # params.get('cpu_cores', n_cpu_cores)) is equivalent to a single get.
    cores = params.get('cpu_cores', n_cpu_cores)
    if params.get('cpu_cores', 1) > 1:
        lda = models.LdaMulticore(self,
                                  id2word=self.dictionary,
                                  num_topics=topics,
                                  workers=cores)
    else:
        lda = models.LdaModel(self,
                              id2word=self.dictionary,
                              num_topics=topics)
    lda.save(DOCUMENT_PATH + self.filename + '.lda')
    return lda
def make_lda(self):
    """Train and return a 10-topic multicore LDA model on the BoW corpus."""
    return models.LdaMulticore(self.bow_corpus,
                               num_topics=10,
                               id2word=self.dictionary,
                               passes=2,
                               workers=2)
def lda_model_tfidf(tfidf_corpus, dictionary, number_of_topics=20, save_path='saved_models/lda_tfidf'):
    """Return a TF-IDF-based LDA model, loading it from *save_path* when a
    saved copy exists and training (then saving) one otherwise."""
    if isfile(save_path):
        return models.LdaMulticore.load(save_path)
    model = models.LdaMulticore(tfidf_corpus,
                                num_topics=number_of_topics,
                                id2word=dictionary,
                                passes=2,
                                workers=4)
    model.save(save_path)
    return model
def fit_LDA_gensim(file_path, num_topics=10, passes=1, chunksize=2000): """ train and return LDA model Parameters: file_path : path to the text file containing tfidf-filtered tokenized chats one chat per line, tokens separated by whitespace num_topics : update_every: passes : chunksize : """ #====================================================================================# # Configure messages sent to the terminal if verbose == 'yes': level = logging.INFO else: level = PROGRESS_NUM logging.basicConfig(format='%(levelname)s : %(message)s', level=level) #====================================================================================# #====================================================================================# logging.log(PROGRESS_NUM, 'create a Gensim dictionary from the texts') dictionary = corpora.Dictionary(\ line.split() for line in codecs.open(file_path,'r','utf-8')) #====================================================================================# #====================================================================================# logging.log(PROGRESS_NUM, 'convert chats to a bag of words corpus') chats = text_stream(file_path) # creates corpus object without loading the whole document in RAM corpus = corpus_stream(file_path, dictionary) ## creates corpus object loading the whole document in RAM #corpus = [dictionary.doc2bow(text.split()) for text in chats] #====================================================================================# #====================================================================================# logging.log(PROGRESS_NUM, 'Training LDA') if multicore == 'yes': lda = models.LdaMulticore(corpus, id2word=dictionary,\ num_topics=num_topics, passes=passes,chunksize=chunksize) else: lda = models.LdaModel(corpus, id2word=dictionary, \ num_topics=num_topics, passes=passes,chunksize=chunksize) lda.show_topics() if verbose == 'yes': lda.print_topics(num_topics) 
#====================================================================================# #====================================================================================# # creates corpus object loading the whole document in RAM # needed to plot with pyLDAvis corpus = [dictionary.doc2bow(text.strip().split()) for text in chats] #====================================================================================# return lda, corpus, dictionary
def try_LDA(corpus, id2word, num_topics, start=31):
    """Train and pickle LDA models for topic counts start..num_topics (inclusive).

    :param corpus: gensim corpus to train on
    :param id2word: id-to-token mapping
    :param num_topics: largest topic count to try
    :param start: first topic count to try (generalized from the original
        hard-coded 31; the default preserves the old behavior)

    NOTE(review): assumes the LDA/ output directory already exists.
    """
    count = 1
    print(count)
    for n in range(start, num_topics + 1):
        lda = models.LdaMulticore(corpus=corpus,
                                  num_topics=n,
                                  id2word=id2word,
                                  passes=5,
                                  random_state=42)
        with open(f'LDA/LDA_model_{n}.pkl', 'wb') as f:
            pickle.dump(lda, f)
        # Simple progress counter, printed after each model is saved.
        count += 1
        print(count)
def train_predict(self):
    """Train an LDA model on the bag-of-words corpus and print its topics."""
    bow_corpus, dictionary = self._feature_preparations()
    model = models.LdaMulticore(bow_corpus,
                                num_topics=self.nb_topics,
                                id2word=dictionary,
                                passes=2,
                                workers=2)
    for topic_id, words in model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(topic_id, words))
def make_lda_tfidf(self):
    """Train and return a 20-topic LDA model over the TF-IDF-weighted corpus."""
    weighted_corpus = self.tfidf_model[self.bow_corpus]
    return models.LdaMulticore(weighted_corpus,
                               num_topics=20,
                               id2word=self.dictionary,
                               passes=2,
                               workers=4)