def create_LDA(comment_dict, num_topics=20, chunk_size=50, max_iter=20,
               from_db=True, get_data_func=None):
    """Train an LDA model incrementally over chunks of preprocessed documents.

    Documents are pulled from ``data_preprocessor``, converted to
    bag-of-words with ``comment_dict`` and buffered.  Every time the buffer
    holds ``chunk_size`` documents it is used to create the model (first
    chunk) or to update it (later chunks).  Documents still buffered when
    the generator is exhausted are trained on as a final, smaller chunk
    instead of being silently discarded.

    Args:
        comment_dict: Object providing ``doc2bow``; also handed to the model
            as ``id2word``.
        num_topics: Number of topics for the model.
        chunk_size: Number of documents buffered per training step.
        max_iter: Forwarded unchanged to ``data_preprocessor``.
        from_db: Forwarded unchanged to ``data_preprocessor``.
        get_data_func: Forwarded unchanged to ``data_preprocessor``.

    Returns:
        The trained ``LdaModel``, or ``None`` when no non-empty document was
        produced.
    """

    def train_on(model, chunk):
        # The first chunk creates the model; later chunks update it online.
        if model is None:
            return LdaModel(corpus=chunk, num_topics=num_topics,
                            id2word=comment_dict, per_word_topics=1,
                            passes=10)
        model.update(corpus=chunk)
        return model

    lda = None
    corpus = []
    text_gen = data_preprocessor(max_iter=max_iter, from_db=from_db,
                                 get_data_func=get_data_func)
    for _, stemmed_text, _ in text_gen:
        if len(stemmed_text) != 0:
            corpus.append(comment_dict.doc2bow(stemmed_text))
        if len(corpus) == chunk_size:
            lda = train_on(lda, corpus)
            corpus = []

    # Flush the trailing partial chunk; without this, up to chunk_size - 1
    # documents were dropped and small inputs returned None.
    if corpus:
        lda = train_on(lda, corpus)
    return lda
def main(vocab_file, inv_vocab_file, infiles):
    """Train a 200-topic LDA model over several input files and save it.

    Args:
        vocab_file: Path handed to ``load_pickled``; the result is passed to
            every ``TweetCorpus``.
        inv_vocab_file: Path handed to ``load_pickled``; the result is used
            as the model's ``id2word`` mapping.
        infiles: Iterable of inputs, each wrapped in a ``TweetCorpus`` and
            fed to the model in order.

    The trained model is written to ``topics.lda``.
    """
    vocabulary = load_pickled(vocab_file)
    inverse_vocabulary = load_pickled(inv_vocab_file)

    model = LdaModel(id2word=inverse_vocabulary, num_topics=200)
    for source in infiles:
        # One online update per input file.
        model.update(TweetCorpus(source, vocabulary))

    model.save('topics.lda')
def gensim_lda(d):
    """Train a 20-topic LDA model over batches of documents and save it.

    A single dictionary is built from every document in every batch, then
    the model is updated once per batch.  The model is saved to
    ``model_20`` and its topics (10 words each) are printed.

    Args:
        d: Sequence of batches; each batch is an iterable of documents in
            the form accepted by ``Dictionary`` / ``doc2bow``.
    """
    from gensim import corpora
    from gensim.models.ldamodel import LdaModel

    # Flatten in one pass; repeated ``list + list`` was quadratic.
    all_docs = [doc for batch in d for doc in batch]
    dictionary = corpora.Dictionary(all_docs)

    model = LdaModel(num_topics=20, id2word=dictionary)
    for i, batch in enumerate(d):
        print('Generating corpus and updating model ', i)
        corpus = [dictionary.doc2bow(doc) for doc in batch]
        model.update(corpus)

    model.save('model_20')
    print(model.show_topics(num_topics=20, num_words=10))
def build_all():
    """Build and save a grid of LDA models for later evaluation.

    One model is trained for every combination of topic count
    (5, 10, 15, 20, 25) and aggregation style (``basic``, ``user``,
    ``user_day_room``).  The styles are paired positionally with the queries
    returned by ``get_model_sql()``.  Each model is trained for
    ``N_EPOCHS`` passes over its query result, streamed in chunks of 10000
    rows, and saved to ``saved_models/model_<style>_<n_topics>``.
    """
    from contextlib import closing

    MIN_OCCURANCE = 100
    N_EPOCHS = 10
    styles = ['basic', 'user', 'user_day_room']

    # sqlite3's own context manager only commits or rolls back; closing()
    # makes sure the connection is actually released on exit.
    with closing(sqlite3.connect('../database/chat.db')) as conn:
        # Vocabulary: every word that occurs at least MIN_OCCURANCE times.
        words = pd.read_sql(
            'select word from words where freq >= {}'.format(MIN_OCCURANCE),
            conn)['word'].tolist()
        vocab = Dictionary([words])

        # The queries do not depend on the topic count, so fetch them once.
        style_queries = list(zip(styles, get_model_sql()))

        for n_topics in range(5, 26, 5):
            for style, sql in style_queries:
                lda_model = LdaModel(id2word=vocab,
                                     num_topics=n_topics,
                                     alpha='auto',
                                     per_word_topics=True)

                print('training model_{0}_{1}'.format(style, n_topics))
                for epoch in range(N_EPOCHS):
                    print('\tepoch', epoch, '...', end='\r')
                    for chunk in pd.read_sql(sql, conn, chunksize=10000):
                        # Each row's 'lemma' column is a space-separated
                        # token string.
                        chunk_corpa = [
                            vocab.doc2bow(text)
                            for text in chunk['lemma'].str.split(' ').tolist()
                        ]
                        lda_model.update(chunk_corpa)
                    print('\tepoch', epoch, '... done!')

                lda_model.save("saved_models/model_{0}_{1}".format(
                    style, n_topics))
def load_or_build_lda_model(conn):
    """Return ``(vocab, lda_model)``, loading from disk or training afresh.

    The pretrained ``user_day_room`` 10-topic model and its ``.id2word``
    vocabulary are loaded when possible.  If loading fails for any reason,
    a new model is trained from the database behind ``conn`` (texts
    aggregated per user, per room, per day), saved, and returned.

    Args:
        conn: Database connection passed to ``pd.read_sql``.

    Returns:
        Tuple ``(vocab, lda_model)``.
    """
    pretrained_path = '../model_development/saved_models/model_user_day_room_10'
    try:
        vocab = SaveLoad.load(pretrained_path + '.id2word')
        lda_model = LdaModel.load(pretrained_path)
        print('Pretrained lda model loaded!')
        return vocab, lda_model
    except Exception as err:
        # Any load failure falls back to training.  ``Exception`` (not a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        print('Could not load pretrained lda model ({}); training a new one.'
              .format(err))

    # Query aggregating texts per user per room per day.
    sql = """
        select group_concat(lemma, ' ') as lemma
        from lemmas
        join (
            select chat_id
                 , from_userid
                 , strftime('%Y-%m-%d', sent_at) as sent_day
                 , room_id
            from chats
        ) using (chat_id)
        where nullif(lemma, '') is not null
        group by from_userid
               , sent_day
               , room_id
        order by random();
    """

    # Vocabulary: every word that occurs at least MIN_OCCURANCE times.
    MIN_OCCURANCE = 100
    vocab = Dictionary([
        pd.read_sql(
            'select word from words where freq >= {}'.format(MIN_OCCURANCE),
            conn)['word'].tolist()
    ])

    N_EPOCHS = 10
    n_topics = 10
    style = 'user_day_room'

    lda_model = LdaModel(id2word=vocab,
                         num_topics=n_topics,
                         alpha='auto',
                         per_word_topics=True)

    print('training model_{0}_{1}'.format(style, n_topics))
    for epoch in range(N_EPOCHS):
        print('\tepoch', epoch, '...', end='\r')
        for chunk in pd.read_sql(sql, conn, chunksize=10000):
            chunk_corpa = [
                vocab.doc2bow(text)
                for text in chunk['lemma'].str.split(' ').tolist()
            ]
            lda_model.update(chunk_corpa)
        print('\tepoch', epoch, '... done!')

    # NOTE(review): this relative save path differs from the path the model
    # is loaded from above -- confirm both resolve to the same directory.
    lda_model.save("saved_models/model_{0}_{1}".format(style, n_topics))

    # The original returned the never-assigned name ``ldamodel`` here, which
    # raised UnboundLocalError every time the fallback branch ran.
    return vocab, lda_model
# Build the vocabulary and the bag-of-words corpus from the tokenised
# documents in ``resultlist``.
dictionary = corpora.Dictionary(resultlist)
corpus = [dictionary.doc2bow(text) for text in resultlist]

# Each CSV row is kept as one list of words.
# NOTE(review): binary-mode csv reading, ``iteritems()`` and the
# ``encode('utf-8')`` comparison below look like Python 2 semantics --
# confirm the target interpreter before porting.
with open('mcdi_word.csv', 'rb') as f:
    reader = csv.reader(f)
    wlist = list(reader)

# For every CSV row, collect the dictionary ids of the words it contains.
idlist = []
for row in wlist:
    row_words = set(row)  # constant-time membership instead of a list scan
    idlist.append([
        word_id
        for word_id, word in dictionary.iteritems()
        if word.encode('utf-8') in row_words
    ])

a = 0.05
ntopic = 75

# Prior matrix (topics x vocabulary): 0.5 everywhere, multiplied by 1000 for
# the words of CSV row x in topic row x.
eta_arr = ones((ntopic, len(dictionary))) * 0.5
for topic_idx, word_ids in enumerate(idlist):
    # ``word_id`` rather than ``id`` so the builtin is not shadowed.
    for word_id in word_ids:
        eta_arr[topic_idx, word_id] *= 1000

# NOTE(review): ``a`` and ``eta_arr`` are computed but never passed to
# LdaModel, so the model below is trained with default priors -- confirm
# whether ``alpha=a`` / ``eta=eta_arr`` were intended.
lda = LdaModel(id2word=dictionary, num_topics=ntopic)
lda.update(corpus)
topiclist = lda.print_topics(num_topics=ntopic, num_words=50)
lda.save('childs_file_75.model')
# Iteration 0 trains a fresh model; every later iteration updates it online.
if i != 0:
    lda_model.update(corpus=train_data,
                     decay=learning_decay,
                     iterations=valid_iter)
else:
    if lda_model is not None:
        # Drop the reference to the previous model before building the
        # replacement.
        del lda_model
    lda_model = LdaModel(
        corpus=train_data,
        id2word=dictionary,
        num_topics=n_component,
        decay=learning_decay,
        iterations=valid_iter,
        random_state=0,
    )

# Record u_mass coherence for the train and test splits, in that order.
train_coherence = CoherenceModel(model=lda_model,
                                 corpus=train_data,
                                 dictionary=dictionary,
                                 coherence='u_mass')
train_s.append(train_coherence.get_coherence())

test_coherence = CoherenceModel(model=lda_model,
                                corpus=test_data,
                                dictionary=dictionary,
                                coherence='u_mass')
test_s.append(test_coherence.get_coherence())

# Record log-perplexity for the same two splits.
train_p.append(lda_model.log_perplexity(train_data))
test_p.append(lda_model.log_perplexity(test_data))
class LDA(object):
    """Thin wrapper around an ``LdaModel`` and its matching ``Dictionary``."""

    def __init__(self, topics=10, worker=3, pretrained_model=None,
                 dictionary=None):
        """Initialise the LDA trainer.

        Args:
            topics -- number of topics
            worker -- parallelism parameter, usually the number of cores
                      minus one (stored; not passed to the model here)
            pretrained_model -- file of a previously trained model; online
                      updating is supported, so the model from an earlier
                      run can be loaded and continued
            dictionary -- file of the word-to-id dictionary that belongs to
                      the model (words are converted to ids for training)

        Example:
            >>> lda = LDA(topics = 20, worker = 2, pretrained_model = model_file, dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # Both files are required.  The original tested the undefined name
        # ``common_dictionary`` and raised NameError whenever a pretrained
        # model was supplied.
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """Save the trained model together with its dictionary.

        Args:
            model_file -- model file
            dictionary_file -- dictionary file
        Returns:
            None.  Nothing is written for a part that does not exist yet.
        """
        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=None):
        """Train on ``corpus``, creating the model on the first call.

        Args:
            corpus -- list of documents (each a list of words) to train on;
                      ``None`` or an empty list is a no-op
        """
        if corpus is None or len(corpus) == 0:
            return

        if self._model is None:
            # First call: build the vocabulary and the model together.
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model = LdaModel(corpus_data, self._topics)
        else:
            # The model's vocabulary size is fixed at creation.  Growing the
            # dictionary here would emit word ids the model has no column
            # for, so unseen words are skipped by doc2bow instead.
            new_corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model.update(new_corpus_data)

    def inference(self, document=None):
        """Infer the topic distribution of a new document.

        Args:
            document -- the document, i.e. a list of words
        Returns:
            Result of ``get_document_topics`` for a one-document corpus, or
            an empty list when no model has been trained or loaded.
        """
        if not self._model:
            return []
        words = [] if document is None else document
        doc = [self._common_dictionary.doc2bow(words)]
        return self._model.get_document_topics(doc)

    @property
    def model(self):
        """The underlying model, or ``None`` before training/loading."""
        return self._model

    @property
    def dictionary(self):
        """The dictionary paired with the model, or ``None``."""
        return self._common_dictionary
class LDA(object):
    """Thin wrapper around an ``LdaModel`` and its matching ``Dictionary``."""

    def __init__(self, topics=10, worker=3, pretrained_model=None,
                 dictionary=None):
        """Initialise the LDA trainer.

        Args:
            topics -- number of topics
            worker -- parallelism parameter, usually the number of cores
                      minus one (stored; not passed to the model here)
            pretrained_model -- file of a previously trained model; online
                      updating is supported, so the model from an earlier
                      run can be loaded and continued
            dictionary -- file of the word-to-id dictionary that belongs to
                      the model (words are converted to ids for training)

        Example:
            >>> lda = LDA(topics = 20, worker = 2, pretrained_model = model_file, dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # Both files are required.  The original tested the undefined name
        # ``common_dictionary`` and raised NameError whenever a pretrained
        # model was supplied.
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """Save the trained model together with its dictionary.

        Args:
            model_file -- model file
            dictionary_file -- dictionary file
        Returns:
            None.  Nothing is written for a part that does not exist yet.
        """
        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=None):
        """Train on ``corpus``, creating the model on the first call.

        Args:
            corpus -- list of documents (each a list of words) to train on;
                      ``None`` or an empty list is a no-op
        """
        if corpus is None or len(corpus) == 0:
            return

        if self._model is None:
            # First call: build the vocabulary and the model together.
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model = LdaModel(corpus_data, self._topics)
        else:
            # The model's vocabulary size is fixed at creation.  Growing the
            # dictionary here would emit word ids the model has no column
            # for, so unseen words are skipped by doc2bow instead.
            new_corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model.update(new_corpus_data)

    def inference(self, document=None):
        """Infer the topic distribution of a new document.

        Args:
            document -- the document, i.e. a list of words
        Returns:
            Result of ``get_document_topics`` for a one-document corpus, or
            an empty list when no model has been trained or loaded.
        """
        if not self._model:
            return []
        words = [] if document is None else document
        doc = [self._common_dictionary.doc2bow(words)]
        return self._model.get_document_topics(doc)

    @property
    def model(self):
        """The underlying model, or ``None`` before training/loading."""
        return self._model

    @property
    def dictionary(self):
        """The dictionary paired with the model, or ``None``."""
        return self._common_dictionary
def train_LDA(base_path, table_paths, batch_size, limit, use_dictionary=False, **kwargs):
    """Train an LDA model in two streaming passes and cache it on disk.

    Pass 1 obtains the dictionary: it is either loaded from ``LDA_CACHE`` or
    built batch by batch from ``corpus_iter`` and then saved there.  Pass 2
    creates the model from the first batch and updates it with every
    following batch.  The model is saved to ``LDA_CACHE`` as
    ``model_<model_name>``.

    Args:
        base_path: Forwarded to ``corpus_iter``.
        table_paths: Forwarded to ``corpus_iter``; also reported at the end.
        batch_size: Forwarded to ``corpus_iter``; also reported at the end.
        limit: Forwarded to ``corpus_iter``.
        use_dictionary: Load the cached dictionary instead of rebuilding it.
            Both the boolean ``True`` and the string ``'True'`` are accepted.
        **kwargs: Model options.  ``tn`` (topic count), ``thr`` and ``num``
            are read here; everything is also forwarded to ``dic2name`` and
            ``corpus_iter``.

    Raises:
        ValueError: If ``corpus_iter`` yields no batch for the training pass.
    """
    model_name = dic2name(kwargs)
    print("Model: ", model_name)
    topic_num = kwargs['tn']
    dictionary_file = join(LDA_CACHE, 'dictionary_{}'.format(model_name))

    # Pass 1: get the dictionary.
    # The original compared only against the string 'True', so passing the
    # boolean True (the natural counterpart of the default) was ignored.
    if use_dictionary is True or use_dictionary == 'True':
        dic = Dictionary.load(dictionary_file)
    else:
        dic = Dictionary([])
        batches = corpus_iter(base_path, table_paths, batch_size, limit,
                              **kwargs)
        for b, corpus in enumerate(batches):
            dic.add_documents(corpus)
            print('Dictionary batch {}: current dic size {}'.format(
                b, len(dic)))
        dic.save(dictionary_file)
    print("Dictionary size", len(dic))

    # Pass 2: train LDA, creating the model from the first batch.
    whole_corpus = corpus_iter(base_path, table_paths, batch_size, limit,
                               **kwargs)
    first_batch = next(whole_corpus, None)
    if first_batch is None:
        # Clear error instead of a bare StopIteration.
        raise ValueError('corpus_iter produced no batches to train on')
    first_bow = [dic.doc2bow(text, allow_update=False) for text in first_batch]
    lda = LdaModel(first_bow,
                   id2word=dic,
                   num_topics=topic_num,
                   minimum_probability=0.0)
    print('LDA update batch {}'.format(0))

    for batch_no, batch in enumerate(whole_corpus, start=1):
        batch_bow = [dic.doc2bow(text, allow_update=False) for text in batch]
        lda.update(batch_bow)
        print('LDA update batch {}'.format(batch_no))

    # Save model to disk.
    temp_file = join(LDA_CACHE, "model_{}".format(model_name))
    lda.save(temp_file)

    # Adjacent literals replace the backslash continuation, which leaked the
    # continuation line's indentation into the printed message.
    print(
        "Training from {} done. Batch_size: {}, long str tokenization "
        "threshold: {}, numerical representations: {}."
        "\nTotal size of dictionary: {}".format(table_paths, batch_size,
                                                kwargs['thr'], kwargs['num'],
                                                len(dic)))
    return
class LDAWDF:
    """LDA topic model over stored page contents, with database export."""

    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        """Store the database handle and the default save locations.

        Args:
            mysql: Object used as a context manager yielding the database
                access object (see ``trainFromStart`` / ``fillDb``).
        """
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        """Train a new 200-topic model on every stored content text."""
        with self.mysql as db:
            content = db.getContentsText()
            # Whitespace-tokenise each document.
            documents = [item['content'].split() for item in content]

        self.dictionary = corpora.Dictionary(documents)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]
        self.corpus = doc_term_matrix

        # Running and training the LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        """Print 10 topics with 5 words each."""
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        """Save the model and the dictionary into ``dataFolder``."""
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        """Return True when both the model and dictionary files exist."""
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        """Update the model online with ``corpus``."""
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        """Load the model and dictionary from ``dataFolder``.

        Args:
            subfolder: Optional sub-directory of ``dataFolder`` to load from.
        """
        sf = subfolder + '/' if subfolder else ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(
            self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        """Recompute per-URL topics and per-topic terms and store them."""
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]

        # "Old": label every topic with its 3 strongest terms.
        for topicId in range(nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = [self.dictionary.get(term[0]) for term in topicTerms]
            topics[topicId] = ' '.join(words)

        # ``self.mysql`` rather than the bare name ``mysql``: inside a
        # method the bare name is the module-level ``mysql``, not the handle
        # given to __init__ (trainFromStart already uses ``self.mysql``).
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(
                    bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    # NOTE(review): ``result`` is filled but never written
                    # anywhere below -- confirm whether it is still needed.
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append(
                            (element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()

        # "New": store the 5 strongest terms of every topic.
        terms = []
        for topicId in range(nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId,
                              self.dictionary.get(topicTerm[0]),
                              str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)

    def get_terms_topics(self, keywords):
        """Look up the topics associated with up to 30 keywords.

        Args:
            keywords: Sequence of words; only the first 30 are used.

        Returns:
            Dict with ``'topics'`` (topic id -> ``show_topic`` result) and
            ``'keywords'`` (word id -> ``{'word', 'topics'}``).
        """
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {
                'word': self.dictionary.get(word[0]),
                'topics': wordTopics,
            }
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}