def testTransform(self):
    """Check the topic mixture the model assigns to the first corpus document.

    LDA training can settle in a poor local optimum, so the model is rebuilt
    from scratch (new random initialization) up to five times before the
    test is declared failed.
    """
    target = [0.13, 0.87]
    passed = False
    for attempt in range(5):  # at most five restarts
        # build a fresh model and train it on the corpus
        lda = ldamulticore.LdaMulticore(id2word=dictionary, num_topics=2, passes=100)
        lda.update(corpus)

        # project the first document and densify it for easy comparison
        first_doc = list(corpus)[0]
        dense = matutils.sparse2full(lda[first_doc], 2)

        # topic order is arbitrary, so compare the sorted values only
        got = sorted(dense)
        passed = numpy.allclose(got, sorted(target), atol=1e-2)
        if passed:
            break
        logging.warning(
            "LDA failed to converge on attempt %i (got %s, expected %s)" %
            (attempt, got, sorted(target)))
    self.assertTrue(passed)
def testTopicSeeding(self):
    """Check that a strong ``eta`` prior on 'system' steers it into the seeded topic.

    The word 'system' is seeded into topic 0 and then into topic 1; in both
    cases the seeded topic must give 'system' a high weight and 'trees' a low
    one, and the other topic the reverse. Each seeding direction gets up to
    five training attempts.
    """
    expected = [[0.385, 0.022], [0.025, 0.157]]
    passed = False
    for topic in range(2):
        # seed both ways round: the same topics should come out, with their
        # order following the seeding
        for attempt in range(5):  # at most five restarts
            eta = numpy.ones((2, len(dictionary))) * 0.5
            system = dictionary.token2id[u'system']

            # boost the prior of 'system' in the seeded topic to ten times
            # that of every other word
            eta[topic, system] *= 10

            model = ldamulticore.LdaMulticore(
                id2word=dictionary, num_topics=2, passes=200, eta=eta)
            model.update(corpus)

            topics = [
                {word: p for p, word in model.show_topic(j)}
                for j in range(2)
            ]

            # seeded topic: high weight for 'system', low for 'trees';
            # the unseeded topic is expected to show the opposite pattern
            seeded, other = topics[topic], topics[1 - topic]
            result = [
                [seeded.get(u'system', 0), seeded.get(u'trees', 0)],
                [other.get(u'system', 0), other.get(u'trees', 0)],
            ]
            passed = numpy.allclose(result, expected, atol=1e-2)
            if passed:
                break
            logging.warning(
                "LDA failed to converge on attempt %i (got %s, expected %s)" %
                (attempt, result, expected))
        self.assertTrue(passed)
def topic_model(tokenized_docs, num_topics=10, iterations=50, passes=10,
                chunksize=2000, workers=DEFAULT_WORKERS, **kwargs):
    """Train an LdaMulticore model and score it with perplexity and c_v coherence.

    The id2word mapping and bag-of-words corpus come from
    ``create_id2word(tokenized_docs)``. Extra keyword arguments are accepted
    but are not used by this function.

    Returns:
        Tuple ``(model, corpus, coherence, perplexity)``.
    """
    id2word, corpus = create_id2word(tokenized_docs)

    lda_settings = dict(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        workers=workers,
        iterations=iterations,
        passes=passes,
        chunksize=chunksize,
        eval_every=10,  # setting this to one slows down training by ~2x
        per_word_topics=True,
    )
    model = ldamulticore.LdaMulticore(**lda_settings)

    # evaluate the trained model: perplexity first, then coherence
    perplexity = model.log_perplexity(corpus)
    coherence = CoherenceModel(
        model=model,
        texts=tokenized_docs,
        dictionary=id2word,
        coherence='c_v',
    ).get_coherence()

    return model, corpus, coherence, perplexity
def train():
    """Build the dictionary and corpus, train an LDA model and save it.

    Rebinds the module-level ``config`` to a fresh ``AwsTrain`` instance,
    serialises the bag-of-words corpus to ``data/corpora<dictionary_label>``
    in Matrix Market format, trains ``LdaMulticore`` on the re-loaded corpus
    and saves the model under ``config.model_folder``.
    """
    global config
    config = AwsTrain()
    logger.info('MODE: ' + config.dictionary_label)

    visual_matrix = loadVisualMatrix(config)
    imgid2wordscoretuple = prepareTexts()

    # To reuse a previously stored dictionary, swap the next statement for:
    #   dictionary = corpora.Dictionary().load(getLastDictFileName())
    dictionary = createDictionary()
    config.dict_size = len(dictionary)
    logger.info('Dict read')

    # The BOW + serialize step can be skipped when a previously serialised
    # corpus is already on disk (much quicker).
    bow_stream = BOW(dictionary=dictionary,
                     input=MyCorpus(visual_matrix, imgid2wordscoretuple))
    corpus_path = 'data/corpora' + config.dictionary_label
    gensim.corpora.MmCorpus.serialize(corpus_path, bow_stream)
    mm_corpus = gensim.corpora.MmCorpus(corpus_path)
    logger.info('Corpora read')

    num_topics = config.lda_topics
    num_passes = config.lda_passes

    # train the model
    lda_model = models.LdaMulticore(
        corpus=mm_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=num_passes,
        chunksize=config.chunksize,
        workers=4,
    )

    # persist the model for later; the file name records the settings and time
    file_name = 'lda_%i_topics_%i_passes_%s.%s.model' % (
        num_topics, num_passes, config.dictionary_label, pretty_current_time())
    lda_model.save(config.model_folder + file_name)
def latent_dirichlet_allocation(corpus_fname, output_fname, tokenizer_name="mecab"):
    """Fit a 30-topic LDA model on a one-document-per-line corpus and save it.

    Every line of ``corpus_fname`` is tokenized with the tokenizer returned by
    ``get_tokenizer(tokenizer_name)`` and reduced to its distinct tokens.
    Documents for which exactly one topic clears the 0.5 probability
    threshold are written to ``output_fname + ".results"``, one per line, as
    ``document␞tokens␞topic_id␞probability`` (fields separated by U+241E).
    The trained model is saved to ``output_fname + ".model"``.

    Args:
        corpus_fname: Path of the UTF-8 input corpus.
        output_fname: Path prefix for the ``.results`` and ``.model`` outputs.
        tokenizer_name: Name handed to ``get_tokenizer``.
    """
    make_save_path(output_fname)
    tokenizer = get_tokenizer(tokenizer_name)

    documents, tokenized_corpus = [], []
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for document in f:
            # keep each distinct token once per document
            tokens = list(set(tokenizer.morphs(document.strip())))
            documents.append(document)
            tokenized_corpus.append(tokens)

    dictionary = corpora.Dictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    lda_model = ldamulticore.LdaMulticore(corpus, id2word=dictionary,
                                          num_topics=30,
                                          minimum_probability=0.0,
                                          workers=4)

    # Only topics whose probability clears 0.5 are returned. Because the
    # topic probabilities of a document sum to 1, such a topic is the one
    # with the largest probability for that document.
    all_topics = lda_model.get_document_topics(corpus,
                                               minimum_probability=0.5,
                                               per_word_topics=False)

    # Write explicitly as UTF-8: the separator (U+241E) and the documents are
    # not representable in many platform-default encodings.
    with open(output_fname + ".results", 'w', encoding='utf-8') as f:
        for doc_idx, topic in enumerate(all_topics):
            if len(topic) == 1:
                topic_id, prob = topic[0]
                fields = [
                    documents[doc_idx].strip(),
                    ' '.join(tokenized_corpus[doc_idx]),
                    str(topic_id),
                    str(prob),
                ]
                f.write("\u241E".join(fields) + "\n")

    lda_model.save(output_fname + ".model")
def getOptimalTopicNum(self):
    """Pick the topic count with the highest u_mass coherence.

    Trains one LdaMulticore model per candidate topic count
    (1, 10, 20, ..., 90) on ``self.corpus``, plots coherence against topic
    count to ``<self.model_path>/coherence.png``, stores the winning count in
    ``self.num_topics`` and returns it.

    Returns:
        The candidate topic count whose model scored the highest coherence
        (the first such candidate when several tie).
    """
    dictionary = corpora.Dictionary(self.corpus)
    corpus = [dictionary.doc2bow(text) for text in self.corpus]

    # Candidates 1, 10, 20, ..., 90: the 0 produced by the range is replaced
    # by 1 because a model needs at least one topic.
    com_nums = [max(k, 1) for k in range(0, 100, 10)]

    coherence_list = []
    for k in com_nums:
        lda = ldamulticore.LdaMulticore(corpus=corpus,
                                        id2word=dictionary,
                                        passes=20,
                                        num_topics=k,
                                        workers=4,
                                        iterations=100,
                                        alpha='symmetric',
                                        gamma_threshold=0.001)
        coh_model_lda = CoherenceModel(model=lda,
                                       corpus=corpus,
                                       dictionary=dictionary,
                                       coherence='u_mass')
        coherence_value = coh_model_lda.get_coherence()
        coherence_list.append(coherence_value)
        print('k = {} coherence value = {}'.format(
            str(k), str(coherence_value)))

    plt.plot(com_nums, coherence_list)
    plt.xlabel('topic')
    plt.ylabel('coherence value')
    plt.draw()
    fig = plt.gcf()
    fig.savefig(self.model_path + '/coherence.png')

    # Previously self.num_topics was set to (index of the LOWEST coherence)
    # * 10, which disagreed with the returned value and could be 0. Store the
    # same best candidate that is returned.
    best_k = max(zip(com_nums, coherence_list),
                 key=operator.itemgetter(1))[0]
    self.num_topics = best_k
    print('optimal topic number = ', str(best_k))
    return best_k
def train_lda(train_corpus4):
    # Builds an LdaMulticore model on `train_corpus4` with 50 topics and
    # 50 passes, with eval_every=1 and per_word_topics=True.
    lda_train4 = ldamulticore.LdaMulticore(
        corpus=train_corpus4, num_topics=50, passes=50, eval_every=1, per_word_topics=True)
    # NOTE(review): the statement below is cut off in this view (attribute
    # access with no name); the rest of the function is not visible here.
    lda_train4.
def trainLDA(docRep, dictionary, save=False, name=""):
    '''
    Train a 20-topic LdaMulticore model and return it.

    docRep is expected to be a sparse matrix; it is wrapped in a
    Sparse2Corpus with documents_columns=False. When save is truthy the model
    is also handed to saveData under the label 'ldamodel-' + name.
    '''
    model = ldamulticore.LdaMulticore(
        Sparse2Corpus(docRep, documents_columns=False),
        num_topics=20,
        id2word=dictionary,
        workers=4,
        passes=4,
    )
    if not save:
        return model
    saveData(model, 'ldamodel-' + name)
    return model
def testPersistenceCompressed(self):
    """Round-trip a model through a '.gz' file and compare it to the original."""
    path = testfile() + '.gz'
    original = ldamulticore.LdaMulticore(self.corpus, num_topics=2)
    original.save(path)
    restored = ldamulticore.LdaMulticore.load(path, mmap=None)

    self.assertEqual(original.num_topics, restored.num_topics)
    same_beta = numpy.allclose(original.expElogbeta, restored.expElogbeta)
    self.assertTrue(same_beta)

    # projecting an empty vector must give the same result on both models
    empty_doc = []
    same_projection = numpy.allclose(original[empty_doc], restored[empty_doc])
    self.assertTrue(same_projection)
def testLargeMmapCompressed(self):
    """Loading a '.gz' model with mmap='r' is expected to raise IOError."""
    path = testfile() + '.gz'
    trained = ldamulticore.LdaMulticore(self.corpus, num_topics=2)

    # sep_limit=0 simulates storing the large arrays separately
    trained.save(path, sep_limit=0)

    # memory-mapping the arrays of the compressed model must fail
    with self.assertRaises(IOError):
        ldamulticore.LdaModel.load(path, mmap='r')
def testPersistence(self):
    """Save a model, load it back and check that both behave the same."""
    original = ldamulticore.LdaMulticore(self.corpus, num_topics=2)
    original.save(testfile())
    restored = ldamulticore.LdaMulticore.load(testfile())

    self.assertEqual(original.num_topics, restored.num_topics)
    same_beta = numpy.allclose(original.expElogbeta, restored.expElogbeta)
    self.assertTrue(same_beta)

    # projecting an empty vector must give the same result on both models
    empty_doc = []
    same_projection = numpy.allclose(original[empty_doc], restored[empty_doc])
    self.assertTrue(same_projection)
def train(self):
    """Fit the LDA model, then save the article topics and the topic words.

    ``LdaMulticore`` is used when ``self.workers`` is greater than 1,
    otherwise the single-process ``LdaModel``. Both receive
    ``self.model_params`` plus the corpus dictionary as ``id2word``.
    """
    if self.workers > 1:
        multicore_params = dict(
            self.model_params,
            id2word=self.corpus.dictionary,
            workers=self.workers,
        )
        self.model = ldamulticore.LdaMulticore(self.corpus, **multicore_params)
    else:
        single_params = dict(self.model_params, id2word=self.corpus.dictionary)
        self.model = ldamodel.LdaModel(self.corpus, **single_params)

    self.save_article_topics()
    self.save_topic_words()
def testLargeMmap(self):
    """Save a model with separately stored arrays and reload it memory-mapped.

    The reloaded model must expose ``expElogbeta`` as a ``numpy.memmap`` and
    otherwise match the original.
    """
    # Resolve the path once and use it for both save and load; the original
    # assigned it to an unused local and called testfile() again each time.
    fname = testfile()
    model = ldamulticore.LdaMulticore(self.corpus, num_topics=2)

    # simulate storing large arrays separately
    model.save(fname, sep_limit=0)

    # test loading the large model arrays with mmap
    model2 = ldamulticore.LdaModel.load(fname, mmap='r')
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(isinstance(model2.expElogbeta, numpy.memmap))
    self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))

    # try projecting an empty vector
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))
def ldaTrain(f_inputs, model_output, corpus_name, **args):
    """
    Train an LDA model on a corpus and save it to disk.

    Input Args
        f_inputs        training corpus, one document a line with ' ' separated
        model_output    path the trained model is saved to
        corpus_name     passed to genCorpus, which converts the training
                        corpus to a gensim format corpus
        **args          forwarded to the model constructor; may contain
                        'num_topics', 'alpha', 'eta'

    LdaMulticore (with CPU_NUM workers) is used when CPU_NUM > 1, otherwise
    the single-process LdaModel. The model is saved without its 'state' and
    'dispatcher' attributes.
    """
    lda_corpus, id2word = genCorpus(corpus_name=corpus_name, f_inputs=f_inputs)

    skipped_on_save = ['state', 'dispatcher']
    if CPU_NUM > 1:
        trained = ldamulticore.LdaMulticore(
            lda_corpus, workers=CPU_NUM, id2word=id2word, **args)
    else:
        trained = ldamodel.LdaModel(lda_corpus, id2word=id2word, **args)

    trained.save(model_output, ignore=skipped_on_save)
def saveLDAModel(self, model_path):
    """Train an LDA model on ``self.corpus`` and write it under ``model_path``.

    Three files are written inside ``model_path``:

    * ``lda.results``: one line per document for which exactly one topic
      clears the 0.5 probability threshold, formatted as
      ``document␞tokens␞topic_id␞probability`` (separator U+241E);
    * ``lda.model``: the saved gensim model;
    * ``model.dictionary``: the pickled gensim dictionary.

    Args:
        model_path: Directory the output files are written to.

    Returns:
        The trained ``LdaMulticore`` model.
    """
    print(' ...start to build lda model...')
    dictionary = corpora.Dictionary(self.corpus)
    corpus = [dictionary.doc2bow(text) for text in self.corpus]

    lda_model = ldamulticore.LdaMulticore(corpus=corpus,
                                          id2word=dictionary,
                                          passes=20,
                                          num_topics=self.num_topics,
                                          workers=4,
                                          iterations=100,
                                          alpha='symmetric',
                                          gamma_threshold=0.001)

    all_topics = lda_model.get_document_topics(corpus,
                                               minimum_probability=0.5,
                                               per_word_topics=False)

    documents = self.documents
    with open(model_path + '/lda.results', 'w', encoding='utf-8') as f:
        for doc_idx, topic in enumerate(all_topics):
            if len(topic) == 1:
                topic_id, prob = topic[0]
                fields = [
                    documents[doc_idx].strip(),
                    ' '.join(self.corpus[doc_idx]),
                    str(topic_id),
                    str(prob),
                ]
                f.write("\u241E".join(fields) + '\n')

    lda_model.save(model_path + '/lda.model')

    # The separator was missing here, which placed the dictionary next to
    # the model directory ("<model_path>model.dictionary") instead of inside
    # it like the other two outputs.
    with open(model_path + '/model.dictionary', 'wb') as f:
        pickle.dump(dictionary, f)

    return lda_model
def saveLDAModel(self):
    """Train an LDA model on ``self.corpus_tfidf`` and write all artefacts.

    Every output path starts with ``self.model_path + self.data_name``:
    ``_lda_model.pickle`` (pickled model), ``_lda.results`` (documents with
    exactly one topic clearing the 0.5 threshold, fields separated by
    U+241E), ``_lda.model`` (gensim save) and ``_model.dictionary`` (pickled
    dictionary).

    Returns:
        The trained ``LdaMulticore`` model.
    """
    print(' ...start to build lda model...')
    prefix = self.model_path + self.data_name

    lda_model = ldamulticore.LdaMulticore(
        corpus=self.corpus_tfidf,
        id2word=self.dictionary,
        passes=20,
        num_topics=self.num_topics,
        workers=4,
        iterations=100,
        alpha='symmetric',
        gamma_threshold=0.001,
    )

    with open(prefix + '_lda_model.pickle', 'wb') as pickle_out:
        pickle.dump(lda_model, pickle_out)

    doc_topics = lda_model.get_document_topics(
        self.corpus_tfidf, minimum_probability=0.5, per_word_topics=False)

    documents = self.documents
    with open(prefix + '_lda.results', 'w', -1, 'utf-8') as results_out:
        for doc_idx, topic in enumerate(doc_topics):
            print(doc_idx, ' || ', topic)
            if len(topic) != 1:
                continue
            dominant = topic[0]
            topic_id = dominant[0]
            prob = dominant[1]
            record = "\u241E".join([
                str(doc_idx),
                documents[doc_idx].strip(),
                ' '.join(self.corpus[doc_idx]),
                str(topic_id),
                str(prob),
            ])
            results_out.write(record + '\n')

    lda_model.save(prefix + '_lda.model')

    with open(prefix + '_model.dictionary', 'wb') as dict_out:
        pickle.dump(self.dictionary, dict_out)

    return lda_model
def getOptimalTopicNum(self):
    """Choose a topic count from 10, 20, ..., 50 using u_mass coherence.

    One LdaMulticore model is trained per candidate on ``self.corpus_tfidf``.
    The coherence values and their successive differences ("delta") are
    written to ``<model_path><data_name>_coherence_delta.csv`` and the
    coherence curve to ``<model_path><data_name>_coherence.png``.

    Selection rule: take the candidate with the largest delta, unless the
    first candidate's coherence is greater than or equal to that largest
    delta, in which case the first candidate is chosen.

    Returns:
        The selected topic count.
    """
    com_nums = list(range(10, 60, 10))

    coherence_list = []
    for k in com_nums:
        lda = ldamulticore.LdaMulticore(corpus=self.corpus_tfidf,
                                        id2word=self.dictionary,
                                        passes=20,
                                        num_topics=k,
                                        workers=4,
                                        iterations=100,
                                        alpha='symmetric',
                                        gamma_threshold=0.001)
        coh_model_lda = CoherenceModel(model=lda,
                                       corpus=self.corpus_tfidf,
                                       dictionary=self.dictionary,
                                       coherence='u_mass')
        coherence_value = coh_model_lda.get_coherence()
        coherence_list.append(coherence_value)
        print('k = {} coherence value = {}'.format(
            str(k), str(coherence_value)))

    df = pd.DataFrame({'num': com_nums, 'co_value': coherence_list})
    # change in coherence relative to the previous candidate (NaN for the first)
    df['delta'] = df['co_value'].diff()
    df_find = df[df['delta'] == df['delta'].max()]

    # NOTE(review): this compares a coherence value against a coherence
    # *difference*; kept as-is, confirm that this is the intended rule.
    if coherence_list[0] >= df_find['delta'].tolist()[0]:
        optimal_num = com_nums[0]
    else:
        optimal_num = df_find['num'].tolist()[0]

    print('==== coherence values =====')
    print(df, end='\n')
    print('==== final values =====')
    print(df_find)
    df.to_csv(self.model_path + self.data_name + '_coherence_delta.csv',
              mode='w', encoding='utf-8')

    plt.plot(com_nums, coherence_list, marker='o')
    plt.xlabel('topic')
    plt.ylabel('coherence value')
    plt.draw()
    fig = plt.gcf()
    fig.savefig(self.model_path + self.data_name + '_coherence.png')

    print('optimal topic number = ', optimal_num)
    return optimal_num
# neg_corpus = [neg_dict.doc2bow(i) for i in ne['2']] # neg_lda = ldamulticore.LdaMulticore(neg_corpus,num_topics = 3,id2word = neg_dict, workers=48) # 正面主题分析 pos_dict = corpora.Dictionary(po) pos_corpus = [pos_dict.doc2bow(i) for i in po] joblib.dump(pos_dict, args.model_file + ".dic") joblib.dump(pos_corpus, args.model_file + ".cps") # pos_lda = ldamulticore.LdaMulticore(pos_corpus,num_topics= 3,id2word =pos_dict, workers=1) score_dic = {} lda_modes = [] for n in range(1, 5): if platform == "linux" or platform == "linux2": pos_lda = ldamulticore.LdaMulticore(pos_corpus, num_topics=n * 5, id2word=pos_dict, workers=4) goodcm = CoherenceModel(model=pos_lda, texts=po, dictionary=pos_dict, coherence='c_v', processes=4) elif platform == "win32": pos_lda = ldamodel.LdaModel(pos_corpus, num_topics=n * 5, id2word=pos_dict) goodcm = CoherenceModel(model=pos_lda, texts=po, dictionary=pos_dict, coherence='c_v', processes=1)