import pickle

from gensim import corpora
from gensim.models import CoherenceModel, LdaModel, TfidfModel

# Assumes min_char(), compute_coherence_values(), and print_result() are
# project helpers defined elsewhere.
def ldamodel(doc_clean, n_topics, n_words, description, tfidfmodel=False, unseen_docs=None):
    doc_clean = [min_char(doc).split() for doc in doc_clean]
    dictionary = corpora.Dictionary(doc_clean)
    # Convert the list of documents (corpus) into a document-term matrix
    # using the dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=doc_clean,
                             start=2, limit=40, step=6)
    if tfidfmodel:
        tfidf = TfidfModel(corpus, id2word=dictionary, smartirs='ntc')
        corpus = tfidf[corpus]
    ldamodel = LdaModel(corpus, num_topics=n_topics, id2word=dictionary,
                        random_state=1, passes=50, per_word_topics=True)
    print("# LDA topics")
    for i in range(n_topics):
        terms = ldamodel.show_topic(i, n_words)
        print("Topic #" + str(i) + ": ",
              ", ".join(term + '*' + str(weight) for term, weight in terms))
    print('Bound: ', ldamodel.bound(corpus))
    # Compute perplexity.
    print('Perplexity: ', ldamodel.log_perplexity(corpus))
    # Compute the coherence score.
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    if unseen_docs:
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for i, unseen_doc in enumerate(corpus_new):
            topic = None
            score = 0
            # With per_word_topics=True the model returns (topic_dist, word_topics,
            # word_phis); index 0 is the document's topic distribution.
            inference_doc = ldamodel[unseen_doc]
            print(unseen_docs[i])
            for index, tmp_score in inference_doc[0]:
                if tmp_score > score:
                    score = tmp_score
                    topic = ldamodel.print_topic(index, 5)
            print("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", ldamodel.log_perplexity(corpus_new))
    print_result(ldamodel, doc_clean, corpus, n_topics, description)
    pickle.dump(corpus, open(description + '.pkl', 'wb'))
    dictionary.save(description + '_dictionary.gensim')
    ldamodel.save(description + '_ldamodel.gensim')
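# A minimal usage sketch for the ldamodel() helper above, assuming the project
# helpers it calls (min_char, compute_coherence_values, print_result) are
# defined elsewhere; `sample_docs` is a made-up toy corpus for illustration only.
sample_docs = [
    "topic models extract recurring themes from document collections",
    "latent dirichlet allocation represents documents as mixtures of topics",
    "tf-idf reweighting can be applied to the bag-of-words corpus first",
]
ldamodel(sample_docs, n_topics=2, n_words=5, description='demo')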
def getPageID2TopicDist(docs, n_topics):
    """
    :param docs: [(docID, [word1, word2, ...]), ...]
    :param n_topics: number of topics to generate from the data
    :return: dict mapping docID -> [0.9, 0.1, ...] (sequential topic probabilities)
    """
    # Prepare the corpus in gensim format.
    texts = [doc[1] for doc in docs]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Apply LDA to the corpus.
    lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics, passes=4)

    # Print the top terms of each topic.
    for i in range(lda.num_topics):
        print([term for term, _ in lda.show_topic(topicid=i, topn=8)])

    # Get the topic distribution of each document.
    pageID2TopicDist = dict()
    for pageid, words in docs:
        doc_bow = dictionary.doc2bow(words)
        dlist = [0.0] * n_topics
        for topic_id, prob in lda[doc_bow]:
            dlist[topic_id] = prob
        pageID2TopicDist[pageid] = dlist
    return pageID2TopicDist
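# A minimal usage sketch for getPageID2TopicDist(); the page IDs and tokens
# below are invented toy data for illustration.
toy_docs = [
    (101, ['apple', 'banana', 'fruit', 'market']),
    (102, ['python', 'gensim', 'topic', 'model']),
    (103, ['fruit', 'price', 'market', 'apple']),
]
for page_id, dist in getPageID2TopicDist(toy_docs, n_topics=2).items():
    print(page_id, dist)  # each dist is a length-2 list summing to ~1.0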
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

num_topics = 10

dictionary = Dictionary(texts)
dictionary.filter_extremes(no_above=0.2)
corpus = [dictionary.doc2bow(text) for text in texts]
print('done with corpus.')

lda = LdaModel(corpus, id2word=dictionary, iterations=100, num_topics=num_topics)
for i in range(num_topics):
    print("-----------------------------------")
    print(lda.print_topic(i))
# lda.save('lda')

# Write each topic's top terms to its own file, plus one extra placeholder file.
topic = []
for i in range(num_topics):
    topic.append(open("./data/topic_" + str(i) + ".txt", "w", encoding='utf-8'))
    topic[i].write(lda.print_topic(i) + '\n')
topic.append(open("./data/topic_" + str(num_topics) + ".txt", "w", encoding='utf-8'))
topic[num_topics].write("empty topic" + '\n')

flag = False
origin = open("./data/output.txt").readlines()
# gensim ##################
from gensim import corpora, similarities
from gensim.models.ldamodel import LdaModel
from gensim.models.tfidfmodel import TfidfModel

# gensim expects the corpus as a list of token lists, e.g. [['a', 'b'], ['c', 'd'], ...]
dictionary = corpora.Dictionary(contents_clean)

# words to vectors
corpus = [dictionary.doc2bow(text) for text in contents_clean]
lda = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)

print(lda.print_topic(1, 10))
print(lda.print_topic(2, 10))
print(lda.print_topic(3, 10))
print('--------------------------')
for topic in lda.print_topics(num_topics=10, num_words=5):
    print(topic[1])

print('------------tfidf---------')
tfidf = TfidfModel(corpus=corpus)
print(tfidf[dictionary.doc2bow([u'中国', u'报道'])])
from pprint import pprint
from gensim.models.ldamodel import LdaModel

# Construct the LDA model.
model_lda = LdaModel(corpus=corpus,
                     id2word=id_to_word,
                     num_topics=25,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

# Print the keywords of every topic.
pprint(model_lda.print_topics())

# Print the most important keywords, with their respective weights, for the
# topic with index 24.
pprint(model_lda.print_topic(24))

# Print the top keywords for the topic with index 1.
pprint(model_lda.print_topic(1))

# TODO (Lee) - infer topic from keywords?

# #### Alternate workflow to evaluate - model #1

# In[ ]:

# uncomment to use
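# A minimal sketch for the "alternate workflow to evaluate" cell above,
# assuming `corpus`, `id_to_word`, and a `texts` list of tokenized documents
# are already in scope; perplexity and c_v coherence are the standard gensim
# diagnostics for an LdaModel.
from gensim.models import CoherenceModel

print('Perplexity: ', model_lda.log_perplexity(corpus))  # lower is better
coherence_model = CoherenceModel(model=model_lda, texts=texts,
                                 dictionary=id_to_word, coherence='c_v')
print('Coherence Score: ', coherence_model.get_coherence())  # higher is better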
import jieba
from gensim import corpora
from gensim.models.ldamodel import LdaModel

# Segment the negative reviews with jieba.
neg_segments = []
for each_review in neg_review_list:
    neg_segments.append(' '.join(jieba.cut(each_review)))

# Delete stop words.
for i in range(len(neg_segments)):
    for should_delete_word in stop_words:
        if should_delete_word in neg_segments[i]:
            neg_segments[i] = neg_segments[i].replace(should_delete_word, "")

# Feed the data into word2vec.
with open("neg_reviews_segmented.txt", 'w') as file:
    for seg_review in neg_segments:
        file.write(seg_review)

# Feed into the model: positive reviews first.
pos_dict = corpora.Dictionary([segment.split() for segment in pos_segments])
pos_corpus = [pos_dict.doc2bow(tokens) for tokens in [segment.split() for segment in pos_segments]]
pos_lda_model = LdaModel(pos_corpus, num_topics=10, id2word=pos_dict)
print("positive topics:")
for i in range(10):
    print(pos_lda_model.print_topic(i))
    print("__________________________")

# Negative reviews.
print("negative topics:")
neg_dict = corpora.Dictionary([segment.split() for segment in neg_segments])
neg_corpus = [neg_dict.doc2bow(tokens) for tokens in [segment.split() for segment in neg_segments]]
neg_lda_model = LdaModel(neg_corpus, num_topics=10, id2word=neg_dict)
for i in range(10):
    print(neg_lda_model.print_topic(i))
    print("__________________________")
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# clean_text() is a preprocessing helper defined elsewhere.
df = pd.read_csv("./total_info.txt", sep=',', header=0,
                 names=['A', 'B', 'C', 'D', 'E', 'F'])
data = df['B']
data = data.apply(lambda s: clean_text(s))
datalist = data.values
print(datalist)

# Tokenize.
texts = [[word for word in doc.lower().split()] for doc in datalist]
print(texts[0])

common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=20)
print(lda.print_topic(10, topn=5))

lda.save('lda.model')
lda = LdaModel.load('lda.model')

tryTxt = "while i be suffer i be able to press and go in subscribe but when i press the video it keep show no connection ."
trylist = [word for word in tryTxt.lower().split()]
bow = common_dictionary.doc2bow(trylist)
print(lda.get_document_topics(bow))

import pyLDAvis.gensim

# Opens the visualization in a browser at http://127.0.0.1:8888/.
vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
pyLDAvis.show(vis)
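# If a blocking local server is not wanted, the same visualization can be
# written to a standalone HTML file instead of calling pyLDAvis.show();
# the output filename here is just an example.
pyLDAvis.save_html(vis, 'lda_vis.html')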
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Build the LDA model.
print('Training LDA model...')
model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 num_topics=options.num_topics,
                 random_state=100,
                 update_every=1,
                 chunksize=100,
                 passes=options.iterations,
                 alpha='auto',
                 per_word_topics=True)
print('...done')

print('Saving model...')
model.save(model_path)
print('...done')

print('Topics found:')
for i in range(options.num_topics):
    print(i, ' -> ', model.print_topic(i))

doc_lda = model[corpus]

# Compute perplexity, a measure of how good the model is (lower is better).
print('Perplexity: ', model.log_perplexity(corpus))

# Compute the coherence score.
coherence_model_lda = CoherenceModel(model=model, texts=texts,
                                     dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
import pprint
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim

# words_list = []
# for i in tweets[list(tweets.keys())[2]].split("|||"):
#     words = [word for word in nltk.word_tokenize(i)
#              if word not in STOPWORDS and word.isalnum() and len(word) >= 2]
#     words_list.append(words)

num_topics = 3
dictionary = corpora.Dictionary(words_list)
corpus = [dictionary.doc2bow(words) for words in words_list]
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

# Output 1: topics and their corresponding words.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=10))

# Output 2: two ways of showing one topic and its corresponding words.
lda.print_topic(topicno=0)
lda.show_topic(1)

# Output 3: topic distribution of one user (works for a new user too).
sorted(lda.get_document_topics(corpus[100], minimum_probability=0, per_word_topics=False),
       key=lambda x: x[1], reverse=True)

# Output 4: visualize the LDA model.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, R=15, sort_topics=False)
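# The prepared `lda_display` object above is never rendered; in a Jupyter
# notebook it can be displayed inline, or saved to a standalone HTML file from
# a plain script (the filename below is an example). Both calls are part of
# the pyLDAvis API.
import pyLDAvis

pyLDAvis.display(lda_display)                       # inline in a notebook
pyLDAvis.save_html(lda_display, 'lda_tweets.html')  # standalone HTML file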
from gensim import corpora

dictionary = corpora.Dictionary([a[1] for a in articles])
corpus = [dictionary.doc2bow(a[1]) for a in articles]
print(corpus[0])

from gensim.models.ldamodel import LdaModel

nr_topics = 5
ldamodel = LdaModel(corpus, num_topics=nr_topics, id2word=dictionary, passes=20)
print(ldamodel.print_topics())

# Show topics by their top-3 terms.
for t in range(nr_topics):
    print(ldamodel.print_topic(t, topn=3))

# Show some random articles.
from random import shuffle

idx = list(range(len(articles)))
shuffle(idx)
for a in idx[:3]:
    article = articles[a]
    print('==========================')
    print(article[0])
    # Pick the article's most probable topic (the returned list is not
    # sorted by probability).
    prediction = max(ldamodel[corpus[a]], key=lambda tp: tp[1])
    print(ldamodel.print_topic(prediction[0], topn=3))
    print('Probability:', prediction[1])