def main(): lda0 = models.LdaModel.load('./timewindow_in3/_1999-2000lda_model') lda1 = models.LdaModel.load('./timewindow_in3/_2000-2001-2002lda_model') lda2 = models.LdaModel.load('./timewindow_in3/_2002-2003-2004lda_model') lda3 = models.LdaModel.load('./timewindow_in3/_2004-2005-2006lda_model') lda4 = models.LdaModel.load('./timewindow_in3/_2006-2007-2008lda_model') lda5 = models.LdaModel.load('./timewindow_in3/_2008-2009-2010lda_model') lda6 = models.LdaModel.load('./timewindow_in3/_2010-2011-2012lda_model') lda7 = models.LdaModel.load('./timewindow_in3/_2012-2013-2014lda_model') lda8 = models.LdaModel.load('./timewindow_in3/_2014-2015-2016lda_model') lda9 = models.LdaModel.load('./timewindow_in3/_2016-2017lda_model') LDA_list = [lda0, lda1, lda2, lda3, lda4, lda5, lda6, lda7, lda8, lda9] corpus_exam0 = corpora.BleiCorpus("./timewindow_in3/corpus_1999-2000.blei") corpus_exam1 = corpora.BleiCorpus( "./timewindow_in3/corpus_2000-2001-2002.blei") corpus_exam2 = corpora.BleiCorpus( "./timewindow_in3/corpus_2002-2003-2004.blei") corpus_exam3 = corpora.BleiCorpus( "./timewindow_in3/corpus_2004-2005-2006.blei") corpus_exam4 = corpora.BleiCorpus( "./timewindow_in3/corpus_2006-2007-2008.blei") corpus_exam5 = corpora.BleiCorpus( "./timewindow_in3/corpus_2008-2009-2010.blei") corpus_exam6 = corpora.BleiCorpus( "./timewindow_in3/corpus_2010-2011-2012.blei") corpus_exam7 = corpora.BleiCorpus( "./timewindow_in3/corpus_2012-2013-2014.blei") corpus_exam8 = corpora.BleiCorpus( "./timewindow_in3/corpus_2014-2015-2016.blei") corpus_exam9 = corpora.BleiCorpus("./timewindow_in3/corpus_2016-2017.blei") corpus_list = [ corpus_exam0, corpus_exam1, corpus_exam2, corpus_exam3, corpus_exam4, corpus_exam5, corpus_exam6, corpus_exam7, corpus_exam8, corpus_exam9 ] # for (lda, corpus) in zip(LDA_list, corpus_list): # # print num_doc_per_topic(corpus, lda) # print lda # print doc_topic_mat(corpus, lda) list_all = [] for i in range(10): # print i list_t = doc_topic_mat(corpus_list[i], 
LDA_list[i]) # print list_t list_all.append(list_t) list_np = np.array(list_all) # print type(list_all), type(list_all[0]),type(list_all[0][0]) print "******************************************************"
def train(reviewDict, k):
    '''Feed reviews to LDA model using k topics.

    reviewDict maps review ids to dicts holding a "review_nouns" token
    list.  The token dictionary, the serialized corpus and the trained
    model are all written under ./lda/.  Returns the trained LdaModel.
    '''
    # Build the token dictionary once; the original constructed two
    # identical Dictionary objects from the same generator.
    id2word = corpora.Dictionary(
        reviewDict[review]["review_nouns"] for review in reviewDict)
    # id2word.filter_extremes(keep_n=10000)
    # id2word.compactify()
    id2word.save('lda/dictionary.dict')
    corpus = [
        id2word.doc2bow(reviewDict[review]["review_nouns"])
        for review in reviewDict
    ]
    corpora.BleiCorpus.serialize('lda/corpus.lda-c', corpus)
    corpus = corpora.BleiCorpus('lda/corpus.lda-c')
    # Generalized: any k now works (same file-name pattern as before);
    # previously only k in {25, 50} was handled and any other value raised
    # UnboundLocalError at the return statement.
    lda = gensim.models.LdaModel(corpus, num_topics=k, id2word=id2word)
    lda.save('lda/lda_%d_topics.lda' % k)
    return lda
def main():
    """Train a 25-topic LDA model on the review data and print its topics."""
    global REVIEW_DICT
    REVIEW_DICT = reviewData
    # Make sure the output folder for the model artifacts exists.
    if not os.path.exists('lda'):
        os.makedirs('lda')
    train(lemmatize(REVIEW_DICT), 25)  # K=25 topics
    WEIGHT_TOPIC = []
    # Reload the artifacts that train() just wrote.
    dictionary = corpora.Dictionary.load("lda/dictionary.dict")
    corpus = corpora.BleiCorpus("lda/corpus.lda-c")
    lda = LdaModel.load("lda/lda_25_topics.lda")
    # Dump every topic together with its word weights.
    for topicN, topicWeights in dict(lda.show_topics(num_topics=25)).items():
        print('Topic ' + str(topicN) + ' : \n' + str(topicWeights) + '\n')
def load_corpus(self, override_corpus=True): if self.corpus_format == 'svmlight': # Joachim's SVMlight format try: c = corpora.SvmLightCorpus(fname=(DOCUMENT_PATH + self.filename + '_corpus.' + self.corpus_format)) if override_corpus is True: self.save_corpus(c) except: c = corpora.SvmLightCorpus self.save_corpus(c) elif self.corpus_format == 'lda-c': # Blei's LDA-C format try: c = corpora.BleiCorpus(fname=(DOCUMENT_PATH + self.filename + '_corpus.' + self.corpus_format)) if override_corpus is True: self.save_corpus(c) except: c = corpora.BleiCorpus self.save_corpus(c) elif self.corpus_format == 'low': # GibbsLDA++ format try: c = corpora.LowCorpus(fname=(DOCUMENT_PATH + self.filename + '_corpus.' + self.corpus_format)) if override_corpus is True: self.save_corpus(c) except: c = corpora.LowCorpus self.save_corpus(c) else: # Default Market Matrix format try: c = corpora.MmCorpus(fname=(DOCUMENT_PATH + self.filename + '_corpus.' + self.corpus_format)) if override_corpus is True: self.save_corpus(c) except: c = corpora.MmCorpus self.save_corpus(c) return c
def main():
    """Run the full analysis pipeline over the three growth-stage models."""
    stages = ['budding', 'growing', 'mature']
    lda_list = [models.LdaModel.load('./period_static/_%slda_model' % s)
                for s in stages]
    corpus_list = [corpora.BleiCorpus('./period_static/corpus_%s.blei' % s)
                   for s in stages]
    cos_sim(lda_list)
    semantic_sim(lda_list)
    num_per_topic(lda_list, corpus_list)
    density(lda_list, corpus_list)
    print_topic(lda_list)
def displayTopics(): dictionary = corpora.Dictionary.load(dictionary_path) corpus = corpora.BleiCorpus(corpus_path) lda = LdaMulticore.load(lda_model_path) i = 0 for topic in lda.show_topics(lda_num_topics): print 'Topic #' + str(i) + ': ' + str(topic) i += 1
def run(lda_model_path, corpus_path, num_topics, id2word):
    """Train an LdaModel on the Blei-format corpus and persist it."""
    training_corpus = corpora.BleiCorpus(corpus_path)
    model = gensim.models.LdaModel(training_corpus,
                                   num_topics=num_topics,
                                   id2word=id2word)
    model.save(lda_model_path)
    return model
def load(): dictionary = corpora.Dictionary.load('./tmp/all_doucment.dict') corpus = corpora.BleiCorpus('./tmp/corpus.blei') lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, per_word_topics=True) print lda.id2word print '----------------------------------------------------' list = lda.get_document_topics(corpus, per_word_topics=True) pprint(list[2])
def run(lda_model_path, corpus_path, num_topics, id2word):
    u'''Train an LDA model on the given Blei-format corpus and save it.

    Uses 200 variational-inference iterations per chunk.
    '''
    bow_corpus = corpora.BleiCorpus(corpus_path)
    trained = gensim.models.LdaModel(bow_corpus,
                                     num_topics=num_topics,
                                     id2word=id2word,
                                     iterations=200)
    trained.save(lda_model_path)
    return trained
def run(lda_model_path, corpus_path, num_topics, id2word):
    """Train and persist an LDA model (10 passes, 500 iterations,
    perplexity evaluated every 10 updates)."""
    bow_corpus = corpora.BleiCorpus(corpus_path)
    trained = gensim.models.LdaModel(bow_corpus,
                                     num_topics=num_topics,
                                     id2word=id2word,
                                     passes=10,
                                     eval_every=10,
                                     iterations=500)
    trained.save(lda_model_path)
    return trained
def vis_lda(load_path, output_html=None): print "Visualizing the LDA" if load_path is None: load_path = "LDA_data" if output_html is None: output_html = "LDA_vis.html" corpus = corpora.BleiCorpus(load_path + "_corp") id2word = corpora.Dictionary.load_from_text(load_path + "_dic") lda = models.ldamodel.LdaModel.load(load_path + "_lda") vis = pyLDAvis.gensim.prepare(lda, corpus, id2word) pyLDAvis.save_html(vis, output_html)
def build_lda_mode(): # corpus is bag of words, which is the original feature corpus = corpora.BleiCorpus( './zhihu_dat/item.dat') # the bag of words feature of question data # build up lda model: using lda model, given a bag of words feature, return the topic feature, so the topic model is to reduce the dimension of the features of a document lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=10) # save the model to disk for future use(Given a document such as question, return the topic feature of the document) lda_model.save('./zhihu_dat/zhihu_10.lda') print 'Building complete'
def main(): lda0 = models.LdaModel.load('./timewindow_in3/_1999-2000lda_model') lda1 = models.LdaModel.load('./timewindow_in3/_2000-2001-2002lda_model') lda2 = models.LdaModel.load('./timewindow_in3/_2002-2003-2004lda_model') lda3 = models.LdaModel.load('./timewindow_in3/_2004-2005-2006lda_model') lda4 = models.LdaModel.load('./timewindow_in3/_2006-2007-2008lda_model') lda5 = models.LdaModel.load('./timewindow_in3/_2008-2009-2010lda_model') lda6 = models.LdaModel.load('./timewindow_in3/_2010-2011-2012lda_model') lda7 = models.LdaModel.load('./timewindow_in3/_2012-2013-2014lda_model') lda8 = models.LdaModel.load('./timewindow_in3/_2014-2015-2016lda_model') lda9 = models.LdaModel.load('./timewindow_in3/_2016-2017lda_model') LDA_list = [lda0, lda1, lda2, lda3, lda4, lda5, lda6, lda7, lda8, lda9] corpus_exam0 = corpora.BleiCorpus("./timewindow_in3/corpus_1999-2000.blei") corpus_exam1 = corpora.BleiCorpus( "./timewindow_in3/corpus_2000-2001-2002.blei") corpus_exam2 = corpora.BleiCorpus( "./timewindow_in3/corpus_2002-2003-2004.blei") corpus_exam3 = corpora.BleiCorpus( "./timewindow_in3/corpus_2004-2005-2006.blei") corpus_exam4 = corpora.BleiCorpus( "./timewindow_in3/corpus_2006-2007-2008.blei") corpus_exam5 = corpora.BleiCorpus( "./timewindow_in3/corpus_2008-2009-2010.blei") corpus_exam6 = corpora.BleiCorpus( "./timewindow_in3/corpus_2010-2011-2012.blei") corpus_exam7 = corpora.BleiCorpus( "./timewindow_in3/corpus_2012-2013-2014.blei") corpus_exam8 = corpora.BleiCorpus( "./timewindow_in3/corpus_2014-2015-2016.blei") corpus_exam9 = corpora.BleiCorpus("./timewindow_in3/corpus_2016-2017.blei") corpus_list = [ corpus_exam0, corpus_exam1, corpus_exam2, corpus_exam3, corpus_exam4, corpus_exam5, corpus_exam6, corpus_exam7, corpus_exam8, corpus_exam9 ] for (lda, corpus) in zip(LDA_list, corpus_list): print cal_density(corpus, lda)
def load_lda_corpus():
    """Load the previously serialized LDA-C corpus of stories.

    Each document in the returned corpus is a sequence of
    (topic_id, weight) pairs; topics with no mass in a document are
    simply absent from its list, so documents may have fewer pairs than
    there are topics.
    """
    return corpora.BleiCorpus("tmp/corpus_stories.lda-c")
def main():
    """Build an LDA model over the question/answer corpus and write, for
    each question, its best-matching answers to <corpus_name>.answers."""
    print('Starting up!')
    num_topics = 10
    num_question = 50
    num_answer = 50985
    corpus_name = '../data/ldac/deriv/sq50.txt'
    voc_name = '../data/voc/deriv/all.txt'
    corpus = corpora.BleiCorpus(corpus_name, voc_name)
    print('Corpus processed!')
    lda = models.ldamodel.LdaModel(corpus,
                                   num_topics=num_topics,
                                   chunksize=2000,
                                   decay=0.5,
                                   offset=1.0,
                                   passes=1,
                                   update_every=0,
                                   eval_every=10,
                                   iterations=20000,
                                   gamma_threshold=0.001)
    print('LDA applied to corpus!')
    print('=== Document-Topic Distributions ===')
    # Per-document topic distributions, materialized for compare_sq.
    dist_list = list(lda[corpus])
    answer_list, metric_list = compare_sq(dist_list, num_question,
                                          num_answer, num_topics)
    print('Answers compared!')
    # BUGFIX: removed the bare `answer_list` / `answers` expression
    # statements (no-ops) and wrapped the output file in `with` so the
    # handle is closed even on error; the inner loop bound is now
    # num_question instead of a hard-coded 50 (same value today).
    with open('{}{}'.format(corpus_name, '.answers'), 'w') as f:
        print('Answers for each question.')
        for i, answers in enumerate(answer_list):
            f.write('Question {}: '.format(i))
            for j in xrange(num_question):
                f.write('{} '.format(answers[j]))
            f.write('\n')
    print('Results written !')
def nbc(): choose_1 = random.randint(0, 25) # choose_2 = random.randint(0, 25) corpus1 = corpora.BleiCorpus('../corpus/corpus_{}.blei'.format(choose_1)) corpus1 = corpora.BleiCorpus('../corpus/corpus_0.blei') # corpus_2 = corpora.BleiCorpus('../corpus/corpus_{}.blei'.format(choose_1)) test_X = matutils.corpus2csc(corpus1).transpose() # 测试集 # print test_X.get_shape() label_list = read_label() test_y = label_list[(choose_1*20000):(choose_1+1)*20000] # 测试集标签 test_y = label_list[(0 * 20000):(0 + 1) * 20000] clf = MultinomialNB(alpha=0.01) for index in range(0, 25): corpus = corpora.BleiCorpus('../corpus/corpus_{}.blei'.format(index)) csi_matrix = matutils.corpus2csc(corpus).transpose() if csi_matrix.get_shape() ==(20000, 271884): print(csi_matrix.get_shape()) clf.partial_fit(csi_matrix, label_list[(index*20000):(index+1)*20000], classes=np.array([0, 1])) print("第{}次".format(index)) pre = clf.predict(test_X) totalScore(pre, test_y)
def run_lsi(dictionary_file, ldac_file, lsi_file, topics_file, num_topics, paper_ids): dictionary = corpora.Dictionary().load(dictionary_file) corpus_ldac = corpora.BleiCorpus(fname=ldac_file, fname_vocab=(ldac_file + '.vocab')) num_docs = len(corpus_ldac) ''' Writes the corpus-documents TFIDF values into a file ''' tfidf_mdl = models.TfidfModel(corpus_ldac) corpus_tfidf = tfidf_mdl[corpus_ldac] lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics) corpus_lsi = lsi[corpus_tfidf] lsi_matrix = np.zeros((num_docs, num_topics)) row_count = 0 for doc in corpus_lsi: for each_tuple in doc: lsi_matrix[row_count, int(each_tuple[0])] = float(each_tuple[1]) row_count += 1 with codecs.open(lsi_file, mode='w', encoding='utf-8') as fw: for i in range(0, num_docs): fw.write(str(paper_ids[i]) + u'|{') s = '' for j in range(0, num_topics): s += str(lsi_matrix[i, j]) + u',' fw.write(s.rstrip(u',') + u'}\n') # np.savetxt(lsi_file, lsi_matrix) print 'Number of documents: ', row_count topics = lsi.show_topics(num_topics=-1, num_words=50, log=False, formatted=False) with codecs.open(topics_file, mode='w', encoding='utf-8') as fw: fw.write(u'topic_id|topic_words\n') for i in range(0, num_topics): topic_words = u",".join(w[1] for w in topics[i]) fw.write(str(i + 1) + u"|{" + topic_words + u"}\n")
def handle(self, *args, **options): super(Command, self).handle(self, *args, **options) dictionary_path = "models/dictionary.dict" corpus_path = "models/corpus.lda-c" lda_num_topics = 50 lda_model_path = "models/lda_model_50_topics.lda" dictionary = corpora.Dictionary.load(dictionary_path) corpus = corpora.BleiCorpus(corpus_path) lda = LdaModel.load(lda_model_path) i = 0 for topic in lda.show_topics(num_topics=lda_num_topics): print '#' + str(i) + ': ' + str(topic) i += 1
def predTopics(review_text):
    """Return the LDA topic distribution for one raw review string."""
    tokens = review_text.lower().split()
    # Artifacts persisted by the training run.
    dictionary_path = "lda/dictionary.dict"
    corpus_path = "lda/corpus.lda-c"
    lda_model_path = "lda/lda_25_topics.lda"
    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaModel.load(lda_model_path)
    # Convert the tokens to bag-of-words ids and infer topics.
    return lda[dictionary.doc2bow(tokens)]
def run_lda(dictionary_file, ldac_file, theta_file, topics_file, num_topics, num_passes, paper_ids): dictionary = corpora.Dictionary().load(dictionary_file) corpus_ldac = corpora.BleiCorpus(fname=ldac_file, fname_vocab=(ldac_file + '.vocab')) num_docs = len(corpus_ldac) model = models.ldamodel.LdaModel(corpus_ldac, id2word=dictionary, num_topics=num_topics, passes=num_passes, update_every=0, alpha=1.0, eta=1.0, decay=0.0) # Creates the \theta matrix theta = model[corpus_ldac] theta_matrix = np.zeros((num_docs, num_topics)) dcount = 0 for theta_d in theta: for theta_dt in theta_d: theta_matrix[dcount, int(theta_dt[0])] = float(theta_dt[1]) dcount += 1 with codecs.open(theta_file, mode='w', encoding='utf-8') as fw: for i in range(0, num_docs): fw.write(str(paper_ids[i]) + u'|{') s = '' for j in range(0, num_topics): s += str(theta_matrix[i, j]) + u',' fw.write(s.rstrip(u',') + u'}\n') # np.savetxt(theta_file, theta_matrix) print 'Number of documents: ', dcount topics = model.show_topics(topics=-1, topn=50, log=False, formatted=False) with codecs.open(topics_file, mode='w', encoding='utf-8') as fw: fw.write(u'topic_id|topic_words\n') for i in range(0, num_topics): topic_words = u",".join(w[1] for w in topics[i]) fw.write(str(i + 1) + u"|{" + topic_words + u"}\n")
def build_model(dat, vocab, num_topics=100, alpha=None):
    """Load a Blei-format corpus and fit an LDA model on it.

    `dat` holds one document per line as "term_num term_id:term_freq ...";
    `vocab` holds one term per line, the line number being the implicit
    term id.  Returns (corpus, model).
    """
    if not path.exists(dat) or not path.exists(vocab):
        print('Error: Expected items to be present at ./datasets/ap/')
    # The corpus doubles as the id -> word mapping via corpus.id2word.
    blei_corpus = corpora.BleiCorpus(dat, vocab)
    lda = models.LdaModel(blei_corpus,
                          num_topics=num_topics,
                          alpha=alpha,
                          id2word=blei_corpus.id2word)
    return blei_corpus, lda
def main():
    """Run all topic-evolution analyses over the five 3-year windows."""
    names = ['2006-2007-2008', '2008-2009-2010', '2010-2011-2012',
             '2012-2013-2014', '2014-2015-2016']
    path = "Output/"
    lda_list = []
    corpus_list = []
    # Model and corpus are loaded per window, in window order.
    for window in names:
        lda_list.append(models.LdaModel.load('Corpus3/lda_model_' + window))
        corpus_list.append(
            corpora.BleiCorpus('Corpus3/corpus_' + window + '.blei'))
    num_per_topic(lda_list, corpus_list, path)
    density(lda_list, corpus_list, path)
    print_topic(lda_list, path)
    cos_sim(lda_list, path)
    semantic_sim(lda_list, path)
def load_corpus(self, directory=None): print("[%s] Load corpus ..." % self._name) name = self._name ext = self._model_type dir = "%s/" % (directory if directory else ".") self._wdict = corpora.Dictionary.load_from_text(dir + name + ".dict") if self._model_type == "lda": self._topic_model = models.LdaModel.load(dir + name + "." + ext) elif self._model_type == "lsi": self._topic_model = models.LsiModel.load(dir + name + "." + ext) elif self._model_type == "hdp": self._topic_model = models.HdpModel.load(dir + name + "." + ext) else: self._topic_model = NullModel.load(dir + name + "." + ext) self._corpus = corpora.BleiCorpus(self._corpus, dir + name + ".blei") self._index = similarities.MatrixSimilarity.load(dir + name + ".index") self._file_names = load_list(dir + name + ".file_names") self._topic_names = load_list(dir + name + ".topic_names")
def runLDA(corpusfile, dcyfile, num_topics, ind=-1):
    '''Do classical LDA on word matrix M using alpha, beta
    Plot the results
    '''
    print("Running Vanilla LDA on current M")
    dcy = corpora.Dictionary.load(dcyfile)
    print(dcy)
    # When a word index is supplied, report which token it maps to.
    if ind > 0:
        for token, token_id in dcy.token2id.items():
            if token_id == int(ind):
                print('Word to insert: ' + token)
                break
    corpus = corpora.BleiCorpus(corpusfile)
    # TF-IDF weighting is disabled: the raw counts feed the model directly.
    tfidf_corpus = corpus
    lda = models.LdaModel(tfidf_corpus, id2word=dcy, num_topics=num_topics)
    print(lda.print_topics(num_topics, num_words=20))
    return 0
def get_lda_models(): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) input_file = "nostem_processed_normal_format_Homework2_data.csv" documents_list = [] for line in open(input_file, "r"): documents_list.append(line) texts = [[word for word in document.lower().split(" ")] for document in documents_list] dictionary = corpora.Dictionary(texts) dictionary.save('temp_twitter.dict') corpus = [dictionary.doc2bow(text) for text in texts] corpora.BleiCorpus.serialize('temp_twitter_lda.mm', corpus) print "Generate the dictionary and lda corpus" if (os.path.exists("temp_twitter.dict")): dictionary = corpora.Dictionary.load('temp_twitter.dict') corpus = corpora.BleiCorpus('temp_twitter_lda.mm') print("Used files generated from first tutorial") #tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model #corpus_tfidf = tfidf[corpus] #get the lda model lda_3_topics = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3) lda_4_topics = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4) lda_5_topics = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5) #lda.print_topics(num_topics=5, num_words=10) lda_3_topics.save("lda_3_topics_model") lda_4_topics.save("lda_4_topics_model") lda_5_topics.save("lda_5_topics_model")
import numpy as np # **Download data** # # <del>http://www.cs.princeton.edu/~blei/lda-c/ap.tgz</del> # # http://www.cs.columbia.edu/~blei/lda-c/ # # Unzip the data and put them into your folder, e.g., /Users/datalab/bigdata/ap/ # In[23]: # Load the data corpus = corpora.BleiCorpus('/Users/datalab/bigdata/ap/ap.dat', '/Users/datalab/bigdata/ap/vocab.txt') # **使用help命令理解corpora.BleiCorpus函数** # # > help(corpora.BleiCorpus) class BleiCorpus(gensim.corpora.indexedcorpus.IndexedCorpus) | Corpus in Blei's LDA-C format. | | The corpus is represented as two files: | one describing the documents, | and another describing the mapping between words and their ids. # In[24]: # 使用dir看一下有corpus有哪些子函数?
def getdata():
    """Return the sample AP corpus in Blei's LDA-C format."""
    return corpora.BleiCorpus('D:/Projects/ap/ap.dat',
                              'D:/Projects/ap/vocab.txt')
from gensim import corpora
from collections import defaultdict
from pprint import pprint
from gensim.models import ldamodel
import numpy as np
import pandas as pd
import os
import json
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform
from nltk.corpus import stopwords

# Load the persisted model/dictionary/corpus and compute the topic
# distribution of every document up front.
model = ldamodel.LdaModel.load("lda_output/reclamos_lda")
dictionary = corpora.Dictionary.load('corpus_output/corpus_dict.dict')
corpus = corpora.BleiCorpus('corpus_output/corpus.lda-c')
n_topic = 10
index = 0
predict = model[corpus]


def get_doc_topic_dists(predict):
    """Densify the per-document topic distributions into a
    (len(corpus), n_topic) matrix, save it to doc_topic.csv and return it.

    `predict` yields, per document, (topic_id, weight) pairs; topics
    absent from a document keep weight 0.
    """
    # BUGFIX: np.zeros instead of np.empty -- topics absent from a
    # document are never assigned below, so np.empty left uninitialized
    # garbage in those cells of the matrix (and in the CSV).
    doc_topic_dists = np.zeros([len(corpus), n_topic])
    index = 0
    for topics in predict:
        for topic in topics:
            doc_topic_dists[index][topic[0]] = topic[1]
        index += 1
    np.savetxt('doc_topic.csv', doc_topic_dists, delimiter=',')
    return doc_topic_dists
# NOTE(review): these two lines are the tail of an except clause whose
# `try:`/`except:` header lies before this chunk -- confirm against the
# full file before re-indenting.
print("Please install it")
raise

import matplotlib.pyplot as plt
import numpy as np
from os import path

NUM_TOPICS = 100

# Check that data exists
if not path.exists('./data/ap/ap.dat'):
    print('Error: Expected data to be present at data/ap/')
    print('Please cd into ./data & run ./download_ap.sh')

# Load the data
corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')

# Build the topic model
model = models.ldamodel.LdaModel(corpus,
                                 num_topics=NUM_TOPICS,
                                 id2word=corpus.id2word,
                                 alpha=None)

# BUGFIX: the original re-opened topics.txt in 'w' mode inside the loop,
# truncating the file on every topic so only the last topic survived.
# Open it once and write all topics.
with open('topics.txt', 'w') as output:
    for ti in range(model.num_topics):
        words = model.show_topic(ti, 64)
        tf = sum(f for f, w in words)
        output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf))
                               for f, w in words))
        output.write("\n\n\n")
__author__ = 'askofen'

from gensim import corpora, models, similarities
import numpy as np
from scipy.spatial import distance
import matplotlib.pyplot as plt

corpus = corpora.BleiCorpus('../../Data/04/data/ap.dat',
                            '../../Data/04/data/vocab.txt')

# alpha = 1 -> more topics per document
model = models.ldamodel.LdaModel(corpus,
                                 num_topics=100,
                                 id2word=corpus.id2word)

# length of vocabulary dictionary
print(len(model.id2word))

# Topic distribution of every document in the corpus.
topics = [model[doc] for doc in corpus]
topicsProDocCount = [len(t) for t in topics]
plt.hist(topicsProDocCount, bins=15)
plt.title("Topics pro document histogram")

# Dense docs x topics matrix holding each document's topic weights.
dense = np.zeros((len(topics), 100), float)
for doc_idx, doc_topics in enumerate(topics):
    for topic_idx, weight in doc_topics:
        dense[doc_idx, topic_idx] = weight
#distance between all the rows in the matrix