def train_lda():
    """
    Usage: python Wechat_LDA.py wechat.csv
    """
    # Segment each document with jieba, drop stopwords, and append one
    # space-joined document per line.
    with open(sys.argv[1], 'r') as wx:
        for f in wx:
            seg = jieba.cut(f)
            seg = [word for word in seg if word not in stopwords]
            with codecs.open('wechat_seg.txt', encoding='utf-8', mode='a') as wx_seg:
                wx_seg.write(' '.join(seg) + '\n')

    # Pass the file path to LineSentence so the corpus can be iterated more
    # than once (a file handle would be exhausted after the first pass).
    dictionary = corpora.Dictionary(LineSentence('wechat_seg.txt'))
    corpus = [dictionary.doc2bow(text) for text in LineSentence('wechat_seg.txt')]

    tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
    tfidf_model.save('wechat_seg.txt.tfidf_model')
    # corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])

    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100,
                             workers=cpu_count() - 1)
    lda_model.save('wechat_lda_model.pkl')

    # Count how often each topic occurs across the corpus and write out the
    # top 64 words of the most frequent topic.
    topics = [lda_model[doc] for doc in corpus]
    counts = np.zeros(100)
    for top_doc in topics:
        for ti, _ in top_doc:
            counts[ti] += 1
    words = lda_model.show_topic(counts.argmax(), 64)
    with open('top_words.txt', 'w') as tw:
        writer = UnicodeWriter(tw)
        for w in words:
            writer.writerow((w[0], int(float(w[1]) * 1000)))
def createLDA(self, fileName='', modelName='', ldaPasses='', topicNum=''):
    '''
    fileName -> file for the dictionary (.dict) and corpus (.mm) files
    modelName -> model name for LDA to save to disk
    ldaPasses -> number of passes, 10 by default
    topicNum -> number of topics to generate, 100 by default
    '''
    if fileName == '':
        fileName = self.__fileName
    if ldaPasses == '':
        ldaPasses = self.__ldaPasses
    if topicNum == '':
        topicNum = self.__topicNum
    if modelName == '':
        modelName = fileName + '_' + str(ldaPasses) + 'P_' + str(topicNum) + 'T'

    # Avoid shadowing the built-in dict().
    dictionary = corpora.Dictionary.load(self.__destination + fileName + '.dict')
    mm = corpora.MmCorpus(self.__destination + fileName + '.mm')
    # lda = models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=6,
    #                                update_every=1, chunksize=10000, passes=10)
    lda = LdaMulticore(corpus=mm, num_topics=topicNum, id2word=dictionary,
                       chunksize=30000, passes=ldaPasses, workers=3)
    lda.save(self.__destination + modelName + '.lda')
    print('Created LDA model %s' % modelName)
def _build_lda(self, name, corpus, num_topics=30, words_to_save=200, multicore=True):
    from gensim.models import LdaMulticore, LdaModel
    gdict = self.gensim_dictionary

    if multicore:
        lda = LdaMulticore(corpus=corpus, num_topics=num_topics,
                           workers=3, id2word=gdict)
    else:
        lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=gdict)

    model = TopicModel(name=name, dictionary=self)
    model.save()

    topics = []
    for i in range(num_topics):
        topic = lda.show_topic(i, topn=words_to_save)
        alpha = lda.alpha[i]

        topicm = Topic(model=model, name="?", alpha=alpha, index=i)
        topicm.save()
        topics.append(topicm)

        words = []
        # In current gensim, show_topic() returns (word, probability) pairs.
        for word_text, prob in topic:
            word_index = gdict.token2id[word_text]
            word_id = self.get_word_id(word_index)
            tw = TopicWord(topic=topicm, word_id=word_id,
                           word_index=word_index, probability=prob)
            words.append(tw)
        TopicWord.objects.bulk_create(words)

        # Name the topic after its three most probable words.
        most_likely_word_scores = topicm.word_scores \
            .order_by('-probability') \
            .prefetch_related('word')
        topicm.name = ', '.join(score.word.text
                                for score in most_likely_word_scores[:3])
        topicm.save()

        if settings.DEBUG:
            # prevent memory leaks
            from django.db import connection
            connection.queries = []

    model.save_to_file(lda)
    return (model, lda)
def _build_lda(self, name, corpus, num_topics=30, words_to_save=200):
    from gensim.models import LdaMulticore
    gdict = self.gensim_dictionary

    lda = LdaMulticore(corpus=corpus, num_topics=num_topics,
                       workers=3, id2word=gdict)

    model = TopicModel(name=name, dictionary=self)
    model.save()

    topics = []
    for i in range(num_topics):
        topic = lda.show_topic(i, topn=words_to_save)
        alpha = lda.alpha[i]

        topicm = Topic(model=model, name="?", alpha=alpha, index=i)
        topicm.save()
        topics.append(topicm)

        words = []
        # In current gensim, show_topic() returns (word, probability) pairs.
        for word_text, prob in topic:
            word_index = gdict.token2id[word_text]
            word_id = self.get_word_id(word_index)
            tw = TopicWord(topic=topicm, word_id=word_id,
                           word_index=word_index, probability=prob)
            words.append(tw)
        TopicWord.objects.bulk_create(words)

        if settings.DEBUG:
            # prevent memory leaks
            from django.db import connection
            connection.queries = []

    model.save_to_file(lda)
    return (model, lda)
def load_from_file(self):
    from gensim.models import LdaMulticore
    return LdaMulticore.load("lda_out_%d.model" % self.id)
### Create BOW corpus ###
corpus = [dictionary.doc2bow(text) for text in text_list]
print("--- Corpus made: %s minutes ---" % round(((time.time() - start_time) / 60), 2))

start_lda_time = time.time()

#################################
######### Train LDA #############
#################################
lda_model = LdaMulticore(corpus, num_topics=4, id2word=dictionary,
                         passes=150, workers=3)
final_topics = lda_model.show_topics()
print("--- LDA trained: %s minutes ---" % round(((time.time() - start_lda_time) / 60), 2))

#################################
##### Display WordCloud #########
#################################
curr_topic = 0
wc = WordCloud(background_color="black", max_words=2000, max_font_size=40,
               width=120, height=120, random_state=42)
for line in final_topics:
    # show_topics() returns (topic_id, formatted_string) pairs; parse the
    # per-word scores out of the '0.123*"word" + ...' string.
    line = line[1]
    scores = [float(x.split("*")[0]) for x in line.split(" + ")]
    words = [x.split("*")[1] for x in line.split(" + ")]
    freqs = []
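# The string parsing above is fragile. A minimal alternative sketch, not part
# of the original snippet: ask gensim for raw (word, weight) pairs with
# formatted=False and feed them straight into the word cloud. Assumes the
# `lda_model` and `wc` objects defined above; the matplotlib display calls
# are illustrative.
import matplotlib.pyplot as plt

for topic_id, word_weights in lda_model.show_topics(num_topics=4, formatted=False):
    freqs = dict(word_weights)          # {word: weight} for this topic
    wc.generate_from_frequencies(freqs)
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.title("Topic #%d" % topic_id)
plt.show()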
from gensim import corpora, models, similarities
import time
import numpy as np
from gensim.models import LdaMulticore as LdaModel
import argparse

if __name__ == '__main__':
    lda_filename = 'lda/samsung.lda'
    corpus = corpora.MmCorpus('lda/samsung1.mm')
    dictionary = corpora.Dictionary.load('lda/samsung1.dict')

    start = time.time()
    lda = LdaModel(corpus, num_topics=10, alpha=1. / 100, eta=.2,
                   chunksize=10000, workers=11, passes=10, decay=0.75,
                   id2word=dictionary)
    print('Saving model')
    end = time.time()
    elapsed = end - start
    print("Time taken for LDA training: %s seconds." % elapsed)
    lda.print_topics()
    lda.save(lda_filename)
    print("lda saved in %s" % lda_filename)
bow_corpus = [dict_doc.doc2bow(x) for x in process_doc]
# print(bow_corpus)
# lda_model = LdaMulticore(bow_corpus, num_topics=8, id2word=dict_doc, workers=2, passes=8)
# lda_model = LdaMulticore(bow_corpus)
# print(lda_model)

# For an unseen document:
unseen_doc = "Javascript add row and calculate multiple rows from html"
bow_vec = dict_doc.doc2bow(preprocess(unseen_doc))
# print(bow_vec)

if __name__ == '__main__':
    multiprocessing.freeze_support()  # needed on Windows
    lda_model = LdaMulticore(corpus=bow_corpus, id2word=dict_doc, num_topics=4)
    for idx, topic in lda_model.print_topics(-1):
        print("Topic: {} \nWords: {}".format(idx, topic))
        print("\n")
    # print("*******************************************")
    # for index, score in sorted(lda_model[bow_vec], key=lambda tup: -1 * tup[1]):
    #     print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    print(lda_model[bow_vec])
    output = lda_model[bow_vec]
    b = sorted(output, key=lambda x: x[1], reverse=True)
    print(b[0][0])  # id of the most probable topic for the unseen document
# remove numbers
regex = re.compile("[0-9]")
no_numbers = [i for i in clean if not regex.search(i)]

# remove punctuation
no_numbers = [''.join(char for char in s if char not in string.punctuation)
              for s in no_numbers]

# tokenization of the pre-cleaned text
tokenized_sents = [word_tokenize(i) for i in no_numbers]
print(tokenized_sents)

# TOPIC MODELLING
# create dictionary
dictionary = corpora.Dictionary(tokenized_sents)
print(dictionary)

# create corpus
corpus = [dictionary.doc2bow(word) for word in tokenized_sents]
print(corpus)

# train the LDA model
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=8)
lda_model.save('lda_model.model')
print(lda_model.print_topics(-1))

# create an LSI topic model
lsi_model = LsiModel(corpus=corpus, id2word=dictionary, num_topics=8, decay=0.5)
pprint(lsi_model.print_topics(-1))
class GensimLDAVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, num_topics, return_dense=True, max_df=0.5, min_df=5,
                 **lda_params):
        """
        :param num_topics: number of topics for the LDA model
        :param return_dense: whether transform() returns a dense array
        :param max_df: maximum word document frequency, as a fraction of the
            corpus (passed to Dictionary.filter_extremes as no_above)
        :param min_df: minimum word document frequency, as an absolute
            document count (passed as no_below)
        :param lda_params: parameters for the constructor of
            gensim.models.LdaMulticore
        """
        super().__init__()
        self.lda: LdaMulticore = None
        self.corpus = None
        self.lda_params = lda_params
        self.lda_params["num_topics"] = num_topics
        self.is_dense = return_dense
        self.max_df = max_df
        self.min_df = min_df

    def fit(self, docs):
        """
        :param docs: List of split strings.
        :return: GensimLDAVectorizer
        """
        id2word = Dictionary(docs)
        id2word.filter_extremes(self.min_df, self.max_df)
        self.corpus = [id2word.doc2bow(d) for d in docs]
        self.lda = LdaMulticore(corpus=self.corpus, id2word=id2word,
                                **self.lda_params)
        return self

    def transform(self, docs):
        """
        :param docs: List of split strings.
        :return: numpy.ndarray
        """
        cur_bow = [self.lda.id2word.doc2bow(d) for d in docs]
        lda_bag_of_topics = [self.lda[c] for c in cur_bow]
        num_terms = self.lda.num_topics
        return corpus2dense(lda_bag_of_topics, num_terms).T if self.is_dense \
            else corpus2csc(lda_bag_of_topics, num_terms).T

    def fit_transform(self, docs, y=None, **fit_params):
        return self.fit(docs).transform(docs)

    def evaluate_coherence(self, docs, coherence="c_v"):
        """
        :param docs: List[List[str]]
        :param coherence: one of the coherence methods stated in
            gensim.models.CoherenceModel
        :return: gensim.models.CoherenceModel
        """
        return CoherenceModel(model=self.lda, texts=docs, corpus=self.corpus,
                              coherence=coherence,
                              processes=self.lda_params.get("workers", -1))

    def save(self, fname, *args, **kwargs):
        self.lda.save(fname, *args, **kwargs)

    @classmethod
    def load(cls, fname, return_dense=True, max_df=0.5, min_df=5,
             *args, **kwargs):
        lda = LdaMulticore.load(fname, *args, **kwargs)
        # Rebuild the vectorizer around the loaded model instead of
        # constructing a fresh, untrained LdaMulticore.
        vectorizer = cls(lda.num_topics, return_dense=return_dense,
                         max_df=max_df, min_df=min_df,
                         alpha=lda.alpha, eta=lda.eta,
                         iterations=lda.iterations,
                         random_state=lda.random_state,
                         workers=lda.workers)
        vectorizer.lda = lda
        return vectorizer
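# A minimal usage sketch for the vectorizer above, not part of the original
# class. The toy `docs`, the loose max_df/min_df limits, and the LDA
# parameters are all illustrative assumptions; any tokenized corpus works.
docs = [["human", "machine", "interface", "survey"],
        ["graph", "minors", "trees", "survey"],
        ["machine", "learning", "graph", "models"]]

vec = GensimLDAVectorizer(num_topics=2, return_dense=True,
                          max_df=0.9, min_df=1,  # loose limits for a toy corpus
                          workers=2, passes=5)
X = vec.fit_transform(docs)  # ndarray of shape (n_docs, num_topics)
print(X.shape)

cm = vec.evaluate_coherence(docs)  # gensim CoherenceModel
print(cm.get_coherence())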
news_train = fetch_20newsgroups(subset='train')

# Tokenization and lemmatization
wnl = WordNetLemmatizer()
news_train_lemma = [
    tokenize_lemmatize(article, wnl.lemmatize) for article in news_train.data
]

# Build a gensim corpora structure
dict_train = Dictionary(news_train_lemma)
mmCorpus_train = [dict_train.doc2bow(article) for article in news_train_lemma]

# Latent Semantic Analysis
lsi_train = LsiModel(corpus=mmCorpus_train, num_topics=40, id2word=dict_train)
for i in range(40):
    print('topic' + str(i) + ' :')
    print(lsi_train.print_topic(i))

# Latent Dirichlet Allocation
lda_train = LdaMulticore(corpus=mmCorpus_train, num_topics=40,
                         id2word=dict_train, workers=5)
for i in range(40):
    print('topic' + str(i) + ' :')
    print(lda_train.print_topic(i))
def label_documents(documents: List, LVL, SET):
    OPTIMAL_TOPICS = 10
    PREPROCESSINGs = ["lemmed"]
    for PREPROCESSING in PREPROCESSINGs:
        # texts = clean_documents([str(doc) for doc in documents], PREPROCESSING)
        # pickle.dump(texts, open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "wb"))
        texts = pickle.load(
            open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "rb"))

        dictionary = Dictionary(texts)
        print('Number of unique tokens: %d' % len(dictionary))
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        print('Number of filtered unique tokens: %d' % len(dictionary))
        logger.info("Clean complete")
        logger.info("Dictionary complete")

        corpus = [dictionary.doc2bow(text) for text in texts]
        logger.info("Corpus complete")

        # model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=1000,
        #                      num_topics=OPTIMAL_TOPICS, workers=3)
        # model.save("{}_{}_1000_model_cv_coherence_{}".format(LVL, OPTIMAL_TOPICS, PREPROCESSING))
        model = LdaMulticore.load("{}_{}_1000_model_cv_coherence_{}".format(
            LVL, OPTIMAL_TOPICS, PREPROCESSING))

        coherence_model = CoherenceModel(model=model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')
        print("Coherence {}".format(coherence_model.get_coherence()))

        # Assign each document its most probable topic.
        topics = []
        for document in corpus:
            topic = model.get_document_topics(document)
            topics.append(topic)

        with open("topics_results_{}_{}_{}.csv".format(SET, LVL, PREPROCESSING),
                  "wt", encoding="utf8", newline="") as outf:
            writer = csv.writer(outf)
            for document_topics in topics:
                sorted_topics = sorted(document_topics, key=lambda x: -x[1])
                if sorted_topics:
                    best = [sorted_topics[0][0]]
                else:
                    best = [-1]
                    print("NONE ERROR")
                writer.writerow(best)

        # Print topics and their words, and save the keywords per topic.
        x = model.show_topics(num_topics=OPTIMAL_TOPICS, num_words=10, formatted=True)
        with open("topics_keywords_{}_{}_{}".format(SET, LVL, PREPROCESSING),
                  "wt", encoding="utf8", newline="") as outf:
            writer = csv.writer(outf)
            for t in x:
                topic = t[0]
                words = [(word_score.split("*")[1].strip()[1:-1],
                          float(word_score.split("*")[0].strip()))
                         for word_score in t[1].split("+")]
                sort_words = sorted(words, key=lambda z: z[1])
                print(str(topic) + " " + str(sort_words))
                words = [w for w, s in sort_words]
                score = [s for w, s in sort_words]
                topics_word_bar(words, score, t[0])
                writer.writerow([topic] + [sort_words])
    return
# +
# Fit a single version of the LDA model.
num_topics = 10
chunksize = 5000
passes = 4
iterations = 200
eval_every = 1  # Evaluate model perplexity every update (slow, but shows convergence)

# Accessing an item "loads" the dictionary's lazily built id2token mapping,
# which would otherwise still be empty here.
temp = dictionary[0]
id2word = dictionary.id2token

lda_model = LdaMulticore(
    corpus=tfidf_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    workers=4  # Use all four cores
)

top_topics = lda_model.top_topics(tfidf_corpus)
pprint(top_topics)
# -

# Gensim calculates the [intrinsic coherence score](http://qpleple.com/topic-coherence-to-evaluate-topic-models/)
# for each topic. By averaging across all of the topics in the model, you can get an average coherence score.
# Coherence is a measure of the strength of the association between words in a topic cluster. It is supposed
# to be an objective way to evaluate the quality of the topic clusters. Higher scores are better.
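# The averaging described above is a one-liner over the `top_topics` output
# from the cell before, where each entry is a (topic, coherence) pair. A
# minimal sketch:

# +
# Average the per-topic coherence scores (u_mass by default for top_topics).
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
# -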