def train_model(texts, **kwargs):
    """Build (or load) a corpus and fit an LDA topic model on it.

    Args:
        texts: Raw documents handed to ``process_texts``. Not used when
            ``use_pickle`` is true, because the corpus is read from disk.
        **kwargs: Optional settings; unrecognised keys are ignored.
            filter_stopwords (default True): forwarded to ``process_texts``.
            normalizer (default 'porter'): forwarded to ``process_texts``.
            min_freq (default 2): forwarded to ``process_texts``.
            tfidf (default True): re-weight the corpus with ``TfidfModel``
                before fitting.
            num_topics (default 20): forwarded to ``LdaModel`` and used as
                the number of topics shown in the report.
            use_pickle (default True): load ``pickles/corpus.p`` and
                ``pickles/id2word.p`` instead of processing ``texts``.
            update_pickle (default True): after processing ``texts``,
                overwrite the two pickle files. Only consulted when
                ``use_pickle`` is false.
            report (default True): print the bound and the topics.
            distributed (default False): forwarded to ``LdaModel``.

    Returns:
        tuple: ``(lda, corpus, id2word)``. ``corpus`` is the tf-idf
        transformed corpus when ``tfidf`` is true.
    """
    # parse args
    filter_stopwords = kwargs.get('filter_stopwords', True)
    normalizer = kwargs.get('normalizer', 'porter')
    use_tfidf = kwargs.get('tfidf', True)
    num_topics = kwargs.get('num_topics', 20)
    min_freq = kwargs.get('min_freq', 2)
    use_pickle = kwargs.get('use_pickle', True)
    update_pickle = kwargs.get('update_pickle', True)
    report = kwargs.get('report', True)
    distributed = kwargs.get('distributed', False)

    # Every print below takes exactly one pre-formatted string so that the
    # output is the same whether print is a statement or a function.

    # build corpus or read it in from pickle
    if use_pickle:
        print("INFO: loading pickled corpus and word hash")
        # SECURITY: unpickling can execute arbitrary code; these files must
        # come from a trusted source.
        with open("pickles/corpus.p", "rb") as fh:
            corpus = pickle.load(fh)
        with open("pickles/id2word.p", "rb") as fh:
            id2word = pickle.load(fh)
    else:
        print("INFO: processing text and building corpus...")
        corpus, id2word = process_texts(
            texts=texts,
            filter_stopwords=filter_stopwords,
            normalizer=normalizer,
            min_freq=min_freq,
        )
        if update_pickle:
            # pickle files; `with` guarantees the data is flushed and the
            # handles are closed even if dumping fails part-way.
            print("INFO: updating pickled coprus and word hash")
            with open("pickles/corpus.p", "wb") as fh:
                pickle.dump(corpus, fh)
            with open("pickles/id2word.p", "wb") as fh:
                pickle.dump(id2word, fh)

    # optional tfidf transformation
    if use_tfidf:
        print("INFO: applying tfidf transformation...")
        tfidf_model = TfidfModel(corpus)
        corpus = tfidf_model[corpus]

    # fit model
    print("INFO: fitting model...")
    lda = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        distributed=distributed,
    )

    # report
    if report:
        # NOTE(review): this is the value of lda.bound(), printed under the
        # label "perplexity"; gensim documents bound() as a variational
        # bound, not a perplexity -- confirm the label is what is wanted.
        perplexity = lda.bound(corpus)
        print("RESULTS:")
        print("\nperplexity:  %s \n" % (perplexity,))
        topics = lda.show_topics(num_topics)
        for i, t in enumerate(topics):
            print("topic %d:" % i)
            print(t)

    return lda, corpus, id2word
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: unpickling can execute arbitrary code; only read files that
    # this project wrote itself.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _save_pickle(obj, path):
    """Pickle ``obj`` into ``path``, closing the file when done."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def train_model(texts, **kwargs):
    """Fit an LDA model on a corpus built from ``texts`` or loaded from disk.

    Keyword options (all optional, unknown keys are ignored):
        filter_stopwords (True), normalizer ('porter'), min_freq (2):
            passed through to ``process_texts``.
        tfidf (True): apply a ``TfidfModel`` transformation before fitting.
        num_topics (20): number of LDA topics; also the number reported.
        use_pickle (True): read ``pickles/corpus.p`` and
            ``pickles/id2word.p`` instead of processing ``texts``
            (``texts`` is then unused).
        update_pickle (True): rewrite those two files after processing
            ``texts``; has no effect when ``use_pickle`` is true.
        report (True): print the bound and every topic.
        distributed (False): passed through to ``LdaModel``.

    Returns:
        tuple: ``(lda, corpus, id2word)``; ``corpus`` is the transformed
        corpus when the tf-idf step ran.
    """
    # parse args
    filter_stopwords = kwargs.get('filter_stopwords', True)
    normalizer = kwargs.get('normalizer', 'porter')
    apply_tfidf = kwargs.get('tfidf', True)
    num_topics = kwargs.get('num_topics', 20)
    min_freq = kwargs.get('min_freq', 2)
    use_pickle = kwargs.get('use_pickle', True)
    update_pickle = kwargs.get('update_pickle', True)
    report = kwargs.get('report', True)
    distributed = kwargs.get('distributed', False)

    # Prints use a single pre-formatted string argument, which behaves the
    # same as a print statement and as a print function call.

    # build corpus or read it in from pickle
    if use_pickle:
        print("INFO: loading pickled corpus and word hash")
        corpus = _load_pickle("pickles/corpus.p")
        id2word = _load_pickle("pickles/id2word.p")
    else:
        print("INFO: processing text and building corpus...")
        corpus, id2word = process_texts(texts=texts,
                                        filter_stopwords=filter_stopwords,
                                        normalizer=normalizer,
                                        min_freq=min_freq)
        if update_pickle:
            # pickle files
            print("INFO: updating pickled coprus and word hash")
            _save_pickle(corpus, "pickles/corpus.p")
            _save_pickle(id2word, "pickles/id2word.p")

    # optional tfidf transformation
    if apply_tfidf:
        print("INFO: applying tfidf transformation...")
        weighting = TfidfModel(corpus)
        corpus = weighting[corpus]

    # fit model
    print("INFO: fitting model...")
    lda = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=num_topics,
                   distributed=distributed)

    # report
    if report:
        # NOTE(review): the number printed as "perplexity" is the return
        # value of lda.bound(corpus) -- confirm that label is intended.
        perplexity = lda.bound(corpus)
        print("RESULTS:")
        print("\nperplexity:  %s \n" % (perplexity,))
        for i, t in enumerate(lda.show_topics(num_topics)):
            print("topic %d:" % i)
            print(t)

    return lda, corpus, id2word
def ldamodel(doc_clean, n_topics, n_words, description, tfidfmodel=False, unseen_docs=None):
    """Train an LDA model, print diagnostics and save the artefacts to disk.

    Args:
        doc_clean: Iterable of documents. Each one is passed through
            ``min_char`` and split on whitespace to obtain its tokens.
        n_topics: Number of topics to fit, print and hand to ``print_result``.
        n_words: Number of top terms printed per topic.
        description: Prefix of the output files ``<description>.pkl``,
            ``<description>dictionary.gensim`` and
            ``<description>_ldamodel.gensim``.
        tfidfmodel: When true, re-weight the bag-of-words corpus with
            ``TfidfModel(..., smartirs='ntc')`` before training.
        unseen_docs: Optional sequence of extra documents to score against
            the trained model.

    Returns:
        None. Results are printed and written to the files listed above.
    """
    doc_clean = [min_char(doc).split() for doc in doc_clean]
    dictionary = corpora.Dictionary(doc_clean)

    # Convert the list of documents (corpus) into a document-term matrix
    # using the dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]

    # Return value is intentionally unused here, as in the original call.
    compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=doc_clean,
                             start=2, limit=40, step=6)

    if tfidfmodel:
        tfidf = TfidfModel(corpus, id2word=dictionary, smartirs='ntc')
        corpus = tfidf[corpus]

    # Train with the requested number of topics so that the model agrees
    # with the reporting loop below and with print_result(); a fixed topic
    # count would make show_topic() fail for larger n_topics.
    lda = LdaModel(corpus, num_topics=n_topics, id2word=dictionary,
                   random_state=1, passes=50, per_word_topics=True)

    print("#Tópicos LDA")
    for topic_id in range(n_topics):
        terms = lda.show_topic(topic_id, n_words)
        print("Topic #" + str(topic_id) + ": ",
              ", ".join([term + '*' + str(weight) for term, weight in terms]))

    print('Bound: ', lda.bound(corpus))

    # Compute Perplexity
    print('Perplexity: ', lda.log_perplexity(corpus))

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda, texts=doc_clean,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    if unseen_docs:
        # NOTE(review): unlike doc_clean, these documents are not passed
        # through min_char()/split() here -- presumably callers supply
        # token lists already; confirm against the call sites.
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for doc, bow in zip(unseen_docs, corpus_new):
            # The model was built with per_word_topics=True, so indexing
            # yields a tuple; element 0 is the (topic, score) list.
            topic_scores = lda[bow][0]
            print(doc)

            # Keep the first topic with the strictly highest score.
            best_index, score = None, 0
            for index, tmp_score in topic_scores:
                if tmp_score > score:
                    best_index, score = index, tmp_score
            topic = lda.print_topic(best_index, 5) if best_index is not None else None
            print("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", lda.log_perplexity(corpus_new))

    print_result(lda, doc_clean, corpus, n_topics, description)

    # Persist corpus, dictionary and model; the context manager makes sure
    # the pickle file is flushed and closed.
    with open(description + '.pkl', 'wb') as handle:
        pickle.dump(corpus, handle)
    dictionary.save(description + 'dictionary.gensim')
    lda.save(description + '_ldamodel.gensim')
# Read the input file and keep only sentences with at least two tokens.
sentences = [
    tokens
    for tokens in word2vec.LineSentence(data_file)
    if len(tokens) >= 2
]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(tokens) for tokens in sentences]

# Emit one CSV row per topic count, from 1 up to and including max_topic_num.
print('topic_num,avg,bound,perplexity,coherence')
for i in range(1, max_topic_num + 1):
    lda = LdaModel(
        corpus=corpus,
        id2word=dic,
        num_topics=i,
        alpha=alpha,
        random_state=1,
    )

    # Mean number of entries the model returns per document.
    avg_topics = mean([len(lda[bow]) for bow in corpus])

    bound = lda.bound(corpus)

    # Perplexity is reported as 2 ** (-log_perplexity).
    perwordbound = lda.log_perplexity(corpus)
    perplexity = np.exp2(-perwordbound)

    cm = CoherenceModel(
        model=lda,
        corpus=corpus,
        coherence='u_mass',
        processes=1,
    )
    coherence = cm.get_coherence()

    print(f"{i},{avg_topics},{bound},{perplexity},{coherence}")