def train_dtm(articles, n_topics, timeslices, model_filename):
    """Train a dynamic topic model (gensim LdaSeqModel) and persist it.

    Saves three artifacts under the `model_filename` prefix: the trained
    model itself, the dictionary (`*_dict.pkl`) and the BoW corpus
    (`*_corpus.pkl`).

    Args:
        articles: iterable of tokenized documents (lists of tokens).
        n_topics: number of topics to fit.
        timeslices: document counts per time slice; gensim expects these to
            sum to len(articles).
        model_filename: path prefix used for all saved files.
    """
    print("\nCreating common_corpus and common_dictionary")
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(doc) for doc in articles]
    chain_var = 0.1  # variance of topic drift between consecutive slices
    print("Size of vocabulary: ", str(len(common_dictionary)))
    print("\nLDASeq Params:")
    print("time_slice = ", str(timeslices))
    print("n_topics = ", str(n_topics))
    print("chain_var = ", str(chain_var))
    print("\nStart training Ldaseq model")
    ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=timeslices,
                         num_topics=n_topics, id2word=common_dictionary,
                         chain_variance=chain_var)
    model_file = model_filename
    ldaseq.save(model_file)
    print("***** Done training! Saved trained model as", model_file, "*****")
    # Context managers replace the manual open/close pairs so the handles
    # are released even if pickling raises.
    with open(model_file + "_dict.pkl", "wb") as f:
        pickle.dump(common_dictionary, f)
    print("Saved common_dictionary!")
    with open(model_file + "_corpus.pkl", "wb") as f:
        pickle.dump(common_corpus, f)
    print("Saved common_corpus!")
def model_dtm(category_id):
    """Train a 3-topic dynamic topic model for one category and persist it.

    Loads the category's dictionary/corpus, fits an LdaSeqModel over the
    category's time slices, then saves the model both via gensim's native
    `save` and as a plain pickle.

    Args:
        category_id: key into `time_ranges` and argument to
            `load_dict_corpus`.

    Returns:
        The trained LdaSeqModel.
    """
    dictionary, corpus = load_dict_corpus(category_id)
    time_range = time_ranges[category_id]  # docs per time slice for this category
    ldaseq = LdaSeqModel(corpus=corpus, id2word=dictionary,
                         time_slice=time_range, num_topics=3,
                         chunksize=10, passes=1, random_state=3)
    # Build the shared path prefix once; the resulting strings are identical
    # to the originals.
    base = 'dynamic_topic_models/dtm_' + str(category_id) + '_model'
    ldaseq.save(datapath(base))
    # `with` closes the pickle file; the original open() leaked the handle.
    with open(base + '.p', 'wb') as fh:
        pickle.dump(ldaseq, fh)
    return ldaseq
def _train_model(self):
    """Build a dictionary and BoW corpus from self.data_words, split off a
    training subset, and fit either an LdaMallet ('lda') or LdaSeqModel
    ('dtm') model, pickling side artifacts along the way.

    NOTE(review): this takes `self` but sits at module level — it looks
    like a pasted copy of the `model` class method of the same name.
    Verify whether this duplicate should exist at all.
    """
    # Create Dictionary
    id2word = corpora.Dictionary(self.data_words)
    # Create Corpus
    texts = self.data_words
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    # Train Test Split — [0] keeps only the training portion of the corpus
    corpus_train = train_test_split(corpus, range(len(corpus)), test_size=self.test_size, random_state=self.random_seed)[0]
    if self.model_name == 'lda':
        # Model
        self.model = gensim.models.wrappers.LdaMallet(
            self.path_to_mallet, alpha=100, corpus=corpus_train, num_topics=self.k, id2word=id2word, random_seed=self.random_seed, iterations=self.iterations)
        # Save Coherence Model
        co = 'c_v'  # the way to calculate coherence score
        coherencemodel = CoherenceModel(model=self.model, texts=texts, dictionary=id2word, coherence=co)
        pickle.dump(coherencemodel, open(self.coh_path, "wb"))
        # Save Doc Lda — topic distributions for the FULL corpus, not just train
        doc_lda = self.model[corpus]
        pickle.dump(doc_lda, open(self.doc_lda_path, 'wb'))
    elif self.model_name == 'dtm':
        # NOTE(review): `year_slice` and `lda_model` are not defined in this
        # scope — this branch raises NameError unless both exist as globals.
        # Confirm where they are meant to come from.
        self.model = LdaSeqModel(corpus=corpus_train, time_slice=year_slice, id2word=id2word, lda_model=lda_model, num_topics=self.k)
    # Save Corpus
    pickle.dump(corpus, open(self.corpus_path, 'wb'))
def train_dtm(articles, timeslices, filename, k=10):
    """Train a dynamic topic model over all articles and save the artifacts.

    Args:
        articles: iterable of tokenized documents (lists of tokens).
        timeslices: document counts per time slice (should sum to
            len(articles) for gensim's LdaSeqModel).
        filename: path prefix for the saved model, dictionary and corpus.
        k: number of topics (default 10).
    """
    chain_var = 0.1  # variance of topic drift between consecutive slices
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(a) for a in articles]
    print("Training DTM for all articles")
    print("Articles:", len(articles))
    print("Topics:", k)
    print("Time slices: ", timeslices)
    print("Vocab len: ", len(common_dictionary))
    ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=timeslices,
                         num_topics=k, id2word=common_dictionary,
                         chain_variance=chain_var)
    model_file = filename
    ldaseq.save(model_file)
    # Context managers close the pickle handles (the inline open() calls
    # leaked them), and each path gets its own correctly-named variable
    # (the original reused `dict_filename` for the corpus file).
    dict_filename = model_file + "_dict.pkl"
    with open(dict_filename, "wb") as f:
        pickle.dump(common_dictionary, f)
    corpus_filename = model_file + "_corpus.pkl"
    with open(corpus_filename, "wb") as f:
        pickle.dump(common_corpus, f)
    print("Saved DTM model as", model_file, "!")
def train_dtm(articles, n_topics, outfile="dtm", dates=None, time_slices=None):
    """Train a dynamic topic model and save it under trained_models/.

    Args:
        articles: iterable of tokenized documents (lists of tokens).
        n_topics: number of topics to fit.
        outfile: basename for the saved model files (default "dtm").
        dates: optional per-document date labels, used to derive
            `time_slices` when the latter is not supplied.
        time_slices: optional document counts per time slice; takes
            precedence over `dates`.
    """
    if time_slices is None and dates is not None:
        # Counter preserves first-seen order (insertion-ordered dicts), so
        # slice counts follow the order dates appear — `dates` is assumed
        # to be sorted chronologically. TODO(review): confirm with callers.
        counts = Counter(dates)
        time_slices = list(counts.values())
        print("Dates: ", counts)
        print("Time slices:", time_slices)
    chain_var = 0.1  # variance of topic drift between consecutive slices
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(a) for a in articles]
    ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=time_slices,
                         num_topics=n_topics, id2word=common_dictionary,
                         chain_variance=chain_var)
    model_file = "trained_models/" + outfile
    ldaseq.save(model_file)
    # Context managers close the pickle handles (the inline open() calls
    # leaked them); the corpus path gets its own variable instead of
    # reusing `dict_filename`.
    dict_filename = model_file + "_dict.pkl"
    with open(dict_filename, "wb") as f:
        pickle.dump(common_dictionary, f)
    corpus_filename = model_file + "_corpus.pkl"
    with open(corpus_filename, "wb") as f:
        pickle.dump(common_corpus, f)
    print("Saved DTM model as", model_file, "!")
def run_ldaseq(data, sections, **kw):
    """Run LDA sequential model.

    Fits (or reloads, if already saved) an LdaSeqModel over the given NIPS
    sections, using papers-per-year counts as the time slices. Keyword
    arguments override `lda_seq_defaults`.
    """
    dataset = NipsData()
    dataset.load_data()
    _years, slice_counts = papers_per_year(data)
    data = dataset.combined_sections(sections, data)
    name = get_save_name(sections)
    ncomps = kw.get('num_topics', lda_seq_defaults['num_topics'])
    modname = f"ldaseq_{ncomps}_" + name
    # Reuse a previously trained model if one is already on disk.
    cached = lda_load_model(modname)
    if cached:
        return cached
    dictionary, bow = lda_get_dictionary(data, name=name, save=True)
    print(f"Running LDAseq on {sections}")
    lda_args = dict(lda_seq_defaults)
    lda_args.update(kw)
    model = LdaSeqModel(corpus=bow, time_slice=slice_counts,
                        id2word=dictionary, **lda_args)
    path, _ = get_save_path(modname)
    model.save(path)
    return model
class model():
    """Pipeline wrapper: load tokenized docs, train an 'lda' (Mallet) or
    'dtm' (LdaSeqModel) topic model, and save the artifacts.

    Construction runs the full pipeline (load -> train -> save), so
    instantiating this class performs file I/O and model training.
    """

    def __init__(self, data_words_path, model_name, k, path_to_mallet,
                 model_path, corpus_path, coh_path, doc_lda_path,
                 a_sq=None, pub_year=None, test_size=0.2,
                 random_seed=100, iterations=1000):
        self.data_words_path = data_words_path
        self.model_name = model_name  # 'lda' or 'dtm'
        self.k = k  # number of topics
        self.path_to_mallet = path_to_mallet
        self.model_path = model_path
        self.corpus_path = corpus_path
        self.coh_path = coh_path
        self.doc_lda_path = doc_lda_path
        self.test_size = test_size
        self.random_seed = random_seed
        self.iterations = iterations
        self.a_sq = a_sq
        self.pub_year = pub_year
        self._load_data_words()
        self._train_model()
        self._save_model()

    def _load_data_words(self):
        # Load the pre-tokenized documents (pickled list of token lists).
        with open(self.data_words_path, "rb") as fp:
            self.data_words = pickle.load(fp)

    def _train_model(self):
        """Build dictionary/corpus, split off a training set, and fit the
        model selected by self.model_name."""
        # Create Dictionary
        id2word = corpora.Dictionary(self.data_words)
        # Create Corpus
        texts = self.data_words
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
        # Train Test Split — [0] keeps only the training portion
        corpus_train = train_test_split(
            corpus, range(len(corpus)),
            test_size=self.test_size,
            random_state=self.random_seed)[0]
        if self.model_name == 'lda':
            # Model
            self.model = gensim.models.wrappers.LdaMallet(
                self.path_to_mallet, alpha=100, corpus=corpus_train,
                num_topics=self.k, id2word=id2word,
                random_seed=self.random_seed, iterations=self.iterations)
            # Save Coherence Model
            co = 'c_v'  # the way to calculate coherence score
            coherencemodel = CoherenceModel(model=self.model, texts=texts,
                                            dictionary=id2word, coherence=co)
            # `with` closes the handle; the inline open() leaked it.
            with open(self.coh_path, "wb") as fh:
                pickle.dump(coherencemodel, fh)
            # Save Doc Lda — topic distributions for the FULL corpus
            doc_lda = self.model[corpus]
            with open(self.doc_lda_path, 'wb') as fh:
                pickle.dump(doc_lda, fh)
        elif self.model_name == 'dtm':
            # NOTE(review): `year_slice` and `lda_model` are not defined in
            # this class or visible at module level here — this branch will
            # raise NameError unless both exist as globals. Confirm where
            # they are supposed to come from before relying on 'dtm' mode.
            self.model = LdaSeqModel(corpus=corpus_train,
                                     time_slice=year_slice,
                                     id2word=id2word,
                                     lda_model=lda_model,
                                     num_topics=self.k)
        # Save Corpus
        with open(self.corpus_path, 'wb') as fh:
            pickle.dump(corpus, fh)

    def _save_model(self):
        # Persist the trained model with its own (gensim) save method.
        self.model.save(self.model_path)
multFile.write(str(wordID) + ':' + str(weigth) + ' ') multFile.write('\n') multFile.close() print(multFile) time_slice = total_yearly_list #LdaSeqModel(corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100) #use LdaSeqModel to generate DTM results #ldaseq = LdaSeqModel(corpus=corpus_memory_friendly, id2word=dictionary, time_slice=time_slice, num_topics=5) ldaseq = LdaSeqModel(corpus=corpus_memory_friendly, id2word=dictionary, time_slice=time_slice, num_topics=4) # for given time, the distriibution of each topic ldaseq.print_topics(time=1) # for given topic the word distribution over time DTM_topic_0 = ldaseq.print_topic_times(topic=0, top_terms=10) DTM_topic_1 = ldaseq.print_topic_times(topic=1, top_terms=10) DTM_topic_2 = ldaseq.print_topic_times(topic=2, top_terms=10) DTM_topic_3 = ldaseq.print_topic_times(topic=3, top_terms=10) DTM_topic_4 = ldaseq.print_topic_times(topic=4, top_terms=10) def topic_time(DTM_topic, time_stamps): for i in range(len(time_slice) - 1): if i == 0: temp_a1 = pd.DataFrame(DTM_topic[i])