import sys

from gensim.models import LdaModel, LdaSeqModel
from gensim.models.wrappers import DtmModel


def main(args):
    """Load a saved model and print its top words per topic (and, for the
    dynamic models, per time slice)."""
    if args.model_type == "lda":
        loaded_model = LdaModel.load(args.model)
        # show_topics(num_topics=-1) returns every topic as a
        # (topic_num, "weight*word + ...") pair
        for topic_num, topic_str in loaded_model.show_topics(num_topics=-1):
            print(str(topic_num) + ':', end=' ')
            for term in topic_str.split(' + '):
                weight, word = term.split('*')
                print(word, end=' ')
            print()
    elif args.model_type == "dtm":
        loaded_model = DtmModel.load(args.model)
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slices)):
                top_words = loaded_model.show_topic(topic_id, time, topn=10)
                print("Topic", str(topic_id) + ", time slice", str(time) + ':',
                      end=' ')
                for weight, word in top_words:
                    print(word, end=', ')
                print()
            print()
    elif args.model_type == "ldaseq":
        loaded_model = LdaSeqModel.load(args.model)  # maybe use dtm_coherence?
        print(loaded_model.num_topics)
        print(loaded_model.time_slice)
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slice)):
                # print_topic returns (word, probability) pairs for one
                # topic at one time slice
                top_words = loaded_model.print_topic(topic=topic_id, time=time,
                                                     top_terms=20)
                print("Topic", str(topic_id) + ", time slice", str(time) + ':',
                      end=' ')
                for word, weight in top_words:
                    print(word, end=' ')
                print()
            print()
    else:
        print("Unknown model type provided: " + args.model_type)
        sys.exit(1)
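# Hedged sketch (not from the original): a minimal argparse entry point that
# would drive main() above. main() only assumes that args.model and
# args.model_type exist; the flag names and defaults below are placeholders.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Print the topics of a saved LDA/DTM/LdaSeq model")
    parser.add_argument("model", help="path to a saved gensim model file")
    parser.add_argument("--model-type", dest="model_type", default="lda",
                        choices=["lda", "dtm", "ldaseq"])
    main(parser.parse_args())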
def dtm(dtm_path, corpus, dictionary, time_slices, num_topics=40, load=False):
    # dtm_path should have your local binary of Blei-DTM
    print("Running DTM")
    if load:
        return DtmModel.load('DTM')
    model = DtmModel(dtm_path, corpus, time_slices, num_topics=num_topics,
                     id2word=dictionary, initialize_lda=True)
    model.save("DTM")
    return model
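# Hedged usage sketch for dtm() above, with a toy corpus so the shapes are
# concrete; the binary path, documents, and counts are placeholders, not from
# the original. time_slices lists how many documents fall into each
# consecutive time period and must sum to len(corpus).
from gensim import corpora as _corpora

_docs = [["virus", "lockdown"], ["vaccine", "virus"], ["school", "reopen"]]
_dictionary = _corpora.Dictionary(_docs)
_corpus = [_dictionary.doc2bow(d) for d in _docs]
# Two documents in the first time slice, one in the second (2 + 1 == 3 docs):
# _model = dtm("/path/to/dtm-binary", _corpus, _dictionary,
#              time_slices=[2, 1], num_topics=2)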
    tagged_words = pos_tag(words)
    lemmatized = [wnl.lemmatize(word.lower(), pos=penn2morphy(tag))
                  for word, tag in tagged_words]
    lemmatized = list(filter(
        lambda w: not any(p in w for p in punctuation)
        and w not in stopword_list
        and w not in punctuation
        and len(w) >= 3,
        lemmatized))
    return lemmatized


def timestamp():
    return datetime.now().strftime('%x %X')


print('({}) DTM training data preprocessing started'.format(timestamp()))
start = datetime.now()
orig_df = pd.read_pickle(r'dfs\2020-03-22-to-2020-11-18-1000-daily')
orig_texts = [preprocess(text) for text in orig_df['full_text']]
orig_dictionary = corpora.Dictionary(orig_texts)
orig_corpus = [orig_dictionary.doc2bow(text) for text in orig_texts]
dtm_model = DtmModel.load(r'dtm\2020-03-22-to-2020-11-18-1000-daily')
print('Time to preprocess training texts:', str(datetime.now() - start))

######################################
##### DAY-BY-DAY TOPIC LABELLING #####
######################################

conn = sqlite3.connect('database/tweets.db')
# df = pd.read_pickle(r'dfs\2020-03-22-to-2020-08-19-4000-daily')
# df = pd.read_sql_query('select * from tweets where "user.screen_name" in (select screen_name from labels) and created_at between \'2020-03-22\' and \'2020-11-18\'', conn)
# comment out n_tweets_per_day lines and cumulative_tweets lines when using full dataset
# n_tweets_per_day = df['created_at'].apply(lambda x: x[:10]).value_counts().sort_index().values.tolist()
# cumulative_tweets[i] is the index of the first Tweet from i days after START_DATE
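# Hedged sketch of the commented-out bookkeeping above: once df is loaded,
# n_tweets_per_day counts Tweets per calendar day, and cumulative_tweets[i]
# becomes the index of the first Tweet i days after START_DATE. The
# itertools.accumulate approach is an assumption; the original only names
# the two variables.
# from itertools import accumulate
# n_tweets_per_day = df['created_at'].str[:10].value_counts().sort_index().tolist()
# cumulative_tweets = [0] + list(accumulate(n_tweets_per_day))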
# Corpus class for DTM data load
class DTMcorpus(corpora.textcorpus.TextCorpus):

    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)


corpus = DTMcorpus(documents)

# path where the dtm binary is installed
dtm_path = "/home/ankit/NLP_Project/dtm/dtm/dtm"

model = DtmModel.load("DTMMOdel.txt")
# model.save("DTModel.txt")

# Print every topic (num_topics=-1) at the first time slice, 100 words each
tp = model.show_topics(num_topics=-1, times=1, num_words=100,
                       log=False, formatted=False)
print(tp)
print(type(tp))
for topic in tp:
    # with formatted=False, each topic is a list of (weight, word) pairs
    for term in topic:
        print(type(term), term[1])  # words are already str in Python 3; no .decode() needed
    # print(topic)
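# Hedged sketch: training a fresh DtmModel through the DTMcorpus wrapper
# above instead of loading a saved one. time_seq is a placeholder for the
# number of documents per time slice and must sum to len(corpus);
# corpus.dictionary is built automatically by gensim's TextCorpus.
# time_seq = [len(documents) // 2, len(documents) - len(documents) // 2]
# model = DtmModel(dtm_path, corpus, time_seq, num_topics=25,
#                  id2word=corpus.dictionary, initialize_lda=True)
# model.save("DTModel.txt")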