import os

from rosetta.text import text_processors, vw_helpers


def choose_lda(self, name='lda'):
    """Return an LDAResults object for the requested model variant."""
    if name == 'lda':
        return vw_helpers.LDAResults(
            self.topics_file_1, self.predictions_file_1, self.sff,
            self.num_topics_1)
    elif name == 'lda_2':
        return vw_helpers.LDAResults(
            self.topics_file_2, self.predictions_file_1, self.sff,
            self.num_topics_1, alpha=1e-5)
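# A minimal sketch of how choose_lda might be used, assuming it is attached to
# a fixture-style class that stores the file paths and topic count as
# attributes. The class name, file names, and topic count below are
# placeholders, not values from the original project.
class LDAFixture(object):
    choose_lda = choose_lda  # reuse the module-level helper defined above

    def __init__(self):
        self.topics_file_1 = 'topics_1.dat'
        self.topics_file_2 = 'topics_2.dat'
        self.predictions_file_1 = 'predictions_1.dat'
        self.sff = 'sff_basic.pkl'
        self.num_topics_1 = 2

# lda = LDAFixture().choose_lda('lda_2')  # variant trained with alpha=1e-5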
def read_vw(vw_dir='../vw', topics=TOPICS):
    topics_file = os.path.join(vw_dir, 'topics.dat')
    prediction_file = os.path.join(vw_dir, 'predictions.dat')
    data_file = os.path.join(vw_dir, 'sff_file.pkl')
    return vw_helpers.LDAResults(topics_file, prediction_file, data_file,
                                 num_topics=topics)
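# A sketch of calling read_vw, assuming the VW output files exist in the
# directory layout above. The directory name and topic count here are
# placeholders; TOPICS is defined elsewhere in the project.
TOPICS = 20  # placeholder value for illustration only
lda = read_vw(vw_dir='../vw', topics=TOPICS)

# P(topic | doc) for the first few documents (documents are columns).
print(lda.pr_topic_g_doc.iloc[:, :3])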
# Train the LDA model with Vowpal Wabbit:
# cd data/processed
# rm -f *cache
# vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat \
#    --readable_model topics.dat --bit_precision 16 --lda_D 975 \
#    --lda_rho 0.1 --lda_alpha 1 ../sparse/doc_tokens.vw

# load the sparse file
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)

# remove "gaps" in the sequence of numbers (ids)
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')
sff.to_frame().sort_values(by='doc_fraction', ascending=False).head(10)

# use the LDAResults class from rosetta to convert the VW output back to
# readable, Python-friendly formats
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat',
                            PROCESSED + '/prediction.dat',
                            PROCESSED + '/sff_basic.pkl')

# look at some of the words most associated with topic_12
topic_words = lda.pr_token_g_topic.loc[:, 'topic_12'].sort_values(
    ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]

# look at the first topic
a_topic = lda.pr_token_g_topic.T.loc['topic_00'].copy()
a_topic = a_topic.sort_values(ascending=False)  # sort_values returns a copy
a_topic[:10]

# look at the first document's topic weights
lda.pr_topic_g_doc.T.iloc[[0]].plot(kind='bar', figsize=(12, 7),
                                    title='First Document Topic Weights')

# or at the average topic probabilities
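# The comment above trails off; this is a minimal sketch of one way to look at
# the average topic probabilities, assuming pr_topic_g_doc keeps one column
# per document (as the transpose used for the single-document plot implies).
avg_topic_probs = lda.pr_topic_g_doc.mean(axis=1).sort_values(ascending=False)
avg_topic_probs.plot(kind='bar', figsize=(12, 7),
                     title='Average Topic Probabilities')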