Example #1
def choose_lda(self, name='lda'):
    if name == 'lda':
        return vw_helpers.LDAResults(
            self.topics_file_1, self.predictions_file_1,
            self.sff, self.num_topics_1)
    elif name == 'lda_2':
        return vw_helpers.LDAResults(
            self.topics_file_2, self.predictions_file_1, self.sff,
            self.num_topics_1, alpha=1e-5)
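A minimal usage sketch (not part of the original snippet): the two branches differ only in the topics file and the extra alpha=1e-5 smoothing. It assumes vw_helpers comes from the rosetta package and that the self.* attributes are set up elsewhere, e.g. in a test fixture.

# hypothetical caller; both names return a vw_helpers.LDAResults instance
lda_plain = self.choose_lda('lda')      # topics_file_1, no explicit alpha
lda_alpha = self.choose_lda('lda_2')    # topics_file_2, alpha=1e-5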
Example #2
import os

# assuming the rosetta package layout: vw_helpers lives under rosetta.text
from rosetta.text import vw_helpers


def read_vw(vw_dir='../vw', topics=TOPICS):
    # TOPICS is a module-level constant (the number of LDA topics used in the vw run)
    topics_file = os.path.join(vw_dir, 'topics.dat')
    prediction_file = os.path.join(vw_dir, 'predictions.dat')
    data_file = os.path.join(vw_dir, 'sff_file.pkl')
    return vw_helpers.LDAResults(topics_file,
                                 prediction_file,
                                 data_file,
                                 num_topics=topics)
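A short usage sketch (hedged: assumes the VW output files exist under ../vw and that the topic count matches however vw was run, e.g. 20):

lda = read_vw(vw_dir='../vw', topics=20)
# topic-per-document and token-per-topic probability tables exposed by LDAResults
lda.pr_topic_g_doc.head()
lda.pr_token_g_topic.head()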
Example #3
#cd data/processed
#rm -f *cache
#vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat --readable_model topics.dat --bit_precision 16 --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/doc_tokens.vw

from rosetta.text import text_processors, vw_helpers

#load the sparse file (sfile_path should point to the VW-format sparse file used in the vw call above)
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)

#remove "gaps" in the sequence of numbers (ids)
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')
sff.to_frame().sort_values(by='doc_fraction', ascending=False).head(10)

#use the LDAResults class from rosetta to convert back to readable, Python-friendly formats
#(num_topics matches the --lda 20 flag used in the vw call above)
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat',
                            PROCESSED + '/prediction.dat', PROCESSED + '/sff_basic.pkl',
                            num_topics=20)

#look at the top words for one topic (topic_12)
topic_words = lda.pr_token_g_topic.loc[:,'topic_12'].sort_values(ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]

#look at the first topic (topic_00)
a_topic = lda.pr_token_g_topic.T.loc['topic_00'].copy()
a_topic = a_topic.sort_values(ascending=False)  #sort_values returns a new Series
a_topic[:10]

#look at the first document's topic weights
lda.pr_topic_g_doc.T.iloc[[0]].plot(kind='bar', figsize=(12, 7),
                                    title='First Document Topic Weights')

#or at the average topic probabilities
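#(a possible completion of this step, not from the original walkthrough: average each
# topic's probability across documents, assuming pr_topic_g_doc has one column per document)
lda.pr_topic_g_doc.mean(axis=1).plot(kind='bar', figsize=(12, 7),
                                     title='Average Topic Probabilities')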