Code example #1
0
File: test_text.py — project: vickingur/rosetta
 def setUp(self):
     """Build a quiet, 20-bit SFileFilter fixture and expose its hash function."""
     # In-memory sink for anything the tests write out.
     self.outfile = StringIO()
     vw_formatter = text_processors.VWFormatter()
     self.sff = text_processors.SFileFilter(
         vw_formatter, bit_precision=20, verbose=False)
     # Keep a direct handle on the filter's internal hashing callable.
     self.hash_fun = self.sff._get_hash_fun()
Code example #2
0
File: test_text.py — project: vickingur/rosetta
    def setUp(self):
        """Create an 8-bit SFileFilter loaded from a tiny two-document
        sparse file, plus in-memory VW topic/prediction file fixtures.
        """
        self.outfile = StringIO()

        # Filter over a two-token vocabulary, hashed into 8 bits.
        vw_fmt = text_processors.VWFormatter()
        self.sff = text_processors.SFileFilter(
            vw_fmt, bit_precision=8, verbose=False)
        self.sff.id2token = {0: 'w0', 1: 'w1'}
        two_doc_sfile = StringIO(" 1 doc1| w0:1 w1:2\n 1 doc2| w0:3 w1:4")
        self.sff.load_sfile(two_doc_sfile)

        # Fake VW --readable_model outputs: a header, then one row per token id.
        self.topics_file_1 = StringIO(
            "Version 7.3\nlabel: 11\n" "0 1 2\n" "1 3 4")
        self.topics_file_2 = StringIO(
            "Version 7.3\nlabel: 11\n" "0 1 0\n" "1 0 1")
        self.num_topics_1 = 2
        # Fake VW -p predictions: per-topic scores followed by the doc id.
        self.predictions_file_1 = StringIO(
            "0.0 0.0 doc1\n" "0.0 0.0 doc2\n" "1 2 doc1\n" "39 58 doc2")
Code example #3
0
from rosetta.text import text_processors, filefilter, streamers, vw_helpers

# Step 1: write the raw corpus out in Vowpal Wabbit sparse format.
basic_tokenizer = text_processors.TokenizerBasic()
text_stream = streamers.TextFileStreamer(text_base_path=RAW,
                                         tokenizer=basic_tokenizer)
text_stream.to_vw(sfile_path, n_jobs=-1, raise_on_bad_id=False)

# Step 2: train LDA with VW on the command line (around 5 passes works well):
#cd data/processed
#rm -f *cache
#vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat --readable_model topics.dat --bit_precision 16 --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/doc_tokens.vw

# Step 3: read the sparse file back into an SFileFilter.
vw_formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(vw_formatter)
sff.load_sfile(sfile_path)

# Close any gaps in the token-id sequence, then persist the filter and
# peek at the ten tokens appearing in the largest fraction of documents.
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')
sff.to_frame().sort_values(by='doc_fraction', ascending=False).head(10)

# Step 4: wrap VW's output files in the python-friendly LDAResults class.
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat',
                            PROCESSED + '/prediction.dat',
                            PROCESSED + '/sff_basic.pkl')

# Inspect the ten strongest tokens for topic 12.
top_tokens = (lda.pr_token_g_topic.loc[:, 'topic_12']
              .sort_values(ascending=False).index[:10])
lda.sfile_frame.loc[top_tokens]