Example 1
def tokenize(outfile, paths, base_path, no_shuffle, tokenizer_type,
             tokenizer_pickle, doc_id_level, n_jobs, chunksize):
    """
    Write later if module interface is needed. See _cli for the documentation.
    """
    assert (paths == []) or (base_path is None)

    if base_path:
        paths = filefilter.get_paths(base_path, file_type='*', get_iter=True)
        if no_shuffle is False:
            paths = list(paths)
            shuffle(paths)

    if tokenizer_pickle is not None:
        tokenizer = SaveLoad.load(tokenizer_pickle)
    else:
        tokenizer_dict = {'basic': text_processors.TokenizerBasic}
        tokenizer = tokenizer_dict[tokenizer_type]()

    formatter = text_processors.VWFormatter()

    func = partial(_tokenize_one, tokenizer, formatter, doc_id_level)

    results_iterator = imap_easy(func, paths, n_jobs, chunksize)

    for result in results_iterator:
        outfile.write(result + '\n')
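The function above leans on names defined elsewhere in the rosetta module it comes from (_tokenize_one, imap_easy, SaveLoad, partial, shuffle, filefilter, text_processors). A minimal driver could call it as in the sketch below; the corpus path and the parallelism settings are illustrative placeholders, not values from the original.

import sys

# Hypothetical invocation: tokenize every file found under ./corpus with the
# 'basic' tokenizer and stream the VW-formatted lines to stdout.
tokenize(outfile=sys.stdout,
         paths=[],
         base_path='./corpus',
         no_shuffle=False,
         tokenizer_type='basic',
         tokenizer_pickle=None,
         doc_id_level=1,
         n_jobs=2,
         chunksize=100)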
Example 2
    def setUp(self):
        self.outfile = StringIO()
        formatter = text_processors.VWFormatter()
        self.sff = text_processors.SFileFilter(formatter,
                                               bit_precision=20,
                                               verbose=False)
        self.hash_fun = self.sff._get_hash_fun()
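As a side note on the fixture above, bit_precision=20 suggests that the hash function returned by _get_hash_fun maps token strings into the range [0, 2**20). The check below is hypothetical, not part of the original test, and assumes the returned function takes a single token string.

sff = text_processors.SFileFilter(text_processors.VWFormatter(),
                                  bit_precision=20,
                                  verbose=False)
hash_fun = sff._get_hash_fun()
# Assumed behavior: hashed token ids stay below 2**bit_precision.
assert 0 <= hash_fun('some_token') < 2 ** 20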
Example 3
    def setUp(self):
        self.outfile = StringIO()

        formatter = text_processors.VWFormatter()
        self.sff = text_processors.SFileFilter(formatter,
                                               bit_precision=8,
                                               verbose=False)
        self.sff.id2token = {0: 'w0', 1: 'w1'}
        sfile = StringIO(" 1 doc1| w0:1 w1:2\n 1 doc2| w0:3 w1:4")
        self.sff.load_sfile(sfile)

        self.topics_file_1 = StringIO("Version 7.3\nlabel: 11\n"
                                      "0 1 2\n"
                                      "1 3 4")
        self.topics_file_2 = StringIO("Version 7.3\nlabel: 11\n"
                                      "0 1 0\n"
                                      "1 0 1")
        self.num_topics_1 = 2
        self.predictions_file_1 = StringIO("0.0 0.0 doc1\n"
                                           "0.0 0.0 doc2\n"
                                           "1 2 doc1\n"
                                           "39 58 doc2")
Example 4
    def setUp(self):
        self.formatter = text_processors.VWFormatter()
Example 5

from rosetta.text import text_processors, filefilter, streamers, vw_helpers
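# Assumed setup, not shown in the original snippet: RAW is the directory of raw
# text files, PROCESSED a working directory, and sfile_path the VW sparse file
# the streamer writes, e.g.
#   RAW = 'data/raw'
#   PROCESSED = 'data/processed'
#   sfile_path = 'data/sparse/doc_tokens.vw'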

# Create the VW-format file
my_tokenizer = text_processors.TokenizerBasic()
stream = streamers.TextFileStreamer(text_base_path=RAW, tokenizer=my_tokenizer)
stream.to_vw(sfile_path, n_jobs=-1, raise_on_bad_id=False)

### At this point, run VW from the command line (around 5 passes is usually enough):
#cd data/processed
#rm -f *cache
#vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat --readable_model topics.dat --bit_precision 16 --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/doc_tokens.vw

# Load the sparse file into an SFileFilter
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)

# Remove "gaps" in the sequence of token ids and save the filter
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')
sff.to_frame().sort_values(by='doc_fraction', ascending=False).head(10)

# Use rosetta's LDAResults class to convert the VW output back into readable, Python-friendly formats
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat', 
                            PROCESSED + '/prediction.dat', PROCESSED + '/sff_basic.pkl')

# Look at the top tokens in one topic and their rows in the sfile frame
topic_words = lda.pr_token_g_topic.loc[:,'topic_12'].sort_values(ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]
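To take the last two lines a step further, the same lookup can be repeated for every topic. The sketch below only reuses the pr_token_g_topic frame shown above and prints the five highest-probability tokens per topic.

# Print the top five tokens for each topic column of pr_token_g_topic.
for topic in lda.pr_token_g_topic.columns:
    top_tokens = (lda.pr_token_g_topic[topic]
                  .sort_values(ascending=False)
                  .index[:5])
    print(topic, list(top_tokens))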