def tokenize(
        outfile, paths, base_path, no_shuffle, tokenizer_type,
        tokenizer_pickle, doc_id_level, n_jobs, chunksize):
    """
    Tokenize files and write them to outfile in VW format.  See _cli for
    documentation of the parameters.
    """
    # Either an explicit list of paths or a base_path to walk, not both.
    assert (paths == []) or (base_path is None)
    if base_path:
        paths = filefilter.get_paths(base_path, file_type='*', get_iter=True)
    if no_shuffle is False:
        paths = list(paths)
        shuffle(paths)

    # Use a pickled tokenizer if given, otherwise build one by name.
    if tokenizer_pickle is not None:
        tokenizer = SaveLoad.load(tokenizer_pickle)
    else:
        tokenizer_dict = {'basic': text_processors.TokenizerBasic}
        tokenizer = tokenizer_dict[tokenizer_type]()

    formatter = text_processors.VWFormatter()

    # Tokenize the files in parallel, writing one VW line per document.
    func = partial(_tokenize_one, tokenizer, formatter, doc_id_level)
    results_iterator = imap_easy(func, paths, n_jobs, chunksize)

    for result in results_iterator:
        outfile.write(result + '\n')
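# A minimal usage sketch for tokenize() above, assuming the module-level
# names it relies on (text_processors, filefilter, _tokenize_one, imap_easy,
# partial, shuffle, SaveLoad) are in scope.  The input paths and the
# doc_id_level/chunksize values here are hypothetical, not prescribed.
paths = ['data/raw/doc1.txt', 'data/raw/doc2.txt']
with open('data/sparse/doc_tokens.vw', 'w') as outfile:
    tokenize(
        outfile, paths, base_path=None, no_shuffle=True,
        tokenizer_type='basic', tokenizer_pickle=None,
        doc_id_level=1, n_jobs=1, chunksize=100)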
def setUp(self):
    self.outfile = StringIO()
    formatter = text_processors.VWFormatter()
    self.sff = text_processors.SFileFilter(
        formatter, bit_precision=20, verbose=False)
    self.hash_fun = self.sff._get_hash_fun()
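# A quick sketch of the property the hash-function fixture above is set up
# to exercise: assuming _get_hash_fun() returns a callable mapping token
# strings to integer ids, bit_precision=20 bounds the ids to [0, 2**20).
# The token strings below are made up for illustration.
def check_hash_range(hash_fun, bit_precision=20):
    for token in ['hello', 'world', 'hello']:
        token_id = hash_fun(token)
        assert 0 <= token_id < 2 ** bit_precision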
def setUp(self):
    self.outfile = StringIO()
    formatter = text_processors.VWFormatter()
    self.sff = text_processors.SFileFilter(
        formatter, bit_precision=8, verbose=False)
    self.sff.id2token = {0: 'w0', 1: 'w1'}
    sfile = StringIO(" 1 doc1| w0:1 w1:2\n 1 doc2| w0:3 w1:4")
    self.sff.load_sfile(sfile)

    self.topics_file_1 = StringIO(
        "Version 7.3\nlabel: 11\n"
        "0 1 2\n"
        "1 3 4")
    self.topics_file_2 = StringIO(
        "Version 7.3\nlabel: 11\n"
        "0 1 0\n"
        "1 0 1")
    self.num_topics_1 = 2
    self.predictions_file_1 = StringIO(
        "0.0 0.0 doc1\n"
        "0.0 0.0 doc2\n"
        "1 2 doc1\n"
        "39 58 doc2")
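# The sfile fixture above uses VW's sparse format: each line reads
# "<label> <doc_id>| token:count token:count ...".  A hand-rolled parse of
# one such line, independent of rosetta, just to make the structure explicit
# (the library itself handles this through VWFormatter):
def parse_vw_line(line):
    header, body = line.split('|')
    label, doc_id = header.split()
    counts = {tok: float(cnt)
              for tok, cnt in (pair.split(':') for pair in body.split())}
    return label, doc_id, counts

# parse_vw_line(" 1 doc1| w0:1 w1:2")
# -> ('1', 'doc1', {'w0': 1.0, 'w1': 2.0})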
def setUp(self):
    self.formatter = text_processors.VWFormatter()
from rosetta.text import text_processors, filefilter, streamers, vw_helpers

# Create the VW format file.
my_tokenizer = text_processors.TokenizerBasic()
stream = streamers.TextFileStreamer(
    text_base_path=RAW, tokenizer=my_tokenizer)
stream.to_vw(sfile_path, n_jobs=-1, raise_on_bad_id=False)

### Somewhere here, run VW on your command line (stick with 5 passes or so):
#   cd data/processed
#   rm -f *cache
#   vw --lda 20 --cache_file doc_tokens.cache --passes 5 \
#      -p prediction.dat --readable_model topics.dat --bit_precision 16 \
#      --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/doc_tokens.vw

# Load the sparse file.
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)

# Remove "gaps" in the sequence of token ids.
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')
sff.to_frame().sort_values(by='doc_fraction', ascending=False).head(10)

# Use the LDAResults class from rosetta to convert the VW output back to
# readable, Python-friendly formats.
lda = vw_helpers.LDAResults(
    PROCESSED + '/topics.dat', PROCESSED + '/prediction.dat',
    PROCESSED + '/sff_basic.pkl')

# Look at some of the words in one topic.
topic_words = lda.pr_token_g_topic.loc[:, 'topic_12'].sort_values(
    ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]
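# A follow-up sketch: list the top words for every topic, using only the
# pr_token_g_topic and sfile_frame attributes demonstrated above.  This
# assumes the topic columns all follow the 'topic_<n>' naming seen for
# 'topic_12'.
for topic in lda.pr_token_g_topic.columns:
    top_words = (lda.pr_token_g_topic[topic]
                 .sort_values(ascending=False).index[:5])
    print(topic, list(top_words))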