# Assumed imports for this excerpt (the originals are not shown here):
# from io import StringIO; from scipy import sparse; plus TextIterStreamer,
# TokenizerBasic, SFileFilter, and VWFormatter from the host library.

def test_to_vw(self):
    stream = TextIterStreamer(text_iter=self.text_iter,
                              tokenizer=self.tokenizer)
    # Write to an in-memory buffer rather than a temp file so the test
    # leaves nothing on disk. (The original file also carried an older
    # temp-file variant of this test under the same name; the later
    # definition shadows it, so only this one is kept.)
    result = StringIO()
    stream.to_vw(result)
    benchmark = " 1 a| failure:1 doomed:1\n 1 1| set:1 success:1\n"
    self.assertEqual(benchmark, result.getvalue())
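# Reading of the benchmark above (our interpretation, based on Vowpal
# Wabbit's input format, not stated in the original): each line is
# "<importance> <tag>| <token>:<count> ...", i.e. an importance weight of 1,
# the doc_id as the example tag, then bag-of-words counts after the pipe.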
def test_to_scipysparse(self):
    stream = TextIterStreamer(text_iter=self.text_iter,
                              tokenizer=self.tokenizer)
    result = stream.to_scipysparse()
    # Document-term matrix: one row per document, one column per token.
    benchmark = sparse.csr_matrix([[1, 1, 0, 0], [0, 0, 1, 1]])
    compare = result.toarray() == benchmark.toarray()
    self.assertTrue(compare.all())
def test_token_stream(self):
    stream = TextIterStreamer(text_iter=self.text_iter,
                              tokenizer=self.tokenizer)
    token_benchmark = [['doomed', 'failure'], ['set', 'success']]
    id_benchmark = ['a', '1']
    # cache_list=['doc_id'] asks the streamer to stash each document's id
    # on stream.doc_id_cache as the tokens are streamed out.
    token_result = list(stream.token_stream(cache_list=['doc_id']))
    self.assertEqual(token_benchmark, token_result)
    self.assertEqual(id_benchmark, stream.doc_id_cache)
def test_info_stream(self):
    stream = TextIterStreamer(text_iter=self.text_iter,
                              tokenizer=self.tokenizer)
    token_benchmark = [['doomed', 'failure'], ['set', 'success']]
    text_benchmark = ['doomed to failure', 'set for success']
    token_result = []
    text_result = []
    for each in stream.info_stream():
        token_result.append(each['tokens'])
        text_result.append(each['text'])
    self.assertEqual(token_benchmark, token_result)
    self.assertEqual(text_benchmark, text_result)
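# A minimal sketch (an assumption; setUp is not part of this excerpt) of the
# fixtures the tests above rely on. The two documents are reconstructed from
# the benchmark strings (doc ids 'a' and '1'), and TokenizerBasic is assumed
# to drop stopwords such as 'to' and 'for'.
def setUp(self):
    self.text_iter = [
        {'text': 'doomed to failure', 'doc_id': 'a'},
        {'text': 'set for success', 'doc_id': '1'},
    ]
    self.tokenizer = TokenizerBasic()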
def convert(self, force=False):
    logger.info("Creating reviews file...")
    if force or not self.reviews_file.exists():
        with open(self.reviews_file, 'w') as fp:
            for line in self.input_file:
                # Expected raw layout: user_id, item_id, rating, timestamp,
                # word_count, then the free-text review.
                row = line.strip().split(' ', 5)
                if len(row) > 5:
                    user_id, item_id, rating, timestamp, word_count, review = row
                    print(review, file=fp)

    logger.info("Creating filtered dat file...")

    def file_gen():
        # Stream the extracted reviews back as {'text', 'doc_id'} records,
        # using the line number as the doc id.
        with open(self.reviews_file, 'r') as fp:
            for index, line in enumerate(fp):
                yield {
                    'text': line.strip(),
                    'doc_id': str(index),
                }

    if force or not self.filtered_file.exists():
        with open(self.filtered_file, 'w') as fp:
            tokenizer = TokenizerBasic()
            stream = TextIterStreamer(file_gen(), tokenizer=tokenizer)
            stream.to_vw(fp, n_jobs=1)

    if force or not self.out_file.exists():
        sff = SFileFilter(VWFormatter())
        sff.load_sfile(self.filtered_file)
        df = sff.to_frame()
        # The original computed df.head() and df.describe() and discarded
        # the results; log them so the token stats are actually visible.
        logger.info("Token stats:\n%s\n%s", df.head(), df.describe())

        logger.info("Filtering dat file...")
        # Drop tokens appearing in fewer than 50 docs or in more than 80%
        # of all docs.
        sff.filter_extremes(doc_freq_min=50, doc_fraction_max=0.8)

        logger.info("Sparsifying...")
        sff.compactify()
        sff.save(self.output_dir / 'sff_file.pkl')

        logger.info("Outputting final file...")
        sff.filter_sfile(self.filtered_file, self.out_file)
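# Hypothetical usage sketch. The enclosing class is not shown in this
# excerpt, so the name and construction below are illustrative assumptions;
# the sketch only documents the attributes convert() actually reads:
# input_file (an iterable of raw lines) plus Path-like reviews_file,
# filtered_file, out_file, and output_dir.
#
#   from pathlib import Path
#
#   converter = ReviewsConverter()                    # hypothetical class
#   converter.input_file = open('reviews_raw.txt')    # illustrative path
#   converter.output_dir = Path('processed')
#   converter.reviews_file = converter.output_dir / 'reviews.txt'
#   converter.filtered_file = converter.output_dir / 'filtered.vw'
#   converter.out_file = converter.output_dir / 'final.vw'
#   converter.convert(force=True)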