Example #1
    def test_to_vw(self):
        stream = TextIterStreamer(text_iter=self.text_iter,
                                  tokenizer=self.tokenizer)
        stream.to_vw(open(self.temp_vw_path, 'w'))
        result = open(self.temp_vw_path).read()
        benchmark = " 1 a| failure:1 doomed:1\n 1 1| set:1 success:1\n"
        self.assertEqual(benchmark, result)
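All of these tests read self.text_iter, self.tokenizer, and self.temp_vw_path from a setUp fixture that is not shown. Judging from the benchmark strings (doc ids 'a' and '1', texts "doomed to failure" and "set for success"), the fixture is probably close to the sketch below; the exact values, the import paths, and the scratch path are assumptions, not the original test module.

import unittest
from rosetta.text.streamers import TextIterStreamer        # assumed import path
from rosetta.text.text_processors import TokenizerBasic    # assumed import path

class TestTextIterStreamer(unittest.TestCase):
    def setUp(self):
        # Two tiny documents whose ids and tokens match the benchmarks above.
        self.text_iter = iter([
            {'text': 'doomed to failure', 'doc_id': 'a'},
            {'text': 'set for success', 'doc_id': '1'},
        ])
        # TokenizerBasic (also used in Example #12) drops common stopwords
        # such as 'to' and 'for', leaving ['doomed', 'failure'] and
        # ['set', 'success'].
        self.tokenizer = TokenizerBasic()
        self.temp_vw_path = '/tmp/test_to_vw.vw'  # hypothetical scratch path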
Example #2
    def test_to_vw(self):
        stream = TextIterStreamer(text_iter=self.text_iter,
                                  tokenizer=self.tokenizer)
        stream.to_vw(open(self.temp_vw_path, 'w'))
        result = open(self.temp_vw_path).read()
        benchmark = " 1 a| failure:1 doomed:1\n 1 1| set:1 success:1\n"
        self.assertEqual(benchmark, result)
Example #3
    def test_to_vw(self):
        stream = TextIterStreamer(text_iter=self.text_iter, tokenizer=self.tokenizer)
        result = StringIO()
        stream.to_vw(result)

        benchmark = " 1 a| failure:1 doomed:1\n 1 1| set:1 success:1\n"
        self.assertEqual(benchmark, result.getvalue())
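The benchmark string is Vowpal Wabbit input format, one line per document: the token just before the pipe is the doc_id and the pairs after it are token:count features (the leading 1 appears to be the record's importance weight). A throwaway parser makes the expected output easy to sanity-check; this helper is purely illustrative and not part of rosetta.

def parse_vw_line(line):
    # "<weight> <doc_id>| token:count token:count ..."
    header, features = line.split('|')
    doc_id = header.split()[-1]
    counts = dict(pair.split(':') for pair in features.split())
    return doc_id, counts

assert parse_vw_line(" 1 a| failure:1 doomed:1") == ('a', {'failure': '1', 'doomed': '1'})
assert parse_vw_line(" 1 1| set:1 success:1") == ('1', {'set': '1', 'success': '1'})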
Example #4
    def test_to_scipysparse(self):
        stream = TextIterStreamer(text_iter=self.text_iter, tokenizer=self.tokenizer)

        result = stream.to_scipysparse()
        benchmark = sparse.csr_matrix([[1, 1, 0, 0], [0, 0, 1, 1]])

        compare = result.toarray() == benchmark.toarray()
        self.assertTrue(compare.all())
Example #5
    def test_to_scipysparse(self):
        stream = TextIterStreamer(text_iter=self.text_iter,
                                  tokenizer=self.tokenizer)

        result = stream.to_scipysparse()
        benchmark = sparse.csr_matrix([[1, 1, 0, 0], [0, 0, 1, 1]])

        compare = result.toarray() == benchmark.toarray()
        self.assertTrue(compare.all())
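to_scipysparse returns a documents-by-terms count matrix: one row per document in text_iter, one column per vocabulary token. The benchmark therefore encodes the two fixture documents over a four-token vocabulary; which token gets which column is an assumption below, since the test only compares the dense values.

from scipy import sparse

# Rows: documents 'a' and '1'. Columns: one per distinct token, e.g.
# {failure: 0, doomed: 1, set: 2, success: 3} (the real id assignment is
# whatever the streamer happens to produce).
benchmark = sparse.csr_matrix([[1, 1, 0, 0],   # "doomed to failure"
                               [0, 0, 1, 1]])  # "set for success"
assert benchmark.shape == (2, 4)
assert benchmark.sum() == 4  # one count per kept token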
Example #6
    def test_token_stream(self):
        stream = TextIterStreamer(text_iter=self.text_iter, tokenizer=self.tokenizer)
        token_benchmark = [["doomed", "failure"], ["set", "success"]]
        id_benchmark = ["a", "1"]
        token_result = []
        for each in stream.token_stream(cache_list=["doc_id"]):
            token_result.append(each)

        self.assertEqual(token_benchmark, token_result)
        self.assertEqual(id_benchmark, stream.__dict__["doc_id_cache"])
Example #7
    def test_token_stream(self):
        stream = TextIterStreamer(text_iter=self.text_iter,
                                  tokenizer=self.tokenizer)
        token_benchmark = [['doomed', 'failure'], ['set', 'success']]
        id_benchmark = ['a', '1']
        token_result = []
        for each in stream.token_stream(cache_list=['doc_id']):
            token_result.append(each)

        self.assertEqual(token_benchmark, token_result)
        self.assertEqual(id_benchmark, stream.__dict__['doc_id_cache'])
Example #8
    def test_token_stream(self):
        stream = TextIterStreamer(text_iter=self.text_iter,
                                  tokenizer=self.tokenizer)
        token_benchmark = [['doomed', 'failure'],
                           ['set', 'success']]
        id_benchmark = ['a', '1']
        token_result = []
        for each in stream.token_stream(cache_list=['doc_id']):
            token_result.append(each)

        self.assertEqual(token_benchmark, token_result)
        self.assertEqual(id_benchmark, stream.__dict__['doc_id_cache'])
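token_stream yields one token list per document, and cache_list=['doc_id'] asks the streamer to stash the corresponding doc_id values as it iterates; the tests read the cache through stream.__dict__['doc_id_cache'], which is just plain attribute access spelled out. A minimal standalone sketch, assuming the same fixture values and import paths as above:

text_iter = iter([{'text': 'doomed to failure', 'doc_id': 'a'},
                  {'text': 'set for success', 'doc_id': '1'}])
stream = TextIterStreamer(text_iter=text_iter, tokenizer=TokenizerBasic())

tokens = list(stream.token_stream(cache_list=['doc_id']))
# tokens   -> [['doomed', 'failure'], ['set', 'success']]
doc_ids = stream.doc_id_cache
# doc_ids  -> ['a', '1'], the same list the tests reach via __dict__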
Example #9
    def test_info_stream(self):
        stream = TextIterStreamer(text_iter=self.text_iter, tokenizer=self.tokenizer)
        token_benchmark = [["doomed", "failure"], ["set", "success"]]
        text_benchmark = ["doomed to failure", "set for success"]
        token_result = []
        text_result = []
        for each in stream.info_stream():
            token_result.append(each["tokens"])
            text_result.append(each["text"])

        self.assertEqual(token_benchmark, token_result)
        self.assertEqual(text_benchmark, text_result)
Example #10
    def test_info_stream(self):
        stream = TextIterStreamer(text_iter=self.text_iter,
                                  tokenizer=self.tokenizer)
        token_benchmark = [['doomed', 'failure'], ['set', 'success']]
        text_benchmark = ['doomed to failure', 'set for success']
        token_result = []
        text_result = []
        for each in stream.info_stream():
            token_result.append(each['tokens'])
            text_result.append(each['text'])

        self.assertEqual(token_benchmark, token_result)
        self.assertEqual(text_benchmark, text_result)
Example #11
    def test_info_stream(self):
        stream = TextIterStreamer(text_iter=self.text_iter,
                                  tokenizer=self.tokenizer)
        token_benchmark = [['doomed', 'failure'],
                           ['set', 'success']]
        text_benchmark = ['doomed to failure', 'set for success']
        token_result = []
        text_result = []
        for each in stream.info_stream():
            token_result.append(each['tokens'])
            text_result.append(each['text'])

        self.assertEqual(token_benchmark, token_result)
        self.assertEqual(text_benchmark, text_result)
Example #12
    def convert(self, force=False):
        logger.info("Creating reviews file...")
        if force or not self.reviews_file.exists():
            with open(self.reviews_file, 'w') as fp:
                for line in self.input_file:
                    # Five metadata columns first; the review text is
                    # everything after the fifth space.
                    row = line.strip().split(' ', 5)
                    if len(row) > 5:
                        user_id, item_id, rating, timestamp, word_count, review = row
                        print(review, file=fp)

        logger.info("Creating filtered dat file...")

        def file_gen():
            with open(self.reviews_file, 'r') as fp:
                for index, line in enumerate(fp):
                    yield {
                        'text': line.strip(),
                        'doc_id': str(index),
                    }

        if force or not self.filtered_file.exists():
            with open(self.filtered_file, 'w') as fp:
                tokenizer = TokenizerBasic()
                stream = TextIterStreamer(file_gen(), tokenizer=tokenizer)
                stream.to_vw(fp, n_jobs=1)

        if force or not self.out_file.exists():
            sff = SFileFilter(VWFormatter())
            sff.load_sfile(self.filtered_file)

            # Summary views of the loaded sfile (results are not used below).
            df = sff.to_frame()
            df.head()
            df.describe()

            print("Filtering dat file...")
            sff.filter_extremes(doc_freq_min=50, doc_fraction_max=0.8)
            print("Sparsifying...")
            sff.compactify()
            sff.save(self.output_dir / 'sff_file.pkl')
            print("Outputting final file...")
            sff.filter_sfile(self.filtered_file, self.out_file)
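The least obvious line in Example #12 is the split: line.strip().split(' ', 5) caps the split at five, so the first five space-separated fields are metadata and everything after the fifth space survives intact as the review text. A made-up input line shows the shape the code expects (the field values here are invented, not taken from any real dataset):

# "<user_id> <item_id> <rating> <timestamp> <word_count> <review text ...>"
line = "u42 b007 5 1365811200 6 doomed to failure but ultimately a success"
user_id, item_id, rating, timestamp, word_count, review = line.strip().split(' ', 5)
assert review == "doomed to failure but ultimately a success"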