Example #1
0
 def test_regex_extract_not_enough_input2(self):
     """The dataset must not report ``full`` after a regex-filtered run.

     Feeds ``input_with_cases`` through a ``Featurizer`` whose label
     extractor is a ``WebCorpusExtractor`` with a single ``<CAS<...>``
     regex filter.
     """
     # NOTE(review): the meaning of the positional arguments (6, 11) is
     # defined by featurize.Featurizer -- confirm against its signature.
     case_patterns = [r'<CAS<([^<>]+)>']
     extractor = featurize.WebCorpusExtractor(regex_filter=case_patterns)
     featurizer = featurize.Featurizer(6, 11, label_extractor=extractor)
     stream = io.StringIO(input_with_cases)
     featurizer.featurize_stream(stream)
     self.assertFalse(featurizer.dataset.full)
Example #2
0
 def test_regex_extract(self):
     """A regex-filtered run over ``input_with_cases`` yields 4 samples.

     The label extractor is a ``WebCorpusExtractor`` with a single
     ``<CAS<...>`` regex filter.
     """
     # NOTE(review): the meaning of the positional arguments (2, 20) is
     # defined by featurize.Featurizer -- confirm against its signature.
     case_patterns = [r'<CAS<([^<>]+)>']
     extractor = featurize.WebCorpusExtractor(regex_filter=case_patterns)
     featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
     stream = io.StringIO(input_with_cases)
     featurizer.featurize_stream(stream)
     sample_count = len(featurizer.dataset)
     self.assertEqual(sample_count, 4)
Example #3
0
 def test_keep_duplicates(self):
     """With ``skip_duplicates=False`` the dataset has one sample per line.

     The expected size is the number of newline-separated pieces of
     ``input_with_duplicates`` (the fixture is not stripped first).
     """
     input_lines = input_with_duplicates.split('\n')
     expected_size = len(input_lines)
     featurizer = featurize.Featurizer(30, 300, skip_duplicates=False)
     stream = io.StringIO(input_with_duplicates)
     featurizer.featurize_stream(stream)
     self.assertEqual(len(featurizer.dataset), expected_size)
Example #4
0
 def test_empty_extractor2(self):
     """Without a label extractor, a known label appears in the dataset.

     Checks that ``'részletez/VERB<INF>'`` is among ``dataset.labels``
     after featurizing ``input_with_cases`` with no extractor passed.
     """
     expected_label = 'részletez/VERB<INF>'
     featurizer = featurize.Featurizer(3)
     stream = io.StringIO(input_with_cases)
     featurizer.featurize_stream(stream)
     self.assertIn(expected_label, featurizer.dataset.labels)
Example #5
0
 def test_empty_extractor(self):
     """Without a label extractor, the dataset has one sample per line.

     The expected size is the number of lines in ``input_with_cases``
     after surrounding whitespace has been stripped.
     """
     input_lines = input_with_cases.strip().split('\n')
     expected_size = len(input_lines)
     featurizer = featurize.Featurizer(3)
     stream = io.StringIO(input_with_cases)
     featurizer.featurize_stream(stream)
     self.assertEqual(len(featurizer.dataset), expected_size)
Example #6
0
 def test_pos_extract_not_enough_input(self):
     """The dataset must not report ``full`` after a grep-filtered run.

     Feeds ``input_simple`` through a ``Featurizer`` whose label extractor
     is a ``WebCorpusExtractor`` with the grep filters NOUN and VERB.
     """
     # NOTE(review): the meaning of the positional arguments (200, 20) is
     # defined by featurize.Featurizer -- confirm against its signature.
     pos_tags = ["NOUN", "VERB"]
     extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
     featurizer = featurize.Featurizer(200, 20, label_extractor=extractor)
     stream = io.StringIO(input_simple)
     featurizer.featurize_stream(stream)
     self.assertFalse(featurizer.dataset.full)
Example #7
0
 def test_pos_extract(self):
     """A grep-filtered run over ``input_simple`` fills the dataset.

     Expects exactly 4 samples and ``dataset.full`` to be true when the
     label extractor is a ``WebCorpusExtractor`` with the grep filters
     NOUN and VERB.
     """
     # NOTE(review): the meaning of the positional arguments (2, 20) is
     # defined by featurize.Featurizer -- confirm against its signature.
     pos_tags = ["NOUN", "VERB"]
     extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
     featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
     stream = io.StringIO(input_simple)
     featurizer.featurize_stream(stream)
     sample_count = len(featurizer.dataset)
     self.assertEqual(sample_count, 4)
     self.assertTrue(featurizer.dataset.full)