Ejemplo n.º 1
0
 def test_regex_extract_not_enough_input2(self):
     """Featurizer(6, 11) over ``input_with_cases`` must not fill the dataset.

     Labels are extracted with a single regex that captures the text
     following ``<CAS<`` up to the closing ``>``.
     """
     case_pattern = r'<CAS<([^<>]+)>'
     extractor = featurize.WebCorpusExtractor(regex_filter=[case_pattern])
     # NOTE(review): the meaning of the two positional arguments (6, 11) is
     # not visible from this test -- confirm against featurize.Featurizer.
     featurizer = featurize.Featurizer(6, 11, label_extractor=extractor)
     stream = io.StringIO(input_with_cases)
     featurizer.featurize_stream(stream)
     self.assertFalse(featurizer.dataset.full)
Ejemplo n.º 2
0
 def test_grep_and_regex_filter(self):
     """Check label extraction when grep and regex filters are combined.

     The extractor is built with ``grep_filter=["NOUN", "VERB"]`` and a
     regex capturing the text between ``<`` and ``>`` that contains no
     nested angle brackets.  ``"NOUN<CAS<ACC>"`` is expected to produce
     ``"NOUNACC"``; ``"<CAS<ACC>"``, which contains neither grep token,
     is expected to produce ``None``.
     """
     extractor = featurize.WebCorpusExtractor(
         grep_filter=["NOUN", "VERB"],
         regex_filter=[r'<([^<>]+)>']
     )
     self.assertEqual(extractor.extract_label("NOUN<CAS<ACC>"), "NOUNACC")
     # assertIsNone checks identity with None and reports a clearer failure
     # message than assertEqual(..., None).
     self.assertIsNone(extractor.extract_label("<CAS<ACC>"))
Ejemplo n.º 3
0
 def test_regex_extract(self):
     """Featurizer(2, 20) over ``input_with_cases`` must yield 4 samples.

     Labels are extracted with a single regex that captures the text
     following ``<CAS<`` up to the closing ``>``.
     """
     case_pattern = r'<CAS<([^<>]+)>'
     extractor = featurize.WebCorpusExtractor(regex_filter=[case_pattern])
     featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
     stream = io.StringIO(input_with_cases)
     featurizer.featurize_stream(stream)
     sample_count = len(featurizer.dataset)
     self.assertEqual(sample_count, 4)
Ejemplo n.º 4
0
 def test_regex_filter(self):
     """Check ``extract_label`` with three regex filters and no grep filter.

     Expected labels, as asserted below:
       * ``"abc"``   -> ``'a'``    (captured by ``([abc])``)
       * ``"d92"``   -> ``'d9'``   (captured by ``(\\w\\d)\\d``)
       * ``"defg"``  -> ``'defg'`` (captured by ``^(defg)$``)
       * ``"defgh"`` -> ``None``   (no pattern matches)
     """
     extractor = featurize.WebCorpusExtractor(regex_filter=[
         r'([abc])', r'(\w\d)\d', r'^(defg)$',
     ])
     self.assertEqual(extractor.extract_label("abc"), 'a')
     self.assertEqual(extractor.extract_label("d92"), 'd9')
     self.assertEqual(extractor.extract_label("defg"), 'defg')
     # assertIsNone checks identity with None and reports a clearer failure
     # message than assertEqual(..., None).
     self.assertIsNone(extractor.extract_label("defgh"))
Ejemplo n.º 5
0
 def test_pos_extract_not_enough_input(self):
     """Featurizer(200, 20) over ``input_simple`` must not fill the dataset.

     Labels are extracted with a grep filter on "NOUN" and "VERB".
     """
     pos_tags = ["NOUN", "VERB"]
     extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
     # NOTE(review): the meaning of the two positional arguments (200, 20) is
     # not visible from this test -- confirm against featurize.Featurizer.
     featurizer = featurize.Featurizer(200, 20, label_extractor=extractor)
     stream = io.StringIO(input_simple)
     featurizer.featurize_stream(stream)
     self.assertFalse(featurizer.dataset.full)
Ejemplo n.º 6
0
 def test_pos_extract(self):
     """Featurizer(2, 20) over ``input_simple`` must yield a full, 4-sample dataset.

     Labels are extracted with a grep filter on "NOUN" and "VERB".
     """
     pos_tags = ["NOUN", "VERB"]
     extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
     featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
     stream = io.StringIO(input_simple)
     featurizer.featurize_stream(stream)
     sample_count = len(featurizer.dataset)
     self.assertEqual(sample_count, 4)
     self.assertTrue(featurizer.dataset.full)
Ejemplo n.º 7
0
 def test_grep_filter(self):
     """Check ``extract_label`` with only a grep filter on "NOUN" and "VERB".

     Expected labels, as asserted below:
       * ``"abc"``         -> ``None``   (contains neither token)
       * ``"NOUNabc"``     -> ``"NOUN"``
       * ``"NOUNabcVERB"`` -> ``"NOUN"`` (input contains both tokens)
     """
     extractor = featurize.WebCorpusExtractor(grep_filter=["NOUN", "VERB"])
     # assertIsNone checks identity with None and reports a clearer failure
     # message than assertEqual(..., None).
     self.assertIsNone(extractor.extract_label("abc"))
     self.assertEqual(extractor.extract_label("NOUNabc"), "NOUN")
     self.assertEqual(extractor.extract_label("NOUNabcVERB"), "NOUN")
Ejemplo n.º 8
0
 def test_echo_filter(self):
     """An extractor built without filters must return "abc" for input "abc"."""
     extractor = featurize.WebCorpusExtractor()
     label = extractor.extract_label("abc")
     self.assertEqual(label, "abc")