def test_regex_extract_not_enough_input2(self):
    """Dataset must not report full after featurizing ``input_with_cases``.

    Labels are taken from the capture group of the ``<CAS<...>`` pattern and
    the Featurizer is built with the positional arguments ``(6, 11)``.
    NOTE(review): presumably these limits ask for more samples than the
    fixture provides -- confirm against ``featurize.Featurizer``.
    """
    case_patterns = [
        r'<CAS<([^<>]+)>',
    ]
    extractor = featurize.WebCorpusExtractor(regex_filter=case_patterns)
    featurizer = featurize.Featurizer(6, 11, label_extractor=extractor)
    stream = io.StringIO(input_with_cases)
    featurizer.featurize_stream(stream)
    self.assertFalse(featurizer.dataset.full)
def test_grep_and_regex_filter(self):
    """Check ``extract_label`` when grep and regex filters are combined.

    Expected per the assertions below:
    * ``"NOUN<CAS<ACC>"`` -> ``"NOUNACC"``
    * ``"<CAS<ACC>"``     -> ``None``
    NOTE(review): the second tag contains neither "NOUN" nor "VERB", which
    is presumably why no label is produced -- confirm in WebCorpusExtractor.
    """
    extractor = featurize.WebCorpusExtractor(
        grep_filter=["NOUN", "VERB"],
        regex_filter=[r'<([^<>]+)>'],
    )
    self.assertEqual(extractor.extract_label("NOUN<CAS<ACC>"), "NOUNACC")
    # assertIsNone is the dedicated unittest check for a None result: it
    # tests identity and reports the offending value on failure.
    self.assertIsNone(extractor.extract_label("<CAS<ACC>"))
def test_regex_extract(self):
    """Featurizing ``input_with_cases`` should yield a dataset of length 4.

    Labels are taken from the capture group of the ``<CAS<...>`` pattern and
    the Featurizer is built with the positional arguments ``(2, 20)``.
    """
    case_patterns = [
        r'<CAS<([^<>]+)>',
    ]
    extractor = featurize.WebCorpusExtractor(regex_filter=case_patterns)
    featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
    stream = io.StringIO(input_with_cases)
    featurizer.featurize_stream(stream)
    sample_count = len(featurizer.dataset)
    self.assertEqual(sample_count, 4)
def test_regex_filter(self):
    """Check ``extract_label`` with three regex patterns configured.

    Expected per the assertions below:
    * ``"abc"``   -> ``'a'``
    * ``"d92"``   -> ``'d9'``
    * ``"defg"``  -> ``'defg'``
    * ``"defgh"`` -> ``None``
    """
    extractor = featurize.WebCorpusExtractor(regex_filter=[
        r'([abc])',
        r'(\w\d)\d',
        r'^(defg)$',
    ])
    self.assertEqual(extractor.extract_label("abc"), 'a')
    self.assertEqual(extractor.extract_label("d92"), 'd9')
    self.assertEqual(extractor.extract_label("defg"), 'defg')
    # assertIsNone is the dedicated unittest check for a None result: it
    # tests identity and reports the offending value on failure.
    self.assertIsNone(extractor.extract_label("defgh"))
def test_pos_extract_not_enough_input(self):
    """Dataset must not report full after featurizing ``input_simple``.

    Labels are selected with the grep filter ``["NOUN", "VERB"]`` and the
    Featurizer is built with the positional arguments ``(200, 20)``.
    NOTE(review): presumably 200 exceeds what the fixture can supply --
    confirm against ``featurize.Featurizer``.
    """
    pos_tags = ["NOUN", "VERB"]
    extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
    featurizer = featurize.Featurizer(200, 20, label_extractor=extractor)
    stream = io.StringIO(input_simple)
    featurizer.featurize_stream(stream)
    self.assertFalse(featurizer.dataset.full)
def test_pos_extract(self):
    """Featurizing ``input_simple`` should give a full dataset of length 4.

    Labels are selected with the grep filter ``["NOUN", "VERB"]`` and the
    Featurizer is built with the positional arguments ``(2, 20)``.
    """
    pos_tags = ["NOUN", "VERB"]
    extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
    featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
    stream = io.StringIO(input_simple)
    featurizer.featurize_stream(stream)
    sample_count = len(featurizer.dataset)
    self.assertEqual(sample_count, 4)
    self.assertTrue(featurizer.dataset.full)
def test_grep_filter(self):
    """Check ``extract_label`` with only a grep filter configured.

    Expected per the assertions below:
    * ``"abc"``         -> ``None``
    * ``"NOUNabc"``     -> ``"NOUN"``
    * ``"NOUNabcVERB"`` -> ``"NOUN"``
    """
    extractor = featurize.WebCorpusExtractor(grep_filter=["NOUN", "VERB"])
    # assertIsNone is the dedicated unittest check for a None result: it
    # tests identity and reports the offending value on failure.
    self.assertIsNone(extractor.extract_label("abc"))
    self.assertEqual(extractor.extract_label("NOUNabc"), "NOUN")
    self.assertEqual(extractor.extract_label("NOUNabcVERB"), "NOUN")
def test_echo_filter(self):
    """An extractor built with no arguments should return "abc" unchanged."""
    extractor = featurize.WebCorpusExtractor()
    label = extractor.extract_label("abc")
    self.assertEqual(label, "abc")