def test_regex_extract_not_enough_input2(self):
    """The dataset must not report ``full`` for this regex-filtered run.

    Builds a ``WebCorpusExtractor`` with a single ``<CAS<...>`` regex filter,
    feeds ``input_with_cases`` through ``Featurizer(6, 11)`` and checks that
    ``dataset.full`` is falsy afterwards.
    """
    case_pattern = r'<CAS<([^<>]+)>'
    extractor = featurize.WebCorpusExtractor(regex_filter=[case_pattern])
    featurizer = featurize.Featurizer(6, 11, label_extractor=extractor)
    stream = io.StringIO(input_with_cases)
    featurizer.featurize_stream(stream)
    self.assertFalse(featurizer.dataset.full)
def test_regex_extract(self):
    """Regex-filtered featurizing of ``input_with_cases`` yields 4 samples.

    Uses a ``WebCorpusExtractor`` with a single ``<CAS<...>`` regex filter
    together with ``Featurizer(2, 20)``.
    """
    case_pattern = r'<CAS<([^<>]+)>'
    extractor = featurize.WebCorpusExtractor(regex_filter=[case_pattern])
    featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
    stream = io.StringIO(input_with_cases)
    featurizer.featurize_stream(stream)
    sample_count = len(featurizer.dataset)
    self.assertEqual(sample_count, 4)
def test_keep_duplicates(self):
    """With ``skip_duplicates=False`` every input segment ends up in the dataset.

    The expected size is the number of newline-separated segments of
    ``input_with_duplicates`` (no stripping is applied before counting).
    """
    # Number of '\n'-separated pieces == number of separators + 1.
    expected_size = input_with_duplicates.count('\n') + 1
    featurizer = featurize.Featurizer(30, 300, skip_duplicates=False)
    stream = io.StringIO(input_with_duplicates)
    featurizer.featurize_stream(stream)
    self.assertEqual(len(featurizer.dataset), expected_size)
def test_empty_extractor2(self):
    """Without a label extractor, a full analysis string appears as a label.

    Featurizes ``input_with_cases`` with ``Featurizer(3)`` and checks that
    ``'részletez/VERB<INF>'`` is among ``dataset.labels``.
    """
    expected_label = 'részletez/VERB<INF>'
    featurizer = featurize.Featurizer(3)
    stream = io.StringIO(input_with_cases)
    featurizer.featurize_stream(stream)
    self.assertIn(expected_label, featurizer.dataset.labels)
def test_empty_extractor(self):
    """Without a label extractor the dataset holds one sample per input line.

    The expected size is the number of lines in ``input_with_cases`` after
    stripping surrounding whitespace; the data is run through
    ``Featurizer(3)``.
    """
    stripped_input = input_with_cases.strip()
    input_lines = stripped_input.split('\n')
    expected_size = len(input_lines)
    featurizer = featurize.Featurizer(3)
    stream = io.StringIO(input_with_cases)
    featurizer.featurize_stream(stream)
    self.assertEqual(len(featurizer.dataset), expected_size)
def test_pos_extract_not_enough_input(self):
    """The dataset must not report ``full`` for this grep-filtered run.

    Uses a ``WebCorpusExtractor`` with the grep filters ``NOUN`` and ``VERB``
    and ``Featurizer(200, 20)`` over ``input_simple``.
    """
    pos_tags = ["NOUN", "VERB"]
    extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
    featurizer = featurize.Featurizer(200, 20, label_extractor=extractor)
    stream = io.StringIO(input_simple)
    featurizer.featurize_stream(stream)
    self.assertFalse(featurizer.dataset.full)
def test_pos_extract(self):
    """Grep-filtered featurizing of ``input_simple`` yields a full dataset of 4.

    Uses a ``WebCorpusExtractor`` with the grep filters ``NOUN`` and ``VERB``
    and ``Featurizer(2, 20)``; checks both the dataset length and that
    ``dataset.full`` is truthy.
    """
    pos_tags = ["NOUN", "VERB"]
    extractor = featurize.WebCorpusExtractor(grep_filter=pos_tags)
    featurizer = featurize.Featurizer(2, 20, label_extractor=extractor)
    stream = io.StringIO(input_simple)
    featurizer.featurize_stream(stream)
    dataset = featurizer.dataset
    self.assertEqual(len(dataset), 4)
    self.assertTrue(dataset.full)