def __init__(self, path, line_parser_re, lang):
    """Read the annotated file at *path* and initialise the parent with it.

    The file is opened with ``codecs.open`` using the ISO-8859-1 encoding,
    read in one go, and its content is forwarded to
    ``AnnotatedText.__init__`` along with *line_parser_re* and *lang*.
    The path is kept on the instance as ``self.path``.

    Raises:
        AssertionError: if *path* or *line_parser_re* is falsy (only when
            assertions are enabled, i.e. not under ``python -O``).
    """
    # Both arguments are mandatory; check them before touching the disk.
    assert path
    assert line_parser_re

    self.path = path
    with codecs.open(path, encoding='iso-8859-1') as handle:
        content = handle.read()
        AnnotatedText.__init__(
            self,
            content,
            line_parser_re=line_parser_re,
            lang=lang,
        )
def test_annotated_text(self):
    """Check AnnotatedText construction, tag discovery and token iteration.

    Covers: rejection of a missing line parser, the tag names/groups
    derived from the named groups of the line regex, plain token
    iteration (optionally filtered by tag name), rejection of an unknown
    tag name, ignored lines, and IOB-merged tokens.
    """
    def check_offsets(found, text):
        # For every token, the slice of ``text`` starting at item 1 and
        # spanning item 2 characters must equal item 0.
        for tok in found:
            start, length = tok[1], tok[2]
            self.assertEqual(text[start:start + length], tok[0])

    # A parser regex is mandatory.
    self.assertRaises(AssertionError, AnnotatedText, 'abcd',
                      line_parser_re=None)

    # --- first fixture: word followed by two tag columns ---------------
    # NOTE(review): this fixture is a single physical line in the version
    # reviewed; the rows presumably belong on separate lines -- confirm
    # against the canonical fixture before merging.
    sample = ''' WORD1 TAG11 TAG21 WORD2 TAG12 TAG21 WORD3 TAG11 O '''
    word_first_re = re.compile(r'([^ ]+) (?P<TAG1>[^ ]+) (?P<TAG2>[^ ]+)')
    a = AnnotatedText(sample, line_parser_re=word_first_re)

    self.assertEqual(len(a.tags_names), 2)
    self.assertEqual(a.tags_names, ['TAG1', 'TAG2'])
    self.assertEqual(len(a.tags_groups), 2)
    self.assertEqual(a.tags_groups, [2, 3])

    tokens = list(a.tokens())
    self.assertEqual(len(tokens), 3)
    self.assertEqual(len(a._tokens), 6)

    expected_tags = (('TAG11', 'TAG21'), ('TAG12', 'TAG21'), ('TAG11', 'O'))
    for idx, word in enumerate(('WORD1', 'WORD2', 'WORD3')):
        self.assertEqual(word, tokens[idx][0])
        self.assertEqual(expected_tags[idx], tokens[idx][3])
    check_offsets(tokens, str(a))

    # Filtering by tag name.
    tokens = list(a.tokens('TAG1'))
    self.assertEqual(len(tokens), 3)
    tokens = list(a.tokens('TAG2'))
    self.assertEqual(len(tokens), 2)

    # An unknown tag name is rejected; the iterator is drained inside the
    # callable so the error surfaces even if it is raised lazily.
    self.assertRaises(ValueError, lambda: list(a.tokens('TAG3')))

    # --- second fixture: tag, word, IOB tag, plus a line to ignore -----
    # NOTE(review): same single-line caveat as the first fixture.
    sample = ''' TAG11 WORD1 I-TAG21 WARNING TAG12 WORD2 B-TAG21 TAG11 WORD3 O O WORD4 I-TAG21 O WORD5 B-TAG21 O WORD6 I-TAG21 O WORD7 O '''
    tag_first_re = re.compile(r'(?P<TAG1>[^ ]+) ([^ ]+) (?P<TAG2>[^ ]+)')
    skip_re = re.compile(r'WARNING')
    a = AnnotatedText(sample, line_parser_re=tag_first_re,
                      ignore_line_re=skip_re)

    self.assertEqual(len(a.tags_names), 2)
    self.assertEqual(a.tags_names, ['TAG1', 'TAG2'])
    self.assertEqual(len(a.tags_groups), 2)
    self.assertEqual(a.tags_groups, [1, 3])

    tokens = list(a.tokens())
    self.assertEqual(len(tokens), 7)
    self.assertEqual(len(a._tokens), 13)

    # IOB iteration is expected to yield four tokens, the last one being
    # the two-word span 'WORD5 WORD6'.
    tokens = list(a.iob_tokens('TAG2'))
    expected_words = ('WORD1', 'WORD2', 'WORD4', 'WORD5 WORD6')
    self.assertEqual(len(tokens), 4)
    for idx, tok in enumerate(tokens):
        self.assertEqual(tok[0], expected_words[idx])
        self.assertEqual(tok[3], 'TAG21')
    check_offsets(tokens, str(a))
def f():
    """Build an AnnotatedText from 'abcd' with no line parser.

    The constructed object is discarded; the helper exists to be handed
    to ``assertRaises(AssertionError, f)`` in ``test_annotated_text``.
    """
    text, parser = 'abcd', None
    AnnotatedText(text, line_parser_re=parser)
def test_annotated_text(self):
    """Check AnnotatedText construction, tag discovery and token iteration.

    Covers: rejection of a missing line parser, the tag names/groups
    derived from the named groups of the line regex, plain token
    iteration (optionally filtered by tag name), rejection of an unknown
    tag name, ignored lines, and IOB-merged tokens.
    """
    def check_offsets(found, text):
        # For every token, the slice of ``text`` starting at item 1 and
        # spanning item 2 characters must equal item 0.
        for tok in found:
            start, length = tok[1], tok[2]
            self.assertEqual(text[start:start + length], tok[0])

    # A parser regex is mandatory.
    self.assertRaises(AssertionError, AnnotatedText, 'abcd',
                      line_parser_re=None)

    # --- first fixture: word followed by two tag columns ---------------
    # NOTE(review): this fixture is a single physical line in the version
    # reviewed; the rows presumably belong on separate lines -- confirm
    # against the canonical fixture before merging.
    sample = ''' WORD1 TAG11 TAG21 WORD2 TAG12 TAG21 WORD3 TAG11 O '''
    word_first_re = re.compile(r'([^ ]+) (?P<TAG1>[^ ]+) (?P<TAG2>[^ ]+)')
    a = AnnotatedText(sample, line_parser_re=word_first_re)

    self.assertEqual(len(a.tags_names), 2)
    self.assertEqual(a.tags_names, ['TAG1', 'TAG2'])
    self.assertEqual(len(a.tags_groups), 2)
    self.assertEqual(a.tags_groups, [2, 3])

    tokens = list(a.tokens())
    self.assertEqual(len(tokens), 3)
    self.assertEqual(len(a._tokens), 6)

    expected_tags = (('TAG11', 'TAG21'), ('TAG12', 'TAG21'), ('TAG11', 'O'))
    for idx, word in enumerate(('WORD1', 'WORD2', 'WORD3')):
        self.assertEqual(word, tokens[idx][0])
        self.assertEqual(expected_tags[idx], tokens[idx][3])
    check_offsets(tokens, str(a))

    # Filtering by tag name.
    tokens = list(a.tokens('TAG1'))
    self.assertEqual(len(tokens), 3)
    tokens = list(a.tokens('TAG2'))
    self.assertEqual(len(tokens), 2)

    # An unknown tag name is rejected; the iterator is drained inside the
    # callable so the error surfaces even if it is raised lazily.
    self.assertRaises(ValueError, lambda: list(a.tokens('TAG3')))

    # --- second fixture: tag, word, IOB tag, plus a line to ignore -----
    # NOTE(review): same single-line caveat as the first fixture.
    sample = ''' TAG11 WORD1 I-TAG21 WARNING TAG12 WORD2 B-TAG21 TAG11 WORD3 O O WORD4 I-TAG21 O WORD5 B-TAG21 O WORD6 I-TAG21 O WORD7 O '''
    tag_first_re = re.compile(r'(?P<TAG1>[^ ]+) ([^ ]+) (?P<TAG2>[^ ]+)')
    skip_re = re.compile(r'WARNING')
    a = AnnotatedText(sample, line_parser_re=tag_first_re,
                      ignore_line_re=skip_re)

    self.assertEqual(len(a.tags_names), 2)
    self.assertEqual(a.tags_names, ['TAG1', 'TAG2'])
    self.assertEqual(len(a.tags_groups), 2)
    self.assertEqual(a.tags_groups, [1, 3])

    tokens = list(a.tokens())
    self.assertEqual(len(tokens), 7)
    self.assertEqual(len(a._tokens), 13)

    # IOB iteration is expected to yield four tokens, the last one being
    # the two-word span 'WORD5 WORD6'.
    tokens = list(a.iob_tokens('TAG2'))
    expected_words = ('WORD1', 'WORD2', 'WORD4', 'WORD5 WORD6')
    self.assertEqual(len(tokens), 4)
    for idx, tok in enumerate(tokens):
        self.assertEqual(tok[0], expected_words[idx])
        self.assertEqual(tok[3], 'TAG21')
    check_offsets(tokens, str(a))
def __init__(self, path, line_parser_re, lang):
    """Read the annotated file at *path* and initialise the parent with it.

    The file is opened with ``codecs.open`` using the ISO-8859-1 encoding,
    read in one go, and its content is forwarded to
    ``AnnotatedText.__init__`` along with *line_parser_re* and *lang*.
    The path is kept on the instance as ``self.path``.

    Raises:
        AssertionError: if *path* or *line_parser_re* is falsy (only when
            assertions are enabled, i.e. not under ``python -O``).
    """
    # Both arguments are mandatory; check them before touching the disk.
    assert path
    assert line_parser_re

    self.path = path
    with codecs.open(path, encoding='iso-8859-1') as handle:
        content = handle.read()
        AnnotatedText.__init__(
            self,
            content,
            line_parser_re=line_parser_re,
            lang=lang,
        )