Exemple #1
0
 def __init__(self, path, line_parser_re, lang):
     """Load the file at *path* and initialise the annotated text from it.

     The file is decoded as ISO-8859-1 and its whole content is handed to
     ``AnnotatedText.__init__`` together with *line_parser_re* and *lang*.

     Raises:
         AssertionError: if *path* or *line_parser_re* is falsy.
     """
     assert path and line_parser_re
     self.path = path
     with codecs.open(path, encoding='iso-8859-1') as source:
         content = source.read()
         AnnotatedText.__init__(
             self, content, line_parser_re=line_parser_re, lang=lang)
Exemple #2
0
    def test_annotated_text(self):
        """Check AnnotatedText tag discovery and token iteration on two fixtures."""

        def check_offsets(items, text):
            # item[1] is used as a start offset and item[2] as a length:
            # slicing the text with them must give back item[0].
            for item in items:
                start = item[1]
                end = start + item[2]
                self.assertEqual(text[start:end], item[0])

        # Constructing without a line parser regex must fail an assertion.
        self.assertRaises(AssertionError, AnnotatedText, 'abcd',
                          line_parser_re=None)

        # --- Fixture 1: word first, then two tag columns ---
        sample = '''
        WORD1 TAG11 TAG21
        
        WORD2 TAG12 TAG21
        WORD3 TAG11 O
        '''

        parser = re.compile(r'([^ ]+) (?P<TAG1>[^ ]+) (?P<TAG2>[^ ]+)')
        annotated = AnnotatedText(sample, line_parser_re=parser)
        self.assertEqual(len(annotated.tags_names), 2)
        self.assertEqual(annotated.tags_names, ['TAG1', 'TAG2'])
        self.assertEqual(len(annotated.tags_groups), 2)
        self.assertEqual(annotated.tags_groups, [2, 3])

        all_tokens = list(annotated.tokens())
        self.assertEqual(len(all_tokens), 3)
        self.assertEqual(len(annotated._tokens), 6)

        expected_words = ('WORD1', 'WORD2', 'WORD3')
        expected_tags = (('TAG11', 'TAG21'), ('TAG12', 'TAG21'), ('TAG11', 'O'))
        for idx, word in enumerate(expected_words):
            self.assertEqual(word, all_tokens[idx][0])
            self.assertEqual(expected_tags[idx], all_tokens[idx][3])

        check_offsets(all_tokens, str(annotated))

        # Filtering by tag name changes how many tokens are yielded.
        self.assertEqual(len(list(annotated.tokens('TAG1'))), 3)
        self.assertEqual(len(list(annotated.tokens('TAG2'))), 2)

        # An unknown tag name is rejected once the result is consumed.
        self.assertRaises(ValueError,
                          lambda: list(annotated.tokens('TAG3')))

        # --- Fixture 2: tag first, IOB-style second tag, one ignored line ---
        sample = '''
        TAG11 WORD1 I-TAG21
        
        WARNING
        
        TAG12 WORD2 B-TAG21
        TAG11 WORD3 O
        
        O WORD4 I-TAG21
        O WORD5 B-TAG21
        O WORD6 I-TAG21
        O WORD7 O
        '''
        parser = re.compile(r'(?P<TAG1>[^ ]+) ([^ ]+) (?P<TAG2>[^ ]+)')
        skip = re.compile(r'WARNING')
        annotated = AnnotatedText(sample, line_parser_re=parser,
                                  ignore_line_re=skip)
        self.assertEqual(len(annotated.tags_names), 2)
        self.assertEqual(annotated.tags_names, ['TAG1', 'TAG2'])
        self.assertEqual(len(annotated.tags_groups), 2)
        self.assertEqual(annotated.tags_groups, [1, 3])

        all_tokens = list(annotated.tokens())
        self.assertEqual(len(all_tokens), 7)
        self.assertEqual(len(annotated._tokens), 13)

        chunks = list(annotated.iob_tokens('TAG2'))
        expected_chunks = ('WORD1', 'WORD2', 'WORD4', 'WORD5 WORD6')
        self.assertEqual(len(chunks), 4)
        for idx, chunk in enumerate(chunks):
            self.assertEqual(chunk[0], expected_chunks[idx])
            self.assertEqual(chunk[3], 'TAG21')
        check_offsets(chunks, str(annotated))
Exemple #3
0
 def f():
     """Build an AnnotatedText from 'abcd' with no line parser regex."""
     missing_parser = None
     AnnotatedText('abcd', line_parser_re=missing_parser)
Exemple #4
0
    def test_annotated_text(self):
        """Check AnnotatedText tag discovery and token iteration on two fixtures."""

        def verify_spans(entries, full_text):
            # entry[1] is used as a start offset and entry[2] as a length:
            # the slice they select must equal entry[0].
            for entry in entries:
                begin = entry[1]
                stop = begin + entry[2]
                self.assertEqual(full_text[begin:stop], entry[0])

        # Constructing without a line parser regex must fail an assertion.
        self.assertRaises(AssertionError, AnnotatedText, 'abcd',
                          line_parser_re=None)

        # --- Fixture 1: word first, then two tag columns ---
        fixture = '''
        WORD1 TAG11 TAG21
        
        WORD2 TAG12 TAG21
        WORD3 TAG11 O
        '''

        line_re = re.compile(r'([^ ]+) (?P<TAG1>[^ ]+) (?P<TAG2>[^ ]+)')
        doc = AnnotatedText(fixture, line_parser_re=line_re)
        self.assertEqual(len(doc.tags_names), 2)
        self.assertEqual(doc.tags_names, ['TAG1', 'TAG2'])
        self.assertEqual(len(doc.tags_groups), 2)
        self.assertEqual(doc.tags_groups, [2, 3])

        found = list(doc.tokens())
        self.assertEqual(len(found), 3)
        self.assertEqual(len(doc._tokens), 6)

        words = ('WORD1', 'WORD2', 'WORD3')
        tag_pairs = (('TAG11', 'TAG21'), ('TAG12', 'TAG21'), ('TAG11', 'O'))
        for pos, word in enumerate(words):
            self.assertEqual(word, found[pos][0])
            self.assertEqual(tag_pairs[pos], found[pos][3])

        verify_spans(found, str(doc))

        # Filtering by tag name changes how many tokens are yielded.
        self.assertEqual(len(list(doc.tokens('TAG1'))), 3)
        self.assertEqual(len(list(doc.tokens('TAG2'))), 2)

        # An unknown tag name is rejected once the result is consumed.
        self.assertRaises(ValueError, lambda: list(doc.tokens('TAG3')))

        # --- Fixture 2: tag first, IOB-style second tag, one ignored line ---
        fixture = '''
        TAG11 WORD1 I-TAG21
        
        WARNING
        
        TAG12 WORD2 B-TAG21
        TAG11 WORD3 O
        
        O WORD4 I-TAG21
        O WORD5 B-TAG21
        O WORD6 I-TAG21
        O WORD7 O
        '''
        line_re = re.compile(r'(?P<TAG1>[^ ]+) ([^ ]+) (?P<TAG2>[^ ]+)')
        ignore_re = re.compile(r'WARNING')
        doc = AnnotatedText(fixture, line_parser_re=line_re,
                            ignore_line_re=ignore_re)
        self.assertEqual(len(doc.tags_names), 2)
        self.assertEqual(doc.tags_names, ['TAG1', 'TAG2'])
        self.assertEqual(len(doc.tags_groups), 2)
        self.assertEqual(doc.tags_groups, [1, 3])

        found = list(doc.tokens())
        self.assertEqual(len(found), 7)
        self.assertEqual(len(doc._tokens), 13)

        merged = list(doc.iob_tokens('TAG2'))
        merged_words = ('WORD1', 'WORD2', 'WORD4', 'WORD5 WORD6')
        self.assertEqual(len(merged), 4)
        for pos, entry in enumerate(merged):
            self.assertEqual(entry[0], merged_words[pos])
            self.assertEqual(entry[3], 'TAG21')
        verify_spans(merged, str(doc))
Exemple #5
0
 def __init__(self, path, line_parser_re, lang):
     """Read the file at *path* and initialise the annotated text with it.

     The file is opened with ISO-8859-1 decoding; its full content is
     passed to ``AnnotatedText.__init__`` along with *line_parser_re* and
     *lang*.

     Raises:
         AssertionError: if *path* or *line_parser_re* is falsy.
     """
     assert path and line_parser_re
     self.path = path
     with codecs.open(path, encoding='iso-8859-1') as handle:
         file_text = handle.read()
         AnnotatedText.__init__(
             self,
             file_text,
             line_parser_re=line_parser_re,
             lang=lang)