Esempio n. 1
0
def test_sgml_reconstruction():
    """Check that SGML -> BIO -> spans -> SGML round-trips up to whitespace.

    The reference dataset is read in one go and split into examples with
    ``line_groups`` using the ``<NEW.*?>`` delimiter pattern.  Each example is
    converted to (label, token) pairs with ``sgml2bio``, the labels are turned
    into (label, begin, end) spans with ``bio2span``, and the markup is rebuilt
    from those spans and compared with the original example.

    Raises:
        AssertionError: if a rebuilt example differs from the original in
            anything other than whitespace.
        IOError: if the reference dataset cannot be opened.
    """
    # NOTE(review): machine-specific absolute path; the test can only run
    # where this file exists.
    reference_dataset = '/home/timv/projects/data/citations/tagged_references.txt'

    # ``open`` rather than the Python-2-only ``file`` constructor.  The whole
    # file is read up front, so the handle can be closed before processing.
    with open(reference_dataset, 'r') as f:
        contents = f.read()

    for sgml in line_groups(contents, '<NEW.*?>'):
        # Transpose the (label, token) pairs into two parallel sequences.
        labels, tokens = zip(*sgml2bio(sgml))

        # Convert spans back to SGML: each span wraps tokens[begin:end] in
        # an opening and closing tag named after the span's label.
        spans = bio2span(labels)
        reconstructed = ' '.join(
            '<%s>%s</%s>' % (label, ' '.join(tokens[begin:end]), label)
            for (label, begin, end) in spans
        )

        assert equals_mod_whitespace(reconstructed, sgml), \
            'reconstructed example should only differ in whitespace.'

    # Single-argument parenthesised print behaves identically on Python 2
    # and is also valid Python 3.
    print('passed sgml reconstruction test.')
Esempio n. 2
0
def _run_bio2span_cases(cases, include_O):
    """Run ``bio2span`` over ``cases`` and report each result on stdout.

    Args:
        cases: iterable of ``(labels, expected_spans)`` pairs.
        include_O: forwarded to ``bio2span`` as its ``include_O`` keyword.

    Returns:
        A list of ``(labels, expected, got)`` triples, one per failing case.
    """
    failures = []
    for labels, expected in cases:
        print(labels)
        got = bio2span(labels, include_O=include_O)
        if expected == got:
            print('\033[32mpassed\033[0m')
        else:
            print('\033[31mfailed:\033[0m')
            # %-formatting keeps the output identical on Python 2 and 3
            # (a multi-argument ``print(...)`` would print a tuple on 2).
            print('  expected: %s' % (expected,))
            print('       got: %s' % (got,))
            failures.append((labels, expected, got))
        print('')
    return failures


def test_bio2span():
    """Exercise ``bio2span`` with and without 'O' spans.

    Every case is run and reported (green "passed" / red "failed" with the
    expected and actual spans).  Failures from both groups are collected so
    the full report is printed before the test fails.

    Raises:
        AssertionError: if any case in either group produced spans different
            from the expected ones.
    """
    tests = [
        (['I-NUM', 'I-TEMP'],
         [Span(label='NUM', begins=0, ends=1), Span(label='TEMP', begins=1, ends=2)]),
        (['I-NUM', 'B-TEMP'],
         [Span(label='NUM', begins=0, ends=1), Span(label='TEMP', begins=1, ends=2)]),
        (['B-NUM', 'B-TEMP'],
         [Span(label='NUM', begins=0, ends=1), Span(label='TEMP', begins=1, ends=2)]),
        # NOTE(review): identical to the previous case; possibly a different
        # label sequence was intended here -- confirm with the author.
        (['B-NUM', 'B-TEMP'],
         [Span(label='NUM', begins=0, ends=1), Span(label='TEMP', begins=1, ends=2)]),
        (['B-NUM', 'O'], [Span(label='NUM', begins=0, ends=1)]),
        (['O', 'B-NUM'], [Span(label='NUM', begins=1, ends=2)]),
        (['O', 'B-NUM', 'O'], [Span(label='NUM', begins=1, ends=2)]),
        (['O', 'B-NUM', 'I-NUM'], [Span(label='NUM', begins=1, ends=3)]),
        (
            ['O', 'O', 'O',
             'I-NUM', 'I-NUM', 'I-NUM',
             'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP',
             'O',
             'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM',
             'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            [Span(label='NUM', begins=3, ends=6),
             Span(label='TEMP', begins=6, ends=14),
             Span(label='NUM', begins=15, ends=23),
             ]
        ),
        (
            ['O', 'O', 'O', 'O', 'O', 'O',
             'I-TEMP',
             'I-NUM', 'I-NUM', 'I-NUM',
             'O',
             'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM',
             'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
             'I-TEMP', 'I-TEMP', 'I-TEMP',
             'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            [Span(label='TEMP', begins=6, ends=7),
             Span(label='NUM', begins=7, ends=10),
             Span(label='NUM', begins=11, ends=15),
             Span(label='TEMP', begins=23, ends=26),
             ]
        ),
        (
            ['O', 'O', 'O',
             'B-NUM', 'I-NUM', 'I-NUM', 'I-NUM',
             'B-NUM', 'I-NUM', 'I-NUM', 'I-NUM',
             'O',
             'B-NUM', 'I-NUM', 'I-NUM',
             'O', 'O', 'O',
             'B-TEMP', 'I-TEMP',
             'O'],
            [Span(label='NUM', begins=3, ends=7),
             Span(label='NUM', begins=7, ends=11),
             Span(label='NUM', begins=12, ends=15),
             Span(label='TEMP', begins=18, ends=20),
             ]
        ),
    ]

    # Previously a mismatch in this group was only printed and never failed
    # the test; failures are now collected and raised at the end.
    failures = _run_bio2span_cases(tests, include_O=False)

    print('*************************************************************************')
    print('** Including O Spans.')

    include_O_tests = [
        (['O', 'B-NUM', 'I-DATE'],
         [Span(label='O', begins=0, ends=1),
          Span(label='NUM', begins=1, ends=2),
          Span(label='DATE', begins=2, ends=3)]
         ),
        (['O', 'B-NUM', 'O', 'I-DATE'],
         [Span(label='O', begins=0, ends=1),
          Span(label='NUM', begins=1, ends=2),
          Span(label='O', begins=2, ends=3),
          Span(label='DATE', begins=3, ends=4)]
         ),
        (['O', 'B-NUM', 'O', 'I-DATE', 'O', 'O'],
         [Span(label='O', begins=0, ends=1),
          Span(label='NUM', begins=1, ends=2),
          Span(label='O', begins=2, ends=3),
          Span(label='DATE', begins=3, ends=4),
          Span(label='O', begins=4, ends=5),
          Span(label='O', begins=5, ends=6)]
         ),
        (['O'],
         [Span(label='O', begins=0, ends=1)]
         ),
    ]

    failures += _run_bio2span_cases(include_O_tests, include_O=True)

    if failures:
        raise AssertionError('%d bio2span case(s) failed' % len(failures))

    print('passed test_bio2span')