Ejemplo n.º 1
0
def normalize_temporal_expressions(content, reference_date):
    """
    Tag and normalise temporal expressions in a text using TERNIP.

    The text is wrapped in a ``<TimeML>`` root element, loaded into a
    ``TimeMlDocument``, run through the TERNIP recogniser and normaliser,
    and serialised back to a string.

    Params:
        content (str): Tokenized string
        reference_date (date): Reference date. It is handed to the
            normaliser formatted as ``%Y%m%d``.

    Returns:
        str: ``str()`` of the annotated document, with every occurrence of
            the unsupported annotation codes removed.
    """
    recogniser = ternip.recogniser()
    normaliser = ternip.normaliser()

    # The document class is told that "TimeML" is the root element, so the
    # raw text has to be wrapped in that element first.
    wrapped = f'<TimeML>\n{content}\n</TimeML>'
    doc = TimeMlDocument(wrapped, "TimeML")
    sents = recogniser.tag(doc.get_sents())

    normaliser.annotate(sents, reference_date.strftime('%Y%m%d'))
    doc.reconcile(sents)

    xml_str = str(doc)

    # NOTE(review): these look like TIMEX time-of-day codes (midnight,
    # morning, afternoon, evening, night) -- confirm. The replacement is a
    # plain substring removal over the whole serialised document, so the
    # same character sequences appearing in the text itself are removed too.
    unsupported_annotations = ("T24", "TMO", "TAF", "TEV", "TNI")

    for code in unsupported_annotations:
        xml_str = xml_str.replace(code, "")

    return xml_str
Ejemplo n.º 2
0
 def test_create_from_sents(self):
     # Two sentences of (token, POS, timex-set) triples, created without
     # offsets or extra tags; the expected serialisation is the tokens
     # separated by single spaces inside a <TimeML> root.
     first_sentence = [(token, 'POS', set())
                       for token in ('This', 'is', 'some', 'annotated',
                                     'text.')]
     second_sentence = [(token, 'POS', set())
                        for token in ('This', 'is', 'a', 'second',
                                      'sentence.')]
     document = TimeMlDocument.create([first_sentence, second_sentence])

     actual = str(document)
     expected = xml.dom.minidom.parseString(
         '<TimeML>This is some annotated text. This is a second sentence.</TimeML>'
     ).toxml()
     self.assertEqual(actual, expected)
Ejemplo n.º 3
0
 def test_create_from_sents_with_offsets(self):
     # Same two sentences as the plain test, but with one offset per token;
     # the expected text shows the padding those offsets produce (two
     # leading spaces, and "a" directly followed by "second").
     first_sentence = [(token, 'POS', set())
                       for token in ('This', 'is', 'some', 'annotated',
                                     'text.')]
     second_sentence = [(token, 'POS', set())
                        for token in ('This', 'is', 'a', 'second',
                                      'sentence.')]
     offsets = [[2, 7, 11, 16, 28],
                [36, 41, 45, 46, 53]]
     document = TimeMlDocument.create([first_sentence, second_sentence],
                                      tok_offsets=offsets)

     actual = str(document)
     expected = xml.dom.minidom.parseString(
         '<TimeML>  This is  some annotated   text.   This is  asecond sentence.</TimeML>'
     ).toxml()
     self.assertEqual(actual, expected)
Ejemplo n.º 4
0
 def test_create_from_sents_with_offsets_tags(self):
     # Builds the document with offsets plus sentence/token tags and a POS
     # attribute, checks the serialised markup, then checks that
     # get_sents() gives back the sentences that were passed in.
     sents = [
         [(token, 'POS', set())
          for token in ('This', 'is', 'some', 'annotated', 'text.')],
         [(token, 'POS', set())
          for token in ('This', 'is', 'a', 'second', 'sentence.')],
     ]
     offsets = [[2, 7, 11, 16, 28],
                [36, 41, 45, 46, 53]]
     document = TimeMlDocument.create(sents,
                                      tok_offsets=offsets,
                                      add_S='s',
                                      add_LEX='lex',
                                      pos_attr='pos')

     actual = str(document)
     expected = xml.dom.minidom.parseString(
         '<TimeML>  <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">some</lex> <lex pos="POS">annotated</lex>   <lex pos="POS">text.</lex></s>   <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">a</lex><lex pos="POS">second</lex> <lex pos="POS">sentence.</lex></s></TimeML>'
     ).toxml()
     self.assertEqual(actual, expected)
     self.assertEqual(sents, document.get_sents())
Ejemplo n.º 5
0
 def test_create_from_sents(self):
     # Two sentences of (token, POS, timex-set) triples, created without
     # offsets or extra tags; the expected serialisation is the tokens
     # separated by single spaces inside a <TimeML> root.
     s = TimeMlDocument.create([
         [('This', 'POS', set()), ('is', 'POS', set()),
          ('some', 'POS', set()), ('annotated', 'POS', set()),
          ('text.', 'POS', set())],
         [('This', 'POS', set()), ('is', 'POS', set()),
          ('a', 'POS', set()), ('second', 'POS', set()),
          ('sentence.', 'POS', set())],
     ])
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(
         str(s),
         xml.dom.minidom.parseString(
             '<TimeML>This is some annotated text. This is a second sentence.</TimeML>'
         ).toxml())
Ejemplo n.º 6
0
 def test_create_from_sents_with_offsets_tags(self):
     # Builds the document with offsets plus sentence/token tags and a POS
     # attribute, checks the serialised markup, then checks that
     # get_sents() gives back the sentences that were passed in.
     sents = [
         [('This', 'POS', set()), ('is', 'POS', set()),
          ('some', 'POS', set()), ('annotated', 'POS', set()),
          ('text.', 'POS', set())],
         [('This', 'POS', set()), ('is', 'POS', set()),
          ('a', 'POS', set()), ('second', 'POS', set()),
          ('sentence.', 'POS', set())],
     ]
     s = TimeMlDocument.create(sents,
                               tok_offsets=[[2, 7, 11, 16, 28],
                                            [36, 41, 45, 46, 53]],
                               add_S='s',
                               add_LEX='lex',
                               pos_attr='pos')
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(
         str(s),
         xml.dom.minidom.parseString(
             '<TimeML>  <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">some</lex> <lex pos="POS">annotated</lex>   <lex pos="POS">text.</lex></s>   <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">a</lex><lex pos="POS">second</lex> <lex pos="POS">sentence.</lex></s></TimeML>'
         ).toxml())
     self.assertEqual(sents, s.get_sents())
Ejemplo n.º 7
0
 def test_create_from_sents_with_offsets(self):
     # Two sentences created with one offset per token; the expected text
     # shows the padding those offsets produce (two leading spaces, and
     # "a" directly followed by "second").
     s = TimeMlDocument.create(
         [
             [('This', 'POS', set()), ('is', 'POS', set()),
              ('some', 'POS', set()), ('annotated', 'POS', set()),
              ('text.', 'POS', set())],
             [('This', 'POS', set()), ('is', 'POS', set()),
              ('a', 'POS', set()), ('second', 'POS', set()),
              ('sentence.', 'POS', set())],
         ],
         tok_offsets=[[2, 7, 11, 16, 28], [36, 41, 45, 46, 53]])
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(
         str(s),
         xml.dom.minidom.parseString(
             '<TimeML>  This is  some annotated   text.   This is  asecond sentence.</TimeML>'
         ).toxml())