Exemple #1
0
def dothenlp():
    """Annotate the posted TERN document with timexes and return it as JSON.

    On a POST request, reads the ``myData`` field of the JSON body, builds a
    ``TernDocument`` from it, tags and normalises temporal expressions
    against the document's ``<DATE_TIME>`` value (hyphens removed), and
    reconciles the annotations back into the document.

    Returns:
        ``jsonify(result=...)`` where the result is the document text that
        precedes ``<TEXT>`` followed by a ``<SENTENCES>`` section holding the
        ``str()`` of the sentence list produced by ``nltk.sent_tokenize``;
        or ``jsonify(result="something wrong")`` if any step raises.
        For non-POST requests the function falls through and returns
        ``None``.
    """
    import logging

    if request.method == "POST":
        try:
            myData = request.json['myData']
            recogniser = ternip.recogniser()
            normaliser = ternip.normaliser()
            doc = TernDocument(myData)

            # The reference date is read before annotation touches the document.
            strdoc = str(doc)
            ref_date = find_between(strdoc, "<DATE_TIME>", "</DATE_TIME>").replace("-", "")

            sents = recogniser.tag(doc.get_sents())
            normaliser.annotate(sents, ref_date)
            doc.reconcile(sents)

            # Render the reconciled document once and reuse it for both parts.
            annotated = str(doc)

            # Single Sentences
            body = find_between(annotated, "<TEXT>", "</TEXT>").replace("\n", " <br>")
            sent_tokens = nltk.sent_tokenize(_restore_placeholders(body))

            header = _restore_placeholders(annotated.split("<TEXT>")[0])

            output = header + "\n\n<SENTENCES>\n\n" + str(sent_tokens) + "\n\n</SENTENCES>"
            return jsonify(result=output)
        except Exception:
            # Keep the generic client-facing message, but record the cause;
            # SystemExit / KeyboardInterrupt are no longer swallowed.
            logging.getLogger(__name__).exception("dothenlp failed")
            return jsonify(result="something wrong")


def _restore_placeholders(text):
    """Replace the _QUOTE_/_APOSTROPHE_/_AND_ placeholders with their HTML forms."""
    for placeholder, replacement in (
        ("_QUOTE_", "&quot;"),
        ("_APOSTROPHE_", "&#39;"),
        ("_AND_", "&"),
    ):
        text = text.replace(placeholder, replacement)
    return text
Exemple #2
0
 def test_reconcile_DCT_sents_DATE(self):
     """reconcile_dct() wraps the <DATE> text in a TIMEX2 tag carrying the timex value."""
     d = TernDocument('<DOC><DOCNO>ABC123</DOCNO><DATE>20100801</DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>')
     s = d.get_dct_sents()
     t = Timex()
     t.value = 'ABCDEF'
     # Attach the timex to the set held in the third slot of the first token.
     s[0][0][2].add(t)
     d.reconcile_dct(s)
     # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual(str(d), xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><DATE><TIMEX2 VAL="ABCDEF">20100801</TIMEX2></DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>').toxml())
Exemple #3
0
 def test_create_from_sents(self):
     """create() from two tagged sentences serialises the tokens joined by single spaces."""
     s = TernDocument.create([[('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())],
                                     [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())]], 'ABC123')
     # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual(str(s), xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>').toxml())
Exemple #4
0
 def test_get_DCT_sents_None(self):
     """get_dct_sents() returns one empty sentence when the document has no date element."""
     d = TernDocument('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>')
     # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual([[]], d.get_dct_sents())
Exemple #5
0
 def test_get_DCT_sents_DATE(self):
     """get_dct_sents() returns the <DATE> content as a single 'CD' token with no timexes."""
     d = TernDocument('<DOC><DOCNO>ABC123</DOCNO><DATE>20100801</DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>')
     # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual([[('20100801', 'CD', set())]], d.get_dct_sents())
Exemple #6
0
 def test_create_from_sents_with_offsets_tags(self):
     """create() with token offsets and S/LEX tagging keeps the spacing and round-trips via get_sents()."""
     sents = [[('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())],
              [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())]]
     s = TernDocument.create(sents, 'ABC123', tok_offsets=[[2, 7, 11, 16, 28], [36, 41, 45, 46, 53]], add_S='s', add_LEX='lex', pos_attr='pos')
     # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual(str(s), xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>  <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">some</lex> <lex pos="POS">annotated</lex>   <lex pos="POS">text.</lex></s>   <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">a</lex><lex pos="POS">second</lex> <lex pos="POS">sentence.</lex></s></TEXT></BODY></DOC>').toxml())
     # The tagged document must parse back to the same sentence structure.
     self.assertEqual(sents, s.get_sents())
Exemple #7
0
 def test_create_from_sents_with_dct(self):
     """create() with dct='20100802' emits a DATE_TIME element reading 08/02/2010."""
     s = TernDocument.create([[('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())],
                                     [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())]], 'ABC123',
                                     dct='20100802')
     # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual(str(s), xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><DATE_TIME>08/02/2010</DATE_TIME><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>').toxml())
Exemple #8
0
# Counters start at zero; presumably accumulated per document further down
# (the rest of the loop is not visible here).
ternip_extent_acts = 0
ternip_extent_cors = 0
ternip_norm_poss = 0
ternip_norm_acts = 0
ternip_norm_cors = 0

# NOTE(review): time.clock() was removed in Python 3.8 (time.perf_counter()
# is the replacement).  Left unchanged here because code outside this view
# may measure elapsed time against the same clock.
start = time.clock()

for i in range(len(unannotated)):

    id = os.path.basename(unannotated[i])

    # Open the document
    try:
        with open(unannotated[i]) as fd:
            doc = TernDocument(fd.read())
    except Exception:
        # Skip documents that cannot be read or parsed.  Catching Exception
        # rather than using a bare except lets Ctrl-C / SystemExit stop the
        # whole batch instead of merely skipping one file.
        doc = None
        print("Unable to load document", id)
        continue

    # Get DCT
    dct_sents = doc.get_dct_sents()
    dct_sents = recogniser.tag(dct_sents)
    # 'XXXXXXXX' is passed where a reference date would otherwise go.
    normaliser.annotate(dct_sents, 'XXXXXXXX')
    doc.reconcile_dct(dct_sents)
    # Use the value of a timex found on the first DCT token, if there is one.
    if len(dct_sents) > 0 and len(dct_sents[0]) > 0 and len(
            dct_sents[0][0][2]) > 0:
        dct = dct_sents[0][0][2].pop().value
    else:
        dct = ''
import sys
import os
import os.path
import logging

sys.path.append('..')
from ternip.formats.tern import TernDocument

# Send root-logger records to the console and show INFO and above for 'ternip'.
LOGGER = logging.getLogger()

console = logging.StreamHandler()
console.setFormatter(logging.Formatter('[%(asctime)s] %(name)-12s %(levelname)-8s %(message)s'))
LOGGER.addHandler(console)
logging.getLogger('ternip').setLevel(logging.INFO)

# Output directory for the tagged copies.
if not os.path.isdir('preprocessed'):
    os.mkdir('preprocessed')

# Tagging options shared by both reconcile calls.
tag_options = dict(add_S='s', add_LEX='lex', pos_attr='pos')
source_pattern = os.path.normpath('../sample_data/tern/data/english/ace_2004/*/*.sgm')

for fpath in glob(source_pattern):
    file_name = os.path.basename(fpath)
    with open(fpath) as source:
        try:
            doc = TernDocument(source.read())
            LOGGER.info("Pre-processing " + file_name)

            # Write sentence/token markup into both the DCT and the body.
            doc.reconcile_dct(doc.get_dct_sents(), **tag_options)
            doc.reconcile(doc.get_sents(), **tag_options)

            target_path = os.path.join('preprocessed', file_name)
            with open(target_path, 'w') as target:
                # Drops the first 22 characters of the serialised document;
                # presumably the leading XML declaration - TODO confirm.
                target.write(str(doc)[22:])
        except Exception:
            # Log the traceback and move on to the next file.
            LOGGER.exception('Can not load document ' + file_name)

Exemple #10
0
# Counters start at zero; presumably accumulated per document further down
# (the rest of the loop is not visible here).
ternip_extent_acts = 0
ternip_extent_cors = 0
ternip_norm_poss = 0
ternip_norm_acts = 0
ternip_norm_cors = 0

# NOTE(review): time.clock() was removed in Python 3.8 (time.perf_counter()
# is the replacement).  Left unchanged here because code outside this view
# may measure elapsed time against the same clock.
start = time.clock()

for i in range(len(unannotated)):

    id = os.path.basename(unannotated[i])

    # Open the document
    try:
        with open(unannotated[i]) as fd:
            doc = TernDocument(fd.read())
    except Exception:
        # Skip documents that cannot be read or parsed.  Catching Exception
        # rather than using a bare except lets Ctrl-C / SystemExit stop the
        # whole batch instead of merely skipping one file.
        doc = None
        # Single-argument form prints the same text under Python 2 and 3
        # (the old print statement is a syntax error on Python 3).
        print("Unable to load document %s" % id)
        continue

    # Get DCT
    dct_sents = doc.get_dct_sents()
    dct_sents = recogniser.tag(dct_sents)
    # 'XXXXXXXX' is passed where a reference date would otherwise go.
    normaliser.annotate(dct_sents, 'XXXXXXXX')
    doc.reconcile_dct(dct_sents)
    # Use the value of a timex found on the first DCT token, if there is one.
    if len(dct_sents) > 0 and len(dct_sents[0]) > 0 and len(dct_sents[0][0][2]) > 0:
        dct = dct_sents[0][0][2].pop().value
    else:
        dct = ''
    sents = recogniser.tag(doc.get_sents())
Exemple #11
0
 def test_get_DCT_sents_None(self):
     """With no date element in the document, the DCT sentences are a single empty list."""
     source = '<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>'
     document = TernDocument(source)
     dct_sents = document.get_dct_sents()
     self.assertEqual([[]], dct_sents)
Exemple #12
0
 def test_get_DCT_sents_DATE(self):
     """The <DATE> content comes back as one 'CD' token with an empty timex set."""
     source = '<DOC><DOCNO>ABC123</DOCNO><DATE>20100801</DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>'
     document = TernDocument(source)
     expected = [[('20100801', 'CD', set())]]
     self.assertEqual(expected, document.get_dct_sents())
Exemple #13
0
 def test_create_from_sents_with_offsets_tags(self):
     """Creating with offsets plus S/LEX tags preserves spacing and round-trips the sentences."""
     first = [('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())]
     second = [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())]
     sents = [first, second]
     offsets = [[2, 7, 11, 16, 28], [36, 41, 45, 46, 53]]

     document = TernDocument.create(sents, 'ABC123', tok_offsets=offsets, add_S='s', add_LEX='lex', pos_attr='pos')

     expected_xml = xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>  <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">some</lex> <lex pos="POS">annotated</lex>   <lex pos="POS">text.</lex></s>   <s><lex pos="POS">This</lex> <lex pos="POS">is</lex>  <lex pos="POS">a</lex><lex pos="POS">second</lex> <lex pos="POS">sentence.</lex></s></TEXT></BODY></DOC>').toxml()
     self.assertEqual(str(document), expected_xml)
     # Parsing the tagged document must give back the input sentences.
     self.assertEqual(sents, document.get_sents())
Exemple #14
0
 def test_create_from_sents_with_offsets(self):
     """Creating with token offsets (no tags) lays the tokens out at those offsets."""
     first = [('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())]
     second = [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())]
     offsets = [[2, 7, 11, 16, 28], [36, 41, 45, 46, 53]]

     document = TernDocument.create([first, second], 'ABC123', tok_offsets=offsets)

     expected_xml = xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>  This is  some annotated   text.   This is  asecond sentence.</TEXT></BODY></DOC>').toxml()
     self.assertEqual(str(document), expected_xml)