def dothenlp():
    """Run the TERNIP recogniser/normaliser over text posted as JSON.

    On a POST request the ``myData`` field of the JSON body is wrapped in a
    ``TernDocument``, tagged with ``recogniser.tag``, annotated with
    ``normaliser.annotate`` and written back with ``doc.reconcile``.

    Returns:
        A ``jsonify`` response whose ``result`` is the part of the document
        before ``<TEXT>`` followed by the sentence-tokenised body between
        ``<SENTENCES>`` markers, or ``"something wrong"`` if any step raised.
        For non-POST requests nothing is returned (``None``), as before.
    """
    if request.method == "POST":
        try:
            myData = request.json['myData']
            recogniser = ternip.recogniser()
            normaliser = ternip.normaliser()
            doc = TernDocument(myData)

            # The reference date is read from the untagged document and has
            # its dashes removed before being handed to the normaliser.
            strdoc = str(doc)
            ref_date = find_between(
                strdoc, "<DATE_TIME>", "</DATE_TIME>").replace("-", "")

            sents = recogniser.tag(doc.get_sents())
            normaliser.annotate(sents, ref_date)
            doc.reconcile(sents)

            # Serialise the reconciled document once and reuse it.
            annotated = str(doc)

            # Single Sentences
            s = find_between(annotated, "<TEXT>", "</TEXT>").replace(
                "\n", " <br>")
            s = _restore_placeholders(s)
            sent_tokens = nltk.sent_tokenize(s)

            # Everything in front of the <TEXT> element.
            t = _restore_placeholders(annotated.split("<TEXT>")[0])

            output = (t + "\n\n<SENTENCES>\n\n" + str(sent_tokens)
                      + "\n\n</SENTENCES>")
            return jsonify(result=output)
        except Exception:
            # Best-effort endpoint: any failure is reported with a generic
            # message.  ``Exception`` (rather than a bare ``except``) lets
            # SystemExit / KeyboardInterrupt propagate.
            return jsonify(result="something wrong")


def _restore_placeholders(text):
    """Replace the _QUOTE_/_APOSTROPHE_/_AND_ placeholders with characters."""
    # NOTE(review): the original replacement for _QUOTE_ was an unterminated
    # literal; a plain double-quote character is assumed here - confirm.
    text = text.replace("_QUOTE_", '"')
    text = text.replace("_APOSTROPHE_", "'")
    text = text.replace("_AND_", "&")
    return text
def test_reconcile_DCT_sents_DATE(self):
    """reconcile_dct() wraps the <DATE> text in a TIMEX2 tag with the timex value."""
    d = TernDocument('<DOC><DOCNO>ABC123</DOCNO><DATE>20100801</DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>')
    s = d.get_dct_sents()

    # Attach a timex to the first token of the first DCT sentence.
    t = Timex()
    t.value = 'ABCDEF'
    s[0][0][2].add(t)
    d.reconcile_dct(s)

    expected = xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><DATE><TIMEX2 VAL="ABCDEF">20100801</TIMEX2></DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>').toxml()
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(str(d), expected)
def test_create_from_sents(self):
    """TernDocument.create() serialises two tagged sentences into a DOC with the given DOCNO."""
    sents = [
        [('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())],
        [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())],
    ]
    s = TernDocument.create(sents, 'ABC123')

    expected = xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>').toxml()
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(str(s), expected)
def test_get_DCT_sents_None(self):
    """get_dct_sents() returns [[]] for a document that has no DATE element."""
    d = TernDocument('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>')
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual([[]], d.get_dct_sents())
def test_get_DCT_sents_DATE(self):
    """get_dct_sents() returns the <DATE> text as one 'CD' token with an empty timex set."""
    d = TernDocument('<DOC><DOCNO>ABC123</DOCNO><DATE>20100801</DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>')
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual([[('20100801', 'CD', set())]], d.get_dct_sents())
def test_create_from_sents_with_offsets_tags(self):
    """create() with token offsets and S/LEX tag names emits tagged XML that round-trips.

    The document is built with ``add_S='s'``, ``add_LEX='lex'`` and
    ``pos_attr='pos'``; the test checks the serialised form and that
    ``get_sents()`` returns the sentences that were passed in.
    """
    sents = [
        [('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())],
        [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())],
    ]
    s = TernDocument.create(
        sents,
        'ABC123',
        tok_offsets=[[2, 7, 11, 16, 28], [36, 41, 45, 46, 53]],
        add_S='s',
        add_LEX='lex',
        pos_attr='pos',
    )

    expected = xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT> <s><lex pos="POS">This</lex> <lex pos="POS">is</lex> <lex pos="POS">some</lex> <lex pos="POS">annotated</lex> <lex pos="POS">text.</lex></s> <s><lex pos="POS">This</lex> <lex pos="POS">is</lex> <lex pos="POS">a</lex><lex pos="POS">second</lex> <lex pos="POS">sentence.</lex></s></TEXT></BODY></DOC>').toxml()
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(str(s), expected)
    self.assertEqual(sents, s.get_sents())
def test_create_from_sents_with_dct(self):
    """create() with dct='20100802' adds a DATE_TIME element reading 08/02/2010."""
    sents = [
        [('This', 'POS', set()), ('is', 'POS', set()), ('some', 'POS', set()), ('annotated', 'POS', set()), ('text.', 'POS', set())],
        [('This', 'POS', set()), ('is', 'POS', set()), ('a', 'POS', set()), ('second', 'POS', set()), ('sentence.', 'POS', set())],
    ]
    s = TernDocument.create(sents, 'ABC123', dct='20100802')

    expected = xml.dom.minidom.parseString('<DOC><DOCNO>ABC123</DOCNO><DATE_TIME>08/02/2010</DATE_TIME><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>').toxml()
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(str(s), expected)
# TERNIP extent/normalisation counters, all starting at zero.
# NOTE(review): they are not updated within this span - presumably the rest
# of the loop (outside this view) increments them.
ternip_extent_acts = 0
ternip_extent_cors = 0
ternip_norm_poss = 0
ternip_norm_acts = 0
ternip_norm_cors = 0

# time.clock() was removed in Python 3.8.  Keep using it where it still
# exists so any later time.clock() reading stays on the same clock, and fall
# back to time.perf_counter() otherwise.
# NOTE(review): whatever computes the elapsed time must use the same clock.
start = time.clock() if hasattr(time, 'clock') else time.perf_counter()

# `i` and `id` are kept as loop names because code beyond this span may read
# them.
for i, doc_path in enumerate(unannotated):
    id = os.path.basename(doc_path)

    # Open the document
    try:
        with open(doc_path) as fd:
            doc = TernDocument(fd.read())
    except Exception:
        # Skip documents that cannot be read or parsed.  Catching Exception
        # (not a bare except) lets Ctrl-C / SystemExit stop the run.
        doc = None
        print("Unable to load document", id)
        continue

    # Get DCT
    dct_sents = doc.get_dct_sents()
    dct_sents = recogniser.tag(dct_sents)
    normaliser.annotate(dct_sents, 'XXXXXXXX')
    doc.reconcile_dct(dct_sents)

    # Use the value of a timex attached to the first DCT token, if there is
    # one; otherwise the DCT is the empty string.
    if dct_sents and dct_sents[0] and dct_sents[0][0][2]:
        dct = dct_sents[0][0][2].pop().value
    else:
        dct = ''
import sys
import os
import os.path
import logging
from glob import glob  # glob() is called below but was never imported

# Make the parent directory importable before importing ternip.
sys.path.append('..')

from ternip.formats.tern import TernDocument

# Send log records to the console with a timestamped format.
console = logging.StreamHandler()
console.setFormatter(logging.Formatter('[%(asctime)s] %(name)-12s %(levelname)-8s %(message)s'))
logging.getLogger().addHandler(console)
logging.getLogger('ternip').setLevel(logging.INFO)
LOGGER = logging.getLogger()

# Output directory for the pre-processed documents.
if not os.path.isdir('preprocessed'):
    os.mkdir('preprocessed')

for fpath in glob(os.path.normpath('../sample_data/tern/data/english/ace_2004/*/*.sgm')):
    fname = os.path.basename(fpath)
    with open(fpath) as fd:
        try:
            doc = TernDocument(fd.read())
            # NOTE(review): the root logger's level is not set in this
            # script, so with the default WARNING level this INFO record is
            # dropped - confirm whether that is intended.
            LOGGER.info("Pre-processing %s", fname)

            # Write sentence / token / POS markup back into the DCT and the
            # body, using <s> and <lex pos="..."> as the tag names.
            doc.reconcile_dct(doc.get_dct_sents(), add_S='s', add_LEX='lex', pos_attr='pos')
            doc.reconcile(doc.get_sents(), add_S='s', add_LEX='lex', pos_attr='pos')

            with open(os.path.join('preprocessed', fname), 'w') as ppfd:
                # Drops the first 22 characters of the serialised document.
                # NOTE(review): presumably the '<?xml version="1.0" ?>'
                # declaration (22 characters) - confirm.
                ppfd.write(str(doc)[22:])
        except Exception:
            # Log the traceback and carry on with the next file.
            LOGGER.exception('Can not load document %s', fname)
# TERNIP extent/normalisation counters, all starting at zero.
# NOTE(review): they are not updated within this span - presumably the rest
# of the loop (outside this view) increments them.
ternip_extent_acts = 0
ternip_extent_cors = 0
ternip_norm_poss = 0
ternip_norm_acts = 0
ternip_norm_cors = 0

# time.clock() was removed in Python 3.8.  Keep using it where it exists
# (including Python 2) and fall back to time.perf_counter() otherwise.
# NOTE(review): whatever computes the elapsed time must use the same clock.
start = time.clock() if hasattr(time, 'clock') else time.perf_counter()

# `i` and `id` are kept as loop names because code beyond this span may read
# them.
for i, doc_path in enumerate(unannotated):
    id = os.path.basename(doc_path)

    # Open the document
    try:
        with open(doc_path) as fd:
            doc = TernDocument(fd.read())
    except Exception:
        # Skip documents that cannot be read or parsed.  Catching Exception
        # (not a bare except) lets Ctrl-C / SystemExit stop the run.
        doc = None
        # A single pre-formatted argument prints the same text whether
        # print is the Python 2 statement or the Python 3 function.
        print("Unable to load document %s" % id)
        continue

    # Get DCT
    dct_sents = doc.get_dct_sents()
    dct_sents = recogniser.tag(dct_sents)
    normaliser.annotate(dct_sents, 'XXXXXXXX')
    doc.reconcile_dct(dct_sents)

    # Use the value of a timex attached to the first DCT token, if there is
    # one; otherwise the DCT is the empty string.
    if dct_sents and dct_sents[0] and dct_sents[0][0][2]:
        dct = dct_sents[0][0][2].pop().value
    else:
        dct = ''

    sents = recogniser.tag(doc.get_sents())
def test_get_DCT_sents_None(self):
    """A document with no DATE element produces [[]] from get_dct_sents()."""
    markup = '<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>'
    document = TernDocument(markup)
    no_dct = [[]]
    self.assertEqual(no_dct, document.get_dct_sents())
def test_get_DCT_sents_DATE(self):
    """The <DATE> text comes back from get_dct_sents() as a single 'CD' token."""
    markup = '<DOC><DOCNO>ABC123</DOCNO><DATE>20100801</DATE><BODY><TEXT>This is some annotated text. This is a second sentence.</TEXT></BODY></DOC>'
    document = TernDocument(markup)

    # One sentence holding one (text, POS tag, timex set) token.
    date_token = ('20100801', 'CD', set())
    self.assertEqual([[date_token]], document.get_dct_sents())
def test_create_from_sents_with_offsets_tags(self):
    """Offsets plus S/LEX tag names give tagged XML whose sentences round-trip.

    Tokens given adjacent offsets (45 and 46) are expected to be serialised
    with no space between their <lex> elements.
    """
    first = [(word, 'POS', set())
             for word in ('This', 'is', 'some', 'annotated', 'text.')]
    second = [(word, 'POS', set())
              for word in ('This', 'is', 'a', 'second', 'sentence.')]
    sents = [first, second]
    offsets = [[2, 7, 11, 16, 28], [36, 41, 45, 46, 53]]

    s = TernDocument.create(sents, 'ABC123', tok_offsets=offsets,
                            add_S='s', add_LEX='lex', pos_attr='pos')

    expected_markup = (
        '<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT> '
        '<s><lex pos="POS">This</lex> <lex pos="POS">is</lex> '
        '<lex pos="POS">some</lex> <lex pos="POS">annotated</lex> '
        '<lex pos="POS">text.</lex></s> '
        '<s><lex pos="POS">This</lex> <lex pos="POS">is</lex> '
        '<lex pos="POS">a</lex><lex pos="POS">second</lex> '
        '<lex pos="POS">sentence.</lex></s></TEXT></BODY></DOC>'
    )
    self.assertEqual(str(s), xml.dom.minidom.parseString(expected_markup).toxml())
    self.assertEqual(sents, s.get_sents())
def test_create_from_sents_with_offsets(self):
    """create() with token offsets but no tag names serialises plain text.

    Tokens given adjacent offsets (45 and 46) are expected to be joined
    without a space ("asecond").
    """
    first = [(word, 'POS', set())
             for word in ('This', 'is', 'some', 'annotated', 'text.')]
    second = [(word, 'POS', set())
              for word in ('This', 'is', 'a', 'second', 'sentence.')]
    offsets = [[2, 7, 11, 16, 28], [36, 41, 45, 46, 53]]

    s = TernDocument.create([first, second], 'ABC123', tok_offsets=offsets)

    expected_markup = (
        '<DOC><DOCNO>ABC123</DOCNO><BODY><TEXT> '
        'This is some annotated text. This is asecond sentence.'
        '</TEXT></BODY></DOC>'
    )
    self.assertEqual(str(s), xml.dom.minidom.parseString(expected_markup).toxml())