def test_create_terms():
    """Can we create terms via the create_{term,wf} functions?

    Creates four word-form/term pairs, then reads them back through
    get_tokens()/get_terms() and checks the round trip matches the input.
    """
    naf = KafNafParser(type="NAF")
    # (word, lemma, pos, morphofeat); renamed from `input` to avoid
    # shadowing the builtin.
    fixtures = [(u'dit', u'dit', u'O', u'VNW'),
                (u'is', u'zijn', u'V', u'WW'),
                (u'een', u'een', u'D', u'LID'),
                (u'test', u'test', u'N', u'N')]
    offset = 0
    for (word, lemma, pos, morph) in fixtures:
        token = naf.create_wf(word, 1, offset)  # all tokens in sentence 1
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])
    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    # Lexical sort of term ids is adequate for the four terms created here.
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(fixtures, result)
def test_create_terms():
    """Can we create terms via the create_{term,wf} functions?

    Builds a tiny four-token NAF document and verifies that the
    token/term layers read back exactly what was written.
    """
    naf = KafNafParser(type="NAF")
    # Each row is (word, lemma, pos, morphofeat). Named `expected`
    # instead of `input` so the builtin is not shadowed; the unused
    # `sent` variable and the duplicated `offset = 0` are dropped.
    expected = [(u'dit', u'dit', u'O', u'VNW'),
                (u'is', u'zijn', u'V', u'WW'),
                (u'een', u'een', u'D', u'LID'),
                (u'test', u'test', u'N', u'N')]
    offset = 0
    for (word, lemma, pos, morph) in expected:
        token = naf.create_wf(word, 1, offset)  # sentence id fixed at 1
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])
    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    # Sorting term ids lexically is fine for fewer than ten terms.
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(expected, result)
def corenlp2naf(xml_bytes, annotators):
    """Convert CoreNLP XML output into a serialized NAF document.

    :param xml_bytes: raw XML bytes as produced by CoreNLP
    :param annotators: annotator names that were run; those present in
        LAYERMAP are recorded as linguistic processors on the NAF
    :return: the NAF document serialized to bytes
    :raises Exception: re-raises (after logging) any parse failure
    """
    naf = KafNafParser(type="NAF")
    try:
        doc = Document(xml_bytes)
    except Exception:  # was a bare except: don't swallow SystemExit etc.
        log.exception("Error on parsing xml")
        raise
    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            # (removed dead `dependencies = True` flag; it was never read)
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(
                        s=child.get_lemma(), t=dep.type,
                        o=parent.get_lemma())
                    naf.create_dependency(child.get_id(), parent.get_id(),
                                          dep.type, comment=comment)
    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = set()
            for m in coref.mentions:
                cterms |= {terms[m.sentence.id, t.id].get_id()
                           for t in m.tokens}
            naf.create_coreference("term", cterms)
    for annotator in annotators:
        if annotator in LAYERMAP:
            # Explicit keyword instead of fragile format(**locals()).
            naf.create_linguistic_processor(
                LAYERMAP[annotator],
                "CoreNLP {annotator}".format(annotator=annotator),
                get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
def corenlp2naf(xml_bytes, annotators):
    """Build a NAF document from CoreNLP XML output and serialize it.

    :param xml_bytes: raw XML bytes emitted by CoreNLP
    :param annotators: names of the annotators that were run; any that
        appear in LAYERMAP are registered as linguistic processors
    :return: bytes of the dumped NAF document
    :raises Exception: parse errors are logged and re-raised
    """
    naf = KafNafParser(type="NAF")
    try:
        doc = Document(xml_bytes)
    except Exception:  # narrowed from bare except (PEP 8)
        log.exception("Error on parsing xml")
        raise
    # Map (xml sentence id, xml token id) -> created NAF term.
    terms = {}
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            # Dead `dependencies = True` assignment removed (never read).
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(
                        s=child.get_lemma(), t=dep.type,
                        o=parent.get_lemma())
                    naf.create_dependency(child.get_id(), parent.get_id(),
                                          dep.type, comment=comment)
    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = {terms[m.sentence.id, t.id].get_id()
                      for m in coref.mentions
                      for t in m.tokens}
            naf.create_coreference("term", cterms)
    for annotator in annotators:
        if annotator in LAYERMAP:
            # format(**locals()) replaced with an explicit keyword arg.
            naf.create_linguistic_processor(
                LAYERMAP[annotator],
                "CoreNLP {annotator}".format(annotator=annotator),
                get_corenlp_version())
    out = BytesIO()
    naf.dump(out)
    return out.getvalue()