Example #1
0
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?

    Creates one token and one term per (word, lemma, pos, morph) tuple in a
    fresh NAF document, then reads the terms back through their token spans
    and checks that the same tuples come out again.
    """
    naf = KafNafParser(type="NAF")
    sent = 1
    expected = [(u'dit', u'dit', u'O', u'VNW'),
                (u'is', u'zijn', u'V', u'WW'),
                (u'een', u'een', u'D', u'LID'),
                (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in expected:
        token = naf.create_wf(word, sent, offset)
        # The offset advances by the word length only (no separator).
        offset += len(word)
        naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)

    # Map each term id to the tuple reconstructed from the term and its token.
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    # NOTE(review): assumes term ids sort in creation order -- confirm
    # against the id scheme used by create_term.
    result = [result[tid] for tid in sorted(result)]
    assert_equal(expected, result)
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?

    For every (word, lemma, pos, morph) tuple a word form and a term spanning
    it are added to an empty NAF document; the terms are then read back and
    compared against the original tuples.
    """
    naf = KafNafParser(type="NAF")
    sent = 1
    expected = [(u'dit', u'dit', u'O', u'VNW'), (u'is', u'zijn', u'V', u'WW'),
                (u'een', u'een', u'D', u'LID'), (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in expected:
        token = naf.create_wf(word, sent, offset)
        # Only the word length is added to the offset (no separator).
        offset += len(word)
        naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)

    # Rebuild one tuple per term id from the term and the token in its span.
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    # NOTE(review): relies on term ids sorting in creation order -- confirm
    # against the id scheme used by create_term.
    result = [result[tid] for tid in sorted(result)]
    assert_equal(expected, result)
Example #3
0
def corenlp2naf(xml_bytes, annotators):
    """
    Convert CoreNLP XML output into a serialized NAF document.

    xml_bytes is parsed with Document(); every token becomes a NAF word form
    plus a term, tokens with a NER label other than None/'O' become
    entities, non-root collapsed cc-processed dependencies become NAF
    dependencies, and each coreference chain becomes a NAF coreference over
    term ids. Every annotator in `annotators` that has an entry in LAYERMAP
    is recorded as a linguistic processor.

    Returns the contents of the BytesIO buffer the NAF document was dumped
    into. Exceptions raised while parsing the XML are logged and re-raised.
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except Exception:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                # The 'root' link is skipped; only the other links are
                # written as dependencies.
                if dep.type == 'root':
                    continue
                child = terms[sent.id, dep.dependent.idx]
                parent = terms[sent.id, dep.governor.idx]
                comment = "{t}({o}, {s})".format(s=child.get_lemma(),
                                                 t=dep.type,
                                                 o=parent.get_lemma())
                naf.create_dependency(child.get_id(),
                                      parent.get_id(),
                                      dep.type,
                                      comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            # Collect the term ids of every token in every mention of
            # this chain.
            cterms = set()
            for m in coref.mentions:
                cterms |= {
                    terms[m.sentence.id, t.id].get_id()
                    for t in m.tokens
                }
            naf.create_coreference("term", cterms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(
                LAYERMAP[annotator],
                "CoreNLP {annotator}".format(annotator=annotator),
                get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
Example #4
0
def corenlp2naf(xml_bytes, annotators):
    """
    Build a NAF document from CoreNLP XML output and return it serialized.

    The XML in xml_bytes is parsed with Document(). Each token yields a word
    form and a term; tokens whose NER label is neither None nor 'O' also
    yield an entity. Non-root collapsed cc-processed dependencies and
    coreference chains are copied over using term ids. Annotators listed in
    `annotators` that appear in LAYERMAP are registered as linguistic
    processors.

    Returns the value of the BytesIO buffer that naf.dump() wrote to.
    A failure in Document() is logged and the exception is re-raised.
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except Exception:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                # Links of type 'root' are not written as dependencies.
                if dep.type == 'root':
                    continue
                child = terms[sent.id, dep.dependent.idx]
                parent = terms[sent.id, dep.governor.idx]
                comment = "{t}({o}, {s})".format(s=child.get_lemma(), t=dep.type, o=parent.get_lemma())
                naf.create_dependency(child.get_id(), parent.get_id(), dep.type, comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            # One coreference per chain, over the term ids of all its mentions.
            cterms = set()
            for m in coref.mentions:
                cterms |= {terms[m.sentence.id, t.id].get_id() for t in m.tokens}
            naf.create_coreference("term", cterms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(LAYERMAP[annotator],
                                            "CoreNLP {annotator}".format(annotator=annotator),
                                            get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()