Example #1
from spacy.lang.en import English
from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_issue5458():
    # Test that the noun chunker does not generate overlapping spans
    # fmt: off
    words = [
        "In", "an", "era", "where", "markets", "have", "brought", "prosperity",
        "and", "empowerment", "."
    ]
    vocab = Vocab(strings=words)
    deps = [
        "ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc",
        "conj", "punct"
    ]
    pos = [
        "ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ",
        "NOUN", "PUNCT"
    ]
    heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
    # fmt: on
    en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
    en_doc.noun_chunks_iterator = noun_chunks

    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
    nlp = English()
    merge_nps = nlp.create_pipe("merge_noun_chunks")
    merge_nps(en_doc)
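
For reference, the same merging is normally wired into a pipeline through the built-in merge_noun_chunks factory rather than built by hand as in the test. A minimal sketch, assuming an installed English model (en_core_web_sm here is an assumption; any pipeline with a parser works):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: the model is installed locally
nlp.add_pipe("merge_noun_chunks")   # built-in component; retokenizes each noun chunk into one token

doc = nlp("Markets have brought prosperity and empowerment.")
print([token.text for token in doc])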
Example #2
def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
        if len(mlist) != len(sent):
            t, m = None, None
            for t, m in zip(sent, mlist):
                if t.text != m.midasi:
                    break
            raise ValueError(
                f"""Internal error occurred
        Sentence: {sent.text}
        mlist : {[m.midasi for m in mlist]}
        tokens: {[t.text for t in sent]}
        diff  : {m.midasi}, {t.text}
        """
            )
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
    doc.noun_chunks_iterator = knp_noun_chunker  # type: ignore
    # TODO: https://github.com/python/mypy/issues/3004
    return doc
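
The filter_spans call at the end keeps only non-overlapping spans, preferring the longest one (and the earlier start on ties), so KNP-derived entities never collide with entities already on the doc. A minimal standalone sketch of that behaviour, independent of the KNP helpers:

from spacy.tokens import Doc, Span
from spacy.util import filter_spans
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["New", "York", "City", "is", "big"])
candidates = [Span(doc, 0, 2, label="GPE"), Span(doc, 0, 3, label="GPE")]  # overlapping spans
print([(s.text, s.label_) for s in filter_spans(candidates)])  # keeps the longer "New York City"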
Example #3
def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        if len(mlist) != len(sent):
            mlist = _separate_mrph(mlist, sent)
        for label, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, label)
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))
    doc.noun_chunks_iterator = knp_noun_chunker
    return doc
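
Both versions rely on custom attributes accessed through ._.get() and ._.set(); spaCy requires those extensions to be registered before the component runs. A minimal sketch with hypothetical attribute names standing in for the real camphr keys behind JUMAN_LINES and KNP_USER_KEYS:

from spacy.tokens import Span, Token

# Hypothetical names for illustration; camphr defines its own extension keys.
if not Span.has_extension("knp_tag_list"):
    Span.set_extension("knp_tag_list", default=None)
if not Token.has_extension("knp_morph"):
    Token.set_extension("knp_morph", default=None)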
Example #4
def correct_dep(doc, rewrite_ne_as_proper_noun):
    complex_tokens = []
    last_head = -1
    # First pass: split labels of the form '<dep>_as_<TAG>' into the real dependency label and tag.
    for token in doc[0:-1]:
        label = token.dep_
        p = label.find('_as_')
        if p >= 0:
            tag = label[p + 4:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # workaround: lemma_ must be set after tag_ (spaCy bug)
                token.lemma_ = lemma
            token.dep_ = label[0:p]

    # Second pass: group tokens labeled 'as_<TAG>' with the preceding group headed by their syntactic head, to be merged later.
    for token in doc[0:-1]:
        label = token.dep_
        if label.startswith('as_'):
            head = token.head
            if last_head == head.i:
                complex_tokens[-1].append(token)
            else:
                complex_tokens.append([token])
                last_head = token.i
            tag = label[3:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # workaround: lemma_ must be set after tag_ (spaCy bug)
                token.lemma_ = lemma
            token.dep_ = 'dep'
        else:
            complex_tokens.append([token])
            last_head = token.i
    complex_tokens.append([doc[-1]])  # for root detection error

    # Build index_map: original token index -> index of the merged token that contains it
    # (the extra last slot is used for entity end offsets).
    index = 0
    count = 0
    index_map = [0] * (len(doc) + 1)  # last element is for ner
    for comp in complex_tokens:
        for _ in comp:
            index_map[count] = index
            count += 1
        index += 1
    index_map[-1] = count

    # Collapse each group into one merged token: join surface forms and lemmas,
    # take tag/dep from the group's first token, and remap heads through index_map.
    if len(complex_tokens) > 1:
        words, lemmas, tags, pos_details, infs, spaces, sent_starts, deps, heads = zip(*[
            (
                ''.join([t.orth_ + ' ' if t.whitespace_ else t.orth_ for t in comp[0:-1]] + [comp[-1].orth_]),
                ''.join([t.lemma_ + ' ' if t.whitespace_ else t.lemma_ for t in comp[0:-1]] + [comp[-1].lemma_]),
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[-1]._.inf,
                comp[-1].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) if len(comp) > 1 else (
                comp[0].orth_,
                comp[0].lemma_,
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[0]._.inf,
                comp[0].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) for comp in complex_tokens[0:-1]
        ])
    else:
        words = lemmas = tags = pos_details = infs = spaces = sent_starts = deps = heads = []
    new_doc = Doc(doc.vocab, words=words, spaces=spaces)
    for token, lemma, tag, pos_detail, inf, dep in zip(new_doc, lemmas, tags, pos_details, infs, deps):
        token.tag_ = tag
        token.lemma_ = lemma  # workaround: lemma_ must be set after tag_ (spaCy bug)
        token._.pos_detail = pos_detail
        token._.inf = inf
        token.dep_ = dep
    for token, sent_start in zip(new_doc, sent_starts):
        if sent_start:
            token.sent_start = True
    root_i = len(new_doc)
    for token, head in zip(new_doc, heads):
        if head == root_i:
            token.head = token
        else:
            token.head = new_doc[head]

    # Remap entity boundaries onto the merged tokens, dropping the previous entity when two now overlap.
    ents = []
    prev_start = prev_end = -1
    for ent in doc.ents:
        start = index_map[ent.start]
        end = max(index_map[ent.end], start + 1)
        if prev_end > start and prev_start < end:
            ents = ents[:-1]
        ents.append((ent.label, start, end))
        prev_start = start
        prev_end = end
    new_doc.ents = ents

    new_doc.is_tagged = doc.is_tagged
    new_doc.is_parsed = doc.is_parsed

    if rewrite_ne_as_proper_noun:
        for _, start, end in ents:
            for token in new_doc[start:end]:
                lemma = token.lemma_
                token.tag_ = 'PROPN'  # workaround: lemma_ must be set after tag_ (spaCy bug)
                token.lemma_ = lemma

    new_doc.noun_chunks_iterator = noun_chunks  # TODO: workaround for spaCy 2.0.12

    if len(doc.text) - len(EOS) != len(new_doc.text):
        print(
            'doc.text length is different from source={} to corrected={}'.format(
                len(doc.text) - len(EOS),
                len(new_doc.text)),
            file=sys.stderr
        )
        for t in doc:
            print('<', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)
        for t in new_doc:
            print('>', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)

    return new_doc
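
The head and entity remapping above hinges on index_map, which sends each original token index to the position of the merged token that absorbs it. A standalone sketch of that idea with a made-up grouping (the sentinel handling for the final token is omitted):

# Hypothetical grouping of 5 original tokens: 0-1 merge, 2 stays alone, 3-4 merge.
complex_tokens = [[0, 1], [2], [3, 4]]

index_map = []
for new_index, comp in enumerate(complex_tokens):
    index_map.extend([new_index] * len(comp))

print(index_map)  # [0, 0, 1, 2, 2] -- old head and entity offsets are looked up here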