from spacy.lang.en import English
from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_issue5458():
    # Test that the noun chunker does not generate overlapping spans
    # fmt: off
    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
    vocab = Vocab(strings=words)
    deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
    pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
    heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
    # fmt: on
    en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
    en_doc.noun_chunks_iterator = noun_chunks

    # if there are overlapping spans, this will fail with an E102 error
    # "Can't merge non-disjoint spans"
    nlp = English()
    merge_nps = nlp.create_pipe("merge_noun_chunks")
    merge_nps(en_doc)
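
# A minimal sketch of the invariant the test above exercises: the spans a
# noun-chunk iterator yields must be pairwise disjoint, otherwise
# merge_noun_chunks fails with E102. `assert_chunks_disjoint` is a helper
# name chosen here for illustration; it is not part of spaCy.
def assert_chunks_disjoint(doc):
    prev_end = -1
    for chunk in doc.noun_chunks:  # spans arrive in document order
        assert chunk.start >= prev_end, f"overlapping chunk: {chunk.text}"
        prev_end = chunk.end


# e.g. assert_chunks_disjoint(en_doc) passes once the chunker is fixed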
def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
        if len(mlist) != len(sent):
            t, m = None, None
            for t, m in zip(sent, mlist):
                if t.text != m.midasi:
                    break
            raise ValueError(
                f"""Internal error occurred
Sentence: {sent.text}
mlist : {[m.midasi for m in mlist]}
tokens: {[t.text for t in sent]}
diff : {m.midasi}, {t.text}
"""
            )
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
    doc.noun_chunks_iterator = knp_noun_chunker  # type: ignore  # TODO: https://github.com/python/mypy/issues/3004
    return doc
def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        if len(mlist) != len(sent):
            mlist = _separate_mrph(mlist, sent)
        for label, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, label)
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))
    doc.noun_chunks_iterator = knp_noun_chunker
    return doc
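
# Hypothetical reconstruction of the idea behind _separate_mrph (its real
# body is not shown in this listing): when one KNP morpheme covers several
# spaCy tokens, repeat it so the result aligns 1:1 with the tokens. Both
# the name and the logic below are assumptions for illustration only.
def _separate_mrph_sketch(mlist, sent):
    aligned = []
    tokens = iter(sent)
    for m in mlist:
        consumed = ""
        # consume tokens until their concatenated text covers the morpheme
        while len(consumed) < len(m.midasi):
            consumed += next(tokens).text
            aligned.append(m)  # one (possibly repeated) entry per token
    return aligned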
def correct_dep(doc, rewrite_ne_as_proper_noun):
    # Pass 1: restore the tag hidden in combined '<dep>_as_<tag>' labels.
    complex_tokens = []
    last_head = -1
    for token in doc[0:-1]:
        label = token.dep_
        p = label.find('_as_')
        if p >= 0:
            tag = label[p + 4:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag
                token.lemma_ = lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
            token.dep_ = label[0:p]
    # Pass 2: group 'as_<tag>' tokens with their head into multi-token units.
    for token in doc[0:-1]:
        label = token.dep_
        if label.startswith('as_'):
            head = token.head
            if last_head == head.i:
                complex_tokens[-1].append(token)
            else:
                complex_tokens.append([token])
                last_head = token.i
            tag = label[3:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag
                token.lemma_ = lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
            token.dep_ = 'dep'
        else:
            complex_tokens.append([token])
            last_head = token.i
    complex_tokens.append([doc[-1]])  # for root detection error

    # Map each original token index to the index of its merged unit.
    index = 0
    count = 0
    index_map = [0] * (len(doc) + 1)  # last element is for ner
    for comp in complex_tokens:
        for _ in comp:
            index_map[count] = index
            count += 1
        index += 1
    index_map[-1] = count

    # Rebuild the doc, concatenating surface forms and lemmas of each unit.
    if len(complex_tokens) > 1:
        words, lemmas, tags, pos_details, infs, spaces, sent_starts, deps, heads = zip(*[
            (
                ''.join([t.orth_ + ' ' if t.whitespace_ else t.orth_ for t in comp[0:-1]] + [comp[-1].orth_]),
                ''.join([t.lemma_ + ' ' if t.whitespace_ else t.lemma_ for t in comp[0:-1]] + [comp[-1].lemma_]),
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[-1]._.inf,
                comp[-1].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) if len(comp) > 1 else (
                comp[0].orth_,
                comp[0].lemma_,
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[0]._.inf,
                comp[0].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) for comp in complex_tokens[0:-1]
        ])
    else:
        words = lemmas = tags = pos_details = infs = spaces = sent_starts = deps = heads = []

    new_doc = Doc(doc.vocab, words=words, spaces=spaces)
    for token, lemma, tag, pos_detail, inf, dep in zip(new_doc, lemmas, tags, pos_details, infs, deps):
        token.tag_ = tag
        token.lemma_ = lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
        token._.pos_detail = pos_detail
        token._.inf = inf
        token.dep_ = dep
    for token, sent_start in zip(new_doc, sent_starts):
        if sent_start:
            token.sent_start = True
    root_i = len(new_doc)
    for token, head in zip(new_doc, heads):
        if head == root_i:
            token.head = token
        else:
            token.head = new_doc[head]

    # Remap entity offsets, dropping any entity that overlaps its predecessor.
    ents = []
    prev_start = prev_end = -1
    for ent in doc.ents:
        start = index_map[ent.start]
        end = max(index_map[ent.end], start + 1)
        if prev_end > start and prev_start < end:
            ents = ents[:-1]
        ents.append((ent.label, start, end))
        prev_start = start
        prev_end = end
    new_doc.ents = ents
    new_doc.is_tagged = doc.is_tagged
    new_doc.is_parsed = doc.is_parsed

    if rewrite_ne_as_proper_noun:
        for _, start, end in ents:
            for token in new_doc[start:end]:
                lemma = token.lemma_
                token.tag_ = 'PROPN'
                token.lemma_ = lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)

    new_doc.noun_chunks_iterator = noun_chunks  # TODO work around for spaCy 2.0.12

    # Sanity check: the rebuilt text must match the source (minus the EOS marker).
    if len(doc.text) - len(EOS) != len(new_doc.text):
        print(
            'doc.text length is different from source={} to corrected={}'.format(
                len(doc.text) - len(EOS), len(new_doc.text)),
            file=sys.stderr
        )
        for t in doc:
            print('<', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)
        for t in new_doc:
            print('>', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)

    return new_doc
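
# Usage sketch: correct_dep expects a doc whose tokens still carry the
# combined '<dep>_as_<tag>' / 'as_<tag>' labels from the upstream parser
# and whose final token is the EOS marker. The model name below is an
# assumption for illustration; any pipeline emitting those labels would do.
import spacy

nlp = spacy.load('ja_ginza')  # assumed Japanese model
doc = nlp('銀座でランチをご一緒しましょう。')
merged = correct_dep(doc, rewrite_ne_as_proper_noun=True)
print([(t.orth_, t.tag_, t.dep_) for t in merged])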