def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # TAG, POS, LEMMA require is_tagged
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
    # TAG, POS, LEMMA require is_tagged
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
        with pytest.raises(ValueError):
            matcher.add("TEST1", [doc1])
        with pytest.raises(ValueError):
            matcher.add("TEST3", [doc3])
    # TEXT/ORTH only require tokens
    matcher = PhraseMatcher(en_vocab, attr="ORTH")
    matcher.add("TEST3", [doc3])
    matcher = PhraseMatcher(en_vocab, attr="TEXT")
    matcher.add("TEST3", [doc3])
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed
def to_doc(self, vocab, is_parsed=False):
    words = [morph.surface for morph in self.morphs] + [EOS]
    spaces = [morph.trailing_space for morph in self.morphs] + [False]
    doc = Doc(vocab, words=words, spaces=spaces)
    root_label = None
    for token, morph in zip(doc, self.morphs):
        token.tag_ = morph.pos
        token._.pos_detail = morph.pos_detail
        token._.inf = morph.inf
        token.lemma_ = morph.lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
        if is_parsed and morph.dep_label:
            if morph.id == morph.dep_morph.id:
                root_label = morph.dep_label
                token.dep_ = root_label if root_label.find('as_') >= 0 else '{}_as_{}'.format(root_label, morph.pos)
                token.head = doc[-1]
            else:
                token.dep_ = morph.dep_label
                token.head = doc[morph.dep_morph.id]
    doc[-1].tag_ = 'X'  # work around: lemma_ must be set after tag_ (spaCy's bug)
    doc[-1].lemma_ = EOS
    if root_label:
        doc[-1].head = doc[-1]
        doc[-1].dep_ = 'root'
        doc.is_parsed = True
    return doc
def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)
def test_issue3199():
    """Test that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
    with a new Vocab here and setting is_parsed to make sure the noun chunks
    run.
    """
    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
    doc.is_parsed = True
    assert list(doc[0:3].noun_chunks) == []
def __call__(self, text):
    """Convert input text to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    udpipe_sents = self.model(text) if text else [Sentence()]
    text = " ".join(s.getText() for s in udpipe_sents)
    tokens, heads = self.get_tokens_with_heads(udpipe_sents)
    if not tokens:
        return Doc(self.vocab)

    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self.check_aligned(text, tokens)
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not span:
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.form)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upostag or ""))
        # CoNLL xpostag-s, custom for each UD treebank
        # tags.append(self.vocab.strings.add(token.xpostag or ""))
        tags.append(self.vocab.strings.add(token.feats or ""))
        deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.form)
        span = text[offset:]
        if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.form))
    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc
def __call__(self, text):
    """Convert a StanfordNLP Doc to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    snlp_doc = self.snlp(text) if text else Document("")
    text = snlp_doc.text
    tokens, heads = self.get_tokens_with_heads(snlp_doc)
    if not len(tokens):
        return Doc(self.vocab)

    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self.check_aligned(text, tokens)
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not len(span):
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.text)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upos or ""))
        tags.append(self.vocab.strings.add(token.xpos or ""))
        deps.append(self.vocab.strings.add(token.dependency_relation or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.text)
        span = text[offset:]
        if i == len(tokens) - 1:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.text))
    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc
def __call__(self, text):
    u = self.model(text, raw=True) if text else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    feats = []
    heads = []
    deps = []
    spaces = []
    norms = []
    for t in u.split("\n"):
        if t == "" or t.startswith("#"):
            continue
        s = t.split("\t")
        if len(s) != 10:
            continue
        id, form, lemma, upos, xpos, feat, head, deprel, dummy_deps, misc = s
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        feats.append(feat)
        if deprel == "root":
            heads.append(0)
            deps.append(r)
        else:
            heads.append(int(head) - int(id))
            deps.append(vs.add(deprel))
        spaces.append(False if "SpaceAfter=No" in misc else True)
        i = misc.find("Translit=")
        norms.append(vs.add(form if i < 0 else misc[i + 9:]))
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
    try:
        # spaCy v2: mark the doc as tagged/parsed
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # spaCy v3 removed these flags; set per-token morphology instead
        for i, j in enumerate(feats):
            if j != "_" and j != "":
                doc[i].set_morph(j)
    t = Tree(u)
    t._makeChunks()
    bunsetu = ["I"] * len(doc)
    for s in t._cabocha._sentences:
        for w in s:
            try:
                bunsetu[w[0] - 1] = "B"
            except Exception:
                pass
    doc.user_data["bunsetu_bi_labels"] = bunsetu
    return doc
def knp_dependency_parser(doc: Doc) -> Doc:
    tag_spans: Iterable[Span] = doc._.get(KNP_USER_KEYS.tag.spans)
    for tag in tag_spans:
        parent: Optional[Span] = tag._.get(KNP_USER_KEYS.tag.parent)
        if parent is not None:
            tag[0].head = parent[0]
        else:
            tag[0].head = tag[0]
        for p, c in zip(tag, tag[1:]):
            c.head = p
    doc.is_parsed = True
    return doc
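# NOTE (illustrative sketch, not from the source): because knp_dependency_parser
# is a plain Doc -> Doc callable, spaCy v2 can register it directly as a pipeline
# component. The `nlp` object below is hypothetical and assumes earlier components
# have populated the KNP_USER_KEYS span extensions.
nlp.add_pipe(knp_dependency_parser, name="knp_dependency_parser", last=True)
doc = nlp("...")
for token in doc:
    print(token.text, token.dep_, token.head.text)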
def __call__(self, text):
    """Convert a StanfordNLP Doc to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    snlp_doc = self.snlp(text)
    text = snlp_doc.text
    tokens, heads = self.get_tokens_with_heads(snlp_doc)
    if not len(tokens):
        raise ValueError("No tokens available.")
    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not len(span):
            break
        while not span.startswith(token.text):
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.text)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upos or ""))
        tags.append(self.vocab.strings.add(token.xpos or ""))
        deps.append(self.vocab.strings.add(token.dependency_relation or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.text)
        span = text[offset:]
        if i == len(tokens) - 1:
            spaces.append(False)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.text))
    attrs = [POS, TAG, DEP, HEAD, LEMMA]
    array = numpy.array(list(zip(pos, tags, deps, heads, lemmas)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc
def __call__(self, text):
    u = self.model(text, raw=True) if text else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    feats = []
    heads = []
    deps = []
    spaces = []
    norms = []
    for t in u.split("\n"):
        if t == "" or t.startswith("#"):
            continue
        s = t.split("\t")
        if len(s) != 10:
            continue
        id, form, lemma, upos, xpos, feat, head, deprel, dummy_deps, misc = s
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        feats.append(feat)
        if deprel == "root":
            heads.append(0)
            deps.append(r)
        else:
            heads.append(int(head) - int(id))
            deps.append(vs.add(deprel))
        spaces.append(False if "SpaceAfter=No" in misc else True)
        i = misc.find("Gloss=")
        if i < 0:
            norms.append(vs.add(form))
        else:
            j = misc.find("|", i)
            norms.append(vs.add(misc[i + 6:] if j < 0 else misc[i + 6:j]))
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
    try:
        # spaCy v2: mark the doc as tagged/parsed
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # spaCy v3 removed these flags; set per-token morphology instead
        for i, j in enumerate(feats):
            if j != "_" and j != "":
                doc[i].set_morph(j)
    return doc
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost during
    serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    if spacy_version >= 3:
        doc2[0].set_morph("Feat=Val")
    else:
        doc1.is_parsed = True
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # errors can be suppressed if desired
    matcher(doc2, allow_missing=True)
    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        if spacy_version < 3:
            doc2.is_tagged = True
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
        matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
    with pytest.warns(None) as record:
        matcher.add("TEST3", [doc3])
        assert not record.list
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
    with pytest.warns(None) as record:
        matcher.add("TEST4", [doc2])
        assert not record.list
def __call__(self, text):
    u = self.model(text) if text else ""
    if not self.convUD:
        return u
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    heads = []
    deps = []
    spaces = []
    for t in u.split("\n"):
        if t == "" or t.startswith("#"):
            continue
        s = t.split("\t")
        if len(s) != 10:
            continue
        id, form, lemma, upos, xpos, dummy_feats, head, deprel, dummy_deps, misc = s
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        if deprel == "root" or deprel == "ROOT":
            heads.append(0)
            deps.append(r)
        elif head == "0":
            heads.append(0)
            deps.append(vs.add(deprel))
        else:
            heads.append(int(head) - int(id))
            deps.append(vs.add(deprel))
        spaces.append(False if "SpaceAfter=No" in misc else True)
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD], a)
    try:
        # spaCy v2: mark the doc as tagged/parsed (removed in spaCy v3)
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        pass
    return doc
def knp_dependency_parser(doc: Doc) -> Doc:
    tag_spans: Iterable[Span] = doc._.get(KNP_USER_KEYS.tag.spans)
    s = []
    for tag in tag_spans:
        for c in tag[1:]:
            c.head = tag[0]
            c.dep_ = _get_child_dep(c)
        parent: Optional[Span] = tag._.get(KNP_USER_KEYS.tag.parent)
        if parent is not None:
            tag[0].head = parent[0]
            tag[0].dep_ = _get_dep(tag[0])
        else:
            tag[0].head = tag[0]
            tag[0].dep_ = "ROOT"
        s.append(tag[0])
    s = _modify_head_punct(s)
    s = _modify_head_flat(s)
    s = _modify_head_conj(s)
    doc.is_parsed = True
    return doc
def __call__(self, text):
    from suparkanbun.tradify import tradify
    # Map simplified characters to their canonical forms
    t = ""
    for c in text:
        if c in self.simplify:
            t += self.simplify[c]
        else:
            t += c
    # Optional sentence segmentation ("danku"), processed in 500-char windows
    if self.danku is not None:
        u = t.replace("\n", "")
        t = ""
        while len(u) > 500:
            s = self.danku(u[0:500])
            r = ""
            for c, p in s:
                r += c
                if p == "S" or p == "E":
                    r += "\n"
            r = "\n".join(r.split("\n")[0:-2]) + "\n"
            t += r
            u = u[len(r.replace("\n", "")):]
        s = self.danku(u)
        for c, p in s:
            t += c
            if p == "S" or p == "E":
                t += "\n"
    # POS-tag the text, chunked if it is long
    if len(t) < 500:
        p = self.tagger(t.replace("\n", ""))
    else:
        p = []
        u = ""
        for s in t.strip().split("\n"):
            u += s
            if len(u) > 400:
                p += self.tagger(u)
                u = ""
        if len(u) > 0:
            p += self.tagger(u)
    u = self.supar.predict([[c for c in s] for s in t.strip().split("\n")], lang=None)
    t = text.replace("\n", "")
    i = 0
    w = []
    for s in u.sentences:
        v = []
        for h, d in zip(s.values[6], s.values[7]):
            j = t[i]
            k = tradify[j] if j in tradify else j
            v.append({"form": j, "lemma": k, "pos": p[i][1], "head": h, "deprel": d})
            i += 1
        # Merge "compound" tokens with their following head of the same POS
        for j in reversed(range(0, len(v) - 1)):
            if v[j]["deprel"] == "compound" and v[j]["head"] == j + 2 and v[j]["pos"] == v[j + 1]["pos"]:
                k = v.pop(j)
                v[j]["form"] = k["form"] + v[j]["form"]
                v[j]["lemma"] = k["lemma"] + v[j]["lemma"]
                for k in range(0, len(v)):
                    if v[k]["head"] > j + 1:
                        v[k]["head"] -= 1
        w.append(list(v))
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    feats = []
    heads = []
    deps = []
    spaces = []
    norms = []
    for s in w:
        for i, t in enumerate(s):
            form = t["form"]
            words.append(form)
            lemmas.append(vs.add(t["lemma"]))
            p = t["pos"].split(",")
            xpos = ",".join(p[0:4])
            pos.append(vs.add(p[4]))
            tags.append(vs.add(xpos))
            feats.append(p[5])
            if t["deprel"] == "root":
                heads.append(0)
                deps.append(r)
            else:
                heads.append(t["head"] - i - 1)
                deps.append(vs.add(t["deprel"]))
            spaces.append(False)
            g = self.gloss(form, xpos)
            if g is not None:
                norms.append(vs.add(g))
            else:
                norms.append(vs.add(form))
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
    try:
        # spaCy v2: mark the doc as tagged/parsed
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # spaCy v3 removed these flags; set per-token morphology instead
        for i, j in enumerate(feats):
            if j != "_" and j != "":
                doc[i].set_morph(j)
    return doc
def doc_not_parsed(en_tokenizer):
    text = "This is a sentence. This is another sentence. And a third."
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
    doc.is_parsed = False
    return doc
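# NOTE (illustrative sketch, not from the source): a fixture like doc_not_parsed
# is typically consumed by tests asserting that parse-dependent accessors fail
# cleanly; in spaCy v2, Doc.noun_chunks raises a ValueError when the document
# has no dependency parse.
def test_unparsed_doc_has_no_noun_chunks(doc_not_parsed):
    with pytest.raises(ValueError):
        list(doc_not_parsed.noun_chunks)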
def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
    """Convert input text to a spaCy Doc.

    text: The text to process. It can be presegmented or pretokenized:
        str             : raw text,
        List[str]       : presegmented text,
        List[List[str]] : pretokenized text.
    RETURNS: The spaCy Doc object.
    """
    udpipe_sents = self.model(text=text) if text else [Sentence()]
    text = " ".join(s.getText() for s in udpipe_sents)
    tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)
    if not tokens:
        return Doc(vocab=self.vocab)

    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self._check_aligned(text=text, tokens=tokens)
    if not is_aligned:
        text = ""
        for token in tokens:
            text += token.form
            if NO_SPACE not in token.misc:
                text += " "
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not span:
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.form)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upostag or ""))
        # CoNLL xpostag-s, custom for each UD treebank
        tags.append(self.vocab.strings.add(token.xpostag or ""))
        deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.form)
        span = text[offset:]
        if i == len(tokens) - 1 or NO_SPACE in token.misc:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.form))
    try:
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    except ValueError as e:
        if '[E167]' in str(e):
            raise ValueError(
                "Could not properly assign morphology features. "
                f"Please update the tag map for '{self.model._lang}'"
                " language. See "
                "https://spacy.io/usage/adding-languages#tag-map "
                "for details. A quick workaround is to use the keyword "
                "argument ignore_tag_map=True when loading UDPipeLanguage."
            )
        else:
            raise e
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array(attrs=[LEMMA], array=lemma_array)
    doc.is_tagged = bool(any(pos) and any(tags))
    doc.is_parsed = bool(any(deps))
    return doc
def __call__(self, text):
    u = self.model(text) if text else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    # Map treebank-specific XPOS tags to universal POS tags
    p = {
        "ACAUS": "VERB", "ACOND": "SCONJ", "ADV": "ADV", "ALIM": "SCONJ",
        "APREC": "SCONJ", "ART": "DET", "CCIRC": "SCONJ", "CFOC": "PART",
        "CONJ": "CCONJ", "COP": "PRON", "CPRET": "AUX", "CREL": "SCONJ",
        "EXIST": "VERB", "FUT": "AUX", "IMOD": "ADV", "NEG": "ADV",
        "NPROP": "PROPN", "NUM": "NUM", "PDEM": "DET", "PPOS": "DET",
        "PREP": "ADP", "PTC": "PART", "PUNCT": "PUNCT"
    }
    words = []
    lemmas = []
    pos = []
    tags = []
    heads = []
    deps = []
    spaces = []
    norms = []
    for s in u.split("\n"):
        if s.startswith('<norm xml:id="u'):
            id = s[15:s.index('"', 16)]
            i = s.index(' orig="')
            form = s[i + 7:s.index('"', i + 8)]
            words.append(form)
            i = s.find(' lemma="')
            lemmas.append(vs.add(form if i < 0 else s[i + 8:s.index('"', i + 9)]))
            i = s.find(' norm="')
            norms.append(vs.add(form if i < 0 else s[i + 7:s.index('"', i + 8)]))
            i = s.index(' func="')
            dep = s[i + 7:s.index('"', i + 8)]
            if dep == "root":
                heads.append(0)
                deps.append(r)
            else:
                i = s.find(' head="#u')
                heads.append(0 if i < 0 else int(s[i + 9:s.index('"', i + 10)]) - int(id))
                deps.append(vs.add(dep))
            i = s.index(' pos="')
            xpos = s[i + 6:s.index('"', i + 7)]
            tags.append(vs.add(xpos))
            upos = "X"
            if xpos in p:
                upos = p[xpos]
            elif xpos.startswith("A"):
                upos = "AUX"
            elif xpos.startswith("N"):
                upos = "ADJ" if dep in {"amod", "acl"} else "NOUN"
            elif xpos.startswith("P"):
                upos = "PRON"
            elif xpos.startswith("V"):
                upos = "VERB"
            pos.append(vs.add(upos))
            spaces.append(False)
        elif s.startswith("</norm_group>"):
            spaces[-1] = True
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
    try:
        # spaCy v2: mark the doc as tagged/parsed (removed in spaCy v3)
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        pass
    return doc
def __call__(self, text):
    """Convert a Stanza Doc to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    if not text:
        return Doc(self.vocab)
    elif text.isspace():
        return Doc(self.vocab, words=[text], spaces=[False])

    snlp_doc = self.snlp(text)
    text = snlp_doc.text
    snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    heads = []
    lemmas = []
    offset = 0
    token_texts = [t.text for t in snlp_tokens]
    is_aligned = True
    try:
        words, spaces = self.get_words_and_spaces(token_texts, text)
    except ValueError:
        words = token_texts
        spaces = [True] * len(words)
        is_aligned = False
        warnings.warn(
            "Due to multiword token expansion or an alignment "
            "issue, the original text has been replaced by space-separated "
            "expanded tokens.",
            stacklevel=4,
        )
    offset = 0
    for i, word in enumerate(words):
        if word.isspace() and word != snlp_tokens[i + offset].text:
            # insert a space token
            pos.append(self.vocab.strings.add("SPACE"))
            tags.append(self.vocab.strings.add("_SP"))
            deps.append(self.vocab.strings.add(""))
            lemmas.append(self.vocab.strings.add(word))

            # increment any heads left of this position that point beyond
            # this position to the right (already present in heads)
            for j in range(0, len(heads)):
                if j + heads[j] >= i:
                    heads[j] += 1
            # decrement any heads right of this position that point beyond
            # this position to the left (yet to be added from snlp_heads)
            for j in range(i + offset, len(snlp_heads)):
                if j + snlp_heads[j] < i + offset:
                    snlp_heads[j] -= 1
            # initial space tokens are attached to the following token,
            # otherwise attach to the preceding token
            if i == 0:
                heads.append(1)
            else:
                heads.append(-1)
            offset -= 1
        else:
            token = snlp_tokens[i + offset]
            assert word == token.text
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or token.feats or ""))
            deps.append(self.vocab.strings.add(token.deprel or ""))
            heads.append(snlp_heads[i + offset])
            lemmas.append(self.vocab.strings.add(token.lemma or ""))

    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    ents = []
    for ent in snlp_doc.entities:
        ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
        ents.append(ent_span)
    if not is_aligned or not all(ents):
        warnings.warn(
            f"Can't set named entities because of multi-word token "
            f"expansion or because the character offsets don't map to "
            f"valid tokens produced by the Stanza tokenizer:\n"
            f"Words: {words}\n"
            f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
            stacklevel=4,
        )
    else:
        doc.ents = ents
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) or any(tags):
        doc.is_tagged = True
    if any(deps) or any(heads):
        doc.is_parsed = True
    return doc
def correct_dep(doc, rewrite_ne_as_proper_noun):
    complex_tokens = []
    last_head = -1
    for token in doc[0:-1]:
        label = token.dep_
        p = label.find('_as_')
        if p >= 0:
            tag = label[p + 4:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = label[0:p]
    for token in doc[0:-1]:
        label = token.dep_
        if label.startswith('as_'):
            head = token.head
            if last_head == head.i:
                complex_tokens[-1].append(token)
            else:
                complex_tokens.append([token])
                last_head = token.i
            tag = label[3:]
            if len(tag) > 0:
                lemma = token.lemma_
                token.tag_ = tag  # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = 'dep'
        else:
            complex_tokens.append([token])
            last_head = token.i
    complex_tokens.append([doc[-1]])  # for root detection error
    index = 0
    count = 0
    index_map = [0] * (len(doc) + 1)  # last element is for ner
    for comp in complex_tokens:
        for _ in comp:
            index_map[count] = index
            count += 1
        index += 1
    index_map[-1] = count
    if len(complex_tokens) > 1:
        words, lemmas, tags, pos_details, infs, spaces, sent_starts, deps, heads = zip(*[
            (
                ''.join([t.orth_ + ' ' if t.whitespace_ else t.orth_ for t in comp[0:-1]] + [comp[-1].orth_]),
                ''.join([t.lemma_ + ' ' if t.whitespace_ else t.lemma_ for t in comp[0:-1]] + [comp[-1].lemma_]),
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[-1]._.inf,
                comp[-1].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) if len(comp) > 1 else (
                comp[0].orth_,
                comp[0].lemma_,
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[0]._.inf,
                comp[0].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) for comp in complex_tokens[0:-1]
        ])
    else:
        words = lemmas = tags = pos_details = infs = spaces = sent_starts = deps = heads = []
    new_doc = Doc(doc.vocab, words=words, spaces=spaces)
    for token, lemma, tag, pos_detail, inf, dep in zip(new_doc, lemmas, tags, pos_details, infs, deps):
        token.tag_ = tag
        token.lemma_ = lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
        token._.pos_detail = pos_detail
        token._.inf = inf
        token.dep_ = dep
    for token, sent_start in zip(new_doc, sent_starts):
        if sent_start:
            token.sent_start = True
    root_i = len(new_doc)
    for token, head in zip(new_doc, heads):
        if head == root_i:
            token.head = token
        else:
            token.head = new_doc[head]
    ents = []
    prev_start = prev_end = -1
    for ent in doc.ents:
        start = index_map[ent.start]
        end = max(index_map[ent.end], start + 1)
        if prev_end > start and prev_start < end:
            ents = ents[:-1]
        ents.append((ent.label, start, end))
        prev_start = start
        prev_end = end
    new_doc.ents = ents
    new_doc.is_tagged = doc.is_tagged
    new_doc.is_parsed = doc.is_parsed
    if rewrite_ne_as_proper_noun:
        for _, start, end in ents:
            for token in new_doc[start:end]:
                lemma = token.lemma_
                token.tag_ = 'PROPN'  # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
    new_doc.noun_chunks_iterator = noun_chunks  # TODO work around for spaCy 2.0.12
    if len(doc.text) - len(EOS) != len(new_doc.text):
        print(
            'doc.text length is different from source={} to corrected={}'.format(
                len(doc.text) - len(EOS), len(new_doc.text)),
            file=sys.stderr
        )
        for t in doc:
            print('<', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)
        for t in new_doc:
            print('>', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)
    return new_doc
def __call__(self, text):
    # Replace ASCII brackets with their fullwidth forms before tokenization
    t = text.replace("\r", "").replace("(", "（").replace(")", "）").replace("[", "［").replace("]", "］").replace("{", "｛").replace("}", "｝")
    u = self.model(t) if t else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    morphs = []
    heads = []
    deps = []
    spaces = []
    norms = []
    ent_iobs = []
    ent_types = []
    bunsetu = []
    for t in u.split("\n"):
        if t == "" or t.startswith("#"):
            continue
        s = t.split("\t")
        if len(s) != 10:
            continue
        id, form, lemma, upos, xpos, feats, head, deprel, _, misc = s
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        morphs.append(feats)
        if deprel == "root":
            heads.append(0)
            deps.append(r)
        else:
            heads.append(int(head) - int(id))
            deps.append(vs.add(deprel))
        spaces.append(False if "SpaceAfter=No" in misc else True)
        i = misc.find("Translit=")
        norms.append(vs.add(form if i < 0 else misc[i + 9:]))
        i = misc.find("NE=")
        if i < 0:
            ent_iobs.append(2)
            ent_types.append(0)
        else:
            j = misc.find("|", i)
            if j < 0:
                j = len(misc)
            if misc[i + 3:i + 4] == "B":
                ent_iobs.append(3)
            else:
                ent_iobs.append(1)
            ent_types.append(vs.add(misc[i + 5:j]))
        bunsetu.append("I")
        if misc.startswith("BunsetuBILabel="):
            bunsetu[-1] = misc[15:16]
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms, ent_iobs, ent_types)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM, ENT_IOB, ENT_TYPE], a)
    try:
        # spaCy v2: mark the doc as tagged/parsed
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # spaCy v3 removed these flags; set per-token morphology instead
        for i, j in enumerate(morphs):
            if j != "_" and j != "":
                doc[i].set_morph(j)
    doc.user_data["bunsetu_bi_labels"] = bunsetu
    return doc
def __call__(self, text):
    """Convert a Stanza Doc to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    snlp_doc = self.snlp(text) if text else Document("")
    text = snlp_doc.text
    tokens, heads = self.get_tokens_with_heads(snlp_doc)
    if not len(tokens):
        return Doc(self.vocab)

    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self.check_aligned(text, tokens)
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not len(span):
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.text)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upos or ""))
        tags.append(self.vocab.strings.add(token.xpos or ""))
        deps.append(self.vocab.strings.add(token.deprel or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.text)
        span = text[offset:]
        if i == len(tokens) - 1:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.text))
    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    ents = []
    for ent in snlp_doc.entities:
        ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
        ents.append(ent_span)
    if not all(ents):
        warnings.warn(
            f"Can't set named entities because the character offsets don't "
            f"map to valid tokens produced by the Stanza tokenizer:\n"
            f"Words: {words}\n"
            f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
            stacklevel=4,
        )
    else:
        doc.ents = ents
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc
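# NOTE (illustrative sketch, not from the source): wrapper tokenizers like the
# __call__ above are normally hidden behind a Language object, so nlp(text) runs
# the external pipeline and returns an annotated spaCy Doc with is_tagged and
# is_parsed set. StanzaLanguage follows the early spacy-stanza API and may
# differ in newer versions of the package.
import stanza
from spacy_stanza import StanzaLanguage

snlp = stanza.Pipeline(lang="en")
nlp = StanzaLanguage(snlp)
doc = nlp("Barack Obama was born in Hawaii.")
print(doc.is_tagged, doc.is_parsed)
for token in doc:
    print(token.text, token.pos_, token.dep_, token.lemma_)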
def serialize_spacy_doc(orig_doc, converted_sentences):
    words = []
    spaces = []
    total_attrs = []
    attrs_ = list(attrs.NAMES)
    attrs_.remove('SENT_START')  # this clashes with HEAD (see spaCy documentation)
    attrs_.remove('SPACY')  # we don't want to override the spaces we assign later on

    for orig_span, converted_sentence in zip(orig_doc.sents, converted_sentences):
        # remove redundant dummy-root-node
        converted = {iid: tok for iid, tok in converted_sentence.items() if iid != 0}
        orig = orig_span.as_doc()
        # get attributes of original doc
        orig_attrs = orig.to_array(attrs_)
        # append copied attributes for new nodes
        new_nodes_attrs = []
        for iid, tok in converted.items():
            if int(iid) != iid:
                new_node_attrs = list(orig_attrs[int(iid) - 1])
                # here we fix the relative head it points to;
                # in case it is a negative number we need to cast it to its unsigned synonym
                relative = int(iid) - (len(orig_attrs) + len(new_nodes_attrs) + 1)
                new_node_attrs[attrs_.index('HEAD')] = relative + (2**NUM_OF_BITS if relative < 0 else 0)
                new_nodes_attrs.append(new_node_attrs)
        if new_nodes_attrs:
            new_attrs = np.append(orig_attrs, new_nodes_attrs, axis=0)
        else:
            new_attrs = orig_attrs
        total_attrs = np.append(total_attrs, new_attrs, axis=0) if len(total_attrs) > 0 else new_attrs

        # fix whitespaces in case of new nodes: take original spaces, but change the
        # last one if there are new nodes, and add a space for each new node except the last
        spaces += [t.whitespace_ if not ((i + 1 == len(orig)) and (len(new_nodes_attrs) > 0)) else ' '
                   for i, t in enumerate(orig)] + \
                  [' ' if i + 1 < len(converted.keys()) else ''
                   for i, iid in enumerate(converted.keys()) if int(iid) != iid]
        spaces[-1] = ' '
        words += [t.get_conllu_field("form") for iid, t in converted.items()]

    # form new doc including new nodes and set attributes
    spaces[-1] = ''
    new_doc = Doc(orig_doc.vocab, words=words, spaces=spaces)
    new_doc.from_array(attrs_, total_attrs)

    j = 0
    for converted_sentence in converted_sentences:
        converted = {iid: tok for iid, tok in converted_sentence.items() if iid != 0}
        # store spacy ids for head indices extraction later on
        spacy_ids = {iid: (spacy_i + j) for spacy_i, iid in enumerate(converted.keys())}
        # set new info for all tokens per their head lists
        for i, bart_tok in enumerate(converted.values()):
            spacy_tok = new_doc[i + j]
            for head, rel in bart_tok.get_new_relations():
                # extract spacy correspondent head id
                head_tok = new_doc[spacy_ids[head.get_conllu_field("id")]
                                   if head.get_conllu_field("id") != 0 else spacy_tok.i]
                # parse stringish label
                is_state_head_node = ((head_tok.text == "STATE") and
                                      (head.get_conllu_field("id") != int(head.get_conllu_field("id")))) or \
                                     (bart_tok.get_conllu_field("id") != int(bart_tok.get_conllu_field("id")))
                new_rel, src, unc, alt = parse_bart_label(rel, is_state_head_node=is_state_head_node)
                # add info to token
                spacy_tok._.parent_list.append({
                    'head': head_tok,
                    'rel': new_rel,
                    'src': src,
                    'alt': alt,
                    'unc': unc
                })
            # fix sentence boundaries; is_parsed must be turned off temporarily,
            # as it prevents setting the boundaries
            new_doc.is_parsed = False
            spacy_tok.is_sent_start = False if i != 0 else True
            new_doc.is_parsed = True
        j += len(converted)
    return new_doc