def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) doc1.is_parsed = True doc2 = Doc(en_vocab, words=["Test"]) doc2.is_tagged = True doc3 = Doc(en_vocab, words=["Test"]) # DEP requires is_parsed matcher = Matcher(en_vocab) matcher.add("TEST", [[{"DEP": "a"}]]) matcher(doc1) with pytest.raises(ValueError): matcher(doc2) with pytest.raises(ValueError): matcher(doc3) # TAG, POS, LEMMA require is_tagged for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab) matcher.add("TEST", [[{attr: "a"}]]) matcher(doc2) with pytest.raises(ValueError): matcher(doc1) with pytest.raises(ValueError): matcher(doc3) # TEXT/ORTH only require tokens matcher = Matcher(en_vocab) matcher.add("TEST", [[{"ORTH": "a"}]]) matcher(doc1) matcher(doc2) matcher(doc3) matcher = Matcher(en_vocab) matcher.add("TEST", [[{"TEXT": "a"}]]) matcher(doc1) matcher(doc2) matcher(doc3)
def read_conlldoc(self, inputdoc):
    words = list()
    sentbounds = list()
    # pos = list()
    tags = list()
    # lemmas = list()
    for sent in string2doc(inputdoc, hide_fields=HIDDEN_FIELDS):
        for i, tok in enumerate(sent):
            if i == 0:
                sentbounds.append(True)
            else:
                sentbounds.append(False)
            words.append(tok.word)
            tags.append(self.nlp.vocab.strings.add(tok.xpos))
            # pos.append(self.nlp.vocab.strings.add(conv_table.get(tok.xpos, "_")))
            # lemmas.append(self.nlp.vocab.strings.add(tok.lemma))
    # attrs = [POS, TAG]
    attrs = [TAG]
    # arr = np.array(list(zip(pos, tags)), dtype="uint64")
    arr = np.array(tags, dtype="uint64")
    sdoc = Doc(self.nlp.vocab, words=words).from_array(attrs, arr)
    for i, sb in enumerate(sentbounds):
        if sb:
            sdoc[i].is_sent_start = True
        else:
            # these must be set to False, since,
            # if left as None, spaCy will add further sentbounds
            sdoc[i].is_sent_start = False
    # lemma_array = np.array([[lemma] for lemma in lemmas], dtype="uint64")
    # sdoc.from_array([LEMMA], lemma_array)
    if any(tags):
        sdoc.is_tagged = True
    return sdoc
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed
def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) doc1.is_parsed = True doc2 = Doc(en_vocab, words=["Test"]) doc2.is_tagged = True doc3 = Doc(en_vocab, words=["Test"]) # DEP requires is_parsed matcher = PhraseMatcher(en_vocab, attr="DEP") matcher.add("TEST1", [doc1]) with pytest.raises(ValueError): matcher.add("TEST2", [doc2]) with pytest.raises(ValueError): matcher.add("TEST3", [doc3]) # TAG, POS, LEMMA require is_tagged for attr in ("TAG", "POS", "LEMMA"): matcher = PhraseMatcher(en_vocab, attr=attr) matcher.add("TEST2", [doc2]) with pytest.raises(ValueError): matcher.add("TEST1", [doc1]) with pytest.raises(ValueError): matcher.add("TEST3", [doc3]) # TEXT/ORTH only require tokens matcher = PhraseMatcher(en_vocab, attr="ORTH") matcher.add("TEST3", [doc3]) matcher = PhraseMatcher(en_vocab, attr="TEXT") matcher.add("TEST3", [doc3])
def test_matcher_no_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["a", "b"])
    doc[0].tag_ = "A"
    doc[1].tag_ = "B"
    doc.is_tagged = True
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0
def __call__(self, text): """Convert input text to a spaCy Doc. text (unicode): The text to process. RETURNS (spacy.tokens.Doc): The spaCy Doc object. """ udpipe_sents = self.model(text) if text else [Sentence()] text = " ".join(s.getText() for s in udpipe_sents) tokens, heads = self.get_tokens_with_heads(udpipe_sents) if not tokens: return Doc(self.vocab) words = [] spaces = [] pos = [] tags = [] deps = [] lemmas = [] offset = 0 is_aligned = self.check_aligned(text, tokens) for i, token in enumerate(tokens): span = text[offset:] if not span: break while len(span) and span[0].isspace(): # If we encounter leading whitespace, skip one character ahead offset += 1 span = text[offset:] words.append(token.form) # Make sure all strings are in the vocabulary pos.append(self.vocab.strings.add(token.upostag or "")) # CoNNL xpostag-s, custom for each UD treebank #tags.append(self.vocab.strings.add(token.xpostag or "")) tags.append(self.vocab.strings.add(token.feats or "")) deps.append(self.vocab.strings.add(self._dep(token.deprel) or "")) lemmas.append(self.vocab.strings.add(token.lemma or "")) offset += len(token.form) span = text[offset:] if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc: spaces.append(False) elif not is_aligned: spaces.append(True) else: next_token = tokens[i + 1] spaces.append(not span.startswith(next_token.form)) attrs = [POS, TAG, DEP, HEAD] array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64") doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array) # Overwrite lemmas separately to prevent overwritting by spaCy lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64") doc.from_array([LEMMA], lemma_array) if any(pos) and any(tags): doc.is_tagged = True if any(deps): doc.is_parsed = True return doc
def __call__(self, text): """Convert a StanfordNLP Doc to a spaCy Doc. text (unicode): The text to process. RETURNS (spacy.tokens.Doc): The spaCy Doc object. """ snlp_doc = self.snlp(text) if text else Document("") text = snlp_doc.text tokens, heads = self.get_tokens_with_heads(snlp_doc) if not len(tokens): return Doc(self.vocab) words = [] spaces = [] pos = [] tags = [] deps = [] lemmas = [] offset = 0 is_aligned = self.check_aligned(text, tokens) for i, token in enumerate(tokens): span = text[offset:] if not len(span): break while len(span) and span[0].isspace(): # If we encounter leading whitespace, skip one character ahead offset += 1 span = text[offset:] words.append(token.text) # Make sure all strings are in the vocabulary pos.append(self.vocab.strings.add(token.upos or "")) tags.append(self.vocab.strings.add(token.xpos or "")) deps.append(self.vocab.strings.add(token.dependency_relation or "")) lemmas.append(self.vocab.strings.add(token.lemma or "")) offset += len(token.text) span = text[offset:] if i == len(tokens) - 1: spaces.append(False) elif not is_aligned: spaces.append(True) else: next_token = tokens[i + 1] spaces.append(not span.startswith(next_token.text)) attrs = [POS, TAG, DEP, HEAD] array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64") doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array) # Overwrite lemmas separately to prevent them from being overwritten by spaCy lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64") doc.from_array([LEMMA], lemma_array) if any(pos) and any(tags): doc.is_tagged = True if any(deps): doc.is_parsed = True return doc
def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
    words = [x.surface for x in dtokens]
    spaces = [x.space for x in dtokens]
    doc = Doc(self.vocab, words=words, spaces=spaces)
    for token, dtoken in zip(doc, dtokens):
        token.lemma_ = dtoken.lemma
        token.tag_ = dtoken.pos
        token._.set(self.key_fstring, dtoken.fstring)
    doc.is_tagged = True
    return doc
def __call__(self,text): u=self.model(text,raw=True) if text else "" vs=self.vocab.strings r=vs.add("ROOT") words=[] lemmas=[] pos=[] tags=[] feats=[] heads=[] deps=[] spaces=[] norms=[] for t in u.split("\n"): if t=="" or t.startswith("#"): continue s=t.split("\t") if len(s)!=10: continue id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s words.append(form) lemmas.append(vs.add(lemma)) pos.append(vs.add(upos)) tags.append(vs.add(xpos)) feats.append(feat) if deprel=="root": heads.append(0) deps.append(r) else: heads.append(int(head)-int(id)) deps.append(vs.add(deprel)) spaces.append(False if "SpaceAfter=No" in misc else True) i=misc.find("Translit=") norms.append(vs.add(form if i<0 else misc[i+9:])) doc=Doc(self.vocab,words=words,spaces=spaces) a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64") doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a) try: doc.is_tagged=True doc.is_parsed=True except: for i,j in enumerate(feats): if j!="_" and j!="": doc[i].set_morph(j) t=Tree(u) t._makeChunks() bunsetu=["I"]*len(doc) for s in t._cabocha._sentences: for w in s: try: bunsetu[w[0]-1]="B" except: pass doc.user_data["bunsetu_bi_labels"]=bunsetu return doc
def __call__(self, text):
    result = self.tokenizer.tokenize(text=text)
    morph_spaces = []
    last_morph = None
    for m in result:
        if m.surface():
            if m.part_of_speech()[0] == '空白':
                if last_morph:
                    morph_spaces.append((last_morph, True))
                    last_morph = None
                else:
                    morph_spaces.append((m, False))
            elif last_morph:
                morph_spaces.append((last_morph, False))
                last_morph = m
            else:
                last_morph = m
    if last_morph:
        morph_spaces.append((last_morph, False))
    # the last space is removed by JapaneseReviser at the final stage of the pipeline
    words = [m.surface() for m, spaces in morph_spaces]
    spaces = [space for m, space in morph_spaces]
    doc = Doc(self.nlp.vocab if self.nlp else Vocab(), words=words, spaces=spaces)
    next_tag = morph_tag(morph_spaces[0][0].part_of_speech()[0:4]) if len(doc) else ''
    for token, (morph, spaces) in zip(doc, morph_spaces):
        tag = next_tag
        next_tag = morph_tag(morph_spaces[token.i + 1][0].part_of_speech()[0:4]) if token.i < len(doc) - 1 else ''
        token.tag_ = tag
        token.pos = TAG_MAP[tag][POS]
        # TODO separate lexical rules to resource files
        if morph.normalized_form() == '為る' and tag == '動詞-非自立可能':
            token.pos_ = 'AUX'
        elif tag == '名詞-普通名詞-サ変可能':
            if next_tag == '動詞-非自立可能':
                token.pos_ = 'VERB'
        elif tag == '名詞-普通名詞-サ変形状詞可能':
            if next_tag == '動詞-非自立可能':
                token.pos_ = 'VERB'
            elif next_tag == '助動詞' or next_tag.find('形状詞') >= 0:
                token.pos_ = 'ADJ'
        token.lemma_ = morph.normalized_form()
        token._.inf = ','.join(morph.part_of_speech()[4:])
        token._.reading = morph.reading_form()
        if self.enable_ex_sudachi:
            token._.sudachi = morph
    if self.use_sentence_separator:
        separate_sentences(doc)
    doc.is_tagged = True
    return doc
def __init__(self, qid: str, text: List[str], head: Tuple[int, int], tail: Tuple[int, int],
             head_type: str = None, tail_type: str = None, ner: List[str] = None,
             pos: List[str] = None, dep: List[str] = None, dep_heads: List[str] = None,
             vid: int = 0, annotator=None, metas: Dict[str, any] = None) -> None:
    Target.__init__(self, qid=qid, text=None, vid=vid, annotator=annotator, metas=metas)
    self.head = head
    self.tail = tail
    self.head_type = head_type
    self.tail_type = tail_type
    self.ner = ner
    self.pos = pos
    self.dep = dep
    self.dep_heads = dep_heads
    self.annotator = annotator

    vocab = annotator.model.vocab
    words = text
    spaces = [True] * len(words)
    tags = [vocab.strings.add(p) for p in pos]
    # deps = [vocab.strings.add(d) for d in dep]
    # heads = dep_heads
    ent_types = [vocab.strings.add(e) for e in ner]
    # attrs = [ENT_TYPE, POS, TAG, DEP, HEAD]
    attrs = [ENT_TYPE, TAG]
    # array = numpy.array(list(zip(ent_types, pos, tags, deps, heads)), dtype="uint64")
    array = numpy.array(list(zip(ent_types, tags)), dtype="uint64")
    doc = Doc(vocab, words=words, spaces=spaces).from_array(attrs, array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    # if any(deps):
    #     doc.is_parsed = True
    self.doc = doc
def __call__(self, text): """Convert a StanfordNLP Doc to a spaCy Doc. text (unicode): The text to process. RETURNS (spacy.tokens.Doc): The spaCy Doc object. """ snlp_doc = self.snlp(text) text = snlp_doc.text tokens, heads = self.get_tokens_with_heads(snlp_doc) if not len(tokens): raise ValueError("No tokens available.") words = [] spaces = [] pos = [] tags = [] deps = [] lemmas = [] offset = 0 for i, token in enumerate(tokens): span = text[offset:] if not len(span): break while not span.startswith(token.text): # If we encounter leading whitespace, skip one character ahead offset += 1 span = text[offset:] words.append(token.text) # Make sure all strings are in the vocabulary pos.append(self.vocab.strings.add(token.upos or "")) tags.append(self.vocab.strings.add(token.xpos or "")) deps.append(self.vocab.strings.add(token.dependency_relation or "")) lemmas.append(self.vocab.strings.add(token.lemma or "")) offset += len(token.text) span = text[offset:] if i == len(tokens) - 1: spaces.append(False) else: next_token = tokens[i + 1] spaces.append(not span.startswith(next_token.text)) attrs = [POS, TAG, DEP, HEAD, LEMMA] array = numpy.array(list(zip(pos, tags, deps, heads, lemmas)), dtype="uint64") doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array) if any(pos) and any(tags): doc.is_tagged = True if any(deps): doc.is_parsed = True return doc
def __call__(self,text): u=self.model(text,raw=True) if text else "" vs=self.vocab.strings r=vs.add("ROOT") words=[] lemmas=[] pos=[] tags=[] feats=[] heads=[] deps=[] spaces=[] norms=[] for t in u.split("\n"): if t=="" or t.startswith("#"): continue s=t.split("\t") if len(s)!=10: continue id,form,lemma,upos,xpos,feat,head,deprel,dummy_deps,misc=s words.append(form) lemmas.append(vs.add(lemma)) pos.append(vs.add(upos)) tags.append(vs.add(xpos)) feats.append(feat) if deprel=="root": heads.append(0) deps.append(r) else: heads.append(int(head)-int(id)) deps.append(vs.add(deprel)) spaces.append(False if "SpaceAfter=No" in misc else True) i=misc.find("Gloss=") if i<0: norms.append(vs.add(form)) else: j=misc.find("|",i) norms.append(vs.add(misc[i+6:] if j<0 else misc[i+6:j])) doc=Doc(self.vocab,words=words,spaces=spaces) a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64") doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a) try: doc.is_tagged=True doc.is_parsed=True except: for i,j in enumerate(feats): if j!="_" and j!="": doc[i].set_morph(j) return doc
def test_issue1834():
    """Test that sentence boundaries &amp; parse/tag flags are not lost during serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged
def __call__(self, text: str) -> Doc:
    dtokens = self.detailed_tokens(text)
    words = [x.surface for x in dtokens]
    spaces = [x.space for x in dtokens]
    doc = Doc(self.vocab, words=words, spaces=spaces)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
        token._.set(self.key_fstring, dtoken.fstring)
    with doc.retokenize() as retokenizer:
        for match in RE_URL.finditer(doc.text):
            span = doc.char_span(*match.span())
            if span:
                retokenizer.merge(span)
    doc.is_tagged = True
    return doc
def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) doc2[0].tag_ = "TAG" doc2[0].pos_ = "X" if spacy_version >= 3: doc2[0].set_morph("Feat=Val") else: doc1.is_parsed = True doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) # DEP requires DEP matcher = Matcher(en_vocab) matcher.add("TEST", [[{"DEP": "a"}]]) matcher(doc1) with pytest.raises(ValueError): matcher(doc2) with pytest.raises(ValueError): matcher(doc3) # errors can be suppressed if desired matcher(doc2, allow_missing=True) matcher(doc3, allow_missing=True) # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab) matcher.add("TEST", [[{attr: "a"}]]) if spacy_version < 3: doc2.is_tagged = True matcher(doc2) with pytest.raises(ValueError): matcher(doc1) with pytest.raises(ValueError): matcher(doc3) # TEXT/ORTH only require tokens matcher = Matcher(en_vocab) matcher.add("TEST", [[{"ORTH": "a"}]]) matcher(doc1) matcher(doc2) matcher(doc3) matcher = Matcher(en_vocab) matcher.add("TEST", [[{"TEXT": "a"}]]) matcher(doc1) matcher(doc2) matcher(doc3)
def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
        matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
    with pytest.warns(None) as record:
        matcher.add("TEST3", [doc3])
        assert not record.list
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
    with pytest.warns(None) as record:
        matcher.add("TEST4", [doc2])
        assert not record.list
def __call__(self,text): u=self.model(text) if text else "" if not self.convUD: return u vs=self.vocab.strings r=vs.add("ROOT") words=[] lemmas=[] pos=[] tags=[] heads=[] deps=[] spaces=[] for t in u.split("\n"): if t=="" or t.startswith("#"): continue s=t.split("\t") if len(s)!=10: continue id,form,lemma,upos,xpos,dummy_feats,head,deprel,dummy_deps,misc=s words.append(form) lemmas.append(vs.add(lemma)) pos.append(vs.add(upos)) tags.append(vs.add(xpos)) if deprel=="root" or deprel=="ROOT": heads.append(0) deps.append(r) elif head=="0": heads.append(0) deps.append(vs.add(deprel)) else: heads.append(int(head)-int(id)) deps.append(vs.add(deprel)) spaces.append(False if "SpaceAfter=No" in misc else True) doc=Doc(self.vocab,words=words,spaces=spaces) a=numpy.array(list(zip(lemmas,pos,tags,deps,heads)),dtype="uint64") doc.from_array([LEMMA,POS,TAG,DEP,HEAD],a) try: doc.is_tagged=True doc.is_parsed=True except: pass return doc
def __call__(self,doc):
  vs=self.vocab.strings
  words=[]
  pos=[]
  tags=[]
  spaces=[]
  for i,(form,xpos) in enumerate(self.pos_tag([t.orth_ for t in doc])):
    if form.strip()=="":
      if len(spaces)>0:
        spaces[-1]=True
    else:
      words.append(form)
      spaces.append(doc[i].whitespace_!="")
      tags.append(vs.add(xpos))
      pos.append(self.tag_map[xpos][POS] if xpos in self.tag_map else X)
  doc=Doc(self.vocab,words=words,spaces=spaces)
  a=numpy.array(list(zip(pos,tags)),dtype="uint64")
  doc.from_array([POS,TAG],a)
  if not SPACY_V3:
    doc.is_tagged=True
  return doc
def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos
def __call__(self, text): """Convert a Stanza Doc to a spaCy Doc. text (unicode): The text to process. RETURNS (spacy.tokens.Doc): The spaCy Doc object. """ snlp_doc = self.snlp(text) if text else Document("") text = snlp_doc.text tokens, heads = self.get_tokens_with_heads(snlp_doc) if not len(tokens): return Doc(self.vocab) words = [] spaces = [] pos = [] tags = [] deps = [] lemmas = [] offset = 0 is_aligned = self.check_aligned(text, tokens) for i, token in enumerate(tokens): span = text[offset:] if not len(span): break while len(span) and span[0].isspace(): # If we encounter leading whitespace, skip one character ahead offset += 1 span = text[offset:] words.append(token.text) # Make sure all strings are in the vocabulary pos.append(self.vocab.strings.add(token.upos or "")) tags.append(self.vocab.strings.add(token.xpos or "")) deps.append(self.vocab.strings.add(token.deprel or "")) lemmas.append(self.vocab.strings.add(token.lemma or "")) offset += len(token.text) span = text[offset:] if i == len(tokens) - 1: spaces.append(False) elif not is_aligned: spaces.append(True) else: next_token = tokens[i + 1] spaces.append(not span.startswith(next_token.text)) attrs = [POS, TAG, DEP, HEAD] array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64") doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array) ents = [] for ent in snlp_doc.entities: ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type) ents.append(ent_span) if not all(ents): warnings.warn( f"Can't set named entities because the character offsets don't " f"map to valid tokens produced by the Stanza tokenizer:\n" f"Words: {words}\n" f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}", stacklevel=4, ) else: doc.ents = ents # Overwrite lemmas separately to prevent them from being overwritten by spaCy lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64") doc.from_array([LEMMA], lemma_array) if any(pos) and any(tags): doc.is_tagged = True if any(deps): doc.is_parsed = True return doc
def __call__(self,text):
  from suparkanbun.tradify import tradify
  t=""
  for c in text:
    if c in self.simplify:
      t+=self.simplify[c]
    else:
      t+=c
  if self.danku!=None:
    u=t.replace("\n","")
    t=""
    while len(u)>500:
      s=self.danku(u[0:500])
      r=""
      for c,p in s:
        r+=c
        if p=="S" or p=="E":
          r+="\n"
      r="\n".join(r.split("\n")[0:-2])+"\n"
      t+=r
      u=u[len(r.replace("\n","")):]
    s=self.danku(u)
    for c,p in s:
      t+=c
      if p=="S" or p=="E":
        t+="\n"
  if len(t)<500:
    p=self.tagger(t.replace("\n",""))
  else:
    p=[]
    u=""
    for s in t.strip().split("\n"):
      u+=s
      if len(u)>400:
        p+=self.tagger(u)
        u=""
    if len(u)>0:
      p+=self.tagger(u)
  u=self.supar.predict([[c for c in s] for s in t.strip().split("\n")],lang=None)
  t=text.replace("\n","")
  i=0
  w=[]
  for s in u.sentences:
    v=[]
    for h,d in zip(s.values[6],s.values[7]):
      j=t[i]
      k=tradify[j] if j in tradify else j
      v.append({"form":j,"lemma":k,"pos":p[i][1],"head":h,"deprel":d})
      i+=1
    for j in reversed(range(0,len(v)-1)):
      if v[j]["deprel"]=="compound" and v[j]["head"]==j+2 and v[j]["pos"]==v[j+1]["pos"]:
        k=v.pop(j)
        v[j]["form"]=k["form"]+v[j]["form"]
        v[j]["lemma"]=k["lemma"]+v[j]["lemma"]
        for k in range(0,len(v)):
          if v[k]["head"]>j+1:
            v[k]["head"]-=1
    w.append(list(v))
  vs=self.vocab.strings
  r=vs.add("ROOT")
  words=[]
  lemmas=[]
  pos=[]
  tags=[]
  feats=[]
  heads=[]
  deps=[]
  spaces=[]
  norms=[]
  for s in w:
    for i,t in enumerate(s):
      form=t["form"]
      words.append(form)
      lemmas.append(vs.add(t["lemma"]))
      p=t["pos"].split(",")
      xpos=",".join(p[0:4])
      pos.append(vs.add(p[4]))
      tags.append(vs.add(xpos))
      feats.append(p[5])
      if t["deprel"]=="root":
        heads.append(0)
        deps.append(r)
      else:
        heads.append(t["head"]-i-1)
        deps.append(vs.add(t["deprel"]))
      spaces.append(False)
      g=self.gloss(form,xpos)
      if g!=None:
        norms.append(vs.add(g))
      else:
        norms.append(vs.add(form))
  doc=Doc(self.vocab,words=words,spaces=spaces)
  a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
  doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
  try:
    # spaCy v2: mark the Doc as tagged/parsed
    doc.is_tagged=True
    doc.is_parsed=True
  except:
    # spaCy v3 removed these flags; set per-token morphology instead
    for i,j in enumerate(feats):
      if j!="_" and j!="":
        doc[i].set_morph(j)
  return doc
def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
    """Convert input text to a spaCy Doc.

    text: The text to process. It can be presegmented or pretokenized:
        str             : raw text,
        List[str]       : presegmented text,
        List[List[str]] : pretokenized text.
    RETURNS: The spaCy Doc object.
    """
    udpipe_sents = self.model(text=text) if text else [Sentence()]
    text = " ".join(s.getText() for s in udpipe_sents)
    tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)
    if not tokens:
        return Doc(vocab=self.vocab)

    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self._check_aligned(text=text, tokens=tokens)
    if not is_aligned:
        text = ""
        for token in tokens:
            text += token.form
            if NO_SPACE not in token.misc:
                text += " "
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not span:
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.form)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upostag or ""))
        # CoNLL xpostags, custom for each UD treebank
        tags.append(self.vocab.strings.add(token.xpostag or ""))
        deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.form)
        span = text[offset:]
        if i == len(tokens) - 1 or NO_SPACE in token.misc:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.form))
    try:
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    except ValueError as e:
        if '[E167]' in str(e):
            raise ValueError(
                "Could not properly assign morphology features. "
                f"Please update the tag map for '{self.model._lang}'"
                " language. See "
                "https://spacy.io/usage/adding-languages#tag-map "
                "for details. A quick workaround is to use the keyword "
                "argument ignore_tag_map=True when loading UDPipeLanguage."
            )
        else:
            raise e
    # Overwrite lemmas separately to prevent overwriting by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array(attrs=[LEMMA], array=lemma_array)
    doc.is_tagged = bool(any(pos) and any(tags))
    doc.is_parsed = bool(any(deps))
    return doc
def __call__(self, text): u = self.model(text) if text else "" vs = self.vocab.strings r = vs.add("ROOT") p = { "ACAUS": "VERB", "ACOND": "SCONJ", "ADV": "ADV", "ALIM": "SCONJ", "APREC": "SCONJ", "ART": "DET", "CCIRC": "SCONJ", "CFOC": "PART", "CONJ": "CCONJ", "COP": "PRON", "CPRET": "AUX", "CREL": "SCONJ", "EXIST": "VERB", "FUT": "AUX", "IMOD": "ADV", "NEG": "ADV", "NPROP": "PROPN", "NUM": "NUM", "PDEM": "DET", "PPOS": "DET", "PREP": "ADP", "PTC": "PART", "PUNCT": "PUNCT" } words = [] lemmas = [] pos = [] tags = [] heads = [] deps = [] spaces = [] norms = [] for s in u.split("\n"): if s.startswith('<norm xml:id="u'): id = s[15:s.index('"', 16)] i = s.index(' orig="') form = s[i + 7:s.index('"', i + 8)] words.append(form) i = s.find(' lemma="') lemmas.append( vs.add(form if i < 0 else s[i + 8:s.index('"', i + 9)])) i = s.find(' norm="') norms.append( vs.add(form if i < 0 else s[i + 7:s.index('"', i + 8)])) i = s.index(' func="') dep = s[i + 7:s.index('"', i + 8)] if dep == "root": heads.append(0) deps.append(r) else: i = s.find(' head="#u') heads.append( 0 if i < 0 else int(s[i + 9:s.index('"', i + 10)]) - int(id)) deps.append(vs.add(dep)) i = s.index(' pos="') xpos = s[i + 6:s.index('"', i + 7)] tags.append(vs.add(xpos)) upos = "X" if xpos in p: upos = p[xpos] elif xpos.startswith("A"): upos = "AUX" elif xpos.startswith("N"): upos = "ADJ" if dep in {"amod", "acl"} else "NOUN" elif xpos.startswith("P"): upos = "PRON" elif xpos.startswith("V"): upos = "VERB" pos.append(vs.add(upos)) spaces.append(False) elif s.startswith("</norm_group>"): spaces[-1] = True doc = Doc(self.vocab, words=words, spaces=spaces) a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)), dtype="uint64") doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a) try: doc.is_tagged = True doc.is_parsed = True except: pass return doc
def correct_dep(doc, rewrite_ne_as_proper_noun):
    complex_tokens = []
    last_head = -1
    for token in doc[0:-1]:
        label = token.dep_
        p = label.find('_as_')
        if p &gt;= 0:
            tag = label[p + 4:]
            if len(tag) &gt; 0:
                lemma = token.lemma_
                token.tag_ = tag
                # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = label[0:p]
    for token in doc[0:-1]:
        label = token.dep_
        if label.startswith('as_'):
            head = token.head
            if last_head == head.i:
                complex_tokens[-1].append(token)
            else:
                complex_tokens.append([token])
                last_head = token.i
            tag = label[3:]
            if len(tag) &gt; 0:
                lemma = token.lemma_
                token.tag_ = tag
                # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma
            token.dep_ = 'dep'
        else:
            complex_tokens.append([token])
            last_head = token.i
    complex_tokens.append([doc[-1]])  # for root detection error
    index = 0
    count = 0
    index_map = [0] * (len(doc) + 1)  # last element is for ner
    for comp in complex_tokens:
        for _ in comp:
            index_map[count] = index
            count += 1
        index += 1
    index_map[-1] = count

    if len(complex_tokens) &gt; 1:
        words, lemmas, tags, pos_details, infs, spaces, sent_starts, deps, heads = zip(*[
            (
                ''.join([t.orth_ + ' ' if t.whitespace_ else t.orth_ for t in comp[0:-1]] + [comp[-1].orth_]),
                ''.join([t.lemma_ + ' ' if t.whitespace_ else t.lemma_ for t in comp[0:-1]] + [comp[-1].lemma_]),
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[-1]._.inf,
                comp[-1].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) if len(comp) &gt; 1 else (
                comp[0].orth_,
                comp[0].lemma_,
                comp[0].pos_,
                comp[0]._.pos_detail,
                comp[0]._.inf,
                comp[0].whitespace_,
                True if comp[0].sent_start else False,
                comp[0].dep_,
                index_map[comp[0].head.i],
            ) for comp in complex_tokens[0:-1]
        ])
    else:
        words = lemmas = tags = pos_details = infs = spaces = sent_starts = deps = heads = []
    new_doc = Doc(doc.vocab, words=words, spaces=spaces)
    for token, lemma, tag, pos_detail, inf, dep in zip(new_doc, lemmas, tags, pos_details, infs, deps):
        token.tag_ = tag
        token.lemma_ = lemma  # work around: lemma_ must be set after tag_ (spaCy's bug)
        token._.pos_detail = pos_detail
        token._.inf = inf
        token.dep_ = dep
    for token, sent_start in zip(new_doc, sent_starts):
        if sent_start:
            token.sent_start = True
    root_i = len(new_doc)
    for token, head in zip(new_doc, heads):
        if head == root_i:
            token.head = token
        else:
            token.head = new_doc[head]
    ents = []
    prev_start = prev_end = -1
    for ent in doc.ents:
        start = index_map[ent.start]
        end = max(index_map[ent.end], start + 1)
        if prev_end &gt; start and prev_start &lt; end:
            ents = ents[:-1]
        ents.append((ent.label, start, end))
        prev_start = start
        prev_end = end
    new_doc.ents = ents
    new_doc.is_tagged = doc.is_tagged
    new_doc.is_parsed = doc.is_parsed

    if rewrite_ne_as_proper_noun:
        for _, start, end in ents:
            for token in new_doc[start:end]:
                lemma = token.lemma_
                token.tag_ = 'PROPN'
                # work around: lemma_ must be set after tag_ (spaCy's bug)
                token.lemma_ = lemma

    new_doc.noun_chunks_iterator = noun_chunks  # TODO work around for spaCy 2.0.12

    if len(doc.text) - len(EOS) != len(new_doc.text):
        print(
            'doc.text length is different from source={} to corrected={}'.format(
                len(doc.text) - len(EOS), len(new_doc.text)),
            file=sys.stderr
        )
        for t in doc:
            print('&lt;', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)
        for t in new_doc:
            print('&gt;', t.i, t.orth_, t.pos_, t.dep_, t.head.i, file=sys.stderr)

    return new_doc
def __call__(self, text): """Convert a Stanza Doc to a spaCy Doc. text (unicode): The text to process. RETURNS (spacy.tokens.Doc): The spaCy Doc object. """ if not text: return Doc(self.vocab) elif text.isspace(): return Doc(self.vocab, words=[text], spaces=[False]) snlp_doc = self.snlp(text) text = snlp_doc.text snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc) words = [] spaces = [] pos = [] tags = [] deps = [] heads = [] lemmas = [] offset = 0 token_texts = [t.text for t in snlp_tokens] is_aligned = True try: words, spaces = self.get_words_and_spaces(token_texts, text) except ValueError: words = token_texts spaces = [True] * len(words) is_aligned = False warnings.warn( "Due to multiword token expansion or an alignment " "issue, the original text has been replaced by space-separated " "expanded tokens.", stacklevel=4, ) offset = 0 for i, word in enumerate(words): if word.isspace() and word != snlp_tokens[i + offset].text: # insert a space token pos.append(self.vocab.strings.add("SPACE")) tags.append(self.vocab.strings.add("_SP")) deps.append(self.vocab.strings.add("")) lemmas.append(self.vocab.strings.add(word)) # increment any heads left of this position that point beyond # this position to the right (already present in heads) for j in range(0, len(heads)): if j + heads[j] >= i: heads[j] += 1 # decrement any heads right of this position that point beyond # this position to the left (yet to be added from snlp_heads) for j in range(i + offset, len(snlp_heads)): if j + snlp_heads[j] < i + offset: snlp_heads[j] -= 1 # initial space tokens are attached to the following token, # otherwise attach to the preceding token if i == 0: heads.append(1) else: heads.append(-1) offset -= 1 else: token = snlp_tokens[i + offset] assert word == token.text pos.append(self.vocab.strings.add(token.upos or "")) tags.append( self.vocab.strings.add(token.xpos or token.feats or "")) deps.append(self.vocab.strings.add(token.deprel or "")) heads.append(snlp_heads[i + offset]) lemmas.append(self.vocab.strings.add(token.lemma or "")) attrs = [POS, TAG, DEP, HEAD] array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64") doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array) ents = [] for ent in snlp_doc.entities: ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type) ents.append(ent_span) if not is_aligned or not all(ents): warnings.warn( f"Can't set named entities because of multi-word token " f"expansion or because the character offsets don't map to " f"valid tokens produced by the Stanza tokenizer:\n" f"Words: {words}\n" f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}", stacklevel=4, ) else: doc.ents = ents # Overwrite lemmas separately to prevent them from being overwritten by spaCy lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64") doc.from_array([LEMMA], lemma_array) if any(pos) or any(tags): doc.is_tagged = True if any(deps) or any(heads): doc.is_parsed = True return doc
def __call__(self,text): t=text.replace("\r","").replace("(","(").replace(")",")").replace("[","[").replace("]","]").replace("{","{").replace("}","}") u=self.model(t) if t else "" vs=self.vocab.strings r=vs.add("ROOT") words=[] lemmas=[] pos=[] tags=[] morphs=[] heads=[] deps=[] spaces=[] norms=[] ent_iobs=[] ent_types=[] bunsetu=[] for t in u.split("\n"): if t=="" or t.startswith("#"): continue s=t.split("\t") if len(s)!=10: continue id,form,lemma,upos,xpos,feats,head,deprel,_,misc=s words.append(form) lemmas.append(vs.add(lemma)) pos.append(vs.add(upos)) tags.append(vs.add(xpos)) morphs.append(feats) if deprel=="root": heads.append(0) deps.append(r) else: heads.append(int(head)-int(id)) deps.append(vs.add(deprel)) spaces.append(False if "SpaceAfter=No" in misc else True) i=misc.find("Translit=") norms.append(vs.add(form if i<0 else misc[i+9:])) i=misc.find("NE=") if i<0: ent_iobs.append(2) ent_types.append(0) else: j=misc.find("|",i) if j<0: j=len(misc) if misc[i+3:i+4]=="B": ent_iobs.append(3) else: ent_iobs.append(1) ent_types.append(vs.add(misc[i+5:j])) bunsetu.append("I") if misc.startswith("BunsetuBILabel="): bunsetu[-1]=misc[15:16] doc=Doc(self.vocab,words=words,spaces=spaces) a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms,ent_iobs,ent_types)),dtype="uint64") doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM,ENT_IOB,ENT_TYPE],a) try: doc.is_tagged=True doc.is_parsed=True except: for i,j in enumerate(morphs): if j!="_" and j!="": doc[i].set_morph(j) doc.user_data["bunsetu_bi_labels"]=bunsetu return doc