def test_issue5048(en_vocab):
    """Check that Doc.from_array and get_doc yield the same text/POS/tag triples."""
    words = ["This", "is", "a", "sentence"]
    pos_s = ["DET", "VERB", "DET", "NOUN"]
    spaces = [" ", " ", " ", ""]
    deps_s = ["dep", "adj", "nn", "atm"]
    tags_s = ["DT", "VBZ", "DT", "NN"]

    # Register every string in the vocab so the IDs below can be resolved.
    store = en_vocab.strings
    for word in words:
        store.add(word)
    dep_ids = [store.add(label) for label in deps_s]
    pos_ids = [store.add(label) for label in pos_s]
    tag_ids = [store.add(label) for label in tags_s]

    # First doc: annotations loaded through a uint64 attribute array.
    columns = [POS, DEP, TAG]
    values = numpy.array(list(zip(pos_ids, dep_ids, tag_ids)), dtype="uint64")
    from_array_doc = Doc(en_vocab, words=words, spaces=spaces)
    from_array_doc.from_array(columns, values)
    v1 = [(tok.text, tok.pos_, tok.tag_) for tok in from_array_doc]

    # Second doc: the same annotations passed through the get_doc helper.
    helper_doc = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
    v2 = [(tok.text, tok.pos_, tok.tag_) for tok in helper_doc]

    assert v1 == v2
def get_doc(self, words=None, pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given words and annotations, using a fresh Vocab.

    words (list): Token texts. Defaults to no tokens.
    pos (list): Per-token POS strings. Defaults to "" for every token.
    heads (list): Per-token values written to the HEAD column. Defaults to 0.
    deps (list): Per-token dependency label strings. Defaults to "".
    tags (list): Per-token tag strings. Defaults to "".
    ents (list): Optional (start, end, label) triples turned into Span objects.
    RETURNS (Doc): The constructed Doc.
    """
    # A None sentinel replaces the former mutable `[]` default argument.
    words = [] if words is None else words
    vocab = Vocab()
    n_words = len(words)
    pos = pos or [""] * n_words
    tags = tags or [""] * n_words
    heads = heads or [0] * n_words
    deps = deps or [""] * n_words
    # Every label must be in the string store before it can be looked up below.
    for value in deps + tags + pos:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    # Tags are assigned per token rather than through the attribute array.
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc
def get_doc(vocab, words, pos, heads, deps):
    """Create a Doc from words plus POS, HEAD and DEP annotations.

    All three annotation lists must have one entry per word. POS and DEP
    values are strings stored in the vocab; head values are written as given.
    """
    assert len(pos) == len(words)
    assert len(heads) == len(words)
    assert len(deps) == len(words)

    columns = [(POS, pos), (HEAD, heads), (DEP, deps)]
    headings = [attr for attr, _ in columns]

    # Collect the string-valued annotations (everything except heads) and
    # register them in the string store.
    string_values = []
    for _, annot in columns:
        if annot is not heads:
            string_values.extend(annot)
    for value in string_values:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    attrs = doc.to_array(headings)
    n_words = len(words)
    for col, (_, annot) in enumerate(columns):
        if annot is heads:
            for row in range(n_words):
                attrs[row, col] = heads[row]
        else:
            for row in range(n_words):
                attrs[row, col] = doc.vocab.strings[annot[row]]
    doc.from_array(headings, attrs)
    return doc
def get_doc(vocab, words=None, pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations.

    vocab (Vocab): Vocabulary shared with the new Doc; labels are added to it.
    words (list): Token texts. Defaults to no tokens.
    pos (list): Per-token POS strings. Defaults to "" for every token.
    heads (list): Per-token values written to the HEAD column. Defaults to 0.
    deps (list): Per-token dependency label strings. Defaults to "".
    tags (list): Per-token tag strings. Defaults to "".
    ents (list): Optional (start, end, label) triples turned into Span objects.
    RETURNS (Doc): The constructed Doc.
    """
    # A None sentinel replaces the former mutable `[]` default argument.
    words = [] if words is None else words
    n_words = len(words)
    pos = pos or [""] * n_words
    tags = tags or [""] * n_words
    heads = heads or [0] * n_words
    deps = deps or [""] * n_words
    # Every label must be in the string store before it can be looked up below.
    for value in deps + tags + pos:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    # Tags are assigned per token rather than through the attribute array.
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc
def __call__(self, text):
    """Convert input text to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    if text:
        udpipe_sents = self.model(text)
    else:
        udpipe_sents = [Sentence()]
    # From here on, work with the text as reassembled from the sentences.
    text = " ".join(sent.getText() for sent in udpipe_sents)
    tokens, heads = self.get_tokens_with_heads(udpipe_sents)
    if not tokens:
        return Doc(self.vocab)

    words, spaces = [], []
    pos, tags, deps, lemmas = [], [], [], []
    offset = 0
    is_aligned = self.check_aligned(text, tokens)
    last_index = len(tokens) - 1
    for i, token in enumerate(tokens):
        remaining = text[offset:]
        if not remaining:
            break
        # Skip leading whitespace one character at a time.
        while len(remaining) and remaining[0].isspace():
            offset += 1
            remaining = text[offset:]
        words.append(token.form)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upostag or ""))
        # The TAG column carries the token's feats rather than its xpostag.
        tags.append(self.vocab.strings.add(token.feats or ""))
        deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.form)
        remaining = text[offset:]
        if i == last_index or "SpaceAfter=No" in token.misc:
            followed_by_space = False
        elif not is_aligned:
            followed_by_space = True
        else:
            # A space follows unless the next token starts right here.
            followed_by_space = not remaining.startswith(tokens[i + 1].form)
        spaces.append(followed_by_space)

    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    # Lemmas are written in a second pass so spaCy does not overwrite them.
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc
def __call__(self, text):
    """Convert a StanfordNLP Doc to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    if text:
        snlp_doc = self.snlp(text)
    else:
        snlp_doc = Document("")
    text = snlp_doc.text
    tokens, heads = self.get_tokens_with_heads(snlp_doc)
    if not len(tokens):
        return Doc(self.vocab)

    words, spaces = [], []
    pos, tags, deps, lemmas = [], [], [], []
    offset = 0
    is_aligned = self.check_aligned(text, tokens)
    last_index = len(tokens) - 1
    for i, token in enumerate(tokens):
        remaining = text[offset:]
        if not len(remaining):
            break
        # Skip leading whitespace one character at a time.
        while len(remaining) and remaining[0].isspace():
            offset += 1
            remaining = text[offset:]
        words.append(token.text)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upos or ""))
        tags.append(self.vocab.strings.add(token.xpos or ""))
        deps.append(self.vocab.strings.add(token.dependency_relation or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.text)
        remaining = text[offset:]
        if i == last_index:
            followed_by_space = False
        elif not is_aligned:
            followed_by_space = True
        else:
            # A space follows unless the next token starts right here.
            followed_by_space = not remaining.startswith(tokens[i + 1].text)
        spaces.append(followed_by_space)

    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    # Lemmas are written in a second pass so spaCy does not overwrite them.
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc
def get_doc(vocab, words=None, pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
    """Create Doc object from given vocab, words and annotations.

    vocab (Vocab): Vocabulary shared with the new Doc; labels are added to it.
    words (list): Token texts. Defaults to no tokens.
    pos, deps, lemmas, tags (list): Optional per-token strings. Only the
        annotations that are passed become columns of the attribute array.
    heads (list): Optional per-token head values. If deps are given without
        heads, heads default to 0 for every token.
    ents (list): Optional (start, end, label) triples turned into Span objects.
    RETURNS (Doc): The constructed Doc.
    RAISES (ValueError): If an annotation's length differs from len(words).
    """
    # A None sentinel replaces the former mutable `[]` default argument.
    words = [] if words is None else words
    if deps and not heads:
        heads = [0] * len(deps)

    annotations = [pos, heads, deps, lemmas, tags]
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
    headings = []
    columns = []  # the annotations actually provided, in column order
    values = []
    for heading, annot in zip(possible_headings, annotations):
        if annot is None:
            continue
        if len(annot) != len(words):
            raise ValueError(Errors.E189)
        headings.append(heading)
        columns.append(annot)
        # Heads are numeric; everything else is a string for the vocab.
        if annot is not heads:
            values.extend(annot)
    for value in values:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    # if there are any other annotations, set them
    if headings:
        attrs = doc.to_array(headings)
        # Columns are chosen with the same `is not None` test used for the
        # headings above, so indices cannot drift apart.
        for j, annot in enumerate(columns):
            for i, item in enumerate(annot):
                value = item if annot is heads else doc.vocab.strings[item]
                # The array may be 1-dimensional (presumably when only one
                # attribute is requested); index accordingly.
                if attrs.ndim == 1:
                    attrs[i] = value
                else:
                    attrs[i, j] = value
        doc.from_array(headings, attrs)

    # finally, set the entities
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    return doc
def remove_tokens_on_match(doc):
    """Return a new Doc without the tokens whose POS is PUNCT, NUM or SYM.

    The new Doc shares the vocab of `doc` and carries over only the LOWER,
    POS, ENT_TYPE and IS_ALPHA attributes of the remaining tokens.
    """
    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA]
    indexes = [
        index for index, token in enumerate(doc)
        if token.pos_ in ('PUNCT', 'NUM', 'SYM')
    ]
    # Set lookup keeps the word filter linear in the number of tokens.
    removed = set(indexes)

    np_array = doc.to_array(attrs)
    np_array = numpy.delete(np_array, indexes, axis=0)
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in removed])
    doc2.from_array(attrs, np_array)
    return doc2
def __call__(self, text):
    """Parse `text` with the wrapped model and build a spaCy Doc from its output.

    The model output is read as tab-separated lines of ten fields; blank
    lines, lines starting with "#" and lines with another field count are
    skipped. NORM is taken from a "Translit=" entry in the last field when
    present, otherwise from the form. "B"/"I" labels derived from
    Tree(u)._cabocha._sentences are stored in
    doc.user_data["bunsetu_bi_labels"].
    """
    u = self.model(text, raw=True) if text else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    feats = []
    heads = []
    deps = []
    spaces = []
    norms = []
    for line in u.split("\n"):
        if line == "" or line.startswith("#"):
            continue
        fields = line.split("\t")
        if len(fields) != 10:
            continue
        # `tok_id` avoids shadowing the builtin `id`.
        tok_id, form, lemma, upos, xpos, feat, head, deprel, _deps, misc = fields
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        feats.append(feat)
        if deprel == "root":
            heads.append(0)
            deps.append(r)
        else:
            # Heads are stored as offsets relative to the token itself.
            heads.append(int(head) - int(tok_id))
            deps.append(vs.add(deprel))
        spaces.append("SpaceAfter=No" not in misc)
        i = misc.find("Translit=")
        norms.append(vs.add(form if i < 0 else misc[i + 9:]))
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
    try:
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # Presumably a spaCy version where these flags cannot be assigned;
        # in that case attach the morphological features instead.
        for i, j in enumerate(feats):
            if j != "_" and j != "":
                doc[i].set_morph(j)
    t = Tree(u)
    t._makeChunks()
    bunsetu = ["I"] * len(doc)
    for sentence in t._cabocha._sentences:
        for w in sentence:
            try:
                bunsetu[w[0] - 1] = "B"
            except Exception:
                # Best effort: entries that cannot be mapped to a token stay "I".
                pass
    doc.user_data["bunsetu_bi_labels"] = bunsetu
    return doc
def test_doc_from_array_morph(en_vocab):
    """Check that MORPH values survive a to_array / from_array round trip."""
    # fmt: off
    words = ["I", "live", "in", "New", "York", "."]
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
    # fmt: on
    attrs = [MORPH]
    source = Doc(en_vocab, words=words, morphs=morphs)
    exported = source.to_array(attrs)

    restored = Doc(en_vocab, words=words)
    restored.from_array(attrs, exported)

    restored_morphs = [str(token.morph) for token in restored]
    source_morphs = [str(token.morph) for token in source]
    assert restored_morphs == morphs
    assert source_morphs == restored_morphs
def test_doc_from_array_sent_starts(en_vocab):
    """Check how Doc.from_array treats SENT_START alongside HEAD and DEP."""
    # Local import so this block does not depend on the file's import header.
    import warnings

    # fmt: off
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)

    # HEAD overrides SENT_START without warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)

    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    # `pytest.warns(None)` is deprecated in pytest 7 and no longer accepted
    # in pytest 8; turning warnings into errors asserts the same thing.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        new_doc.from_array(attrs, arr)

    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.has_annotation("DEP")

    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert new_doc.has_annotation("DEP")
def remove_tokens_on_match(self, doc):
    """Return a new Doc without the non-stop-word tokens tagged "NN".

    The new Doc shares the vocab of `doc` and carries over only the LOWER,
    POS, ENT_TYPE and IS_ALPHA attributes of the remaining tokens.
    """
    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA]
    indexes = [
        inx for inx, token in enumerate(doc)
        if not token.is_stop and token.tag_ == "NN"
    ]
    # Set lookup keeps the word filter linear in the number of tokens.
    removed = set(indexes)

    np_array = doc.to_array(attrs)
    np_array = numpy.delete(np_array, indexes, axis=0)
    doc2 = Doc(
        doc.vocab,
        words=[t.text for i, t in enumerate(doc) if i not in removed],
    )
    doc2.from_array(attrs, np_array)
    return doc2
def __call__(self, text):
    """Parse `text` with the wrapped model and build a spaCy Doc from its output.

    The model output is read as tab-separated lines of ten fields; blank
    lines, lines starting with "#" and lines with another field count are
    skipped. NORM is taken from a "Gloss=" entry in the last field (up to the
    next "|") when present, otherwise from the form.
    """
    u = self.model(text, raw=True) if text else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    feats = []
    heads = []
    deps = []
    spaces = []
    norms = []
    for line in u.split("\n"):
        if line == "" or line.startswith("#"):
            continue
        fields = line.split("\t")
        if len(fields) != 10:
            continue
        # `tok_id` avoids shadowing the builtin `id`.
        tok_id, form, lemma, upos, xpos, feat, head, deprel, _deps, misc = fields
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        feats.append(feat)
        if deprel == "root":
            heads.append(0)
            deps.append(r)
        else:
            # Heads are stored as offsets relative to the token itself.
            heads.append(int(head) - int(tok_id))
            deps.append(vs.add(deprel))
        spaces.append("SpaceAfter=No" not in misc)
        i = misc.find("Gloss=")
        if i < 0:
            norms.append(vs.add(form))
        else:
            j = misc.find("|", i)
            norms.append(vs.add(misc[i + 6:] if j < 0 else misc[i + 6:j]))
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
    try:
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # Presumably a spaCy version where these flags cannot be assigned;
        # in that case attach the morphological features instead.
        for i, j in enumerate(feats):
            if j != "_" and j != "":
                doc[i].set_morph(j)
    return doc
def make_doc_from_text_chunks(
    text: str,
    lang: Union[str, Language],
    chunk_size: int = 100000,
) -> Doc:
    """
    Make a single spaCy-processed document from 1 or more chunks of ``text``.
    This is a workaround for processing very long texts, for which spaCy
    is unable to allocate enough RAM.

    Although this function's performance is *pretty good*, it's inherently
    less performant than just processing the entire text in one shot.
    Only use it if necessary!

    Args:
        text: Text document to be chunked and processed by spaCy.
        lang: A 2-letter language code (e.g. "en"), the name of a spaCy model
            for the desired language, or an already-instantiated spaCy language
            pipeline.
        chunk_size: Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller). Must be
            positive. For best performance, value should be somewhere between
            1e3 and 1e7, depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural language segmentation, which means that
               every ``chunk_size`` characters, spaCy will probably get tripped
               up and make weird parsing errors.

    Returns:
        A single processed document, initialized from components accumulated
        chunk by chunk. An empty ``text`` yields an empty document.

    Raises:
        TypeError: If ``lang`` is neither a str nor a ``Language``.
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if isinstance(lang, str):
        lang = core.load_spacy_lang(lang)
    elif not isinstance(lang, Language):
        raise TypeError(
            errors.type_invalid_msg("lang", type(lang), Union[str, Language]))
    # A non-positive step would never advance through the text.
    if chunk_size < 1:
        raise ValueError(
            "chunk_size = {} is invalid; value must be a positive integer".format(chunk_size))

    words: List[str] = []
    spaces: List[bool] = []
    np_arrays = []
    cols = [
        attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE
    ]
    # iterate over text chunks and accumulate components needed to make a doc
    for start in range(0, len(text), chunk_size):
        chunk_doc = lang(text[start:start + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
        spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
        np_arrays.append(chunk_doc.to_array(cols))

    # now, initialize the doc from words and spaces
    # then load attribute values from the concatenated np array
    doc = Doc(lang.vocab, words=words, spaces=spaces)
    # np.concatenate() rejects an empty sequence, so skip it for empty text
    if np_arrays:
        doc = doc.from_array(cols, np.concatenate(np_arrays, axis=0))
    return doc
def doc_cleaning(doc: Doc):
    """Return a copy of `doc` with cleaned token texts and the same attributes.

    Every token text except 'PairDrug1' and 'PairDrug2' is passed through
    number_substitution; a resulting '%' is replaced by a single space. The
    attribute array exported from `doc` is loaded into the new Doc unchanged.
    """
    np_array = doc.to_array(
        [LEMMA, LOWER, POS, TAG, ENT_TYPE, IS_ALPHA, DEP, HEAD, SPACY])

    cleaned_words = []
    for token in doc:
        word = token.text
        if word not in ('PairDrug1', 'PairDrug2'):
            word = number_substitution(word)
        if word == '%':
            word = ' '
        cleaned_words.append(word)

    doc2 = Doc(doc.vocab, words=cleaned_words)
    doc2.from_array(
        [LEMMA, LOWER, POS, TAG, ENT_TYPE, IS_ALPHA, DEP, HEAD, SPACY],
        np_array)
    return doc2
def test_issue2203(en_vocab):
    """Test that lemmas are set correctly in doc.from_array."""
    words = ["I", "'ll", "survive"]
    tags = ["PRP", "MD", "VB"]
    lemmas = ["-PRON-", "will", "survive"]
    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
    lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
    tag_column = numpy.array(tag_ids, dtype="uint64")
    lemma_column = numpy.array(lemma_ids, dtype="uint64")

    doc = Doc(en_vocab, words=words)
    # Lemmas go in after tags to work around the lemma corruption problem.
    doc.from_array("TAG", tag_column)
    doc.from_array("LEMMA", lemma_column)
    assert [token.tag_ for token in doc] == tags
    assert [token.lemma_ for token in doc] == lemmas

    # Both tag and lemma must be serialized, since the combination is what
    # causes the bug.
    columns = ["TAG", "LEMMA"]
    doc_array = doc.to_array(columns)
    new_doc = Doc(doc.vocab, words=words).from_array(columns, doc_array)
    assert [token.tag_ for token in new_doc] == tags
    assert [token.lemma_ for token in new_doc] == lemmas
def remove_tokens_on_match(doc):
    """Return the text of `doc` after dropping tokens with selected tags.

    Tokens tagged NNP, NN, NNS, CD, UH or JJ are removed. The remaining
    tokens are assembled into a new Doc that keeps their LOWER, POS, ENT_TYPE
    and IS_ALPHA attributes, and that Doc's `.text` (a string) is returned.
    """
    tags_to_remove = {"NNP", "NN", "NNS", "CD", "UH", "JJ"}
    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA]

    indexes = [
        index for index, token in enumerate(doc)
        if token.tag_ in tags_to_remove
    ]
    # Set lookup keeps the word filter linear in the number of tokens.
    removed = set(indexes)

    np_array = doc.to_array(attrs)
    np_array = numpy.delete(np_array, indexes, axis=0)
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in removed])
    doc2.from_array(attrs, np_array)
    return doc2.text
def test_example_constructor(en_vocab):
    """Check that an Example exposes the reference doc's TAG values as strings."""
    words = ["I", "like", "stuff"]
    tags = ["NOUN", "VERB", "NOUN"]
    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
    tag_column = numpy.array(tag_ids, dtype="uint64")

    predicted = Doc(en_vocab, words=words)
    # Only the reference doc carries the tag annotation.
    reference = Doc(en_vocab, words=words).from_array("TAG", tag_column)

    example = Example(predicted, reference)
    aligned_tags = example.get_aligned("TAG", as_string=True)
    assert aligned_tags == ["NOUN", "VERB", "NOUN"]
def substitution(doc: Doc, index: int, value: int) -> Doc:
    """Return a copy of `doc` whose token at `index` is replaced by a placeholder.

    Args:
        doc: Source document; its attribute array is copied to the new Doc.
        index: Position of the token text to replace.
        value: Placeholder selector: -1 -> 'NoPair', 0 -> "Drug",
            1 -> "PairDrug1", 2 -> "PairDrug2".

    Returns:
        A new Doc over the same vocab with the substituted word list.

    Raises:
        ValueError: If `value` is not one of -1, 0, 1, 2. Previously this
            surfaced as an UnboundLocalError on `item`.
    """
    placeholders = {-1: 'NoPair', 0: "Drug", 1: "PairDrug1", 2: "PairDrug2"}
    # Validate before touching the doc so a bad selector fails fast.
    if value not in placeholders:
        raise ValueError(
            "value = {!r} is invalid; must be one of {}".format(
                value, sorted(placeholders)))
    item = placeholders[value]

    attr_ids = [LEMMA, LOWER, POS, TAG, ENT_TYPE, IS_ALPHA, DEP, HEAD, SPACY]
    np_array = doc.to_array(attr_ids)
    words = [t.text for t in doc]
    words[index] = item
    doc2 = Doc(doc.vocab, words=words)
    doc2.from_array(attr_ids, np_array)
    return doc2
def __call__(self, text):
    """Run the wrapped model on `text` and optionally convert the output to a Doc.

    When `self.convUD` is falsy the raw model output is returned unchanged.
    Otherwise the output is read as tab-separated lines of ten fields (blank
    lines, "#" lines and lines with another field count are skipped) and
    loaded into a spaCy Doc with LEMMA, POS, TAG, DEP and HEAD set.
    """
    u = self.model(text) if text else ""
    if not self.convUD:
        return u
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    heads = []
    deps = []
    spaces = []
    for line in u.split("\n"):
        if line == "" or line.startswith("#"):
            continue
        fields = line.split("\t")
        if len(fields) != 10:
            continue
        # `tok_id` avoids shadowing the builtin `id`.
        tok_id, form, lemma, upos, xpos, _feats, head, deprel, _deps, misc = fields
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        if deprel == "root" or deprel == "ROOT":
            heads.append(0)
            deps.append(r)
        elif head == "0":
            # Head 0 with a non-root label: attach to itself, keep the label.
            heads.append(0)
            deps.append(vs.add(deprel))
        else:
            # Heads are stored as offsets relative to the token itself.
            heads.append(int(head) - int(tok_id))
            deps.append(vs.add(deprel))
        spaces.append("SpaceAfter=No" not in misc)
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads)), dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD], a)
    try:
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # Presumably a spaCy version where these flags cannot be assigned;
        # the annotations loaded above are unaffected, so carry on.
        pass
    return doc
def __call__(self, doc):
    """Tag the tokens of `doc` and return a new Doc carrying POS and TAG.

    Forms that are empty after stripping are dropped; when one is dropped,
    the previously kept token (if any) is marked as followed by a space.
    """
    strings = self.vocab.strings
    words, spaces = [], []
    pos, tags = [], []
    tagged = self.pos_tag([token.orth_ for token in doc])
    for i, (form, xpos) in enumerate(tagged):
        if form.strip() == "":
            if len(spaces) > 0:
                spaces[-1] = True
            continue
        words.append(form)
        spaces.append(doc[i].whitespace_ != "")
        tags.append(strings.add(xpos))
        # Tags missing from the tag map fall back to X.
        if xpos in self.tag_map:
            pos.append(self.tag_map[xpos][POS])
        else:
            pos.append(X)

    tagged_doc = Doc(self.vocab, words=words, spaces=spaces)
    values = numpy.array(list(zip(pos, tags)), dtype="uint64")
    tagged_doc.from_array([POS, TAG], values)
    if not SPACY_V3:
        tagged_doc.is_tagged = True
    return tagged_doc
def test_doc_from_array_heads_in_bounds(en_vocab):
    """Test that Doc.from_array doesn't set heads that are out of bounds."""
    words = ["This", "is", "a", "sentence", "."]
    attrs = ["HEAD"]
    doc = Doc(en_vocab, words=words)
    first_token = doc[0]
    for token in doc:
        token.head = first_token

    # correct: the unmodified export loads without error
    valid_heads = doc.to_array(attrs)
    doc_from_array = Doc(en_vocab, words=words)
    doc_from_array.from_array(attrs, valid_heads)

    # head before start (-1), then head after end (5)
    for bad_head in (-1, 5):
        invalid_heads = doc.to_array(attrs)
        invalid_heads[0] = bad_head
        doc_from_array = Doc(en_vocab, words=words)
        with pytest.raises(ValueError):
            doc_from_array.from_array(attrs, invalid_heads)
def test_doc_from_array_sent_starts(en_vocab):
    """Check how Doc.from_array treats SENT_START alongside HEAD and DEP."""
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    # One label per word. The list used to hold an eleventh entry that zip()
    # silently dropped.
    # fmt: off
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    for i, (dep, head) in enumerate(zip(deps, heads)):
        doc[i].dep_ = dep
        doc[i].head = doc[head]
        # A token that is its own head starts a sentence.
        if head == i:
            doc[i].is_sent_start = True

    # SENT_START together with HEAD is rejected
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        new_doc.from_array(attrs, arr)

    # SENT_START with DEP keeps the sentence starts but is not a parse
    attrs = [SENT_START, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.is_parsed

    # HEAD with DEP yields the same sentence starts and a parsed doc
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert new_doc.is_parsed
def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    expected = ("10", "NUM", "CD", "PERCENT")

    def third_token_summary(d):
        # (text, POS, tag, entity type) of the token at index 2
        return (d[2].text, d[2].pos_, d[2].tag_, d[2].ent_type_)

    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    assert third_token_summary(doc) == expected

    # Reloading only the entity columns must leave POS and tag untouched.
    header = [ENT_IOB, ENT_TYPE]
    doc.from_array(header, doc.to_array(header))
    assert third_token_summary(doc) == expected

    # Serializing then deserializing
    restored = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert third_token_summary(restored) == expected
def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""
    words = "Just what I was looking for .".split()
    # One [HEAD, DEP] row per word, stored as raw uint64 values.
    rows = [
        [1, 397],
        [4, 436],
        [2, 426],
        [1, 402],
        [0, 8206900633647566924],
        [18446744073709551615, 440],
        [18446744073709551614, 442],
    ]
    heads_deps = numpy.asarray(rows, dtype="uint64")

    doc = Doc(Vocab(), words=words)
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)

    sentences = list(doc.sents)
    assert len(sentences) == 1
def read_spacy_docs(
    filepath: Union[str, pathlib.Path],
    *,
    format: str = "pickle",
    lang: Optional[Union[str, Language]] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written either in pickle or
    binary format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"pickle", "binary"}): Format of the data that was written to disk.
            If 'pickle', use ``pickle`` in python's stdlib; if 'binary', use
            the 3rd-party ``msgpack`` library.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

            .. warning:: When writing docs in binary format, spaCy's built-in
               ``spacy.Doc.to_bytes()`` method is used, but when reading the data
               back in :func:`read_spacy_docs()`, experimental and *unofficial*
               work-arounds are used to allow for all the docs in ``data`` to be
               read from the same file. If spaCy changes, this code could break,
               so use this functionality at your own risk!

        lang: Already-instantiated ``spacy.Language`` object, or the string name
            by which it can be loaded, used to process the docs written to disk
            at ``filepath``. Note that this is only applicable when ``format="binary"``.

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "pickle" or "binary", or if ``lang`` is
            not provided (or is of an invalid type) when ``format="binary"``
    """
    if format == "pickle":
        # SECURITY NOTE: unpickling can execute arbitrary code; only read
        # files from sources you trust.
        with io_utils.open_sesame(filepath, mode="rb") as f:
            for spacy_doc in pickle.load(f):
                yield spacy_doc
    elif format == "binary":
        if lang is None:
            raise ValueError(
                "When format='binary', a `spacy.Language` (and its associated "
                "`spacy.Vocab`) is required to deserialize the binary data; "
                "and these should be the same as were used when processing "
                "the original docs!")
        elif isinstance(lang, Language):
            vocab = lang.vocab
        elif isinstance(lang, str):
            vocab = spacier.core.load_spacy_lang(lang).vocab
        else:
            # The placeholder used to be left unformatted in this message.
            raise ValueError(
                "lang = '{}' is invalid; must be a str or `spacy.Language`".format(lang))
        with io_utils.open_sesame(filepath, mode="rb") as f:
            unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
            for msg in unpacker:

                # NOTE: The following code has been adapted from spaCy's
                # built-in ``spacy.Doc.from_bytes()``. If that functionality
                # changes, the following will probably break...

                # Msgpack doesn't distinguish between lists and tuples, which is
                # vexing for user data. As a best guess, we *know* that within
                # keys, we must have tuples. In values we just have to hope
                # users don't mind getting a list instead of a tuple.
                if "user_data_keys" in msg:
                    user_data_keys = msgpack.loads(msg["user_data_keys"], use_list=False)
                    user_data_values = msgpack.loads(msg["user_data_values"])
                    user_data = dict(zip(user_data_keys, user_data_values))
                else:
                    user_data = None

                text = msg["text"]
                attrs = msg["array_body"]
                words = []
                spaces = []
                start = 0
                # The first two columns are read as token length and
                # trailing-space flag; use them to slice words out of the text.
                for i in range(attrs.shape[0]):
                    end = start + int(attrs[i, 0])
                    has_space = int(attrs[i, 1])
                    words.append(text[start:end])
                    spaces.append(bool(has_space))
                    start = end + has_space

                spacy_doc = Doc(vocab, words=words, spaces=spaces, user_data=user_data)
                # The remaining columns are loaded as regular token attributes.
                spacy_doc = spacy_doc.from_array(msg["array_head"][2:], attrs[:, 2:])
                if "sentiment" in msg:
                    spacy_doc.sentiment = msg["sentiment"]
                if "tensor" in msg:
                    spacy_doc.tensor = msg["tensor"]
                yield spacy_doc
    else:
        raise ValueError(
            "format = '{}' is invalid; value must be one of {}".format(
                format, {"pickle", "binary"}))
def __call__(self,text):
    """Segment, tag and parse `text` character by character and return a spaCy Doc.

    Outline: characters are mapped through `self.simplify`; if `self.danku` is
    set, the text is re-segmented into lines from its per-character labels;
    `self.tagger` supplies a tag string per character; `self.supar.predict`
    supplies heads and relations per line; adjacent "compound" characters with
    equal tags are merged; the result is loaded into a Doc with LEMMA, POS,
    TAG, DEP, HEAD and NORM set.
    """
    from suparkanbun.tradify import tradify
    # Map every character through self.simplify where a mapping exists.
    t=""
    for c in text:
        if c in self.simplify:
            t+=self.simplify[c]
        else:
            t+=c
    if self.danku!=None:
        # Re-segment: drop existing newlines and insert one after every
        # character labelled "S" or "E".
        u=t.replace("\n","")
        t=""
        # Long input is handled in windows of 500 characters.
        while len(u)>500:
            s=self.danku(u[0:500])
            r=""
            for c,p in s:
                r+=c
                if p=="S" or p=="E":
                    r+="\n"
            # Keep all but the last two newline-separated pieces of this
            # window; the rest is re-processed in the next round.
            r="\n".join(r.split("\n")[0:-2])+"\n"
            t+=r
            # Advance by the number of non-newline characters kept.
            u=u[len(r.replace("\n","")):]
        s=self.danku(u)
        for c,p in s:
            t+=c
            if p=="S" or p=="E":
                t+="\n"
    # Tag the characters: in one call for short input, otherwise line by line
    # in batches that are flushed once they exceed 400 characters.
    if len(t)<500:
        p=self.tagger(t.replace("\n",""))
    else:
        p=[]
        u=""
        for s in t.strip().split("\n"):
            u+=s
            if len(u)>400:
                p+=self.tagger(u)
                u=""
        if len(u)>0:
            p+=self.tagger(u)
    # Parse each line as a list of single characters.
    u=self.supar.predict([[c for c in s] for s in t.strip().split("\n")],lang=None)
    # Forms are taken from the original (unsimplified) text without newlines;
    # i walks through it in step with the parser output.
    t=text.replace("\n","")
    i=0
    w=[]
    for s in u.sentences:
        v=[]
        # s.values[6] / s.values[7] are used as heads / relations
        # (presumably the CoNLL-U HEAD and DEPREL columns — TODO confirm).
        for h,d in zip(s.values[6],s.values[7]):
            j=t[i]
            k=tradify[j] if j in tradify else j
            v.append({"form":j,"lemma":k,"pos":p[i][1],"head":h,"deprel":d})
            i+=1
        # Right to left: merge token j into its right neighbour when it is a
        # "compound" whose head is that neighbour (1-based j+2) and both
        # carry the same tag string.
        for j in reversed(range(0,len(v)-1)):
            if v[j]["deprel"]=="compound" and v[j]["head"]==j+2 and v[j]["pos"]==v[j+1]["pos"]:
                k=v.pop(j)
                v[j]["form"]=k["form"]+v[j]["form"]
                v[j]["lemma"]=k["lemma"]+v[j]["lemma"]
                # Shift heads that pointed past the removed position.
                for k in range(0,len(v)):
                    if v[k]["head"]>j+1:
                        v[k]["head"]-=1
        w.append(list(v))
    vs=self.vocab.strings
    r=vs.add("ROOT")
    words=[]
    lemmas=[]
    pos=[]
    tags=[]
    feats=[]
    heads=[]
    deps=[]
    spaces=[]
    norms=[]
    for s in w:
        for i,t in enumerate(s):
            form=t["form"]
            words.append(form)
            lemmas.append(vs.add(t["lemma"]))
            # The tag string is comma-separated: fields 0-3 form the TAG,
            # field 4 is used as POS and field 5 as features.
            p=t["pos"].split(",")
            xpos=",".join(p[0:4])
            pos.append(vs.add(p[4]))
            tags.append(vs.add(xpos))
            feats.append(p[5])
            if t["deprel"]=="root":
                heads.append(0)
                deps.append(r)
            else:
                # Convert the 1-based head index into an offset from token i.
                heads.append(t["head"]-i-1)
                deps.append(vs.add(t["deprel"]))
            spaces.append(False)
            # NORM is the gloss when self.gloss returns one, else the form.
            g=self.gloss(form,xpos)
            if g!=None:
                norms.append(vs.add(g))
            else:
                norms.append(vs.add(form))
    doc=Doc(self.vocab,words=words,spaces=spaces)
    a=numpy.array(list(zip(lemmas,pos,tags,deps,heads,norms)),dtype="uint64")
    doc.from_array([LEMMA,POS,TAG,DEP,HEAD,NORM],a)
    try:
        doc.is_tagged=True
        doc.is_parsed=True
    except:
        # NOTE(review): presumably reached on spaCy versions where these
        # flags cannot be assigned; features are then attached as morphology.
        for i,j in enumerate(feats):
            if j!="_" and j!="":
                doc[i].set_morph(j)
    return doc
def __call__(self, text):
    """Convert a Stanza Doc to a spaCy Doc.

    Empty text yields an empty Doc; whitespace-only text yields a Doc with a
    single token. Otherwise the Stanza tokens are aligned with the text,
    extra whitespace tokens are inserted with adjusted heads, and POS, TAG,
    DEP, HEAD, LEMMA and (when alignment allows) entities are set.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    if not text:
        return Doc(self.vocab)
    elif text.isspace():
        return Doc(self.vocab, words=[text], spaces=[False])

    snlp_doc = self.snlp(text)
    text = snlp_doc.text
    snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    heads = []
    lemmas = []
    offset = 0
    token_texts = [t.text for t in snlp_tokens]
    is_aligned = True
    try:
        words, spaces = self.get_words_and_spaces(token_texts, text)
    except ValueError:
        # Alignment failed: fall back to the token texts separated by spaces.
        words = token_texts
        spaces = [True] * len(words)
        is_aligned = False
        warnings.warn(
            "Due to multiword token expansion or an alignment "
            "issue, the original text has been replaced by space-separated "
            "expanded tokens.",
            stacklevel=4,
        )
    # offset maps an index in `words` to the matching index in `snlp_tokens`;
    # it decreases by one for every inserted whitespace token.
    offset = 0
    for i, word in enumerate(words):
        if word.isspace() and word != snlp_tokens[i + offset].text:
            # insert a space token
            pos.append(self.vocab.strings.add("SPACE"))
            tags.append(self.vocab.strings.add("_SP"))
            deps.append(self.vocab.strings.add(""))
            lemmas.append(self.vocab.strings.add(word))

            # increment any heads left of this position that point beyond
            # this position to the right (already present in heads)
            for j in range(0, len(heads)):
                if j + heads[j] >= i:
                    heads[j] += 1

            # decrement any heads right of this position that point beyond
            # this position to the left (yet to be added from snlp_heads)
            for j in range(i + offset, len(snlp_heads)):
                if j + snlp_heads[j] < i + offset:
                    snlp_heads[j] -= 1

            # initial space tokens are attached to the following token,
            # otherwise attach to the preceding token
            if i == 0:
                heads.append(1)
            else:
                heads.append(-1)

            offset -= 1
        else:
            token = snlp_tokens[i + offset]
            assert word == token.text

            pos.append(self.vocab.strings.add(token.upos or ""))
            # TAG falls back to the token's feats when xpos is empty.
            tags.append(
                self.vocab.strings.add(token.xpos or token.feats or ""))
            deps.append(self.vocab.strings.add(token.deprel or ""))
            heads.append(snlp_heads[i + offset])
            lemmas.append(self.vocab.strings.add(token.lemma or ""))

    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    # Map each Stanza entity onto a span via its character offsets; the
    # all(ents) check below treats a falsy result as "could not be mapped".
    ents = []
    for ent in snlp_doc.entities:
        ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
        ents.append(ent_span)
    if not is_aligned or not all(ents):
        warnings.warn(
            f"Can't set named entities because of multi-word token "
            f"expansion or because the character offsets don't map to "
            f"valid tokens produced by the Stanza tokenizer:\n"
            f"Words: {words}\n"
            f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
            stacklevel=4,
        )
    else:
        doc.ents = ents
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) or any(tags):
        doc.is_tagged = True
    if any(deps) or any(heads):
        doc.is_parsed = True
    return doc
def __call__(self, text):
    """Parse `text` with the wrapped model and build a spaCy Doc from its output.

    Only output lines starting with '<norm xml:id="u' produce tokens; their
    orig, lemma, norm, func, head and pos attributes are read by string
    search. A line starting with "</norm_group>" marks the last token seen as
    followed by a space. POS is derived from the model's pos value through a
    lookup table with prefix-based fallbacks.
    """
    u = self.model(text) if text else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    # Direct mapping from the model's pos values to the POS labels stored.
    p = {
        "ACAUS": "VERB", "ACOND": "SCONJ", "ADV": "ADV", "ALIM": "SCONJ",
        "APREC": "SCONJ", "ART": "DET", "CCIRC": "SCONJ", "CFOC": "PART",
        "CONJ": "CCONJ", "COP": "PRON", "CPRET": "AUX", "CREL": "SCONJ",
        "EXIST": "VERB", "FUT": "AUX", "IMOD": "ADV", "NEG": "ADV",
        "NPROP": "PROPN", "NUM": "NUM", "PDEM": "DET", "PPOS": "DET",
        "PREP": "ADP", "PTC": "PART", "PUNCT": "PUNCT"
    }
    words = []
    lemmas = []
    pos = []
    tags = []
    heads = []
    deps = []
    spaces = []
    norms = []
    for s in u.split("\n"):
        if s.startswith('<norm xml:id="u'):
            # `unit_id` avoids shadowing the builtin `id`.
            unit_id = s[15:s.index('"', 16)]
            i = s.index(' orig="')
            form = s[i + 7:s.index('"', i + 8)]
            words.append(form)
            # lemma and norm are optional and default to the form.
            i = s.find(' lemma="')
            lemmas.append(
                vs.add(form if i < 0 else s[i + 8:s.index('"', i + 9)]))
            i = s.find(' norm="')
            norms.append(
                vs.add(form if i < 0 else s[i + 7:s.index('"', i + 8)]))
            i = s.index(' func="')
            dep = s[i + 7:s.index('"', i + 8)]
            if dep == "root":
                heads.append(0)
                deps.append(r)
            else:
                # Heads are stored as offsets from the token's own id; a
                # missing head attribute attaches the token to itself.
                i = s.find(' head="#u')
                heads.append(
                    0 if i < 0 else int(s[i + 9:s.index('"', i + 10)]) - int(unit_id))
                deps.append(vs.add(dep))
            i = s.index(' pos="')
            xpos = s[i + 6:s.index('"', i + 7)]
            tags.append(vs.add(xpos))
            upos = "X"
            if xpos in p:
                upos = p[xpos]
            elif xpos.startswith("A"):
                upos = "AUX"
            elif xpos.startswith("N"):
                upos = "ADJ" if dep in {"amod", "acl"} else "NOUN"
            elif xpos.startswith("P"):
                upos = "PRON"
            elif xpos.startswith("V"):
                upos = "VERB"
            pos.append(vs.add(upos))
            spaces.append(False)
        elif s.startswith("</norm_group>"):
            # Guard against a group end before any token has been read.
            if spaces:
                spaces[-1] = True
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(list(zip(lemmas, pos, tags, deps, heads, norms)),
                    dtype="uint64")
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM], a)
    try:
        doc.is_tagged = True
        doc.is_parsed = True
    except Exception:
        # Presumably a spaCy version where these flags cannot be assigned;
        # the annotations loaded above are unaffected, so carry on.
        pass
    return doc
def serialize_spacy_doc(orig_doc, converted_sentences):
    """Build a new Doc from `orig_doc` plus the nodes added by the conversion.

    For each sentence of `orig_doc` and its converted counterpart, the
    original token attributes are copied, rows are appended for nodes whose
    id is not an integer (the new nodes), and words/spaces are rebuilt. The
    new relations of every converted token are then recorded in the matching
    spaCy token's `_.parent_list`, and sentence starts are reset.

    orig_doc: The parsed spaCy Doc that was converted.
    converted_sentences: One mapping per sentence from token id to converted
        token; id 0 is the dummy root and is ignored.
    RETURNS: The new Doc, including the added nodes.
    """
    words = []
    spaces = []
    total_attrs = []
    attrs_ = list(attrs.NAMES)
    attrs_.remove('SENT_START')  # this clashes with HEAD (see spacy documentation)
    attrs_.remove('SPACY')  # we don't want to override the spaces we assign later on

    for orig_span, converted_sentence in zip(orig_doc.sents, converted_sentences):
        # remove redundant dummy-root-node
        converted = {iid: tok for iid, tok in converted_sentence.items() if iid != 0}
        orig = orig_span.as_doc()

        # get attributes of original doc
        orig_attrs = orig.to_array(attrs_)

        # append copied attributes for new nodes
        new_nodes_attrs = []
        for iid, tok in converted.items():
            # a non-integer id marks a new node; its attributes are copied
            # from the original token at int(iid)
            if int(iid) != iid:
                new_node_attrs = list(orig_attrs[int(iid) - 1])

                # here we fix the relative head it is pointing to,
                # in case it is a negative number we need to cast it to its unsigned synonym
                relative = int(iid) - (len(orig_attrs) + len(new_nodes_attrs) + 1)
                new_node_attrs[attrs_.index('HEAD')] = relative + (2**NUM_OF_BITS if relative < 0 else 0)

                new_nodes_attrs.append(new_node_attrs)
        if new_nodes_attrs:
            new_attrs = np.append(orig_attrs, new_nodes_attrs, axis=0)
        else:
            new_attrs = orig_attrs
        total_attrs = np.append(total_attrs, new_attrs, axis=0) if len(total_attrs) > 0 else new_attrs

        # fix whitespaces in case of new nodes: take original spaces. change the last one if there are new nodes.
        #   add spaces for each new nodes, except for last
        spaces += [t.whitespace_ if not ((i + 1 == len(orig)) and (len(new_nodes_attrs) > 0)) else ' ' for i, t in enumerate(orig)] + \
                  [' ' if i + 1 < len(converted.keys()) else '' for i, iid in enumerate(converted.keys()) if int(iid) != iid]
        # always separate this sentence from the next one with a space
        spaces[-1] = ' '
        words += [t.get_conllu_field("form") for iid, t in converted.items()]

    # form new doc including new nodes and set attributes
    # (the very last token of the document gets no trailing space)
    spaces[-1] = ''
    new_doc = Doc(orig_doc.vocab, words=words, spaces=spaces)
    new_doc.from_array(attrs_, total_attrs)

    # j is the index in new_doc of the first token of the current sentence
    j = 0
    for converted_sentence in converted_sentences:
        converted = {iid: tok for iid, tok in converted_sentence.items() if iid != 0}

        # store spacy ids for head indices extraction later on
        spacy_ids = {iid: (spacy_i + j) for spacy_i, iid in enumerate(converted.keys())}

        # set new info for all tokens per their head lists
        for i, bart_tok in enumerate(converted.values()):
            spacy_tok = new_doc[i + j]
            for head, rel in bart_tok.get_new_relations():
                # extract spacy correspondent head id
                # (a head with id 0 maps to the token itself)
                head_tok = new_doc[spacy_ids[head.get_conllu_field("id")] if head.get_conllu_field("id") != 0 else spacy_tok.i]
                # parse stringish label
                is_state_head_node = ((head_tok.text == "STATE") and (head.get_conllu_field("id") != int(head.get_conllu_field("id")))) or \
                                     (bart_tok.get_conllu_field("id") != int(bart_tok.get_conllu_field("id")))
                new_rel, src, unc, alt = parse_bart_label(rel, is_state_head_node=is_state_head_node)
                # add info to token
                spacy_tok._.parent_list.append({'head': head_tok, 'rel': new_rel, 'src': src, 'alt': alt, 'unc': unc})

            # fix sentence boundaries, need to turn off is_parsed bool as it prevents setting the boundaries
            new_doc.is_parsed = False
            spacy_tok.is_sent_start = False if i != 0 else True
            new_doc.is_parsed = True

        j += len(converted)

    return new_doc
def _spacy_decode(self, x):
    """Rebuild a Doc from a mapping holding 'words', 'arr' and 'shape'.

    The array under 'arr' is reshaped to x['shape'] and loaded as the token
    attributes listed below, in that column order.
    """
    doc = Doc(self.nlp.vocab, words=x['words'])
    columns = [
        DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER,
        TAG, SENT_END, SENT_START, ORTH, POS, ENT_IOB,
    ]
    values = x['arr'].reshape(x['shape'])
    return doc.from_array(columns, values)