def _task_set_state(self, **state):
    # Restore this worker's full state from a serialized snapshot:
    # custom token extensions, the spaCy pipeline (and optional tagger),
    # the parsed documents, and any remaining plain attributes.
    logger.debug('worker `%s`: setting state' % self.name)

    # Re-register the custom token-level metadata extensions under a
    # 'meta_' prefix so deserialized docs can carry their metadata.
    # NOTE(review): Token.set_extension raises if the extension already
    # exists — presumably this runs once per fresh worker; confirm.
    for key, default in state['_metadata_attrs'].items():
        Token.set_extension('meta_' + key, default=default)

    # de-serialize SpaCy docs: rebuild the vocab first, then the language
    # pipeline on top of it
    lang_cls = spacy.util.get_lang_class(self.language)
    vocab = Vocab().from_bytes(state.pop('vocab_bytes'))
    self.nlp = lang_cls(vocab).from_bytes(state.pop('nlp_bytes'))

    # the tagger is serialized separately and, if present, becomes the
    # sole pipeline component
    tagger_bytes = state.pop('tagger_bytes')
    if tagger_bytes is not None:
        self.tagger = spacy.pipeline.Tagger(self.nlp.vocab).from_bytes(tagger_bytes)
        self.nlp.pipeline = [('tagger', self.tagger)]
    else:
        self.tagger = None

    self._docs = []
    for doc_bytes in state.pop('docs_bytes'):
        doc = Doc(self.nlp.vocab).from_bytes(doc_bytes)
        # document tensor array and user_data arrays may only be immutable "views" -> create mutable copies
        if not doc.tensor.flags.owndata:
            doc.tensor = doc.tensor.copy()
        for k, docdata in doc.user_data.items():
            if isinstance(docdata, np.ndarray) and not docdata.flags.owndata:
                doc.user_data[k] = docdata.copy()
        self._docs.append(doc)

    # everything left in `state` after the pops above is treated as a
    # plain instance attribute and assigned verbatim
    for attr, val in state.items():
        setattr(self, attr, val)
def test_issue3288(en_vocab):
    """Regression test for issue #3288: rendering through displaCy must not
    fail when punctuation is merged onto the preceding token and the doc's
    tensor has to be resized."""
    tokens = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    doc = Doc(
        en_vocab,
        words=tokens,
        heads=[1, 1, 1, 4, 4, 6, 4, 4],
        deps=["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"],
    )
    doc.tensor = numpy.zeros((len(tokens), 96), dtype="float32")
    # must not raise
    displacy.render(doc)
def test_issue3540(en_vocab):
    """Splitting a token in place must keep texts, lemmas, and per-token
    vectors correctly aligned with the resized doc."""
    doc = Doc(en_vocab, words=["I", "live", "in", "NewYork", "right", "now"])
    doc.tensor = np.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )

    # sanity checks before the split
    expected = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == expected
    assert [token.lemma_ for token in doc] == expected
    vectors_before = [token.vector for token in doc]
    assert len(vectors_before) == len(doc)

    # split "NewYork" into two tokens
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[3],
            ["New", "York"],
            heads=[(doc[3], 1), doc[2]],
            attrs={"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]},
        )

    expected = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == expected
    assert [token.lemma_ for token in doc] == expected
    vectors_after = [token.vector for token in doc]
    assert len(vectors_after) == len(doc)

    # tokens untouched by the split keep their original vectors
    # (old index -> new index, accounting for the inserted token)
    for old_i, new_i in [(0, 0), (1, 1), (2, 2), (4, 5), (5, 6)]:
        assert vectors_before[old_i].tolist() == vectors_after[new_i].tolist()
def read_spacy_docs(
    filepath: Union[str, pathlib.Path],
    *,
    format: str = "pickle",
    lang: Optional[Union[str, Language]] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written either in pickle or binary format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"pickle", "binary"}): Format of the data that was written to disk.
            If 'pickle', use ``pickle`` in python's stdlib; if 'binary', use
            the 3rd-party ``msgpack`` library.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

            .. warning:: When writing docs in binary format, spaCy's built-in
               ``spacy.Doc.to_bytes()`` method is used, but when reading the data
               back in :func:`read_spacy_docs()`, experimental and *unofficial*
               work-arounds are used to allow for all the docs in ``data`` to be
               read from the same file. If spaCy changes, this code could break,
               so use this functionality at your own risk!

        lang: Already-instantiated ``spacy.Language`` object, or the string name
            by which it can be loaded, used to process the docs written to disk
            at ``filepath``. Note that this is only applicable when ``format="binary"``.

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "pickle" or "binary", or if ``lang`` is not
            provided when ``format="binary"``
    """
    if format == "pickle":
        with io_utils.open_sesame(filepath, mode="rb") as f:
            for spacy_doc in pickle.load(f):
                yield spacy_doc
    elif format == "binary":
        vocab = _get_vocab_for_binary_format(lang)
        with io_utils.open_sesame(filepath, mode="rb") as f:
            unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
            for msg in unpacker:
                yield _make_spacy_doc_from_msg(msg, vocab)
    else:
        raise ValueError(
            "format = '{}' is invalid; value must be one of {}".format(
                format, {"pickle", "binary"}))


def _get_vocab_for_binary_format(lang: Optional[Union[str, Language]]) -> "Vocab":
    """
    Validate ``lang`` and return the ``spacy.Vocab`` required to deserialize
    docs written in binary format.

    Raises:
        ValueError: if ``lang`` is None or neither a str nor a ``spacy.Language``
    """
    if lang is None:
        raise ValueError(
            "When format='binary', a `spacy.Language` (and its associated "
            "`spacy.Vocab`) is required to deserialize the binary data; "
            "and these should be the same as were used when processing "
            "the original docs!")
    elif isinstance(lang, Language):
        return lang.vocab
    elif isinstance(lang, str):
        return spacier.core.load_spacy_lang(lang).vocab
    else:
        # BUG FIX: the original never applied ``.format(lang)``, so users saw
        # a literal '{}' placeholder instead of the offending value
        raise ValueError(
            "lang = '{}' is invalid; must be a str or `spacy.Language`".format(lang))


def _make_spacy_doc_from_msg(msg: dict, vocab: "Vocab") -> Doc:
    """
    Rebuild a single ``spacy.Doc`` from one unpacked msgpack message.

    NOTE: The following code has been adapted from spaCy's built-in
    ``spacy.Doc.from_bytes()``. If that functionality changes, the following
    will probably break...
    """
    # Msgpack doesn't distinguish between lists and tuples, which is
    # vexing for user data. As a best guess, we *know* that within
    # keys, we must have tuples. In values we just have to hope
    # users don't mind getting a list instead of a tuple.
    if "user_data_keys" in msg:
        user_data_keys = msgpack.loads(msg["user_data_keys"], use_list=False)
        user_data_values = msgpack.loads(msg["user_data_values"])
        user_data = dict(zip(user_data_keys, user_data_values))
    else:
        user_data = None

    # reconstruct the token strings and trailing-space flags from the raw
    # text plus the first two columns of the serialized attribute array
    # (token length, has-trailing-space)
    text = msg["text"]
    attrs = msg["array_body"]
    words = []
    spaces = []
    start = 0
    for i in range(attrs.shape[0]):
        end = start + int(attrs[i, 0])
        has_space = int(attrs[i, 1])
        words.append(text[start:end])
        spaces.append(bool(has_space))
        start = end + has_space

    spacy_doc = Doc(vocab, words=words, spaces=spaces, user_data=user_data)
    # remaining columns hold the linguistic attributes proper
    spacy_doc = spacy_doc.from_array(msg["array_head"][2:], attrs[:, 2:])
    if "sentiment" in msg:
        spacy_doc.sentiment = msg["sentiment"]
    if "tensor" in msg:
        spacy_doc.tensor = msg["tensor"]
    return spacy_doc