Ejemplo n.º 1
0
    def _task_set_state(self, **state):
        """
        Restore this worker's state from the serialized pieces given in `state`.

        The following keys are consumed (removed from `state`) and deserialized:
        ``vocab_bytes``, ``nlp_bytes``, ``tagger_bytes`` (may be None) and
        ``docs_bytes``. The ``_metadata_attrs`` mapping is read to register
        token extensions but is *not* removed, so it is assigned as an instance
        attribute together with every other remaining entry of `state`.
        """
        logger.debug('worker `%s`: setting state' % self.name)

        # one custom token extension per metadata attribute, prefixed with "meta_"
        for meta_key, meta_default in state['_metadata_attrs'].items():
            Token.set_extension('meta_' + meta_key, default=meta_default)

        # rebuild the language object from its serialized vocab and pipeline data
        language_class = spacy.util.get_lang_class(self.language)
        restored_vocab = Vocab().from_bytes(state.pop('vocab_bytes'))
        self.nlp = language_class(restored_vocab).from_bytes(state.pop('nlp_bytes'))

        # the tagger is optional; when present it becomes the only pipeline component
        serialized_tagger = state.pop('tagger_bytes')
        if serialized_tagger is None:
            self.tagger = None
        else:
            self.tagger = spacy.pipeline.Tagger(self.nlp.vocab).from_bytes(serialized_tagger)
            self.nlp.pipeline = [('tagger', self.tagger)]

        # de-serialize the SpaCy documents one by one
        self._docs = []
        for serialized_doc in state.pop('docs_bytes'):
            restored = Doc(self.nlp.vocab).from_bytes(serialized_doc)

            # arrays that do not own their data may only be immutable "views",
            # so replace them with mutable copies
            if not restored.tensor.flags.owndata:
                restored.tensor = restored.tensor.copy()

            view_keys = [
                data_key
                for data_key, data_val in restored.user_data.items()
                if isinstance(data_val, np.ndarray) and not data_val.flags.owndata
            ]
            for data_key in view_keys:
                restored.user_data[data_key] = restored.user_data[data_key].copy()

            self._docs.append(restored)

        # everything still left in `state` is set verbatim as an instance attribute
        for attr_name, attr_value in state.items():
            setattr(self, attr_name, attr_value)
Ejemplo n.º 2
0
def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceding token and tensor is resized.

    The test passes as long as ``displacy.render`` does not raise for a parsed
    doc that carries a ``(n_tokens, 96)`` float32 tensor.
    """
    # one row per token: (text, head index, dependency label)
    annotated_tokens = [
        ("Hello", 1, "intj"),
        ("World", 1, "ROOT"),
        ("!", 1, "punct"),
        ("When", 4, "advmod"),
        ("is", 4, "ROOT"),
        ("this", 6, "det"),
        ("breaking", 4, "nsubj"),
        ("?", 4, "punct"),
    ]
    words, heads, deps = map(list, zip(*annotated_tokens))
    tensor_shape = (len(words), 96)

    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    doc.tensor = numpy.zeros(tensor_shape, dtype="float32")
    displacy.render(doc)
Ejemplo n.º 3
0
def test_issue3540(en_vocab):
    """Check texts, lemmas and vectors of a doc before and after splitting a token.

    A doc with an explicit two-column tensor is built, the token "NewYork" is
    split into "New" and "York", and the test asserts that texts and lemmas
    match the expected words and that the vectors of the tokens around the
    split are unchanged.
    """
    original_words = ["I", "live", "in", "NewYork", "right", "now"]
    doc = Doc(en_vocab, words=original_words)
    doc.tensor = np.asarray(
        [
            [1.0, 1.1],
            [2.0, 2.1],
            [3.0, 3.1],
            [4.0, 4.1],
            [5.0, 5.1],
            [6.0, 6.1],
        ],
        dtype="f",
    )

    # state before the split
    assert [tok.text for tok in doc] == original_words
    assert [tok.lemma_ for tok in doc] == original_words
    vectors_before = [tok.vector for tok in doc]
    assert len(vectors_before) == len(doc)

    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[3],
            ["New", "York"],
            heads=[(doc[3], 1), doc[2]],
            attrs={"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]},
        )

    # state after the split
    expected_words = ["I", "live", "in", "New", "York", "right", "now"]
    assert [tok.text for tok in doc] == expected_words
    assert [tok.lemma_ for tok in doc] == expected_words
    vectors_after = [tok.vector for tok in doc]
    assert len(vectors_after) == len(doc)

    # (index before split, index after split) for every token that was not split;
    # tokens following the split position are shifted by one
    unchanged_pairs = [(0, 0), (1, 1), (2, 2), (4, 5), (5, 6)]
    for idx_before, idx_after in unchanged_pairs:
        assert vectors_before[idx_before].tolist() == vectors_after[idx_after].tolist()
Ejemplo n.º 4
0
def read_spacy_docs(
    filepath: Union[str, pathlib.Path],
    *,
    format: str = "pickle",
    lang: Optional[Union[str, Language]] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written either in pickle or binary
    format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"pickle", "binary"}): Format of the data that was written to disk.
            If 'pickle', use ``pickle`` in python's stdlib; if 'binary', use
            the 3rd-party ``msgpack`` library.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

            .. warning:: Unpickling can execute arbitrary code. Only read
               pickle files that come from a source you trust.

            .. warning:: When writing docs in binary format, spaCy's built-in
               ``spacy.Doc.to_bytes()`` method is used, but when reading the data
               back in :func:`read_spacy_docs()`, experimental and *unofficial*
               work-arounds are used to allow for all the docs in ``data`` to be
               read from the same file. If spaCy changes, this code could break,
               so use this functionality at your own risk!

        lang: Already-instantiated ``spacy.Language`` object, or the string name
            by which it can be loaded, used to process the docs written to disk
            at ``filepath``. Note that this is only applicable when ``format="binary"``.

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "pickle" or "binary"; if ``lang`` is not
            provided when ``format="binary"``; or if ``lang`` is neither a str
            nor a ``spacy.Language`` when ``format="binary"``.

    Note:
        This function is a generator, so the file is only opened and the
        errors above are only raised once iteration starts, not at call time.
    """
    if format == "pickle":
        # NOTE(security): ``pickle.load`` can run arbitrary code embedded in the
        # file; this must only ever be pointed at trusted data.
        with io_utils.open_sesame(filepath, mode="rb") as f:
            yield from pickle.load(f)
    elif format == "binary":
        if lang is None:
            raise ValueError(
                "When format='binary', a `spacy.Language` (and its associated "
                "`spacy.Vocab`) is required to deserialize the binary data; "
                "and these should be the same as were used when processing "
                "the original docs!")
        elif isinstance(lang, Language):
            vocab = lang.vocab
        elif isinstance(lang, str):
            vocab = spacier.core.load_spacy_lang(lang).vocab
        else:
            # the offending value must actually be interpolated into the message
            raise ValueError(
                "lang = '{}' is invalid; must be a str or `spacy.Language`".format(
                    lang))
        with io_utils.open_sesame(filepath, mode="rb") as f:
            unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
            for msg in unpacker:

                # NOTE: The following code has been adapted from spaCy's
                # built-in ``spacy.Doc.from_bytes()``. If that functionality
                # changes, the following will probably break...

                # Msgpack doesn't distinguish between lists and tuples, which is
                # vexing for user data. As a best guess, we *know* that within
                # keys, we must have tuples. In values we just have to hope
                # users don't mind getting a list instead of a tuple.
                if "user_data_keys" in msg:
                    user_data_keys = msgpack.loads(msg["user_data_keys"],
                                                   use_list=False)
                    user_data_values = msgpack.loads(msg["user_data_values"])
                    user_data = dict(zip(user_data_keys, user_data_values))
                else:
                    user_data = None

                # Rebuild the token texts and trailing-space flags by walking
                # through the raw text: column 0 of the attribute array is used
                # as the token's length and column 1 as its has-space flag.
                text = msg["text"]
                attrs = msg["array_body"]
                words = []
                spaces = []
                start = 0
                for i in range(attrs.shape[0]):
                    end = start + int(attrs[i, 0])
                    has_space = int(attrs[i, 1])
                    words.append(text[start:end])
                    spaces.append(bool(has_space))
                    # skip over the trailing space, if any, to reach the next token
                    start = end + has_space

                spacy_doc = Doc(vocab,
                                words=words,
                                spaces=spaces,
                                user_data=user_data)
                # the first two columns were consumed above; the remaining
                # columns are loaded as regular token attributes
                spacy_doc = spacy_doc.from_array(msg["array_head"][2:],
                                                 attrs[:, 2:])
                if "sentiment" in msg:
                    spacy_doc.sentiment = msg["sentiment"]
                if "tensor" in msg:
                    spacy_doc.tensor = msg["tensor"]
                yield spacy_doc
    else:
        raise ValueError(
            "format = '{}' is invalid; value must be one of {}".format(
                format, {"pickle", "binary"}))