Exemple #1
0
def test_exceeding_unpacker_read_size():
    """Unpack a stream that is larger than the unpacker's ``read_size``.

    Packs ``NUMBER_OF_STRINGS`` payloads produced by ``gen_binary_data`` back
    to back, then iterates an ``msgpack.Unpacker`` configured with a small
    ``read_size`` and checks that every object comes back as ``bytes``, equal
    to its source payload, and that none are missing.
    """
    NUMBER_OF_STRINGS = 6
    read_size = 16
    # Thresholds recorded when this regression was first observed
    # (largest count that passed vs. smallest count that failed):
    #   read_size=16:        5 ok, 6    -> glibc "double free or corruption (fasttop)"
    #   read_size=256:       20 ok, 25  -> segfault / "double free or corruption (!prev)"
    #   read_size=1024:      40 ok, 50  -> errors
    #   read_size=1024*1024: 7000 ok, 8000 -> "double free or corruption (!prev)"

    packer = msgpack.Packer()
    # Concatenate the packed payloads directly; no intermediate write buffer
    # is needed just to copy its contents into a second stream.
    packed = b"".join(
        packer.pack(gen_binary_data(idx)) for idx in range(NUMBER_OF_STRINGS)
    )

    read_count = 0
    with io.BytesIO(packed) as stream:
        unpacker = msgpack.Unpacker(stream, read_size=read_size, use_list=True)
        for idx, obj in enumerate(unpacker):
            assert isinstance(obj, bytes)
            assert obj == gen_binary_data(idx)
            read_count += 1

    # Guards against the unpacker silently stopping early.
    assert read_count == NUMBER_OF_STRINGS
Exemple #2
0
def read_spacy_docs(
    filepath: Union[str, pathlib.Path],
    *,
    format: str = "pickle",
    lang: Optional[Union[str, Language]] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written either in pickle or binary
    format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"pickle", "binary"}): Format of the data that was written to disk.
            If 'pickle', use ``pickle`` in python's stdlib; if 'binary', use
            the 3rd-party ``msgpack`` library.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

            .. warning:: Unpickling can execute arbitrary code contained in the
               file, so only read pickle files from sources you trust.

            .. warning:: When writing docs in binary format, spaCy's built-in
               ``spacy.Doc.to_bytes()`` method is used, but when reading the data
               back in :func:`read_spacy_docs()`, experimental and *unofficial*
               work-arounds are used to allow for all the docs in ``data`` to be
               read from the same file. If spaCy changes, this code could break,
               so use this functionality at your own risk!

        lang: Already-instantiated ``spacy.Language`` object, or the string name
            by which it can be loaded, used to process the docs written to disk
            at ``filepath``. Note that this is only applicable when ``format="binary"``.

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "pickle" or "binary", or if ``lang`` is not
            provided (or is neither a str nor a ``spacy.Language``) when
            ``format="binary"``

    Note:
        This is a generator function, so argument validation and file access
        only happen once iteration starts, not when the function is called.
    """
    if format == "pickle":
        with io_utils.open_sesame(filepath, mode="rb") as f:
            # NOTE(security): ``pickle.load`` trusts the file completely;
            # never point this at data from an untrusted source.
            yield from pickle.load(f)
    elif format == "binary":
        if lang is None:
            raise ValueError(
                "When format='binary', a `spacy.Language` (and its associated "
                "`spacy.Vocab`) is required to deserialize the binary data; "
                "and these should be the same as were used when processing "
                "the original docs!")
        elif isinstance(lang, Language):
            vocab = lang.vocab
        elif isinstance(lang, str):
            vocab = spacier.core.load_spacy_lang(lang).vocab
        else:
            # The placeholder must actually be filled in, otherwise the user
            # is shown a literal "{}" instead of the offending value.
            raise ValueError(
                "lang = '{}' is invalid; must be a str or `spacy.Language`".format(
                    lang))
        with io_utils.open_sesame(filepath, mode="rb") as f:
            unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
            for msg in unpacker:

                # NOTE: The following code has been adapted from spaCy's
                # built-in ``spacy.Doc.from_bytes()``. If that functionality
                # changes, the following will probably break...

                # Msgpack doesn't distinguish between lists and tuples, which is
                # vexing for user data. As a best guess, we *know* that within
                # keys, we must have tuples. In values we just have to hope
                # users don't mind getting a list instead of a tuple.
                if "user_data_keys" in msg:
                    user_data_keys = msgpack.loads(msg["user_data_keys"],
                                                   use_list=False)
                    user_data_values = msgpack.loads(msg["user_data_values"])
                    user_data = dict(zip(user_data_keys, user_data_values))
                else:
                    user_data = None

                text = msg["text"]
                attrs = msg["array_body"]
                # Rebuild the token texts and trailing-whitespace flags by
                # walking the raw text: column 0 of ``attrs`` is read as each
                # token's length and column 1 as its has-trailing-space flag.
                words = []
                spaces = []
                start = 0
                for i in range(attrs.shape[0]):
                    end = start + int(attrs[i, 0])
                    has_space = int(attrs[i, 1])
                    words.append(text[start:end])
                    spaces.append(bool(has_space))
                    # Skip over the single trailing space, if there is one.
                    start = end + has_space

                spacy_doc = Doc(vocab,
                                words=words,
                                spaces=spaces,
                                user_data=user_data)
                # The first two columns were consumed above; the remaining
                # columns are handed to spaCy together with their headers.
                spacy_doc = spacy_doc.from_array(msg["array_head"][2:],
                                                 attrs[:, 2:])
                if "sentiment" in msg:
                    spacy_doc.sentiment = msg["sentiment"]
                if "tensor" in msg:
                    spacy_doc.tensor = msg["tensor"]
                yield spacy_doc
    else:
        raise ValueError(
            "format = '{}' is invalid; value must be one of {}".format(
                format, {"pickle", "binary"}))