def test_exceeding_unpacker_read_size():
    """Pack several binary strings, then stream them back through an
    ``Unpacker`` configured with a small ``read_size`` and check that every
    item is returned intact and in order.
    """
    # Thresholds observed historically (item count vs. read_size):
    #   5 ok for read_size=16, while 6 glibc detected
    #     *** python: double free or corruption (fasttop)
    #   20 ok for read_size=256, while 25 segfaults / glibc detected
    #     *** python: double free or corruption (!prev)
    #   40 ok for read_size=1024, while 50 introduces errors
    #   7000 ok for read_size=1024*1024, while 8000 leads to glibc detected
    #     *** python: double free or corruption (!prev)
    string_count = 6
    chunk_size = 16

    # Serialize every generated payload back-to-back into one buffer.
    sink = io.BytesIO()
    packer = msgpack.Packer()
    for n in range(string_count):
        sink.write(packer.pack(gen_binary_data(n)))

    # Re-open the packed bytes as a fresh stream positioned at the start.
    source = io.BytesIO(sink.getvalue())
    sink.close()

    unpacker = msgpack.Unpacker(source, read_size=chunk_size, use_list=1)

    seen = 0
    for position, item in enumerate(unpacker):
        assert type(item) == bytes
        assert item == gen_binary_data(position)
        seen += 1

    # Every packed item must have been yielded exactly once.
    assert seen == string_count
def read_spacy_docs(
    filepath: Union[str, pathlib.Path],
    *,
    format: str = "pickle",
    lang: Optional[Union[str, Language]] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written either in pickle
    or binary format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"pickle", "binary"}): Format of the data that was written
            to disk. If 'pickle', use ``pickle`` in python's stdlib;
            if 'binary', use the 3rd-party ``msgpack`` library.

            .. warning:: Docs written in pickle format were saved all
               together as a list, which means they're all loaded into
               memory at once before streaming one by one. Mind your RAM
               usage, especially when reading many docs!

            .. warning:: When writing docs in binary format, spaCy's
               built-in ``spacy.Doc.to_bytes()`` method is used, but when
               reading the data back in :func:`read_spacy_docs()`,
               experimental and *unofficial* work-arounds are used to allow
               for all the docs in ``data`` to be read from the same file.
               If spaCy changes, this code could break, so use this
               functionality at your own risk!

        lang: Already-instantiated ``spacy.Language`` object, or the string
            name by which it can be loaded, used to process the docs
            written to disk at ``filepath``. Note that this is only
            applicable when ``format="binary"``.

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "pickle" or "binary", or if ``lang``
            is not provided (or is neither a str nor a ``spacy.Language``)
            when ``format="binary"``. Because this function is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if format == "pickle":
        with io_utils.open_sesame(filepath, mode="rb") as f:
            # SECURITY NOTE: unpickling can execute arbitrary code, so only
            # read pickle files that come from a trusted source.
            yield from pickle.load(f)

    elif format == "binary":
        # Resolve the vocab before touching the file, so that an unusable
        # ``lang`` is reported without opening anything.
        if lang is None:
            raise ValueError(
                "When format='binary', a `spacy.Language` (and its associated "
                "`spacy.Vocab`) is required to deserialize the binary data; "
                "and these should be the same as were used when processing "
                "the original docs!")
        elif isinstance(lang, Language):
            vocab = lang.vocab
        elif isinstance(lang, str):
            vocab = spacier.core.load_spacy_lang(lang).vocab
        else:
            # Interpolate the offending value; previously the '{}'
            # placeholder was emitted verbatim.
            raise ValueError(
                "lang = '{}' is invalid; must be a str or "
                "`spacy.Language`".format(lang))

        with io_utils.open_sesame(filepath, mode="rb") as f:
            unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
            for msg in unpacker:

                # NOTE: The following code has been adapted from spaCy's
                # built-in ``spacy.Doc.from_bytes()``. If that functionality
                # changes, the following will probably break...

                # Msgpack doesn't distinguish between lists and tuples, which
                # is vexing for user data. As a best guess, we *know* that
                # within keys, we must have tuples. In values we just have to
                # hope users don't mind getting a list instead of a tuple.
                if "user_data_keys" in msg:
                    user_data_keys = msgpack.loads(
                        msg["user_data_keys"], use_list=False)
                    user_data_values = msgpack.loads(msg["user_data_values"])
                    user_data = dict(zip(user_data_keys, user_data_values))
                else:
                    user_data = None

                text = msg["text"]
                attrs = msg["array_body"]

                # Rebuild the tokens from the text: column 0 of each row is
                # read as the token's length and column 1 as a 0/1 flag for
                # a trailing space, which also advances the text offset.
                words = []
                spaces = []
                start = 0
                for i in range(attrs.shape[0]):
                    end = start + int(attrs[i, 0])
                    has_space = int(attrs[i, 1])
                    words.append(text[start:end])
                    spaces.append(bool(has_space))
                    start = end + has_space

                spacy_doc = Doc(
                    vocab, words=words, spaces=spaces, user_data=user_data)
                # The first two columns were consumed above; the remaining
                # header entries / columns are handed to ``from_array``.
                spacy_doc = spacy_doc.from_array(
                    msg["array_head"][2:], attrs[:, 2:])
                if "sentiment" in msg:
                    spacy_doc.sentiment = msg["sentiment"]
                if "tensor" in msg:
                    spacy_doc.tensor = msg["tensor"]
                yield spacy_doc

    else:
        raise ValueError(
            "format = '{}' is invalid; value must be one of {}".format(
                format, {"pickle", "binary"}))