def test_issue599(en_vocab):
    """An empty Doc marked tagged/parsed keeps its parsed flag after a bytes round trip.

    NOTE(review): `is_tagged`/`is_parsed` setters are spaCy v2 API — presumably
    this test targets v2; confirm before running against v3+.
    """
    source = Doc(en_vocab)
    source.is_tagged = True
    source.is_parsed = True
    restored = Doc(source.vocab)
    restored.from_bytes(source.to_bytes())
    assert restored.is_parsed
def test_serialize_transformer_data():
    """TransformerData survives msgpack, and a transformer Doc survives a bytes round trip.

    First checks that an empty TransformerData deserializes back to the right
    type; then runs a transformer pipeline on a sentence, round-trips the Doc
    through bytes, and compares every model output array.
    """
    payload = {"x": TransformerData.empty()}
    restored_payload = srsly.msgpack_loads(srsly.msgpack_dumps(payload))
    assert isinstance(restored_payload["x"], TransformerData)

    nlp = Language()
    nlp.add_pipe(
        "transformer",
        config={
            "model": {
                "name": "distilbert-base-uncased",
                # output_attentions so model_output carries attention tensors too
                "transformer_config": {"output_attentions": True},
            }
        },
    )
    nlp.initialize()

    doc = nlp("This is a test.")
    reloaded_doc = Doc(nlp.vocab)
    reloaded_doc.from_bytes(doc.to_bytes())
    assert_docs_equal(doc, reloaded_doc)

    # Every tensor in the transformer output must match exactly after reload.
    for key in doc._.trf_data.model_output:
        assert_array_equal(
            doc._.trf_data.model_output[key],
            reloaded_doc._.trf_data.model_output[key],
        )
def test_serialize_empty_doc(en_vocab):
    """An empty Doc round-trips through to_bytes/from_bytes without change."""
    original = Doc(en_vocab)
    restored = Doc(en_vocab)
    restored.from_bytes(original.to_bytes())
    assert len(original) == len(restored)
    # Token-by-token comparison; vacuous for an empty doc but guards regressions.
    for left, right in zip(original, restored):
        assert left.text == right.text
def test_issue599(en_vocab):
    """A bytes round trip of an empty Doc preserves the DEP annotation flag (v3 API)."""
    source = Doc(en_vocab)
    clone = Doc(source.vocab)
    clone.from_bytes(source.to_bytes())
    assert clone.has_annotation("DEP")
# Build the base document with a loaded pipeline and show it.
nlp = spacy.load("en_core_web_sm")
base_doc = nlp(text)
print("")
print_doc(base_doc)

# Persist the document both to disk and to a byte string.
base_doc.to_disk("doc.spacy")
base_doc_bytes = base_doc.to_bytes()

# Pack the same document into a DocBin as a third serialization path.
base_docbin = DocBin(
    attrs=["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"],
    store_user_data=True,
)
base_docbin.add(base_doc)
base_docbin_bytes = base_docbin.to_bytes()

# Restore from disk into a fresh Vocab and show the result.
restored = Doc(Vocab())
restored.from_disk("doc.spacy")
print("")
print_doc(restored)

# Restore from the byte string and show the result.
restored = Doc(Vocab())
restored.from_bytes(base_doc_bytes)
print("")
print_doc(restored)

# Restore via DocBin — note this one deserializes against the pipeline's vocab.
unpacked = DocBin().from_bytes(base_docbin_bytes)
restored_docs = list(unpacked.get_docs(nlp.vocab))
print("")
print_doc(restored_docs[0])