Ejemplo n.º 1
0
def test_issue599(en_vocab):
    """Check that the parsed flag survives a bytes round trip.

    A fresh ``Doc`` is flagged as tagged and parsed, serialized with
    ``to_bytes`` and loaded into a second ``Doc`` that shares the same
    vocab; the restored document must still report ``is_parsed``.
    """
    original = Doc(en_vocab)
    original.is_tagged = True
    original.is_parsed = True

    payload = original.to_bytes()
    restored = Doc(original.vocab)
    restored.from_bytes(payload)

    assert restored.is_parsed
Ejemplo n.º 2
0
def test_serialize_transformer_data():
    """Round-trip transformer data through msgpack and through ``Doc`` bytes.

    First, an empty ``TransformerData`` nested in a dict is dumped and loaded
    with srsly's msgpack helpers and must come back as a ``TransformerData``.
    Second, a pipeline with a "transformer" component processes a sentence;
    the resulting ``Doc`` is serialized to bytes, restored into a new ``Doc``
    and compared against the original, including every array found in
    ``doc._.trf_data.model_output``.
    """
    # Part 1: plain msgpack round trip of an empty TransformerData.
    packed = srsly.msgpack_dumps({"x": TransformerData.empty()})
    unpacked = srsly.msgpack_loads(packed)
    assert isinstance(unpacked["x"], TransformerData)

    # Part 2: full Doc round trip with a transformer component in the pipeline.
    transformer_settings = {
        "model": {
            "name": "distilbert-base-uncased",
            "transformer_config": {"output_attentions": True},
        }
    }
    nlp = Language()
    nlp.add_pipe("transformer", config=transformer_settings)
    nlp.initialize()

    doc = nlp("This is a test.")
    serialized = doc.to_bytes()
    reloaded_doc = Doc(nlp.vocab)
    reloaded_doc.from_bytes(serialized)
    assert_docs_equal(doc, reloaded_doc)

    # Compare each model output entry of the original with its restored twin.
    expected_output = doc._.trf_data.model_output
    restored_output = reloaded_doc._.trf_data.model_output
    for name in expected_output:
        assert_array_equal(expected_output[name], restored_output[name])
Ejemplo n.º 3
0
def test_issue599(en_vocab):
    """Serialize a doc flagged as tagged/parsed and verify ``is_parsed`` is kept.

    The document is written with ``to_bytes`` and read back with
    ``from_bytes`` into a new ``Doc`` built on the same vocab.
    """
    annotated = Doc(en_vocab)
    annotated.is_tagged = True
    annotated.is_parsed = True

    roundtripped = Doc(annotated.vocab)
    serialized = annotated.to_bytes()
    roundtripped.from_bytes(serialized)

    assert roundtripped.is_parsed
Ejemplo n.º 4
0
def test_serialize_empty_doc(en_vocab):
    """Round-trip a token-less ``Doc`` through bytes and compare the result.

    The restored document must have the same length as the source, and any
    tokens paired up by position must have identical text.
    """
    source = Doc(en_vocab)
    payload = source.to_bytes()

    restored = Doc(en_vocab)
    restored.from_bytes(payload)

    assert len(source) == len(restored)
    assert all(left.text == right.text for left, right in zip(source, restored))
Ejemplo n.º 5
0
def test_serialize_empty_doc(en_vocab):
    """Verify that a ``Doc`` created without words survives ``to_bytes``/``from_bytes``.

    Both documents are built on the same vocab; lengths must match and
    position-wise token texts must be equal.
    """
    empty = Doc(en_vocab)
    clone = Doc(en_vocab)
    clone.from_bytes(empty.to_bytes())

    assert len(empty) == len(clone)
    for before, after in zip(empty, clone):
        assert before.text == after.text
Ejemplo n.º 6
0
def test_issue599(en_vocab):
    """Check ``has_annotation("DEP")`` on a doc restored from bytes.

    A fresh ``Doc`` is serialized with ``to_bytes`` and loaded into a second
    ``Doc`` sharing its vocab; the restored doc must report the "DEP"
    annotation as present.
    """
    source = Doc(en_vocab)
    payload = source.to_bytes()

    restored = Doc(source.vocab)
    restored.from_bytes(payload)

    assert restored.has_annotation("DEP")
Ejemplo n.º 7
0
# Demonstrates three ways of serializing a processed document and restoring
# it: Doc.to_disk/from_disk, Doc.to_bytes/from_bytes, and DocBin.
# NOTE(review): `text` and `print_doc` are not defined in this excerpt; they
# are presumably defined earlier in the file -- confirm.

# Process the input text with the "en_core_web_sm" pipeline and print the
# resulting document as the baseline for the comparisons below.
nlp = spacy.load("en_core_web_sm")
doc_base = nlp(text)
print("")
print_doc(doc_base)

# Serialize document to disk and bytes
doc_base.to_disk("doc.spacy")
doc_base_bytes = doc_base.to_bytes()

# Serialize using DocBin, storing the listed token attributes and the
# document's user data.
docbin_base = DocBin(attrs=["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"], store_user_data=True)
docbin_base.add(doc_base)
docbin_base_bytes = docbin_base.to_bytes()

# Restore document from disk into a Doc built on a new, empty Vocab
doc = Doc(Vocab())
doc.from_disk("doc.spacy")
print("")
print_doc(doc)

# Restore document from bytes, again starting from a new, empty Vocab
doc = Doc(Vocab())
doc.from_bytes(doc_base_bytes)
print("")
print_doc(doc)

# Restore using DocBin; the docs are rebuilt against the pipeline's vocab
# and only the first one is printed.
docbin = DocBin().from_bytes(docbin_base_bytes)
docs = list(docbin.get_docs(nlp.vocab))
print("")
print_doc(docs[0])