def deserialize_doc(serialized_bytes):
    """Rebuild a ``Doc`` from a serialized (doc, vocab) pair.

    ``serialized_bytes[0]`` is the doc payload and ``serialized_bytes[1]``
    the vocab payload. The vocab payload is loaded into a fresh ``Vocab``
    first, and the doc is then deserialized against that vocab.
    """
    restored_vocab = Vocab()
    doc_payload, vocab_payload = serialized_bytes[0], serialized_bytes[1]
    restored_vocab.from_bytes(vocab_payload)
    return Doc(restored_vocab).from_bytes(doc_payload)
def test_lookups_to_from_bytes_via_vocab():
    """Check that a lookup table on a vocab survives a bytes round trip."""
    name = "test"
    source_vocab = Vocab()
    source_vocab.lookups.add_table(name, {"foo": "bar", "hello": "world"})
    assert name in source_vocab.lookups
    payload = source_vocab.to_bytes()

    # Load the payload into an untouched vocab and compare its lookups.
    restored_vocab = Vocab()
    restored_vocab.from_bytes(payload)
    assert len(restored_vocab.lookups) == len(source_vocab.lookups)
    assert name in restored_vocab.lookups

    restored_table = restored_vocab.lookups.get_table(name)
    assert len(restored_table) == 2
    assert restored_table["hello"] == "world"

    # Re-serializing the restored vocab must reproduce the same bytes.
    assert restored_vocab.to_bytes() == payload
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    """Check that a lexeme's ``norm_`` is carried over by to_bytes/from_bytes."""
    source = Vocab(strings=strings)
    target = Vocab()
    first_word = strings[0]

    source[first_word].norm_ = lex_attr
    assert source[first_word].norm_ == lex_attr
    # The empty vocab must not already hold the custom norm.
    assert target[first_word].norm_ != lex_attr

    target = target.from_bytes(source.to_bytes())
    assert target[first_word].norm_ == lex_attr
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    """Check vocab bytes round-trip stably and reflect the input strings.

    NOTE(review): another function with this same name appears later in this
    file; if both live in one module the later definition replaces this one
    and this test is never collected -- confirm which is intended.
    """
    first = Vocab(strings=strings1)
    second = Vocab(strings=strings2)
    first_bytes = first.to_bytes()
    second_bytes = second.to_bytes()

    # Identical inputs must serialize identically, different inputs must not.
    inputs_identical = strings1 == strings2
    if inputs_identical:
        assert first_bytes == second_bytes
    else:
        assert first_bytes != second_bytes

    # Loading a vocab's own bytes back into it must be a no-op.
    reloaded = first.from_bytes(first_bytes)
    assert reloaded.to_bytes() == first_bytes

    # Loading into a fresh vocab must reproduce the same bytes and strings.
    fresh = Vocab().from_bytes(first_bytes)
    assert fresh.to_bytes() == first_bytes
    # The string store holds one extra entry: "_SP".
    assert len(fresh.strings) == len(strings1) + 1
    assert sorted(fresh.strings) == sorted(strings1 + ["_SP"])
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    """Check vocab bytes round-trip stably and preserve the lexeme entries."""
    first = Vocab(strings=strings1)
    second = Vocab(strings=strings2)
    first_bytes = first.to_bytes()
    second_bytes = second.to_bytes()

    # Identical inputs must serialize identically, different inputs must not.
    inputs_identical = strings1 == strings2
    if inputs_identical:
        assert first_bytes == second_bytes
    else:
        assert first_bytes != second_bytes

    # Loading a vocab's own bytes back into it must be a no-op.
    reloaded = first.from_bytes(first_bytes)
    assert reloaded.to_bytes() == first_bytes

    # A fresh vocab loaded from the bytes holds exactly the input entries.
    fresh = Vocab().from_bytes(first_bytes)
    assert fresh.to_bytes() == first_bytes
    assert len(fresh) == len(strings1)
    assert sorted(lexeme.text for lexeme in fresh) == sorted(strings1)
def test_issue4133(en_vocab):
    """Check that manually assigned ``pos_`` tags survive doc serialization.

    The doc is built on ``en_vocab``, while the vocab used for deserialization
    is restored from the bytes of a blank ``English`` pipeline's vocab.
    """
    blank_vocab_bytes = English().vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]

    source_doc = Doc(en_vocab, words=words)
    for token, tag in zip(source_doc, pos):
        token.pos_ = tag
    # NOTE(review): the original carried the comment "usually this is already
    # True when starting from proper models instead of blank English" here,
    # but no flag is set at this point -- presumably a leftover; confirm.
    doc_payload = source_doc.to_bytes()

    restored_vocab = Vocab().from_bytes(blank_vocab_bytes)
    restored_doc = Doc(restored_vocab).from_bytes(doc_payload)

    actual = [token.pos_ for token in restored_doc]
    assert actual == pos
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    """Check that reloading a vocab from its own bytes adds no extra strings.

    Reported in #2153.

    NOTE(review): another function with this same name appears later in this
    file; if both live in one module the later definition replaces this one
    and this test is never collected -- confirm which is intended.
    """
    vocab = Vocab(strings=strings)
    own_bytes = vocab.to_bytes()
    vocab.from_bytes(own_bytes)
    # The string store holds one extra entry: "_SP".
    expected_count = len(strings) + 1
    assert len(vocab.strings) == expected_count
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    """Check that reloading a vocab from its own bytes keeps its length.

    Reported in #2153.
    """
    vocab = Vocab(strings=strings)
    size_before = len(vocab)
    own_bytes = vocab.to_bytes()
    vocab.from_bytes(own_bytes)
    # Entries that were already present must not be duplicated on load.
    assert len(vocab) == size_before