# Doc serialization tests collected across spaCy versions. Shared imports
# these snippets assume (exact module locations vary by version):
import numpy
import pytest

from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab


def test_serialize_doc_roundtrip_disk(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as d:
        file_path = d / "doc"
        doc.to_disk(file_path)
        doc_d = Doc(en_vocab).from_disk(file_path)
        assert doc.to_bytes() == doc_d.to_bytes()

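# The disk roundtrip test relies on a make_tempdir helper (spaCy ships one
# in spacy.tests.util). A minimal sketch, assuming it is simply a context
# manager that yields a pathlib.Path to a fresh directory and cleans up
# afterwards:
from contextlib import contextmanager
from pathlib import Path
import shutil
import tempfile


@contextmanager
def make_tempdir():
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        shutil.rmtree(str(d))
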
def test_serialize_doc_exclude(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data

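# The en_vocab argument used throughout is a pytest fixture; spaCy's own
# suite defines it in conftest.py. A minimal sketch, assuming a blank
# English vocab is all these tests need:
@pytest.fixture
def en_vocab():
    return English().vocab
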
def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" string = "This is a first sentence . And another one" doc = Doc(Vocab(), words=string.split()) doc[6].sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start assert not new_doc.is_parsed assert not new_doc.is_tagged doc.is_parsed = True doc.is_tagged = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc.is_parsed assert new_doc.is_tagged
def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" string = "This is a first sentence . And another one" doc = Doc(Vocab(), words=string.split()) doc[6].sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start assert not new_doc.is_parsed assert not new_doc.is_tagged doc.is_parsed = True doc.is_tagged = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc.is_parsed assert new_doc.is_tagged
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed

def test_serialize_empty_doc(en_vocab):
    doc = Doc(en_vocab)
    data = doc.to_bytes()
    doc2 = Doc(en_vocab)
    doc2.from_bytes(data)
    assert len(doc) == len(doc2)
    for token1, token2 in zip(doc, doc2):
        assert token1.text == token2.text

def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] doc = Doc(Vocab(), words=words) doc[6].is_sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start assert not new_doc.has_annotation("DEP") assert not new_doc.has_annotation("TAG") doc = Doc( Vocab(), words=words, tags=["TAG"] * len(words), heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], deps=["dep"] * len(words), ) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start assert new_doc.has_annotation("DEP") assert new_doc.has_annotation("TAG")
def test_doc_is_nered(en_vocab): words = ["I", "live", "in", "New", "York"] doc = Doc(en_vocab, words=words) assert not doc.has_annotation("ENT_IOB") doc.ents = [Span(doc, 3, 5, label="GPE")] assert doc.has_annotation("ENT_IOB") # Test creating doc from array with unknown values arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64") doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr) assert doc.has_annotation("ENT_IOB") # Test serialization new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) assert new_doc.has_annotation("ENT_IOB")
def test_sbd_serialization_projective(EN): """ test that before and after serialization, the sentence boundaries are the same. """ example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' ')) EN.tagger(example) apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct']) example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes()) assert example.to_bytes() == example_serialized.to_bytes() assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
def test_doc_is_nered(en_vocab): words = ["I", "live", "in", "New", "York"] doc = Doc(en_vocab, words=words) assert not doc.is_nered doc.ents = [Span(doc, 3, 5, label="GPE")] assert doc.is_nered # Test creating doc from array with unknown values arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64") doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr) assert doc.is_nered # Test serialization new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) assert new_doc.is_nered
def test_serialize_after_adding_entity():
    # Re issue #514. Uses the spaCy v1-era API (spacy.en); assumes
    # `import spacy.en` at module level.
    vocab = spacy.en.English.Defaults.create_vocab()
    entity_recognizer = spacy.en.English.Defaults.create_entity()
    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
    entity_recognizer.add_label('Food')
    entity_recognizer(doc)
    label_id = vocab.strings[u'Food']
    doc.ents = [(label_id, 5, 6)]
    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]
    byte_string = doc.to_bytes()

def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos

def test_serialize_doc_exclude(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data
    with pytest.raises(ValueError):
        doc.to_bytes(user_data=False)
    with pytest.raises(ValueError):
        Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)

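# In this later variant, the old per-field keyword flags (user_data=False,
# tensor=False) are expected to raise ValueError: the exclude list is the
# supported mechanism for skipping fields during (de)serialization.
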
def test_en_parser_sbd_serialization_projective(nlp):
    """Test that before and after serialization, the sentence boundaries
    are the same."""
    # NB: This was previously marked as causing a segfault.
    # fmt: off
    text = "I bought a couch from IKEA It wasn't very comfortable."
    transition = ["L-nsubj", "S", "L-det", "R-dobj", "D", "R-prep", "R-pobj",
                  "B-ROOT", "L-nsubj", "R-neg", "D", "S", "L-advmod",
                  "R-acomp", "D", "R-punct"]
    # fmt: on
    doc = nlp.tokenizer(text)
    # TODO: apply_transition_sequence may need updating for this API.
    apply_transition_sequence(nlp.get_pipe("parser"), doc, transition)
    doc_serialized = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert doc.is_parsed
    assert doc_serialized.is_parsed
    assert doc.to_bytes() == doc_serialized.to_bytes()
    assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]

def test_en_sbd_serialization_projective(combined_all_model_fixture):
    """Test that before and after serialization, the sentence boundaries
    are the same."""
    text = "I bought a couch from IKEA It wasn't very comfortable."
    transition = [
        "L-nsubj", "S", "L-det", "R-dobj", "D", "R-prep", "R-pobj", "B-ROOT",
        "L-nsubj", "R-neg", "D", "S", "L-advmod", "R-acomp", "D", "R-punct",
    ]
    doc = combined_all_model_fixture.tokenizer(text)
    apply_transition_sequence(combined_all_model_fixture.parser, doc, transition)
    doc_serialized = Doc(combined_all_model_fixture.vocab).from_bytes(doc.to_bytes())
    assert doc.is_parsed
    assert doc_serialized.is_parsed
    assert doc.to_bytes() == doc_serialized.to_bytes()
    assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]

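# The parser tests above drive the transition system directly through an
# apply_transition_sequence helper defined in spaCy's own test suite. A
# rough sketch of its shape, assuming the stepwise parsing API of older
# spaCy versions (parser.step_through); treat the exact calls as
# assumptions, not a definitive implementation:
def apply_transition_sequence(parser, doc, sequence):
    # Register any labels used by labelled transitions like "L-nsubj".
    for action_name in sequence:
        if "-" in action_name:
            move, label = action_name.split("-")
            parser.add_label(label)
    # Step through the transition sequence one action at a time.
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)
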
def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we
    from_array without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
    header = [ENT_IOB, ENT_TYPE]
    ent_array = doc.to_array(header)
    doc.from_array(header, ent_array)
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
    # Serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected

def test_serialize_doc_roundtrip_bytes(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc_b = doc.to_bytes()
    new_doc = Doc(en_vocab).from_bytes(doc_b)
    assert new_doc.to_bytes() == doc_b

def test_serialize_doc_span_groups(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    doc.spans["content"] = [doc[0:2]]
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert len(new_doc.spans["content"]) == 1

def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.has_annotation("DEP")

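# Note: this later variant of test_issue599 asserts has_annotation("DEP") on
# a zero-token Doc; with no tokens the check holds vacuously, so the flag
# survives a to_bytes/from_bytes roundtrip trivially.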