def test_get_oracle_actions():
    """Check that an oracle transition sequence can be computed for a gold parse.

    Collects token annotations from ``annot_tuples``, registers the transition
    actions the parse needs, and asks the move system for the oracle sequence.
    """
    ids, words, tags, heads, deps, ents = [], [], [], [], [], []
    for id_, word, tag, head, dep, ent in annot_tuples:
        ids.append(id_)
        words.append(word)
        tags.append(tag)
        heads.append(head)
        deps.append(dep)
        ents.append(ent)
    # Reuse the words collected above instead of re-extracting them from
    # annot_tuples a second time.
    doc = Doc(Vocab(), words=words)
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(doc.vocab, model)
    parser.moves.add_action(0, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(4, "ROOT")
    # Non-projective arcs can't be represented by the transition system, so
    # projectivize before registering LEFT/RIGHT actions per dependency.
    heads, deps = projectivize(heads, deps)
    for i, (head, dep) in enumerate(zip(heads, deps)):
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    example = Example.from_dict(
        doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}
    )
    parser.moves.get_oracle_sequence(example)
def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner = EntityRecognizer(doc.vocab, model, **config)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Advance the state past "I live in" so it sits just before "New".
    state = ner.moves.init_batch([doc])[0]
    for _ in range(3):
        ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
def test_serialize_sentencerecognizer(en_vocab):
    """A SentenceRecognizer round-trips through to_bytes/from_bytes."""
    model = registry.resolve({"model": DEFAULT_SENTER_MODEL}, validate=True)["model"]
    senter = SentenceRecognizer(en_vocab, model)
    serialized = senter.to_bytes()
    deserialized = SentenceRecognizer(en_vocab, model).from_bytes(serialized)
    assert senter.to_bytes() == deserialized.to_bytes()
def parser(vocab):
    """Fixture: a DependencyParser with a "left" label, briefly trained."""
    vocab.strings.add("ROOT")
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser = DependencyParser(vocab, model, **config)
    parser.cfg["token_vector_width"] = 4
    parser.cfg["hidden_width"] = 32
    parser.add_label("left")
    parser.initialize(lambda: [_parser_example(parser)])
    sgd = Adam(0.001)
    # A handful of update steps on a toy parse is enough for the tests.
    for _ in range(10):
        losses = {}
        doc = Doc(vocab, words=["a", "b", "c", "d"])
        example = Example.from_dict(
            doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
        )
        parser.update([example], sgd=sgd, losses=losses)
    return parser
def test_ner_constructor(en_vocab):
    """EntityRecognizer can be constructed with and without extra config."""
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    config = {"update_with_oracle_cut_size": 100}
    EntityRecognizer(en_vocab, model, **config)
    EntityRecognizer(en_vocab, model)
def parser(vocab, arc_eager):
    """Fixture: a Parser wired up with the arc-eager transition system."""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    return Parser(vocab, model, moves=arc_eager, **config)
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
    """A parser serialized to bytes deserializes back to identical bytes."""
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser = Parser(en_vocab, model)
    restored = Parser(en_vocab, model).from_bytes(parser.to_bytes(exclude=["vocab"]))
    restored_bytes = restored.to_bytes(exclude=["vocab"])
    original_bytes = parser.to_bytes(exclude=["vocab"])
    assert len(restored_bytes) == len(original_bytes)
    assert restored_bytes == original_bytes
def test_parser_constructor(en_vocab):
    """DependencyParser can be constructed with and without extra config."""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    DependencyParser(en_vocab, model, **config)
    DependencyParser(en_vocab, model)
def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(Vocab(), model, learn_tokens=True)
    parser.add_label("nsubj")
    # "subtok" only appears once the parser has been initialized.
    assert "subtok" not in parser.labels
    parser.initialize(lambda: [_parser_example(parser)])
    assert "subtok" in parser.labels
def test_build_model(parser, vocab):
    """The parser model can be rebuilt from config and existing moves."""
    config = {
        "learn_tokens": False,
        "min_action_freq": 0,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    rebuilt = Parser(vocab, model=model, moves=parser.moves, **config)
    parser.model = rebuilt.model
    assert parser.model is not None
def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
    """A tagger round-trips through bytes without any change."""
    tagger1 = taggers[0]
    tagger1_b = tagger1.to_bytes()
    tagger1 = tagger1.from_bytes(tagger1_b)
    assert tagger1.to_bytes() == tagger1_b
    model = registry.resolve({"model": DEFAULT_TAGGER_MODEL}, validate=True)["model"]
    restored = Tagger(en_vocab, model).from_bytes(tagger1_b)
    restored_b = restored.to_bytes()
    assert len(restored_b) == len(tagger1_b)
    assert restored_b == tagger1_b
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    iobs_before = [t.ent_iob_ for t in doc]
    # Reassigning the same entities must leave IOB tags untouched.
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == iobs_before
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
    """Two taggers written to disk load back with equal bytes."""
    tagger1, tagger2 = taggers
    with make_tempdir() as d:
        path1 = d / "tagger1"
        path2 = d / "tagger2"
        tagger1.to_disk(path1)
        tagger2.to_disk(path2)
        model = registry.resolve({"model": DEFAULT_TAGGER_MODEL}, validate=True)["model"]
        loaded1 = Tagger(en_vocab, model).from_disk(path1)
        loaded2 = Tagger(en_vocab, model).from_disk(path2)
        assert loaded1.to_bytes() == loaded2.to_bytes()
def blank_parser(en_vocab):
    """Fixture: an untrained DependencyParser with greedy beam settings."""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
        "beam_width": 1,
        "beam_update_prob": 1.0,
        "beam_density": 0.0,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    return DependencyParser(en_vocab, model, **config)
def test_add_label_get_label(pipe_cls, n_moves, model_config):
    """Test that added labels are returned correctly. This test was added to
    test for a bug in DependencyParser.labels that'd cause it to fail when
    splitting the move names.
    """
    labels = ["A", "B", "C"]
    model = registry.resolve({"model": model_config}, validate=True)["model"]
    pipe = pipe_cls(Vocab(), model)
    for label in labels:
        pipe.add_label(label)
    # Each label expands into n_moves transition names.
    assert len(pipe.move_names) == len(labels) * n_moves
    assert sorted(pipe.labels) == labels
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
    """A parser written to disk loads back with equal serialized bytes."""
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser = Parser(en_vocab, model)
    with make_tempdir() as d:
        file_path = d / "parser"
        parser.to_disk(file_path)
        parser_d = Parser(en_vocab, model).from_disk(file_path)
        parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
        parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
        assert len(parser_bytes) == len(parser_d_bytes)
        assert parser_bytes == parser_d_bytes
def test_doc_add_entities_set_ents_iob(en_vocab):
    """Setting doc.ents updates per-token IOB annotations."""
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser = DependencyParser(Vocab(), model, **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.initialize(lambda: [_parser_example(parser)])
    # With learn_tokens=False, initialization must not add "subtok" either.
    assert "subtok" not in parser.labels
def test_serialize_parser_strings(Parser):
    """Labels added to one parser survive a bytes round-trip into a fresh vocab."""
    vocab1 = Vocab()
    label = "FunnyLabel"
    assert label not in vocab1.strings
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser1 = Parser(vocab1, model)
    parser1.add_label(label)
    assert label in parser1.vocab.strings
    vocab2 = Vocab()
    assert label not in vocab2.strings
    parser2 = Parser(vocab2, model).from_bytes(parser1.to_bytes(exclude=["vocab"]))
    assert label in parser2.vocab.strings
def test_add_label_deserializes_correctly():
    """Move classes agree after resizing and deserializing an NER model."""
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner1 = EntityRecognizer(Vocab(), model)
    for label in ("C", "B", "A"):
        ner1.add_label(label)
    ner1.initialize(lambda: [_ner_example(ner1)])
    ner2 = EntityRecognizer(Vocab(), model)
    # the second model needs to be resized before we can call from_bytes
    ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
    ner2.from_bytes(ner1.to_bytes())
    assert ner1.moves.n_moves == ner2.moves.n_moves
    for i in range(ner1.moves.n_moves):
        assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
    """A parser serialized to bytes deserializes back to identical bytes.

    NOTE(review): the beam-settings ``config`` dict built here was never
    passed to the ``Parser`` constructor, so the dead local has been removed;
    the constructed parsers are unchanged.
    """
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = Parser(en_vocab, model)
    new_parser = Parser(en_vocab, model)
    new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
    bytes_2 = new_parser.to_bytes(exclude=["vocab"])
    bytes_3 = parser.to_bytes(exclude=["vocab"])
    assert len(bytes_2) == len(bytes_3)
    assert bytes_2 == bytes_3
def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
    """Custom label strings survive disk serialization across vocabs."""
    label = "SomeWeirdLabel"
    assert label not in en_vocab.strings
    assert label not in de_vocab.strings
    tagger = taggers[0]
    assert label not in tagger.vocab.strings
    with make_tempdir() as d:
        # check that custom labels are serialized as part of the component's strings.jsonl
        tagger.add_label(label)
        assert label in tagger.vocab.strings
        file_path = d / "tagger1"
        tagger.to_disk(file_path)
        # ensure that the custom strings are loaded back in when using the tagger in another pipeline
        model = registry.resolve({"model": DEFAULT_TAGGER_MODEL}, validate=True)["model"]
        reloaded = Tagger(de_vocab, model).from_disk(file_path)
        assert label in reloaded.vocab.strings
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    iobs_before = [t.ent_iob_ for t in doc]
    # Reassigning the same entities must leave IOB tags untouched.
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == iobs_before
def test_add_label_get_label(pipe_cls, n_moves, model_config):
    """Test that added labels are returned correctly. This test was added to
    test for a bug in DependencyParser.labels that'd cause it to fail when
    splitting the move names.
    """
    labels = ["A", "B", "C"]
    model = registry.resolve({"model": model_config}, validate=True)["model"]
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    pipe = pipe_cls(Vocab(), model, **config)
    for label in labels:
        pipe.add_label(label)
    # Each label expands into n_moves transition names.
    assert len(pipe.move_names) == len(labels) * n_moves
    assert sorted(pipe.labels) == labels
def test_doc_add_entities_set_ents_iob(en_vocab):
    """Setting doc.ents updates per-token IOB annotations."""
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
def test_serialize_pipe_exclude(en_vocab, Parser):
    """The exclude argument filters cfg on both serialize and deserialize."""
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]

    def get_new_parser():
        # Fresh parser for each deserialization round.
        return Parser(en_vocab, model)

    parser = Parser(en_vocab, model)
    parser.cfg["foo"] = "bar"
    # No exclusion: the custom cfg entry survives the round-trip.
    new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
    assert "foo" in new_parser.cfg
    # Excluding cfg on deserialization drops the entry.
    new_parser = get_new_parser().from_bytes(
        parser.to_bytes(exclude=["vocab"]), exclude=["cfg"]
    )
    assert "foo" not in new_parser.cfg
    # Excluding cfg on serialization also drops the entry.
    new_parser = get_new_parser().from_bytes(
        parser.to_bytes(exclude=["cfg"]), exclude=["vocab"]
    )
    assert "foo" not in new_parser.cfg
def test_issue3345(entity_ruler_factory):
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner = EntityRecognizer(doc.vocab, model)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Advance the state past "I live in" so it sits just before "New".
    state = ner.moves.init_batch([doc])[0]
    for _ in range(3):
        ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
def test_add_label_deserializes_correctly():
    """Move classes agree after resizing and deserializing an NER model."""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    ner1 = EntityRecognizer(Vocab(), model, **config)
    for label in ("C", "B", "A"):
        ner1.add_label(label)
    ner1.initialize(lambda: [_ner_example(ner1)])
    ner2 = EntityRecognizer(Vocab(), model, **config)
    # the second model needs to be resized before we can call from_bytes
    ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
    ner2.from_bytes(ner1.to_bytes())
    assert ner1.moves.n_moves == ner2.moves.n_moves
    for i in range(ner1.moves.n_moves):
        assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
    """A parser written to disk loads back with equal serialized bytes."""
    config = {
        "learn_tokens": False,
        "min_action_freq": 0,
        "update_with_oracle_cut_size": 100,
        "beam_width": 1,
        "beam_update_prob": 1.0,
        "beam_density": 0.0,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser = Parser(en_vocab, model, **config)
    with make_tempdir() as d:
        file_path = d / "parser"
        parser.to_disk(file_path)
        parser_d = Parser(en_vocab, model, **config).from_disk(file_path)
        parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
        parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
        assert len(parser_bytes) == len(parser_d_bytes)
        assert parser_bytes == parser_d_bytes
def test_serialize_parser_strings(Parser):
    """Labels added to one parser survive a bytes round-trip into a fresh vocab."""
    vocab1 = Vocab()
    label = "FunnyLabel"
    assert label not in vocab1.strings
    config = {
        "learn_tokens": False,
        "min_action_freq": 0,
        "update_with_oracle_cut_size": 100,
        "beam_width": 1,
        "beam_update_prob": 1.0,
        "beam_density": 0.0,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser1 = Parser(vocab1, model, **config)
    parser1.add_label(label)
    assert label in parser1.vocab.strings
    vocab2 = Vocab()
    assert label not in vocab2.strings
    parser2 = Parser(vocab2, model, **config).from_bytes(
        parser1.to_bytes(exclude=["vocab"])
    )
    assert label in parser2.vocab.strings