import pickle

import pytest
import spacy
from numpy.testing import assert_equal
from spacy import util
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
from spacy.strings import StringStore
from spacy.training import Example
from spacy.util import make_tempdir
from thinc.api import Config, Model, fix_random_seed

from spacy_transformers import Transformer, TransformerModel
from spacy_transformers.layers import TransformerListener

# NOTE: these tests are excerpted from several test modules; the
# module-level fixtures they rely on (DEFAULT_CONFIG, cfg_string,
# inline_cfg_string, TRAIN_DATA, SPAN_KEY, make_examples,
# _assert_equal_tensors) are sketched alongside the tests below.


def test_transformer_pipeline_todisk_settings():
    nlp = English()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    # initially no attentions
    assert trf.model.tokenizer.model_max_length == 512
    assert trf.model.transformer.config.output_attentions is False
    assert "attentions" not in nlp("test")._.trf_data.model_output
    # modify model_max_length (note that modifications to
    # tokenizer.model_max_length are not serialized by save_pretrained,
    # see: https://github.com/explosion/spaCy/discussions/7393)
    trf.model.tokenizer.init_kwargs["model_max_length"] = 499
    # add attentions on-the-fly
    trf.model.transformer.config.output_attentions = True
    assert nlp("test")._.trf_data.model_output.attentions is not None
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["transformer"]
        trf2 = nlp2.get_pipe("transformer")
        # model_max_length is preserved
        assert trf2.model.tokenizer.model_max_length == 499
        # output_attentions setting is preserved
        assert trf2.model.transformer.config.output_attentions is True
        assert nlp2("test")._.trf_data.model_output.attentions is not None
        # the init configs are empty SimpleFrozenDicts
        assert trf2.model._init_tokenizer_config == {}
        with pytest.raises(NotImplementedError):
            trf2.model._init_tokenizer_config["use_fast"] = False
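# The to_disk tests read a module-level DEFAULT_CONFIG for the
# transformer factory. A minimal sketch of what it is assumed to look
# like; the model name and span getter settings here are hypothetical,
# not taken from the original test module:
DEFAULT_CONFIG = {
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v3",
        "name": "distilbert-base-uncased",
        "get_spans": {
            "@span_getters": "spacy-transformers.strided_spans.v1",
            "window": 128,
            "stride": 96,
        },
    }
}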
def test_transformer_pipeline_textcat():
    """Test that a pipeline with just a transformer+textcat runs and trains
    properly. This used to throw an error because of shape inference issues -
    cf https://github.com/explosion/spaCy/issues/6401"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "textcat"]
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    doc = nlp("We're interested at underwater basket weaving.")
    cats1 = doc.cats
    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = spacy.load(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        cats2 = doc2.cats
        assert cats1 == cats2
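# This test reads a textcat-flavoured variant of `cfg_string` and
# `TRAIN_DATA` from its own test module. A hypothetical sketch of the
# training data, shown only to make the test self-explanatory:
TRAIN_DATA = [
    ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]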
def test_transformer_pipeline_todisk():
    nlp = English()
    nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["transformer"]
def test_initialized_transformer_todisk():
    nlp = Language()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    with make_tempdir() as d:
        trf.to_disk(d)
        nlp2 = Language()
        trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
        trf2.from_disk(d)
def test_transformer_pipeline_todisk_before_initialize():
    nlp = English()
    nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    with make_tempdir() as d:
        # serialize before initialization
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        nlp2.initialize()
        assert "last_hidden_state" in nlp2("test")._.trf_data.model_output
def test_transformer_pipeline_tagger_senter_listener():
    """Test that a pipeline with just a transformer+tagger+senter runs and
    trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "tagger", "senter"]
    tagger = nlp.get_pipe("tagger")
    transformer = nlp.get_pipe("transformer")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(tagger_trf, TransformerListener)
    assert tagger_trf.upstream_name == "custom_upstream"
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)
    # Check that the Transformer component finds its listeners
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert tagger_trf in transformer.listeners
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])
    _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)
    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)
        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)
    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
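# The different test modules define their own variants of `cfg_string`.
# The listener-based variant assumed by the tagger+senter test above
# might look roughly like this; the model name and exact config blocks
# are hypothetical, but the "custom_upstream" wiring matches the
# assertion on `upstream_name`:
cfg_string = """
    [nlp]
    lang = "en"
    pipeline = ["transformer", "tagger", "senter"]

    [components.transformer]
    factory = "transformer"

    [components.transformer.model]
    @architectures = "spacy-transformers.TransformerModel.v3"
    name = "distilbert-base-uncased"

    [components.transformer.model.get_spans]
    @span_getters = "spacy-transformers.strided_spans.v1"
    window = 128
    stride = 96

    [components.tagger]
    factory = "tagger"

    [components.tagger.model]
    @architectures = "spacy.Tagger.v2"

    [components.tagger.model.tok2vec]
    @architectures = "spacy-transformers.TransformerListener.v1"
    grad_factor = 1.0
    upstream = "custom_upstream"

    [components.tagger.model.tok2vec.pooling]
    @layers = "reduce_mean.v1"

    [components.senter]
    factory = "senter"
"""

# Hypothetical sketch of the tagger training data these tests iterate
# over (two texts with per-token tag annotations):
TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]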
def test_inline_transformer_pipeline_todisk():
    orig_config = Config().from_str(inline_cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tagger"]
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["tagger"]
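# The inline tests read a module-level `inline_cfg_string` in which the
# transformer is embedded directly in the tagger's tok2vec, so the
# pipeline is just ["tagger"]. A hypothetical sketch (model name and
# pooling choice are assumptions):
inline_cfg_string = """
    [nlp]
    lang = "en"
    pipeline = ["tagger"]

    [components.tagger]
    factory = "tagger"

    [components.tagger.model]
    @architectures = "spacy.Tagger.v2"

    [components.tagger.model.tok2vec]
    @architectures = "spacy-transformers.Tok2VecTransformer.v3"
    name = "distilbert-base-uncased"

    [components.tagger.model.tok2vec.pooling]
    @layers = "reduce_mean.v1"
"""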
def test_overfitting_IO():
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["trainable_lemmatizer"] < 0.00001

    test_text = "She likes blue eggs"
    doc = nlp(test_text)
    assert doc[0].lemma_ == "she"
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"
    assert doc[3].lemma_ == "egg"

    # Check model after a {to,from}_disk roundtrip
    with util.make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].lemma_ == "she"
        assert doc2[1].lemma_ == "like"
        assert doc2[2].lemma_ == "blue"
        assert doc2[3].lemma_ == "egg"

    # Check model after a {to,from}_bytes roundtrip
    nlp_bytes = nlp.to_bytes()
    nlp3 = English()
    nlp3.add_pipe("trainable_lemmatizer")
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(test_text)
    assert doc3[0].lemma_ == "she"
    assert doc3[1].lemma_ == "like"
    assert doc3[2].lemma_ == "blue"
    assert doc3[3].lemma_ == "egg"

    # Check model after a pickle roundtrip.
    nlp_bytes = pickle.dumps(nlp)
    nlp4 = pickle.loads(nlp_bytes)
    doc4 = nlp4(test_text)
    assert doc4[0].lemma_ == "she"
    assert doc4[1].lemma_ == "like"
    assert doc4[2].lemma_ == "blue"
    assert doc4[3].lemma_ == "egg"
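# The trainable lemmatizer test above comes from its own test module and
# reads TRAIN_DATA with lemma annotations from there; a hypothetical
# sketch consistent with the lemmas asserted above:
TRAIN_DATA = [
    ("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
    ("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
]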
def test_transformer_pipeline_tagger_internal():
    """Test that a tagger with internal transformer runs and trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tagger"]
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(tagger_trf, Model)
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)
    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    doc = nlp("We're interested at underwater basket weaving.")
    doc_tensor = tagger_trf.predict([doc])
    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
        nlp2.initialize(lambda: train_examples)
        # results are not the same if we don't call from_disk
        doc2 = nlp2("We're interested at underwater basket weaving.")
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        with pytest.raises(AssertionError):
            assert_equal(doc_tensor2.doc_data[0].tensors, doc_tensor.doc_data[0].tensors)
        # results ARE the same if we call from_disk
        nlp2.from_disk(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        assert_equal(doc_tensor2.doc_data[0].tensors, doc_tensor.doc_data[0].tensors)
def test_initialized_inline_transformer_pipeline_todisk():
    orig_config = Config().from_str(inline_cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tagger"]
    tagger = nlp.get_pipe("tagger")
    tagger.add_label("V")
    nlp.initialize()
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["tagger"]
        tagger2 = nlp2.get_pipe("tagger")
        assert list(tagger2.labels) == ["V"]
def test_overfitting_IO():
    # Simple test to try and quickly overfit the spancat component,
    # ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01
    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 2
    assert len(spans.attrs["scores"]) == 2
    assert min(spans.attrs["scores"]) > 0.9
    assert {span.text for span in spans} == {"London", "Berlin"}
    assert {span.label_ for span in spans} == {"LOC"}
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 2
        assert len(spans2.attrs["scores"]) == 2
        assert min(spans2.attrs["scores"]) > 0.9
        assert {span.text for span in spans2} == {"London", "Berlin"}
        assert {span.label_ for span in spans2} == {"LOC"}
    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_p"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_r"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0
    # also test that the spancat works for just a single entity in a sentence
    doc = nlp("London")
    assert len(doc.spans[spancat.key]) == 1
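# The spancat tests read SPAN_KEY, TRAIN_DATA / TRAIN_DATA_OVERLAPPING
# and a make_examples helper from their own test module. A hypothetical
# sketch consistent with the assertions above; offsets are character
# offsets, and the overlapping variant additionally labels the whole
# "London and Berlin" span as DOUBLE_LOC:
SPAN_KEY = "potential_entities"
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
    (
        "I like London and Berlin.",
        {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC")]}},
    ),
]
TRAIN_DATA_OVERLAPPING = [
    ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
    (
        "I like London and Berlin",
        {
            "spans": {
                SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]
            }
        },
    ),
]


def make_examples(nlp, data=TRAIN_DATA):
    return [Example.from_dict(nlp.make_doc(text), annot) for text, annot in data]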
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains + IO goes OK"""
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)
    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])
    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)
        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)
    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
def test_overfitting_IO_overlapping():
    # Test for overfitting on overlapping entities
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp, data=TRAIN_DATA_OVERLAPPING)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 3
    assert set(spancat.labels) == {"PERSON", "LOC", "DOUBLE_LOC"}
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01
    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 3
    assert len(spans.attrs["scores"]) == 3
    assert min(spans.attrs["scores"]) > 0.9
    assert {span.text for span in spans} == {
        "London",
        "Berlin",
        "London and Berlin",
    }
    assert {span.label_ for span in spans} == {"LOC", "DOUBLE_LOC"}
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 3
        assert len(spans2.attrs["scores"]) == 3
        assert min(spans2.attrs["scores"]) > 0.9
        assert {span.text for span in spans2} == {
            "London",
            "Berlin",
            "London and Berlin",
        }
        assert {span.label_ for span in spans2} == {"LOC", "DOUBLE_LOC"}
def test_from_to_disk():
    strings = StringStore()
    trees = EditTrees(strings)
    trees.add("deelt", "delen")
    trees.add("gedeeld", "delen")
    trees2 = EditTrees(strings)
    with make_tempdir() as temp_dir:
        trees_file = temp_dir / "edit_trees.bin"
        trees.to_disk(trees_file)
        trees2 = trees2.from_disk(trees_file)
    # Verify that the nodes did not change.
    assert len(trees) == len(trees2)
    for i in range(len(trees)):
        assert trees.tree_to_str(i) == trees2.tree_to_str(i)
    # Reinserting the same trees should not add new nodes.
    trees2.add("deelt", "delen")
    trees2.add("gedeeld", "delen")
    assert len(trees) == len(trees2)
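# A minimal usage sketch of the EditTrees API exercised above, assuming
# the `apply` method that the trainable lemmatizer uses: `add` returns
# the id of the (deduplicated) edit tree, and `apply` replays that tree
# against a form to produce the lemma.
def _edit_trees_apply_sketch():
    strings = StringStore()
    trees = EditTrees(strings)
    tree_id = trees.add("deelt", "delen")
    # Replaying the tree on the original form reproduces the lemma.
    assert trees.apply(tree_id, "deelt") == "delen"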
def test_load_disable_enable() -> None:
    """
    Tests spacy.load() with dis-/enabling components.
    """
    base_nlp = English()
    for pipe in ("sentencizer", "tagger", "parser"):
        base_nlp.add_pipe(pipe)
    with make_tempdir() as tmp_dir:
        base_nlp.to_disk(tmp_dir)
        to_disable = ["parser", "tagger"]
        to_enable = ["tagger", "parser"]

        # Setting only `disable`.
        nlp = spacy.load(tmp_dir, disable=to_disable)
        assert all(comp_name in nlp.disabled for comp_name in to_disable)

        # Setting only `enable`.
        nlp = spacy.load(tmp_dir, enable=to_enable)
        assert all(
            (comp_name in nlp.disabled) is (comp_name not in to_enable)
            for comp_name in nlp.component_names
        )

        # Testing consistent enable/disable combination.
        nlp = spacy.load(
            tmp_dir,
            enable=to_enable,
            disable=[
                comp_name
                for comp_name in nlp.component_names
                if comp_name not in to_enable
            ],
        )
        assert all(
            (comp_name in nlp.disabled) is (comp_name not in to_enable)
            for comp_name in nlp.component_names
        )

        # Inconsistent enable/disable combination.
        with pytest.raises(ValueError):
            spacy.load(tmp_dir, enable=to_enable, disable=["parser"])
def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    text = "This is awesome"
    examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
    optimizer = nlp.initialize(lambda: examples)
    # verify correct configuration with transformer listener
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    tagger_listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(tagger_listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == tagger_listener
    assert isinstance(transformer.model, TransformerModel)
    assert (
        nlp.config["components"]["transformer"]["model"]["@architectures"]
        == "spacy-transformers.TransformerModel.v3"
    )
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.TransformerListener.v1"
    )
    # train pipe before replacing listeners
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    doc = nlp(text)
    preds = [t.tag_ for t in doc]
    doc_tensor = tagger_tok2vec.predict([doc])
    # replace listener and verify predictions are still the same
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert isinstance(tagger_tok2vec, Model)
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.Tok2VecTransformer.v3"
    )
    doc2 = nlp(text)
    assert preds == [t.tag_ for t in doc2]
    pred_tensor = tagger_tok2vec.predict([doc2])
    _assert_equal_tensors(doc_tensor, pred_tensor)
    # attempt training with the new pipeline
    optimizer = nlp.resume_training()
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] > 0.0
    # check for presence of additional fields in model_output
    assert doc2._.trf_data.model_output.pooler_output is not None
    assert doc2._.trf_data.model_output.attentions is not None
    # ensure IO goes OK
    doc_tensor_trained = tagger_tok2vec.predict([doc])
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc3 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_tok2vec2 = tagger2.model.get_ref("tok2vec")
        pred_tensor = tagger_tok2vec2.predict([doc3])
        _assert_equal_tensors(doc_tensor_trained, pred_tensor)
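# The `_assert_equal_tensors` helper used throughout is defined in the
# original test modules. A hypothetical sketch of its behaviour, assuming
# element-wise comparison after conversion to numpy with a small
# tolerance for cross-run float noise:
import numpy
from numpy.testing import assert_array_almost_equal


def _assert_equal_tensors(tensors1, tensors2):
    # Compare two sequences of arrays/tensors element-wise.
    assert len(tensors1) == len(tensors2)
    for t1, t2 in zip(tensors1, tensors2):
        assert_array_almost_equal(numpy.asarray(t1), numpy.asarray(t2), decimal=5)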