def test_issue1915():
    """Check that ``begin_training`` raises on the ``hidden_depth`` setting.

    An NER pipe with a single label is added, then ``begin_training`` is
    called with ``hidden_depth=2`` and is expected to raise ``ValueError``.
    """
    pipeline = Language()
    pipeline.add_pipe(pipeline.create_pipe("ner"))
    pipeline.get_pipe("ner").add_label("answer")
    bad_cfg = {"hidden_depth": 2}  # should error out
    with pytest.raises(ValueError):
        pipeline.begin_training(**bad_cfg)
def test_beam_parse():
    """Smoke test: run the dependency parser on a doc with ``beam_width=2``."""
    nlp = Language()
    parser = DependencyParser(nlp.vocab)
    nlp.add_pipe(parser, name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    sentence = nlp.make_doc("Australia is a country")
    # Only checks that the call completes; no assertion on the parse itself.
    nlp.parser(sentence, beam_width=2)
def test_simple_train():
    """Train a one-label text classifier for five passes and check its output.

    Texts made of "a" are labelled positive and texts made of "b" negative;
    afterwards "aaa" must score at least 0.5 for the "answer" category.
    """
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for _ in range(5):
        samples = [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]
        for text, answer in samples:
            gold = {"cats": {"answer": answer}}
            nlp.update([text], [gold])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.

    There are two issues here:
    1) We have to readd labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
       end up out-of-scale, causing it to learn too fast.

    Note: the ``train_data`` argument is not read by this function body; the
    training samples are defined locally.
    """
    samples = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]
    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    # Register every label that occurs in the samples before training starts.
    for _text, spans in samples:
        for _start, _end, span_label in spans:
            ner.add_label(span_label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for _epoch in range(100):
        random.shuffle(samples)
        for text, spans in samples:
            nlp.update([text], [{"entities": spans}])

    # Round-trip the trained pipeline through disk.
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        reloaded = Language().from_disk(model_dir)

    for text, spans in samples:
        doc = reloaded(text)
        predicted = {}
        for ent in doc.ents:
            predicted[(ent.start_char, ent.end_char)] = ent.label_
        for start, end, expected_label in spans:
            if (start, end) in predicted:
                assert predicted[(start, end)] == expected_label
                break
        else:
            # No gold span was found among the predictions; that is only
            # acceptable when the sample had no gold spans at all.
            if spans:
                raise Exception(predicted)
def test_issue1654():
    """Check pipe ordering when components are placed with ``after``/``before``."""
    expected_order = ["1", "2", "3"]

    # Build the pipeline front to back using ``after``.
    forward = Language(Vocab())
    assert not forward.pipeline
    forward.add_pipe(lambda doc: doc, name="1")
    forward.add_pipe(lambda doc: doc, name="2", after="1")
    forward.add_pipe(lambda doc: doc, name="3", after="2")
    assert forward.pipe_names == expected_order

    # Build the pipeline back to front using ``before``.
    backward = Language(Vocab())
    assert not backward.pipeline
    backward.add_pipe(lambda doc: doc, name="3")
    backward.add_pipe(lambda doc: doc, name="2", before="3")
    backward.add_pipe(lambda doc: doc, name="1", before="2")
    assert backward.pipe_names == expected_order
def init(self, model: Language):
    """Initialize the component and add it to the NLP pipeline.

    This base class implementation imports every module named in
    ``self.modules``, then calls :meth:`.Language.add_pipe` with
    ``self.pipe_name`` and ``self.pipe_add_kwargs``. The ``config`` keyword
    is passed only when ``self.pipe_config`` is not ``None``.

    :param model: the spaCy model (``nlp`` in their parlance) to add the
                  pipe to

    """
    for module_name in self.modules:
        __import__(module_name)
    if logger.isEnabledFor(logging.DEBUG):
        message = (f'creating pipe {self.pipe_name} with args: ' +
                   f'{self.pipe_add_kwargs}')
        logger.debug(message)
    if self.pipe_config is not None:
        model.add_pipe(self.pipe_name, config=self.pipe_config,
                       **self.pipe_add_kwargs)
    else:
        model.add_pipe(self.pipe_name, **self.pipe_add_kwargs)
def test_no_resize():
    """Check that the tagger rejects new labels once it has been initialized."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    for tag in ("N", "V"):
        tagger.add_label(tag)
    assert tagger.labels == ("N", "V")
    nlp.initialize()
    output_dim = tagger.model.get_dim("nO")
    assert output_dim == 2
    # this throws an error because the tagger can't be resized after initialization
    with pytest.raises(ValueError):
        tagger.add_label("J")
def test_no_resize():
    """Check that spancat rejects new labels once it has been initialized."""
    nlp = Language()
    spancat_config = {"spans_key": SPAN_KEY}
    spancat = nlp.add_pipe("spancat", config=spancat_config)
    for span_label in ("Thing", "Phrase"):
        spancat.add_label(span_label)
    assert spancat.labels == ("Thing", "Phrase")
    nlp.initialize()
    output_dim = spancat.model.get_dim("nO")
    assert output_dim == 2
    # this throws an error because the spancat can't be resized after initialization
    with pytest.raises(ValueError):
        spancat.add_label("Stuff")
def test_issue2564():
    """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    nlp.initialize()

    # Direct call on a single text.
    single_doc = nlp("hello world")
    assert single_doc.has_annotation("TAG")

    # Streaming through ``nlp.pipe``; only the first yielded doc is checked.
    doc_stream = nlp.pipe(["hello", "world"])
    first_piped = next(doc_stream)
    assert first_piped.has_annotation("TAG")
def test_pipe_function_component():
    """Check registration and lookup of a function component.

    The function is registered with ``@Language.component``; adding the
    function object itself must raise ``ValueError``, while adding it by
    name must work and return the very same function.
    """
    name = "test_component"

    @Language.component(name)
    def component(doc: Doc) -> Doc:
        return doc

    assert name in registry.factories
    nlp = Language()
    # Passing the callable instead of its registered name is rejected.
    with pytest.raises(ValueError):
        nlp.add_pipe(component)
    nlp.add_pipe(name)
    assert name in nlp.pipe_names
    assert nlp.pipe_factories[name] == name
    assert Language.get_factory_meta(name)
    assert nlp.get_pipe_meta(name)
    fetched = nlp.get_pipe(name)
    assert fetched == component
    created = nlp.create_pipe(name)
    assert created == component
def test_ner_labels_added_implicitly_on_update():
    """Check that ``nlp.update`` adds an unseen entity label to the NER pipe."""
    nlp = Language()
    ner = nlp.add_pipe("ner")
    for known_label in ("A", "B", "C"):
        ner.add_label(known_label)
    nlp.initialize()
    # The reference doc carries an entity label "D" that was never added.
    reference = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"])
    predicted = nlp.make_doc(reference.text)
    example = Example(predicted, reference)
    assert "D" not in ner.labels
    nlp.update([example])
    assert "D" in ner.labels
def test_component_factories_from_nlp():
    """Test that class components can implement a from_nlp classmethod that
    gives them access to the nlp object and config via the factory."""

    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    # Install a mock as the ``from_nlp`` classmethod so the call can be
    # inspected afterwards.
    from_nlp_mock = Mock()
    from_nlp_mock.return_value = TestComponent5()
    TestComponent5.from_nlp = classmethod(from_nlp_mock)
    register_c5 = component("c5")
    TestComponent5 = register_c5(TestComponent5)

    assert "c5" in Language.factories
    nlp = Language()
    created = nlp.create_pipe("c5", config={"foo": "bar"})
    nlp.add_pipe(created)
    assert nlp("hello world")
    # The first argument here is the class itself, so we're accepting any here
    from_nlp_mock.assert_called_once_with(ANY, nlp, foo="bar")
def test_add_pipe(nlp: Language):
    """It works as a pipeline component and can be disabled."""
    # given
    # NOTE: this instance is created but not used again in this test.
    base_text_rank = BaseTextRank()
    nlp.add_pipe("textrank", last=True)

    # works as a pipeline component
    # when
    text = "linear constraints over the"
    doc = nlp(text)
    phrases = [phrase.text for phrase in doc._.phrases]

    # then
    assert len(doc._.phrases) > 0
    assert any("constraints" in phrase_text for phrase_text in phrases)

    # identifies phrases not in noun chunks
    # when
    text = "everything you need to know about student loan interest rates variable and fixed rates capitalization amortization student loan refinancing and more."
    doc = nlp(text)
    phrases = [phrase.text for phrase in doc._.phrases]

    # then
    assert len(doc._.phrases) >= 2

    # resolves Py 3.5 dict KeyError
    # when
    text = "linear constraints over the set of natural numbers"
    doc = nlp(text)
    phrases = [phrase.text for phrase in doc._.phrases]

    # then
    assert any("constraints" in phrase_text for phrase_text in phrases)

    # pipeline can be disabled
    # when
    with nlp.select_pipes(disable=["textrank"]):
        doc = nlp(text)

        # then
        assert len(doc._.phrases) == 0
def test_resize(name, textcat_config):
    """The new textcat architectures are resizable"""
    nlp = Language()
    textcat = nlp.add_pipe(name, config={"model": textcat_config})
    for cat_label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(cat_label)
    # The output dimension may be unset (None) or must match the label count.
    assert textcat.model.maybe_get_dim("nO") in [2, None]
    nlp.initialize()
    assert textcat.model.maybe_get_dim("nO") in [2, None]
    # Adding a third label after initialization is allowed here.
    textcat.add_label("NEUTRAL")
    assert textcat.model.maybe_get_dim("nO") in [3, None]
def test_label_types(name):
    """Check that non-string labels are rejected and the label-count rule holds.

    For the "textcat" component, initializing with a single label must
    raise ``ValueError``; for any other component name it must succeed.
    """
    nlp = Language()
    textcat = nlp.add_pipe(name)
    textcat.add_label("answer")
    with pytest.raises(ValueError):
        textcat.add_label(9)
    # textcat requires at least two labels
    if name != "textcat":
        nlp.initialize()
    else:
        with pytest.raises(ValueError):
            nlp.initialize()
def test_issue9904():
    """Check that the textcat loss is unchanged when the batch is duplicated."""
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    get_examples = make_get_examples_single_label(nlp)
    nlp.initialize(get_examples)

    examples = get_examples()
    predicted_docs = [eg.predicted for eg in examples]
    scores = textcat.predict(predicted_docs)

    base_loss = textcat.get_loss(examples, scores)[0]
    # Same examples and scores, each repeated twice.
    doubled_scores = scores.repeat(2, axis=0)
    doubled_loss = textcat.get_loss(examples * 2, doubled_scores)[0]
    assert base_loss == pytest.approx(doubled_loss)
def test_no_resize(name, textcat_config):
    """The old textcat architectures weren't resizable"""
    nlp = Language()
    textcat = nlp.add_pipe(name, config={"model": textcat_config})
    for cat_label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(cat_label)
    nlp.initialize()
    assert textcat.model.maybe_get_dim("nO") in [2, None]
    # this throws an error because the textcat can't be resized after initialization
    with pytest.raises(ValueError):
        textcat.add_label("NEUTRAL")
def test_pipe_factories_decorator_idempotent():
    """Check that decorator can be run multiple times if the function is the
    same. This is especially relevant for live reloading because we don't
    want spaCy to raise an error if a module registering components is
    reloaded.
    """
    name = "test_pipe_factories_decorator_idempotent"
    # The same function object must be reused for every registration.
    func = lambda nlp, name: lambda doc: doc
    for _ in range(5):
        Language.factory(name, func=func)
    nlp = Language()
    nlp.add_pipe(name)
    Language.factory(name, func=func)
    # Make sure it also works for component decorator, which creates the
    # factory function
    name2 = f"{name}2"
    func2 = lambda doc: doc
    for _ in range(5):
        Language.component(name2, func=func2)
    nlp = Language()
    # Add the component registered just above (name2), so that the factory
    # created by the component decorator is the one being exercised.
    nlp.add_pipe(name2)
    Language.component(name2, func=func2)
def test_initialize_examples(name, get_examples, train_data):
    """Check which ``get_examples`` arguments ``nlp.initialize`` accepts.

    Initializing without examples or with ``get_examples(nlp)`` must work;
    a callable returning ``None`` and a bare ``get_examples()`` call are
    each expected to end in ``TypeError``.
    """
    nlp = Language()
    textcat = nlp.add_pipe(name)
    for _text, annotations in train_data:
        for cat_label, _value in annotations.get("cats").items():
            textcat.add_label(cat_label)
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize()
    valid_examples = get_examples(nlp)
    nlp.initialize(get_examples=valid_examples)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=get_examples())
def test_language_factories_scores():
    """Check how factory default score weights end up in the training config.

    Two factories with their own ``default_score_weights`` are registered
    and added to a pipeline; the combined ``score_weights`` in the config
    are then compared against expected values, first with defaults, then
    with custom overrides, then with a ``None`` override.
    """
    name = "test_language_factories_scores"
    first_name = f"{name}1"
    second_name = f"{name}2"
    func = lambda nlp, name: lambda doc: doc
    weights1 = {"a1": 0.5, "a2": 0.5}
    weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
    Language.factory(first_name, default_score_weights=weights1, func=func)
    Language.factory(second_name, default_score_weights=weights2, func=func)
    meta1 = Language.get_factory_meta(first_name)
    assert meta1.default_score_weights == weights1
    meta2 = Language.get_factory_meta(second_name)
    assert meta2.default_score_weights == weights2

    nlp = Language()
    nlp._config["training"]["score_weights"] = {}
    nlp.add_pipe(first_name)
    nlp.add_pipe(second_name)
    cfg = nlp.config["training"]
    expected_weights = {
        "a1": 0.25,
        "a2": 0.25,
        "b1": 0.1,
        "b2": 0.35,
        "b3": 0.05,
    }
    assert cfg["score_weights"] == expected_weights

    # Test with custom defaults
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = 0.0
    config["training"]["score_weights"]["b3"] = 1.0
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
    assert score_weights == expected

    # Test with null values
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = None
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
    assert score_weights == expected
def test_pipe_class_component_init():
    """Check class components registered directly and through a factory function.

    ``Component1`` is decorated with ``@Language.factory`` itself, while
    ``Component2`` is built by a separately registered factory function.
    Both must be addable by name only and must receive the ``nlp`` object.
    """
    name1 = "test_class_component1"
    name2 = "test_class_component2"

    @Language.factory(name1)
    class Component1:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    class Component2:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @Language.factory(name2)
    def factory(nlp: Language, name=name2):
        return Component2(nlp, name)

    nlp = Language()
    registered = [(name1, Component1), (name2, Component2)]
    for name, Component in registered:
        assert name in registry.factories
        # Passing an instance instead of the registered name is rejected.
        with pytest.raises(ValueError):
            nlp.add_pipe(Component(nlp, name))
        nlp.add_pipe(name)
        assert name in nlp.pipe_names
        assert nlp.pipe_factories[name] == name
        assert Language.get_factory_meta(name)
        assert nlp.get_pipe_meta(name)
        fetched = nlp.get_pipe(name)
        assert isinstance(fetched, Component)
        assert isinstance(fetched.nlp, Language)
        created = nlp.create_pipe(name)
        assert isinstance(created, Component)
        assert isinstance(created.nlp, Language)
def test_initialize_examples():
    """Check which ``get_examples`` arguments ``nlp.initialize`` accepts
    when the pipeline contains a morphologizer."""
    nlp = Language()
    morphologizer = nlp.add_pipe("morphologizer")
    morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
    train_examples = [
        Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA
    ]
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize()
    nlp.initialize(get_examples=lambda: train_examples)
    # A callable returning None is rejected.
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    # A plain list (not a callable) is rejected.
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=train_examples)
def test_make_spangroup(max_positive, nr_results):
    """Check the span group built by spancat from fixed scores.

    ``max_positive`` is passed to the spancat config and ``nr_results`` is
    the number of spans the resulting group is expected to contain.
    """
    fix_random_seed(0)
    nlp = Language()
    spancat_config = {
        "spans_key": SPAN_KEY,
        "threshold": 0.5,
        "max_positive": max_positive,
    }
    spancat = nlp.add_pipe("spancat", config=spancat_config)
    doc = nlp.make_doc("Greater London")
    make_suggester = registry.misc.get("spacy.ngram_suggester.v1")
    ngram_suggester = make_suggester(sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    expected_indices = numpy.asarray([[0, 1], [1, 2], [0, 2]])
    assert_array_equal(OPS.to_numpy(indices), expected_indices)
    labels = ["Thing", "City", "Person", "GreatCity"]
    # One row of scores per candidate span, one column per label.
    scores = numpy.asarray(
        [
            [0.2, 0.4, 0.3, 0.1],
            [0.1, 0.6, 0.2, 0.4],
            [0.8, 0.7, 0.3, 0.9],
        ],
        dtype="f",
    )
    spangroup = spancat._make_span_group(doc, indices, scores, labels)
    group_scores = spangroup.attrs["scores"]
    assert len(spangroup) == nr_results

    # first span is always the second token "London"
    assert spangroup[0].text == "London"
    assert spangroup[0].label_ == "City"
    assert_almost_equal(0.6, group_scores[0], 5)

    # second span depends on the number of positives that were allowed
    assert spangroup[1].text == "Greater London"
    if max_positive == 1:
        assert spangroup[1].label_ == "GreatCity"
        assert_almost_equal(0.9, group_scores[1], 5)
    else:
        assert spangroup[1].label_ == "Thing"
        assert_almost_equal(0.8, group_scores[1], 5)

    if nr_results > 2:
        assert spangroup[2].text == "Greater London"
        if max_positive == 2:
            assert spangroup[2].label_ == "GreatCity"
            assert_almost_equal(0.9, group_scores[2], 5)
        else:
            assert spangroup[2].label_ == "City"
            assert_almost_equal(0.7, group_scores[2], 5)

    # The last span is checked regardless of how many results there are.
    assert spangroup[-1].text == "Greater London"
    assert spangroup[-1].label_ == "GreatCity"
    assert_almost_equal(0.9, group_scores[-1], 5)
def test_doc_gc():
    """Check that spans become unusable once only ``doc.spans`` is retained."""
    # If the Doc object is garbage collected, the spans won't be functional afterwards
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    spancat.add_label("PERSON")
    nlp.initialize()
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    # Only the span containers are kept; the Doc objects are not referenced.
    all_spans = [doc.spans for doc in nlp.pipe(texts)]
    for _text, spangroups in zip(texts, all_spans):
        assert isinstance(spangroups, SpanGroups)
        for _key, spangroup in spangroups.items():
            assert isinstance(spangroup, SpanGroup)
            assert len(spangroup) > 0
            with pytest.raises(RuntimeError):
                _span = spangroup[0]
def test_issue_3526_4(en_vocab):
    """Check that entity ruler patterns and ``overwrite`` survive a disk round trip."""
    nlp = Language(vocab=en_vocab)
    patterns = [{"label": "ORG", "pattern": "Apple"}]
    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
    ruler.add_patterns(patterns)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        # State of the in-memory ruler after saving.
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        # State of the ruler in the pipeline loaded back from disk.
        reloaded = load(tmpdir)
        new_ruler = reloaded.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
def entity_linker():
    """Build an initialized entity linker backed by a one-entity knowledge base.

    Returns the ``entity_linker`` pipe of a fresh ``Language`` pipeline.
    """
    nlp = Language()

    def create_kb(vocab):
        knowledge_base = KnowledgeBase(vocab, entity_vector_length=1)
        knowledge_base.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
        return knowledge_base

    linker = nlp.add_pipe("entity_linker")
    linker.set_kb(create_kb)
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    nlp.initialize()
    return linker
def test_pipe_class_component_config():
    """Check config validation for class components with typed arguments.

    The same factory name is registered on ``Language`` and on ``English``;
    each pipeline must validate its config and build its own class.
    """
    name = "test_class_component_config"

    @Language.factory(name)
    class Component:
        def __init__(self, nlp: Language, name: str, value1: StrictInt,
                     value2: StrictStr):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = True
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @English.factory(name)
    class ComponentEN:
        def __init__(self, nlp: Language, name: str, value1: StrictInt,
                     value2: StrictStr):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = False

        def __call__(self, doc: Doc) -> Doc:
            return doc

    wrong_type_config = {"value1": "10", "value2": "hello"}

    nlp = Language()
    with pytest.raises(ConfigValidationError):  # no config provided
        nlp.add_pipe(name)
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp.add_pipe(name, config={"value1": "10", "value2": "hello"})
    # A "name" key inside the config is expected to produce a warning.
    with pytest.warns(UserWarning):
        nlp.add_pipe(
            name,
            config={"value1": 10, "value2": "hello", "name": "wrong_name"},
        )
    base_pipe = nlp.get_pipe(name)
    assert isinstance(base_pipe.nlp, Language)
    assert base_pipe.value1 == 10
    assert base_pipe.value2 == "hello"
    assert base_pipe.is_base is True
    assert base_pipe.name == name

    nlp_en = English()
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp_en.add_pipe(name, config=wrong_type_config)
    nlp_en.add_pipe(name, config={"value1": 10, "value2": "hello"})
    en_pipe = nlp_en.get_pipe(name)
    assert isinstance(en_pipe.nlp, English)
    assert en_pipe.value1 == 10
    assert en_pipe.value2 == "hello"
    assert en_pipe.is_base is False
def test_initialize_examples():
    """Check which ``get_examples`` arguments ``nlp.initialize`` accepts
    when the pipeline contains a trainable lemmatizer."""
    nlp = Language()
    nlp.add_pipe("trainable_lemmatizer")
    train_examples = [
        Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA
    ]
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize(get_examples=lambda: train_examples)
    # Each of the following arguments is expected to be rejected.
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: train_examples[0])
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: [])
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=train_examples)
def test_issue1654():
    """Check pipe ordering when a registered component is placed with
    ``after``/``before`` under different names."""
    expected_order = ["1", "2", "3"]

    forward = Language(Vocab())
    assert not forward.pipeline

    @Language.component("component")
    def component(doc):
        return doc

    # Build the pipeline front to back using ``after``.
    forward.add_pipe("component", name="1")
    forward.add_pipe("component", name="2", after="1")
    forward.add_pipe("component", name="3", after="2")
    assert forward.pipe_names == expected_order

    # Build the pipeline back to front using ``before``.
    backward = Language(Vocab())
    assert not backward.pipeline
    backward.add_pipe("component", name="3")
    backward.add_pipe("component", name="2", before="3")
    backward.add_pipe("component", name="1", before="2")
    assert backward.pipe_names == expected_order
def test_component_decorator_assigns():
    """Check the ``assigns``/``requires`` metadata set by the component decorator.

    Three function components are registered and added to a pipeline, then
    the pipeline analysis helpers are queried for who assigns and who
    requires particular attributes.
    """
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True

    @component("c1", assigns=["token.tag", "doc.tensor"])
    def test_component1(doc):
        return doc

    @component(
        "c2",
        requires=["token.tag", "token.pos"],
        assigns=["token.lemma", "doc.tensor"],
    )
    def test_component2(doc):
        return doc

    @component(
        "c3",
        requires=["token.lemma"],
        assigns=["token._.custom_lemma"],
    )
    def test_component3(doc):
        return doc

    assert "c1" in Language.factories
    assert "c2" in Language.factories
    assert "c3" in Language.factories

    nlp = Language()
    nlp.add_pipe(test_component1)
    # Adding c2 is expected to emit a warning.
    with pytest.warns(UserWarning):
        nlp.add_pipe(test_component2)
    nlp.add_pipe(test_component3)
    tensor_assigners = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    assert [name for name, _ in tensor_assigners] == ["c1", "c2"]

    # A second instance of the "c1" factory, added under another name.
    test_component4 = nlp.create_pipe("c1")
    assert test_component4.name == "c1"
    assert test_component4.factory == "c1"
    nlp.add_pipe(test_component4, name="c4")
    assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
    assert "c4" not in Language.factories
    assert nlp.pipe_factories["c1"] == "c1"
    assert nlp.pipe_factories["c4"] == "c1"
    tensor_assigners = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    assert [name for name, _ in tensor_assigners] == ["c1", "c2", "c4"]
    pos_requirers = get_requires_for_attr(nlp.pipeline, "token.pos")
    assert [name for name, _ in pos_requirers] == ["c2"]
    assert print_summary(nlp, no_print=True)
    assert nlp("hello world")
def test_simple_train():
    """Train a one-label text classifier for five passes and check its output.

    Texts made of "a" are labelled positive and texts made of "b" negative;
    afterwards "aaa" must score at least 0.5 for the "answer" category.
    """
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("answer")
    nlp.initialize()
    for _ in range(5):
        samples = [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]
        for text, answer in samples:
            annotated = (text, {"cats": {"answer": answer}})
            nlp.update(annotated)
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_language_source_and_vectors(nlp2):
    """Check that sourcing a pipe copies strings but leaves vectors untouched.

    A textcat pipe is sourced from a freshly built pipeline into ``nlp2``;
    this is expected to warn, to add the source's strings to ``nlp2`` and
    to leave the source vectors unchanged.
    """
    nlp = Language(Vocab())
    textcat = nlp.add_pipe("textcat")
    for cat_label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(cat_label)
    nlp.initialize()

    long_string = "thisisalongstring"
    assert long_string not in nlp.vocab.strings
    assert long_string not in nlp2.vocab.strings
    nlp.vocab.strings.add(long_string)

    assert nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes()
    vectors_before = nlp.vocab.vectors.to_bytes()
    with pytest.warns(UserWarning):
        nlp2.add_pipe("textcat", name="textcat2", source=nlp)
    # strings should be added
    assert long_string in nlp2.vocab.strings
    # vectors should remain unmodified
    assert nlp.vocab.vectors.to_bytes() == vectors_before
def test_simple_train():
    """Train spancat for 40 updates and check its predictions and F-score."""
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    get_examples = make_get_examples(nlp)
    nlp.initialize(get_examples)
    sgd = nlp.create_optimizer()
    assert len(spancat.labels) != 0
    for _ in range(40):
        losses = {}
        batch = list(get_examples())
        nlp.update(batch, losses=losses, drop=0.1, sgd=sgd)

    doc = nlp("I like London and Berlin.")
    predicted_spans = doc.spans[spancat.key]
    assert predicted_spans == doc.spans[SPAN_KEY]
    assert len(doc.spans[spancat.key]) == 2
    assert doc.spans[spancat.key][0].text == "London"

    scores = nlp.evaluate(get_examples())
    f_score_key = f"spans_{SPAN_KEY}_f"
    assert f_score_key in scores
    assert scores[f_score_key] == 1.0
def test_token_splitter():
    """Check that only tokens of at least ``min_length`` characters are split.

    With ``min_length=20`` and ``split_length=5``, a 19-character token is
    left alone while a 27-character token is cut into pieces of up to five
    characters.
    """
    nlp = Language()
    splitter_config = {"min_length": 20, "split_length": 5}
    token_splitter = nlp.add_pipe("token_splitter", config=splitter_config)

    # First token is 19 characters long: below the threshold, left intact.
    short_doc = nlp("aaaaabbbbbcccccdddd e f g")
    short_tokens = [t.text for t in short_doc]
    assert short_tokens == ["aaaaabbbbbcccccdddd", "e", "f", "g"]

    # First token is 27 characters long: split into chunks.
    doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
    expected_tokens = [
        "aaaaa",
        "bbbbb",
        "ccccc",
        "ddddd",
        "eeeee",
        "ff",
        "g",
        "h",
        "i",
    ]
    assert [t.text for t in doc] == expected_tokens
    assert all(len(t.text) <= token_splitter.split_length for t in doc)