def test_pipe_factories_language_specific():
    """Check that Language subclasses can register their own factories and
    still fall back to the factories registered on the base class."""
    shared_name = "specific_component1"
    german_only_name = "specific_component2"
    Language.component(shared_name, func=lambda: "base")
    English.component(shared_name, func=lambda: "en")
    German.component(german_only_name, func=lambda: "de")

    # Every class sees the shared name; only German sees the second one.
    for lang_cls, sees_german_only in (
        (Language, False),
        (English, False),
        (German, True),
    ):
        assert lang_cls.has_factory(shared_name)
        assert bool(lang_cls.has_factory(german_only_name)) == sees_german_only

    base_nlp = Language()
    assert base_nlp.create_pipe(shared_name)() == "base"
    with pytest.raises(ValueError):
        base_nlp.create_pipe(german_only_name)

    english_nlp = English()
    assert english_nlp.create_pipe(shared_name)() == "en"
    with pytest.raises(ValueError):
        english_nlp.create_pipe(german_only_name)

    # German has no override for the shared name, so the base one is used.
    german_nlp = German()
    assert german_nlp.create_pipe(shared_name)() == "base"
    assert german_nlp.create_pipe(german_only_name)() == "de"
def test_pipe_factories_empty_dict_default():
    """Check that a factory whose default config contains an empty dict can
    be instantiated without a config validation error being raised."""
    # TODO: fix this
    factory_name = "test_pipe_factories_empty_dict_default"

    @Language.factory(factory_name, default_config={"foo": {}})
    def factory(nlp: Language, name: str, foo: dict):
        ...

    # Creating the pipe is the whole test: it passes if nothing is raised.
    Language().create_pipe(factory_name)
def test_issue1727():
    """Check that a tagger created without pretrained vectors still reports
    zero pretrained dims after vectors are attached and it is round-tripped
    through disk."""
    nlp = Language(Vocab())
    vector_table = Vectors(
        data=numpy.ones((3, 300), dtype="f"),
        keys=["I", "am", "Matt"],
    )
    tagger = nlp.create_pipe("tagger")
    tagger.add_label("PRP")
    assert tagger.cfg.get("pretrained_dims", 0) == 0

    # Attach vectors only after the tagger exists, then serialize and reload.
    tagger.vocab.vectors = vector_table
    with make_tempdir() as tmp_path:
        tagger.to_disk(tmp_path)
        restored = nlp.create_pipe("tagger").from_disk(tmp_path)
        assert restored.cfg.get("pretrained_dims", 0) == 0
def test_issue1915():
    """Expect begin_training() to raise ValueError when it is handed the
    ``hidden_depth`` setting."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")

    bad_cfg = {"hidden_depth": 2}  # should error out
    with pytest.raises(ValueError):
        nlp.begin_training(**bad_cfg)
def test_factories_merge_noun_chunks(doc):
    """Check that the ``merge_noun_chunks`` pipe shrinks the 7-token doc to
    6 tokens, with "New York" ending up as the token at index 2."""
    assert len(doc) == 7
    merger = Language().create_pipe("merge_noun_chunks")
    merger(doc)  # the assertions below rely on doc being modified in place
    assert len(doc) == 6
    assert doc[2].text == "New York"
def test_issue1915():
    """Expect begin_training() to raise ValueError when it is handed the
    ``hidden_depth`` setting."""
    pipeline = Language()
    pipeline.add_pipe(pipeline.create_pipe("ner"))
    pipeline.get_pipe("ner").add_label("answer")

    rejected_settings = {"hidden_depth": 2}  # should error out
    with pytest.raises(ValueError):
        pipeline.begin_training(**rejected_settings)
def test_factories_merge_noun_chunks(doc2):
    """Check that the ``merge_noun_chunks`` pipe shrinks the 7-token doc to
    6 tokens, with "New York" ending up as the token at index 2."""
    assert len(doc2) == 7
    merger = Language().create_pipe("merge_noun_chunks")
    merger(doc2)  # the assertions below rely on doc2 being modified in place
    assert len(doc2) == 6
    assert doc2[2].text == "New York"
def nlp():
    """Build a Language object with a ``textcat`` pipe that knows the labels
    POSITIVE and NEGATIVE, call begin_training() on it, and return it."""
    pipeline = Language(Vocab())
    categorizer = pipeline.create_pipe("textcat")
    categorizer.add_label("POSITIVE")
    categorizer.add_label("NEGATIVE")
    pipeline.add_pipe(categorizer)
    pipeline.begin_training()
    return pipeline
def create_pipeline(nlp: Language, cfg: omegaconf.DictConfig) -> List[Pipe]:
    """Create one pipe per entry of ``cfg`` via ``nlp.create_pipe``.

    nlp: the Language object used to create each pipe.
    cfg: mapping of pipe name to that pipe's config; anything that is not
        already a DictConfig is first passed through ``OmegaConf.create``.
    RETURNS: the created pipes, in the iteration order of ``cfg``.
    """
    conf = cfg if isinstance(cfg, omegaconf.DictConfig) else OmegaConf.create(cfg)

    def _plain_config(section):
        # A falsy section is replaced by an empty config before conversion,
        # and a falsy conversion result is replaced by a fresh dict.
        container = OmegaConf.to_container(section or OmegaConf.create({}))
        return container or dict()

    return [
        nlp.create_pipe(pipe_name, config=_plain_config(section))
        for pipe_name, section in conf.items()
    ]
def test_factories_merge_ents(doc):
    """Check that the ``merge_entities`` pipe shrinks the 7-token doc to 6
    tokens, keeps exactly one entity, and leaves "New York" at index 2."""
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    merger = Language().create_pipe("merge_entities")
    merger(doc)  # the assertions below rely on doc being modified in place
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    assert doc[2].text == "New York"
def test_factories_merge_ents(doc2):
    """Check that the ``merge_entities`` pipe shrinks the 7-token doc to 6
    tokens, keeps exactly one entity, and leaves "New York" at index 2."""
    assert len(doc2) == 7
    assert len(list(doc2.ents)) == 1
    merger = Language().create_pipe("merge_entities")
    merger(doc2)  # the assertions below rely on doc2 being modified in place
    assert len(doc2) == 6
    assert len(list(doc2.ents)) == 1
    assert doc2[2].text == "New York"
def tagger():
    """Return a ``tagger`` pipe that has been added to a fresh Language
    object and had begin_training() called on it."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("tagger"))
    pipe = nlp.get_pipe("tagger")
    # The model has to be created, for two reasons:
    # 1. serialization errors out when there is no model, and
    # 2. the affected line is the one that serializes the model.
    pipe.begin_training(pipeline=nlp.pipeline)
    return pipe
def entity_linker():
    """Return an ``entity_linker`` pipe that has been added to a fresh
    Language object, given a knowledge base, and had begin_training()
    called on it."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("entity_linker"))
    linker = nlp.get_pipe("entity_linker")
    # The model has to be created, for two reasons:
    # 1. serialization errors out when there is no model, and
    # 2. the affected line is the one that serializes the model.
    linker.set_kb(KnowledgeBase(nlp.vocab, entity_vector_length=1))
    linker.begin_training(pipeline=nlp.pipeline)
    return linker
def test_tagger_begin_training_tag_map():
    """Check that calling begin_training() without gold tuples leaves the
    tag map entry and label added beforehand in place."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    labels_before = len(tagger.labels)
    tagger.add_label("A", {"POS": "NOUN"})
    nlp.add_pipe(tagger)
    nlp.begin_training()

    assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
    labels_after = len(nlp.get_pipe("tagger").labels)
    assert labels_before + 1 == labels_after
def test_issue2564():
    """Check that docs come out with ``is_tagged`` set both when the
    pipeline is called directly and when it is run via Language.pipe."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    tagger.begin_training()  # initialise weights
    nlp.add_pipe(tagger)

    assert nlp("hello world").is_tagged

    # Only the first doc yielded by the pipe() generator is inspected.
    first_piped = next(nlp.pipe(["hello", "world"]))
    assert first_piped.is_tagged
def load_default_model():
    """Return a Language object built on the vocab stored at MODEL_PATH,
    with a ``sentencizer`` as its only added pipe."""
    # model = spacy.load(MODEL_PATH, disable=['tagger', 'parser', 'ner'])
    vocab = spacy.vocab.Vocab().from_disk(MODEL_PATH)
    model = Language(vocab)
    # Using their parser for sentence segmentation would be ideal, but we
    # don't have all day (week/month), so the basic version is used instead.
    model.add_pipe(model.create_pipe('sentencizer'))
    return model
def _initialize_textcat(nlp: Language) -> None:
    """Make sure ``nlp`` has a ``textcat`` pipe, appending one at the end of
    the pipeline when it is missing."""
    if 'textcat' in nlp.pipe_names:
        return

    LOGGER.info('Adding the text categorizer to the spacy nlp pipeline')
    # nlp.create_pipe works for built-ins that are registered with spaCy
    textcat_config = {
        'exclusive_classes': True,
        'architecture': 'simple_cnn'
    }
    nlp.add_pipe(nlp.create_pipe('textcat', config=textcat_config), last=True)
def test_ner_warns_no_lookups():
    """Check that begin_training() with an ``ner`` pipe emits a UserWarning
    while the vocab lookups are empty, and emits no warning at all once a
    ``lexeme_norm`` table with an entry has been added."""
    # Imported locally so this block does not depend on a module-level import.
    import warnings

    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()

    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    # pytest.warns(None) is deprecated in pytest 7 and rejected in pytest 8,
    # so warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")  # record every warning, unfiltered
        nlp.begin_training()
    assert not record
def test_tagger_warns_no_lemma_lookups():
    """Check that begin_training() on the tagger and on the pipeline emits a
    UserWarning while the vocab lookups are empty, and that the pipeline
    emits no warning at all once a ``lemma_lookup`` table has been added."""
    # Imported locally so this block does not depend on a module-level import.
    import warnings

    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()

    nlp.vocab.lookups.add_table("lemma_lookup")
    # pytest.warns(None) is deprecated in pytest 7 and rejected in pytest 8,
    # so warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")  # record every warning, unfiltered
        nlp.begin_training()
    assert not record
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.

    Two known problems are involved:
    1) The labels have to be re-added, which isn't very nice.
    2) The learning rate of the weight update can't be set, so we end up
       out-of-scale and the model learns too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]
    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    # Register every label that occurs anywhere in the training data.
    for _, spans in TRAIN_DATA:
        for _start, _end, span_label in spans:
            ner.add_label(span_label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for _ in range(100):
        random.shuffle(TRAIN_DATA)
        for text, spans in TRAIN_DATA:
            nlp.update([text], [{"entities": spans}])

    # Round-trip the trained pipeline through disk before evaluating it.
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for text, spans in TRAIN_DATA:
        doc = nlp2(text)
        predicted = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        # Only the first annotated span that was also predicted is checked.
        hit = next(
            ((start, end, label) for start, end, label in spans if (start, end) in predicted),
            None,
        )
        if hit is not None:
            assert predicted[(hit[0], hit[1])] == hit[2]
        elif spans:
            # There were annotations but none of their offsets was predicted.
            raise Exception(predicted)
def test_issue1967(label):
    """Check that "JOB-NAME" shows up in the entry at index 1 of the actions
    the ``ner`` moves collect from a one-word example annotated with
    ``label``."""
    nlp = Language()
    ner = nlp.create_pipe("ner", config={})
    annotations = {
        "ids": [0],
        "words": ["word"],
        "tags": ["tag"],
        "heads": [0],
        "deps": ["dep"],
        "entities": [label],
    }
    example = Example.from_dict(Doc(ner.vocab, words=["word"]), annotations)
    actions = ner.moves.get_actions(examples=[example])
    assert "JOB-NAME" in actions[1]
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.

    Two known problems are involved:
    1) The labels have to be re-added, which isn't very nice.
    2) The learning rate of the weight update can't be set, so we end up
       out-of-scale and the model learns too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]
    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    # Register every label that occurs anywhere in the training data.
    for _, annotated in TRAIN_DATA:
        for _start, _end, ent_label in annotated:
            ner.add_label(ent_label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for _ in range(100):
        random.shuffle(TRAIN_DATA)
        for sentence, annotated in TRAIN_DATA:
            nlp.update([sentence], [{"entities": annotated}])

    # Round-trip the trained pipeline through disk before evaluating it.
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for sentence, annotated in TRAIN_DATA:
        doc = nlp2(sentence)
        found = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        # Only the first annotated span that was also predicted is checked.
        first_match = next(
            ((start, end, label) for start, end, label in annotated if (start, end) in found),
            None,
        )
        if first_match is not None:
            assert found[(first_match[0], first_match[1])] == first_match[2]
        elif annotated:
            # There were annotations but none of their offsets was predicted.
            raise Exception(found)
def test_simple_train():
    """Train a one-label ``textcat`` for five passes over a handful of
    strings and check that "aaa" then scores at least 0.5 for "answer"."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()

    samples = [
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    ]
    for _ in range(5):
        for text, score in samples:
            nlp.update([text], [{"cats": {"answer": score}}])

    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_simple_train():
    """Train a one-label ``textcat`` for five passes over a handful of
    strings and check that "aaa" then scores at least 0.5 for "answer"."""
    pipeline = Language()
    pipeline.add_pipe(pipeline.create_pipe("textcat"))
    pipeline.get_pipe("textcat").add_label("answer")
    pipeline.begin_training()

    labelled_texts = (
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    )
    for _epoch in range(5):
        for text, target in labelled_texts:
            pipeline.update([text], [{"cats": {"answer": target}}])

    result = pipeline("aaa")
    assert "answer" in result.cats
    assert result.cats["answer"] >= 0.5
def test_component_decorator_assigns():
    """Check that the ``component`` decorator registers factories and that
    the ``assigns``/``requires`` metadata is visible through the pipeline
    analysis helpers, including for a second pipe created from "c1"."""
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True

    @component("c1", assigns=["token.tag", "doc.tensor"])
    def test_component1(doc):
        return doc

    @component(
        "c2",
        requires=["token.tag", "token.pos"],
        assigns=["token.lemma", "doc.tensor"],
    )
    def test_component2(doc):
        return doc

    @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
    def test_component3(doc):
        return doc

    for registered in ("c1", "c2", "c3"):
        assert registered in Language.factories

    nlp = Language()
    nlp.add_pipe(test_component1)
    # Adding c2 is expected to emit a UserWarning.
    with pytest.warns(UserWarning):
        nlp.add_pipe(test_component2)
    nlp.add_pipe(test_component3)
    tensor_writers = [
        name for name, _ in get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    ]
    assert tensor_writers == ["c1", "c2"]

    # A pipe created from the "c1" factory and added under another name.
    test_component4 = nlp.create_pipe("c1")
    assert test_component4.name == "c1"
    assert test_component4.factory == "c1"
    nlp.add_pipe(test_component4, name="c4")
    assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
    assert "c4" not in Language.factories
    assert nlp.pipe_factories["c1"] == "c1"
    assert nlp.pipe_factories["c4"] == "c1"

    tensor_writers = [
        name for name, _ in get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    ]
    assert tensor_writers == ["c1", "c2", "c4"]
    pos_readers = [
        name for name, _ in get_requires_for_attr(nlp.pipeline, "token.pos")
    ]
    assert pos_readers == ["c2"]
    assert print_summary(nlp, no_print=True)
    assert nlp("hello world")
def main(vectors_loc=None, lang=None):
    """Load word vectors into a pipeline, train a tagger on TRAIN_DATA,
    print the tags for a test sentence and save the pipeline to disk.

    vectors_loc: path of the vectors file to read. Its first line is split
        into a row count and a dimension; every following line is a word
        followed by that many space-separated floats. When None, the
        module-level VECTORS_PATH is used.
    lang: language code passed to ``spacy.blank``; when None, a bare
        ``Language()`` is used instead.
    """
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)

    # Previously vectors_loc was accepted but ignored; honor it when given
    # and keep VECTORS_PATH as the default so existing calls are unaffected.
    vectors_path = VECTORS_PATH if vectors_loc is None else vectors_loc
    with open(vectors_path, "rb") as file_:
        print("loading vectors...")
        header = file_.readline()
        _nr_row, nr_dim = header.split()
        width = int(nr_dim)  # parsed once instead of once per line
        nlp.vocab.reset_vectors(width=width)
        for line in file_:
            line = line.rstrip().decode("utf8")
            # Split from the right so a word containing spaces stays intact.
            pieces = line.rsplit(" ", width)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    print("trainning tags...")
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for _ in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    # test the trained model
    test_text = "Eu desejo ouvir uma música muito boa"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    # Save first, then report the directory that was actually written; the
    # old message was printed before saving and named a different path.
    output_dir = "/app/model"
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
def test_component_factories_from_nlp():
    """Check that a class component exposing a ``from_nlp`` classmethod has
    it called by the factory with the nlp object and the config."""

    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    # The mock stands in for from_nlp and hands back a ready-made instance.
    from_nlp_mock = Mock(return_value=TestComponent5())
    TestComponent5.from_nlp = classmethod(from_nlp_mock)
    TestComponent5 = component("c5")(TestComponent5)
    assert "c5" in Language.factories

    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("c5", config={"foo": "bar"}))
    assert nlp("hello world")
    # The class itself is passed as the first argument, so anything is
    # accepted in that position.
    from_nlp_mock.assert_called_once_with(ANY, nlp, foo="bar")
def test_pipe_function_component():
    """Check that a function registered with ``Language.component`` can be
    added by name only, and that both get_pipe() and create_pipe() return
    something equal to the registered function."""
    component_name = "test_component"

    @Language.component(component_name)
    def component(doc: Doc) -> Doc:
        return doc

    assert component_name in registry.factories

    nlp = Language()
    # Passing the callable itself instead of its name must be rejected.
    with pytest.raises(ValueError):
        nlp.add_pipe(component)
    nlp.add_pipe(component_name)

    assert component_name in nlp.pipe_names
    assert nlp.pipe_factories[component_name] == component_name
    assert Language.get_factory_meta(component_name)
    assert nlp.get_pipe_meta(component_name)
    assert nlp.get_pipe(component_name) == component
    assert nlp.create_pipe(component_name) == component
def test_pipe_class_component_init():
    """Check that class-based components work both when the class itself is
    decorated as a factory and when a separate factory function returns
    an instance of the class."""
    name1 = "test_class_component1"
    name2 = "test_class_component2"

    @Language.factory(name1)
    class Component1:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    class Component2:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @Language.factory(name2)
    def factory(nlp: Language, name=name2):
        return Component2(nlp, name)

    nlp = Language()
    for factory_name, component_cls in ((name1, Component1), (name2, Component2)):
        assert factory_name in registry.factories
        # Passing an instance instead of the factory name must be rejected.
        with pytest.raises(ValueError):
            nlp.add_pipe(component_cls(nlp, factory_name))
        nlp.add_pipe(factory_name)

        assert factory_name in nlp.pipe_names
        assert nlp.pipe_factories[factory_name] == factory_name
        assert Language.get_factory_meta(factory_name)
        assert nlp.get_pipe_meta(factory_name)

        added = nlp.get_pipe(factory_name)
        assert isinstance(added, component_cls)
        assert isinstance(added.nlp, Language)

        created = nlp.create_pipe(factory_name)
        assert isinstance(created, component_cls)
        assert isinstance(created.nlp, Language)
def test_label_types():
    """Check that ``textcat`` accepts a string label and raises ValueError
    when given the integer 9 as a label."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    textcat = nlp.get_pipe("textcat")
    textcat.add_label("answer")
    with pytest.raises(ValueError):
        # The pipe is looked up inside the block, as before.
        nlp.get_pipe("textcat").add_label(9)