def test_issue1915():
    """Check that begin_training() rejects the ``hidden_depth`` setting.

    A pipeline holding a single "ner" pipe with the label "answer" is built,
    and calling ``begin_training(hidden_depth=2)`` on it is expected to raise
    ``ValueError``.
    """
    pipeline = Language()
    pipeline.add_pipe(pipeline.create_pipe("ner"))
    ner = pipeline.get_pipe("ner")
    ner.add_label("answer")
    bad_settings = dict(hidden_depth=2)  # should error out
    with pytest.raises(ValueError):
        pipeline.begin_training(**bad_settings)
def test_issue1915():
    """Regression test: ``begin_training(hidden_depth=2)`` must raise ValueError.

    The pipeline under test contains one "ner" pipe to which the label
    "answer" has been added.
    """
    # Settings that begin_training() is expected to refuse.
    overrides = {"hidden_depth": 2}

    model = Language()
    ner_pipe = model.create_pipe("ner")
    model.add_pipe(ner_pipe)
    model.get_pipe("ner").add_label("answer")

    with pytest.raises(ValueError):
        model.begin_training(**overrides)
def nlp():
    """Return a Language with a "textcat" pipe labelled POSITIVE and NEGATIVE.

    The pipeline is created on a fresh ``Vocab`` and ``begin_training()`` is
    called on it before it is returned.
    """
    pipeline = Language(Vocab())
    textcat = pipeline.create_pipe("textcat")
    # Labels are added before the pipe joins the pipeline, in this order.
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
    pipeline.add_pipe(textcat)
    pipeline.begin_training()
    return pipeline
def test_tagger_begin_training_tag_map():
    """Tagger.begin_training() without gold tuples must keep the tag map intact.

    A tag "A" mapped to ``{"POS": "NOUN"}`` is added before training starts.
    Afterwards the vocab's tag map must still contain it, and the tagger must
    expose exactly one more label than it started with.
    """
    pipeline = Language()
    tagger = pipeline.create_pipe("tagger")
    labels_before = len(tagger.labels)

    tagger.add_label("A", {"POS": "NOUN"})
    pipeline.add_pipe(tagger)
    pipeline.begin_training()

    tag_map = pipeline.vocab.morphology.tag_map
    assert tag_map["A"] == {POS: NOUN}

    labels_after = len(pipeline.get_pipe("tagger").labels)
    assert labels_before + 1 == labels_after
def test_ner_warns_no_lookups():
    """Check when begin_training() warns about lookups for an NER pipeline.

    With an empty ``Lookups`` on the vocab, ``nlp.begin_training()`` is
    expected to emit a ``UserWarning``. Once a "lexeme_norm" table with at
    least one entry exists, a second call must not emit any warning.
    """
    # Local import keeps this block self-contained; the file's import
    # section is not visible from here.
    import warnings

    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()

    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"

    # pytest.warns(None) is deprecated (pytest 7) and rejected (pytest 8);
    # record warnings with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        # "always" ensures nothing is hidden by the once-per-location filter.
        warnings.simplefilter("always")
        nlp.begin_training()
    assert not record, [str(w.message) for w in record]
def test_tagger_warns_no_lemma_lookups():
    """Check when begin_training() warns about lemma lookups for a tagger.

    With an empty ``Lookups`` on the vocab, a ``UserWarning`` is expected
    from both ``tagger.begin_training()`` and ``nlp.begin_training()``. After
    a "lemma_lookup" table has been added, ``nlp.begin_training()`` must not
    emit any warning.
    """
    # Local import keeps this block self-contained; the file's import
    # section is not visible from here.
    import warnings

    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    # The pipe is trained on its own first, before it joins the pipeline.
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()

    nlp.vocab.lookups.add_table("lemma_lookup")

    # pytest.warns(None) is deprecated (pytest 7) and rejected (pytest 8);
    # record warnings with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        # "always" ensures nothing is hidden by the once-per-location filter.
        warnings.simplefilter("always")
        nlp.begin_training()
    assert not record, [str(w.message) for w in record]
def test_issue999(train_data):
    """Train an NER pipe, round-trip it through disk and check its entities.

    The pipeline is trained for 100 shuffled passes over a small inline data
    set, saved to a temporary directory and reloaded. For every text that has
    annotated entities, at least one annotated span must be predicted, and
    the first such span must carry the annotated label.

    Notes kept from the original test:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
       end up out-of-scale, causing it to learn too fast.
    """
    # NOTE: the ``train_data`` argument is not read; the inline data is used.
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _text, spans in TRAIN_DATA:
        for span in spans:
            ner.add_label(span[2])
    nlp.begin_training()
    ner.model.learn_rate = 0.001

    for _epoch in range(100):
        random.shuffle(TRAIN_DATA)
        for text, spans in TRAIN_DATA:
            gold = {"entities": spans}
            nlp.update([text], [gold])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for text, spans in TRAIN_DATA:
        doc = nlp2(text)
        predicted = {}
        for ent in doc.ents:
            predicted[(ent.start_char, ent.end_char)] = ent.label_
        # First annotated span whose character offsets were predicted.
        hit = next((s for s in spans if (s[0], s[1]) in predicted), None)
        if hit is not None:
            assert predicted[(hit[0], hit[1])] == hit[2]
        elif spans:
            # Annotations exist but none of them was found.
            raise Exception(predicted)
def test_simple_train():
    """Train a one-label text classifier briefly and check a prediction.

    The "textcat" pipe gets the label "answer" and is updated for five passes
    over five tiny examples. The text "aaa" must then have an "answer"
    category scored at 0.5 or higher.
    """
    examples = (
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    )

    nlp = Language()
    textcat = nlp.create_pipe("textcat")
    nlp.add_pipe(textcat)
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()

    for _epoch in range(5):
        for text, score in examples:
            gold = {"cats": {"answer": score}}
            nlp.update([text], [gold])

    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_simple_train():
    """Smoke-test text classifier training on a handful of examples.

    After five passes over the examples, the text "aaa" must receive an
    "answer" category whose score is at least 0.5.
    """
    pipeline = Language()
    pipeline.add_pipe(pipeline.create_pipe("textcat"))
    classifier = pipeline.get_pipe("textcat")
    classifier.add_label("answer")
    pipeline.begin_training()

    # (text, score for the "answer" category)
    samples = [
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    ]
    passes_left = 5
    while passes_left > 0:
        for text, answer in samples:
            pipeline.update([text], [{"cats": {"answer": answer}}])
        passes_left -= 1

    doc = pipeline("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_issue999(train_data):
    """Check that a trained NER model still finds its entities after reloading.

    An "ner" pipe is trained for 100 shuffled passes over inline examples,
    written to a temporary directory and loaded into a fresh ``Language``.
    Each text with annotated entities must yield at least one annotated span,
    and the first span found must have the annotated label.

    Notes kept from the original test:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
       end up out-of-scale, causing it to learn too fast.
    """
    # NOTE: the ``train_data`` argument is not read; the inline data is used.
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for example in TRAIN_DATA:
        for _start, _end, label in example[1]:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001

    for _iteration in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            annotations = {"entities": entity_offsets}
            nlp.update([raw_text], [annotations])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        found = dict(((ent.start_char, ent.end_char), ent.label_) for ent in doc.ents)
        matched = False
        for start, end, label in entity_offsets:
            if (start, end) not in found:
                continue
            assert found[(start, end)] == label
            matched = True
            break
        # Fail only when annotations exist and none of them was predicted.
        if entity_offsets and not matched:
            raise Exception(found)
def main(vectors_loc=None, lang=None):
    """Load word vectors, train a tagger on TRAIN_DATA and save the model.

    Args:
        vectors_loc: Path to the vectors file. Its first line is a header
            whose second field is the vector width; every following line is
            a word followed by that many space-separated floats. When None,
            the module-level ``VECTORS_PATH`` is used.
        lang: Language code passed to ``spacy.blank``. When None, a bare
            ``Language`` is created instead.

    The tagger is given every tag in ``TAG_MAP``, trained for 20 shuffled
    passes over ``TRAIN_DATA``, tried on a sample sentence and written to
    "/app/model".
    """
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)

    # Honour the caller's path; it used to be ignored in favour of the global.
    vectors_path = VECTORS_PATH if vectors_loc is None else vectors_loc
    with open(vectors_path, "rb") as file_:
        print("loading vectors...")
        header = file_.readline()
        _, nr_dim = header.split()
        width = int(nr_dim)
        nlp.vocab.reset_vectors(width=width)
        for line in file_:
            line = line.rstrip().decode("utf8")
            # Split from the right so a word containing spaces stays whole.
            pieces = line.rsplit(" ", width)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    print("trainning tags...")
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for _ in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    # test the trained model
    test_text = "Eu desejo ouvir uma música muito boa"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    # Report success only after the model is written, naming the real path.
    output_dir = "/app/model"
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)