import random

import numpy
import pytest
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import fix_random_seed


def test_ner_labels_added_implicitly_on_update():
    nlp = Language()
    ner = nlp.add_pipe("ner")
    for label in ["A", "B", "C"]:
        ner.add_label(label)
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"])
    example = Example(nlp.make_doc(doc.text), doc)
    assert "D" not in ner.labels
    nlp.update([example])
    assert "D" in ner.labels
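# The two transformer snippets below reference a module-level TRAIN_DATA that
# is not shown in this section. A minimal sketch of what such a fixture could
# look like (the texts and tag annotations are illustrative stand-ins, not
# the original data):
TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]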
def simple_nlp():
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    return nlp
def test_transformer_pipeline_simple():
    """Test that a simple pipeline with just a transformer at least runs."""
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    doc = nlp("We're interested at underwater basket weaving.")
    assert doc
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.

    There are two issues here:

    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
       end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]
    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)
    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
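# make_tempdir above is a spaCy test helper (available as
# spacy.util.make_tempdir in recent releases). If it is not importable, a
# minimal standard-library equivalent is:
import shutil
import tempfile
from contextlib import contextmanager
from pathlib import Path


@contextmanager
def make_tempdir():
    # Create a throwaway directory, hand it to the caller, clean up afterwards.
    path = Path(tempfile.mkdtemp())
    try:
        yield path
    finally:
        shutil.rmtree(str(path))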
def test_simple_train():
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("answer")
    nlp.initialize()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            # In spaCy v3, nlp.update expects a batch of Example objects, not
            # a raw (text, annotations) tuple.
            example = Example.from_dict(
                nlp.make_doc(text), {"cats": {"answer": answer}}
            )
            nlp.update([example])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_simple_train():
    # spaCy v2-style variant of the test above (create_pipe/begin_training API).
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_simple_train():
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    get_examples = make_get_examples(nlp)
    nlp.initialize(get_examples)
    sgd = nlp.create_optimizer()
    assert len(spancat.labels) != 0
    for i in range(40):
        losses = {}
        nlp.update(list(get_examples()), losses=losses, drop=0.1, sgd=sgd)
    doc = nlp("I like London and Berlin.")
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    assert len(doc.spans[spancat.key]) == 2
    assert doc.spans[spancat.key][0].text == "London"
    scores = nlp.evaluate(get_examples())
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0
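# SPAN_KEY and make_get_examples in the spancat test above are module-level
# fixtures defined elsewhere. A plausible sketch, assuming training data in
# the doc.spans format (the texts and character offsets are illustrative):
SPAN_KEY = "potential_entities"


def make_get_examples(nlp):
    train_data = [
        ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
        (
            "I like London and Berlin.",
            {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC")]}},
        ),
    ]

    def get_examples():
        return [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in train_data
        ]

    return get_examples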
def main(vectors_loc=None, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # Create an empty language class. This is required if you're planning
        # to save the model to disk and load it back later (models always need
        # a "lang" setting). Use "xx" for a blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, "rb") as file_:
        print("loading vectors...")
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode("utf8")
            pieces = line.rsplit(" ", int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            # Add the vector to the vocab.
            nlp.vocab.set_vector(word, vector)
    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    print("adding tags...")
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)
    # Test the trained model.
    test_text = "Eu desejo ouvir uma música muito boa"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
    nlp.to_disk("/app/model")
    print("Saved model to /app/model")
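# TAG_MAP in main() above maps each tag name to the token attributes it
# should set; TRAIN_DATA would be (text, annotations) pairs with "tags"
# annotations, like the sketch near the top of this file. An illustrative
# TAG_MAP (not the original Portuguese one):
TAG_MAP = {
    "N": {"pos": "NOUN"},
    "V": {"pos": "VERB"},
    "J": {"pos": "ADJ"},
}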
def test_example_batch(nlp: Language, example_gold):
    texts, golds = zip(*example_gold)
    optim = nlp.resume_training()
    # Pass the optimizer via the sgd keyword: in the v2-era update() signature
    # the third positional argument is the dropout rate, not the optimizer.
    nlp.update(texts, golds, sgd=optim)
def test_update(nlp: Language, label_type):
    if label_type == "irex":
        pytest.skip("label type mismatch")
    optim = nlp.resume_training()
    nlp.update(*zip(*TESTCASE_ENE), sgd=optim)
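# The two tests above rely on pytest fixtures (nlp, example_gold, label_type)
# and a module-level TESTCASE_ENE that are defined elsewhere. Schematically,
# each training pair fed to nlp.update looks like this (the text, offsets,
# and label are illustrative placeholders):
TESTCASE_ENE = [
    ("East Japan Railway announced a new plan.", {"entities": [(0, 18, "Company")]}),
]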