def test_aligned_tags():
    """Gold tags align onto a differently tokenized prediction, also after to_dict."""
    predicted_tokens = ["Apply", "some", "sunscreen", "unless", "you", "can", "not"]
    reference_tokens = ["Apply", "some", "sun", "screen", "unless", "you", "cannot"]
    reference_tags = ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"]
    expected = ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]

    predicted = Doc(Vocab(), words=predicted_tokens)
    first = Example.from_dict(
        predicted, {"words": reference_tokens, "tags": reference_tags}
    )
    assert first.get_aligned("TAG", as_string=True) == expected

    # Rebuilding the example from its own to_dict() output must align the same way.
    second = Example.from_dict(predicted, first.to_dict())
    assert second.get_aligned("TAG", as_string=True) == expected
def test_transformer_pipeline_textcat():
    """Check that a transformer+textcat pipeline trains and survives a disk round trip.

    This used to throw an error because of shape inference issues - cf
    https://github.com/explosion/spaCy/issues/6401
    """
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "textcat"]

    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(2):
        nlp.update(train_examples, sgd=optimizer, losses={})

    sample = "We're interested at underwater basket weaving."
    cats_before = nlp(sample).cats

    # Save, reload, and compare the predicted categories.
    with make_tempdir() as tmp_dir:
        model_dir = tmp_dir / "trained_nlp"
        nlp.to_disk(model_dir)
        reloaded = spacy.load(model_dir)
        cats_after = reloaded(sample).cats
        assert cats_before == cats_after
def train_spacy(data, epochs, batch_size=8):
    """Train a blank English pipeline containing only an NER component.

    Args:
        data: Iterable of ``(text, annotations)`` pairs. Each ``annotations``
            dict may carry an ``'entities'`` list whose items hold the label
            at index 2.
        epochs: Number of passes over the training data.
        batch_size: Number of examples passed to each ``nlp.update`` call.

    Returns:
        The trained ``nlp`` object.
    """
    # Copy so that shuffling below does not reorder the caller's list.
    training_data = list(data)
    nlp = spacy.blank('en')
    # Always bind `ner`, whether the pipe has to be created or already exists.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.add_pipe('ner', last=True)

    for _, annotations in training_data:
        # A missing or None 'entities' entry contributes no labels.
        for ent in annotations.get('entities') or ():
            ner.add_label(ent[2])

    # select_pipes / initialize replace the deprecated disable_pipes /
    # begin_training spellings.
    with nlp.select_pipes(enable='ner'):
        optimizer = nlp.initialize()
        for epoch in range(epochs):
            print('Starting epoch: ' + str(epoch))
            random.shuffle(training_data)
            losses = {}
            for batch in minibatch(training_data, size=batch_size):
                # One update per batch, so batch_size actually takes effect.
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(
                    examples,
                    drop=0.2,  # Prevent overfitting
                    sgd=optimizer,
                    losses=losses,
                )
            print(losses)
    return nlp
def train_spacy(data, iterations):
    """Train a blank German pipeline containing only an NER component.

    Args:
        data: Iterable of ``(text, annotations)`` pairs.
        iterations: Number of passes over the training data.

    Returns:
        The trained ``nlp`` object.
    """
    nlp = spacy.blank('de')  # create blank Language class
    if 'ner' not in nlp.pipe_names:
        nlp.add_pipe("ner", last=True)

    # Build the examples once. The original rebuilt the full list inside the
    # per-item loop and then updated on all of it, i.e. len(data) full-data
    # updates per iteration.
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in data
    ]

    # Only train NER; on a blank pipeline there is nothing else to disable.
    with nlp.select_pipes(enable="ner"):
        # Passing the examples lets the component infer its labels from the
        # data; no label was ever registered before.
        optimizer = nlp.initialize(lambda: examples)
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            # Shuffle the local list, leaving the caller's data untouched.
            random.shuffle(examples)
            losses = {}
            for example in examples:
                nlp.update(
                    [example],
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,
                    losses=losses,
                )
            print("Losses", losses)
    return nlp
def test_Example_from_dict_with_parse(annots):
    """Dependency labels and head indices from the annotations reach the reference doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    for token in reference:
        position = token.i
        assert token.dep_ == annots["deps"][position]
        assert token.head.i == annots["heads"][position]
def test_Example_from_dict_with_sent_start(annots):
    """Sentence starts from the annotations yield two sentences with matching flags."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    sentences = list(reference.sents)
    assert len(sentences) == 2
    for token in reference:
        # Compare truthiness only: both sides are coerced to bool.
        gold_start = annots["sent_starts"][token.i]
        assert bool(token.is_sent_start) == bool(gold_start)
def test_Example_from_dict_sentences():
    """Sentence boundaries given via "sent_starts" are reflected in the reference doc."""
    vocab = Vocab()

    two_sents = Example.from_dict(
        Doc(vocab, words=["One", "sentence", ".", "one", "more"]),
        {"sent_starts": [1, 0, 0, 1, 0]},
    )
    assert len(list(two_sents.reference.sents)) == 2

    # NOTE: the same check with words ["One", "sentence", "not", "one", "more"]
    # and {"sent_starts": [1, 0, 0, 0, 0]} (expecting 1 sentence) currently
    # throws an error - bug or feature?

    one_sent = Example.from_dict(
        Doc(vocab, words=["One", "sentence", "not", "one", "more"]),
        {"sent_starts": [1, -1, 0, 0, 0]},
    )
    assert len(list(one_sent.reference.sents)) == 1
def test_Example_missing_deps():
    """Heads are only applied to the reference doc when deps are given as well."""
    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 2, 2, 1]
    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]
    predicted = Doc(Vocab(), words=words)

    # Heads without deps count as missing: every token is its own head.
    heads_only = Example.from_dict(predicted, {"words": words, "heads": heads})
    self_heads = [token.head.i for token in heads_only.reference]
    assert self_heads == [0, 1, 2, 3, 4, 5]

    # With deps supplied, the given heads are used.
    heads_and_deps = Example.from_dict(
        predicted, {"words": words, "heads": heads, "deps": deps}
    )
    used_heads = [token.head.i for token in heads_and_deps.reference]
    assert used_heads == heads
def test_Example_from_dict_with_tags(pred_words, annots):
    """Tags are set on the reference doc and align to "NN" for every predicted token."""
    predicted = Doc(Vocab(), words=pred_words)
    example = Example.from_dict(predicted, annots)
    for token in example.reference:
        assert token.tag_ == annots["tags"][token.i]
    expected_aligned = ["NN"] * len(predicted)
    assert example.get_aligned("TAG", as_string=True) == expected_aligned
def test_Example_from_dict_with_cats(annots):
    """Category scores from the annotations are stored on the reference doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    cats = Example.from_dict(predicted, annots).reference.cats
    assert len(list(cats)) == 3
    for label, score in (("cat1", 1.0), ("cat2", 0.0), ("cat3", 0.5)):
        assert cats[label] == score
def test_transformer_pipeline_tagger_senter_listener():
    """Test that a pipeline with just a transformer+tagger+senter runs and
    trains properly, and that its output survives disk and bytes round trips"""
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "tagger", "senter"]

    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    listener = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(listener, TransformerListener)
    assert listener.upstream_name == "custom_upstream"

    train_examples = []
    for item in TRAIN_DATA:
        raw_text, annotations = item[0], item[1]
        train_examples.append(Example.from_dict(nlp.make_doc(raw_text), annotations))
        for tag in annotations["tags"]:
            tagger.add_label(tag)

    # The transformer only picks up its listeners during initialization.
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert listener in transformer.listeners

    for _ in range(2):
        nlp.update(train_examples, sgd=optimizer, losses={})

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = listener.predict([doc])
    _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)

    # Disk round trip.
    with make_tempdir() as tmp_dir:
        first_dir = tmp_dir / "trained_nlp"
        nlp.to_disk(first_dir)
        nlp_disk = util.load_model_from_path(first_dir)
        doc_disk = nlp_disk(text)
        listener_disk = nlp_disk.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
        tensor_disk = listener_disk.predict([doc_disk])
        _assert_equal_tensors(tensor_disk[0].tensors, doc_tensor[0].tensors)

        # The reloaded pipeline must be saveable to a directory once more.
        second_dir = tmp_dir / "trained_nlp_2"
        nlp_disk.to_disk(second_dir)

    # Bytes round trip.
    payload = nlp.to_bytes()
    nlp_bytes = util.load_model_from_config(config, auto_fill=True, validate=True)
    nlp_bytes.from_bytes(payload)
    doc_bytes = nlp_bytes(text)
    listener_bytes = nlp_bytes.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
    tensor_bytes = listener_bytes.predict([doc_bytes])
    _assert_equal_tensors(tensor_bytes[0].tensors, doc_tensor[0].tensors)
def test_Example_from_dict_with_empty_entities():
    """An empty entity list marks all tokens "O"; None or a missing key leaves them unset."""
    words = ["I", "like", "New", "York", "and", "Berlin", "."]
    annots = {"words": words, "entities": []}
    predicted = Doc(Vocab(), words=words)

    # Explicit empty list: every token is annotated as outside an entity.
    reference = Example.from_dict(predicted, annots).reference
    assert reference.has_annotation("ENT_IOB")
    assert len(list(reference.ents)) == 0
    assert all(token.ent_iob_ == "O" for token in reference)

    # None: the entity annotation stays unset.
    annots["entities"] = None
    reference = Example.from_dict(predicted, annots).reference
    assert not reference.has_annotation("ENT_IOB")

    # Key absent: the entity annotation stays unset as well.
    del annots["entities"]
    reference = Example.from_dict(predicted, annots).reference
    assert not reference.has_annotation("ENT_IOB")
def test_aligned_tags_multi():
    """Predicted tokens that straddle several gold tokens get no aligned tag."""
    predicted_tokens = ["Applysome", "sunscreen", "unless", "you", "can", "not"]
    reference_tokens = ["Apply", "somesun", "screen", "unless", "you", "cannot"]
    reference_tags = ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"]

    predicted = Doc(Vocab(), words=predicted_tokens)
    example = Example.from_dict(
        predicted, {"words": reference_tokens, "tags": reference_tags}
    )
    expected = [None, None, "SCONJ", "PRON", "VERB", "VERB"]
    assert example.get_aligned("TAG", as_string=True) == expected
def test_Example_from_dict_with_spans_overlapping(annots):
    """Span groups are populated with the right labels while doc.ents stays empty."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    assert len(list(reference.ents)) == 0

    span_groups = reference.spans
    assert len(list(span_groups["cities"])) == 3
    assert len(list(span_groups["people"])) == 1
    assert all(span.label_ == "LOC" for span in span_groups["cities"])
    assert all(span.label_ == "PERSON" for span in span_groups["people"])
def test_Example_from_dict_with_links(annots):
    """KB identifiers from the annotations are set on the linked tokens only."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    expected_kb_ids = ["", "", "Q60", "Q60", "", "Q64", ""]
    for position, kb_id in enumerate(expected_kb_ids):
        assert reference[position].ent_kb_id_ == kb_id
def train_func(model, train_data, optimizer, batch_size=8):
    """Run one pass over ``train_data`` and return the accumulated losses.

    ``train_data`` holds ``(text, labels)`` pairs. It is shuffled in place
    after seeding ``random`` with 1. The model is updated once per example.
    """
    random.seed(1)
    random.shuffle(train_data)
    losses = {}
    for batch in minibatch(train_data, size=batch_size):
        # Lazy generator: each example is built right before its update.
        batch_examples = (
            Example.from_dict(model.make_doc(text), labels) for text, labels in batch
        )
        for example in batch_examples:
            model.update([example], sgd=optimizer, losses=losses)
    return losses
def test_Example_from_dict_with_sent_start(annots):
    """Ternary sentence-start values map to True (1), None (0) or False (otherwise)."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    assert len(list(reference.sents)) == 2
    for token in reference:
        ternary = to_ternary_int(annots["sent_starts"][token.i])
        if ternary == 1:
            expected = True
        elif ternary == 0:
            expected = None
        else:
            expected = False
        assert token.is_sent_start is expected
def test_Example_from_dict_with_entities(annots):
    """Entity annotations produce the expected IOB codes and entity types."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    reference = example.reference
    assert len(list(reference.ents)) == 2

    iob_strings = [reference[position].ent_iob_ for position in range(7)]
    assert iob_strings == ["O", "O", "B", "I", "O", "B", "O"]
    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]

    for position in (2, 3, 5):
        assert reference[position].ent_type_ == "LOC"
def test_transformer_pipeline_empty():
    """Test that training and prediction cope with empty documents"""
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")

    train_examples = []
    for item in TRAIN_DATA:
        raw_text, annotations = item[0], item[1]
        train_examples.append(Example.from_dict(nlp.make_doc(raw_text), annotations))
        for tag in annotations["tags"]:
            tagger.add_label(tag)

    # Train on regular data, on a lone empty doc, then on both together.
    optimizer = nlp.initialize()
    losses = {}
    empty_example = Example.from_dict(nlp.make_doc(""), {})
    nlp.update(train_examples, sgd=optimizer, losses=losses)
    nlp.update([empty_example], sgd=optimizer, losses=losses)
    train_examples.append(empty_example)
    nlp.update(train_examples, sgd=optimizer, losses=losses)

    # Predict on empty input, singly and through pipe().
    _assert_empty(nlp("")._.trf_data)
    for empty_doc in nlp.pipe(["", ""]):
        _assert_empty(empty_doc._.trf_data)
    nlp.pipe([])

    # Predict on a mix of empty and non-empty texts.
    sentence = "This is a sentence"
    normal_tags = [token.tag_ for token in nlp(sentence)]
    mixed = list(nlp.pipe(["", sentence, "", ""]))
    _assert_empty(mixed[0]._.trf_data)
    assert [token.tag_ for token in mixed[0]] == []
    assert [token.tag_ for token in mixed[1]] == normal_tags
    _assert_empty(mixed[2]._.trf_data)
    _assert_empty(mixed[3]._.trf_data)
def test_replace_listeners():
    """Replacing the tagger's listener with its own transformer keeps predictions intact."""
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    text = "This is awesome"
    examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
    optimizer = nlp.initialize(lambda: examples)

    # Starting point: the tagger listens to the shared transformer.
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    tagger_listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(tagger_listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == tagger_listener
    assert isinstance(transformer.model, TransformerModel)
    transformer_arch = nlp.config["components"]["transformer"]["model"]["@architectures"]
    assert transformer_arch == "spacy-transformers.TransformerModel.v2"
    listener_arch = nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
    assert listener_arch == "spacy-transformers.TransformerListener.v1"

    # Train a little before swapping out the listener.
    for _ in range(2):
        nlp.update(examples, sgd=optimizer, losses={})
    doc = nlp(text)
    preds = [token.tag_ for token in doc]
    doc_tensor = tagger_tok2vec.predict([doc])

    # Swap the listener for a standalone copy; predictions must not change.
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert isinstance(tagger_tok2vec, Model)
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    replaced_arch = nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
    assert replaced_arch == "spacy-transformers.Tok2VecTransformer.v2"
    doc2 = nlp(text)
    assert preds == [token.tag_ for token in doc2]
    assert_equal(doc_tensor, tagger_tok2vec.predict([doc2]))

    # The modified pipeline must still be trainable.
    optimizer = nlp.resume_training()
    for _ in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tagger"] > 0.0

    # Additional fields must be present in model_output.
    model_output = doc2._.trf_data.model_output
    assert model_output.pooler_output is not None
    assert model_output.attentions is not None
def simple_nlp():
    """Return a transformer-only pipeline after two update steps on TRAIN_DATA."""
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = [
        Example.from_dict(nlp.make_doc(item[0]), item[1]) for item in TRAIN_DATA
    ]
    optimizer = nlp.initialize()
    for _ in range(2):
        nlp.update(train_examples, sgd=optimizer, losses={})
    return nlp
def train(model, train_data, optimizer, batch_size=8):
    """Run one pass over ``train_data`` and return the accumulated losses.

    Args:
        model: Pipeline object providing ``make_doc`` and ``update``.
        train_data: List of tuples ``[(text0, label0), (text1, label1), ...]``;
            shuffled in place.
        optimizer: Optimizer passed to ``model.update`` as ``sgd``.
        batch_size: Size handed to ``minibatch``.

    Returns:
        The ``losses`` dict filled in by ``model.update``.
    """
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    for batch in minibatch(train_data, size=batch_size):
        for text, labels in batch:
            # Tokenize with the model being trained. The original used a
            # module-level `nlp` that is not a parameter of this function.
            doc = model.make_doc(text)
            example = Example.from_dict(doc, labels)
            model.update([example], sgd=optimizer, losses=losses)
    return losses
def test_transformer_pipeline_simple():
    """Test that a pipeline holding only a transformer can be trained and called"""
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = [
        Example.from_dict(nlp.make_doc(item[0]), item[1]) for item in TRAIN_DATA
    ]
    optimizer = nlp.initialize()
    for _ in range(2):
        nlp.update(train_examples, sgd=optimizer, losses={})
    doc = nlp("We're interested at underwater basket weaving.")
    assert doc
def test_transformer_pipeline_tagger_internal():
    """Test that a tagger with an internal transformer trains, and that only
    from_disk restores the trained tensors"""
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tagger"]
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(tagger_trf, Model)

    train_examples = []
    for item in TRAIN_DATA:
        raw_text, annotations = item[0], item[1]
        train_examples.append(Example.from_dict(nlp.make_doc(raw_text), annotations))
        for tag in annotations["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for _ in range(2):
        nlp.update(train_examples, sgd=optimizer, losses={})

    text = "We're interested at underwater basket weaving."
    doc_tensor = tagger_trf.predict([nlp(text)])
    trained_tensors = doc_tensor.doc_data[0].tensors

    with make_tempdir() as tmp_dir:
        model_dir = tmp_dir / "trained_nlp"
        nlp.to_disk(model_dir)

        # A freshly initialized pipeline does not reproduce the tensors.
        nlp2 = util.load_model_from_config(config, auto_fill=True, validate=True)
        nlp2.initialize(lambda: train_examples)
        fresh_doc = nlp2(text)
        fresh_trf = nlp2.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
        fresh_tensor = fresh_trf.predict([fresh_doc])
        with pytest.raises(AssertionError):
            assert_equal(fresh_tensor.doc_data[0].tensors, trained_tensors)

        # After from_disk the tensors ARE the same.
        nlp2.from_disk(model_dir)
        loaded_doc = nlp2(text)
        loaded_trf = nlp2.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
        loaded_tensor = loaded_trf.predict([loaded_doc])
        assert_equal(loaded_tensor.doc_data[0].tensors, trained_tensors)
def test_replace_listeners_invalid():
    """replace_listeners raises ValueError for each invalid argument combination below."""
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    examples = [
        Example.from_dict(nlp.make_doc("This is awesome"), {"tags": ["A", "B", "C"]})
    ]
    optimizer = nlp.initialize(lambda: examples)
    for _ in range(2):
        nlp.update(examples, sgd=optimizer, losses={})

    invalid_calls = [
        ("invalid", "tagger", ["model.tok2vec"]),
        ("transformer", "parser", ["model.tok2vec"]),
        ("transformer", "tagger", ["model.yolo"]),
        ("transformer", "tagger", ["model.tok2vec", "model.yolo"]),
    ]
    for tok2vec_name, pipe_name, listeners in invalid_calls:
        with pytest.raises(ValueError):
            nlp.replace_listeners(tok2vec_name, pipe_name, listeners)
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains and that its output
    survives disk and bytes round trips"""
    config = Config().from_str(cfg_string)
    config["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]

    train_examples = []
    for item in TRAIN_DATA:
        raw_text, annotations = item[0], item[1]
        train_examples.append(Example.from_dict(nlp.make_doc(raw_text), annotations))
        for tag in annotations["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for _ in range(2):
        nlp.update(train_examples, sgd=optimizer, losses={})

    text = "We're interested at underwater basket weaving."
    doc_tensor = tagger_trf.predict([nlp(text)])

    # Disk round trip.
    with make_tempdir() as tmp_dir:
        first_dir = tmp_dir / "trained_nlp"
        nlp.to_disk(first_dir)
        nlp_disk = util.load_model_from_path(first_dir)
        doc_disk = nlp_disk(text)
        trf_disk = nlp_disk.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
        tensor_disk = trf_disk.predict([doc_disk])
        _assert_equal_tensors(tensor_disk[0].tensors, doc_tensor[0].tensors)

        # The reloaded pipeline must be saveable to a directory once more.
        second_dir = tmp_dir / "trained_nlp_2"
        nlp_disk.to_disk(second_dir)

    # Bytes round trip.
    payload = nlp.to_bytes()
    nlp_bytes = util.load_model_from_config(config, auto_fill=True, validate=True)
    nlp_bytes.from_bytes(payload)
    doc_bytes = nlp_bytes(text)
    trf_bytes = nlp_bytes.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
    tensor_bytes = trf_bytes.predict([doc_bytes])
    _assert_equal_tensors(tensor_bytes[0].tensors, doc_tensor[0].tensors)
def test_Example_missing_heads():
    """A token without head/dep keeps the other heads intact and adds no sentence start."""
    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, None, 2, 1]
    deps = ["nsubj", "ROOT", "dobj", None, "conj", "punct"]
    predicted = Doc(Vocab(), words=words)
    example = Example.from_dict(
        predicted, {"words": words, "heads": heads, "deps": deps}
    )

    parsed_heads = [token.head.i for token in example.reference]
    # Position 3 carries no gold head, so it is left out of the comparison.
    for position in (0, 1, 2, 4, 5):
        assert parsed_heads[position] == heads[position]

    has_head = [token.has_head() for token in example.reference]
    assert has_head == [True, True, True, False, True, True]

    # Ensure that the missing head doesn't create an artificial new sentence start
    sent_starts = example.get_aligned_sent_starts()
    assert sent_starts == [True, False, False, False, False, False]
def train_spacy(TRAINING_DATA, epochs):
    """Add a 'CAMP' NER component to the saved pipeline and train only that component.

    Args:
        TRAINING_DATA: Iterable of ``(text, annotations)`` pairs.
        epochs: Number of passes over the training data.

    Returns:
        The pipeline loaded from ``./holocaust_model`` with the trained
        ``ner_conc_camp`` component added.
    """
    # Copy so that shuffling below does not reorder the caller's list.
    training_data = list(TRAINING_DATA)
    nlp = spacy.load('./holocaust_model')
    ner = nlp.add_pipe('ner', name='ner_conc_camp')
    ner.add_label('CAMP')

    # Disable every other pipe so they are not affected by training.
    # select_pipes / initialize replace the deprecated disable_pipes /
    # begin_training spellings.
    with nlp.select_pipes(enable='ner_conc_camp'):
        optimizer = nlp.initialize()
        for epoch in range(epochs):
            print(f'Starting epoch: {epoch}')
            random.shuffle(training_data)
            losses = {}
            for batch in minibatch(training_data, size=8):
                # Train on the current batch only. The original inner loop
                # walked the whole data set again for every batch.
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
            print(losses)
    return nlp
train_data.append((text, categories)) #%% nlp = spacy.blank('en') config = {"threshold": 0.8} nlp.add_pipe('textcat_multilabel', config=config) textcat = nlp.get_pipe("textcat_multilabel") labels = set(" ".join(df_sorted['genre']).replace(",", "").split()) for key in labeldict.keys(): textcat.add_label(key) other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel'] # Only train the textcat pipe with nlp.disable_pipes(*other_pipes): optimizer = nlp.initialize() for i in range(1): losses = {} for batch in minibatch(train_data, size=compounding(2., 8., 1.5)): examples = [] for text, annotations in batch: doc = nlp.make_doc(str(text)) examples.append(Example.from_dict(doc, annotations)) nlp.update(examples, sgd=optimizer, losses=losses, drop=0.2) #print('{0:.3f}'.format(losses['textcat'])) docs = list(nlp.pipe(texts_stemmed[:500])) preds = textcat.predict(docs)
# get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] ### START start = datetime.datetime.now() # train NER Model with nlp.disable_pipes(*other_pipes): # only train NER optimizer = nlp.initialize() for itn in range(iterations): print("Iteration Number:" + str(itn)) random.shuffle(a) losses = {} for text, annotations in a: doc = nlp.make_doc(text) example = Example.from_dict(doc, annotations) nlp.update([example], sgd=optimizer, losses=losses, drop=0.2) # drop=next(dropout, -1) print("losses", losses) nlp.to_disk('nlp_ner_model') nlp_model = spacy.load('nlp_ner_model') # Pickling nlp model pickle.dump(nlp_model, open('ner_model.pkl', 'wb')) # Pickling test data pickle.dump(test_data, open('testFile.pkl', 'wb')) #### END print(