Example #1
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR,
                     verbose=False,
                     textcat=None,
                     vectorError=False):
    """
    Loads a spacy model.
    
    OBS vectorError is a TEMP ugly work around error encounted by keeping two models an not been able to find referece name for vectros
    """
    from spacy.util import load_model_from_path

    if textcat is None or vectorError:
        modelname = 'spacy'

        model_weight_path = download_model(modelname,
                                           cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)
        nlp = load_model_from_path(model_weight_path)

    if textcat == 'sentiment':
        modelname = 'spacy.sentiment'

        model_weight_path = download_model(modelname,
                                           cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)
        # quick fix for misaligned model storage:
        import os
        model_weight_path = os.path.join(model_weight_path, 'spacy.sentiment')

        nlp = load_model_from_path(model_weight_path)

    return nlp
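A minimal usage sketch for the loader above (hedged: download_model, _unzip_process_func and DEFAULT_CACHE_DIR come from the surrounding package and are assumed to be importable):

nlp = load_spacy_model()                          # base spaCy model
nlp_sent = load_spacy_model(textcat='sentiment')  # sentiment-analysis variant
doc = nlp_sent("Some text to classify.")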
Example #2
def test_lemmatizer_serialize(nlp):
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
    nlp.initialize()

    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope", )}})
        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
        return lookups

    nlp2 = English()
    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
    lemmatizer2.initialize(lookups=cope_lookups())
    lemmatizer2.from_bytes(lemmatizer.to_bytes())
    assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
    assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
    doc2 = nlp2.make_doc("coping")
    doc2[0].pos_ = "VERB"
    assert doc2[0].lemma_ == ""
    doc2 = lemmatizer2(doc2)
    assert doc2[0].text == "coping"
    assert doc2[0].lemma_ == "cope"

    # Make sure that lemmatizer cache can be pickled
    b = pickle.dumps(lemmatizer2)
Example #3
def test_transformer_pipeline_textcat():
    """Test that a pipeline with just a transformer+textcat runs and trains properly.
    This used to throw an error because of shape inference issues -
    cf https://github.com/explosion/spaCy/issues/6401"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "textcat"]
    train_examples = []

    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("We're interested at underwater basket weaving.")
    cats1 = doc.cats

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        cats2 = doc2.cats
        assert cats1 == cats2
Example #4
def generate_meta(model_path, existing_meta):
    meta = existing_meta or {}
    settings = [('lang', 'Model language', meta.get('lang', 'en')),
                ('name', 'Model name', meta.get('name', 'model')),
                ('version', 'Model version', meta.get('version', '0.0.0')),
                ('spacy_version', 'Required spaCy version',
                 '>=%s,<3.0.0' % about.__version__),
                ('description', 'Model description',
                 meta.get('description', False)),
                ('author', 'Author', meta.get('author', False)),
                ('email', 'Author email', meta.get('email', False)),
                ('url', 'Author website', meta.get('url', False)),
                ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
    nlp = util.load_model_from_path(Path(model_path))
    meta['pipeline'] = nlp.pipe_names
    meta['vectors'] = {
        'width': nlp.vocab.vectors_length,
        'vectors': len(nlp.vocab.vectors),
        'keys': nlp.vocab.vectors.n_keys
    }
    prints(Messages.M047, title=Messages.M046)
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
    if about.__title__ != 'spacy':
        meta['parent_package'] = about.__title__
    return meta
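generate_meta only builds and returns the dict; a hedged follow-up sketch, assuming the caller persists it as the package's meta.json with the standard json module (the path is a placeholder):

import json
meta = generate_meta('/path/to/model', existing_meta=None)
with open('/path/to/model/meta.json', 'w', encoding='utf8') as f:
    json.dump(meta, f, indent=2)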
Example #5
def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert tagger.model.get_dim("nO") == len(TAGS)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] < 0.00001

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].tag_ == "N"
        assert doc2[1].tag_ == "V"
        assert doc2[2].tag_ == "J"
        assert doc2[3].tag_ == "N"

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "I like green eggs.",
        "Here is another one.",
        "I eat ham.",
    ]
    batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([TAG]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # Try to unlearn the first 'N' tag with negative annotation
    neg_ex = Example.from_dict(nlp.make_doc(test_text),
                               {"tags": ["!N", "V", "J", "N"]})

    for i in range(20):
        losses = {}
        nlp.update([neg_ex], sgd=optimizer, losses=losses)

    # test the "untrained" tag
    doc3 = nlp(test_text)
    assert doc3[0].tag_ != "N"
Example #6
def test_overfitting_IO(use_upper):
    # Simple test to try and quickly overfit the NER component
    nlp = English()
    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.00001

    # test the trained model
    test_text = "I like London."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "London"
        assert ents2[0].label_ == "LOC"
        # Ensure that the predictions are still the same, even after adding a new label
        ner2 = nlp2.get_pipe("ner")
        assert ner2.model.attrs["has_upper"] == use_upper
        ner2.add_label("RANDOM_NEW_LABEL")
        doc3 = nlp2(test_text)
        ents3 = doc3.ents
        assert len(ents3) == 1
        assert ents3[0].text == "London"
        assert ents3[0].label_ == "LOC"

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
Example #7
def test_beam_overfitting_IO(neg_key):
    # Simple test to try and quickly overfit the Beam NER component
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "incorrect_spans_key": neg_key,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    # run overfitting
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_ner"] < 0.0001

    # test the scores from the beam
    test_text = "I like London"
    docs = [nlp.make_doc(test_text)]
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
    assert entity_scores[(2, 3, "PERSON")] == 0.0
    assert len(nlp(test_text).ents) == 1

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
        assert entity_scores2[(2, 3, "LOC")] == 1.0
        assert entity_scores2[(2, 3, "PERSON")] == 0.0

    # Try to unlearn the entity by using negative annotations
    neg_doc = nlp.make_doc(test_text)
    neg_ex = Example(neg_doc, neg_doc)
    neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
    neg_train_examples = [neg_ex]

    for i in range(20):
        losses = {}
        nlp.update(neg_train_examples, sgd=optimizer, losses=losses)

    # test the "untrained" model
    assert len(nlp(test_text).ents) == 0
Example #8
def test_transformer_pipeline_tagger_senter_listener():
    """Test that a pipeline with just a transformer+tagger+senter runs and
    trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "tagger", "senter"]
    tagger = nlp.get_pipe("tagger")
    transformer = nlp.get_pipe("transformer")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(tagger_trf, TransformerListener)
    assert tagger_trf.upstream_name == "custom_upstream"
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # Check that the Transformer component finds its listeners
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert tagger_trf in transformer.listeners

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])
    _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
Example #9
def test_overfitting_IO():
    # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat")

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 2

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat"] < 0.01

    # test the trained model
    test_text = "I am happy."
    doc = nlp(test_text)
    cats = doc.cats
    assert cats["POSITIVE"] > 0.9
    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        cats2 = doc2.cats
        assert cats2["POSITIVE"] > 0.9
        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(
            1.0, 0.001)

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert scores["cats_micro_f"] == 1.0
    assert scores["cats_macro_f"] == 1.0
    assert scores["cats_macro_auc"] == 1.0
    assert scores["cats_score"] == 1.0
    assert "cats_score_desc" in scores

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."
    ]
    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
    for cats_1, cats_2 in zip(batch_cats_1, batch_cats_2):
        for cat in cats_1:
            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
    for cats_1, cats_2 in zip(batch_cats_1, no_batch_cats):
        for cat in cats_1:
            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
Example #10
def test_overfitting_IO():
    # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly
    nlp = English()
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # add some cases where SENT_START == -1
    train_examples[0].reference[10].is_sent_start = False
    train_examples[1].reference[1].is_sent_start = False
    train_examples[1].reference[11].is_sent_start = False

    nlp.add_pipe("senter")
    optimizer = nlp.initialize()

    for i in range(200):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["senter"] < 0.001

    # test the trained model
    test_text = TRAIN_DATA[0][0]
    doc = nlp(test_text)
    gold_sent_starts = [0] * 14
    gold_sent_starts[0] = 1
    gold_sent_starts[5] = 1
    gold_sent_starts[9] = 1
    assert [int(t.is_sent_start) for t in doc] == gold_sent_starts

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([SENT_START]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # test internal pipe labels vs. Language.pipe_labels with hidden labels
    assert nlp.get_pipe("senter").labels == ("I", "S")
    assert "senter" not in nlp.pipe_labels
Example #11
def test_overfitting_IO():
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["trainable_lemmatizer"] < 0.00001

    test_text = "She likes blue eggs"
    doc = nlp(test_text)
    assert doc[0].lemma_ == "she"
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"
    assert doc[3].lemma_ == "egg"

    # Check model after a {to,from}_disk roundtrip
    with util.make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].lemma_ == "she"
        assert doc2[1].lemma_ == "like"
        assert doc2[2].lemma_ == "blue"
        assert doc2[3].lemma_ == "egg"

    # Check model after a {to,from}_bytes roundtrip
    nlp_bytes = nlp.to_bytes()
    nlp3 = English()
    nlp3.add_pipe("trainable_lemmatizer")
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(test_text)
    assert doc3[0].lemma_ == "she"
    assert doc3[1].lemma_ == "like"
    assert doc3[2].lemma_ == "blue"
    assert doc3[3].lemma_ == "egg"

    # Check model after a pickle roundtrip.
    nlp_bytes = pickle.dumps(nlp)
    nlp4 = pickle.loads(nlp_bytes)
    doc4 = nlp4(test_text)
    assert doc4[0].lemma_ == "she"
    assert doc4[1].lemma_ == "like"
    assert doc4[2].lemma_ == "blue"
    assert doc4[3].lemma_ == "egg"
Example #12
def test_overfitting_IO(pipe_name):
    # Simple test to try and quickly overfit the dependency parser (normal or beam)
    nlp = English()
    parser = nlp.add_pipe(pipe_name)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()
    # run overfitting
    for i in range(200):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses[pipe_name] < 0.0001
    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    assert doc[0].dep_ == "nsubj"
    assert doc[2].dep_ == "dobj"
    assert doc[3].dep_ == "punct"
    assert doc[0].head.i == 1
    assert doc[2].head.i == 1
    assert doc[3].head.i == 1
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].dep_ == "nsubj"
        assert doc2[2].dep_ == "dobj"
        assert doc2[3].dep_ == "punct"
        assert doc2[0].head.i == 1
        assert doc2[2].head.i == 1
        assert doc2[3].head.i == 1

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([DEP]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
Example #13
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Loads a spaCy model.
    """
    from spacy.util import load_model_from_path

    model_weight_path = download_model('spacy',
                                       cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)

    nlp = load_model_from_path(model_weight_path)

    return nlp
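Usage is a one-liner for this simpler variant; a sketch, assuming the same download_model helper and cache defaults as in the examples above:

nlp = load_spacy_model(verbose=True)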
Example #14
def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names
Example #15
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, vectorError=False):
    """
    Loads a spaCy model.

    :param str cache_dir: the directory for storing cached models
    :param bool verbose: `True` to increase verbosity
    :param str textcat: '`sentiment`' for loading the spaCy sentiment analyser
    :param bool vectorError:
    :return: a spaCy model

    .. warning:: vectorError is a temporary workaround for an error encountered
        when keeping two models and not being able to find the reference name
        for the vectors.
    """
    from spacy.util import load_model_from_path

    if textcat is None or vectorError:
        modelname = 'spacy'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)
        nlp = load_model_from_path(model_weight_path)

    if textcat == 'sentiment':
        modelname = 'spacy.sentiment'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)
        # quick fix for misaligned model storage:
        import os
        model_weight_path = os.path.join(model_weight_path, 'spacy.sentiment')

        nlp = load_model_from_path(model_weight_path)

    return nlp
Example #16
def test_overfitting_IO_multi():
    # Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat_multilabel")

    train_examples = []
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 3

    for i in range(100):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat_multilabel"] < 0.01

    # test the trained model
    test_text = "I am confused but happy."
    doc = nlp(test_text)
    cats = doc.cats
    assert cats["HAPPY"] > 0.9
    assert cats["CONFUSED"] > 0.9

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        cats2 = doc2.cats
        assert cats2["HAPPY"] > 0.9
        assert cats2["CONFUSED"] > 0.9

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert scores["cats_micro_f"] == 1.0
    assert scores["cats_macro_f"] == 1.0
    assert "cats_score_desc" in scores

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."
    ]
    batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
Example #17
def test_issue999():
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        [
            "i'm looking for a place in the north of town",
            [(31, 36, "LOCATION")]
        ],
        ["show me chinese restaurants", [(8, 15, "CUISINE")]],
        ["show me chines restaurants", [(8, 14, "CUISINE")]],
    ]
    nlp = English()
    ner = nlp.add_pipe("ner")
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.initialize()
    for itn in range(20):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(raw_text),
                                        {"entities": entity_offsets})
            nlp.update([example])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = util.load_model_from_path(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            # for-else: only raise if no predicted entity matched any offset
            if entity_offsets:
                raise Exception(ents)
Example #18
def test_overfitting_IO():
    # Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 2
    assert len(spans.attrs["scores"]) == 2
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {"London", "Berlin"}
    assert set([span.label_ for span in spans]) == {"LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 2
        assert len(spans2.attrs["scores"]) == 2
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {"London", "Berlin"}
        assert set([span.label_ for span in spans2]) == {"LOC"}

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_p"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_r"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0

    # also test that the spancat works for just a single entity in a sentence
    doc = nlp("London")
    assert len(doc.spans[spancat.key]) == 1
Example #19
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains + IO goes OK"""
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
Example #20
def test_overfitting_IO_overlapping():
    # Test for overfitting on overlapping entities
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})

    train_examples = make_examples(nlp, data=TRAIN_DATA_OVERLAPPING)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 3
    assert set(spancat.labels) == {"PERSON", "LOC", "DOUBLE_LOC"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 3
    assert len(spans.attrs["scores"]) == 3
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {
        "London",
        "Berlin",
        "London and Berlin",
    }
    assert set([span.label_ for span in spans]) == {"LOC", "DOUBLE_LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 3
        assert len(spans2.attrs["scores"]) == 3
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {
            "London",
            "Berlin",
            "London and Berlin",
        }
        assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
Example #21
def test_attributeruler_serialize(nlp, pattern_dicts):
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)
    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
    assert a.patterns == a_reloaded.patterns
    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
        assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns
Example #22
    def __call__(self,
                 train_data,
                 dev_data,
                 test_data,
                 n_iter,
                 dropout,
                 patience=10):

        if "ner" in self.nlp.pipe_names:
            logging.warning("Pipeline already has NER, removing...")
            self.nlp.remove_pipe("ner")
        ner = self.nlp.create_pipe("ner")
        # ner.add_multitask_objective(get_position_label)
        self.nlp.add_pipe(ner, last=True)

        for sent, ann in train_data + dev_data + test_data:
            for _, _, label in ann:
                ner.add_label(label)

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != "ner"]
        logging.info("Starting the training with {} iterations".format(n_iter))
        fix_random_seed(42)

        best_dev_score = 0
        best_dev_itn = -1
        with self.nlp.disable_pipes(*other_pipes):  # only train NER
            # if not self.model:
            # FIXME: pre-train the model
            mlflow.log_param("base_model", Path(self.model).name)
            mlflow.log_param("n_iter", n_iter)
            mlflow.log_param("dropout", dropout)
            self.nlp.begin_training()
            for itn in tqdm(list(range(n_iter)), desc="iterations"):
                random.shuffle(train_data)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = list(
                    minibatch(train_data, size=compounding(4.0, 32.0, 1.001)))
                for batch in tqdm(batches, desc="batches"):
                    texts, annotations = zip(*batch)
                    self.nlp.update(
                        docs=texts,  # batch of texts
                        golds=[self._sent_to_goldparse(t, a)
                               for t, a in batch],  # batch of annotations
                        drop=dropout,  # dropout - make it harder to memorise data
                        losses=losses,
                    )

                # logging.info("Losses: {}".format(losses))
                mlflow.log_metric("loss", losses["ner"], step=itn)
                train_scores = self.evaluate(train_data)
                mlflow.log_metrics(
                    {"train_" + k: v
                     for k, v in train_scores.items()},
                    step=itn)
                dev_scores = self.evaluate(dev_data)
                mlflow.log_metrics(
                    {"dev_" + k: v
                     for k, v in dev_scores.items()}, step=itn)
                self.store_model(
                    Path(self.output_dir) / "model_{}".format(itn))

                act_score = dev_scores["f1"]
                if best_dev_score < act_score:
                    best_dev_score = act_score
                    best_dev_itn = itn
                else:
                    if itn - best_dev_itn >= patience:
                        break

            load_model_from_path(
                Path(self.output_dir) / "model_{}".format(best_dev_itn))
            scores = self.evaluate(test_data)
            logging.info("Test scores {}".format(scores))
            mlflow.log_metrics(scores, step=itn)
            self.store_model(Path(self.output_dir) / "model-final")
Example #23
def test_overfitting_IO():
    # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
    nlp = English()
    nlp.add_pipe("morphologizer")
    train_examples = []
    for inst in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]),
                                                inst[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert [str(t.morph) for t in doc2] == gold_morphs
        assert [t.pos_ for t in doc2] == gold_pos_tags

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # Test without POS
    nlp.remove_pipe("morphologizer")
    nlp.add_pipe("morphologizer")
    for example in train_examples:
        for token in example.reference:
            token.pos_ = ""
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # Test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["", "", "", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags

    # Test with unset morph and partial POS
    nlp.remove_pipe("morphologizer")
    nlp.add_pipe("morphologizer")
    for example in train_examples:
        for token in example.reference:
            if token.text == "ham":
                token.pos_ = "NOUN"
            else:
                token.pos_ = ""
            token.set_morph(None)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    print(nlp.get_pipe("morphologizer").labels)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # Test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["", "", "", ""]
    gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags
Example #24
def train(model, train_data, dev_data, test_data, output_dir, n_iter,
          meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer

    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):

        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               losses=losses,
                               drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i
    # save model to output directory
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2,
                 dev_data,
                 dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2,
                 test_data,
                 dump_path=os.path.join(output_dir, "test_metrics.json"))
Example #25
def train_parser_and_tagger(train_json_path: str,
                            dev_json_path: str,
                            test_json_path: str,
                            model_output_dir: str,
                            model_path: str = None,
                            ontonotes_path: str = None,
                            ontonotes_train_percent: float = 0.0):
    """Function to train the spacy parser and tagger from a blank model, with the default, en_core_web_sm vocab.
       Training setup is mostly copied from the spacy cli train command.

       @param train_json_path: path to the conll formatted training data
       @param dev_json_path: path to the conll formatted dev data
       @param test_json_path: path to the conll formatted test data
       @param model_output_dir: path to the output directory for the trained models
       @param model_path: path to the model to load
       @param ontonotes_path: path to the directory containing ontonotes in spacy format (optional)
       @param ontonotes_train_percent: percentage of the ontonotes training data to use (optional)
    """
    msg = Printer()

    train_json_path = cached_path(train_json_path)
    dev_json_path = cached_path(dev_json_path)
    test_json_path = cached_path(test_json_path)

    if model_path is not None:
        nlp = spacy.load(model_path)
    else:
        lang_class = util.get_lang_class('en')
        nlp = lang_class()

    if 'tagger' not in nlp.pipe_names:
        tagger = nlp.create_pipe('tagger')
        nlp.add_pipe(tagger, first=True)
    else:
        tagger = nlp.get_pipe('tagger')

    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser)
    else:
        parser = nlp.get_pipe('parser')

    train_corpus = GoldCorpus(train_json_path, dev_json_path)
    test_corpus = GoldCorpus(train_json_path, test_json_path)

    if ontonotes_path:
        onto_train_path = os.path.join(ontonotes_path, "train")
        onto_dev_path = os.path.join(ontonotes_path, "dev")
        onto_test_path = os.path.join(ontonotes_path, "test")
        onto_train_corpus = GoldCorpus(onto_train_path, onto_dev_path)
        onto_test_corpus = GoldCorpus(onto_train_path, onto_test_path)

    dropout_rates = util.decaying(0.2, 0.2, 0.0)
    batch_sizes = util.compounding(1., 16., 1.001)

    if model_path is not None:
        meta = nlp.meta
    else:
        meta = {}
        meta["lang"] = "en"
        meta["pipeline"] = ["tagger", "parser"]
        meta["name"] = "scispacy_core_web_sm"
        meta["license"] = "CC BY-SA 3.0"
        meta["author"] = "Allen Institute for Artificial Intelligence"
        meta["url"] = "allenai.org"
        meta["sources"] = ["OntoNotes 5", "Common Crawl", "GENIA 1.0"]
        meta["version"] = "1.0.0"
        meta["spacy_version"] = ">=2.2.1"
        meta["parent_package"] = "spacy"
        meta["email"] = "*****@*****.**"

    n_train_words = train_corpus.count_train()

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tagger', 'parser']]
    with nlp.disable_pipes(*other_pipes):
        if ontonotes_path:
            optimizer = nlp.begin_training(lambda: itertools.chain(train_corpus.train_tuples, onto_train_corpus.train_tuples))
        else:
            optimizer = nlp.begin_training(lambda: train_corpus.train_tuples)
        nlp._optimizer = None

    train_docs = train_corpus.train_docs(nlp)
    train_docs = list(train_docs)

    train_mixture = train_docs
    if ontonotes_path:
        onto_train_docs = onto_train_corpus.train_docs(nlp)
        onto_train_docs = list(onto_train_docs)
        num_onto_docs = int(float(ontonotes_train_percent)*len(onto_train_docs))
        randomly_sampled_onto = random.sample(onto_train_docs, num_onto_docs)
        train_mixture += randomly_sampled_onto

    row_head, output_stats = _configure_training_output(nlp.pipe_names, -1, False)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}

    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)

    best_epoch = 0
    best_epoch_uas = 0.0
    for i in range(20):
        random.shuffle(train_mixture)
        with nlp.disable_pipes(*other_pipes):
            with tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                minibatches = list(util.minibatch(train_docs, size=batch_sizes))
                for batch in minibatches:
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

        # save intermediate model and output results on the dev set
        with nlp.use_params(optimizer.averages):
            epoch_model_path = os.path.join(model_output_dir, "model"+str(i))
            os.makedirs(epoch_model_path, exist_ok=True)
            nlp.to_disk(epoch_model_path)

            with open(os.path.join(model_output_dir, "model"+str(i), "meta.json"), "w") as meta_fp:
                meta_fp.write(json.dumps(meta))

            nlp_loaded = util.load_model_from_path(epoch_model_path)
            dev_docs = train_corpus.dev_docs(nlp_loaded)
            dev_docs = list(dev_docs)
            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
            start_time = timer()
            scorer = nlp_loaded.evaluate(dev_docs)
            end_time = timer()
            gpu_wps = None
            cpu_wps = nwords/(end_time-start_time)

            if ontonotes_path:
                onto_dev_docs = list(onto_train_corpus.dev_docs(nlp_loaded))
                onto_scorer = nlp_loaded.evaluate(onto_dev_docs)

        if scorer.scores["uas"] > best_epoch_uas:
            best_epoch_uas = scorer.scores["uas"]
            best_epoch = i
        progress = _get_progress(
            i, losses, scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
        )
        msg.row(progress, **row_settings)

        if ontonotes_path:
            progress = _get_progress(
                i, losses, onto_scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
            )
            msg.row(progress, **row_settings)

    # save final model and output results on the test set
    final_model_path = os.path.join(model_output_dir, "best")
    if os.path.exists(final_model_path):
        shutil.rmtree(final_model_path)
    shutil.copytree(os.path.join(model_output_dir, "model" + str(best_epoch)),
                    final_model_path)

    nlp_loaded = util.load_model_from_path(final_model_path)
    start_time = timer()
    test_docs = test_corpus.dev_docs(nlp_loaded)
    test_docs = list(test_docs)
    nwords = sum(len(doc_gold[0]) for doc_gold in test_docs)
    scorer = nlp_loaded.evaluate(test_docs)
    end_time = timer()
    gpu_wps = None
    cpu_wps = nwords/(end_time-start_time)
    meta["speed"] = {"gpu": None, "nwords": nwords, "cpu": cpu_wps}

    print("Retrained genia evaluation")
    print("Test results:")
    print("UAS:", scorer.uas)
    print("LAS:", scorer.las)
    print("Tag %:", scorer.tags_acc)
    print("Token acc:", scorer.token_acc)
    with open(os.path.join(model_output_dir, "genia_test.json"), "w+") as metric_file:
        json.dump(scorer.scores, metric_file)
    with open(os.path.join(model_output_dir, "best", "meta.json"), "w") as meta_fp:
        meta_fp.write(json.dumps(meta))

    if ontonotes_path:
        onto_test_docs = list(onto_test_corpus.dev_docs(nlp_loaded))
        print("Retrained ontonotes evaluation")
        scorer_onto_retrained = nlp_loaded.evaluate(onto_test_docs)
        print("Test results:")
        print("UAS:", scorer_onto_retrained.uas)
        print("LAS:", scorer_onto_retrained.las)
        print("Tag %:", scorer_onto_retrained.tags_acc)
        print("Token acc:", scorer_onto_retrained.token_acc)

        with open(os.path.join(model_output_dir, "ontonotes_test.json"), "w+") as metric_file:
            json.dump(scorer_onto_retrained.scores, metric_file)
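A hypothetical call to the routine above (placeholder paths; the CoNLL-formatted JSON files must exist and be resolvable by cached_path):

train_parser_and_tagger("train.json", "dev.json", "test.json",
                        model_output_dir="./parser_tagger_output")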
Example #26
def test_serialize_subclassed_kb():
    """Check that IO of a custom KB works fine as part of an EL pipe."""

    config_string = """
    [nlp]
    lang = "en"
    pipeline = ["entity_linker"]

    [components]

    [components.entity_linker]
    factory = "entity_linker"

    [initialize]

    [initialize.components]

    [initialize.components.entity_linker]

    [initialize.components.entity_linker.kb_loader]
    @misc = "spacy.CustomKB.v1"
    entity_vector_length = 342
    custom_field = 666
    """

    class SubKnowledgeBase(KnowledgeBase):
        def __init__(self, vocab, entity_vector_length, custom_field):
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

    @registry.misc("spacy.CustomKB.v1")
    def custom_kb(entity_vector_length: int,
                  custom_field: int) -> Callable[[Vocab], KnowledgeBase]:
        def custom_kb_factory(vocab):
            kb = SubKnowledgeBase(
                vocab=vocab,
                entity_vector_length=entity_vector_length,
                custom_field=custom_field,
            )
            kb.add_entity("random_entity", 0.0, zeros(entity_vector_length))
            return kb

        return custom_kb_factory

    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
    nlp.initialize()

    entity_linker = nlp.get_pipe("entity_linker")
    assert type(entity_linker.kb) == SubKnowledgeBase
    assert entity_linker.kb.entity_vector_length == 342
    assert entity_linker.kb.custom_field == 666

    # Make sure the custom KB is serialized correctly
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        entity_linker2 = nlp2.get_pipe("entity_linker")
        # After IO, the KB is the standard one
        assert type(entity_linker2.kb) == KnowledgeBase
        assert entity_linker2.kb.entity_vector_length == 342
        assert not hasattr(entity_linker2.kb, "custom_field")
Example #27
def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    text = "This is awesome"
    examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
    optimizer = nlp.initialize(lambda: examples)
    # verify correct configuration with transformer listener
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    tagger_listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(tagger_listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == tagger_listener
    assert isinstance(transformer.model, TransformerModel)
    assert (
        nlp.config["components"]["transformer"]["model"]["@architectures"]
        == "spacy-transformers.TransformerModel.v3"
    )
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.TransformerListener.v1"
    )
    # train pipe before replacing listeners
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        doc = nlp(text)

    preds = [t.tag_ for t in doc]
    doc_tensor = tagger_tok2vec.predict([doc])

    # replace listener and verify predictions are still the same
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert isinstance(tagger_tok2vec, Model)
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.Tok2VecTransformer.v3"
    )
    doc2 = nlp(text)
    assert preds == [t.tag_ for t in doc2]
    pred_tensor = tagger_tok2vec.predict([doc2])
    _assert_equal_tensors(doc_tensor, pred_tensor)

    # attempt training with the new pipeline
    optimizer = nlp.resume_training()
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tagger"] > 0.0

    # check for presence of additional fields in model_output
    assert doc2._.trf_data.model_output.pooler_output is not None
    assert doc2._.trf_data.model_output.attentions is not None

    # ensure IO goes OK
    doc_tensor_trained = tagger_tok2vec.predict([doc])
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc3 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_tok2vec2 = tagger2.model.get_ref("tok2vec")
        pred_tensor = tagger_tok2vec2.predict([doc3])
        _assert_equal_tensors(doc_tensor_trained, pred_tensor)
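
Once the listener is replaced, the tagger embeds its own copy of the transformer and no longer depends on the shared "transformer" pipe. A short sketch (reusing `nlp`, `examples` and `optimizer` from above; not part of the original test) that updates the tagger in isolation:

# Hedged sketch: only the tagger and its private transformer copy receive
# gradients inside this block; the shared "transformer" pipe is untouched.
with nlp.select_pipes(enable=["tagger"]):
    losses = {}
    nlp.update(examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] > 0.0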
Example #28
0
def test_overfitting_IO():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
    vector_length = 3
    assert "Q2146908" not in nlp.vocab.strings

    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create an artificial KB - assign the same prior probability to the two Russ Cochrans
        # Q2146908 (Russ Cochran): American golfer
        # Q7381115 (Russ Cochran): publisher
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="Russ Cochran",
            entities=["Q2146908", "Q7381115"],
            probabilities=[0.5, 0.5],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    assert "Q2146908" in entity_linker.vocab.strings
    assert "Q2146908" in entity_linker.kb.vocab.strings

    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert entity_linker.model.get_dim("nO") == vector_length
    assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)

    # test the trained model
    predictions = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        for ent in doc.ents:
            predictions.append(ent.kb_id_)
    assert predictions == GOLD_entities

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert nlp2.pipe_names == nlp.pipe_names
        assert "Q2146908" in nlp2.vocab.strings
        entity_linker2 = nlp2.get_pipe("entity_linker")
        assert "Q2146908" in entity_linker2.vocab.strings
        assert "Q2146908" in entity_linker2.kb.vocab.strings
        predictions = []
        for text, annotation in TRAIN_DATA:
            doc2 = nlp2(text)
            for ent in doc2.ents:
                predictions.append(ent.kb_id_)
        assert predictions == GOLD_entities

    # Make sure that running pipe twice, or comparing batched and unbatched calls, always yields the same predictions
    texts = [
        "Russ Cochran captured his first major title with his son as caddie.",
        "Russ Cochran his reprints include EC Comics.",
        "Russ Cochran has been publishing comic art.",
        "Russ Cochran was a member of University of Kentucky's golf team.",
    ]
    batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
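
The KB can also be serialized on its own, independently of the pipeline, which is the usual pattern when it is built offline. A minimal sketch reusing `entity_linker`, `nlp` and `vector_length` from the test above; the "my_kb" path is hypothetical:

# Hedged sketch: persist and reload just the KnowledgeBase.
entity_linker.kb.to_disk("my_kb")
kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=vector_length)
kb2.from_disk("my_kb")
assert kb2.get_size_entities() == 2  # Q2146908 and Q7381115
assert kb2.get_size_aliases() == 1   # "Russ Cochran"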
Example #29
0
def test_beam_overfitting_IO():
    # Simple test to try and quickly overfit the Beam dependency parser
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
    }
    parser = nlp.add_pipe("beam_parser", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()
    # run overfitting
    for i in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_parser"] < 0.0001
    # test the scores from the beam
    test_text = "I like securities."
    docs = [nlp.make_doc(test_text)]
    beams = parser.predict(docs)
    head_scores, label_scores = parser.scored_parses(beams)
    # we only processed one document
    head_scores = head_scores[0]
    label_scores = label_scores[0]
    # test label annotations: 0=nsubj, 2=dobj, 3=punct
    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
    assert label_scores[(0, "dobj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(0, "punct")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(2, "dobj")] == pytest.approx(1.0, abs=eps)
    assert label_scores[(2, "punct")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(3, "dobj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(3, "punct")] == pytest.approx(1.0, abs=eps)
    # test head annotations: the root is token at index 1
    assert head_scores[(0, 0)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(0, 1)] == pytest.approx(1.0, abs=eps)
    assert head_scores[(0, 2)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(2, 0)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(2, 1)] == pytest.approx(1.0, abs=eps)
    assert head_scores[(2, 2)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(3, 0)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(3, 1)] == pytest.approx(1.0, abs=eps)
    assert head_scores[(3, 2)] == pytest.approx(0.0, abs=eps)

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        parser2 = nlp2.get_pipe("beam_parser")
        beams2 = parser2.predict(docs2)
        head_scores2, label_scores2 = parser2.scored_parses(beams2)
        # we only processed one document
        head_scores2 = head_scores2[0]
        label_scores2 = label_scores2[0]
        # check the results again
        assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
        assert label_scores2[(0, "dobj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(0, "punct")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(2, "dobj")] == pytest.approx(1.0, abs=eps)
        assert label_scores2[(2, "punct")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(3, "dobj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(3, "punct")] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(0, 0)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(0, 1)] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(0, 2)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(2, 0)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(2, 1)] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(2, 2)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(3, 0)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(3, 1)] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(3, 2)] == pytest.approx(0.0, abs=eps)
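
Because `scored_parses` returns per-token score dictionaries, a single best head per token falls out of a simple argmax over the beam scores. A small sketch over `head_scores` from the test above (not part of the original test):

# Hedged sketch: reduce beam head scores to one predicted head per token.
best_heads = {}
for (token_i, head_i), score in head_scores.items():
    if token_i not in best_heads or score > best_heads[token_i][1]:
        best_heads[token_i] = (head_i, score)
# For "I like securities." this maps tokens 0, 2 and 3 to head 1 (the root).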
Example #30
0
def custom_train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """

    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exist if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )
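    # Illustration (hedged): with these defaults, compounding(100.0, 1000.0,
    # 1.001) yields 100.0, 100.1, 100.2001, ... growing geometrically toward
    # 1000, while decaying(0.2, 0.2, 0.0) just repeats a constant 0.2.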

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
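    # Example (hedged): eval_beam_widths="4,8" becomes [1, 4, 8], so the
    # greedy width-1 parse is always evaluated alongside the beam widths.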

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                elif pipe == "textcat":
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        ### Here are our modifications:
        lang_cls.Defaults.tag_map = custom_tag_map
        nlp = lang_cls()
        assert nlp.vocab.morphology.n_tags == 36
        ###
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples,
                                       device=use_gpu)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(
                                epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                ))
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs,
                                                         verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
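                # (Hedged note) _score_for_model() is assumed to collapse
                # meta["accuracy"] into a single scalar; training stops after
                # n_early_stopping consecutive iterations without improvement.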
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(i - iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path,
                                                  nlp.pipe_names)
        msg.good("Created best model", best_model_path)