Example No. 1
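This regression test (spaCy issue #1915) checks that an unsupported config key such as hidden_depth passed to begin_training() raises a ValueError instead of being silently accepted. The snippets in this section use the spaCy v2 API (create_pipe, add_pipe, begin_training).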
import pytest
from spacy.language import Language


def test_issue1915():
    cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)
Example No. 3
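Two related snippets from spaCy's test suite: a pytest fixture that builds a small text-classification pipeline, and a test asserting that Tagger.begin_training() called without gold tuples does not clobber a tag map entry added via add_label().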
import pytest
from spacy.language import Language
from spacy.symbols import NOUN, POS
from spacy.vocab import Vocab


@pytest.fixture
def nlp():
    nlp = Language(Vocab())
    textcat = nlp.create_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.add_pipe(textcat)
    nlp.begin_training()
    return nlp


def test_tagger_begin_training_tag_map():
    """Test that Tagger.begin_training() without gold tuples does not clobber
    the tag map."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    orig_tag_count = len(tagger.labels)
    tagger.add_label("A", {"POS": "NOUN"})
    nlp.add_pipe(tagger)
    nlp.begin_training()
    assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
    assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
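Since nlp above is a pytest fixture, a test consumes it by declaring a parameter of the same name. A minimal sketch, with a hypothetical test name and assertion:

def test_textcat_labels(nlp):
    # pytest injects the nlp fixture defined above
    assert "POSITIVE" in nlp.get_pipe("textcat").labels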
Example No. 5
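The NER component warns at begin_training() when the vocab has no lookup tables; once a lexeme_norm table with at least one entry is added, no warning should be raised.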
import pytest
from spacy.language import Language
from spacy.lookups import Lookups


def test_ner_warns_no_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    # pytest.warns(None) records all warnings; the list should stay empty
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Example No. 6
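The same pattern for the tagger: without a lemma_lookup table it warns at begin_training(), whether called on the component directly or through the pipeline.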
import pytest
from spacy.language import Language
from spacy.lookups import Lookups


def test_tagger_warns_no_lemma_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lemma_lookup")
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Example No. 7
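A longer regression test (spaCy issue #999): train an NER model from scratch on a toy dataset, save it to disk, reload it, and verify the reloaded model still predicts the annotated entities. make_tempdir is a context-manager helper from spaCy's bundled test utilities.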
import random

from spacy.language import Language
from spacy.tests.util import make_tempdir  # helper from spaCy's test utilities


def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            # for/else: only reached when no annotated span was predicted at all
            if entity_offsets:
                raise Exception(ents)
Example No. 8
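A minimal text-classification training loop: one label, a handful of toy texts, and a final check that the trained model assigns the positive class a score of at least 0.5.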
from spacy.language import Language


def test_simple_train():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
Example No. 11
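An end-to-end training script: load pretrained word vectors into the vocab, build a tagger with a custom tag map, train for 20 iterations, try the model on a Portuguese sentence, and save it to disk. VECTORS_PATH, TAG_MAP and TRAIN_DATA are assumed to be defined at module level.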
import random

import numpy
import spacy
from spacy.language import Language

# VECTORS_PATH, TAG_MAP and TRAIN_DATA are assumed to be defined at module level.


def main(vectors_loc=None, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    # fall back to the module-level VECTORS_PATH if no path was passed in
    with open(vectors_loc or VECTORS_PATH, "rb") as file_:
        print("loading vectors...")
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode("utf8")
            pieces = line.rsplit(" ", int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    print("adding tags...")
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    # test the trained model
    test_text = "Eu desejo ouvir uma música muito boa"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    nlp.to_disk("/app/model")
    print("Saved model to /app/model")
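For reference, the vector loader above expects a word2vec-style text file: a header line with the row count and vector width, then one word followed by its whitespace-separated float values per line. A minimal sketch that writes such a file, using made-up toy vectors and a hypothetical filename:

# Toy data, purely for illustration; any UTF-8 words and float values work.
toy_vectors = {"música": [0.1, 0.2, 0.3], "boa": [0.4, 0.5, 0.6]}
with open("toy_vectors.txt", "wb") as f_:
    f_.write("{} {}\n".format(len(toy_vectors), 3).encode("utf8"))  # nr_row nr_dim
    for word, values in toy_vectors.items():
        line = word + " " + " ".join(str(v) for v in values) + "\n"
        f_.write(line.encode("utf8"))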