def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories."""
    name1 = "specific_component1"
    name2 = "specific_component2"
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)
    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)
    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"
def test_pipe_factories_empty_dict_default():
    """Test that default config values can be empty dicts and that no config
    validation error is raised."""
    # TODO: fix this
    name = "test_pipe_factories_empty_dict_default"

    @Language.factory(name, default_config={"foo": {}})
    def factory(nlp: Language, name: str, foo: dict):
        ...

    nlp = Language()
    nlp.create_pipe(name)
Ejemplo n.º 3
0
def test_issue1727():
    """Test that models with no pretrained vectors can be deserialized
    correctly after vectors are added."""
    nlp = Language(Vocab())
    data = numpy.ones((3, 300), dtype="f")
    vectors = Vectors(data=data, keys=["I", "am", "Matt"])
    tagger = nlp.create_pipe("tagger")
    tagger.add_label("PRP")
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
        tagger.to_disk(path)
        tagger = nlp.create_pipe("tagger").from_disk(path)
        assert tagger.cfg.get("pretrained_dims", 0) == 0
Ejemplo n.º 4
0
def test_issue1915():
    cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)
Ejemplo n.º 5
0
def test_factories_merge_noun_chunks(doc):
    assert len(doc) == 7
    nlp = Language()
    merge_noun_chunks = nlp.create_pipe("merge_noun_chunks")
    merge_noun_chunks(doc)
    assert len(doc) == 6
    assert doc[2].text == "New York"
Ejemplo n.º 6
0
def test_issue1915():
    cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)
Ejemplo n.º 7
0
def test_factories_merge_noun_chunks(doc2):
    assert len(doc2) == 7
    nlp = Language()
    merge_noun_chunks = nlp.create_pipe("merge_noun_chunks")
    merge_noun_chunks(doc2)
    assert len(doc2) == 6
    assert doc2[2].text == "New York"
Ejemplo n.º 8
0
def nlp():
    nlp = Language(Vocab())
    textcat = nlp.create_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.add_pipe(textcat)
    nlp.begin_training()
    return nlp
Ejemplo n.º 9
0
def create_pipeline(nlp: Language, cfg: omegaconf.DictConfig) -> List[Pipe]:
    if not isinstance(cfg, omegaconf.DictConfig):
        cfg = OmegaConf.create(cfg)

    pipes = []
    for name, pipe_config in cfg.items():
        pipe_config = OmegaConf.to_container(pipe_config or OmegaConf.create({}))
        pipes.append(nlp.create_pipe(name, config=pipe_config or dict()))
    return pipes
Ejemplo n.º 10
0
def test_factories_merge_ents(doc):
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    nlp = Language()
    merge_entities = nlp.create_pipe("merge_entities")
    merge_entities(doc)
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    assert doc[2].text == "New York"
Ejemplo n.º 11
0
def test_factories_merge_ents(doc2):
    assert len(doc2) == 7
    assert len(list(doc2.ents)) == 1
    nlp = Language()
    merge_entities = nlp.create_pipe("merge_entities")
    merge_entities(doc2)
    assert len(doc2) == 6
    assert len(list(doc2.ents)) == 1
    assert doc2[2].text == "New York"
def tagger():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("tagger"))
    tagger = nlp.get_pipe("tagger")
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    tagger.begin_training(pipeline=nlp.pipeline)
    return tagger
def entity_linker():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("entity_linker"))
    entity_linker = nlp.get_pipe("entity_linker")
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    entity_linker.set_kb(kb)
    entity_linker.begin_training(pipeline=nlp.pipeline)
    return entity_linker
def test_tagger_begin_training_tag_map():
    """Test that Tagger.begin_training() without gold tuples does not clobber
    the tag map."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    orig_tag_count = len(tagger.labels)
    tagger.add_label("A", {"POS": "NOUN"})
    nlp.add_pipe(tagger)
    nlp.begin_training()
    assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
    assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
Ejemplo n.º 15
0
def test_issue2564():
    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    tagger.begin_training()  # initialise weights
    nlp.add_pipe(tagger)
    doc = nlp("hello world")
    assert doc.is_tagged
    docs = nlp.pipe(["hello", "world"])
    piped_doc = next(docs)
    assert piped_doc.is_tagged
Ejemplo n.º 16
0
def load_default_model():
    # model = spacy.load(MODEL_PATH, disable=['tagger', 'parser', 'ner'])
    vocab = spacy.vocab.Vocab().from_disk(MODEL_PATH)
    model = Language(vocab)

    # ideally, we'd use their parser to do sentence segmentation,
    # but we don't have all day (week/month), so fall back to the basic version
    sbd = model.create_pipe('sentencizer')
    model.add_pipe(sbd)

    return model
Ejemplo n.º 17
0
def _initialize_textcat(nlp: Language) -> None:
    # Ensure that the text categorizer is added to the pipeline
    if 'textcat' not in nlp.pipe_names:
        LOGGER.info('Adding the text categorizer to the spacy nlp pipeline')

        # nlp.create_pipe works for built-ins that are registered with spaCy
        textcat = nlp.create_pipe('textcat',
                                  config={
                                      'exclusive_classes': True,
                                      'architecture': 'simple_cnn'
                                  })
        nlp.add_pipe(textcat, last=True)
Ejemplo n.º 18
0
def test_ner_warns_no_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Ejemplo n.º 19
0
def test_tagger_warns_no_lemma_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lemma_lookup")
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Ejemplo n.º 20
0
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to readd labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        [
            "i'm looking for a place in the north of town",
            [[31, 36, "LOCATION"]]
        ],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
Ejemplo n.º 21
0
def test_issue1967(label):
    nlp = Language()
    config = {}
    ner = nlp.create_pipe("ner", config=config)
    example = Example.from_dict(
        Doc(ner.vocab, words=["word"]),
        {
            "ids": [0],
            "words": ["word"],
            "tags": ["tag"],
            "heads": [0],
            "deps": ["dep"],
            "entities": [label],
        },
    )
    assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
Ejemplo n.º 22
0
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to readd labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
Ejemplo n.º 23
0
def test_simple_train():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
Ejemplo n.º 24
0
def test_simple_train():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
Ejemplo n.º 25
0
def test_component_decorator_assigns():
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True

    @component("c1", assigns=["token.tag", "doc.tensor"])
    def test_component1(doc):
        return doc

    @component("c2",
               requires=["token.tag", "token.pos"],
               assigns=["token.lemma", "doc.tensor"])
    def test_component2(doc):
        return doc

    @component("c3",
               requires=["token.lemma"],
               assigns=["token._.custom_lemma"])
    def test_component3(doc):
        return doc

    assert "c1" in Language.factories
    assert "c2" in Language.factories
    assert "c3" in Language.factories

    nlp = Language()
    nlp.add_pipe(test_component1)
    with pytest.warns(UserWarning):
        nlp.add_pipe(test_component2)
    nlp.add_pipe(test_component3)
    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
    test_component4 = nlp.create_pipe("c1")
    assert test_component4.name == "c1"
    assert test_component4.factory == "c1"
    nlp.add_pipe(test_component4, name="c4")
    assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
    assert "c4" not in Language.factories
    assert nlp.pipe_factories["c1"] == "c1"
    assert nlp.pipe_factories["c4"] == "c1"
    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
    requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
    assert [name for name, _ in requires_pos] == ["c2"]
    assert print_summary(nlp, no_print=True)
    assert nlp("hello world")
Ejemplo n.º 26
0
def main(vectors_loc=None, lang=None):

    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(VECTORS_PATH, "rb") as file_:
        print("loading vectors...")
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode("utf8")
            pieces = line.rsplit(" ", int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    print("trainning tags...")
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    # test the trained model
    test_text = "Eu desejo ouvir uma música muito boa"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    print("Saved mode to nl_model_tagger")

    nlp.to_disk("/app/model")
Ejemplo n.º 27
0
def test_component_factories_from_nlp():
    """Test that class components can implement a from_nlp classmethod that
    gives them access to the nlp object and config via the factory."""
    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    mock = Mock()
    mock.return_value = TestComponent5()
    TestComponent5.from_nlp = classmethod(mock)
    TestComponent5 = component("c5")(TestComponent5)

    assert "c5" in Language.factories
    nlp = Language()
    pipe = nlp.create_pipe("c5", config={"foo": "bar"})
    nlp.add_pipe(pipe)
    assert nlp("hello world")
    # The first argument here is the class itself, so we're accepting any here
    mock.assert_called_once_with(ANY, nlp, foo="bar")
def test_pipe_function_component():
    name = "test_component"

    @Language.component(name)
    def component(doc: Doc) -> Doc:
        return doc

    assert name in registry.factories
    nlp = Language()
    with pytest.raises(ValueError):
        nlp.add_pipe(component)
    nlp.add_pipe(name)
    assert name in nlp.pipe_names
    assert nlp.pipe_factories[name] == name
    assert Language.get_factory_meta(name)
    assert nlp.get_pipe_meta(name)
    pipe = nlp.get_pipe(name)
    assert pipe == component
    pipe = nlp.create_pipe(name)
    assert pipe == component
def test_pipe_class_component_init():
    name1 = "test_class_component1"
    name2 = "test_class_component2"

    @Language.factory(name1)
    class Component1:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    class Component2:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @Language.factory(name2)
    def factory(nlp: Language, name=name2):
        return Component2(nlp, name)

    nlp = Language()
    for name, Component in [(name1, Component1), (name2, Component2)]:
        assert name in registry.factories
        with pytest.raises(ValueError):
            nlp.add_pipe(Component(nlp, name))
        nlp.add_pipe(name)
        assert name in nlp.pipe_names
        assert nlp.pipe_factories[name] == name
        assert Language.get_factory_meta(name)
        assert nlp.get_pipe_meta(name)
        pipe = nlp.get_pipe(name)
        assert isinstance(pipe, Component)
        assert isinstance(pipe.nlp, Language)
        pipe = nlp.create_pipe(name)
        assert isinstance(pipe, Component)
        assert isinstance(pipe.nlp, Language)
Ejemplo n.º 30
0
def test_label_types():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    with pytest.raises(ValueError):
        nlp.get_pipe("textcat").add_label(9)