Example #1
def test_pretraining_tagger_tok2vec():
    """Test pretraining of the tagger's tok2vec layer (via a listener)"""
    config = Config().from_str(pretrain_string_listener)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
    filled = nlp.config
    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    filled = pretrain_config.merge(filled)
    with make_tempdir() as tmp_dir:
        file_path = write_sample_jsonl(tmp_dir)
        filled["paths"]["raw_text"] = file_path
        filled["pretraining"]["component"] = "tagger"
        filled["pretraining"]["layer"] = "tok2vec"
        filled = filled.interpolate()
        pretrain(filled, tmp_dir)
        assert Path(tmp_dir / "model0.bin").exists()
        assert Path(tmp_dir / "model4.bin").exists()
        assert not Path(tmp_dir / "model5.bin").exists()
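The `pretrain_string_listener` fixture isn't included in this excerpt. A minimal sketch of the shape such a config needs, assuming a shared tok2vec plus a listening tagger (the component settings are placeholders, auto-filled on load; `max_epochs = 5` matches the model0–model4 assertions above):

# Hypothetical stand-in for the pretrain_string_listener fixture.
pretrain_string_listener = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tagger]
factory = "tagger"

[pretraining]
max_epochs = 5
"""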
Example #2
def test_initialized_inline_transformer_todisk():
    orig_config = Config().from_str(inline_cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["tagger"]
    tagger = nlp.get_pipe("tagger")
    tagger.add_label("V")
    nlp.initialize()
    with make_tempdir() as d:
        tagger.to_disk(d)
        nlp2 = util.load_model_from_config(orig_config,
                                           auto_fill=True,
                                           validate=True)
        tagger2 = nlp2.get_pipe("tagger")
        tagger2.from_disk(d)
        assert list(tagger2.labels) == ["V"]
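The `inline_cfg_string` fixture is also defined outside this excerpt. A plausible sketch, assuming the tagger embeds the transformer inline via `spacy-transformers.Tok2VecTransformer.v1` (the model name and remaining settings are guesses, auto-filled on load):

# Hypothetical stand-in for the inline_cfg_string fixture.
inline_cfg_string = """
[nlp]
lang = "en"
pipeline = ["tagger"]

[components]

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy-transformers.Tok2VecTransformer.v1"
name = "distilbert-base-uncased"
"""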
Example #3
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains and that IO goes OK"""
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
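Two fixtures used above, `TRAIN_DATA` and `_assert_equal_tensors`, are defined elsewhere in the test module. A plausible sketch of both (the training sentences and the tolerance are assumptions):

import numpy
from numpy.testing import assert_array_almost_equal

# (text, annotations) pairs with one coarse tag per token.
TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]

def _assert_equal_tensors(tensors1, tensors2):
    # Compare the transformer outputs pairwise with a small tolerance.
    # Assumes CPU arrays; GPU arrays would need converting to numpy first.
    assert len(tensors1) == len(tensors2)
    for t1, t2 in zip(tensors1, tensors2):
        assert_array_almost_equal(numpy.asarray(t1), numpy.asarray(t2), decimal=5)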
Example #4
def test_serialize_parser(parser_config_string):
    """ Create a non-default parser config to check that nlp serializes it correctly """
    nlp = English()
    model_config = Config().from_str(parser_config_string)
    parser = nlp.add_pipe("parser", config=model_config)
    parser.add_label("nsubj")
    nlp.initialize()

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        model = nlp2.get_pipe("parser").model
        model.get_ref("tok2vec")
        # check that we have the correct settings, not the default ones
        if model.attrs["has_upper"]:
            assert model.get_ref("upper").get_dim("nI") == 66
        assert model.get_ref("lower").get_dim("nI") == 66
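`parser_config_string` is supplied by a parametrized fixture that isn't shown. A sketch of a non-default parser config consistent with the `nI == 66` checks above (choosing `hidden_width = 66` and a 66-wide tok2vec is an assumption):

# Hypothetical stand-in for the parser_config_string fixture.
parser_config_string = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 66
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""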
Example #5
def test_replace_listeners_invalid():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    text = "This is awesome"
    examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
    optimizer = nlp.initialize(lambda: examples)
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    with pytest.raises(ValueError):
        nlp.replace_listeners("invalid", "tagger", ["model.tok2vec"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("transformer", "parser", ["model.tok2vec"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("transformer", "tagger", ["model.yolo"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("transformer", "tagger", ["model.tok2vec", "model.yolo"])
Example #6
def test_tok2vec_listener_callback():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")
    nlp._link_components()
    docs = [nlp.make_doc("A random sentence")]
    tok2vec.model.initialize(X=docs)
    gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
    label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
    tagger.model.initialize(X=docs, Y=label_sample)
    docs = [nlp.make_doc("Another entirely random sentence")]
    tok2vec.update([Example.from_dict(x, {}) for x in docs])
    Y, get_dX = tagger.model.begin_update(docs)
    # ensure that the backprop call works (and doesn't hit a 'None' callback)
    assert get_dX(Y) is not None
Example #7
def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    text = "This is awesome"
    examples = [
        Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})
    ]
    optimizer = nlp.initialize(lambda: examples)
    # verify correct configuration with transformer listener
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    tagger_listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(tagger_listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == tagger_listener
    assert (nlp.config["components"]["transformer"]["model"]["@architectures"]
            == "spacy-transformers.TransformerModel.v1")
    assert (nlp.config["components"]["tagger"]["model"]["tok2vec"]
            ["@architectures"] == "spacy-transformers.TransformerListener.v1")
    # train pipe before replacing listeners
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        doc = nlp(text)

    preds = [t.tag_ for t in doc]
    doc_tensor = tagger_tok2vec.predict([doc])

    # replace listener and verify predictions are still the same
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    assert (nlp.config["components"]["tagger"]["model"]["tok2vec"]
            ["@architectures"] == "spacy-transformers.Tok2VecTransformer.v1")
    doc2 = nlp(text)
    assert preds == [t.tag_ for t in doc2]
    assert_equal(doc_tensor, tagger_tok2vec.predict([doc2]))
    # attempt training with the new pipeline
    optimizer = nlp.resume_training()
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tagger"] > 0.0
Example #8
def test_pretraining_tok2vec_characters(objective):
    """Test that pretraining works with the character objective"""
    config = Config().from_str(pretrain_string_listener)
    config["pretraining"]["objective"] = objective
    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
    filled = nlp.config
    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    filled = pretrain_config.merge(filled)
    with make_tempdir() as tmp_dir:
        file_path = write_sample_jsonl(tmp_dir)
        filled["paths"]["raw_text"] = file_path
        filled = filled.interpolate()
        assert filled["pretraining"]["component"] == "tok2vec"
        pretrain(filled, tmp_dir)
        assert Path(tmp_dir / "model0.bin").exists()
        assert Path(tmp_dir / "model4.bin").exists()
        assert not Path(tmp_dir / "model5.bin").exists()
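The `objective` argument comes from a pytest parametrization outside this excerpt; it presumably looks something like the following (the exact values are assumptions):

import pytest

# Hypothetical parametrization over the character objective.
CHAR_OBJECTIVES = [
    {},
    {"@architectures": "spacy.PretrainCharacters.v1"},
    {
        "@architectures": "spacy.PretrainCharacters.v1",
        "maxout_pieces": 5,
        "hidden_size": 42,
        "n_characters": 2,
    },
]

@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
def test_pretraining_tok2vec_characters(objective):
    ...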
Example #9
def get_third_party_dependencies(
    config: Config, exclude: List[str] = util.SimpleFrozenList()) -> List[str]:
    """If the config includes references to registered functions that are
    provided by third-party packages (spacy-transformers, other libraries), we
    want to include them in meta["requirements"] so that the package specifies
    them as dependencies and the user won't have to do it manually.

    We do this by:
    - traversing the config to check for registered function (@ keys)
    - looking up the functions and getting their module
    - looking up the module version and generating an appropriate version range

    config (Config): The pipeline config.
    exclude (list): List of packages to exclude (e.g. that already exist in meta).
    RETURNS (list): The versioned requirements.
    """
    own_packages = ("spacy", "spacy-legacy", "spacy-nightly", "thinc", "srsly")
    distributions = util.packages_distributions()
    funcs = defaultdict(set)
    for path, value in util.walk_dict(config):
        # collect all function references by registry
        if path[-1].startswith("@"):
            funcs[path[-1][1:]].add(value)
    for component in config.get("components", {}).values():
        if "factory" in component:
            funcs["factories"].add(component["factory"])
    modules = set()
    for reg_name, func_names in funcs.items():
        for func_name in func_names:
            func_info = util.registry.find(reg_name, func_name)
            module_name = func_info.get("module")
            if module_name:  # the code is part of a module, not a --code file
                modules.add(func_info["module"].split(".")[0])
    dependencies = []
    for module_name in modules:
        if module_name in distributions:
            dist = distributions.get(module_name)
            if dist:
                pkg = dist[0]
                if pkg in own_packages or pkg in exclude:
                    continue
                version = util.get_package_version(pkg)
                version_range = util.get_minor_version_range(version)
                dependencies.append(f"{pkg}{version_range}")
    return dependencies
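As a usage sketch: called on a pipeline config that references spacy-transformers layers, the function returns pinned requirement strings (the file name and version range below are illustrative, not real output):

config = util.load_config("config.cfg")  # a config referencing spacy-transformers
print(get_third_party_dependencies(config))
# e.g. ['spacy-transformers>=1.1.0,<1.2.0']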
Example #10
def test_pretraining_training():
    """Test that training can use a pretrained Tok2Vec model"""
    config = Config().from_str(pretrain_string_internal)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
    filled = nlp.config
    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    filled = pretrain_config.merge(filled)
    train_config = util.load_config(DEFAULT_CONFIG_PATH)
    filled = train_config.merge(filled)
    with make_tempdir() as tmp_dir:
        pretrain_dir = tmp_dir / "pretrain"
        pretrain_dir.mkdir()
        file_path = write_sample_jsonl(pretrain_dir)
        filled["paths"]["raw_text"] = file_path
        filled["pretraining"]["component"] = "tagger"
        filled["pretraining"]["layer"] = "tok2vec"
        train_dir = tmp_dir / "train"
        train_dir.mkdir()
        train_path, dev_path = write_sample_training(train_dir)
        filled["paths"]["train"] = train_path
        filled["paths"]["dev"] = dev_path
        filled = filled.interpolate()
        P = filled["pretraining"]
        nlp_base = init_nlp(filled)
        model_base = (nlp_base.get_pipe(P["component"]).model.get_ref(
            P["layer"]).get_ref("embed"))
        embed_base = None
        for node in model_base.walk():
            if node.name == "hashembed":
                embed_base = node
        pretrain(filled, pretrain_dir)
        pretrained_model = Path(pretrain_dir / "model3.bin")
        assert pretrained_model.exists()
        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
        nlp = init_nlp(filled)
        model = nlp.get_pipe(P["component"]).model.get_ref(
            P["layer"]).get_ref("embed")
        embed = None
        for node in model.walk():
            if node.name == "hashembed":
                embed = node
        # ensure that the tok2vec weights are actually changed by the pretraining
        assert np.any(
            np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
        train(nlp, train_dir)
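The `write_sample_jsonl` and `write_sample_training` helpers used throughout these pretraining tests are defined elsewhere. A minimal sketch of the first (the sample texts are placeholders):

import srsly

def write_sample_jsonl(tmp_dir):
    # Write a few raw-text records in the JSONL format the pretrainer reads.
    data = [
        {"text": "This is the best TV you'll ever buy!"},
        {"text": "I wouldn't recommend this one to anyone."},
    ]
    file_path = f"{tmp_dir}/text.jsonl"
    srsly.write_jsonl(file_path, data)
    return file_path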
Example #11
def test_serialize_nlp():
    """ Create a custom nlp pipeline from config and ensure it serializes correctly """
    nlp_config = Config().from_str(nlp_config_string)
    nlp = load_model_from_config(nlp_config, auto_fill=True)
    nlp.get_pipe("tagger").add_label("A")
    nlp.initialize()
    assert "tok2vec" in nlp.pipe_names
    assert "tagger" in nlp.pipe_names
    assert "parser" not in nlp.pipe_names
    assert nlp.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert "tok2vec" in nlp2.pipe_names
        assert "tagger" in nlp2.pipe_names
        assert "parser" not in nlp2.pipe_names
        assert nlp2.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342
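`nlp_config_string` is a fixture outside this excerpt. A sketch consistent with the assertions above, assuming a shared tok2vec whose width (342) the tagger's listener inherits:

# Hypothetical stand-in for the nlp_config_string fixture.
nlp_config_string = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 342
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}
"""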
Example #12
def test_annotating_components_from_config(config_str):
    @registry.readers("unannotated_corpus")
    def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
        return UnannotatedCorpus()

    class UnannotatedCorpus:
        def __call__(self, nlp: Language) -> Iterator[Example]:
            for text in ["a a", "b b", "c c"]:
                doc = nlp.make_doc(text)
                yield Example(doc, doc)

    orig_config = Config().from_str(config_str)
    nlp = load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.config["training"]["annotating_components"] == ["sentencizer"]
    train(nlp)

    nlp.config["training"]["annotating_components"] = []
    with pytest.raises(ValueError):
        train(nlp)
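The parametrized `config_str` isn't shown; a plausible sketch declares the sentencizer as an annotating component and points both corpora at the registered reader (the details are assumptions):

config_str = """
[nlp]
lang = "en"
pipeline = ["sentencizer","tagger"]

[components]

[components.sentencizer]
factory = "sentencizer"

[components.tagger]
factory = "tagger"

[corpora]

[corpora.train]
@readers = "unannotated_corpus"

[corpora.dev]
@readers = "unannotated_corpus"

[training]
annotating_components = ["sentencizer"]
"""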
Example #13
def test_create_nlp_from_config_multiple_instances():
    """Test that the nlp object is created correctly for a config with multiple
    instances of the same component."""
    config = Config().from_str(nlp_config_string)
    config["components"] = {
        "t2v": config["components"]["tok2vec"],
        "tagger1": config["components"]["tagger"],
        "tagger2": config["components"]["tagger"],
    }
    config["nlp"]["pipeline"] = list(config["components"].keys())
    nlp = load_model_from_config(config, auto_fill=True)
    assert nlp.pipe_names == ["t2v", "tagger1", "tagger2"]
    assert nlp.get_pipe_meta("t2v").factory == "tok2vec"
    assert nlp.get_pipe_meta("tagger1").factory == "tagger"
    assert nlp.get_pipe_meta("tagger2").factory == "tagger"
    pipeline_config = nlp.config["components"]
    assert len(pipeline_config) == 3
    assert list(pipeline_config.keys()) == ["t2v", "tagger1", "tagger2"]
    assert nlp.config["nlp"]["pipeline"] == ["t2v", "tagger1", "tagger2"]
Example #14
def test_transformer_pipeline_tagger():
    """Test that a pipeline with just a transformer+tagger runs and trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["transformer", "tagger"]
    tagger = nlp.get_pipe("tagger")
    transformer = nlp.get_pipe("transformer")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(tagger_trf, TransformerListener)
    assert tagger_trf.upstream_name == "custom_upstream"
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # Check that the Transformer component finds its listeners
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert tagger_trf in transformer.listeners

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("We're interested at underwater basket weaving.")
    doc_tensor = tagger_trf.predict([doc])
    assert_equal(doc._.trf_data.tensors, doc_tensor[0].tensors)

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc = nlp2("We're interested at underwater basket weaving.")
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc])
        assert_equal(doc_tensor2[0].tensors, doc_tensor[0].tensors)
Example #15
def test_replace_listeners_from_config():
    orig_config = Config().from_str(cfg_string_multi)
    nlp = util.load_model_from_config(orig_config, auto_fill=True)
    annots = {"tags": ["V", "Z"], "entities": [(0, 1, "A"), (1, 2, "B")]}
    examples = [Example.from_dict(nlp.make_doc("x y"), annots)]
    nlp.initialize(lambda: examples)
    tok2vec = nlp.get_pipe("tok2vec")
    tagger = nlp.get_pipe("tagger")
    ner = nlp.get_pipe("ner")
    assert tok2vec.listening_components == ["tagger", "ner"]
    assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
    assert any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
    with make_tempdir() as dir_path:
        nlp.to_disk(dir_path)
        base_model = str(dir_path)
        new_config = {
            "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
            "components": {
                "tok2vec": {"source": base_model},
                "tagger": {
                    "source": base_model,
                    "replace_listeners": ["model.tok2vec"],
                },
                "ner": {"source": base_model},
            },
        }
        new_nlp = util.load_model_from_config(new_config, auto_fill=True)
    new_nlp.initialize(lambda: examples)
    tok2vec = new_nlp.get_pipe("tok2vec")
    tagger = new_nlp.get_pipe("tagger")
    ner = new_nlp.get_pipe("ner")
    assert tok2vec.listening_components == ["ner"]
    assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
    assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
    t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
    assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
    assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
    assert (
        new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"]
        == "spacy.Tok2VecListener.v1"
    )
Example #16
def test_transformer_pipeline_empty():
    """Test that the pipeline doesn't fail with empty input"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    tagger = nlp.get_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # train on empty doc
    optimizer = nlp.initialize()
    losses = {}
    empty_train_example = Example.from_dict(nlp.make_doc(""), {})
    nlp.update(train_examples, sgd=optimizer, losses=losses)
    nlp.update([empty_train_example], sgd=optimizer, losses=losses)
    train_examples.append(empty_train_example)
    nlp.update(train_examples, sgd=optimizer, losses=losses)

    # predict empty doc
    doc = nlp("")
    _assert_empty(doc._.trf_data)
    docs = nlp.pipe(["", ""])
    for doc in docs:
        _assert_empty(doc._.trf_data)
    nlp.pipe([])

    # predict combination of empty and non-empty
    doc = nlp("This is a sentence")
    normal_tags = [t.tag_ for t in doc]

    docs = list(nlp.pipe(["", "This is a sentence", "", ""]))
    _assert_empty(docs[0]._.trf_data)
    assert [t.tag_ for t in docs[0]] == []
    assert [t.tag_ for t in docs[1]] == normal_tags
    _assert_empty(docs[2]._.trf_data)
    _assert_empty(docs[3]._.trf_data)
Example #17
@pytest.mark.parametrize("d", [".", ":"])
def test_config_interpolation(d):
    """Test that config values are interpolated correctly. The parametrized
    value is the final divider (${a.b} vs. ${a:b}). Both should now work and be
    valid. The double {{ }} in the config strings are required to prevent the
    references from being interpreted as an actual f-string variable.
    """
    c_str = """[a]\nfoo = "hello"\n\n[b]\nbar = ${foo}"""
    with pytest.raises(ConfigValidationError):
        Config().from_str(c_str)
    c_str = f"""[a]\nfoo = "hello"\n\n[b]\nbar = ${{a{d}foo}}"""
    assert Config().from_str(c_str)["b"]["bar"] == "hello"
    c_str = f"""[a]\nfoo = "hello"\n\n[b]\nbar = ${{a{d}foo}}!"""
    assert Config().from_str(c_str)["b"]["bar"] == "hello!"
    c_str = f"""[a]\nfoo = "hello"\n\n[b]\nbar = "${{a{d}foo}}!\""""
    assert Config().from_str(c_str)["b"]["bar"] == "hello!"
    c_str = f"""[a]\nfoo = 15\n\n[b]\nbar = ${{a{d}foo}}!"""
    assert Config().from_str(c_str)["b"]["bar"] == "15!"
    c_str = f"""[a]\nfoo = ["x", "y"]\n\n[b]\nbar = ${{a{d}foo}}"""
    assert Config().from_str(c_str)["b"]["bar"] == ["x", "y"]
    # Interpolation within the same section
    c_str = f"""[a]\nfoo = "x"\nbar = ${{a{d}foo}}\nbaz = "${{a{d}foo}}y\""""
    assert Config().from_str(c_str)["a"]["bar"] == "x"
    assert Config().from_str(c_str)["a"]["baz"] == "xy"
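Outside the test, the same interpolation works directly on any config string; both the dotted and the colon form resolve identically:

from thinc.api import Config

c = Config().from_str('[a]\nfoo = "hello"\n\n[b]\nbar = ${a.foo}\nbaz = ${a:foo}')
assert c["b"]["bar"] == "hello"
assert c["b"]["baz"] == "hello"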
Example #18
def test_config_custom_sort_preserve():
    """Test that sort order is preserved when merging and copying configs,
    or when configs are filled and resolved."""
    cfg = {"x": {}, "y": {}, "z": {}}
    section_order = ["y", "z", "x"]
    expected = "[y]\n\n[z]\n\n[x]"
    config = Config(cfg, section_order=section_order)
    assert config.to_str() == expected
    config2 = config.copy()
    assert config2.to_str() == expected
    config3 = config.merge({"a": {}})
    assert config3.to_str() == f"{expected}\n\n[a]"
    config4 = Config(config)
    assert config4.to_str() == expected
    config_str = """[a]\nb = 1\n[c]\n@cats = "catsie.v1"\nevil = true\n\n[t]\n x = 2"""
    section_order = ["c", "a", "t"]
    config5 = Config(section_order=section_order).from_str(config_str)
    assert list(config5.keys()) == section_order
    filled = my_registry.fill_config(config5)
    assert filled.section_order == section_order
Example #19
def test_positional_args_to_from_string():
    cfg = """[a]\nb = 1\n* = ["foo","bar"]"""
    assert Config().from_str(cfg).to_str() == cfg
    cfg = """[a]\nb = 1\n\n[a.*.foo]\ntest = 1\n\n[a.*.bar]\ntest = 2"""
    assert Config().from_str(cfg).to_str() == cfg

    @my_registry.cats("catsie.v666")
    def catsie_666(*args, meow=False):
        return args

    cfg = """[a]\n@cats = "catsie.v666"\n* = ["foo","bar"]"""
    filled = my_registry.fill_config(Config().from_str(cfg)).to_str()
    assert filled == """[a]\n@cats = "catsie.v666"\n* = ["foo","bar"]\nmeow = false"""
    assert my_registry.make_from_config(Config().from_str(cfg)) == {
        "a": ("foo", "bar")
    }
    cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\nx = 1"""
    filled = my_registry.fill_config(Config().from_str(cfg)).to_str()
    assert filled == """[a]\n@cats = "catsie.v666"\nmeow = false\n\n[a.*.foo]\nx = 1"""
    assert my_registry.make_from_config(Config().from_str(cfg)) == {
        "a": ({
            "x": 1
        }, )
    }

    @my_registry.cats("catsie.v777")
    def catsie_777(y: int = 1):
        return "meow" * y

    cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\n@cats = "catsie.v777\""""
    filled = my_registry.fill_config(Config().from_str(cfg)).to_str()
    expected = """[a]\n@cats = "catsie.v666"\nmeow = false\n\n[a.*.foo]\n@cats = "catsie.v777"\ny = 1"""
    assert filled == expected
    cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\n@cats = "catsie.v777"\ny = 3"""
    result = my_registry.make_from_config(Config().from_str(cfg))
    assert result == {"a": ("meowmeowmeow", )}
Example #20
def main(numpy: bool = False,
         pytorch: bool = False,
         generic: bool = False,
         gpu_id: int = -1):
    global CONFIG
    fix_random_seed(0)
    if gpu_id >= 0:
        require_gpu(gpu_id)
        print("Set GPU", gpu_id)
    backends = {"pytorch": pytorch, "numpy": numpy, "generic": generic}
    for name, use_backend in backends.items():
        if not use_backend:
            print(f"Skipping {name}")
            continue
        set_backend(name, gpu_id)
        print("Getting data")
        C = registry.resolve(Config().from_str(CONFIG))
        model = C["model"]
        X, Y = get_dummy_data(**C["data"])
        print("Copy to device")
        X = [model.ops.asarray(x) for x in X]
        Y = [model.ops.asarray(y) for y in Y]
        print("Begin init", len(X))
        model.initialize(X=X[:5])
        print("Pre-batch")
        n_words = sum(len(x) for x in X)
        batches = model.ops.multibatch(16, X, Y)
        batches = [(model.layers[0].predict(x), y) for x, y in batches]
        model.layers.pop(0)
        print("Start")
        start_time = timer()
        total = run_forward(model, [x for x, y in batches])
        end_time = timer()
        print(name, n_words, total, end_time - start_time)
        start_time = timer()
        total = run_forward_backward(model, batches)
        end_time = timer()
        print(name, n_words, total, end_time - start_time)
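`run_forward` and `run_forward_backward` are benchmark helpers defined elsewhere in the script. A plausible sketch, assuming they return a checksum of the outputs so the work can't be optimized away:

def _checksum(Y):
    # Y may be a single array or a list of arrays, depending on the model.
    if isinstance(Y, list):
        return sum(float(y.sum()) for y in Y)
    return float(Y.sum())

def run_forward(model, Xs):
    total = 0.0
    for X in Xs:
        total += _checksum(model.predict(X))
    return total

def run_forward_backward(model, batches):
    total = 0.0
    for X, Y in batches:
        Yh, backprop = model.begin_update(X)
        backprop(Yh)
        total += _checksum(Yh)
    return total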
Example #21
def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    examples = [Example.from_dict(nlp.make_doc("x y"), {"tags": ["V", "Z"]})]
    nlp.initialize(lambda: examples)
    tok2vec = nlp.get_pipe("tok2vec")
    tagger = nlp.get_pipe("tagger")
    assert isinstance(tagger.model.layers[0], Tok2VecListener)
    assert tok2vec.listener_map["tagger"][0] == tagger.model.layers[0]
    assert (
        nlp.config["components"]["tok2vec"]["model"]["@architectures"]
        == "spacy.Tok2Vec.v2"
    )
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy.Tok2VecListener.v1"
    )
    nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
    assert not isinstance(tagger.model.layers[0], Tok2VecListener)
    t2v_cfg = nlp.config["components"]["tok2vec"]["model"]
    assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
    assert nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
    with pytest.raises(ValueError):
        nlp.replace_listeners("invalid", "tagger", ["model.tok2vec"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("tok2vec", "parser", ["model.tok2vec"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("tok2vec", "tagger", ["model.yolo"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec", "model.yolo"])
    # attempt training with the new pipeline
    optimizer = nlp.initialize(lambda: examples)
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tok2vec"] == 0.0
        assert losses["tagger"] > 0.0
Example #22
def test_config_deep_merge():
    config = {"a": "hello", "b": {"c": "d"}}
    defaults = {"a": "world", "b": {"c": "e", "f": "g"}}
    merged = Config(defaults).merge(config)
    assert len(merged) == 2
    assert merged["a"] == "hello"
    assert merged["b"] == {"c": "d", "f": "g"}
    config = {"a": "hello", "b": {"@test": "x", "foo": 1}}
    defaults = {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100}
    merged = Config(defaults).merge(config)
    assert len(merged) == 3
    assert merged["a"] == "hello"
    assert merged["b"] == {"@test": "x", "foo": 1, "bar": 2}
    assert merged["c"] == 100
    config = {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100}
    defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
    merged = Config(defaults).merge(config)
    assert len(merged) == 3
    assert merged["a"] == "hello"
    assert merged["b"] == {"@test": "x", "foo": 1}
    assert merged["c"] == 100
    # Test that leaving out the factory just adds to existing
    config = {"a": "hello", "b": {"foo": 1}, "c": 100}
    defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
    merged = Config(defaults).merge(config)
    assert len(merged) == 3
    assert merged["a"] == "hello"
    assert merged["b"] == {"@test": "y", "foo": 1, "bar": 2}
    assert merged["c"] == 100
    # Test that switching to a different factory prevents the default from being added
    config = {"a": "hello", "b": {"@foo": 1}, "c": 100}
    defaults = {"a": "world", "b": {"@bar": "y"}}
    merged = Config(defaults).merge(config)
    assert len(merged) == 3
    assert merged["a"] == "hello"
    assert merged["b"] == {"@foo": 1}
    assert merged["c"] == 100
    config = {"a": "hello", "b": {"@foo": 1}, "c": 100}
    defaults = {"a": "world", "b": "y"}
    merged = Config(defaults).merge(config)
    assert len(merged) == 3
    assert merged["a"] == "hello"
    assert merged["b"] == {"@foo": 1}
    assert merged["c"] == 100
Example #23
def get_nlps(language: str, *, add_coreferee: bool = True) -> list:
    """ Returns a list of *nlp* objects to use when testing the functionality for *language*.
        The list contains the latest versions of the spaCy models named in the config file.
        Note that if this method is called with *add_coreferee=False*, this setting will apply
        to all future calls within the same process space. This means that *add_coreferee=False*
        is only appropriate during development of rules tests and before any smoke tests are
        required."""
    with lock:
        if language not in language_to_nlps:
            relative_config_filename = sep.join(
                ('lang', language, 'config.cfg'))
            if not pkg_resources.resource_exists('coreferee',
                                                 relative_config_filename):
                raise LanguageNotSupportedError(language)
            absolute_config_filename = pkg_resources.resource_filename(
                __name__, relative_config_filename)
            config = Config().from_disk(absolute_config_filename)
            model_set = set()
            for config_entry in config:
                model_set.add('_'.join(
                    (language, config[config_entry]['model'])))
            nlps = []
            for model in model_set:
                # At present we presume there will never be an entry in the config file that
                # specifies a model name that can no longer be loaded. This seems a reasonable
                # assumption, but if it no longer applies this code will need to be changed in the
                # future.
                nlp = spacy.load(model)
                if add_coreferee:
                    nlp.add_pipe('coreferee')
                nlps.append(nlp)
            nlps = sorted(nlps,
                          key=lambda nlp:
                          (nlp.meta['name'], nlp.meta['version']))
            language_to_nlps[language] = nlps
        return language_to_nlps[language]
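Typical usage in a test module, assuming the models listed in the language's config file are installed:

# Load every configured English model once, with the coreferee pipe added.
nlps = get_nlps("en")
for nlp in nlps:
    assert "coreferee" in nlp.pipe_names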
Example #24
def test_config_to_str_roundtrip():
    cfg = {"cfg": {"foo": False}}
    config_str = Config(cfg).to_str()
    assert config_str == "[cfg]\nfoo = false"
    config = Config().from_str(config_str)
    assert dict(config) == cfg
    cfg = {"cfg": {"foo": "false"}}
    config_str = Config(cfg).to_str()
    assert config_str == '[cfg]\nfoo = "false"'
    config = Config().from_str(config_str)
    assert dict(config) == cfg
    # Bad non-serializable value
    cfg = {"cfg": {"x": numpy.asarray([[1, 2, 3, 4], [4, 5, 3, 4]], dtype="f")}}
    config = Config(cfg)
    with pytest.raises(ConfigValidationError):
        config.to_str()
    # Roundtrip with variables: preserve variables correctly (quoted/unquoted)
    config_str = """[a]\nb = 1\n\n[c]\nd = ${a:b}\ne = \"hello${a:b}"\nf = "${a:b}\""""
    config = Config().from_str(config_str, interpolate=False)
    assert config.to_str() == config_str
Example #25
from ..vocab import Vocab
from ..language import Language
from ..errors import Errors

default_model_config = """
[model]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "tok2vec", assigns=["doc.tensor"], default_config={"model": DEFAULT_TOK2VEC_MODEL}
)
def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
    return Tok2Vec(nlp.vocab, model, name)


class Tok2Vec(TrainablePipe):
    """Apply a "token-to-vector" model and set its outputs in the doc.tensor
    attribute. This is mostly useful to share a single subnetwork between multiple
    components, e.g. to have one embedding and CNN network shared between a
    parser, tagger and NER.
Example #26
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
"""

DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]


@runtime_checkable
class Suggester(Protocol):
    def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
        ...


@registry.misc("spacy.ngram_suggester.v1")
def build_ngram_suggester(sizes: List[int]) -> Suggester:
    """Suggest all spans of the given lengths. Spans are returned as a ragged
    array of integers. The array has two columns, indicating the start and end
    position."""

    def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
Example #27
    from thinc.api import Config

    defaults = Config().from_str("""
    [Yake]
    window = 2
    lemmatize = false
    candidate_selection = ngram
    
    [TextRank]
    window = 3
    alpha = 0.85
    tol = 1.0e-6
    candidate_selection = chunk
    
    [PositionRank]
    window = 10
    alpha = 0.85
    tol = 1.0e-5
    normalize = false
    candidate_selection = chunk
    
    [TopicRank]
    clustering_method = average
    distance_metric = jaccard
    threshold = 0.74
    alpha = 0.85
    tol = 1.0e-6
    heuristic = null
    candidate_selection = chunk
    """)

    @Language.factory("yake", default_config=defaults["Yake"])
Example #28
def test_create_nlp_from_pretraining_config():
    """Test that the default pretraining config validates properly"""
    config = Config().from_str(pretrain_config_string)
    pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    filled = config.merge(pretrain_config)
    registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
Example #29
default_model_config = """
[model]
@architectures = "spacy.EntityLinker.v1"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "entity_linker",
    requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
    assigns=["token.ent_kb_id"],
    default_config={
        "model": DEFAULT_NEL_MODEL,
        "labels_discard": [],
        "incl_prior": True,
        "incl_context": True,
        "entity_vector_length": 64,
        "get_candidates": {
            "@misc": "spacy.CandidateGenerator.v1"
        },
Example #30
[transformer.set_extra_annotations]
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"

[transformer.model]
@architectures = "spacy-transformers.TransformerModel.v2"
name = "roberta-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}

[transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
"""

DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR)
DOC_EXT_ATTR = "trf_data"


@Language.factory(
    "transformer",
    assigns=[f"doc._.{DOC_EXT_ATTR}"],
    default_config=DEFAULT_CONFIG["transformer"],
)
def make_transformer(
    nlp: Language,
    name: str,
    model: Model[List[Doc], FullTransformerBatch],
    set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None],
    max_batch_items: int,
):