コード例 #1
0
ファイル: test_serialize.py プロジェクト: zhuikonger/thinc
def test_serialize_attrs():
    """Exercise serialize_attr/deserialize_attr on a plain string attr and on
    a SerializableAttr once custom handlers are registered for it."""

    def fwd(model, X, is_train):
        return X, lambda dY: dY

    # Plain string attr: the serialized form must equal its msgpack dump.
    attrs = {"test": "foo"}
    str_model = Model("test", fwd, attrs=attrs).initialize()
    bytes_attr = serialize_attr(
        str_model.attrs["test"], attrs["test"], "test", str_model
    )
    assert bytes_attr == srsly.msgpack_dumps("foo")
    empty_model = Model("test", fwd, attrs={"test": ""})
    result = deserialize_attr(
        empty_model.attrs["test"], bytes_attr, "test", empty_model
    )
    assert result == "foo"

    # Objects with custom serialization functions: register handlers that
    # delegate to the attr's own to_bytes / from_bytes.
    @serialize_attr.register(SerializableAttr)
    def serialize_attr_custom(_, value, name, model):
        return value.to_bytes()

    @deserialize_attr.register(SerializableAttr)
    def deserialize_attr_custom(_, value, name, model):
        return SerializableAttr().from_bytes(value)

    attrs = {"test": SerializableAttr()}
    source_model = Model("test", fwd, attrs=attrs)
    bytes_attr = serialize_attr(
        source_model.attrs["test"], attrs["test"], "test", source_model
    )
    assert bytes_attr == b"foo"
    target_model = Model("test", fwd, attrs=attrs)
    assert target_model.attrs["test"].value == "foo"
    result = deserialize_attr(
        target_model.attrs["test"], bytes_attr, "test", target_model
    )
    assert result.value == "foo from bytes"
コード例 #2
0
ファイル: test_serialize.py プロジェクト: zhuikonger/thinc
def test_serialize_model_shims_roundtrip_bytes():
    """Serialize a chain that contains a shim-carrying model and load the
    bytes back into a freshly built chain of the same layout."""

    def fwd(model, X, is_train):
        return X, lambda dY: dY

    shim_model = Model("shimmodel", fwd, shims=[SerializableShim(None)])
    model = chain(Linear(2, 3), shim_model, Maxout(2, 3))
    model.initialize()
    assert model.layers[1].shims[0].value == "shimdata"
    model_bytes = model.to_bytes()

    # Loading the chain's bytes into a lone Linear layer must be rejected.
    with pytest.raises(ValueError):
        Linear(2, 3).from_bytes(model_bytes)

    fresh_shim_model = Model("shimmodel", fwd, shims=[SerializableShim(None)])
    fresh_chain = chain(Linear(2, 3), fresh_shim_model, Maxout(2, 3))
    new_model = fresh_chain.from_bytes(model_bytes)
    assert new_model.layers[1].shims[0].value == "shimdata from bytes"
コード例 #3
0
def TransformerModel(
    name: str, get_spans: Callable, tokenizer_config: dict = {}, transformer_config: dict = {}
) -> Model[List[Doc], FullTransformerBatch]:
    """Create the "transformer" Model with its configuration stored in attrs.

    name (str): Stored under the "name" attr.
    get_spans (Callable[[List[Doc]], List[Span]]):
        A function to extract spans from the batch of Doc objects.
        This is used to manage long documents, by cutting them into smaller
        sequences before running the transformer. The spans are allowed to
        overlap, and you can also omit sections of the Doc if they are not
        relevant.
    tokenizer_config (dict): Settings to pass to the transformers tokenizer.
    transformer_config (dict): Settings to pass to the transformers forward pass.

    Both config dicts are shallow-copied before being stored, so the
    module-level default `{}` (or a dict the caller keeps using) is never
    aliased by the model's attrs.
    """
    return Model(
        "transformer",
        forward,
        init=init,
        layers=[],
        dims={"nO": None},
        attrs={
            "tokenizer": None,
            "get_spans": get_spans,
            "name": name,
            # Copy: the signature defaults are single shared dict objects.
            "tokenizer_config": dict(tokenizer_config),
            "transformer_config": dict(transformer_config),
            "set_transformer": set_pytorch_transformer,
            "has_transformer": False,
            "flush_cache_chance": 0.0,
            "replace_listener": replace_listener,
            "replace_listener_cfg": replace_listener_cfg,
        },
    )
コード例 #4
0
def trfs2arrays(
    pooling: Model[Ragged, Floats2d], grad_factor: float
) -> Model[List[TransformerData], List[Floats2d]]:
    """Pool transformer data into token-aligned tensors.

    pooling: Registered as the layer's only child.
    grad_factor: Stored under the "grad_factor" attr.
    """
    layer_attrs = {"grad_factor": grad_factor}
    return Model("trfs2arrays", forward, layers=[pooling], attrs=layer_attrs)
コード例 #5
0
ファイル: multi_task.py プロジェクト: EricM2/venv
def build_masked_language_model(
    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
) -> Model:
    """Convert a model into a BERT-style masked language model.

    The returned model has `wrapped_model` as its single layer (also
    reachable via the "wrapped" ref) and declares the same dim names.
    """
    random_words = _RandomWords(vocab)

    def mlm_forward(model, docs, is_train):
        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
        n_rows = mask.shape[0]
        # Column vector so it broadcasts across the output's second axis.
        mask = model.ops.asarray(mask).reshape((n_rows, 1))
        inner = model.layers[0]
        output, backprop = inner(docs, is_train)

        def mlm_backward(d_output):
            # Scales the incoming gradient in place by (1 - mask).
            d_output *= 1 - mask
            return backprop(d_output)

        return output, mlm_backward

    def mlm_initialize(model: Model, X=None, Y=None):
        wrapped = model.layers[0]
        wrapped.initialize(X=X, Y=Y)
        # Mirror every dim the wrapped layer has a value for.
        for dim in wrapped.dim_names:
            if not wrapped.has_dim(dim):
                continue
            model.set_dim(dim, wrapped.get_dim(dim))

    unset_dims = dict.fromkeys(wrapped_model.dim_names)
    mlm_model = Model(
        "masked-language-model",
        mlm_forward,
        layers=[wrapped_model],
        init=mlm_initialize,
        refs={"wrapped": wrapped_model},
        dims=unset_dims,
    )
    mlm_model.set_ref("wrapped", wrapped_model)
    return mlm_model
コード例 #6
0
def test_replace_node():
    """replace_node must swap the node in layers, in refs, and inside
    nested layers."""

    def passthrough(X):
        return X, lambda dY: dY

    relu1 = Relu(5)
    relu2 = Relu(5)
    relu_chain = chain(relu1, relu2)
    relu1_debug = with_debug(relu1)
    debug = Model(
        "test",
        passthrough,
        layers=[relu1, relu2, relu1, relu_chain],
        refs={"relu1": relu1, "relu2": relu2, "relu3": relu1},
    )
    debug.replace_node(relu1, relu1_debug)

    # Direct layers: both occurrences of relu1 are replaced, relu2 is kept.
    for index, expected in enumerate([relu1_debug, relu2, relu1_debug]):
        assert debug.layers[index] == expected

    # Refs pointing at relu1 now point at the replacement.
    expected_refs = (
        ("relu1", relu1_debug),
        ("relu2", relu2),
        ("relu3", relu1_debug),
    )
    for ref_name, expected in expected_refs:
        assert debug.get_ref(ref_name) == expected

    # Check that nodes are replaced recursively
    nested = debug.layers[3]
    assert nested == relu_chain
    assert nested.layers[0] == relu1_debug
    assert nested.layers[1] == relu2
コード例 #7
0
def TransitionModel(tok2vec,
                    lower,
                    upper,
                    resize_output,
                    dropout=0.2,
                    unseen_classes=set()):
    """Set up a stepwise transition-based model.

    When `upper` is None a `noop()` layer takes its place and the
    "has_upper" attr is False. `unseen_classes` is copied into a new set
    before being stored. `dropout` is not read in this function body.
    """
    has_upper = upper is not None
    if not has_upper:
        upper = noop()

    # don't define nO for this object, because we can't dynamically change it
    dims = {"nI": tok2vec.maybe_get_dim("nI")}
    refs = {"tok2vec": tok2vec, "lower": lower, "upper": upper}
    attrs = {
        "has_upper": has_upper,
        "unseen_classes": set(unseen_classes),
        "resize_output": resize_output,
    }
    return Model(
        name="parser_model",
        forward=forward,
        dims=dims,
        layers=[tok2vec, lower, upper],
        refs=refs,
        init=init,
        attrs=attrs,
    )
コード例 #8
0
def get_array_model():
    """Return a "trimarray" layer wrapped in with_array2d.

    Forward drops the last column of X; backward returns a freshly
    allocated array with one more column than dY.
    """

    def _trim_array_forward(model, X, is_train):
        trimmed = X[:, :-1]

        def backprop(dY):
            n_rows = dY.shape[0]
            n_cols = dY.shape[1] + 1
            return model.ops.alloc2f(n_rows, n_cols)

        return trimmed, backprop

    trim_layer = Model("trimarray", _trim_array_forward)
    return with_array2d(trim_layer)
コード例 #9
0
def trfs2arrays(
        pooling: Model[Ragged, Floats2d],
        grad_factor: float) -> Model[List[TransformerData], List[Floats2d]]:
    """Build the "trfs2arrays" layer.

    pooling: Registered as the layer's only child.
    grad_factor: Stored under the "grad_factor" attr.
    """
    child_layers = [pooling]
    layer_attrs = {"grad_factor": grad_factor}
    return Model("trfs2arrays", forward, layers=child_layers, attrs=layer_attrs)
コード例 #10
0
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
    """Build the "precomputable_affine" layer.

    The four dims are stored as given; params "W", "b" and "pad" are
    declared without values; `dropout` is stored as the "dropout_rate" attr.
    """
    dims = {"nO": nO, "nI": nI, "nF": nF, "nP": nP}
    unset_params = dict.fromkeys(("W", "b", "pad"))
    return Model(
        "precomputable_affine",
        forward,
        init=init,
        dims=dims,
        params=unset_params,
        attrs={"dropout_rate": dropout},
    )
コード例 #11
0
def get_ragged_model():
    """Return a "trimragged" layer wrapped in with_ragged.

    Forward drops the last column of the Ragged's data; backward returns a
    Ragged whose freshly allocated data has one more column than the
    incoming gradient, with the gradient's lengths.
    """

    def _trim_ragged_forward(model, Xr, is_train):
        trimmed = Ragged(Xr.data[:, :-1], Xr.lengths)

        def backprop(dYr):
            grad = dYr.data
            widened = model.ops.alloc2f(grad.shape[0], grad.shape[1] + 1)
            return Ragged(widened, dYr.lengths)

        return trimmed, backprop

    trim_layer = Model("trimragged", _trim_ragged_forward)
    return with_ragged(trim_layer)
コード例 #12
0
ファイル: test_serialize.py プロジェクト: zhuikonger/thinc
def test_serialize_refs_roundtrip_bytes():
    """Check ref handling in to_bytes/from_bytes, including the error cases
    where the referenced model is not among the layers."""

    def fwd(model, X, is_train):
        return X, lambda dY: dY

    model_a = Model("a", fwd)

    # Ref "a" points at a model that is not one of the layers.
    model = Model("test", fwd, refs={"a": model_a, "b": None}).initialize()
    with pytest.raises(ValueError):  # ref not in nodes
        model.to_bytes()

    # Same refs, but now model_a is also a layer.
    model = Model(
        "test", fwd, refs={"a": model_a, "b": None}, layers=[model_a]
    )
    assert model.ref_names == ("a", "b")
    model_bytes = model.to_bytes()

    # A target without the layer cannot load these bytes.
    with pytest.raises(ValueError):
        Model("test", fwd).from_bytes(model_bytes)

    new_model = Model("test", fwd, layers=[model_a])
    new_model.from_bytes(model_bytes)
    assert new_model.ref_names == ("a", "b")
コード例 #13
0
ファイル: util.py プロジェクト: wolfju/spacy-transformers
def DummyTransformerModel(width: int, depth: int):
    """Build a "dummy-transformer" layer that emits `depth` zero tensors.

    `width` and `depth` are stored as attrs and read back on every forward
    call. The backward callback returns the input tokens unchanged.
    """

    def _forward(model, tokens, is_train):
        width = model.attrs["width"]
        depth = model.attrs["depth"]
        input_ids = tokens["input_ids"]
        shape = (input_ids.shape[0], input_ids.shape[1], width)
        tensors = [torch.zeros(*shape) for _ in range(depth)]

        def _backprop(d_tensors):
            return tokens

        return tensors, _backprop

    layer_attrs = {"width": width, "depth": depth}
    return Model("dummy-transformer", _forward, attrs=layer_attrs)
コード例 #14
0
ファイル: extract_spans.py プロジェクト: svlandeg/spaCy
def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
    """Extract spans from a sequence of source arrays, as specified by an array
    of (start, end) indices. The output is a ragged array of the
    extracted spans.

    The layer is created with no child layers, refs, attrs or dims.
    """
    empty_config = dict(layers=[], refs={}, attrs={}, dims={})
    return Model("extract_spans", forward, init=init, **empty_config)
コード例 #15
0
def get_list_model():
    """Return a "trimlist" layer wrapped in with_list.

    Forward drops the last column of every array in the list; backward
    returns, per gradient, a freshly allocated array with one more column.
    """

    def _trim_list_forward(model, Xs, is_train):
        trimmed = [X[:, :-1] for X in Xs]

        def backprop(dYs):
            return [
                model.ops.alloc2f(dY.shape[0], dY.shape[1] + 1) for dY in dYs
            ]

        return trimmed, backprop

    trim_layer = Model("trimlist", _trim_list_forward)
    return with_list(trim_layer)
コード例 #16
0
def HFWrapper(
    hf_model: "HFObjects",
    convert_inputs: Optional[Callable] = None,
    convert_outputs: Optional[Callable] = None,
    mixed_precision: bool = False,
    grad_scaler_config: dict = {},
) -> Model[Any, Any]:
    """Wrap a PyTorch HF model, so that it has the same API as Thinc models.
    To optimize the model, you'll need to create a PyTorch optimizer and call
    optimizer.step() after each batch. See examples/wrap_pytorch.py

    Your PyTorch model's forward method can take arbitrary args and kwargs,
    but must return either a single tensor as output or a tuple. You may find the
    PyTorch register_forward_hook helpful if you need to adapt the output.

    The convert functions are used to map inputs and outputs to and from your
    PyTorch model. Each function should return the converted output, and a callback
    to use during the backward pass. So:

        Xtorch, get_dX = convert_inputs(X)
        Ytorch, torch_backprop = model.shims[0](Xtorch, is_train)
        Y, get_dYtorch = convert_outputs(Ytorch)

    To allow maximum flexibility, the PyTorchShim expects ArgsKwargs objects
    on the way into the forward and backward passes. The ArgsKwargs objects
    will be passed straight into the model in the forward pass, and straight
    into `torch.autograd.backward` during the backward pass.

    hf_model: Passed to the HFShim that backs the returned model.
    convert_inputs: Input converter; defaults to
        convert_pytorch_default_inputs when None.
    convert_outputs: Output converter; defaults to
        convert_pytorch_default_outputs when None.
    mixed_precision: Forwarded to the HFShim.
    grad_scaler_config: Forwarded to the HFShim as a shallow copy, so the
        shared default dict (or the caller's dict) is never handed out.
    """
    if convert_inputs is None:
        convert_inputs = convert_pytorch_default_inputs
    if convert_outputs is None:
        convert_outputs = convert_pytorch_default_outputs

    return Model(
        "hf-pytorch",
        pt_forward,
        attrs={
            "convert_inputs": convert_inputs,
            "convert_outputs": convert_outputs
        },
        shims=[
            HFShim(
                hf_model,
                mixed_precision=mixed_precision,
                # Copy: the default `{}` is one object shared by every call,
                # and the shim may keep or modify what it receives.
                grad_scaler_config=dict(grad_scaler_config),
            )
        ],
        dims={
            "nI": None,
            "nO": None
        },
    )
コード例 #17
0
def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
    """Build the "charembed" layer.

    nM: Number of dimensions per character.
    nC: Number of characters.

    The output width "nO" is set to nM * nC, "nV" is fixed at 256, and the
    param "E" is declared without a value.
    """
    dims = {"nM": nM, "nC": nC, "nO": nM * nC, "nV": 256}
    return Model("charembed", forward, init=init, dims=dims, params={"E": None})
コード例 #18
0
def create_tensors(
    tok2vec: Model[List[Doc], List[Floats2d]],
    pooling: Model[Ragged, Floats2d],
    get_instances: Callable[[Doc], List[Tuple[Span, Span]]],
) -> Model[List[Doc], Floats2d]:
    """Build the "instance_tensors" layer.

    `tok2vec` and `pooling` become the child layers (in that order) and are
    also reachable through refs of the same names; `get_instances` is stored
    as an attr.
    """
    children = [tok2vec, pooling]
    named_children = {"tok2vec": tok2vec, "pooling": pooling}
    return Model(
        "instance_tensors",
        instance_forward,
        layers=children,
        refs=named_children,
        attrs={"get_instances": get_instances},
        init=instance_init,
    )
コード例 #19
0
def get_padded_model():
    """Return a "trimpadded" layer wrapped in with_padded.

    Forward drops the last entry along the final axis of the Padded data;
    backward returns a Padded whose freshly allocated data is one wider on
    that axis than the incoming gradient.
    """

    def _trim_padded_forward(model, Xp, is_train):
        def backprop(dYp):
            grad = dYp.data
            widened = model.ops.alloc3f(
                grad.shape[0], grad.shape[1], grad.shape[2] + 1
            )
            return Padded(widened, dYp.size_at_t, dYp.lengths, dYp.indices)

        assert isinstance(Xp, Padded)
        data = Xp.data
        dim0, dim1 = data.shape[0], data.shape[1]
        # Flatten the first two axes, trim the last column, then restore.
        flat = data.reshape((dim0 * dim1, data.shape[2]))
        trimmed = flat[:, :-1]
        restored = trimmed.reshape((dim0, dim1, trimmed.shape[1]))
        return Padded(restored, Xp.size_at_t, Xp.lengths, Xp.indices), backprop

    trim_layer = Model("trimpadded", _trim_padded_forward)
    return with_padded(trim_layer)
コード例 #20
0
def DummyTransformer(
        depth: int = 2,
        width: int = 4,
        get_spans=get_doc_spans) -> Model[List[Doc], FullTransformerBatch]:
    """Create a test model that produces a FullTransformerBatch object.

    The model wraps a single DummyTransformerModel layer, uses a
    DummyTokenizer, and sets its "nO" dim to `width`.
    """
    inner = DummyTransformerModel(width=width, depth=depth)
    layer_attrs = {
        "get_spans": get_spans,
        "tokenizer": DummyTokenizer(),
        "grad_factor": 1.0,
    }
    return Model(
        "dummy-transformer",
        transformer_forward,
        layers=[inner],
        attrs=layer_attrs,
        dims={"nO": width},
    )
コード例 #21
0
def DummyTransformerModel(width: int, depth: int):
    """Build a "dummy-transformer" layer returning an all-zero ModelOutput.

    `width` and `depth` are stored as attrs and read back on every forward
    call. The backward callback returns the input tokens unchanged.
    """

    def _forward(model, tokens, is_train):
        width = model.attrs["width"]
        depth = model.attrs["depth"]
        dim0 = tokens.input_ids.shape[0]
        dim1 = tokens.input_ids.shape[1]
        zeros = torch.zeros(depth, dim0, dim1, width)

        def _backprop(d_tensors):
            return tokens

        return ModelOutput(last_hidden_state=zeros), _backprop

    layer_attrs = {"width": width, "depth": depth}
    return Model("dummy-transformer", _forward, attrs=layer_attrs)
コード例 #22
0
def TransformersTokenizer(name: str) -> Model[List[List[str]], TokensPlus]:
    """Build a "tokenizer" layer around AutoTokenizer.from_pretrained(name).

    The loaded tokenizer is stored in the "tokenizer" attr; forward feeds the
    texts through its batch_encode_plus and wraps the result in TokensPlus.
    """

    def forward(
        model, texts: List[List[str]], is_train: bool
    ) -> Tuple[TokensPlus, Callable]:
        tokenizer = model.attrs["tokenizer"]
        encode_options = {
            "add_special_tokens": True,
            "return_token_type_ids": True,
            "return_attention_masks": True,
            "return_input_lengths": True,
            "return_tensors": "pt",
        }
        # Each text is paired with None as the second element.
        pairs = [(text, None) for text in texts]
        token_data = tokenizer.batch_encode_plus(pairs, **encode_options)

        def backprop(d_tokens):
            return []

        return TokensPlus(**token_data), backprop

    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return Model("tokenizer", forward, attrs={"tokenizer": loaded_tokenizer})
コード例 #23
0
def TransformerModel(
        name: str, get_spans: Callable,
        tokenizer_config: dict) -> Model[List[Doc], FullTransformerBatch]:
    """Create the "transformer" Model with its configuration stored in attrs.

    The model starts with no child layers, an unset "nO" dim, no tokenizer
    and "has_transformer" set to False.
    """
    layer_attrs = {
        "tokenizer": None,
        "get_spans": get_spans,
        "name": name,
        "tokenizer_config": tokenizer_config,
        "set_transformer": set_pytorch_transformer,
        "has_transformer": False,
    }
    return Model(
        "transformer",
        forward,
        init=init,
        layers=[],
        dims={"nO": None},
        attrs=layer_attrs,
    )
コード例 #24
0
ファイル: models.py プロジェクト: dbragdon1/textacy
def text_to_char_ngrams(
    n: int,
    max_chars: int,
    lower: bool,
) -> Model[List[str], List[List[str]]]:
    """
    Custom data type transfer thinc layer that transforms a sequence of text strings
    into a sequence of sequence of character ngram strings. Like this::

        ["a short text.", "another text."] => [["a ", " s", "sh", "ho", ...], ...]

    Args:
        n: Number of adjacent characters to combine into an ngram.
        max_chars: Max number of characters from the start of the text to transform
            into character ngrams.
        lower: If True, lowercase text before extracting character ngrams; otherwise,
            leave text casing as-is.
    """
    def forward(model: Model, texts: List[str],
                is_train: bool) -> Tuple[List[List[str]], Callable]:

        def _prepare(text):
            # Truncate first, then lowercase only when `lower` is exactly True.
            clipped = text[:max_chars]
            if lower is True:
                return clipped.lower()
            return clipped

        def _split(text):
            if n == 1:
                return list(text)
            last_start = len(text) - n + 1
            return [text[i:i + n] for i in range(last_start)]

        char_ngs = [_split(_prepare(text)) for text in texts]

        def backprop(dY):
            return []

        return (char_ngs, backprop)

    layer_attrs = {"n": n, "max_chars": max_chars, "lower": lower}
    return Model("texts_to_char_ngrams", forward, attrs=layer_attrs)
コード例 #25
0
def StaticVectors(
    nO: Optional[int] = None,
    nM: Optional[int] = None,
    *,
    dropout: Optional[float] = None,
    init_W: Callable = glorot_uniform_init,
    key_attr: str = "ORTH"
) -> Model[List[Doc], Ragged]:
    """Embed Doc objects with their vocab's vectors table, applying a learned
    linear projection to control the dimensionality. If a dropout rate is
    specified, the dropout is applied per dimension over the whole batch.

    `init_W` is bound as the first argument of the layer's `init` callback;
    `dropout` is stored under the "dropout_rate" attr.
    """
    bound_init = partial(init, init_W)
    layer_attrs = {"key_attr": key_attr, "dropout_rate": dropout}
    layer_dims = {"nO": nO, "nM": nM}
    return Model(
        "static_vectors",
        forward,
        init=bound_init,
        params={"W": None},
        attrs=layer_attrs,
        dims=layer_dims,
    )
コード例 #26
0
ファイル: test_serialize.py プロジェクト: zhuikonger/thinc
def test_simple_model_roundtrip_bytes_serializable_attrs():
    """Round-trip a model whose attr is a SerializableAttr, with custom
    (de)serialization handlers registered for that type."""

    def fwd(model, X, is_train):
        return X, lambda dY: dY

    attr = SerializableAttr()
    assert attr.value == "foo"
    assert attr.to_bytes() == b"foo"
    model = Model("test", fwd, attrs={"test": attr})
    model.initialize()

    # Route SerializableAttr through its own to_bytes / from_bytes.
    @serialize_attr.register(SerializableAttr)
    def serialize_attr_custom(_, value, name, model):
        return value.to_bytes()

    @deserialize_attr.register(SerializableAttr)
    def deserialize_attr_custom(_, value, name, model):
        return SerializableAttr().from_bytes(value)

    model = model.from_bytes(model.to_bytes())
    assert "test" in model.attrs
    assert model.attrs["test"].value == "foo from bytes"
コード例 #27
0
def test_model_set_dim():
    """set_dim must refuse to overwrite an already-set dim unless
    force=True, and force must stop working once a parameter is set."""

    class MyShim(Shim):
        name = "testshim"

    def passthrough(X):
        return X, lambda dY: dY

    model_a = create_model("a")
    model = Model(
        "test",
        passthrough,
        dims={"nI": 5, "nO": None},
        params={"W": None, "b": None},
        refs={"a": model_a, "b": None},
        attrs={"foo": "bar"},
        shims=[MyShim(None)],
        layers=[model_a, model_a],
    )

    # "nI" already has a value, so a plain set_dim is rejected.
    with pytest.raises(ValueError):
        model.set_dim("nI", 10)
    # force can be used before any parameters are set
    model.set_dim("nI", 10, force=True)

    zeros = model.ops.alloc1f
    model.set_param("W", zeros(10))
    model.set_grad("W", zeros(10))
    assert model.has_dim("nI")
    assert model.get_dim("nI") == 10

    # Unknown dim name.
    with pytest.raises(KeyError):
        model.set_dim("xyz", 20)
    with pytest.raises(ValueError):
        model.set_dim("nI", 20)
    # force can't be used after any parameter is set
    with pytest.raises(ValueError):
        model.set_dim("nI", 20, force=True)
コード例 #28
0
ファイル: extract_ngrams.py プロジェクト: mosynaq/spaCy
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
    """Build the "extract_ngrams" layer and record its settings as attrs.

    ngram_size: Stored under the "ngram_size" attr.
    attr: Stored under the "attr" attr.
    """
    layer = Model("extract_ngrams", forward)
    layer.attrs.update(ngram_size=ngram_size, attr=attr)
    return layer
コード例 #29
0
def FeatureExtractor(
        columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
    """Build the "extract_features" layer, storing `columns` as an attr."""
    layer_attrs = {"columns": columns}
    return Model("extract_features", forward, attrs=layer_attrs)
コード例 #30
0
ファイル: test_model.py プロジェクト: zhuikonger/thinc
def test_model_init():
    """Walk through the Model accessors (params, grads, dims, refs, attrs)
    on a freshly constructed model and check the expected values/errors."""
    class MyShim(Shim):
        name = "testshim"

    model_a = create_model("a")
    model = Model(
        "test",
        lambda X: (X, lambda dY: dY),
        dims={
            "nI": 10,
            "nO": None
        },
        params={
            "W": numpy.zeros((10, )),
            "b": None
        },
        refs={
            "a": model_a,
            "b": None
        },
        attrs={"foo": "bar"},
        shims=[MyShim(None)],
        layers=[model_a, model_a],
    )
    # --- params: "W" has a value, "b" is declared but unset, "X" is unknown
    assert model.has_param("W")
    assert model.get_param("W").shape == (10, )
    assert model.has_param("b") is None
    with pytest.raises(KeyError):
        model.get_param("b")
    with pytest.raises(KeyError):
        model.get_param("X")
    model.set_param("X", numpy.zeros((10, )))
    assert model.has_param("X")
    assert model.get_param("X").shape == (10, )
    # use_params swaps the value only inside the with-block
    with model.use_params({(model.id, "X"): numpy.ones((10, ))}):
        assert numpy.array_equal(model.get_param("X"), numpy.ones((10, )))
    assert numpy.array_equal(model.get_param("X"), numpy.zeros((10, )))
    # --- grads
    assert not model.has_grad("W")
    assert not model.has_grad("xyz")
    with pytest.raises(KeyError):
        model.get_grad("b")
    model.set_param("W", model.ops.alloc1f(10))
    model.set_grad("W", model.ops.alloc1f(10))
    # incrementing with an array of a different shape is rejected
    with pytest.raises(ValueError):
        model.inc_grad("W", numpy.zeros((5, 0)))
    # --- dims: "nI" is set, "nO" is declared but unset, "xyz" is unknown
    assert model.has_dim("nI")
    assert model.get_dim("nI") == 10
    with pytest.raises(KeyError):
        model.get_dim("xyz")
    with pytest.raises(ValueError):
        model.get_dim("nO")
    with pytest.raises(KeyError):
        model.set_dim("xyz", 20)
    with pytest.raises(ValueError):
        model.set_dim("nI", 20)
    # --- refs: "a" is set, "b" is declared but unset, "xyz" is unknown
    assert model.has_ref("a")
    assert model.get_ref("a").name == "a"
    assert not model.has_ref("xyz")
    with pytest.raises(KeyError):
        model.get_ref("xyz")
    assert model.has_ref("b") is None
    with pytest.raises(ValueError):
        model.get_ref("b")
    model.set_ref("c", model_a)
    assert model.has_ref("c")
    assert model.get_ref("c").name == "a"
    # a newly created model that is not among the layers is rejected as a ref
    with pytest.raises(ValueError):
        model.set_ref("c", create_model("c"))
    # --- attrs behave like a plain mapping
    assert "foo" in model.attrs
    assert "bar" not in model.attrs
    assert model.attrs["foo"] == "bar"
    with pytest.raises(KeyError):
        model.attrs["bar"]
    model.attrs["bar"] = "baz"
    # --- copy keeps the name
    model_copy = model.copy()
    assert model_copy.name == "test"