def test_serialize_attrs():
    fwd = lambda model, X, is_train: (X, lambda dY: dY)
    attrs = {"test": "foo"}
    model1 = Model("test", fwd, attrs=attrs).initialize()
    bytes_attr = serialize_attr(model1.attrs["test"], attrs["test"], "test", model1)
    assert bytes_attr == srsly.msgpack_dumps("foo")
    model2 = Model("test", fwd, attrs={"test": ""})
    result = deserialize_attr(model2.attrs["test"], bytes_attr, "test", model2)
    assert result == "foo"

    # Test objects with custom serialization functions
    @serialize_attr.register(SerializableAttr)
    def serialize_attr_custom(_, value, name, model):
        return value.to_bytes()

    @deserialize_attr.register(SerializableAttr)
    def deserialize_attr_custom(_, value, name, model):
        return SerializableAttr().from_bytes(value)

    attrs = {"test": SerializableAttr()}
    model3 = Model("test", fwd, attrs=attrs)
    bytes_attr = serialize_attr(model3.attrs["test"], attrs["test"], "test", model3)
    assert bytes_attr == b"foo"
    model4 = Model("test", fwd, attrs=attrs)
    assert model4.attrs["test"].value == "foo"
    result = deserialize_attr(model4.attrs["test"], bytes_attr, "test", model4)
    assert result.value == "foo from bytes"
def test_serialize_model_shims_roundtrip_bytes():
    fwd = lambda model, X, is_train: (X, lambda dY: dY)
    test_shim = SerializableShim(None)
    shim_model = Model("shimmodel", fwd, shims=[test_shim])
    model = chain(Linear(2, 3), shim_model, Maxout(2, 3))
    model.initialize()
    assert model.layers[1].shims[0].value == "shimdata"
    model_bytes = model.to_bytes()
    with pytest.raises(ValueError):
        Linear(2, 3).from_bytes(model_bytes)
    test_shim = SerializableShim(None)
    shim_model = Model("shimmodel", fwd, shims=[test_shim])
    new_model = chain(Linear(2, 3), shim_model, Maxout(2, 3)).from_bytes(model_bytes)
    assert new_model.layers[1].shims[0].value == "shimdata from bytes"
def TransformerModel(
    name: str,
    get_spans: Callable,
    tokenizer_config: dict = {},
    transformer_config: dict = {},
) -> Model[List[Doc], FullTransformerBatch]:
    """
    get_spans (Callable[[List[Doc]], List[Span]]): A function to extract
        spans from the batch of Doc objects. This is used to manage long
        documents, by cutting them into smaller sequences before running
        the transformer. The spans are allowed to overlap, and you can also
        omit sections of the Doc if they are not relevant.
    tokenizer_config (dict): Settings to pass to the transformers tokenizer.
    transformer_config (dict): Settings to pass to the transformers forward pass.
    """
    return Model(
        "transformer",
        forward,
        init=init,
        layers=[],
        dims={"nO": None},
        attrs={
            "tokenizer": None,
            "get_spans": get_spans,
            "name": name,
            "tokenizer_config": tokenizer_config,
            "transformer_config": transformer_config,
            "set_transformer": set_pytorch_transformer,
            "has_transformer": False,
            "flush_cache_chance": 0.0,
            "replace_listener": replace_listener,
            "replace_listener_cfg": replace_listener_cfg,
        },
    )
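# Usage sketch (an assumption, not part of the original source): construction
# is cheap, and the HF tokenizer and transformer weights are only loaded when
# the model is initialized. get_doc_spans stands in for any span getter.
# trf = TransformerModel(
#     "bert-base-uncased",
#     get_spans=get_doc_spans,
#     tokenizer_config={"use_fast": True},
# )
# trf.initialize()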
def trfs2arrays(
    pooling: Model[Ragged, Floats2d], grad_factor: float
) -> Model[List[TransformerData], List[Floats2d]]:
    """Pool transformer data into token-aligned tensors."""
    return Model(
        "trfs2arrays",
        forward,
        layers=[pooling],
        attrs={"grad_factor": grad_factor},
    )
def build_masked_language_model(
    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
) -> Model:
    """Convert a model into a BERT-style masked language model"""
    random_words = _RandomWords(vocab)

    def mlm_forward(model, docs, is_train):
        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
        output, backprop = model.layers[0](docs, is_train)

        def mlm_backward(d_output):
            d_output *= 1 - mask
            return backprop(d_output)

        return output, mlm_backward

    def mlm_initialize(model: Model, X=None, Y=None):
        wrapped = model.layers[0]
        wrapped.initialize(X=X, Y=Y)
        for dim in wrapped.dim_names:
            if wrapped.has_dim(dim):
                model.set_dim(dim, wrapped.get_dim(dim))

    mlm_model = Model(
        "masked-language-model",
        mlm_forward,
        layers=[wrapped_model],
        init=mlm_initialize,
        refs={"wrapped": wrapped_model},
        dims={dim: None for dim in wrapped_model.dim_names},
    )
    mlm_model.set_ref("wrapped", wrapped_model)
    return mlm_model
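# Usage sketch (hedged; assumes a spaCy Vocab and a wrapped doc-processing
# model): masking is applied to the docs on the way in, and mlm_backward
# zeroes the gradient for unmasked positions on the way back.
# mlm = build_masked_language_model(nlp.vocab, tok2vec, mask_prob=0.15)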
def test_replace_node():
    relu1 = Relu(5)
    relu2 = Relu(5)
    relu_chain = chain(relu1, relu2)
    relu1_debug = with_debug(relu1)
    debug = Model(
        "test",
        lambda X: (X, lambda dY: dY),
        layers=[relu1, relu2, relu1, relu_chain],
        refs={"relu1": relu1, "relu2": relu2, "relu3": relu1},
    )
    debug.replace_node(relu1, relu1_debug)
    assert debug.layers[0] == relu1_debug
    assert debug.layers[1] == relu2
    assert debug.layers[2] == relu1_debug
    assert debug.get_ref("relu1") == relu1_debug
    assert debug.get_ref("relu2") == relu2
    assert debug.get_ref("relu3") == relu1_debug

    # Check that nodes are replaced recursively
    assert debug.layers[3] == relu_chain
    assert debug.layers[3].layers[0] == relu1_debug
    assert debug.layers[3].layers[1] == relu2
def TransitionModel(
    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
        upper = noop()
    else:
        has_upper = True
    # don't define nO for this object, because we can't dynamically change it
    return Model(
        name="parser_model",
        forward=forward,
        dims={"nI": tok2vec.maybe_get_dim("nI")},
        layers=[tok2vec, lower, upper],
        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
        init=init,
        attrs={
            "has_upper": has_upper,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output,
        },
    )
def get_array_model():
    def _trim_array_forward(model, X, is_train):
        def backprop(dY):
            return model.ops.alloc2f(dY.shape[0], dY.shape[1] + 1)

        return X[:, :-1], backprop

    return with_array2d(Model("trimarray", _trim_array_forward))
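# Quick behavioral check of the wrapped trim model (a sketch; assumes numpy
# and the thinc imports used by the snippets in this file). The forward pass
# drops the last column; the backprop pads the gradient back to the input
# width with zeros.
trim_model = get_array_model()
X = numpy.zeros((4, 3), dtype="f")
Y, backprop = trim_model(X, is_train=True)
assert Y.shape == (4, 2)
assert backprop(Y).shape == (4, 3)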
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
    model = Model(
        "precomputable_affine",
        forward,
        init=init,
        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
        params={"W": None, "b": None, "pad": None},
        attrs={"dropout_rate": dropout},
    )
    return model
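# Shape note (an inference from the dims above; forward/init are defined
# elsewhere in spaCy): the layer precomputes projections for nF state features
# of input width nI, producing nO outputs with nP maxout pieces per feature;
# "pad" is a learned output used when a feature index falls outside the
# sequence.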
def get_ragged_model():
    def _trim_ragged_forward(model, Xr, is_train):
        def backprop(dYr):
            dY = dYr.data
            dX = model.ops.alloc2f(dY.shape[0], dY.shape[1] + 1)
            return Ragged(dX, dYr.lengths)

        return Ragged(Xr.data[:, :-1], Xr.lengths), backprop

    return with_ragged(Model("trimragged", _trim_ragged_forward))
def test_serialize_refs_roundtrip_bytes():
    fwd = lambda model, X, is_train: (X, lambda dY: dY)
    model_a = Model("a", fwd)
    model = Model("test", fwd, refs={"a": model_a, "b": None}).initialize()
    with pytest.raises(ValueError):
        # ref not in nodes
        model.to_bytes()
    model = Model("test", fwd, refs={"a": model_a, "b": None}, layers=[model_a])
    assert model.ref_names == ("a", "b")
    model_bytes = model.to_bytes()
    with pytest.raises(ValueError):
        Model("test", fwd).from_bytes(model_bytes)
    new_model = Model("test", fwd, layers=[model_a])
    new_model.from_bytes(model_bytes)
    assert new_model.ref_names == ("a", "b")
def DummyTransformerModel(width: int, depth: int):
    def _forward(model, tokens, is_train):
        width = model.attrs["width"]
        depth = model.attrs["depth"]
        tensors = []
        shape = (tokens["input_ids"].shape[0], tokens["input_ids"].shape[1], width)
        for i in range(depth):
            tensors.append(torch.zeros(*shape))
        return tensors, lambda d_tensors: tokens

    return Model("dummy-transformer", _forward, attrs={"width": width, "depth": depth})
def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
    """Extract spans from a sequence of source arrays, as specified by an array
    of (start, end) indices. The output is a ragged array of the extracted
    spans.
    """
    return Model(
        "extract_spans", forward, layers=[], refs={}, attrs={}, dims={}, init=init
    )
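# Call sketch (hypothetical; forward/init live alongside this factory): the
# input is a (source, spans) pair of Ragged arrays, where each row of
# spans.data is a (start, end) offset into the concatenated source rows.
# spans_model = extract_spans()
# output, backprop = spans_model((source, spans), is_train=False)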
def get_list_model():
    def _trim_list_forward(model, Xs, is_train):
        def backprop(dYs):
            dXs = []
            for dY in dYs:
                dXs.append(model.ops.alloc2f(dY.shape[0], dY.shape[1] + 1))
            return dXs

        Ys = [X[:, :-1] for X in Xs]
        return Ys, backprop

    return with_list(Model("trimlist", _trim_list_forward))
def HFWrapper(
    hf_model: "HFObjects",
    convert_inputs: Optional[Callable] = None,
    convert_outputs: Optional[Callable] = None,
    mixed_precision: bool = False,
    grad_scaler_config: dict = {},
) -> Model[Any, Any]:
    """Wrap a PyTorch HF model, so that it has the same API as Thinc models.
    To optimize the model, you'll need to create a PyTorch optimizer and call
    optimizer.step() after each batch. See examples/wrap_pytorch.py

    Your PyTorch model's forward method can take arbitrary args and kwargs,
    but must return either a single tensor as output or a tuple. You may find
    the PyTorch register_forward_hook helpful if you need to adapt the output.

    The convert functions are used to map inputs and outputs to and from your
    PyTorch model. Each function should return the converted output, and a
    callback to use during the backward pass. So:

        Xtorch, get_dX = convert_inputs(X)
        Ytorch, torch_backprop = model.shims[0](Xtorch, is_train)
        Y, get_dYtorch = convert_outputs(Ytorch)

    To allow maximum flexibility, the PyTorchShim expects ArgsKwargs objects
    on the way into the forward and backward passes. The ArgsKwargs objects
    will be passed straight into the model in the forward pass, and straight
    into `torch.autograd.backward` during the backward pass.
    """
    if convert_inputs is None:
        convert_inputs = convert_pytorch_default_inputs
    if convert_outputs is None:
        convert_outputs = convert_pytorch_default_outputs
    return Model(
        "hf-pytorch",
        pt_forward,
        attrs={"convert_inputs": convert_inputs, "convert_outputs": convert_outputs},
        shims=[
            HFShim(
                hf_model,
                mixed_precision=mixed_precision,
                grad_scaler_config=grad_scaler_config,
            )
        ],
        dims={"nI": None, "nO": None},
    )
def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
    # nM: Number of dimensions per character. nC: Number of characters.
    return Model(
        "charembed",
        forward,
        init=init,
        dims={"nM": nM, "nC": nC, "nO": nM * nC, "nV": 256},
        params={"E": None},
    )
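# Note (an inference, not from the original source): nV=256 suggests each
# character is reduced to a single byte value, so the embedding table E covers
# 256 symbols, and the output concatenates nC character embeddings of nM
# dimensions each, which is why nO is set to nM * nC.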
def create_tensors(
    tok2vec: Model[List[Doc], List[Floats2d]],
    pooling: Model[Ragged, Floats2d],
    get_instances: Callable[[Doc], List[Tuple[Span, Span]]],
) -> Model[List[Doc], Floats2d]:
    return Model(
        "instance_tensors",
        instance_forward,
        layers=[tok2vec, pooling],
        refs={"tok2vec": tok2vec, "pooling": pooling},
        attrs={"get_instances": get_instances},
        init=instance_init,
    )
def get_padded_model():
    def _trim_padded_forward(model, Xp, is_train):
        def backprop(dYp):
            dY = dYp.data
            dX = model.ops.alloc3f(dY.shape[0], dY.shape[1], dY.shape[2] + 1)
            return Padded(dX, dYp.size_at_t, dYp.lengths, dYp.indices)

        assert isinstance(Xp, Padded)
        X = Xp.data
        X = X.reshape((X.shape[0] * X.shape[1], X.shape[2]))
        X = X[:, :-1]
        X = X.reshape((Xp.data.shape[0], Xp.data.shape[1], X.shape[1]))
        return Padded(X, Xp.size_at_t, Xp.lengths, Xp.indices), backprop

    return with_padded(Model("trimpadded", _trim_padded_forward))
def DummyTransformer(
    depth: int = 2, width: int = 4, get_spans=get_doc_spans
) -> Model[List[Doc], FullTransformerBatch]:
    """Create a test model that produces a FullTransformerBatch object."""
    return Model(
        "dummy-transformer",
        transformer_forward,
        layers=[DummyTransformerModel(width=width, depth=depth)],
        attrs={
            "get_spans": get_spans,
            "tokenizer": DummyTokenizer(),
            "grad_factor": 1.0,
        },
        dims={"nO": width},
    )
def DummyTransformerModel(width: int, depth: int):
    def _forward(model, tokens, is_train):
        width = model.attrs["width"]
        depth = model.attrs["depth"]
        shape = (depth, tokens.input_ids.shape[0], tokens.input_ids.shape[1], width)
        tensors = torch.zeros(*shape)
        return ModelOutput(last_hidden_state=tensors), lambda d_tensors: tokens

    return Model("dummy-transformer", _forward, attrs={"width": width, "depth": depth})
def TransformersTokenizer(name: str) -> Model[List[List[str]], TokensPlus]:
    def forward(
        model, texts: List[List[str]], is_train: bool
    ) -> Tuple[TokensPlus, Callable]:
        tokenizer = model.attrs["tokenizer"]
        token_data = tokenizer.batch_encode_plus(
            [(text, None) for text in texts],
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_masks=True,
            return_input_lengths=True,
            return_tensors="pt",
        )
        return TokensPlus(**token_data), lambda d_tokens: []

    return Model(
        "tokenizer",
        forward,
        attrs={"tokenizer": AutoTokenizer.from_pretrained(name)},
    )
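# Compatibility note plus usage sketch (hedged): batch_encode_plus with the
# return_attention_masks/return_input_lengths flags appears to target the
# older transformers v2 API; newer releases renamed or removed these flags.
# tokenizer = TransformersTokenizer("bert-base-uncased")
# tokens, backprop = tokenizer([["A", "short", "text"]], is_train=False)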
def TransformerModel(
    name: str, get_spans: Callable, tokenizer_config: dict
) -> Model[List[Doc], FullTransformerBatch]:
    return Model(
        "transformer",
        forward,
        init=init,
        layers=[],
        dims={"nO": None},
        attrs={
            "tokenizer": None,
            "get_spans": get_spans,
            "name": name,
            "tokenizer_config": tokenizer_config,
            "set_transformer": set_pytorch_transformer,
            "has_transformer": False,
        },
    )
def text_to_char_ngrams(
    n: int,
    max_chars: int,
    lower: bool,
) -> Model[List[str], List[List[str]]]:
    """
    Custom data type transfer thinc layer that transforms a sequence of text
    strings into a sequence of sequences of character ngram strings. Like this::

        ["a short text.", "another text."] => [["a ", " s", "sh", "ho", ...], ...]

    Args:
        n: Number of adjacent characters to combine into an ngram.
        max_chars: Max number of characters from the start of the text to
            transform into character ngrams.
        lower: If True, lowercase text before extracting character ngrams;
            otherwise, leave text casing as-is.
    """

    def forward(
        model: Model, texts: List[str], is_train: bool
    ) -> Tuple[List[List[str]], Callable]:
        if lower is True:
            texts = (text[:max_chars].lower() for text in texts)
        else:
            texts = (text[:max_chars] for text in texts)
        if n == 1:
            char_ngs = [list(text) for text in texts]
        else:
            char_ngs = [
                [text[i : i + n] for i in range(len(text) - n + 1)] for text in texts
            ]

        def backprop(dY):
            return []

        return (char_ngs, backprop)

    return Model(
        "texts_to_char_ngrams",
        forward,
        attrs={"n": n, "max_chars": max_chars, "lower": lower},
    )
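# Quick check of text_to_char_ngrams (a sketch; assumes the thinc imports used
# above). With n=2, max_chars=10, lower=True, "A Short Text." is truncated to
# "a short te" and split into overlapping character bigrams.
char_ngrams = text_to_char_ngrams(n=2, max_chars=10, lower=True)
ngs, backprop = char_ngrams(["A Short Text."], is_train=False)
assert ngs == [["a ", " s", "sh", "ho", "or", "rt", "t ", " t", "te"]]
assert backprop([]) == []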
def StaticVectors(
    nO: Optional[int] = None,
    nM: Optional[int] = None,
    *,
    dropout: Optional[float] = None,
    init_W: Callable = glorot_uniform_init,
    key_attr: str = "ORTH",
) -> Model[List[Doc], Ragged]:
    """Embed Doc objects with their vocab's vectors table, applying a learned
    linear projection to control the dimensionality. If a dropout rate is
    specified, the dropout is applied per dimension over the whole batch.
    """
    return Model(
        "static_vectors",
        forward,
        init=partial(init, init_W),
        params={"W": None},
        attrs={"key_attr": key_attr, "dropout_rate": dropout},
        dims={"nO": nO, "nM": nM},
    )
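# Usage sketch (hedged; requires a spaCy pipeline whose vocab has a loaded
# vectors table, e.g. en_core_web_md): vectors are looked up by key_attr
# (ORTH by default) and projected from nM to nO dimensions by the learned W.
# static = StaticVectors(nO=128)
# static.initialize(X=[nlp("a sample document")])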
def test_simple_model_roundtrip_bytes_serializable_attrs():
    fwd = lambda model, X, is_train: (X, lambda dY: dY)
    attr = SerializableAttr()
    assert attr.value == "foo"
    assert attr.to_bytes() == b"foo"
    model = Model("test", fwd, attrs={"test": attr})
    model.initialize()

    @serialize_attr.register(SerializableAttr)
    def serialize_attr_custom(_, value, name, model):
        return value.to_bytes()

    @deserialize_attr.register(SerializableAttr)
    def deserialize_attr_custom(_, value, name, model):
        return SerializableAttr().from_bytes(value)

    model_bytes = model.to_bytes()
    model = model.from_bytes(model_bytes)
    assert "test" in model.attrs
    assert model.attrs["test"].value == "foo from bytes"
def test_model_set_dim():
    class MyShim(Shim):
        name = "testshim"

    model_a = create_model("a")
    model = Model(
        "test",
        lambda X: (X, lambda dY: dY),
        dims={"nI": 5, "nO": None},
        params={"W": None, "b": None},
        refs={"a": model_a, "b": None},
        attrs={"foo": "bar"},
        shims=[MyShim(None)],
        layers=[model_a, model_a],
    )
    with pytest.raises(ValueError):
        model.set_dim("nI", 10)
    # force can be used before any parameters are set
    model.set_dim("nI", 10, force=True)
    model.set_param("W", model.ops.alloc1f(10))
    model.set_grad("W", model.ops.alloc1f(10))
    assert model.has_dim("nI")
    assert model.get_dim("nI") == 10
    with pytest.raises(KeyError):
        model.set_dim("xyz", 20)
    with pytest.raises(ValueError):
        model.set_dim("nI", 20)
    # force can't be used after any parameter is set
    with pytest.raises(ValueError):
        model.set_dim("nI", 20, force=True)
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
    model = Model("extract_ngrams", forward)
    model.attrs["ngram_size"] = ngram_size
    model.attrs["attr"] = attr
    return model
def FeatureExtractor(
    columns: List[Union[int, str]]
) -> Model[List[Doc], List[Ints2d]]:
    return Model("extract_features", forward, attrs={"columns": columns})
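# Usage sketch (hedged; forward is defined elsewhere in spaCy): returns one
# Ints2d per doc, with one column of attribute values per requested column.
# extractor = FeatureExtractor([ORTH, SHAPE])  # spacy.attrs IDs or string names
# features, backprop = extractor(docs, is_train=False)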
def test_model_init():
    class MyShim(Shim):
        name = "testshim"

    model_a = create_model("a")
    model = Model(
        "test",
        lambda X: (X, lambda dY: dY),
        dims={"nI": 10, "nO": None},
        params={"W": numpy.zeros((10,)), "b": None},
        refs={"a": model_a, "b": None},
        attrs={"foo": "bar"},
        shims=[MyShim(None)],
        layers=[model_a, model_a],
    )
    assert model.has_param("W")
    assert model.get_param("W").shape == (10,)
    assert model.has_param("b") is None
    with pytest.raises(KeyError):
        model.get_param("b")
    with pytest.raises(KeyError):
        model.get_param("X")
    model.set_param("X", numpy.zeros((10,)))
    assert model.has_param("X")
    assert model.get_param("X").shape == (10,)
    with model.use_params({(model.id, "X"): numpy.ones((10,))}):
        assert numpy.array_equal(model.get_param("X"), numpy.ones((10,)))
    assert numpy.array_equal(model.get_param("X"), numpy.zeros((10,)))
    assert not model.has_grad("W")
    assert not model.has_grad("xyz")
    with pytest.raises(KeyError):
        model.get_grad("b")
    model.set_param("W", model.ops.alloc1f(10))
    model.set_grad("W", model.ops.alloc1f(10))
    with pytest.raises(ValueError):
        model.inc_grad("W", numpy.zeros((5, 0)))
    assert model.has_dim("nI")
    assert model.get_dim("nI") == 10
    with pytest.raises(KeyError):
        model.get_dim("xyz")
    with pytest.raises(ValueError):
        model.get_dim("nO")
    with pytest.raises(KeyError):
        model.set_dim("xyz", 20)
    with pytest.raises(ValueError):
        model.set_dim("nI", 20)
    assert model.has_ref("a")
    assert model.get_ref("a").name == "a"
    assert not model.has_ref("xyz")
    with pytest.raises(KeyError):
        model.get_ref("xyz")
    assert model.has_ref("b") is None
    with pytest.raises(ValueError):
        model.get_ref("b")
    model.set_ref("c", model_a)
    assert model.has_ref("c")
    assert model.get_ref("c").name == "a"
    with pytest.raises(ValueError):
        model.set_ref("c", create_model("c"))
    assert "foo" in model.attrs
    assert "bar" not in model.attrs
    assert model.attrs["foo"] == "bar"
    with pytest.raises(KeyError):
        model.attrs["bar"]
    model.attrs["bar"] = "baz"
    model_copy = model.copy()
    assert model_copy.name == "test"