def TextCatCNN_v1(
    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
    """
    Assemble a mean-pooling text classifier on top of a token-to-vector model.

    The output of `tok2vec` is converted to a ragged array and averaged, then
    passed to the output layer:

    * exclusive_classes=True: a Softmax layer, so that the outputs sum to 1.
    * exclusive_classes=False: a Linear layer followed by Logistic, so that
      the outputs are in the range [0, 1].

    tok2vec (Model): The token-to-vector model.
    exclusive_classes (bool): Whether to use the softmax output.
    nO (Optional[int]): Output width; also set as the "nO" dim of the result.
    RETURNS (Model): The classifier, with the refs "tok2vec" and
        "output_layer" (the Softmax or the Linear layer) and the attr
        "multi_label" set to `not exclusive_classes`.
    """
    # The layers are fetched from the registry by name.
    chain, reduce_mean, Logistic, Softmax, Linear, list2ragged = (
        registry.get("layers", layer_name)
        for layer_name in (
            "chain.v1",
            "reduce_mean.v1",
            "Logistic.v1",
            "Softmax.v1",
            "Linear.v1",
            "list2ragged.v1",
        )
    )
    with Model.define_operators({">>": chain}):
        pooled = tok2vec >> list2ragged() >> reduce_mean()
        token_width = tok2vec.maybe_get_dim("nO")
        if exclusive_classes:
            scorer = Softmax(nO=nO, nI=token_width)
            model = pooled >> scorer
        else:
            scorer = Linear(nO=nO, nI=token_width)
            model = pooled >> scorer >> Logistic()
        # In both cases the ref points at the layer that owns the weights.
        model.set_ref("output_layer", scorer)
    model.set_ref("tok2vec", tok2vec)
    model.set_dim("nO", nO)
    model.attrs["multi_label"] = not exclusive_classes
    return model
def test_serialize_attrs():
    """Check attr (de)serialization for a plain string and for a type with
    custom handlers registered on serialize_attr / deserialize_attr."""

    def fwd(model, X, is_train):
        return X, lambda dY: dY

    # Plain string attr: serialized with msgpack and read back unchanged.
    plain_attrs = {"test": "foo"}
    model1 = Model("test", fwd, attrs=plain_attrs).initialize()
    bytes_attr = serialize_attr(
        model1.attrs["test"], plain_attrs["test"], "test", model1
    )
    assert bytes_attr == srsly.msgpack_dumps("foo")
    model2 = Model("test", fwd, attrs={"test": ""})
    restored = deserialize_attr(model2.attrs["test"], bytes_attr, "test", model2)
    assert restored == "foo"

    # Test objects with custom serialization functions
    @serialize_attr.register(SerializableAttr)
    def serialize_attr_custom(_, value, name, model):
        return value.to_bytes()

    @deserialize_attr.register(SerializableAttr)
    def deserialize_attr_custom(_, value, name, model):
        return SerializableAttr().from_bytes(value)

    custom_attrs = {"test": SerializableAttr()}
    model3 = Model("test", fwd, attrs=custom_attrs)
    bytes_attr = serialize_attr(
        model3.attrs["test"], custom_attrs["test"], "test", model3
    )
    assert bytes_attr == b"foo"
    model4 = Model("test", fwd, attrs=custom_attrs)
    assert model4.attrs["test"].value == "foo"
    restored = deserialize_attr(model4.attrs["test"], bytes_attr, "test", model4)
    assert restored.value == "foo from bytes"
def forward(model: Model, docs: List[Doc], is_train: bool):
    """Embed every token of every doc from a fixed number of its UTF-8 ids.

    For each doc, `doc.to_utf8_array(nr_char=nC)` supplies the ids that are
    looked up in the table "E", using the character position as the first
    index. The per-position vectors of width nM are then flattened into one
    row of width nO per token.

    model (Model): Provides the param "E", the dims "nC", "nM" and "nO",
        and the ops used for allocation.
    docs (List[Doc]): The batch of docs to embed.
    is_train (bool): Not used by this function.
    RETURNS: A tuple (output, backprop). `output` holds one array of shape
        (len(doc), nO) per doc. `backprop` increments the gradient of "E"
        and returns an empty list.
    """
    # NOTE(review): this early exit returns a bare list rather than the
    # (output, backprop) pair returned below -- confirm no caller passes None.
    if docs is None:
        return []
    ids = []
    output = []
    E = model.get_param("E")
    nC = model.get_dim("nC")
    nM = model.get_dim("nM")
    # Presumably nO == nC * nM, since the reshape below relies on it.
    nO = model.get_dim("nO")
    # This assists in indexing; it's like looping over this dimension.
    # Still consider this weird witch craft...But thanks to Mark Neumann
    # for the tip.
    nCv = model.ops.xp.arange(nC)
    for doc in docs:
        doc_ids = model.ops.asarray(doc.to_utf8_array(nr_char=nC))
        doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
        # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
        # incantation do I chant to get
        # output[i, j, k] == data[j, ids[i, j], k]?
        doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
        output.append(doc_vectors.reshape((len(doc), nO)))
        # Keep the ids so backprop can address the same rows of E.
        ids.append(doc_ids)

    def backprop(d_output):
        # One gradient buffer for the whole batch, same shape and dtype as E.
        dE = model.ops.alloc(E.shape, dtype=E.dtype)
        for doc_ids, d_doc_vectors in zip(ids, d_output):
            # Undo the flattening done in the forward pass.
            d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
            # NOTE(review): with array indices, `+=` may apply only one update
            # per distinct index, so repeated (position, id) pairs within a
            # doc might not all be accumulated -- confirm whether an
            # unbuffered add (e.g. ufunc.at) is needed here.
            dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
        model.inc_grad("E", dE)
        # Nothing is passed back for the input docs.
        return []

    return output, backprop
def init(model: Model, X=None, Y=None):
    """Load the transformer and tokenizer, then infer and set the "nO" dim.

    Does nothing if the attr "has_transformer" is already truthy. Otherwise
    the pair returned by `huggingface_from_pretrained` is handed to the
    "set_transformer" attr, a batch of wordpieces is built (from `X` if it is
    non-empty, else from two placeholder texts), and the first layer is
    initialized and run on it to read the width of `last_hidden_state`.

    model (Model): The transformer model to initialize.
    X: Optional sample docs, passed to the "get_spans" attr.
    Y: Not used by this function.
    """
    if model.attrs["has_transformer"]:
        return
    tokenizer, transformer = huggingface_from_pretrained(
        model.attrs["name"],
        model._init_tokenizer_config,
        model._init_transformer_config,
    )
    model.attrs["set_transformer"](model, transformer, tokenizer)
    # From here on, use the tokenizer as exposed by the model itself.
    tokenizer = model.tokenizer
    # Call the model with a batch of inputs to infer the width
    if X:
        # If we're dealing with actual texts, do the work to setup the
        # wordpieces batch properly
        flat_spans = [
            span for doc_spans in model.attrs["get_spans"](X) for span in doc_spans
        ]
        span_texts = [span.text for span in flat_spans]
        wordpieces = WordpieceBatch.from_batch_encoding(
            huggingface_tokenize(tokenizer, span_texts)
        )
        align = get_alignment(
            flat_spans, wordpieces.strings, tokenizer.all_special_tokens
        )
        wordpieces, align = truncate_oversize_splits(
            wordpieces, align, tokenizer.model_max_length
        )
    else:
        wordpieces = WordpieceBatch.from_batch_encoding(
            huggingface_tokenize(tokenizer, ["hello world", "foo bar"])
        )
    transformer_layer = model.layers[0]
    transformer_layer.initialize(X=wordpieces)
    model_output = transformer_layer.predict(wordpieces)
    model.set_dim("nO", model_output.last_hidden_state.shape[-1])
def build_masked_language_model(
    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
) -> Model:
    """Convert a model into a BERT-style masked language model.

    vocab (Vocab): Used to construct the `_RandomWords` helper for masking.
    wrapped_model (Model): The model that receives the masked docs.
    mask_prob (float): Forwarded to `_apply_mask`.
    RETURNS (Model): A "masked-language-model" wrapper whose only layer, and
        whose "wrapped" ref, is `wrapped_model`.
    """
    random_words = _RandomWords(vocab)

    def mlm_forward(model, docs, is_train):
        raw_mask, masked_docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
        n_rows = raw_mask.shape[0]
        # Column vector, so it broadcasts across each row of the gradient.
        column_mask = model.ops.asarray(raw_mask).reshape((n_rows, 1))
        output, inner_backprop = model.layers[0](masked_docs, is_train)

        def mlm_backward(d_output):
            # In-place multiply: the array passed in by the caller is scaled
            # by (1 - mask) before being handed to the wrapped model.
            d_output *= 1 - column_mask
            return inner_backprop(d_output)

        return output, mlm_backward

    def mlm_initialize(model: Model, X=None, Y=None):
        inner = model.layers[0]
        inner.initialize(X=X, Y=Y)
        # Mirror every dim the wrapped model has a value for.
        for dim_name in inner.dim_names:
            if inner.has_dim(dim_name):
                model.set_dim(dim_name, inner.get_dim(dim_name))

    # Same dim names as the wrapped model, all unset until initialization.
    unset_dims = dict.fromkeys(wrapped_model.dim_names)
    mlm_model = Model(
        "masked-language-model",
        mlm_forward,
        layers=[wrapped_model],
        init=mlm_initialize,
        refs={"wrapped": wrapped_model},
        dims=unset_dims,
    )
    mlm_model.set_ref("wrapped", wrapped_model)
    return mlm_model
def __init__(self, upstream_name: str):
    """Set up the model with an unset "nO" dim and empty batch state.

    upstream_name (str): Stored as `self.upstream_name`.
    """
    unset_dims = {"nO": None}
    Model.__init__(self, name=self.name, forward=forward, dims=unset_dims)
    self.upstream_name = upstream_name
    # Batch state starts out empty.
    self._batch_id = self._outputs = self._backprop = None
def get_tok2vec_width(model: Model):
    """Look up the output width of the model's "tok2vec" ref.

    model (Model): The model to inspect.
    RETURNS: The "nO" dim of the "tok2vec" ref if `has_dim("nO")` is truthy
        for it; otherwise the "nO" dim of that ref's "listener" ref, if it
        has one; otherwise None (also when there is no "tok2vec" ref).
    """
    if not model.has_ref("tok2vec"):
        return None
    tok2vec = model.get_ref("tok2vec")
    if tok2vec.has_dim("nO"):
        return tok2vec.get_dim("nO")
    if tok2vec.has_ref("listener"):
        # Fall back to the width declared by the listener.
        return tok2vec.get_ref("listener").get_dim("nO")
    return None
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
    """Build an encoder: tok2vec, mean pooling, a residual Maxout block and a
    final Linear layer.

    tok2vec (Model): The token-to-vector model; its "nO" dim must be set,
        because it is read with `get_dim`.
    nO (Optional[int]): Output width of the final Linear layer.
    RETURNS (Model): The encoder, with the refs "output_layer" and "tok2vec".
    """
    with Model.define_operators({">>": chain, "**": clone}):
        token_width = tok2vec.get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        pooled = tok2vec >> list2ragged() >> reduce_mean()
        # The hidden block keeps the token width so it can be residual.
        hidden = residual(
            Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)
        )
        model = pooled >> hidden >> output_layer
    model.set_ref("output_layer", output_layer)
    model.set_ref("tok2vec", tok2vec)
    return model
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model[List[Doc], Floats2d]:
    """Build an encoder that pools extracted spans rather than whole inputs.

    The ragged tok2vec output is paired with the output of
    `build_span_maker()`, passed through `extract_spans`, mean-pooled, sent
    through a residual Maxout block and finally a Linear layer.

    tok2vec (Model): The token-to-vector model.
    nO (Optional[int]): Output width of the final Linear layer.
    RETURNS (Model): The encoder, with the refs "output_layer" and "tok2vec"
        and the attr "include_span_maker" set to True.
    """
    with Model.define_operators({">>": chain, "&": tuplify}):
        token_width = tok2vec.maybe_get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        token_vectors = tok2vec >> list2ragged()
        vectors_and_spans = token_vectors & build_span_maker()
        pooled = vectors_and_spans >> extract_spans() >> reduce_mean()
        hidden = residual(
            Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)
        )
        model = pooled >> hidden >> output_layer
    model.set_ref("output_layer", output_layer)
    model.set_ref("tok2vec", tok2vec)
    # flag to show this isn't legacy
    model.attrs["include_span_maker"] = True
    return model
def __init__(self, upstream_name: str, width: int) -> None:
    """
    upstream_name (str): A string to identify the 'upstream' Tok2Vec component
        to communicate with. The upstream name should either be the wildcard
        string '*', or the name of the `Tok2Vec` component. You'll almost
        never have multiple upstream Tok2Vec components, so the wildcard
        string will almost always be fine.
    width (int): The width of the vectors produced by the upstream tok2vec
        component. It is set as the "nO" dim of this model.
    """
    output_dims = {"nO": width}
    Model.__init__(self, name=self.name, forward=forward, dims=output_dims)
    self.upstream_name = upstream_name
    # Batch state starts out empty.
    self._batch_id = self._outputs = self._backprop = None
def test_serialize_model_shims_roundtrip_bytes():
    """A chain containing a shim survives a to_bytes / from_bytes round trip,
    and loading its bytes into a mismatched model raises ValueError."""

    def fwd(model, X, is_train):
        return X, lambda dY: dY

    def make_chain():
        # Fresh shim and fresh layers on every call.
        shim_model = Model("shimmodel", fwd, shims=[SerializableShim(None)])
        return chain(Linear(2, 3), shim_model, Maxout(2, 3))

    model = make_chain()
    model.initialize()
    assert model.layers[1].shims[0].value == "shimdata"
    model_bytes = model.to_bytes()

    # A lone Linear layer does not match the serialized chain.
    with pytest.raises(ValueError):
        Linear(2, 3).from_bytes(model_bytes)

    new_model = make_chain().from_bytes(model_bytes)
    assert new_model.layers[1].shims[0].value == "shimdata from bytes"
def test_tuplify_operator_three(model1, model2, model3):
    """Chaining three models with "&" yields a nested (not flattened) tuplify."""
    # Previously we 'flattened' these nested calls. We might opt to do so
    # again, especially for the operators.
    with Model.define_operators({"&": tuplify}):
        combined = model1 & model2 & model3
    outer_layers = combined.layers
    assert len(outer_layers) == 2
    inner_layers = outer_layers[0].layers
    assert len(inner_layers) == 2
def CharNgramsEmbedding(
    n: int,
    max_chars: int,
    lower: bool,
    num_vectors: int,
    embed_dim: int,
    dropout: Optional[float],
) -> Model[List[str], thinc.types.Floats1d]:
    """
    Build a model that embeds texts via hashed character n-grams.

    The pipeline is: `text_to_char_ngrams` -> `strings2arrays` -> `HashEmbed`
    (applied through `with_array`) -> `list2ragged` -> `reduce_mean`.

    Args:
        n: Passed to ``text_to_char_ngrams`` as its first argument.
        max_chars: Passed to ``text_to_char_ngrams`` as its second argument.
        lower: Passed to ``text_to_char_ngrams`` as its third argument.
        num_vectors: Number of rows (``nV``) of the ``HashEmbed`` table.
        embed_dim: Output width (``nO``) of the ``HashEmbed`` layer.
        dropout: Dropout setting passed to ``HashEmbed``.

    Returns:
        The chained Thinc model.
    """
    with Model.define_operators({">>": chain}):
        ngram_arrays = (
            text_to_char_ngrams(n, max_chars, lower)
            >> thinc.layers.strings2arrays()
        )
        embed = thinc.layers.with_array(
            thinc.layers.HashEmbed(
                nO=embed_dim,
                nV=num_vectors,
                dropout=dropout,
                column=0,
            )
        )
        model = (
            ngram_arrays
            >> embed
            >> thinc.layers.list2ragged()
            >> thinc.layers.reduce_mean()
        )
    return model
def trfs2arrays(
    pooling: Model[Ragged, Floats2d], grad_factor: float
) -> Model[List[TransformerData], List[Floats2d]]:
    """Pool transformer data into token-aligned tensors.

    pooling (Model): The pooling model; it becomes the only layer.
    grad_factor (float): Stored in the attr "grad_factor".
    RETURNS (Model): A "trfs2arrays" model using the module's `forward`.
    """
    model_attrs = {"grad_factor": grad_factor}
    model = Model("trfs2arrays", forward, layers=[pooling], attrs=model_attrs)
    return model
def cnn_tagger(width: int, vector_width: int, nr_classes: int = 17):
    """Build a small tagger from hashed embeddings and a one-token window.

    width (int): Output width of the HashEmbed layer and of both Relu layers.
    vector_width (int): Number of rows (nV) of the HashEmbed table.
    nr_classes (int): Output width of the Softmax layer.
    RETURNS: `strings2arrays` chained into the per-token network, which is
        applied through `with_array`.
    """
    with Model.define_operators({">>": chain}):
        to_arrays = strings2arrays()
        # window_size=1 triples the width, hence nI=width * 3 below.
        per_token = (
            HashEmbed(nO=width, nV=vector_width, column=0)
            >> expand_window(window_size=1)
            >> Relu(nO=width, nI=width * 3)
            >> Relu(nO=width, nI=width)
            >> Softmax(nO=nr_classes, nI=width)
        )
        model = to_arrays >> with_array(per_token)
    return model
def TransformerModel(
    name: str, get_spans: Callable, tokenizer_config: dict = {}, transformer_config: dict = {}
) -> Model[List[Doc], FullTransformerBatch]:
    """
    Create a "transformer" model shell: it has no layers yet, its "nO" dim is
    unset and its "has_transformer" attr is False. `init` is registered as
    the initialization callback.

    name (str): Stored in the attr "name"; presumably the identifier of the
        pretrained model to load -- confirm against `init`.
    get_spans (Callable[[List[Doc]], List[Span]]): A function to extract
        spans from the batch of Doc objects. This is used to manage long
        documents, by cutting them into smaller sequences before running the
        transformer. The spans are allowed to overlap, and you can also omit
        sections of the Doc if they are not relevant.
    tokenizer_config (dict): Settings to pass to the transformers tokenizer.
    transformer_config (dict): Settings to pass to the transformers forward
        pass.
    RETURNS (Model): The uninitialized transformer model.
    """
    # The default arguments are single dict objects created once, when the
    # function is defined. Storing them directly would make every model built
    # with the defaults share (and be able to mutate) the same dict, so each
    # model gets its own shallow copy instead.
    own_tokenizer_config = dict(tokenizer_config)
    own_transformer_config = dict(transformer_config)
    return Model(
        "transformer",
        forward,
        init=init,
        layers=[],
        dims={"nO": None},
        attrs={
            "tokenizer": None,
            "get_spans": get_spans,
            "name": name,
            "tokenizer_config": own_tokenizer_config,
            "transformer_config": own_transformer_config,
            "set_transformer": set_pytorch_transformer,
            "has_transformer": False,
            "flush_cache_chance": 0.0,
            "replace_listener": replace_listener,
            "replace_listener_cfg": replace_listener_cfg,
        },
    )
def LangIdentifierModelV2(
    ns: Sequence[int] = (1, 2, 3),
    embed_dim: int = 100,
    hidden_width: int = 512,
    dropout: Optional[float] = 0.1,
) -> Model[List[str], thinc.types.Floats2d]:
    """
    Build a language identification model inspired by Google's CLD3.

    Args:
        ns: Set of "n" for which character "n"-grams are extracted from input
            texts. If 1, only unigrams (single characters) are used; if
            [1, 2], then both unigrams and bigrams are used; and so on. Each
            "n" gets an embedding table with ``2000 * n`` rows.
        embed_dim: Size of the vectors into which each set of ngrams are
            embedded.
        hidden_width: Width of the dense layer with Relu activation, just
            before the final prediction (Softmax) layer.
        dropout: Dropout rate to avoid overfitting.

    Returns:
        Thinc :class:`Model`.
    """
    with Model.define_operators({">>": chain}):
        ngram_sizes = list(ns)
        table_sizes = [2000 * n for n in ns]
        embedder = MultiCharNgramsEmbedding(
            ns=ngram_sizes,
            max_chars=1000,
            lower=True,
            num_vectors=table_sizes,
            embed_dims=embed_dim,
            dropout=dropout,
        )
        # One embedding of width embed_dim per n, hence the input width.
        hidden = thinc.layers.Relu(
            nI=embed_dim * len(ns),
            nO=hidden_width,
            dropout=dropout,
        )
        model = embedder >> hidden >> thinc.layers.Softmax(nI=hidden_width)
    return model
def _resume_model(
    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
) -> int:
    """Load tok2vec weights from a file and resolve the epoch to resume from.

    The bytes read from `resume_path` are loaded into the model's "tok2vec"
    ref. If the path contains a default weight file name of the form
    'model<N>.bin', training resumes from epoch N + 1; otherwise the
    `epoch_resume` argument is used as given.

    model (Model): The model whose "tok2vec" ref receives the weights.
    resume_path (Path): The weights file to read.
    epoch_resume (int): Fallback epoch, used when the file name carries none.
    silent (bool): Suppress the info messages.
    RETURNS (int): The epoch to resume from. Earlier this value was only
        logged and then discarded, so callers could not see the parsed epoch.
    """
    msg = Printer(no_print=silent)
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
    model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model(\d+)\.bin", str(resume_path))
    if model_name:
        # Default weight file name: the digits between 'model' and '.bin' are
        # the last finished epoch, so continue with the next one.
        epoch_resume = int(model_name.group(1)) + 1
    msg.info(f"Resuming from epoch: {epoch_resume}")
    return epoch_resume
def TransitionModel(tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()):
    """Set up a stepwise transition-based model.

    tok2vec: The token-to-vector layer; its "nI" dim (if any) becomes the
        "nI" dim of the result.
    lower: The lower layer.
    upper: The upper layer, or None, in which case a `noop()` layer is used
        and the attr "has_upper" is False.
    resize_output: Stored in the attr "resize_output".
    dropout: Not used by this function.
    unseen_classes: Copied into a new set and stored in the attr
        "unseen_classes".
    RETURNS (Model): The "parser_model", with the layers and refs "tok2vec",
        "lower" and "upper".
    """
    has_upper = upper is not None
    if not has_upper:
        upper = noop()
    model_attrs = {
        "has_upper": has_upper,
        "unseen_classes": set(unseen_classes),
        "resize_output": resize_output,
    }
    model_refs = {"tok2vec": tok2vec, "lower": lower, "upper": upper}
    # don't define nO for this object, because we can't dynamically change it
    return Model(
        name="parser_model",
        forward=forward,
        dims={"nI": tok2vec.maybe_get_dim("nI")},
        layers=[tok2vec, lower, upper],
        refs=model_refs,
        init=init,
        attrs=model_attrs,
    )
def make_update(model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable) -> float:
    """Perform an update over a single batch of documents.

    model (Model): The model to update.
    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    objective_func (callable): Called as `objective_func(model.ops, docs,
        predictions)`; must return a `(loss, gradients)` pair.
    RETURNS loss: A float for the loss.
    """
    scores, run_backprop = model.begin_update(docs)
    batch_loss, d_scores = objective_func(model.ops, docs, scores)
    run_backprop(d_scores)
    model.finish_update(optimizer)
    # Don't want to return a cupy object here
    # The gradients are modified in-place by the BERT MLM,
    # so we get an accurate loss
    return float(batch_loss)
def build_text_classifier_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    linear_model: Model[List[Doc], Floats2d],
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build an ensemble of a linear model and an attention-based network.

    The outputs of `linear_model` and of the network built on `tok2vec`
    (attention, sum pooling, residual Maxout + LayerNorm) are concatenated
    and fed to a Softmax layer, or to Linear + Logistic when `linear_model`
    is marked as multi-label.

    tok2vec (Model): The token-to-vector model.
    linear_model (Model): Must provide the attr "multi_label" and the ref
        "output_layer".
    nO (Optional[int]): Number of output classes, if already known.
    RETURNS (Model): The ensemble. Its "output_layer" ref is the one of
        `linear_model`, and its init is `init_ensemble_textcat`.
    """
    exclusive_classes = not linear_model.attrs["multi_label"]
    with Model.define_operators({">>": chain, "|": concatenate}):
        width = tok2vec.maybe_get_dim("nO")
        # TODO: benchmark performance difference of this layer
        attention_layer = ParametricAttention(width)
        maxout_layer = Maxout(nO=width, nI=width)
        norm_layer = LayerNorm(nI=width)
        attended = tok2vec >> list2ragged() >> attention_layer >> reduce_sum()
        hidden = residual(maxout_layer >> norm_layer >> Dropout(0.0))
        cnn_model = attended >> hidden

        # The output layer sees both sub-models side by side.
        if nO:
            nO_double = nO * 2
        else:
            nO_double = None
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=nO_double)
        else:
            output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
        model = (linear_model | cnn_model) >> output_layer
    model.set_ref("tok2vec", tok2vec)
    if model.has_dim("nO") is not False:
        model.set_dim("nO", nO)
    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
    model.set_ref("attention_layer", attention_layer)
    model.set_ref("maxout_layer", maxout_layer)
    model.set_ref("norm_layer", norm_layer)
    model.attrs["multi_label"] = not exclusive_classes
    model.init = init_ensemble_textcat
    return model
def MultiCharNgramsEmbedding(
    ns: List[int],
    max_chars: int,
    lower: bool,
    num_vectors: int | List[int],
    embed_dims: int | List[int],
    dropout: Optional[float],
) -> Model[List[str], thinc.types.Floats1d]:
    """
    Concatenate one ``CharNgramsEmbedding`` per n-gram size.

    Args:
        ns: The n-gram sizes; one sub-model is built for each.
        max_chars: Passed unchanged to every sub-model.
        lower: Passed unchanged to every sub-model.
        num_vectors: Either one value used for every n, or a list paired
            with ``ns`` by position.
        embed_dims: Either one value used for every n, or a list paired
            with ``ns`` by position.
        dropout: Passed unchanged to every sub-model.

    Returns:
        The concatenation of the sub-models.
    """
    numn = len(ns)
    # Broadcast scalar settings so there is one value per n-gram size.
    if isinstance(num_vectors, int):
        num_vectors = [num_vectors] * numn
    if isinstance(embed_dims, int):
        embed_dims = [embed_dims] * numn
    with Model.define_operators({">>": chain}):
        sub_models = []
        for n, nvec, edim in zip(ns, num_vectors, embed_dims):
            sub_models.append(
                CharNgramsEmbedding(
                    n=n,
                    max_chars=max_chars,
                    lower=lower,
                    num_vectors=nvec,
                    embed_dim=edim,
                    dropout=dropout,
                )
            )
        model = concatenate(*sub_models)
    return model
def instance_init(model: Model, X: List[Doc] = None, Y: Floats2d = None) -> Model:
    """Initialize the model's "tok2vec" ref with sample data, if given.

    model (Model): The model whose "tok2vec" ref is initialized.
    X (List[Doc]): Sample input; when None, nothing is initialized.
    Y (Floats2d): Not used by this function.
    RETURNS (Model): The same `model` object.
    """
    tok2vec = model.get_ref("tok2vec")
    if X is None:
        return model
    tok2vec.initialize(X)
    return model
def build_bow_text_classifier(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build a bag-of-ngrams classifier around a resizable SparseLinear layer.

    exclusive_classes (bool): Selects `softmax_activation` (True) or
        `Logistic` (False) as the output activation.
    ngram_size (int): Passed to `extract_ngrams`.
    no_output_layer (bool): If True, no activation is appended and the bias
        fill value used on resize stays 0; otherwise it is NEG_VALUE.
    nO (Optional[int]): Number of output classes, if already known.
    RETURNS (Model): The classifier. Its "output_layer" ref is the
        SparseLinear layer, and the attr "resize_output" holds a partial of
        `resize_and_set_ref` bound to the resizable layer.
    """
    fill_defaults = {"b": 0, "W": 0}
    with Model.define_operators({">>": chain}):
        sparse_linear = SparseLinear(nO=nO)
        if no_output_layer:
            output_layer = None
        else:
            fill_defaults["b"] = NEG_VALUE
            if exclusive_classes:
                output_layer = softmax_activation()
            else:
                output_layer = Logistic()
        resize_fn = partial(resize_linear_weighted, fill_defaults=fill_defaults)
        resizable_layer = resizable(sparse_linear, resize_layer=resize_fn)
        ngram_model = extract_ngrams(ngram_size, attr=ORTH) >> resizable_layer
        model = with_cpu(ngram_model, ngram_model.ops)
        if output_layer:
            model = model >> with_cpu(output_layer, output_layer.ops)
    model.set_dim("nO", nO)
    model.set_ref("output_layer", sparse_linear)
    model.attrs["multi_label"] = not exclusive_classes
    model.attrs["resize_output"] = partial(
        resize_and_set_ref, resizable_layer=resizable_layer
    )
    return model
def get_array_model():
    """Return a "trimarray" model wrapped in `with_array2d`.

    The forward pass drops the last column of its input; the backward pass
    returns a freshly allocated array that is one column wider than the
    incoming gradient.
    """

    def _trim_array_forward(model, X, is_train):
        trimmed = X[:, :-1]

        def backprop(dY):
            n_rows = dY.shape[0]
            # Restore the column that the forward pass removed.
            n_cols = dY.shape[1] + 1
            return model.ops.alloc2f(n_rows, n_cols)

        return trimmed, backprop

    trim_model = Model("trimarray", _trim_array_forward)
    return with_array2d(trim_model)
def create_relation_model(
    create_instance_tensor: Model[List[Doc], Floats2d],
    classification_layer: Model[Floats2d, Floats2d],
) -> Model[List[Doc], Floats2d]:
    """Chain the instance-tensor model into the classification layer.

    create_instance_tensor (Model): Must provide the attr "get_instances".
    classification_layer (Model): Applied to the instance tensor.
    RETURNS (Model): The chained model, which re-exposes the "get_instances"
        attr of `create_instance_tensor`.
    """
    with Model.define_operators({">>": chain}):
        model = create_instance_tensor >> classification_layer
    get_instances = create_instance_tensor.attrs["get_instances"]
    model.attrs["get_instances"] = get_instances
    return model
def trfs2arrays(
    pooling: Model[Ragged, Floats2d], grad_factor: float
) -> Model[List[TransformerData], List[Floats2d]]:
    """Create the "trfs2arrays" model around a pooling layer.

    pooling (Model): The pooling model; it becomes the only layer.
    grad_factor (float): Stored in the attr "grad_factor".
    RETURNS (Model): A "trfs2arrays" model using the module's `forward`.
    """
    sub_layers = [pooling]
    model_attrs = {"grad_factor": grad_factor}
    return Model("trfs2arrays", forward, layers=sub_layers, attrs=model_attrs)
def test_plus_chain():
    """A chain of "+" uses the operator bound via define_operators."""

    def keep_left(left, right):
        # Always return the left operand, so the chain collapses to "a".
        return left

    with Model.define_operators({"+": keep_left}):
        combined = (
            create_model(name="a")
            + create_model(name="b")
            + create_model(name="c")
            + create_model(name="d")
        )
        assert combined.name == "a"
def init(model: Model, X=None, Y=None):
    """Load the transformer and tokenizer, then infer and set the "nO" dim.

    Does nothing if the attr "has_transformer" is already truthy. Otherwise
    the tokenizer returned by `huggingface_from_pretrained` is stored in the
    attr "tokenizer", the transformer is handed to the "set_transformer"
    attr, and the first layer is initialized and run on a sample batch to
    read the width of the tensor selected by `find_last_hidden`.

    model (Model): The transformer model to initialize.
    X: Optional sample inputs; the `.text` of each item is used. When empty
        or None, two placeholder texts are used instead.
    Y: Not used by this function.
    """
    if model.attrs["has_transformer"]:
        return
    tokenizer, transformer = huggingface_from_pretrained(
        model.attrs["name"], model.attrs["tokenizer_config"]
    )
    model.attrs["tokenizer"] = tokenizer
    model.attrs["set_transformer"](model, transformer)
    # Call the model with a batch of inputs to infer the width
    texts = [x.text for x in X] if X else ["hello world", "foo bar"]
    token_data = huggingface_tokenize(model.attrs["tokenizer"], texts)
    transformer_layer = model.layers[0]
    transformer_layer.initialize(X=token_data)
    tensors = transformer_layer.predict(token_data)
    last_hidden = tensors[find_last_hidden(tensors)]
    model.set_dim("nO", last_hidden.shape[-1])
def get_ragged_model():
    """Return a "trimragged" model wrapped in `with_ragged`.

    The forward pass drops the last column of the ragged data and keeps the
    lengths; the backward pass returns a Ragged whose freshly allocated data
    is one column wider than the incoming gradient, with the same lengths.
    """

    def _trim_ragged_forward(model, Xr, is_train):
        trimmed = Ragged(Xr.data[:, :-1], Xr.lengths)

        def backprop(dYr):
            grad = dYr.data
            n_rows = grad.shape[0]
            # Restore the column that the forward pass removed.
            n_cols = grad.shape[1] + 1
            return Ragged(model.ops.alloc2f(n_rows, n_cols), dYr.lengths)

        return trimmed, backprop

    trim_model = Model("trimragged", _trim_ragged_forward)
    return with_ragged(trim_model)