def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Assemble a span categorizer out of three sub-models.

    The token-to-vector model embeds the docs, the reducer maps the sequence
    of vectors for each span down to a single vector, and the scorer maps
    those vectors to probabilities.

    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
    reducer (Model[Ragged, Floats2d]): The reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    RETURNS (Model[Tuple[List[Doc], Ragged], Floats2d]): The chained model,
        with "tok2vec", "reducer" and "scorer" registered as refs.
    """
    # tok2vec output is a list of arrays; convert it to a single Ragged.
    docs_to_ragged = chain(
        tok2vec, cast(Model[List[Floats2d], Ragged], list2ragged())
    )
    # The input is a (docs, spans) tuple; the embedding is wired to item 0.
    embed_docs = cast(
        Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
        with_getitem(0, docs_to_ragged),
    )
    model = chain(embed_docs, extract_spans(), reducer, scorer)
    named_parts = (("tok2vec", tok2vec), ("reducer", reducer), ("scorer", scorer))
    for ref_name, sublayer in named_parts:
        model.set_ref(ref_name, sublayer)
    return model
def test_issue208():
    """Regression test for a failure caused by trying to flatten nested chains."""
    first = Linear(nO=9, nI=3)
    second = Linear(nO=12, nI=9)
    third = Linear(nO=5, nI=12)
    # Deliberately nest one chain inside another.
    nested = chain(second, third)
    model = chain(first, nested)
    model = model.initialize()
    # The outer chain must report the output width of the innermost last layer.
    assert model.get_dim("nO") == 5
def test_wrap_non_child_references():
    """Recursive wrapping must cope with a ref that points at a non-child node."""
    shared_relu = Relu(5)
    outer_relu = Relu(5)
    inner = chain(shared_relu, shared_relu)
    outer = chain(outer_relu, inner)
    # `shared_relu` is a grandchild of `outer`, not a direct child.
    outer.set_ref("relu", shared_relu)
    # Fails in case non-child references cannot be set.
    wrap_model_recursive(outer, with_debug)
def test_multi_model_load_missing_dims():
    """A chain built with missing dims must load weights from serialized bytes."""
    source = chain(Maxout(5, 10, pieces=2), Maxout(2, 3))
    first, second = source._layers[0], source._layers[1]
    # Give each layer a distinguishable bias so the round-trip can be checked.
    first.b += 1
    second.b += 2
    payload = source.to_bytes()

    # The target model leaves dimensions unspecified on purpose.
    restored = chain(Maxout(5), Maxout())
    restored = restored.from_bytes(payload)
    assert restored._layers[0].b[0, 0] == 1
    assert restored._layers[1].b[0, 0] == 2
def softmax_tanh_class_vector(nr_class, *, exclusive_classes=True, **cfg):
    """Select features from the class-vectors from the last hidden state,
    apply an affine + tanh transform, mean-pool them, and softmax to produce
    one vector per document. The gradients of the class vectors are
    incremented in the backward pass, to allow fine-tuning.

    nr_class (int): Number of output classes; sets the softmax output width.
    exclusive_classes (bool): Accepted for interface compatibility; not read
        by this function.
    **cfg: Must contain "token_vector_width", the width of the class vectors.
    RETURNS: The chained model.
    """
    width = cfg["token_vector_width"]
    return chain(
        get_pytt_class_tokens,
        flatten_add_lengths,
        with_getitem(0, chain(Affine(width, width), tanh)),
        Pooling(mean_pool),
        # Previously hard-coded to 2 outputs, which silently ignored
        # `nr_class` for any task that is not binary.
        Softmax(nr_class, width),
    )
def Tok2Vec(config):
    """Build a tok2vec model from a config with "@doc2feats", "@embed" and
    "@encode" entries.

    The embed and encode layers run inside `with_flatten`, padded by the
    encoder's `receptive_field` attribute (0 if the encoder has none).

    config (dict): The layer descriptions passed on to `make_layer`.
    RETURNS: The chained model, with `cfg`, `nO`, `embed` and `encode` set
        as attributes.
    """
    doc2feats, embed, encode = (
        make_layer(config[key]) for key in ("@doc2feats", "@embed", "@encode")
    )
    padding = getattr(encode, "receptive_field", 0)
    embed_and_encode = with_flatten(chain(embed, encode), pad=padding)
    tok2vec = chain(doc2feats, embed_and_encode)
    # Expose the parts and the output width on the combined model.
    tok2vec.cfg = config
    tok2vec.nO = encode.nO
    tok2vec.embed = embed
    tok2vec.encode = encode
    return tok2vec
def Tok2Vec(config):
    """Build a tok2vec model from a config with "@doc2feats", "@embed" and
    "@encode" entries.

    The embed and encode layers run inside `with_flatten`, padded by the
    encoder depth read from `config["@encode"]["config"]["depth"]`.

    config (dict): The layer descriptions passed on to `make_layer`.
    RETURNS: The chained model, with `cfg`, `nO`, `embed` and `encode` set
        as attributes.
    """
    doc2feats, embed, encode = (
        make_layer(config[key]) for key in ("@doc2feats", "@embed", "@encode")
    )
    encoder_settings = config["@encode"]["config"]
    padding = encoder_settings["depth"]
    embed_and_encode = with_flatten(chain(embed, encode), pad=padding)
    tok2vec = chain(doc2feats, embed_and_encode)
    # Expose the parts and the output width on the combined model.
    tok2vec.cfg = config
    tok2vec.nO = encode.nO
    tok2vec.embed = embed
    tok2vec.encode = encode
    return tok2vec
def test_multi_model_load_missing_dims():
    """A chain built with missing dims must load weights from serialized bytes."""
    source = chain(Maxout(5, 10, nP=2), Maxout(2, 3)).initialize()
    # Bump each bias in place by a distinct amount so the layers can be
    # told apart after the round-trip.
    for layer, bump in zip(source.layers, (1, 2)):
        bias = layer.get_param("b")
        bias += bump
    payload = source.to_bytes()

    # The target model leaves dimensions unspecified on purpose.
    restored = chain(Maxout(5, nP=None), Maxout(nP=None))
    restored = restored.from_bytes(payload)
    assert restored.layers[0].get_param("b")[0, 0] == 1
    assert restored.layers[1].get_param("b")[0, 0] == 2
def test_walk_dfs():
    """Depth-first walks must visit each node once, in pre- and post-order."""
    relu = Relu(5)
    relu2 = Relu(5)
    inner_chain = chain(relu, relu2)
    # The same inner chain appears twice but should only be yielded once.
    chained = chain(inner_chain, inner_chain)

    pre_order = list(chained.walk(order="dfs_pre"))
    assert pre_order == [chained, inner_chain, relu, relu2]

    post_order = list(chained.walk(order="dfs_post"))
    assert post_order == [relu, relu2, inner_chain, chained]
def build_cloze_characters_multi_task_model(
    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
) -> Model:
    """Build a masked language model that predicts characters.

    vocab (Vocab): Passed through to `build_masked_language_model`.
    tok2vec (Model): The token-to-vector model to train.
    maxout_pieces (int): Number of pieces in the Maxout hidden layer.
    hidden_size (int): Width of the hidden layer.
    nr_char (int): Number of characters predicted per token; each gets its
        own 256-way softmax.
    RETURNS (Model): The masked language model, with "tok2vec" and
        "output_layer" registered as refs.
    """
    hidden = Maxout(nO=hidden_size, nP=maxout_pieces)
    output_layer = chain(
        list2array(),
        hidden,
        LayerNorm(nI=hidden_size),
        MultiSoftmax([256] * nr_char, nI=hidden_size),
    )
    full_network = chain(tok2vec, output_layer)
    model = build_masked_language_model(vocab, full_network)
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("output_layer", output_layer)
    return model
def fine_tune_class_vector(nr_class, *, exclusive_classes=True, **cfg):
    """Select features from the class-vectors from the last hidden state,
    apply an affine + tanh transform, mean-pool them, and then apply an
    affine output layer followed by softmax. The gradients of the class
    vectors are incremented in the backward pass, to allow fine-tuning.

    nr_class (int): Number of output classes; sets the output layer width.
    exclusive_classes (bool): Accepted for interface compatibility; not read
        by this function.
    **cfg: Must contain "token_vector_width", the width of the class vectors.
    RETURNS: The chained model.
    """
    width = cfg["token_vector_width"]
    return chain(
        get_pytt_class_tokens,
        flatten_add_lengths,
        with_getitem(0, chain(Affine(width, width), tanh)),
        Pooling(mean_pool),
        # Previously hard-coded to 2 outputs, which silently ignored
        # `nr_class` for any task that is not binary.
        Affine(nr_class, width, drop_factor=0),
        softmax,
    )
def test_serialize_model_shims_roundtrip_bytes():
    """Shim data must survive a to_bytes/from_bytes round-trip in a chain."""

    def passthrough(model, X, is_train):
        # Identity forward pass with an identity backprop callback.
        return X, lambda dY: dY

    def build_pipeline():
        # A fresh shim and shim layer are needed for every model instance.
        shim = SerializableShim(None)
        shim_layer = Model("shimmodel", passthrough, shims=[shim])
        return chain(Linear(2, 3), shim_layer, Maxout(2, 3))

    model = build_pipeline()
    model.initialize()
    assert model.layers[1].shims[0].value == "shimdata"
    model_bytes = model.to_bytes()

    # Loading into a model with a different structure must be rejected.
    with pytest.raises(ValueError):
        Linear(2, 3).from_bytes(model_bytes)

    new_model = build_pipeline().from_bytes(model_bytes)
    assert new_model.layers[1].shims[0].value == "shimdata from bytes"
def test_recursive_double_wrap():
    """Nodes that occur several times in the tree must be wrapped only once."""

    def dummy_model(name, layers):
        return Model(name, lambda model, X, is_train: ..., layers=layers)

    def wrap_in_dummy(model):
        return dummy_model(f"dummy({model.name})", [model])

    relu = Relu(5)
    chained = chain(relu, relu)
    concat = concatenate(chained, chained, relu)
    concat_wrapped = wrap_model_recursive(concat, wrap_in_dummy)

    n_debug = sum(
        1 for node in concat_wrapped.walk() if node.name.startswith("dummy")
    )
    # There should be 3 unique dummy wrappers:
    # * Around concatenate.
    # * Around chain.
    # * Around relu.
    assert n_debug == 3

    # Unwrap the outer dummy to reach the concatenate node itself.
    inner_concat = concat_wrapped.layers[0]
    for chain_index in (0, 1):
        inner_chain = inner_concat.layers[chain_index].layers[0]
        for relu_index in (0, 1):
            assert inner_chain.layers[relu_index].name == "dummy(relu)"
    assert inner_concat.layers[2].name == "dummy(relu)"
def Model(cls, **cfg) -> Any:
    """Create an instance of `PyTT_Wrapper`, which holds the
    PyTorch-Transformers model, and wrap it for per-sentence processing.

    **cfg: Optional config parameters. "pytt_name" is required;
        "from_pretrained", "words_per_batch" (default 3000) and "max_length"
        (default 512) are optional.
    RETURNS (thinc.neural.Model): The wrapped model.
    RAISES (ValueError): If "pytt_name" is missing or empty.
    """
    name = cfg.get("pytt_name")
    if not name:
        raise ValueError(
            "Need pytt_name argument, e.g. 'bert-base-uncased'")
    make_wrapper = (
        PyTT_Wrapper.from_pretrained if cfg.get("from_pretrained") else PyTT_Wrapper
    )
    pytt_model = make_wrapper(name)
    nO = pytt_model.nO
    batch_by_length = cfg.get("words_per_batch", 3000)
    max_length = cfg.get("max_length", 512)

    truncated = truncate_long_inputs(pytt_model, max_length)
    per_sentence = chain(
        get_word_pieces,
        with_length_batching(truncated, batch_by_length),
    )
    model = foreach_sentence(per_sentence)
    # Expose the output width and the underlying wrapper on the outer model.
    setattr(model, "nO", nO)
    setattr(model, "_model", pytt_model)
    return model
def test_replace_node_with_indirect_node_ref():
    """Replacing a node must also update refs held by nodes that only reach
    it indirectly."""
    #     a
    #   /   \
    #  x    b[y=y]
    #  |    |
    #  y    x
    #       |
    #       y

    def dummy_model(name, layers):
        return Model(name, lambda model, X, is_train: ..., layers=layers)

    y = dummy_model("y", [])
    x = dummy_model("x", [y])
    y_debug = with_debug(y)

    b = dummy_model("b", [x])
    # `b` holds a ref to `y` although `y` is not one of its direct layers.
    b.set_ref("y", y)

    a = chain(x, b)
    a.name = "a"
    a.replace_node(y, y_debug)

    left_branch, right_branch = a.layers[0], a.layers[1]
    assert left_branch.layers[0] == y_debug
    assert right_branch.layers[0].layers[0] == y_debug
    assert right_branch.get_ref("y") == y_debug
def MaxoutWindowEncoder_v1(
    width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[Floats2d, Floats2d]:
    """Encode context using convolutions with maxout activation, layer
    normalization and residual connections.

    width (int): The input and output width. These are required to be the
        same, to allow residual connections. This value will be determined
        by the width of the inputs. Recommended values are between 64 and 300.
    window_size (int): The number of words to concatenate around each token
        to construct the convolution. Recommended value is 1.
    maxout_pieces (int): The number of maxout pieces to use. Recommended
        values are 2 or 3.
    depth (int): The number of convolutional layers. Recommended value is 4.
    RETURNS (Model[Floats2d, Floats2d]): The encoder, with its "nO" dim set
        to `width` and a "receptive_field" attr of `window_size * depth`.
    """
    # Each token is concatenated with `window_size` neighbours on both sides.
    window_width = width * ((window_size * 2) + 1)
    conv_layer = Maxout(
        nO=width,
        nI=window_width,
        nP=maxout_pieces,
        dropout=0.0,
        normalize=True,
    )
    cnn = chain(expand_window(window_size=window_size), conv_layer)
    model = clone(residual(cnn), depth)
    model.set_dim("nO", width)
    model.attrs["receptive_field"] = window_size * depth
    return model
def test_model_gpu():
    """Smoke test: train and evaluate a small MNIST model for two epochs
    after requesting the GPU with `prefer_gpu()`."""
    prefer_gpu()
    n_hidden = 32
    dropout = 0.2
    (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist()
    model = chain(
        Relu(nO=n_hidden, dropout=dropout),
        Relu(nO=n_hidden, dropout=dropout),
        Softmax(),
    )
    # making sure the data is on the right device
    train_X, train_Y, dev_X, dev_Y = (
        model.ops.asarray(data) for data in (train_X, train_Y, dev_X, dev_Y)
    )

    model.initialize(X=train_X[:5], Y=train_Y[:5])
    optimizer = Adam(0.001)
    batch_size = 128

    for _ in range(2):
        train_batches = model.ops.multibatch(
            batch_size, train_X, train_Y, shuffle=True
        )
        for X, Y in train_batches:
            Yh, backprop = model.begin_update(X)
            backprop(Yh - Y)
            model.finish_update(optimizer)
        # Evaluate and print progress
        correct = 0
        total = 0
        dev_batches = model.ops.multibatch(batch_size, dev_X, dev_Y)
        for X, Y in dev_batches:
            Yh = model.predict(X)
            correct += (Yh.argmax(axis=1) == Y.argmax(axis=1)).sum()
            total += Yh.shape[0]
def FancyEmbed(width, rows, cols=(ORTH, SHAPE, PREFIX, SUFFIX)):
    """Embed several feature columns with hash embeddings and mix them with
    a Maxout layer.

    width (int): Width of each embedding table and of the output.
    rows (int): Number of rows in each hash embedding table.
    cols (tuple): The feature columns; one table is created per entry, keyed
        by the entry's position.
    RETURNS: The chained model.
    """
    from thinc.i2v import HashEmbed
    from thinc.v2v import Maxout
    from thinc.api import chain, concatenate

    n_tables = len(cols)
    tables = [HashEmbed(width, rows, column=index) for index in range(n_tables)]
    embeddings = concatenate(*tables)
    # The concatenated embeddings are `width` wide per table.
    mixer = Maxout(width, width * len(tables), pieces=3)
    return chain(embeddings, mixer)
def transformer_tok2vec_v2(
    name: str,
    get_spans,
    tokenizer_config: dict,
    transformer_config: dict,
    pooling: Model[Ragged, Floats2d],
    grad_factor: float = 1.0,
) -> Model[List[Doc], List[Floats2d]]:
    """Use a transformer as a "Tok2Vec" layer directly. This does not allow
    multiple components to share the transformer weights, and does not allow
    the transformer to set annotations into the `Doc` object, but it's a
    simpler solution if you only need the transformer within one component.

    name (str): Passed to `TransformerModel` as the transformer name.
    get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
        spans from the batch of Doc objects. See the "TransformerModel" layer
        for details.
    tokenizer_config (dict): Settings to pass to the transformers tokenizer.
    transformer_config (dict): Settings to pass to the transformers forward
        pass of the transformer.
    pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
        the token vectors based on zero or more wordpiece vectors. If in
        doubt, mean pooling (see `thinc.layers.reduce_mean`) is usually a
        good choice.
    grad_factor (float): Reweight gradients from the component before passing
        them to the transformer. You can set this to 0 to "freeze" the
        transformer weights with respect to the component, or to make it
        learn more slowly. Leaving it at 1.0 is usually fine.
    RETURNS (Model[List[Doc], List[Floats2d]]): The chained model.
    """
    transformer = TransformerModel(
        name, get_spans, tokenizer_config, transformer_config
    )
    splitter = split_trf_batch()
    to_token_arrays = trfs2arrays(pooling, grad_factor)
    return chain(transformer, splitter, to_token_arrays)
def test_recursive_double_wrap():
    """Recursive wrapping with `with_debug` must wrap every node in the tree."""
    relu = Relu(5)
    chained = chain(relu, relu)
    concat = concatenate(chained, chained)
    concat_debug = wrap_model_recursive(concat, with_debug)

    n_debug = sum(
        1 for node in concat_debug.walk() if node.name.startswith("debug")
    )
    # There should be 5 unique debug wrappers:
    # * Around concatenate. (= 1)
    # * One around each chain in concatenate. (= 2)
    # * One around each relu in the chain. (= 2)
    assert n_debug == 5

    # Unwrap the outer debug layer to reach the concatenate node itself.
    inner_concat = concat_debug.layers[0]
    for chain_index in (0, 1):
        inner_chain = inner_concat.layers[chain_index].layers[0]
        for relu_index in (0, 1):
            assert inner_chain.layers[relu_index].name == "debug(relu)"
def test_with_debug():
    """The `with_debug` callbacks must fire once per init, forward and
    backprop call of the wrapped layer."""
    pytest.importorskip("ml_datasets")
    import ml_datasets

    (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist()
    counts = Counter()

    def counter_for(event):
        # Build a callback that ignores its arguments and tallies `event`.
        def callback(*_):
            counts[event] += 1

        return callback

    relu = Relu()
    relu2 = with_debug(
        Relu(),
        on_init=counter_for("init"),
        on_forward=counter_for("forward"),
        on_backprop=counter_for("backprop"),
    )
    # The debugged layer appears twice, so every event is counted twice.
    chained = chain(relu, relu2, relu2)
    sample_X, sample_Y = train_X[:5], train_Y[:5]
    chained.initialize(X=sample_X, Y=sample_Y)
    _, backprop = chained(X=train_X[:5], is_train=False)

    # Not real loss gradients, but we don't care for testing.
    backprop(train_Y[:5])

    # Four times forward, because initialization also applies forward for
    # validation.
    assert counts == {"init": 2, "forward": 4, "backprop": 2}
def transformer_listener_tok2vec_v1(
    pooling: Model[Ragged, Floats2d], grad_factor: float = 1.0, upstream: str = "*"
) -> Model[List[Doc], List[Floats2d]]:
    """Create a 'TransformerListener' layer, which will connect to a
    Transformer component earlier in the pipeline.

    The layer takes a list of Doc objects as input, and produces a list of
    2d arrays as output, with each array having one row per token. Most spaCy
    models expect a sublayer with this signature, making it easy to connect
    them to a transformer model via this sublayer.

    Transformer models usually operate over wordpieces, which usually don't
    align one-to-one against spaCy tokens. The layer therefore requires a
    reduction operation in order to calculate a single token vector given
    zero or more wordpiece vectors.

    pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
        the token vectors based on zero or more wordpiece vectors. If in
        doubt, mean pooling (see `thinc.layers.reduce_mean`) is usually a
        good choice.
    grad_factor (float): Reweight gradients from the component before passing
        them upstream. You can set this to 0 to "freeze" the transformer
        weights with respect to the component, or use it to make some
        components more significant than others. Leaving it at 1.0 is usually
        fine.
    upstream (str): A string to identify the 'upstream' Transformer to
        communicate with. The upstream name should either be the wildcard
        string '*', or the name of the `Transformer` component. You'll almost
        never have multiple upstream Transformer components, so the wildcard
        string will almost always be fine.
    RETURNS (Model[List[Doc], List[Floats2d]]): The chained model, with the
        listener registered under the "listener" ref.
    """
    listener = TransformerListener(upstream_name=upstream)
    to_token_arrays = trfs2arrays(pooling, grad_factor)
    model = chain(listener, to_token_arrays)
    model.set_ref("listener", listener)
    return model
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`.

    Each layer is chained with `flatten` before concatenation, and the
    concatenated output is unflattened again using the input lengths.

    *layers: The models to combine. With no layers, a `noop()` is returned.
    **kwargs: "drop_factor" (default 1.0) scales the dropout rate.
    RETURNS: The wrapped model.
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    flattened_layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*flattened_layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        # A dropout of None is passed through unscaled.
        if drop is not None:
            drop *= drop_factor
        seq_lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)

        def concatenate_lists_bwd(d_ys, sgd=None):
            flat_gradient = ops.flatten(d_ys)
            return bp_flat_y(flat_gradient, sgd=sgd)

        # Restore the per-sequence list structure of the input.
        return ops.unflatten(flat_y, seq_lengths), concatenate_lists_bwd

    return wrap(concatenate_lists_fwd, concat)
def main(
    n_hidden: int = 256, dropout: float = 0.2, n_iter: int = 10, batch_size: int = 128
):
    """Train a two-hidden-layer network on MNIST and print dev accuracy
    after every epoch.

    n_hidden (int): Width of each hidden Relu layer.
    dropout (float): Dropout rate of the hidden layers.
    n_iter (int): Number of training epochs.
    batch_size (int): Batch size for training and evaluation.
    """
    # Define the model
    hidden_layers = (
        Relu(nO=n_hidden, dropout=dropout),
        Relu(nO=n_hidden, dropout=dropout),
    )
    model: Model = chain(*hidden_layers, Softmax())
    # Load the data
    (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist()
    # Set any missing shapes for the model.
    model.initialize(X=train_X[:5], Y=train_Y[:5])
    train_data = model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True)
    dev_data = model.ops.multibatch(batch_size, dev_X, dev_Y)
    # Create the optimizer.
    optimizer = Adam(0.001)
    for epoch in range(n_iter):
        for X, Y in tqdm(train_data, leave=False):
            Yh, backprop = model.begin_update(X)
            # The difference between prediction and target is used as the
            # gradient.
            backprop(Yh - Y)
            model.finish_update(optimizer)
        # Evaluate and print progress
        correct = 0
        total = 0
        for X, Y in dev_data:
            Yh = model.predict(X)
            hits = Yh.argmax(axis=1) == Y.argmax(axis=1)
            correct += hits.sum()
            total += Yh.shape[0]
        score = correct / total
        msg.row((epoch, f"{score:.3f}"), widths=(3, 5))
def transformer_listener_tok2vec_v1(
    pooling: Model[Ragged, Floats2d], grad_factor: float = 1.0
) -> Model[List[Doc], List[Floats2d]]:
    """Create a 'TransformerListener' layer, which will connect to a
    Transformer component earlier in the pipeline.

    The layer takes a list of Doc objects as input, and produces a list of
    2d arrays as output, with each array having one row per token. Most spaCy
    models expect a sublayer with this signature, making it easy to connect
    them to a transformer model via this sublayer.

    Transformer models usually operate over wordpieces, which usually don't
    align one-to-one against spaCy tokens. The layer therefore requires a
    reduction operation in order to calculate a single token vector given
    zero or more wordpiece vectors.

    pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
        the token vectors based on zero or more wordpiece vectors. If in
        doubt, mean pooling (see `thinc.layers.reduce_mean`) is usually a
        good choice.
    grad_factor (float): Reweight gradients from the component before passing
        them upstream. You can set this to 0 to "freeze" the transformer
        weights with respect to the component, or use it to make some
        components more significant than others. Leaving it at 1.0 is usually
        fine.
    RETURNS (Model[List[Doc], List[Floats2d]]): The chained model.
    """
    # The listener is always created with the fixed name "transformer".
    listener = TransformerListener("transformer")
    to_token_arrays = trfs2arrays(pooling, grad_factor)
    return chain(listener, to_token_arrays)
def test_replace_node():
    """`replace_node` must swap a node in layers, refs and nested children."""
    relu1 = Relu(5)
    relu2 = Relu(5)
    relu_chain = chain(relu1, relu2)
    relu1_debug = with_debug(relu1)
    debug = Model(
        "test",
        lambda X: (X, lambda dY: dY),
        layers=[relu1, relu2, relu1, relu_chain],
        refs={"relu1": relu1, "relu2": relu2, "relu3": relu1},
    )
    debug.replace_node(relu1, relu1_debug)

    # Direct layers: both occurrences of relu1 are swapped, relu2 is not.
    for index, expected in enumerate((relu1_debug, relu2, relu1_debug)):
        assert debug.layers[index] == expected

    # Refs that pointed at relu1 now point at the replacement.
    expected_refs = (
        ("relu1", relu1_debug),
        ("relu2", relu2),
        ("relu3", relu1_debug),
    )
    for ref_name, expected in expected_refs:
        assert debug.get_ref(ref_name) == expected

    # Check that nodes are replaced recursively
    nested = debug.layers[3]
    assert nested == relu_chain
    assert nested.layers[0] == relu1_debug
    assert nested.layers[1] == relu2
def TransformersTagger(
    starter: str, n_tags: int = 17
) -> Model[List[List[str]], List[Floats2d]]:
    """Build a tagger: tokenizer, transformer, then a softmax over tags.

    starter (str): Passed to both `TransformersTokenizer` and `Transformer`.
    n_tags (int): Output width of the softmax layer.
    RETURNS (Model[List[List[str]], List[Floats2d]]): The chained model.
    """
    tokenizer = TransformersTokenizer(starter)
    transformer = Transformer(starter)
    tag_scorer = with_array(Softmax(nO=n_tags))
    return chain(tokenizer, transformer, tag_scorer)
def tok2vec_per_sentence(model_name, cfg):
    """Wrap a model so that it is applied per sentence on word pieces.

    model_name: Handed to `with_length_batching` as the object to batch.
    cfg (dict): Must contain "trf_name"; "words_per_batch" is optional and
        defaults to 1000.
    RETURNS: The model produced by `foreach_sentence`.
    """
    max_words = cfg.get("words_per_batch", 1000)
    name = cfg["trf_name"]
    word_pieces = get_word_pieces(name)
    batched = with_length_batching(model_name, max_words)
    return foreach_sentence(chain(word_pieces, batched))
def test_validation_complex():
    """A well-typed chain initializes and predicts; a mismatched one raises
    DataValidationError on initialize."""
    good_model = chain(
        list2ragged(), reduce_sum(), Relu(12, dropout=0.5), Relu(1)
    )
    xp = good_model.ops.xp
    X = [xp.zeros((4, 75), dtype="f")]
    Y = xp.zeros((1,), dtype="f")
    good_model.initialize(X, Y)
    good_model.predict(X)

    bad_layers = [
        list2ragged(),
        reduce_sum(),
        Relu(12, dropout=0.5),
        # ERROR: Why can't I attach a Relu to an attention layer?
        ParametricAttention(12),
        Relu(1),
    ]
    bad_model = chain(*bad_layers)
    with pytest.raises(DataValidationError):
        bad_model.initialize(X, Y)
def test_validation():
    """Initializing this chain must raise DataValidationError for each of
    three input shapes."""
    model = chain(Relu(10), Relu(10), with_ragged(reduce_max()), Softmax())
    # Inputs are built lazily so each allocation happens inside the
    # `pytest.raises` block.
    input_factories = (
        lambda: model.ops.alloc2f(1, 10),
        lambda: model.ops.alloc3f(1, 10, 1),
        lambda: [model.ops.alloc2f(1, 10)],
    )
    for make_X in input_factories:
        with pytest.raises(DataValidationError):
            model.initialize(X=make_X(), Y=model.ops.alloc2f(1, 10))
def baseline_mwe(nO, nP, depth):
    """Build a stack of residual window-convolution blocks sharing one
    Maxout layer.

    nO (int): Output width of the Maxout layer; its input width is `nO * 3`.
    nP (int): Number of Maxout pieces.
    depth (int): How many times the residual block is repeated in the chain.
    RETURNS: The model wrapped in `with_flatten`, with the shared `maxout`
        and `normalize` layers attached as attributes.
    """
    from thinc.neural._classes.model import Model
    from thinc.neural._classes.resnet import Residual
    from thinc.neural._classes.convolution import ExtractWindow
    from thinc.neural._classes.layernorm import LayerNorm
    from thinc.api import chain, clone, with_flatten

    maxout = Maxout(nO, nO*3, pieces=nP)
    normalize = LayerNorm(maxout)
    with Model.define_operators({'>>': chain, '**': clone}):
        block = Residual(ExtractWindow(nW=1) >> normalize)
        # The same block object is repeated, so every repetition shares
        # its weights.
        repeated_blocks = [block] * depth
        model = with_flatten(chain(*repeated_blocks))
    model.maxout = maxout
    model.normalize = normalize
    return model
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier, given a token-to-vector model as inputs.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
    is applied instead, so that outputs are in the range [0, 1].

    The returned model has `tok2vec` (the input model chained with `flatten`)
    and `nO` (set to `nr_class`) attached as attributes.
    """
    with Model.define_operators({">>": chain}):
        token_width = tok2vec.nO
        if not exclusive_classes:
            scores = zero_init(Affine(nr_class, token_width, drop_factor=0.0))
            output_layer = scores >> logistic
        else:
            output_layer = Softmax(nr_class, token_width)
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model
def main(width=32, nr_vector=1000):
    """Train a hash-embedding tagger on the AnCora POS data, printing the
    dev evaluation after each epoch and once more with averaged parameters.

    width (int): Width of the embedding and hidden layers.
    nr_vector (int): Number of rows in the hash embedding table.
    """
    train_data, check_data, nr_tag = ancora_pos_tags(encode_words=True)

    layers = (
        HashEmbed(width, nr_vector),
        ReLu(width, width),
        ReLu(width, width),
        Softmax(nr_tag, width),
    )
    model = with_flatten(chain(*layers))

    train_X, train_y = zip(*train_data)
    dev_X, dev_y = zip(*check_data)
    train_y = [to_categorical(tags, nb_classes=nr_tag) for tags in train_y]
    dev_y = [to_categorical(tags, nb_classes=nr_tag) for tags in dev_y]

    def report_dev_score():
        print(model.evaluate(dev_X, dev_y))

    with model.begin_training(train_X, train_y) as (trainer, optimizer):
        trainer.each_epoch.append(report_dev_score)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # One gradient per sequence: prediction minus target.
            gradients = [guess - y[i] for i, guess in enumerate(yh)]
            backprop(gradients, optimizer)
    with model.use_params(optimizer.averages):
        report_dev_score()
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`.

    Each layer is chained with `flatten` before concatenation, and the
    concatenated output is unflattened again using the input lengths.

    *layers: The models to combine. With no layers, a `noop()` is returned.
    **kwargs: "drop_factor" (default 1.0) scales the dropout rate.
    RETURNS: The wrapped model.
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        # `drop=None` would make the multiplication raise a TypeError, so
        # pass None through unscaled.
        if drop is not None:
            drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        # Restore the per-sequence list structure of the input.
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model
def build_text_classifier(nr_class, width=64, **cfg):
    """Build a text classifier that combines a bag-of-words model with a
    CNN + attention model.

    nr_class (int): Number of output classes.
    width (int): Width of the embedding and hidden layers.
    **cfg: Optional settings read here: "depth" (default 2), "nr_vector"
        (default 5000), "pretrained_dims" (default 0), "low_data",
        "ngram_size" (default 1) and "exclusive_classes".
    RETURNS: The model. In the "low_data" branch a smaller model is returned
        early and the `tok2vec`, `nO` and `lsuv` attributes are not set.
    """
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        # Low-data shortcut: build the model on pretrained vectors only and
        # return immediately.
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        # One hash embedding table per feature column (columns 1-4).
        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        # Concatenated embeddings are width + 3 * (width // 2) wide and are
        # mixed back down to `width` by the Maxout layer.
        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        ) >> with_flatten(
            uniqued(
                (lower | prefix | suffix | shape)
                >> LN(Maxout(width, width + (width // 2) * 3)),
                column=0,
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            # Trained and static vectors are each `width` wide.
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        # Mix the vectors down to `width`, then apply `depth` residual
        # window-convolution blocks.
        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
            pad=depth,
        )
        # CNN branch: attention-weighted sum over tokens, then class scores.
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        # Bag-of-words branch, built by build_bow_text_classifier.
        linear_model = build_bow_text_classifier(
            nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
        )
        # Both branches output `nr_class` scores, so the concatenated input
        # to the output layer is `nr_class * 2` wide.
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model