Example No. 1
def TextCatCNN_v1(tok2vec: Model,
                  exclusive_classes: bool,
                  nO: Optional[int] = None) -> Model[List[Doc], Floats2d]:
    """
    Build a simple CNN text classifier, given a token-to-vector model as input.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
    is applied instead, so that outputs are in the range [0, 1].
    """
    chain = registry.get("layers", "chain.v1")
    reduce_mean = registry.get("layers", "reduce_mean.v1")
    Logistic = registry.get("layers", "Logistic.v1")
    Softmax = registry.get("layers", "Softmax.v1")
    Linear = registry.get("layers", "Linear.v1")
    list2ragged = registry.get("layers", "list2ragged.v1")

    with Model.define_operators({">>": chain}):
        cnn = tok2vec >> list2ragged() >> reduce_mean()
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
            model = cnn >> output_layer
            model.set_ref("output_layer", output_layer)
        else:
            linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
            model = cnn >> linear_layer >> Logistic()
            model.set_ref("output_layer", linear_layer)
    model.set_ref("tok2vec", tok2vec)
    model.set_dim("nO", nO)
    model.attrs["multi_label"] = not exclusive_classes
    return model
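As a quick sanity check of the docstring's claim, here is a small standalone numpy sketch (not part of the original example) contrasting the two output non-linearities:

import numpy as np

scores = np.array([2.0, 1.0, 0.1])
softmax = np.exp(scores) / np.exp(scores).sum()
logistic = 1.0 / (1.0 + np.exp(-scores))
# Softmax scores sum to 1, as expected for mutually exclusive classes...
assert np.isclose(softmax.sum(), 1.0)
# ...while logistic scores each lie in (0, 1) independently (multi-label).
assert ((logistic > 0.0) & (logistic < 1.0)).all()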
Example No. 2
def test_serialize_attrs():
    fwd = lambda model, X, is_train: (X, lambda dY: dY)
    attrs = {"test": "foo"}
    model1 = Model("test", fwd, attrs=attrs).initialize()
    bytes_attr = serialize_attr(model1.attrs["test"], attrs["test"], "test",
                                model1)
    assert bytes_attr == srsly.msgpack_dumps("foo")
    model2 = Model("test", fwd, attrs={"test": ""})
    result = deserialize_attr(model2.attrs["test"], bytes_attr, "test", model2)
    assert result == "foo"

    # Test objects with custom serialization functions
    @serialize_attr.register(SerializableAttr)
    def serialize_attr_custom(_, value, name, model):
        return value.to_bytes()

    @deserialize_attr.register(SerializableAttr)
    def deserialize_attr_custom(_, value, name, model):
        return SerializableAttr().from_bytes(value)

    attrs = {"test": SerializableAttr()}
    model3 = Model("test", fwd, attrs=attrs)
    bytes_attr = serialize_attr(model3.attrs["test"], attrs["test"], "test",
                                model3)
    assert bytes_attr == b"foo"
    model4 = Model("test", fwd, attrs=attrs)
    assert model4.attrs["test"].value == "foo"
    result = deserialize_attr(model4.attrs["test"], bytes_attr, "test", model4)
    assert result.value == "foo from bytes"
Example No. 3
def forward(model: Model, docs: List[Doc], is_train: bool):
    if docs is None:
        return []
    ids = []
    output = []
    E = model.get_param("E")
    nC = model.get_dim("nC")
    nM = model.get_dim("nM")
    nO = model.get_dim("nO")
    # This assists in indexing; it's like looping over this dimension.
    # Still feels like weird witchcraft, but thanks to Mark Neumann for the tip.
    nCv = model.ops.xp.arange(nC)
    for doc in docs:
        doc_ids = model.ops.asarray(doc.to_utf8_array(nr_char=nC))
        doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
        # Given a 2d array of indices and a 3d table of data, this advanced-
        # indexing incantation gathers output[i, j, k] == data[j, ids[i, j], k].
        doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
        output.append(doc_vectors.reshape((len(doc), nO)))
        ids.append(doc_ids)

    def backprop(d_output):
        dE = model.ops.alloc(E.shape, dtype=E.dtype)
        for doc_ids, d_doc_vectors in zip(ids, d_output):
            d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
            dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
        model.inc_grad("E", dE)
        return []

    return output, backprop
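The "incantation" the comments refer to is numpy advanced indexing with a broadcast arange. A minimal standalone demo with illustrative shapes (not spaCy code):

import numpy as np

n_tokens, nC, n_chars, nM = 4, 3, 10, 2
E = np.random.rand(nC, n_chars, nM)                   # 3d table of data
ids = np.random.randint(0, n_chars, (n_tokens, nC))   # 2d array of indices
nCv = np.arange(nC)
out = E[nCv, ids[:, nCv]]                             # shape (n_tokens, nC, nM)
# out[i, j, k] == E[j, ids[i, j], k], exactly as described above.
assert np.array_equal(out[1, 2], E[2, ids[1, 2]])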
Example No. 4
def init(model: Model, X=None, Y=None):
    if model.attrs["has_transformer"]:
        return
    name = model.attrs["name"]
    tok_cfg = model._init_tokenizer_config
    trf_cfg = model._init_transformer_config
    tokenizer, transformer = huggingface_from_pretrained(
        name, tok_cfg, trf_cfg)
    model.attrs["set_transformer"](model, transformer, tokenizer)
    tokenizer = model.tokenizer
    # Call the model with a batch of inputs to infer the width
    if X:
        # If we're dealing with actual texts, do the work to setup the wordpieces
        # batch properly
        docs = X
        get_spans = model.attrs["get_spans"]
        nested_spans = get_spans(docs)
        flat_spans = []
        for doc_spans in nested_spans:
            flat_spans.extend(doc_spans)
        token_data = huggingface_tokenize(tokenizer,
                                          [span.text for span in flat_spans])
        wordpieces = WordpieceBatch.from_batch_encoding(token_data)
        align = get_alignment(flat_spans, wordpieces.strings,
                              tokenizer.all_special_tokens)
        wordpieces, align = truncate_oversize_splits(
            wordpieces, align, tokenizer.model_max_length)
    else:
        texts = ["hello world", "foo bar"]
        token_data = huggingface_tokenize(tokenizer, texts)
        wordpieces = WordpieceBatch.from_batch_encoding(token_data)
    model.layers[0].initialize(X=wordpieces)
    model_output = model.layers[0].predict(wordpieces)
    model.set_dim("nO", model_output.last_hidden_state.shape[-1])
Example No. 5
def build_masked_language_model(
    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
) -> Model:
    """Convert a model into a BERT-style masked language model"""
    random_words = _RandomWords(vocab)

    def mlm_forward(model, docs, is_train):
        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
        output, backprop = model.layers[0](docs, is_train)

        def mlm_backward(d_output):
            d_output *= 1 - mask
            return backprop(d_output)

        return output, mlm_backward

    def mlm_initialize(model: Model, X=None, Y=None):
        wrapped = model.layers[0]
        wrapped.initialize(X=X, Y=Y)
        for dim in wrapped.dim_names:
            if wrapped.has_dim(dim):
                model.set_dim(dim, wrapped.get_dim(dim))

    mlm_model = Model(
        "masked-language-model",
        mlm_forward,
        layers=[wrapped_model],
        init=mlm_initialize,
        refs={"wrapped": wrapped_model},
        dims={dim: None for dim in wrapped_model.dim_names},
    )
    mlm_model.set_ref("wrapped", wrapped_model)
    return mlm_model
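A tiny numpy sketch of what mlm_backward does to the gradient; it assumes, as in spaCy's _apply_mask, that the mask is 1 for tokens left intact and 0 for corrupted ones:

import numpy as np

d_output = np.ones((4, 5), dtype="f")          # fake gradient, one row per token
mask = np.array([[1.0], [0.0], [1.0], [1.0]])  # 1 = token left intact
d_output *= 1 - mask
# Only row 1, the corrupted token, keeps a learning signal.
assert d_output.sum() == 5.0
assert (d_output[0] == 0.0).all()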
Example No. 6
def __init__(self, upstream_name: str):
    Model.__init__(self, name=self.name, forward=forward, dims={"nO": None})
    self.upstream_name = upstream_name
    self._batch_id = None
    self._outputs = None
    self._backprop = None
Example No. 7
def get_tok2vec_width(model: Model):
    nO = None
    if model.has_ref("tok2vec"):
        tok2vec = model.get_ref("tok2vec")
        if tok2vec.has_dim("nO"):
            nO = tok2vec.get_dim("nO")
        elif tok2vec.has_ref("listener"):
            nO = tok2vec.get_ref("listener").get_dim("nO")
    return nO
Example No. 8
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
    with Model.define_operators({">>": chain, "**": clone}):
        token_width = tok2vec.get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        model = (
            tok2vec
            >> list2ragged()
            >> reduce_mean()
            >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0))
            >> output_layer
        )
        model.set_ref("output_layer", output_layer)
        model.set_ref("tok2vec", tok2vec)
    return model
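One detail worth calling out: residual adds its input to its output, so the wrapped Maxout must map token_width back to token_width. A standalone thinc check (assuming thinc is installed):

import numpy as np
from thinc.api import Maxout, residual

layer = residual(Maxout(nO=4, nI=4, nP=2, dropout=0.0))
layer.initialize(X=np.zeros((2, 4), dtype="f"))
Y = layer.predict(np.zeros((2, 4), dtype="f"))
assert Y.shape == (2, 4)  # the skip connection requires nI == nO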
Example No. 9
def build_nel_encoder(tok2vec: Model,
                      nO: Optional[int] = None) -> Model[List[Doc], Floats2d]:
    with Model.define_operators({">>": chain, "&": tuplify}):
        token_width = tok2vec.maybe_get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        model = (
            ((tok2vec >> list2ragged()) & build_span_maker())
            >> extract_spans()
            >> reduce_mean()
            >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0))
            >> output_layer
        )
        model.set_ref("output_layer", output_layer)
        model.set_ref("tok2vec", tok2vec)
    # flag to show this isn't legacy
    model.attrs["include_span_maker"] = True
    return model
Example No. 10
def __init__(self, upstream_name: str, width: int) -> None:
    """
    upstream_name (str): A string to identify the 'upstream' Tok2Vec component
        to communicate with. The upstream name should either be the wildcard
        string '*', or the name of the `Tok2Vec` component. You'll almost
        never have multiple upstream Tok2Vec components, so the wildcard
        string will almost always be fine.
    width (int):
        The width of the vectors produced by the upstream tok2vec component.
    """
    Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
    self.upstream_name = upstream_name
    self._batch_id = None
    self._outputs = None
    self._backprop = None
Example No. 11
def test_serialize_model_shims_roundtrip_bytes():
    fwd = lambda model, X, is_train: (X, lambda dY: dY)
    test_shim = SerializableShim(None)
    shim_model = Model("shimmodel", fwd, shims=[test_shim])
    model = chain(Linear(2, 3), shim_model, Maxout(2, 3))
    model.initialize()
    assert model.layers[1].shims[0].value == "shimdata"
    model_bytes = model.to_bytes()
    with pytest.raises(ValueError):
        Linear(2, 3).from_bytes(model_bytes)
    test_shim = SerializableShim(None)
    shim_model = Model("shimmodel", fwd, shims=[test_shim])
    new_model = chain(Linear(2, 3), shim_model,
                      Maxout(2, 3)).from_bytes(model_bytes)
    assert new_model.layers[1].shims[0].value == "shimdata from bytes"
Example No. 12
def test_tuplify_operator_three(model1, model2, model3):
    # Previously we 'flattened' these nested calls. We might opt to do so
    # again, especially for the operators.
    with Model.define_operators({"&": tuplify}):
        model = model1 & model2 & model3
        assert len(model.layers) == 2
        assert len(model.layers[0].layers) == 2
Example No. 13
def CharNgramsEmbedding(
    n: int,
    max_chars: int,
    lower: bool,
    num_vectors: int,
    embed_dim: int,
    dropout: Optional[float],
) -> Model[List[str], thinc.types.Floats1d]:
    """
    Args:
        n
        max_chars
        lower
        num_vectors
        embed_dim
        dropout
    """
    with Model.define_operators({">>": chain}):
        model = (
            text_to_char_ngrams(n, max_chars, lower)
            >> thinc.layers.strings2arrays()
            >> thinc.layers.with_array(
                thinc.layers.HashEmbed(
                    nO=embed_dim,
                    nV=num_vectors,
                    dropout=dropout,
                    column=0,
                )
            )
            >> thinc.layers.list2ragged()
            >> thinc.layers.reduce_mean()
        )
    return model
Example No. 14
def trfs2arrays(
    pooling: Model[Ragged, Floats2d], grad_factor: float
) -> Model[List[TransformerData], List[Floats2d]]:
    """Pool transformer data into token-aligned tensors."""
    return Model(
        "trfs2arrays", forward, layers=[pooling], attrs={"grad_factor": grad_factor},
    )
Example No. 15
def cnn_tagger(width: int, vector_width: int, nr_classes: int = 17):
    with Model.define_operators({">>": chain}):
        model = strings2arrays() >> with_array(
            HashEmbed(nO=width, nV=vector_width, column=0)
            >> expand_window(window_size=1)
            >> Relu(nO=width, nI=width * 3)
            >> Relu(nO=width, nI=width)
            >> Softmax(nO=nr_classes, nI=width)
        )
    return model
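Because every dimension is given explicitly, this tagger can be built and run on toy data with no other setup. A usage sketch (assuming thinc is installed; 17 classes by default):

model = cnn_tagger(width=32, vector_width=16)
X = [["the", "cat", "sat"], ["on", "the", "mat"]]
model.initialize(X=X)
preds = model.predict(X)
assert len(preds) == 2
assert preds[0].shape == (3, 17)  # one row of class scores per token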
Example No. 16
def TransformerModel(
    name: str, get_spans: Callable, tokenizer_config: dict = {}, transformer_config: dict = {}
) -> Model[List[Doc], FullTransformerBatch]:
    """
    get_spans (Callable[[List[Doc]], List[Span]]):
        A function to extract spans from the batch of Doc objects.
        This is used to manage long documents, by cutting them into smaller
        sequences before running the transformer. The spans are allowed to
        overlap, and you can also omit sections of the Doc if they are not
        relevant.
    tokenizer_config (dict): Settings to pass to the transformers tokenizer.
    transformer_config (dict): Settings to pass to the transformers forward pass.
    """

    return Model(
        "transformer",
        forward,
        init=init,
        layers=[],
        dims={"nO": None},
        attrs={
            "tokenizer": None,
            "get_spans": get_spans,
            "name": name,
            "tokenizer_config": tokenizer_config,
            "transformer_config": transformer_config,
            "set_transformer": set_pytorch_transformer,
            "has_transformer": False,
            "flush_cache_chance": 0.0,
            "replace_listener": replace_listener,
            "replace_listener_cfg": replace_listener_cfg,
        },
    )
Example No. 17
def LangIdentifierModelV2(
    ns: Sequence[int] = (1, 2, 3),
    embed_dim: int = 100,
    hidden_width: int = 512,
    dropout: Optional[float] = 0.1,
) -> Model[List[str], thinc.types.Floats2d]:
    """
    Build a language identification model inspired by Google's CLD3.

    Args:
        ns: Set of "n" for which character "n"-grams are extracted from input texts.
            If 1, only unigrams (single characters) are used; if [1, 2], then both
            unigrams and bigrams are used; and so on.
        embed_dim: Size of the vectors into which each set of ngrams are embedded.
        hidden_width: Width of the dense layer with Relu activation, just before
            the final prediction (Softmax) layer.
        dropout: Dropout rate to avoid overfitting.

    Returns:
        Thinc :class:`Model`.
    """
    with Model.define_operators({">>": chain}):
        model = (
            MultiCharNgramsEmbedding(
                ns=list(ns),
                max_chars=1000,
                lower=True,
                num_vectors=[2000 * n for n in ns],
                embed_dims=embed_dim,
                dropout=dropout,
            )
            >> thinc.layers.Relu(
                nI=embed_dim * len(ns),
                nO=hidden_width,
                dropout=dropout,
            )
            >> thinc.layers.Softmax(nI=hidden_width)
        )
    return model
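In textacy this model is wrapped by a LangIdentifier object that loads trained weights, so direct use is unusual; a purely hypothetical sketch of the expected types (names and data here are illustrative):

# model = LangIdentifierModelV2()
# model.initialize(X=["hello there", "bonjour"], Y=label_array)  # Floats2d labels
# probs = model.predict(["guten tag"])  # Floats2d: one row of language probabilities per text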
Example No. 18
def _resume_model(
    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
) -> None:
    msg = Printer(no_print=silent)
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
        model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name:
        # Default weight file name, so parse the epoch number from it by stripping 'model' and '.bin'
        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
        msg.info(f"Resuming from epoch: {epoch_resume}")
    else:
        msg.info(f"Resuming from epoch: {epoch_resume}")
Example No. 19
def TransitionModel(tok2vec,
                    lower,
                    upper,
                    resize_output,
                    dropout=0.2,
                    unseen_classes=set()):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
        upper = noop()
    else:
        has_upper = True
    # don't define nO for this object, because we can't dynamically change it
    return Model(
        name="parser_model",
        forward=forward,
        dims={"nI": tok2vec.maybe_get_dim("nI")},
        layers=[tok2vec, lower, upper],
        refs={
            "tok2vec": tok2vec,
            "lower": lower,
            "upper": upper
        },
        init=init,
        attrs={
            "has_upper": has_upper,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output,
        },
    )
Example No. 20
def make_update(model: Model, docs: Iterable[Doc], optimizer: Optimizer,
                objective_func: Callable) -> float:
    """Perform an update over a single batch of documents.

    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    RETURNS loss: A float for the loss.
    """
    predictions, backprop = model.begin_update(docs)
    loss, gradients = objective_func(model.ops, docs, predictions)
    backprop(gradients)
    model.finish_update(optimizer)
    # Don't want to return a cupy object here
    # The gradients are modified in-place by the BERT MLM,
    # so we get an accurate loss
    return float(loss)
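A hedged sketch of the surrounding training loop; model, batches, and objective_func are assumed to exist as in spaCy's pretraining code, and Adam comes from thinc:

from thinc.api import Adam

optimizer = Adam(0.001)
for epoch in range(10):
    for batch in batches:  # each batch is a list of Doc objects
        loss = make_update(model, batch, optimizer, objective_func)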
Example No. 21
def build_text_classifier_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    linear_model: Model[List[Doc], Floats2d],
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    exclusive_classes = not linear_model.attrs["multi_label"]
    with Model.define_operators({">>": chain, "|": concatenate}):
        width = tok2vec.maybe_get_dim("nO")
        # TODO: benchmark the performance difference of this layer
        attention_layer = ParametricAttention(width)
        maxout_layer = Maxout(nO=width, nI=width)
        norm_layer = LayerNorm(nI=width)
        cnn_model = (
            tok2vec
            >> list2ragged()
            >> attention_layer
            >> reduce_sum()
            >> residual(maxout_layer >> norm_layer >> Dropout(0.0))
        )

        nO_double = nO * 2 if nO else None
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=nO_double)
        else:
            output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
        model = (linear_model | cnn_model) >> output_layer
        model.set_ref("tok2vec", tok2vec)
    if model.has_dim("nO") is not False:
        model.set_dim("nO", nO)
    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
    model.set_ref("attention_layer", attention_layer)
    model.set_ref("maxout_layer", maxout_layer)
    model.set_ref("norm_layer", norm_layer)
    model.attrs["multi_label"] = not exclusive_classes

    model.init = init_ensemble_textcat
    return model
Example No. 22
def MultiCharNgramsEmbedding(
    ns: List[int],
    max_chars: int,
    lower: bool,
    num_vectors: int | List[int],
    embed_dims: int | List[int],
    dropout: Optional[float],
) -> Model[List[str], thinc.types.Floats1d]:
    """
    Args:
        ns
        max_chars
        lower
        num_vectors
        embed_dims
        dropout
    """
    numn = len(ns)
    num_vectors = [num_vectors] * numn if isinstance(num_vectors, int) else num_vectors
    embed_dims = [embed_dims] * numn if isinstance(embed_dims, int) else embed_dims
    with Model.define_operators({">>": chain}):
        model = concatenate(*[
            CharNgramsEmbedding(
                n=n,
                max_chars=max_chars,
                lower=lower,
                num_vectors=nvec,
                embed_dim=edim,
                dropout=dropout,
            ) for n, nvec, edim in zip(ns, num_vectors, embed_dims)
        ])
    return model
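concatenate joins each sub-embedding's output along the feature axis, which is why a downstream layer (e.g. the Relu in LangIdentifierModelV2 above) expects nI == embed_dim * len(ns). A standalone thinc illustration:

import numpy as np
from thinc.api import Linear, concatenate

model = concatenate(Linear(nO=2, nI=3), Linear(nO=4, nI=3))
model.initialize(X=np.zeros((5, 3), dtype="f"))
Y = model.predict(np.zeros((5, 3), dtype="f"))
assert Y.shape == (5, 6)  # 2 + 4 features, concatenated along axis 1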
Example No. 23
def instance_init(model: Model,
                  X: List[Doc] = None,
                  Y: Floats2d = None) -> Model:
    tok2vec = model.get_ref("tok2vec")
    if X is not None:
        tok2vec.initialize(X)
    return model
Example No. 24
def build_bow_text_classifier(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    fill_defaults = {"b": 0, "W": 0}
    with Model.define_operators({">>": chain}):
        sparse_linear = SparseLinear(nO=nO)
        output_layer = None
        if not no_output_layer:
            fill_defaults["b"] = NEG_VALUE
            output_layer = softmax_activation() if exclusive_classes else Logistic()
        resizable_layer = resizable(
            sparse_linear,
            resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
        )
        model = extract_ngrams(ngram_size, attr=ORTH) >> resizable_layer
        model = with_cpu(model, model.ops)
        if output_layer:
            model = model >> with_cpu(output_layer, output_layer.ops)
    model.set_dim("nO", nO)
    model.set_ref("output_layer", sparse_linear)
    model.attrs["multi_label"] = not exclusive_classes
    model.attrs["resize_output"] = partial(
        resize_and_set_ref, resizable_layer=resizable_layer
    )
    return model
Example No. 25
def get_array_model():
    def _trim_array_forward(model, X, is_train):
        def backprop(dY):
            return model.ops.alloc2f(dY.shape[0], dY.shape[1] + 1)

        return X[:, :-1], backprop

    return with_array2d(Model("trimarray", _trim_array_forward))
Example No. 26
def create_relation_model(
    create_instance_tensor: Model[List[Doc], Floats2d],
    classification_layer: Model[Floats2d, Floats2d],
) -> Model[List[Doc], Floats2d]:
    with Model.define_operators({">>": chain}):
        model = create_instance_tensor >> classification_layer
        model.attrs["get_instances"] = create_instance_tensor.attrs["get_instances"]
    return model
Example No. 27
def trfs2arrays(
        pooling: Model[Ragged, Floats2d],
        grad_factor: float) -> Model[List[TransformerData], List[Floats2d]]:
    return Model(
        "trfs2arrays",
        forward,
        layers=[pooling],
        attrs={"grad_factor": grad_factor},
    )
Example No. 28
def test_plus_chain():
    with Model.define_operators({"+": lambda a, b: a}):
        m = (
            create_model(name="a")
            + create_model(name="b")
            + create_model(name="c")
            + create_model(name="d")
        )
        assert m.name == "a"
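Model.define_operators only rebinds the operator inside the with block, which is what keeps tests like this one from leaking state. A standalone sketch (assuming thinc is installed):

from thinc.api import Linear, Model, chain

with Model.define_operators({">>": chain}):
    inner = Linear(2, 2) >> Linear(2, 2)  # ">>" means chain inside the block
assert len(inner.layers) == 2

try:
    Linear(2, 2) >> Linear(2, 2)  # outside the block the operator is undefined
except TypeError:
    pass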
Example No. 29
def init(model: Model, X=None, Y=None):
    if model.attrs["has_transformer"]:
        return
    name = model.attrs["name"]
    tok_cfg = model.attrs["tokenizer_config"]
    tokenizer, transformer = huggingface_from_pretrained(name, tok_cfg)
    model.attrs["tokenizer"] = tokenizer
    model.attrs["set_transformer"](model, transformer)
    # Call the model with a batch of inputs to infer the width
    if X:
        texts = [x.text for x in X]
    else:
        texts = ["hello world", "foo bar"]
    token_data = huggingface_tokenize(model.attrs["tokenizer"], texts)
    model.layers[0].initialize(X=token_data)
    tensors = model.layers[0].predict(token_data)
    t_i = find_last_hidden(tensors)
    model.set_dim("nO", tensors[t_i].shape[-1])
Example No. 30
def get_ragged_model():
    def _trim_ragged_forward(model, Xr, is_train):
        def backprop(dYr):
            dY = dYr.data
            dX = model.ops.alloc2f(dY.shape[0], dY.shape[1] + 1)
            return Ragged(dX, dYr.lengths)

        return Ragged(Xr.data[:, :-1], Xr.lengths), backprop

    return with_ragged(Model("trimragged", _trim_ragged_forward))