Example #1
def multi_classifier(hidden,
                     targets,
                     n_targets,
                     config,
                     train=False,
                     reuse=None,
                     **kwargs):
    """
    A simple linear multi-label classifier (one sigmoid output per class).

    :param hidden: The output of the featurizer. [batch_size, embed_dim]
    :param targets: The placeholder representing the sparse targets [batch_size, n_targets]
    :param n_targets: A python int containing the number of classes that the model should be learning to predict over.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :param kwargs: Spare arguments.
    :return: dict containing:
        logits: The unnormalised log probabilities of each class.
        losses: The loss for the classifier.
    """
    with tf.variable_scope('model', reuse=reuse):
        hidden = dropout(hidden, config.clf_p_drop, train)
        clf_logits = perceptron(hidden, n_targets, config)
        if targets is None:
            clf_losses = None
        else:
            clf_losses = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=clf_logits, labels=tf.stop_gradient(targets))
            clf_losses = _apply_class_weight(clf_losses, targets,
                                             kwargs.get('class_weights'))
        return {'logits': clf_logits, 'losses': clf_losses}
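
A standalone sketch of the loss used above, with illustrative constants (not library code): each class is treated as an independent binary problem, so the loss keeps the per-class dimension instead of reducing over classes.

import tensorflow as tf

logits = tf.constant([[2.0, -1.0, 0.5]])    # [batch_size, n_targets]
targets = tf.constant([[1.0, 0.0, 1.0]])    # multi-hot labels, one flag per class
losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
# losses has shape [batch_size, n_targets]: one independent binary loss per class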
Example #2
def mlp(x, scope, n_state, *, hparams, train=False):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        h = gelu(conv1d(x, "c_fc", n_state))
        h2 = conv1d(h, "c_proj", nx)
        h2 = dropout(h2, hparams.resid_p_drop, train=train)
        return h2
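
For reference, the `gelu` activation called above is commonly implemented with the tanh approximation below. This is a sketch only; the library's own `gelu` is not shown here and is assumed to match.

import numpy as np

def gelu_approx(x):
    # tanh approximation of GELU, as popularised by GPT-2-style models (assumed implementation)
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3))))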
Example #3
def multi_choice_question(hidden,
                          targets,
                          n_targets,
                          config,
                          train=False,
                          reuse=None,
                          **kwargs):
    with tf.variable_scope("model", reuse=reuse):
        hidden = dropout(hidden, config.clf_p_drop, train)
        hidden = tf.unstack(hidden, num=n_targets, axis=1)
        hidden = tf.concat(hidden, axis=0)

        clf_out = perceptron(hidden, 1, config)
        clf_out = tf.split(clf_out, n_targets, axis=0)
        clf_out = tf.concat(clf_out, 1)

        if targets is None:
            clf_losses = None
        else:
            clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=clf_out, labels=tf.stop_gradient(targets))

            clf_losses = _apply_class_weight(clf_losses, targets,
                                             kwargs.get('class_weights'))

        return {'logits': clf_out, 'losses': clf_losses}
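
The unstack/split shuffling above scores every answer choice with one shared linear head. A minimal shape walk-through with plain TensorFlow and illustrative sizes; the constant projection below stands in for the library's `perceptron`.

import tensorflow as tf

batch, n_choices, embed = 2, 4, 8
hidden = tf.random.normal([batch, n_choices, embed])     # featurizer output per answer choice

stacked = tf.concat(tf.unstack(hidden, num=n_choices, axis=1), axis=0)   # [n_choices * batch, embed]
scores = tf.matmul(stacked, tf.ones([embed, 1]))                         # one scalar per (choice, example)
logits = tf.concat(tf.split(scores, n_choices, axis=0), axis=1)          # back to [batch, n_choices]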
Example #4
def regressor(hidden, targets, n_targets, config, train=False, reuse=None, **kwargs):
    """
    A simple linear regressor.

    :param hidden: The output of the featurizer. [batch_size, embed_dim]
    :param targets: The placeholder representing the regression targets. [batch_size]
    :param n_targets: A python int containing the number of outputs that the model should be learning to predict over.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :param kwargs: Spare arguments.
    :return: dict containing:
        logits: The regression outputs.
        losses: L2 Loss for the regression targets.
    """
    with tf.variable_scope("regressor", reuse=reuse):
        hidden = dropout(hidden, config.clf_p_drop, train)
        outputs = perceptron(hidden, n_targets, config)
        if targets is None:
            loss = None
        else:
            if config.regression_loss.upper() == "L2":
                loss = tf.nn.l2_loss(outputs - targets)
            elif config.regression_loss.upper() == "L1":
                loss = tf.abs(outputs - targets)
            else:
                raise FinetuneError(
                    "regression_loss needs to be either L1 or L2, instead it is {}".format(
                        config.regression_loss
                    )
                )
        return {"logits": outputs, "losses": loss}
Example #5
def block(X, block_name, use_fp16, pool_idx=None, encoder_state=None, train=False, pdrop=0.1, nominal_pool_length=512, use_fused_kernel=True):
    with tf.variable_scope(block_name):
        h1 = cumulative_state_net(X, "cumulative_state_net", use_fp16, pdrop, train, nominal_pool_length=nominal_pool_length, use_fused_kernel=use_fused_kernel)
        if encoder_state is not None:
            mixed = enc_dec_mix(encoder_state["sequence_features"], h1, encoder_state["pool_idx"], pool_idx)
            h1 = h1 + mixed
        output = tf.nn.relu(dropout(norm(h1 + X, "norm", fp16=use_fp16, e=1e-2), pdrop, train))
        return output
Example #6
def multihead_attn(q, k, v, attn_p_drop, train=False):
    # q, k, v have shape [batch, heads, sequence, features]
    w = tf.matmul(q, k, transpose_b=True)
    w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))

    w = mask_attn_weights(w)
    w = softmax(w)
    w = dropout(w, attn_p_drop, train)
    a = tf.matmul(w, v)
    return a
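
The same scaled dot-product attention, restated in plain NumPy without the masking and dropout steps (a sketch for clarity, not library code):

import numpy as np

def scaled_dot_product_attention(q, k, v):
    # q, k, v: [batch, heads, sequence, features]
    w = q @ np.swapaxes(k, -1, -2)                  # [batch, heads, seq, seq]
    w = w / np.sqrt(v.shape[-1])
    w = np.exp(w - w.max(axis=-1, keepdims=True))   # numerically stable softmax
    w = w / w.sum(axis=-1, keepdims=True)
    return w @ v                                    # [batch, heads, seq, features]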
Example #7
def cumulative_state_net(X, name, use_fp16, pdrop, train, pool_kernel_size=2, nominal_pool_length=512, use_fused_kernel=True):
    conv_kernel = 4
    pool_kernel_size = pool_kernel_size or conv_kernel

    nx = shape_list(X)[-1]
    with tf.variable_scope(name):
        output = tf.nn.relu(normal_1d_conv_block(X, conv_kernel, "1-" + str(conv_kernel), use_fp16, output_dim=nx))
        output = tf.nn.relu(normal_1d_conv_block(output, conv_kernel, "2-" + str(conv_kernel), use_fp16, output_dim=nx))
        output = normal_1d_conv_block(output, conv_kernel, "3-" + str(conv_kernel), use_fp16, output_dim=nx)

    output = dropout(output, pdrop, train)
    aggregated = cascaded_pool(output, kernel_size=pool_kernel_size, pool_len=nominal_pool_length, use_fused_kernel=use_fused_kernel)

    return normal_1d_conv_block(aggregated, 1, "output_reproject", use_fp16, output_dim=nx)
Example #8
def attn(x, scope, n_state, *, past, hparams, train=False):
    assert x.shape.ndims == 3  # Should be [batch, sequence, features]
    assert n_state % hparams.n_heads == 0
    if past is not None:
        assert (
            past.shape.ndims == 5
        )  # Should be [batch, 2, heads, sequence, features], where 2 is [k, v]

    def split_heads(x):
        # From [batch, sequence, features] to [batch, heads, sequence, features]
        return tf.transpose(split_states(x, hparams.n_heads), [0, 2, 1, 3])

    def merge_heads(x):
        # Reverse of split_heads
        return merge_states(tf.transpose(x, [0, 2, 1, 3]))

    def mask_attn_weights(w):
        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(w)
        b = attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        w = w * b - tf.cast(1e10, w.dtype) * (1 - b)
        return w

    def multihead_attn(q, k, v, attn_p_drop, train=False):
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))

        w = mask_attn_weights(w)
        w = softmax(w)
        w = dropout(w, attn_p_drop, train)
        a = tf.matmul(w, v)
        return a

    with tf.variable_scope(scope):
        c = conv1d(x, "c_attn", n_state * 3)
        q, k, v = map(split_heads, tf.split(c, 3, axis=2))
        if past is not None:
            pk, pv = tf.unstack(past, axis=1)
            k = tf.concat([pk, k], axis=-2)
            v = tf.concat([pv, v], axis=-2)
        a = multihead_attn(q, k, v, hparams.attn_p_drop, train=train)
        a = merge_heads(a)
        a = conv1d(a, "c_proj", n_state)
        a = dropout(a, hparams.resid_p_drop, train=train)
        return a
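
The `attention_mask(nd, ns)` helper used inside `mask_attn_weights` is not shown above; it is assumed to build the standard causal mask of GPT-2-style attention. A NumPy equivalent for reference, under that assumption:

import numpy as np

def causal_mask(nd, ns):
    # destination position i may attend to source position j when j <= i + ns - nd,
    # i.e. the lower triangle counted from the bottom-right corner
    i = np.arange(nd)[:, None]
    j = np.arange(ns)
    return (j <= i + ns - nd).astype(np.float32)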
Example #9
def ordinal_regressor(
    hidden,
    targets,
    n_targets,
    config,
    shared_threshold_weights=True,
    train=False,
    reuse=None,
    **kwargs
):
    """
    Ordinal Regressor using all-threshold loss.

    :param hidden: The output of the featurizer. [batch_size, embed_dim]
    :param targets: The placeholder representing the regression targets (binary threshold values). [batch_size]
    :param n_targets: A python int containing the number of thresholds that the model should be learning to predict over.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :param kwargs: Spare arguments.
    :return: dict containing:
        logits: The regression outputs.
        losses: All-threshold Loss for the regression targets.
    """
    with tf.variable_scope("ordinalregressor", reuse=reuse):
        hidden = dropout(hidden, config.clf_p_drop, train)
        if shared_threshold_weights:
            w_init = tf.random_normal_initializer(stddev=config.weight_stddev)
            b_init = tf.random_normal_initializer(0)
            nx = config.n_embed
            w = tf.get_variable("w", [nx, 1], initializer=w_init)
            b = tf.get_variable("b", [n_targets], initializer=b_init)
            logits = tf.matmul(hidden, w) + b
        else:
            logits = perceptron(hidden, n_targets, config)

        if targets is None:
            outputs = tf.sigmoid(logits)
            loss = None
        else:
            outputs = logits
            loss = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits, labels=tf.stop_gradient(targets)
            )
        return {"logits": outputs, "losses": loss}
Example #10
def classifier(hidden,
               targets,
               n_targets,
               config,
               train=False,
               reuse=None,
               **kwargs):
    """
    A simple linear classifier.

    :param hidden: The output of the featurizer. [batch_size, embed_dim]
    :param targets: One hot encoded target ids. [batch_size, n_classes]
    :param n_targets: A python int containing the number of classes that the model should be learning to predict over.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :param kwargs: Spare arguments.
    :return: dict containing:
        logits: The unnormalised log probabilities of each class.
        losses: The loss for the classifier.
    """
    with tf.variable_scope("classifier", reuse=reuse):
        hidden = dropout(hidden, config.clf_p_drop, train)
        clf_logits = perceptron(hidden, n_targets, config)
        if targets is None:
            clf_losses = None
        else:
            clf_losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=clf_logits, labels=tf.stop_gradient(targets))

            clf_losses = _apply_class_weight(clf_losses, targets,
                                             kwargs.get("class_weights"))

            # From Unsupervised Data Augmentation for Consistency Training, Xie et al. 2019
            if config.tsa_schedule:
                clf_logits, clf_losses = tsa_loss(n_targets, config,
                                                  clf_losses, clf_logits,
                                                  targets)

        return {"logits": clf_logits, "losses": clf_losses}
Example #11
def gpt2_featurizer(
    X,
    encoder,
    config,
    train=False,
    reuse=None,
    **kwargs
):
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    X.set_shape([None, None, None])

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])
        h = embed(X, embed_weights)

        # Transformer
        pasts = [None] * config.n_layer
        for layer, past in enumerate(pasts):
            if (
                (config.n_layer - layer) == config.num_layers_trained
                and config.num_layers_trained != config.n_layer
                and config.adapter_size is None
            ):
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope("h%d" % layer):
                block_fn = functools.partial(
                    block, past=past, hparams=config, train=train
                )
                if config.low_memory_mode and train_layer:
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                h = block_fn(h)

        h = norm(h, "ln_f")

        # Use hidden state at classifier token as input to final proj. + softmax
        clf_h = tf.reshape(h, [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_token = encoder["_classify_"]
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32
        )
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * config.max_length + pool_idx,
        )
        clf_h = tf.reshape(
            clf_h, shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0)
        )
        seq_feats = tf.reshape(
            h, shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0)
        )

        lengths = lengths_from_eos_idx(eos_idx=pool_idx, max_length=shape_list(X)[0])

        return {
            "embed_weights": embed_weights,
            "features": clf_h,
            "sequence_features": seq_feats,
            "eos_idx": pool_idx,
            "lengths": lengths
        }
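
The classify-token extraction above flattens the batch and sequence axes and gathers one row per example at offset batch_index * max_length + pool_idx. The same trick in isolation, with toy shapes:

import tensorflow as tf

batch, seq_len, embed = 2, 4, 3
h = tf.reshape(tf.range(batch * seq_len * embed, dtype=tf.float32), [batch, seq_len, embed])
pool_idx = tf.constant([1, 3])                                   # classify-token position per example
flat = tf.reshape(h, [-1, embed])                                # [batch * seq_len, embed]
clf_h = tf.gather(flat, tf.range(batch) * seq_len + pool_idx)    # [batch, embed]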
Example #12
def featurizer(X, encoder, config, train=False, reuse=None, encoder_state=None, context=None, context_dim=None, **kwargs):
    """
    The main element of the OSCAR model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = [a or -1 for a in X.get_shape().as_list()]
    if len(initial_shape) != 3:
        X = tf.reshape(X, shape=[-1] + initial_shape[-2:])

    x_shape = tf.shape(X)
    with tf.variable_scope('model/featurizer', reuse=reuse):
        encoder._lazy_init()
        clf_token = encoder.end_token
        pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)
        if encoder_state is None:
            embed_weights = tf.get_variable("we", [encoder.vocab_size + config.max_length, config.n_embed],
                                            initializer=tf.random_normal_initializer(stddev=config.weight_stddev))
        else:
            embed_weights = encoder_state["embed_weights"]

        if config.oscar_use_fp16:
            embed_weights = tf.cast(embed_weights, tf.float16)

        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, x_shape[1], 2])

        if config.oscar_use_timing:
            h = embed(X, embed_weights)
        else:
            h = embed_no_timing(X, embed_weights)

        for layer in range(config.n_layer):
            with tf.variable_scope('h%d_' % layer):
                if (
                        (config.n_layer - layer) == config.num_layers_trained and
                        config.num_layers_trained != config.n_layer
                ):
                    h = tf.stop_gradient(h)

                block_fn_fwd = functools.partial(
                    block, block_name='block%d_' % layer, use_fp16=config.oscar_use_fp16,
                    pool_idx=None, encoder_state=encoder_state, train=train,
                    pdrop=config.resid_p_drop, use_fused_kernel=config.oscar_use_fused_kernel,
                )

                if config.low_memory_mode and train:
                    block_fn_fwd = recompute_grad(block_fn_fwd, use_entire_scope=True)
                h = block_fn_fwd(h)

        h = normal_1d_conv_block(h, 1, "output", config.oscar_use_fp16, dilation=1)

        mask = tf.expand_dims(tf.sequence_mask(pool_idx, maxlen=tf.shape(h)[1], dtype=h.dtype), -1)

        if config.oscar_feat_mode == "clf_tok":
            clf_h = tf.gather_nd(h, tf.stack([tf.range(shape_list(h)[0]), pool_idx], 1))
        elif config.oscar_feat_mode == "mean_tok":
            clf_h = tf.reduce_sum(h * mask, 1) / tf.reduce_sum(h)
        elif config.oscar_feat_mode == "max_tok":
            clf_h = tf.reduce_max(h - (1e5 * (1.0 - mask)), 1)
        else:
            raise ValueError("config.feat_mode should be one of clf_tok, mean_tok or max_tok")

        if len(initial_shape) != 3:
            seq_feats = tf.reshape(h, shape=initial_shape[:-1] + [config.n_embed])
        else:
            seq_feats = h

        return {
            'embed_weights': embed_weights,
            'features': cast_maybe(clf_h, tf.float32),
            'sequence_features': seq_feats,
            'eos_idx': pool_idx,
            'encoded_input': X[:, :tf.reduce_min(pool_idx), 0],
            'lengths': lengths_from_eos_idx(eos_idx=pool_idx, max_length=shape_list(X)[0])
        }
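
The `mean_tok` branch is a masked mean: sum the masked features, then divide by the number of unmasked tokens. A tiny numeric check with illustrative tensors only:

import tensorflow as tf

h = tf.constant([[[2.0], [4.0], [9.0]]])    # [batch, seq_len, features]
pool_idx = tf.constant([2])                 # only the first two positions count
mask = tf.expand_dims(tf.sequence_mask(pool_idx, maxlen=3, dtype=tf.float32), -1)
mean_tok = tf.reduce_sum(h * mask, 1) / tf.reduce_sum(mask, 1)   # -> [[3.0]]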
Example #13
def tcn_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The featurizer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[
                encoder.vocab_size + config.max_length,
                config.n_embed_featurizer
            ],
            initializer=tf.random_normal_initializer(
                stddev=config.weight_stddev),
        )

        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])

        # we remove positional embeddings from the model
        h = embed(X[:, :, :1], embed_weights)

        # keep track of the classify token
        clf_token = encoder["_classify_"]

        with tf.variable_scope("tcn_stack"):
            representation = h
            for layer_num in range(config.n_layer):
                representation = TemporalBlock(
                    n_filters=config.n_filter,
                    kernel_size=config.kernel_size,
                    rate=config.resid_p_drop if train else 0,
                    dilation_rate=2**layer_num,
                    scope="Temporal{}".format(layer_num),
                )(representation)

        seq_feats = tf.reshape(representation,
                               shape=[-1, config.max_length, config.n_filter])

        # mask out the values past the classify token before performing pooling
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32,
        )

        # mask is past the classify token (i.e. make those results extremely negative)
        mask = tf.expand_dims(
            1.0 - tf.sequence_mask(pool_idx,
                                   maxlen=tf.shape(representation)[1],
                                   dtype=tf.float32),
            -1,
        )
        pool = tf.reduce_max(representation + mask * -1e9, 1)
        clf_h = pool
        clf_h = tf.reshape(clf_h,
                           shape=tf.concat(
                               (initial_shape[:-2], [config.n_filter]), 0))

        # note that, due to convolution and pooling, the dimensionality of the features is much smaller than in the
        # transformer base models

        lengths = lengths_from_eos_idx(eos_idx=pool_idx,
                                       max_length=config.max_length)
        return {
            "embed_weights": embed_weights,
            "features":
            clf_h,  # [batch_size, n_embed] for classify, [batch_size, 1, n_embed] for comparison, etc.
            "sequence_features": seq_feats,  # [batch_size, seq_len, n_embed]
            "eos_idx": pool_idx,  # [batch_size]
            "lengths": lengths
        }
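
The pooling step above pushes every position at or after the classify token to a very large negative value, so `reduce_max` only sees the real tokens. In isolation, with toy values:

import tensorflow as tf

representation = tf.constant([[[1.0], [5.0], [3.0]]])   # [batch, seq_len, features]
pool_idx = tf.constant([2])                             # classify token at position 2
mask = tf.expand_dims(1.0 - tf.sequence_mask(pool_idx, maxlen=3, dtype=tf.float32), -1)
pool = tf.reduce_max(representation + mask * -1e9, 1)   # -> [[5.0]]; position 2 is ignored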
Example #14
def textcnn_featurizer(X, encoder, config, train=False, reuse=None):
    """
    The TextCNN featurizer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))

    with tf.variable_scope('model/featurizer', reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[
                encoder.vocab_size + config.max_length,
                config.n_embed_featurizer
            ],
            initializer=tf.random_normal_initializer(
                stddev=config.weight_stddev))
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])

        # we remove positional embeddings from the model
        h = embed(X[:, :, :1], embed_weights)

        # keep track of the classify token
        clf_token = encoder['_classify_']
        # Convolutional Layer (this is all the same layer, just different filter sizes)
        pool_layers = []
        conv_layers = []
        for i, kernel_size in enumerate(config.kernel_sizes):
            conv = tf.layers.conv1d(
                inputs=h,
                filters=config.num_filters_per_size,
                kernel_size=kernel_size,
                padding='same',
                activation=tf.nn.relu,
                name='conv' + str(i),
                kernel_initializer=tf.initializers.glorot_normal)
            conv_layers.append(conv)
            # mask out the values past the classify token before performing pooling
            pool_idx = tf.cast(
                tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32),
                          1), tf.int32)
            # mask is past the classify token (i.e. make those results extremely negative)
            mask = tf.expand_dims(
                1.0 - tf.sequence_mask(
                    pool_idx, maxlen=tf.shape(conv)[1], dtype=tf.float32), -1)
            pool = tf.reduce_max(conv + mask * -1e9, 1)
            pool_layers.append(pool)

        # Concat the output of the convolutional layers for use in sequence embedding
        conv_seq = tf.concat(conv_layers, axis=2)
        seq_feats = tf.reshape(conv_seq,
                               shape=[-1, config.max_length, config.n_embed])
        # Concatenate the univariate vectors as features for classification
        clf_h = tf.concat(pool_layers, axis=1)
        clf_h = tf.reshape(clf_h,
                           shape=tf.concat(
                               (initial_shape[:-2], [config.n_embed]), 0))

        # note that, due to convolution and pooling, the dimensionality of the features is much smaller than in the
        # transformer base models
        return {
            'embed_weights': embed_weights,
            'features':
            clf_h,  # [batch_size, n_embed] for classify, [batch_size, 1, n_embed] for comparison, etc.
            'sequence_features': seq_feats,  # [batch_size, seq_len, n_embed]
            'pool_idx': pool_idx  # [batch_size]
        }
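
Each kernel size contributes `num_filters_per_size` pooled features, so the reshapes above only work when `config.n_embed` equals `len(kernel_sizes) * num_filters_per_size`. A dimensional check in plain Python with illustrative values:

kernel_sizes = [2, 3, 4]          # illustrative config values
num_filters_per_size = 100
n_embed = len(kernel_sizes) * num_filters_per_size   # 300 features after the concat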