    def _attention_scores(self, query, key, mask=None):
        """Calculates attention scores as a query-key dot product.

    Args:
      query: Query tensor of shape `[batch_size, sequence_length, Tq]`.
      key: Key tensor of shape `[batch_size, sequence_length, Tv]`.
      mask: mask tensor of shape `[batch_size, sequence_length]`.

    Returns:
      Tensor of shape `[batch_size, sequence_length, sequence_length]`.
    """
        scores = tf.linalg.matmul(query, key, transpose_b=True)

        if mask is not None:
            mask = layers.SelfAttentionMask()(scores, mask)
            # Prevent pointing to self (zeros down the diagonal).
            diagonal_mask = tf.linalg.diag(
                tf.zeros((tf.shape(mask)[0], self._seq_length)),
                padding_value=1)
            diagonal_mask = tf.cast(diagonal_mask, tf.float32)
            mask = tf.math.multiply(diagonal_mask, mask)
            # Since this is applied before the softmax (exp), push masked
            # values to a large negative number.
            mask_add = -1e9 * (1. - mask)
            scores = scores * mask + mask_add

        return scores
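
A minimal standalone sketch of the masking logic above, using plain TensorFlow on dummy tensors; the batch size, sequence length, and all-ones mask are illustrative assumptions:

import tensorflow as tf

batch_size, seq_length = 2, 8
scores = tf.random.normal((batch_size, seq_length, seq_length))
# A broadcastable [batch, seq, seq] attention mask; all-ones here for brevity.
mask = tf.ones((batch_size, seq_length, seq_length))

# Prevent pointing to self: zeros on the diagonal, ones elsewhere.
diagonal_mask = tf.cast(
    tf.linalg.diag(tf.zeros((batch_size, seq_length)), padding_value=1),
    tf.float32)
mask = mask * diagonal_mask

# Pre-softmax masking: push masked logits to a large negative value.
masked_scores = scores * mask + -1e9 * (1. - mask)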
Example #2

    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            embedding_layer=None,
            **kwargs):
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            embedding_layer_inst = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            embedding_layer_inst = embedding_layer
        word_embeddings = embedding_layer_inst(word_ids)

        # Always uses dynamic slicing for simplicity.
        position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = position_embedding_layer(word_embeddings)
        type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])

        embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        embeddings = embedding_norm_layer(embeddings)
        embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = embedding_projection(embeddings)
        else:
            embedding_projection = None

        transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        last_encoder_output = encoder_outputs[-1]
        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = last_encoder_output[:, 0, :]
        pooler_layer = tf.keras.layers.Dense(units=hidden_size,
                                             activation='tanh',
                                             kernel_initializer=initializer,
                                             name='pooler_transform')
        cls_output = pooler_layer(first_token_tensor)

        outputs = dict(
            sequence_output=encoder_outputs[-1],
            pooled_output=cls_output,
            encoder_outputs=encoder_outputs,
        )

        # Once we've created the network using the Functional API, we call
        # super().__init__ as though we were invoking the Functional API Model
        # constructor, resulting in this object having all the properties of a model
        # created using the Functional API. Once super().__init__ is called, we
        # can assign attributes to `self` - note that all `self` assignments are
        # below this line.
        super(BertEncoder, self).__init__(inputs=[word_ids, mask, type_ids],
                                          outputs=outputs,
                                          **kwargs)

        config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
        }

        # We are storing the config dict as a namedtuple here to ensure checkpoint
        # compatibility with an earlier version of this model which did not track
        # the config dict attribute. TF does not track immutable attrs which
        # do not contain Trackables, so by creating a config namedtuple instead of
        # a dict we avoid tracking it.
        config_cls = collections.namedtuple('Config', config_dict.keys())
        self._config = config_cls(**config_dict)
        self._pooler_layer = pooler_layer
        self._transformer_layers = transformer_layers
        self._embedding_norm_layer = embedding_norm_layer
        self._embedding_layer = embedding_layer_inst
        self._position_embedding_layer = position_embedding_layer
        self._type_embedding_layer = type_embedding_layer
        if embedding_projection is not None:
            self._embedding_projection = embedding_projection
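
A hedged construction-and-call sketch for the encoder defined above, assuming the constructor is exposed as `BertEncoder`; the hyperparameter values and dummy inputs are illustrative only:

import numpy as np

encoder = BertEncoder(vocab_size=30522,
                      hidden_size=128,
                      num_layers=2,
                      num_attention_heads=2,
                      inner_dim=512)

word_ids = np.zeros((1, 16), dtype=np.int32)
input_mask = np.ones((1, 16), dtype=np.int32)
type_ids = np.zeros((1, 16), dtype=np.int32)

# The functional model returns a dict with 'sequence_output',
# 'pooled_output', and 'encoder_outputs'.
outputs = encoder([word_ids, input_mask, type_ids])

Example #3
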
    def __init__(self,
                 network,
                 bert_config,
                 initializer='glorot_uniform',
                 seq_length=128,
                 use_pointing=True,
                 is_training=True):
        """Creates Felix Tagger.

    Setting up all of the layers needed for call.

    Args:
      network: An encoder network, which should output a sequence of hidden
               states.
      bert_config: A config file which in addition to the  BertConfig values
      also includes: num_classes, hidden_dropout_prob, and query_transformer.
      initializer: The initializer (if any) to use in the classification
                   networks. Defaults to a Glorot uniform initializer.
      seq_length:  Maximum sequence length.
      use_pointing: Whether a pointing network is used.
      is_training: The model is being trained.
    """

        super(FelixTagger, self).__init__()
        self._network = network
        self._seq_length = seq_length
        self._bert_config = bert_config
        self._use_pointing = use_pointing
        self._is_training = is_training

        self._tag_logits_layer = tf.keras.layers.Dense(
            self._bert_config.num_classes)
        if not self._use_pointing:
            return

        # An arbitrary heuristic (sqrt vocab size) for the tag embedding dimension.
        self._tag_embedding_layer = tf.keras.layers.Embedding(
            self._bert_config.num_classes,
            int(math.ceil(math.sqrt(self._bert_config.num_classes))),
            input_length=seq_length)

        self._position_embedding_layer = layers.PositionEmbedding(
            max_length=seq_length)
        self._edit_tagged_sequence_output_layer = tf.keras.layers.Dense(
            self._bert_config.hidden_size, activation=activations.gelu)

        if self._bert_config.query_transformer:
            self._self_attention_mask_layer = layers.SelfAttentionMask()
            self._transformer_query_layer = layers.TransformerEncoderBlock(
                num_attention_heads=self._bert_config.num_attention_heads,
                inner_dim=self._bert_config.intermediate_size,
                inner_activation=activations.gelu,
                output_dropout=self._bert_config.hidden_dropout_prob,
                attention_dropout=self._bert_config.hidden_dropout_prob,
                output_range=seq_length,
            )

        self._query_embeddings_layer = tf.keras.layers.Dense(
            self._bert_config.query_size)

        self._key_embeddings_layer = tf.keras.layers.Dense(
            self._bert_config.query_size)
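
A hedged construction sketch for the tagger above with pointing disabled, in which case only `num_classes` is read from the config; the namespace-based config and the `encoder` network (for example, the one built in the earlier sketch) are assumptions for illustration:

import types

tagger_config = types.SimpleNamespace(num_classes=20)
tagger = FelixTagger(network=encoder,  # any encoder producing hidden states
                     bert_config=tagger_config,
                     seq_length=64,
                     use_pointing=False)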
Example #4

    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            **kwargs):
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
        }

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        self._embedding_layer = self._build_embedding_layer()
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = self._position_embedding_layer(word_embeddings)
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = self._type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        embeddings = self._embedding_norm_layer(embeddings)
        embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = self._embedding_projection(embeddings)

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        last_encoder_output = encoder_outputs[-1]
        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = last_encoder_output[:, 0, :]
        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')
        cls_output = self._pooler_layer(first_token_tensor)

        outputs = dict(
            sequence_output=encoder_outputs[-1],
            pooled_output=cls_output,
            encoder_outputs=encoder_outputs,
        )
        super(BertEncoder, self).__init__(inputs=[word_ids, mask, type_ids],
                                          outputs=outputs,
                                          **kwargs)
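
A hedged sketch of the factorized-embedding branch above (`embedding_width != hidden_size`), which routes the embeddings through the EinsumDense projection before the transformer stack; it assumes the full class, including the `_build_embedding_layer` helper referenced but not shown here, is available as `BertEncoder`, and the values are illustrative only:

small_embedding_encoder = BertEncoder(vocab_size=30522,
                                      hidden_size=256,
                                      num_layers=2,
                                      num_attention_heads=4,
                                      inner_dim=1024,
                                      embedding_width=64)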