# Assumed imports and test-class scaffolding for this snippet; the names and
# import paths are inferred and may differ from the original test file.
import tensorflow as tf

from absl.testing import parameterized
from official.projects.roformer import roformer_encoder_block


class RoformerEncoderBlockTest(tf.test.TestCase, parameterized.TestCase):

  def test_raises(self):
    num_attention_heads = 2
   with self.assertRaisesRegex(ValueError, 'The inner_dim of.*'):
     _ = roformer_encoder_block.RoformerEncoderBlock(
         num_attention_heads=num_attention_heads,
         inner_dim=31,
         inner_activation='relu',
         output_dropout=0.1,
         attention_dropout=0.1,
         use_bias=False,
         norm_first=True,
         norm_epsilon=1e-6,
         inner_dropout=0.1,
         attention_initializer=tf.keras.initializers.RandomUniform(
             minval=0., maxval=1.))

  def test_get_config(self):
   num_attention_heads = 2
   encoder_block = roformer_encoder_block.RoformerEncoderBlock(
       num_attention_heads=num_attention_heads,
       inner_dim=32,
       inner_activation='relu',
       output_dropout=0.1,
       attention_dropout=0.1,
       use_bias=False,
       norm_first=True,
       norm_epsilon=1e-6,
       inner_dropout=0.1,
       attention_initializer=tf.keras.initializers.RandomUniform(
           minval=0., maxval=1.))
   encoder_block_config = encoder_block.get_config()
   new_encoder_block = roformer_encoder_block.RoformerEncoderBlock.from_config(
       encoder_block_config)
   self.assertEqual(encoder_block_config, new_encoder_block.get_config())
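
  # Parameterization assumed for illustration (not from the original file); the
  # real test supplies `attention_axes` via absl's parameterized decorator and
  # its cases may differ.
  @parameterized.named_parameters(('default_axes', None),
                                  ('explicit_seq_axis', (1,)))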
 def test_several_attention_axes(self, attention_axes):
   test_layer = roformer_encoder_block.RoformerEncoderBlock(
       inner_dim=32,
       inner_activation='relu',
       output_dropout=0.1,
       attention_dropout=0.1,
       use_bias=False,
       norm_first=True,
       norm_epsilon=1e-6,
       inner_dropout=0.1,
       num_attention_heads=10,
       attention_axes=attention_axes)
   seq_len = 21
   dimensions = 80
   # Create a 3-dimensional input (the first dimension is implicit).
   data_tensor = tf.keras.Input(shape=(seq_len, dimensions))
   output_tensor = test_layer(data_tensor)
    # The default output shape of a transformer layer should match the input shape.
   self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())

  def test_use_bias_norm_first(self):
   num_attention_heads = 2
   hidden_size = 16
   encoder_block = roformer_encoder_block.RoformerEncoderBlock(
       num_attention_heads=num_attention_heads,
       inner_dim=32,
       inner_activation='relu',
       output_dropout=0.1,
       attention_dropout=0.1,
       use_bias=False,
       norm_first=True,
       norm_epsilon=1e-6,
       inner_dropout=0.1,
       attention_initializer=tf.keras.initializers.RandomUniform(
           minval=0., maxval=1.))
   # Forward path.
   dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
   dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
   inputs = [dummy_tensor, dummy_mask]
   output = encoder_block(inputs)
   self.assertEqual(output.shape, (2, 4, hidden_size))

Example no. 5

# Assumed imports and enclosing class for this example; the import paths are
# inferred from the TF Model Garden layout and may differ.
import collections

import tensorflow as tf

from official.nlp.modeling import layers
from official.projects.roformer import roformer_encoder_block


class RoformerEncoder(tf.keras.Model):
    """Roformer encoder network built with the Keras functional API."""

    def __init__(
            self,
            vocab_size,
            hidden_size=768,  # FIXME: hidden_size per head should be even!
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            embedding_layer=None,
            norm_first=False,
            **kwargs):
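        # Accept legacy argument names (from the earlier BERT-style encoder
        # signature) and map them onto the current parameter names.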
        if 'intermediate_size' in kwargs:
            inner_dim = kwargs['intermediate_size']
            del kwargs['intermediate_size']
        if 'activation' in kwargs:
            inner_activation = kwargs['activation']
            del kwargs['activation']
        if 'dropout_rate' in kwargs:
            output_dropout = kwargs['dropout_rate']
            del kwargs['dropout_rate']
        if 'attention_dropout_rate' in kwargs:
            attention_dropout = kwargs['attention_dropout_rate']
            del kwargs['attention_dropout_rate']

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            embedding_layer_inst = layers.on_device_embedding.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            embedding_layer_inst = embedding_layer
        word_embeddings = embedding_layer_inst(word_ids)

        # Roformer does not need a position embedding layer
        type_embedding_layer = layers.on_device_embedding.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = type_embedding_layer(type_ids)

        # Roformer does not have absolute position embedding
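        # Position information is injected later by the rotary embeddings inside
        # each RoformerEncoderBlock (queries and keys are rotated by position-
        # dependent sin/cos factors), so only word and type embeddings are summed.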
        embeddings = tf.keras.layers.Add()([word_embeddings, type_embeddings])

        embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        embeddings = embedding_norm_layer(embeddings)
        embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = embedding_projection(embeddings)
        else:
            embedding_projection = None

        transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = roformer_encoder_block.RoformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                q_max_sequence_length=max_sequence_length,
                kv_max_sequence_length=max_sequence_length,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='roformer/layer_%d' % i)
            transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        last_encoder_output = encoder_outputs[-1]
        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = last_encoder_output[:, 0, :]
        pooler_layer = tf.keras.layers.Dense(units=hidden_size,
                                             activation='tanh',
                                             kernel_initializer=initializer,
                                             name='pooler_transform')
        cls_output = pooler_layer(first_token_tensor)

        outputs = dict(
            sequence_output=encoder_outputs[-1],
            pooled_output=cls_output,
            encoder_outputs=encoder_outputs,
        )

        # Once we've created the network using the Functional API, we call
        # super().__init__ as though we were invoking the Functional API Model
        # constructor, resulting in this object having all the properties of a model
        # created using the Functional API. Once super().__init__ is called, we
        # can assign attributes to `self` - note that all `self` assignments are
        # below this line.
        super(RoformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=outputs,
                             **kwargs)

        config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
        }

        # We are storing the config dict as a namedtuple here to ensure checkpoint
        # compatibility with an earlier version of this model which did not track
        # the config dict attribute. TF does not track immutable attrs which
        # do not contain Trackables, so by creating a config namedtuple instead of
        # a dict we avoid tracking it.
        config_cls = collections.namedtuple('Config', config_dict.keys())
        self._config = config_cls(**config_dict)
        self._pooler_layer = pooler_layer
        self._transformer_layers = transformer_layers
        self._embedding_norm_layer = embedding_norm_layer
        self._embedding_layer = embedding_layer_inst
        # Roformer has no absolute position embedding layer; the attribute is
        # kept for interface compatibility and set to None.
        self._position_embedding_layer = None
        self._type_embedding_layer = type_embedding_layer
        if embedding_projection is not None:
            self._embedding_projection = embedding_projection
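
A minimal usage sketch, not part of the original example set: assuming the
imports and the RoformerEncoder class above, it builds a tiny encoder and runs
it on dummy inputs. All names and sizes below are illustrative only.

def _smoke_test_roformer_encoder():
    """Hypothetical helper: instantiates the encoder and checks output shapes."""
    # Keep the sequence length equal to max_sequence_length so the rotary
    # sin/cos tables line up with the inputs regardless of slicing behaviour.
    batch_size, seq_len = 2, 8
    encoder = RoformerEncoder(
        vocab_size=100,
        hidden_size=16,  # 16 / 2 heads = 8 per head, which is even as required
        num_layers=2,
        num_attention_heads=2,
        max_sequence_length=seq_len,
        type_vocab_size=2,
        inner_dim=32)
    word_ids = tf.zeros([batch_size, seq_len], dtype=tf.int32)
    mask = tf.ones([batch_size, seq_len], dtype=tf.int32)
    type_ids = tf.zeros([batch_size, seq_len], dtype=tf.int32)
    outputs = encoder([word_ids, mask, type_ids])
    # sequence_output: [batch, seq_len, hidden]; pooled_output: [batch, hidden].
    assert outputs['sequence_output'].shape == (batch_size, seq_len, 16)
    assert outputs['pooled_output'].shape == (batch_size, 16)


if __name__ == '__main__':
    _smoke_test_roformer_encoder()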