Example 1
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        embedding_width = 768
        dropout_rate = 0.1
        initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=30522,
            embedding_width=embedding_width,
            initializer=initializer,
            name="word_embeddings",
        )

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=512,
            name="position_embedding",
        )
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=2,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name="type_embeddings",
        )
        self._add = tf.keras.layers.Add()
        self._layer_norm = tf.keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)
        self._dropout = tf.keras.layers.Dropout(rate=dropout_rate)

        self._attention_mask = layers.SelfAttentionMask()
        self._transformer_layers = []
        for i in range(12):
            layer = layers.Transformer(
                num_attention_heads=12,
                intermediate_size=3072,
                intermediate_activation=activations.gelu,
                dropout_rate=dropout_rate,
                attention_dropout_rate=0.1,
                output_range=None,
                kernel_initializer=initializer,
                name="transformer/layer_%d" % i,
            )
            self._transformer_layers.append(layer)

        # Slices out the first ([CLS]) token before the pooler dense layer.
        self._lambda = tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))
        self._pooler_layer = tf.keras.layers.Dense(
            units=embedding_width,
            activation="tanh",
            kernel_initializer=initializer,
            name="pooler_transform",
        )
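The constructor above only instantiates the sub-layers; wiring them together happens in call(). A minimal sketch of such a method, assuming the attribute names defined above and an input triple of (word_ids, mask, type_ids), is shown below; it is an illustration, not the original implementation.

    def call(self, inputs):
        word_ids, mask, type_ids = inputs
        word_embeddings = self._embedding_layer(word_ids)
        position_embeddings = self._position_embedding_layer(word_embeddings)
        type_embeddings = self._type_embedding_layer(type_ids)

        # Sum the three embeddings, then normalize and apply dropout.
        embeddings = self._add(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = self._layer_norm(embeddings)
        embeddings = self._dropout(embeddings)

        # Build the self-attention mask and run the transformer stack.
        attention_mask = self._attention_mask([embeddings, mask])
        data = embeddings
        for layer in self._transformer_layers:
            data = layer([data, attention_mask])

        # Pool the first ([CLS]) token into a fixed-size vector.
        first_token = self._lambda(data)
        cls_output = self._pooler_layer(first_token)
        return data, cls_output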
Example 2
 def build(self, input_shape):
   """Implements build() for the layer."""
   self.encoder_layers = []
   for i in range(self._num_layers):
     self.encoder_layers.append(
         layers.Transformer(
             num_attention_heads=self._num_attention_heads,
             intermediate_size=self._intermediate_size,
             intermediate_activation=self._activation,
             dropout_rate=self._dropout_rate,
             attention_dropout_rate=self._attention_dropout_rate,
             use_bias=self._use_bias,
             norm_first=self._norm_first,
             norm_epsilon=self._norm_epsilon,
             intermediate_dropout=self._intermediate_dropout,
             attention_initializer=attention_initializer(input_shape[2]),
             name=("layer_%d" % i)))
   self.output_normalization = tf.keras.layers.LayerNormalization(
       epsilon=self._norm_epsilon, dtype="float32")
   super(TransformerEncoder, self).build(input_shape)
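build() only creates the layer stack; a matching forward pass might look like the following sketch. The method signature and the unconditional final normalization are assumptions for illustration, not the original implementation.

 def call(self, encoder_inputs, attention_mask=None):
   """Sketch of a forward pass over the stacked encoder layers."""
   x = encoder_inputs
   for encoder_layer in self.encoder_layers:
     x = encoder_layer([x, attention_mask])
   # Apply the final layer normalization created in build().
   return self.output_normalization(x)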
Example 3
    def __init__(self,
                 encoder_cfg,
                 num_task_agnostic_layers,
                 output='logits',
                 name='rtd',
                 **kwargs):
        super(ReplacedTokenDetectionHead, self).__init__(name=name, **kwargs)
        self.num_task_agnostic_layers = num_task_agnostic_layers
        self.hidden_size = encoder_cfg['embedding_cfg']['hidden_size']
        self.num_hidden_instances = encoder_cfg['num_hidden_instances']
        self.hidden_cfg = encoder_cfg['hidden_cfg']
        self.activation = self.hidden_cfg['intermediate_activation']
        self.initializer = self.hidden_cfg['kernel_initializer']

        self.hidden_layers = []
        for i in range(self.num_task_agnostic_layers,
                       self.num_hidden_instances):
            self.hidden_layers.append(
                layers.Transformer(
                    num_attention_heads=self.hidden_cfg['num_attention_heads'],
                    intermediate_size=self.hidden_cfg['intermediate_size'],
                    intermediate_activation=self.activation,
                    dropout_rate=self.hidden_cfg['dropout_rate'],
                    attention_dropout_rate=self.hidden_cfg['attention_dropout_rate'],
                    kernel_initializer=self.initializer,
                    name='transformer/layer_%d_rtd' % i))
        self.dense = tf.keras.layers.Dense(self.hidden_size,
                                           activation=self.activation,
                                           kernel_initializer=self.initializer,
                                           name='transform/rtd_dense')
        self.rtd_head = tf.keras.layers.Dense(
            units=1,
            kernel_initializer=self.initializer,
            name='transform/rtd_head')

        if output not in ('predictions', 'logits'):
            raise ValueError((
                'Unknown `output` value "%s". `output` can be either "logits" or '
                '"predictions"') % output)
        self._output_type = output
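For context, a forward pass that consumes these layers could look roughly like the sketch below. The sequence_data / input_mask argument names and the use of layers.SelfAttentionMask are illustrative assumptions, not the original signature.

    def call(self, sequence_data, input_mask):
        attention_mask = layers.SelfAttentionMask()([sequence_data, input_mask])
        data = sequence_data
        for hidden_layer in self.hidden_layers:
            data = hidden_layer([data, attention_mask])
        # Project back to hidden_size, then score each token with a single unit.
        rtd_logits = self.rtd_head(self.dense(data))
        if self._output_type == 'logits':
            return rtd_logits
        # 'predictions': per-token probability that the token was replaced.
        return tf.nn.sigmoid(rtd_logits)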
Example 4
 def build(self, input_shape):
     self.hidden_layers = []
     for i in range(self.num_task_agnostic_layers,
                    self.num_hidden_instances):
         self.hidden_layers.append(
             layers.Transformer(
                 num_attention_heads=self.hidden_cfg['num_attention_heads'],
                 intermediate_size=self.hidden_cfg['intermediate_size'],
                 intermediate_activation=self.activation,
                 dropout_rate=self.hidden_cfg['dropout_rate'],
                  attention_dropout_rate=self.hidden_cfg['attention_dropout_rate'],
                 kernel_initializer=self.initializer,
                 name='transformer/layer_%d_rtd' % i))
     self.dense = tf.keras.layers.Dense(self.hidden_size,
                                        activation=self.activation,
                                        kernel_initializer=self.initializer,
                                        name='transform/rtd_dense')
     self.rtd_head = tf.keras.layers.Dense(
         units=1,
         kernel_initializer=self.initializer,
         name='transform/rtd_head')
Example 5
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            float_dtype='float32',
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'float_dtype': float_dtype,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length)
        position_embeddings = self._position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate,
                                              dtype=tf.float32)(embeddings))

        if float_dtype == 'float16':
            embeddings = tf.cast(embeddings, tf.float16)

        data = embeddings
        attention_mask = MakeAttentionMaskLayer()([data, mask])
        for i in range(num_layers):
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                kernel_initializer=initializer,
                dtype=float_dtype,
                name='transformer/layer_%d' % i)
            data = layer([data, attention_mask])

        first_token_tensor = (tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        super(TransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=[data, cls_output],
                             **kwargs)
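Because the constructor assembles a functional Keras model, the resulting encoder can be called directly on integer inputs. A minimal usage sketch, assuming the class above is available as TransformerEncoder and keeps the default hidden_size of 768:

import numpy as np

encoder = TransformerEncoder(vocab_size=30522, num_layers=2, sequence_length=128)

batch_size, seq_len = 2, 128
word_ids = np.random.randint(0, 30522, size=(batch_size, seq_len)).astype(np.int32)
mask = np.ones((batch_size, seq_len), dtype=np.int32)
type_ids = np.zeros((batch_size, seq_len), dtype=np.int32)

sequence_output, cls_output = encoder([word_ids, mask, type_ids])
print(sequence_output.shape)  # (2, 128, 768)
print(cls_output.shape)       # (2, 768)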
Example 6
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_sizes=(16, ),
            num_float_features=0,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            bert_init_ckpt=None,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        num_type_features = len(type_vocab_sizes)
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_sizes': type_vocab_sizes,
            'num_type_features': num_type_features,
            'num_float_features': num_float_features,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        all_inputs = [word_ids, mask]
        if num_type_features:
            type_ids = tf.keras.layers.Input(shape=(sequence_length,
                                                    num_type_features),
                                             dtype=tf.int32,
                                             name='input_type_ids')
            all_inputs.append(type_ids)
        if num_float_features:
            float_features = tf.keras.layers.Input(shape=(sequence_length,
                                                          num_float_features),
                                                   dtype=tf.float32,
                                                   name='float_features')
            all_inputs.append(float_features)

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = modeling.layers.PositionEmbedding(
            initializer=initializer, max_length=max_sequence_length)
        position_embeddings = self._position_embedding_layer(word_embeddings)
        all_embeddings = [word_embeddings, position_embeddings]

        if num_type_features:
            type_embeddings = [(layers.OnDeviceEmbedding(
                vocab_size=type_vocab_sizes[idx],
                embedding_width=hidden_size,
                initializer=initializer,
                use_one_hot=True,
                name='type_embeddings_{}'.format(idx))(type_ids[..., idx]))
                               for idx in range(num_type_features)]
            all_embeddings += type_embeddings

        if num_float_features:
            float_embeddings = [
                tf.keras.layers.Dense(
                    hidden_size, name='float_features_{}'.format(idx))(
                        # Expanding the last dim here is important.
                        float_features[..., idx, None])
                for idx in range(num_float_features)
            ]
            all_embeddings += float_embeddings

        embeddings = tf.keras.layers.Add()(all_embeddings)
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        encoder_outputs = []
        for i in range(num_layers):
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                kernel_initializer=initializer,
                name='model/layer_with_weights-%d' % (i + 4))
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                encoder_outputs[-1]))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        if return_all_encoder_outputs:
            outputs = [encoder_outputs, cls_output]
        else:
            outputs = [encoder_outputs[-1], cls_output]
        super(TransformerEncoder, self).__init__(inputs=all_inputs,
                                                 outputs=outputs,
                                                 **kwargs)

        if bert_init_ckpt and learner_flags.INIT_CHECKPOINT.value is None:
            self.init_weights(bert_init_ckpt)
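This variant changes the input contract: type ids carry a trailing feature dimension and optional float features arrive as a separate input. A hedged usage sketch, assuming one type-id feature, two float features, and that the modeling/layers imports used above resolve:

import numpy as np

encoder = TransformerEncoder(
    vocab_size=30522,
    num_layers=2,
    sequence_length=64,
    type_vocab_sizes=(2,),
    num_float_features=2)

batch = 4
word_ids = np.random.randint(0, 30522, size=(batch, 64)).astype(np.int32)
mask = np.ones((batch, 64), dtype=np.int32)
type_ids = np.zeros((batch, 64, 1), dtype=np.int32)               # one type feature
float_features = np.random.rand(batch, 64, 2).astype(np.float32)  # two float features

sequence_output, cls_output = encoder(
    [word_ids, mask, type_ids, float_features])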
Example 7
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            output_range=None,
            embedding_width=None,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
            'output_range': output_range,
            'embedding_width': embedding_width,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = self._position_embedding_layer(word_embeddings)
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = self._type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])

        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = self._embedding_projection(embeddings)

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                encoder_outputs[-1]))
        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')
        cls_output = self._pooler_layer(first_token_tensor)

        if return_all_encoder_outputs:
            outputs = [encoder_outputs, cls_output]
        else:
            outputs = [encoder_outputs[-1], cls_output]

        super(TransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=outputs,
                             **kwargs)
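The two output modes and the optional embedding projection can be exercised with a small hedged sketch; the parameter values below are arbitrary and chosen only to keep the model tiny:

import numpy as np

small_encoder = TransformerEncoder(
    vocab_size=100,
    hidden_size=32,
    num_layers=3,
    num_attention_heads=2,
    sequence_length=16,
    intermediate_size=64,
    embedding_width=16,    # smaller than hidden_size, so embedding_projection is created
    return_all_encoder_outputs=True)

word_ids = np.random.randint(0, 100, size=(2, 16)).astype(np.int32)
mask = np.ones((2, 16), dtype=np.int32)
type_ids = np.zeros((2, 16), dtype=np.int32)

all_layer_outputs, cls_output = small_encoder([word_ids, mask, type_ids])
print(len(all_layer_outputs))  # 3: one activation per transformer layer
print(cls_output.shape)        # (2, 32)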