Example #1
 def test_sample(self):
     inputs = get_inputs(seq_len=512)
     embed_layer = get_embedding(inputs,
                                 token_num=12,
                                 embed_dim=768,
                                 pos_num=512)
     model = keras.models.Model(inputs=inputs, outputs=embed_layer)
     model.compile(
         optimizer='adam',
         loss='mse',
         metrics={},
     )
     model.summary(line_length=120)
     self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
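The method above is an excerpt from a larger TestCase and relies on helpers from the keras-bert package. A minimal stand-alone sketch of the same check is given below; the import paths and the test-class name are assumptions, and note that Example #2 below unpacks a (layer, weights) pair from get_embedding, so the exact return value depends on the library version:

import unittest

import keras
from keras_bert.layers import get_inputs, get_embedding


class EmbeddingShapeTest(unittest.TestCase):

    def test_sample(self):
        inputs = get_inputs(seq_len=512)
        # Recent keras-bert versions return (embedding layer, token weights),
        # as Example #2 below does; adjust if your version returns only the layer.
        embed_layer, _ = get_embedding(inputs, token_num=12, embed_dim=768, pos_num=512)
        model = keras.models.Model(inputs=inputs, outputs=embed_layer)
        model.summary(line_length=120)
        self.assertEqual((None, 512, 768), model.layers[-1].output_shape)


if __name__ == '__main__':
    unittest.main()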
Example #2
 def test_sample(self):
     inputs = get_inputs(seq_len=512)
     embed_layer, _ = get_embedding(inputs, token_num=12, embed_dim=768, pos_num=512)
     masked_layer = Masked(name='Masked')([embed_layer, inputs[-1]])
     model = keras.models.Model(inputs=inputs, outputs=masked_layer)
     model.compile(
         optimizer='adam',
         loss='mse',
         metrics={},
     )
     model.summary()
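     # The three arrays below match the three inputs produced by get_inputs
     # (in keras-bert: token ids, segment ids, masked positions); the last one feeds the Masked layer above.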
     model.predict([
         np.asarray([[1] + [0] * 511]),
         np.asarray([[0] * 512]),
         np.asarray([[1] + [0] * 511]),
     ])
     self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
Example #3
def get_checkpoint_model(token_num,
                         pos_num=512,
                         seq_len=512,
                         embed_dim=768,
                         transformer_num=12,
                         head_num=12,
                         feed_forward_dim=3072,
                         dropout_rate=0.1,
                         attention_activation=None,
                         feed_forward_activation='gelu',
                         training=True,
                         finetuned=False,
                         output_dim=2,
                         trainable=None,
                         output_layer_num=1,
                         retention_configuration=None,
                         LAMBDA=None,
                         FLAG_EXTRACT_LAYER=None,
                         TASK=None,
                         ):
    """Get BERT model.

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: Only used as the default value of `trainable` here.
    :param finetuned: Unused in this function.
    :param output_dim: Number of units in the final prediction layer.
    :param trainable: Whether the model is trainable, or a collection of layer-name
                      prefixes that should remain trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :param retention_configuration: Passed through to `get_encoders`.
    :param LAMBDA: Passed through to `get_encoders`.
    :param FLAG_EXTRACT_LAYER: Passed through to `get_encoders`.
    :param TASK: Task name; `'sts-b'` gets a linear regression head, all other tasks a softmax head.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        # `trainable` may be a bool or a collection of layer-name prefixes.
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    attention_mask = inputs[2]
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )

    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        attention_mask=attention_mask,
        SEQ_LEN=seq_len,
        retention_configuration=retention_configuration,
        LAMBDA=LAMBDA,
        FLAG_EXTRACT_LAYER=FLAG_EXTRACT_LAYER,
    )
    # The representation of the first token ([CLS]) feeds the prediction head.
    extract_layer = Extract(index=0, name='Extract')(transformed)
    nsp_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        name='NSP-Dense',
    )(extract_layer)
    if TASK == 'sts-b':
        # STS-B is a regression task, so the output layer has no activation.
        nsp_pred_layer = keras.layers.Dense(
            units=output_dim,
            name='NSP',
        )(nsp_dense_layer)
    else:
        # Classification tasks use a softmax head.
        nsp_pred_layer = keras.layers.Dense(
            units=output_dim,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
    model = keras.models.Model(inputs=inputs, outputs=nsp_pred_layer)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return model
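For orientation, a hedged usage sketch of the function above; the argument values are illustrative (the vocabulary size, sequence length and task name are assumptions, not values from the original project), and it presumes the project's get_inputs, get_embedding and get_encoders helpers are importable:

# Illustrative call only; token_num, seq_len and TASK are assumed values.
model = get_checkpoint_model(
    token_num=30522,   # e.g. the BERT-base uncased vocabulary size
    seq_len=128,
    pos_num=512,
    output_dim=2,      # two-way classification head
    TASK='mrpc',       # any task other than 'sts-b' uses the softmax head
)
model.summary()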