def test_sample(self):
        encoder_input_layer = keras.layers.Input(shape=(512, 768), name='Encoder-Input')
        decoder_input_layer = keras.layers.Input(shape=(512, 768), name='Decoder-Input')
        encoded_layer = get_encoders(
            encoder_num=2,
            input_layer=encoder_input_layer,
            head_num=12,
            hidden_dim=3072,
            dropout_rate=0.0,
        )
        output_layer = get_decoders(
            decoder_num=2,
            input_layer=decoder_input_layer,
            encoded_layer=encoded_layer,
            head_num=12,
            hidden_dim=3072,
            dropout_rate=0.0,
        )
        model = keras.models.Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=output_layer)
        model.compile(optimizer='adam', loss='mse', metrics={})
        model.summary(line_length=160)

        output_layer = get_decoders(
            decoder_num=2,
            input_layer=decoder_input_layer,
            encoded_layer=encoded_layer,
            head_num=12,
            hidden_dim=3072,
            dropout_rate=0.1,
        )
        model = keras.models.Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=output_layer)
        model.compile(optimizer='adam', loss='mse', metrics={})
        model.summary(line_length=160)
        self.assertIsNotNone(model)
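For reference, a minimal sketch (not part of the original test) of pushing dummy inputs through the encoder-decoder model built above; the batch size of 1 is an arbitrary assumption.
import numpy as np

encoder_in = np.random.random((1, 512, 768))   # matches the Encoder-Input shape
decoder_in = np.random.random((1, 512, 768))   # matches the Decoder-Input shape
outputs = model.predict([encoder_in, decoder_in])
print(outputs.shape)  # expected: (1, 512, 768)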
Example #2
def test_mask_result(self):
     input_layer = keras.layers.Input(
         shape=(None, ),
         name='Input',
     )
     embed_layer = keras.layers.Embedding(
         input_dim=12,
         output_dim=9,
         mask_zero=True,
         name='Embedding',
     )(input_layer)
     transformer_layer = get_encoders(
         encoder_num=1,
         input_layer=embed_layer,
         head_num=1,
         hidden_dim=12,
         attention_activation=None,
         feed_forward_activation=gelu,
         dropout_rate=0.1,
     )
     dense_layer = keras.layers.Dense(
         units=12,
         activation='softmax',
         name='Dense',
     )(transformer_layer)
     mask_layer = keras.layers.Input(
         shape=(None, ),
         name='Mask',
     )
     masked_layer, mask_result = Masked(
         return_masked=True,
         name='Masked',
     )([dense_layer, mask_layer])
     print([masked_layer, mask_result])
     model = keras.models.Model(
         inputs=[input_layer, mask_layer],
         outputs=[masked_layer, mask_result],
     )
     model.compile(
         optimizer='adam',
         loss='mse',
     )
     model.summary(line_length=150)
     predicts = model.predict([
         np.asarray([
             [1, 2, 3, 4, 5, 6, 7, 8, 0, 0],
             [1, 2, 3, 4, 0, 0, 0, 0, 0, 0],
         ]),
         np.asarray([
             [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
             [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
         ]),
     ])
     expect = np.asarray([
         [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
         [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
     ])
     self.assertTrue(np.allclose(expect, predicts[1]))
Example #3
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation=gelu,
              custom_layers=None,
              training=True,
              lr=1e-4):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param custom_layers: A function that takes the embedding tensor and returns the tensor after feature extraction.
                          Arguments such as `transformer_num` and `head_num` will be ignored if `custom_layers` is not
                          `None`.
    :param training: The built model will be returned if it is `True`, otherwise the input layers and the last feature
                     extraction layer will be returned.
    :param lr: Learning rate.
    :return: The compiled model.
    """
    inputs = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
        trainable=training,
    )
    transformed = embed_layer
    if custom_layers is not None:
        kwargs = {}
        if keras.utils.generic_utils.has_arg(custom_layers, 'trainable'):
            kwargs['trainable'] = training
        transformed = custom_layers(transformed, **kwargs)
    else:
        transformed = get_encoders(
            encoder_num=transformer_num,
            input_layer=transformed,
            head_num=head_num,
            hidden_dim=feed_forward_dim,
            attention_activation=attention_activation,
            feed_forward_activation=feed_forward_activation,
            dropout_rate=dropout_rate,
            trainable=training,
        )
    if not training:
        return inputs[:2], transformed
    mlm_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation=feed_forward_activation,
        name='MLM-Dense',
    )(transformed)
    mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
    mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
        [mlm_norm_layer, embed_weights])
    masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
    extract_layer = Extract(index=0, name='Extract')(transformed)
    nsp_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        name='NSP-Dense',
    )(extract_layer)
    nsp_pred_layer = keras.layers.Dense(
        units=2,
        activation='softmax',
        name='NSP',
    )(nsp_dense_layer)
    model = keras.models.Model(inputs=inputs,
                               outputs=[masked_layer, nsp_pred_layer])
    model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.sparse_categorical_crossentropy,
        metrics=[],
    )
    return model
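A hedged usage sketch for the get_model variant above; the vocabulary size and the reduced dimensions are placeholder assumptions chosen only to keep the example small.
pretrain_model = get_model(
    token_num=30000,        # assumed vocabulary size
    seq_len=128,
    pos_num=128,
    embed_dim=256,
    transformer_num=2,      # deliberately small configuration for illustration
    head_num=4,
    feed_forward_dim=1024,
    dropout_rate=0.1,
)
# The compiled model expects token, segment and masked-position inputs and
# produces MLM and NSP predictions.
pretrain_model.summary()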
Example #4
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              use_task_embed=False,
              task_num=10,
              use_adapter=False,
              adapter_units=None):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :param use_task_embed: Whether to add task embeddings to the existing embeddings.
    :param task_num: The number of tasks.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in feed-forward adapter.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training
    if adapter_units is None:
        adapter_units = max(1, embed_dim // 100)

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    x, s, m = inputs
    x = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Token-Reshape')(x)
    s = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Segment-Reshape')(s)
    m = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Mention-Reshape')(m)

    embed_layer, embed_weights = get_embedding(
        [x, s, m],
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if use_task_embed:
        task_input = keras.layers.Input(
            shape=(1, ),
            name='Input-Task',
        )
        embed_layer = TaskEmbedding(
            input_dim=task_num,
            output_dim=embed_dim,
            mask_zero=False,
            name='Embedding-Task',
        )([embed_layer, task_input])
        inputs = inputs[:2] + [task_input, inputs[-1]]
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=gelu,
    )
    if training:
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
            [mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    else:
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        if isinstance(output_layer_num, int):
            output_layer_num = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, output_layer_num + 1)]
        outputs = []
        for layer_index in output_layer_num:
            if layer_index < 0:
                layer_index = transformer_num + layer_index
            layer_index += 1
            layer = model.get_layer(
                name='Encoder-{}-FeedForward-Norm'.format(layer_index))
            outputs.append(layer.output)
        if len(outputs) > 1:
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        else:
            transformed = outputs[0]
        return inputs, transformed
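When `training` is `False` the variant above returns the input layers and the encoder output instead of a compiled model; a minimal sketch of wrapping them for feature extraction, again with a deliberately small placeholder configuration.
inputs, features = get_model(
    token_num=30000,        # assumed vocabulary size
    seq_len=128,
    pos_num=128,
    embed_dim=256,
    transformer_num=2,
    head_num=4,
    feed_forward_dim=1024,
    training=False,
    trainable=False,
    output_layer_num=2,     # concatenate the outputs of the last two encoder blocks
)
feature_model = keras.models.Model(inputs=inputs, outputs=features)
feature_model.summary()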
Example #5
def get_model_from_embedding(inputs,
                             embed_layer,
                             transformer_num=12,
                             head_num=12,
                             feed_forward_dim=3072,
                             dropout_rate=0.1,
                             attention_activation=None,
                             feed_forward_activation='gelu',
                             trainable=None,
                             output_layer_num=1):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param inputs: Raw input layers.
    :param embed_layer: The input embedding tensor.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of encoder layers whose outputs will be concatenated as a single output.
    :return: The encoder output tensor and the built model.
    """
    from keras_transformer import get_encoders, gelu
    from keras_layer_normalization import LayerNormalization
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = True

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
    )

    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    if isinstance(output_layer_num, int):
        output_layer_num = min(output_layer_num, transformer_num)
        output_layer_num = [-i for i in range(1, output_layer_num + 1)]
    outputs = []
    for layer_index in output_layer_num:
        if layer_index < 0:
            layer_index = transformer_num + layer_index
        layer_index += 1
        layer = model.get_layer(
            name='Encoder-{}-FeedForward-Norm'.format(layer_index))
        outputs.append(layer.output)
    if len(outputs) > 1:
        transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
            reversed(outputs)))
    else:
        transformed = outputs[0]
    return transformed, model
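A minimal sketch of driving get_model_from_embedding with a plain token embedding; the vocabulary size, sequence length and dimensions are illustrative assumptions, and keras is assumed to be imported at module level as in the function itself.
token_input = keras.layers.Input(shape=(128,), name='Input-Token')
token_embed = keras.layers.Embedding(
    input_dim=30000,        # assumed vocabulary size
    output_dim=256,
    name='Embedding-Token',
)(token_input)
features, model = get_model_from_embedding(
    inputs=[token_input],
    embed_layer=token_embed,
    transformer_num=2,
    head_num=4,
    feed_forward_dim=1024,
    output_layer_num=1,
)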
Example #6
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              weight_decay=0.01,
              attention_activation=None,
              feed_forward_activation=gelu,
              custom_layers=None,
              training=True,
              trainable=None,
              output_layer_num=1,
              decay_steps=100000,
              warmup_steps=10000,
              lr=1e-4):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param weight_decay: Weight decay rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param custom_layers: A function that takes the embedding tensor and returns the tensor after feature extraction.
                          Arguments such as `transformer_num` and `head_num` will be ignored if `custom_layers` is not
                          `None`.
    :param training: The built model will be returned if it is `True`, otherwise the input layers and the last feature
                     extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :param decay_steps: Learning rate will decay linearly to zero over `decay_steps` steps.
    :param warmup_steps: Learning rate will increase linearly to `lr` during the first `warmup_steps` steps.
    :param lr: Learning rate.
    :return: The compiled model.
    """
    if trainable is None:
        trainable = training
    inputs = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
        trainable=trainable,
    )
    transformed = embed_layer
    if custom_layers is not None:
        kwargs = {}
        if keras.utils.generic_utils.has_arg(custom_layers, 'trainable'):
            kwargs['trainable'] = trainable
        transformed = custom_layers(transformed, **kwargs)
    else:
        transformed = get_encoders(
            encoder_num=transformer_num,
            input_layer=transformed,
            head_num=head_num,
            hidden_dim=feed_forward_dim,
            attention_activation=attention_activation,
            feed_forward_activation=feed_forward_activation,
            dropout_rate=dropout_rate,
            trainable=trainable,
        )
    if not training:
        if output_layer_num > 1:
            if output_layer_num > transformer_num:
                output_layer_num = transformer_num
            model = keras.models.Model(inputs=inputs[:2], outputs=transformed)
            outputs = []
            for i in range(output_layer_num):
                layer = model.get_layer(
                    name='Encoder-{}-FeedForward-Norm'.format(transformer_num -
                                                              i))
                outputs.append(layer.output)
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        return inputs[:2], transformed
    mlm_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation=feed_forward_activation,
        trainable=trainable,
        name='MLM-Dense',
    )(transformed)
    mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
    mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
        [mlm_norm_layer, embed_weights])
    masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
    extract_layer = Extract(index=0, name='Extract')(transformed)
    nsp_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        trainable=trainable,
        name='NSP-Dense',
    )(extract_layer)
    nsp_pred_layer = keras.layers.Dense(
        units=2,
        activation='softmax',
        trainable=trainable,
        name='NSP',
    )(nsp_dense_layer)
    model = keras.models.Model(inputs=inputs,
                               outputs=[masked_layer, nsp_pred_layer])
    if weight_decay:
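        # keras.regularizers.l2(a) adds a * sum(w ** 2) to the loss, so halving
        # weight_decay makes the resulting decay gradient equal weight_decay * w.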
        weight_decay *= 0.5
        for layer in model.layers:
            if hasattr(layer, 'embeddings_regularizer'):
                layer.embeddings_regularizer = keras.regularizers.l2(
                    weight_decay)
            if hasattr(layer, 'kernel_regularizer'):
                layer.kernel_regularizer = keras.regularizers.l2(weight_decay)
    model.compile(
        optimizer=AdamWarmup(decay_steps=decay_steps,
                             warmup_steps=warmup_steps,
                             lr=lr),
        loss=keras.losses.sparse_categorical_crossentropy,
    )
    return model
Example #7
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              weight_decay=0.01,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              decay_steps=100000,
              warmup_steps=10000,
              lr=1e-4):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param weight_decay: Weight decay rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :param decay_steps: Learning rate will decay linearly to zero over `decay_steps` steps.
    :param warmup_steps: Learning rate will increase linearly to `lr` during the first `warmup_steps` steps.
    :param lr: Learning rate.
    :return: The compiled model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
    )
    if training:
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
            [mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        model.compile(
            optimizer=AdamWarmup(
                decay_steps=decay_steps,
                warmup_steps=warmup_steps,
                lr=lr,
                weight_decay=weight_decay,
                weight_decay_pattern=[
                    'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
                ],
            ),
            loss=keras.losses.sparse_categorical_crossentropy,
        )
        return model
    else:
        inputs = inputs[:2]
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        output_layer_num = min(output_layer_num, transformer_num)
        if output_layer_num > 1:
            outputs = []
            for i in range(output_layer_num):
                layer = model.get_layer(
                    name='Encoder-{}-FeedForward-Norm'.format(transformer_num -
                                                              i))
                outputs.append(layer.output)
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        return inputs, transformed
Example #8
def transformer_models(trainX, trainy, valX, valy, embedding, vocab, maxlen,
                       head_num, encoder_num, hidden_dim, project_name):
    # Import the layers from keras (rather than tensorflow.keras) to avoid runtime incompatibilities in this model.
    from keras.layers import Input, MaxPooling1D, Flatten, Dense, Embedding, SpatialDropout1D, Dropout, Conv1D
    from keras.models import Model
    from keras.utils import to_categorical
    from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
    from keras.optimizers import Adam
    import keras
    input = Input(shape=(maxlen, ), dtype="int32")

    # embedding
    x = Embedding(len(vocab) + 1, 300, weights=[embedding],
                  trainable=False)(input)
    #x = Dropout(0.2)(x)
    print(x.shape)
    # position embedding
    x = PositionEmbedding(120, 300, "add")(x)

    encoded = get_encoders(encoder_num=encoder_num,
                           input_layer=x,
                           head_num=head_num,
                           hidden_dim=hidden_dim,
                           attention_activation="relu",
                           dropout_rate=0.1)

    # three convolutional feature extractors over the encoder output, with kernel sizes 2, 3 and 4
    cnn1 = Conv1D(32, 2, padding="same", strides=1, activation="relu")(encoded)
    cnn1 = MaxPooling1D()(cnn1)

    cnn2 = Conv1D(32, 3, padding="same", strides=1, activation="relu")(encoded)
    cnn2 = MaxPooling1D()(cnn2)

    cnn3 = Conv1D(32, 4, padding="same", strides=1, activation="relu")(encoded)
    cnn3 = MaxPooling1D()(cnn3)

    features = keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)

    output = Flatten()(features)
    output = Dropout(0.2)(output)
    output = Dense(2, activation="softmax")(output)
    trainy = to_categorical(trainy, 2)
    val_y = to_categorical(valy, 2)
    model = Model(inputs=input, outputs=output)
    path = "../data/experiment_results/RQ5/model/" + project_name + str(
        hidden_dim) + ".h5"
    early_stopping = EarlyStopping()
    callbacks = [early_stopping, ModelCheckpoint(path, save_best_only=True)]

    model.compile(optimizer=Adam(0.005),
                  loss="binary_crossentropy",
                  metrics=[km.recall(),
                           km.precision(),
                           km.f1_score()])
    model.fit(trainX,
              trainy,
              batch_size=64,
              epochs=5,
              callbacks=callbacks,
              validation_data=(valX, val_y))
    pred = model.predict(valX)
    pred = pred.argmax(-1)

    f1 = f1_score(valy, pred)
    precision = precision_score(valy, pred)
    recall = recall_score(valy, pred)
    return f1, precision, recall
Example #9
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
    )
    if training:
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')([mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs, outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    else:
        inputs = inputs[:2]
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        if isinstance(output_layer_num, int):
            output_layer_num = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, output_layer_num + 1)]
        outputs = []
        for layer_index in output_layer_num:
            if layer_index < 0:
                layer_index = transformer_num + layer_index
            layer_index += 1
            layer = model.get_layer(name='Encoder-{}-FeedForward-Norm'.format(layer_index))
            outputs.append(layer.output)
        if len(outputs) > 1:
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(reversed(outputs)))
        else:
            transformed = outputs[0]
        return inputs, transformed