Code Example #1
def build_xlnet(args):
    # Load pretrained model
    model = load_trained_model_from_checkpoint(
        config_path=args.config_path,
        checkpoint_path=args.model_path,
        batch_size=args.batch_size,
        memory_len=0,
        target_len=args.maxlen,
        in_train_phase=False,
        attention_type=ATTENTION_TYPE_BI,
    )

    # Build classification model
    last = model.output
    extract = Extract(index=-1, name='Extract')(last)
    output = keras.layers.Dense(units=args.nclass,
                                activation='softmax',
                                name='Softmax')(extract)
    model = keras.models.Model(inputs=model.inputs, outputs=output)
    model.summary()

    # Compile model
    model.compile(
        optimizer=RAdam(args.lr),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model
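For reference, a minimal sketch (not from the original project) of how build_xlnet above might be invoked; the argparse-style args object and every path and hyper-parameter in it are placeholders. Note that Extract(index=-1) is used because XLNet's classification token sits at the end of the sequence, unlike BERT's [CLS] at position 0.

from argparse import Namespace

# Hypothetical argument set; replace the paths with real checkpoint files.
args = Namespace(
    config_path='xlnet_config.json',
    model_path='xlnet_model.ckpt',
    batch_size=16,
    maxlen=128,
    nclass=2,
    lr=2e-5,
)
model = build_xlnet(args)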
Code Example #2
def build_model():
    model = load_trained_model_from_checkpoint(
        config_path=paths.config,
        checkpoint_path=paths.model,
        batch_size=BATCH_SIZE,
        memory_len=MEMORY_LEN,
        target_len=TEXT_LEN,
        in_train_phase=False,
        attention_type=ATTENTION_TYPE_BI)

    # Load pretrained weights
    # Build classification model
    last = model.output
    extract = Extract(index=-1, name='Extract')(last)
    dense = keras.layers.Dense(units=768, name='Dense')(extract)
    norm = keras.layers.BatchNormalization(name='Normal')(dense)
    output = keras.layers.Dense(units=2, activation='softmax',
                                name='Softmax')(norm)
    model = keras.models.Model(inputs=model.inputs, outputs=output)

    model.compile(
        optimizer=Adam(learning_rate=LEARNING_RATE),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )

    return model
Code Example #3
File: test_extract.py  Project: WQAQs/keras-bert
 def test_sample(self):
     input_layer = keras.layers.Input(
         shape=(3, 4),
         name='Input',
     )
     extract_layer = Extract(
         index=1,
         name='Extract'
     )(input_layer)
     model = keras.models.Model(
         inputs=input_layer,
         outputs=extract_layer,
     )
     model.compile(
         optimizer='adam',
         loss='mse',
         metrics={},
     )
     model.summary()
     inputs = np.asarray([[
         [0.1, 0.2, 0.3, 0.4],
         [-0.1, 0.2, -0.3, 0.4],
         [0.1, -0.2, 0.3, -0.4],
     ]])
     predict = model.predict(inputs)
     expected = np.asarray([[-0.1, 0.2, -0.3, 0.4]])
     self.assertTrue(np.allclose(expected, predict), predict)
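The behaviour the test above checks can be written directly in numpy: Extract(index=i) slices position i along the sequence axis, i.e. x[:, i]. A small self-contained check:

import numpy as np

inputs = np.asarray([[
    [0.1, 0.2, 0.3, 0.4],
    [-0.1, 0.2, -0.3, 0.4],
    [0.1, -0.2, 0.3, -0.4],
]])
# Equivalent of Extract(index=1): the vector at sequence position 1.
assert np.allclose(inputs[:, 1], [[-0.1, 0.2, -0.3, 0.4]])
# index=0 and index=-1, as used by the BERT and XLNet examples respectively.
assert np.allclose(inputs[:, 0], [[0.1, 0.2, 0.3, 0.4]])
assert np.allclose(inputs[:, -1], [[0.1, -0.2, 0.3, -0.4]])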
Code Example #4
def build_bert(model, poolings=None, output_layer_num=1):
    """Extract embeddings from texts.

    :param model: Path to the checkpoint or built model without MLM and NSP.
    :param texts: Iterable texts.
    :param poolings: Pooling methods. Word embeddings will be returned if it is None.
                     Otherwise concatenated pooled embeddings will be returned.
    :param vocabs: A dict should be provided if model is built.
    :param cased: Whether it is cased for tokenizer.
    :param batch_size: Batch size.
    :param cut_embed: The computed embeddings will be cut based on their input lengths.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `model` is a path to checkpoint.
    :return: A list of numpy arrays representing the embeddings.
    """
    # Note: the `model` argument is immediately replaced by the downloaded
    # multilingual checkpoint path before the isinstance check below.
    model = get_pretrained(PretrainedList.multi_cased_base)
    if isinstance(model, (str, type(u''))):
        paths = get_checkpoint_paths(model)
        model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=output_layer_num,
        )

    outputs = []

    if poolings is not None:
        if isinstance(poolings, (str, type(u''))):
            poolings = [poolings]
        # outputs = []
        for pooling in poolings:
            if pooling == POOL_NSP:
                outputs.append(
                    Extract(index=0, name='Pool-NSP')(model.outputs[0]))
            elif pooling == POOL_MAX:
                outputs.append(
                    MaskedGlobalMaxPool1D(name='Pool-Max')(model.outputs[0]))
            elif pooling == POOL_AVE:
                outputs.append(
                    keras.layers.GlobalAvgPool1D(name='Pool-Ave')(
                        model.outputs[0]))
            else:
                raise ValueError('Unknown pooling method: {}'.format(pooling))
        # print(outputs)
        if len(outputs) == 1:
            outputs = outputs[0]
        else:
            outputs = keras.layers.Concatenate(name='Concatenate')(outputs)
        outputs = Lambda(bert_output_sum)(outputs)
        # model = keras.models.Model(inputs=model.inputs, outputs=outputs)
    return model.inputs, outputs
Code Example #5
def get_finetune_model():
    word2id_dict, id2word_dict = utils.get_word_id_map(word_id_map_file_path)
    input_layer, transformed = keras_bert.my_get_model(
        token_num=len(word2id_dict),
        head_num=hp.head_num,
        transformer_num=hp.transformer_num,
        embed_dim=hp.embed_dim,
        feed_forward_dim=hp.feed_forward_dim,
        dropout_rate=hp.dropout_rate,
        seq_len=hp.seq_len,
        pos_num=hp.seq_len,
        attention_activation='gelu',
        training=False,  # NOTE: this must be set to False (do not forget!)
        trainable=True)
    # output_layer = model.inputs[:2]
    # dense = model.get_layer('Encoder-2-FeedForward-Norm').output
    # output_layer = keras.layers.Dense(units=2, activation='relu')(dense)
    extract_layer = Extract(index=0, name='Extract')(transformed)
    # coor_dense = keras.layers.Dense(units=embed_dim, activation="relu", name="coor_dense")(transformed)
    output_layer = keras.layers.Dense(units=2,
                                      activation="relu",
                                      name="coor_output")(extract_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    return model
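A minimal usage sketch (assumed, not part of the snippet): load pretrained weights by name and compile for the two-dimensional coordinate regression, mirroring what code example #9 below does with the same model.

model = get_finetune_model()
# pretrained_model_path is a module-level constant in the original project.
model.load_weights(pretrained_model_path, by_name=True)
model.compile(
    optimizer=keras.optimizers.RMSprop(1e-4),  # illustrative learning rate
    loss='mse',
    metrics=['mae', 'mse'],
)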
Code Example #6
train_seq = generate_sequence(train_path)
dev_seq = generate_sequence(dev_path)

# Load pretrained model
model = load_trained_model_from_checkpoint(
    config_path=paths.config,
    checkpoint_path=paths.model,
    batch_size=BATCH_SIZE,
    memory_len=0,
    target_len=SEQ_LEN,
    in_train_phase=False,
    attention_type=ATTENTION_TYPE_BI,
)

# Build classification model
last = Extract(index=-1, name='Extract')(model.output)
dense = keras.layers.Dense(units=768, activation='tanh', name='Dense')(last)
dropout = keras.layers.Dropout(rate=0.1, name='Dropout')(dense)
output = keras.layers.Dense(units=2, activation='softmax',
                            name='Softmax')(dropout)
model = keras.models.Model(inputs=model.inputs, outputs=output)
model.summary()

# Fit model
if os.path.exists(MODEL_NAME):
    model.load_weights(MODEL_NAME)

model.compile(
    optimizer=keras.optimizers.Adam(lr=3e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)
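The script would normally continue with training; a plausible continuation (a sketch, not the author's exact code) fits on the sequences built at the top of the snippet:

model.fit_generator(
    generator=train_seq,
    validation_data=dev_seq,
    epochs=10,  # illustrative value
    callbacks=[keras.callbacks.ModelCheckpoint(MODEL_NAME, save_best_only=True)],
)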
Code Example #7
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.
    See: https://arxiv.org/pdf/1909.11942.pdf
    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention
                    in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer
                             in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned
                     if it is `True`, otherwise the input layers and the last
                     feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len, ), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len, ), name='Input-Segment')
    inputs = [input_token, input_segment]

    # Build embeddings
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')(
        [embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)

    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    # Build shared transformer
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(units=feed_forward_dim,
                                     activation=feed_forward_activation,
                                     name='Feed-Forward')
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')

    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1),
        )([attention_input, transformed])
        transformed = attention_normal(transformed)

        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1),
        )([feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)

    if training:
        # Build tasks
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        if isinstance(output_layers, list):
            output_layers = [
                transformed_layers[index] for index in output_layers
            ]
            output = keras.layers.Concatenate(name='Output', )(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
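A minimal sketch of calling build_albert above (vocabulary size and lengths are placeholders). With training=True the returned model exposes the MLM and SOP heads; with training=False the function returns the input tensors and the encoder output instead.

pretrain_model = build_albert(
    token_num=30000,  # placeholder vocabulary size
    seq_len=128,
    pos_num=128,
    training=True,
)
pretrain_model.summary()

inputs, features = build_albert(
    token_num=30000,
    seq_len=128,
    pos_num=128,
    training=False,
    trainable=True,
)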
Code Example #8
File: modeling_kb.py  Project: midori1/zeshel
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              use_task_embed=False,
              task_num=10,
              use_adapter=False,
              adapter_units=None):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :param use_task_embed: Whether to add task embeddings to the existing embeddings.
    :param task_num: The number of tasks.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in feed-forward adapter.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training
    if adapter_units is None:
        adapter_units = max(1, embed_dim // 100)

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    x, s, m = inputs
    x = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Token-Reshape')(x)
    s = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Segment-Reshape')(s)
    m = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Mention-Reshape')(m)

    embed_layer, embed_weights = get_embedding(
        [x, s, m],
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if use_task_embed:
        task_input = keras.layers.Input(
            shape=(1, ),
            name='Input-Task',
        )
        embed_layer = TaskEmbedding(
            input_dim=task_num,
            output_dim=embed_dim,
            mask_zero=False,
            name='Embedding-Task',
        )([embed_layer, task_input])
        inputs = inputs[:2] + [task_input, inputs[-1]]
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=gelu,
    )
    if training:
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
            [mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    else:
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        if isinstance(output_layer_num, int):
            output_layer_num = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, output_layer_num + 1)]
        outputs = []
        for layer_index in output_layer_num:
            if layer_index < 0:
                layer_index = transformer_num + layer_index
            layer_index += 1
            layer = model.get_layer(
                name='Encoder-{}-FeedForward-Norm'.format(layer_index))
            outputs.append(layer.output)
        if len(outputs) > 1:
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        else:
            transformed = outputs[0]
        return inputs, transformed
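As with the ALBERT variant, a hedged usage sketch: with training=False, get_model above returns the input tensors and the encoder output, which can be wrapped in a Model with a classification head built from the same Extract layer. All sizes are placeholders.

inputs, features = get_model(
    token_num=30000,  # placeholder vocabulary size
    seq_len=128,
    pos_num=128,
    training=False,
    trainable=True,
    output_layer_num=1,
)
cls_vector = Extract(index=0, name='Extract-CLS')(features)
probs = keras.layers.Dense(units=2, activation='softmax', name='Output')(cls_vector)
classifier = keras.models.Model(inputs=inputs, outputs=probs)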
Code Example #9
def bert_indoorlocation_train_with_label():
    # GPU/session configuration (only takes effect if a tf.Session is created
    # with it; the session code below is commented out)
    config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.9),
    )
    config.gpu_options.allow_growth = True

    # Prepare the training and validation data
    word2id_dict, id2word_dict = utils.get_word_id_map(word_id_map_file_path)
    x_train, y_train, reference_tags_train = utils.gen_fine_tune_bert_data(
        train_datafile_path, seq_len)
    x_valid, y_valid, reference_tags_valid = utils.gen_fine_tune_bert_data(
        valid_datafile_path, seq_len)
    # x_test, y_test, reference_tags_test = utils.gen_fine_tune_bert_data(test_datafile_name, seq_len)

    # x_train, y_train = np.array(x_train), np.array(y_train)
    # x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], x_train.shape[2], 1))
    # y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], 1))
    # with tf.Session(config=config) as sess:

    # model = load_trained_model_from_checkpoint(
    #     config_path,
    #     checkpoint_path,
    #     training=False,  # !!! very important !!!
    #     trainable=True,
    #     seq_len=seqence_len,
    # )
    # Initialize the model and parameters
    # mymodel, myconfig = build_model_from_config(
    #     config_path,
    #     training=False,
    #     trainable=True)
    input_layer, transformed = keras_bert.my_get_model(
        token_num=len(word2id_dict),
        head_num=hp.head_num,
        transformer_num=hp.transformer_num,
        embed_dim=hp.embed_dim,
        feed_forward_dim=hp.feed_forward_dim,
        dropout_rate=hp.dropout_rate,
        seq_len=hp.seq_len,
        pos_num=hp.seq_len,
        attention_activation='gelu',
        training=False,  # NOTE: this must be set to False (do not forget!)
        trainable=True)
    # output_layer = model.inputs[:2]
    # dense = model.get_layer('Encoder-2-FeedForward-Norm').output
    # output_layer = keras.layers.Dense(units=2, activation='relu')(dense)
    extract_layer = Extract(index=0, name='Extract')(transformed)
    # coor_dense = keras.layers.Dense(units=embed_dim, activation="relu", name="coor_dense")(transformed)
    output_layer = keras.layers.Dense(units=2,
                                      activation="relu",
                                      name="coor_output")(extract_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    if flag_retrain or only_evaluate_history_model_flag:
        model.load_weights(trained_model_path)
    else:
        model.load_weights(pretrained_model_path, by_name=True)
    model.summary()

    if not only_evaluate_history_model_flag:
        optimizer = keras.optimizers.RMSprop(LR)
        model.compile(
            optimizer=optimizer,
            loss='mse',
            metrics=['mae', 'mse'],
        )
        early_stopping = keras.callbacks.EarlyStopping(monitor="loss",
                                                       patience=5)
        # model.fit(
        #     x_train,
        #     y_train,
        #     validation_data=(x_valid, y_valid),
        #     epochs=EPOCHS,
        #     batch_size=BATCH_SIZE,
        #     callbacks=[early_stopping]
        # )
        model.fit(x_train,
                  y_train,
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  callbacks=[early_stopping])
        model.save(trained_model_path)

        # predicts = model.predict(x_train)
        # labels = y_train
        # reference_tags = reference_tags_train
        # evaluate_fine_tune_model(predicts, labels, reference_tags)

        # predicts = model.predict(x_test)
        # labels = y_test
        # reference_tags = reference_tags_test
        # utils.evaluate_fine_tune_model(predicts, labels, reference_tags)
        utils.evaluate_fine_tune_model(model, test_datafile_path)
    else:
        # predicts = model.predict(x_test)
        # labels = y_test
        # reference_tags = reference_tags_test
        # utils.evaluate_fine_tune_model(predicts, labels, reference_tags)
        utils.evaluate_fine_tune_model(model, test_datafile_path)
Code Example #10
File: train.py  Project: wayne841213/NLP
model = load_trained_model_from_checkpoint(
    config_path=paths.config,
    checkpoint_path=paths.model,
    batch_size=BATCH_SIZE,
    memory_len=0,
    target_len=SEQ_LEN,
    in_train_phase=False,
    attention_type=ATTENTION_TYPE_BI,
)

# Load pretrained weights

# Build classification model
last = model.output
extract = Extract(index=-1, name='Extract')(last)
dense = keras.layers.Dense(units=768, name='Dense')(extract)
norm = keras.layers.BatchNormalization(name='Normal')(dense)
output = keras.layers.Dense(units=11, activation='softmax',
                            name='Softmax')(norm)
model = keras.models.Model(inputs=model.inputs, outputs=output)
model.summary()

# Define the optimizer, loss and metrics
model.compile(
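    # Plausible completion of the truncated call (a sketch mirroring code
    # example #2 above; the learning rate is illustrative, not from the source):
    optimizer=keras.optimizers.Adam(2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)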
Code Example #11
def get_checkpoint_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              finetuned=False,
              output_dim=2,
              trainable=None,
              output_layer_num=1,
              retention_configuration=None,
              LAMBDA=None,
              FLAG_EXTRACT_LAYER=None,
              TASK=None,
              ):
        """Get BERT model.
        :param token_num: Number of tokens.
        :param pos_num: Maximum position.
        :param seq_len: Maximum length of the input sequence or None.
        :param embed_dim: Dimensions of embeddings.
        :param transformer_num: Number of transformers.
        :param head_num: Number of heads in multi-head attention in each transformer.
        :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
        :param dropout_rate: Dropout rate.
        :param attention_activation: Activation for attention layers.
        :param feed_forward_activation: Activation for feed-forward layers.
        :param trainable: Whether the model is trainable.
        :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
        :return: The built model.
        """
        if attention_activation == 'gelu':
                attention_activation = gelu
        if feed_forward_activation == 'gelu':
                feed_forward_activation = gelu
        if trainable is None:
                trainable = training
        def _trainable(_layer):
                if isinstance(trainable, (list, tuple, set)):
                    for prefix in trainable:
                        if _layer.name.startswith(prefix):
                            return True
                    return False
                return trainable

        inputs = get_inputs(seq_len=seq_len)
        attention_mask = inputs[2]
        embed_layer, embed_weights = get_embedding(
                inputs,
                token_num=token_num,
                embed_dim=embed_dim,
                pos_num=pos_num,
                dropout_rate=dropout_rate,
        )

        if dropout_rate > 0.0:
                dropout_layer = keras.layers.Dropout(
                    rate=dropout_rate,
                    name='Embedding-Dropout',
                )(embed_layer)
        else:
                dropout_layer = embed_layer
        embed_layer = LayerNormalization(
                trainable=trainable,
                name='Embedding-Norm',
        )(dropout_layer)

        transformed = get_encoders(
                encoder_num=transformer_num,
                input_layer=embed_layer,
                head_num=head_num,
                hidden_dim=feed_forward_dim,
                attention_activation=attention_activation,
                feed_forward_activation=feed_forward_activation,
                dropout_rate=dropout_rate,
                attention_mask=attention_mask,
                SEQ_LEN=seq_len,
                retention_configuration=retention_configuration,
                LAMBDA=LAMBDA,
                FLAG_EXTRACT_LAYER=FLAG_EXTRACT_LAYER,
        )
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
                units=embed_dim,
                activation='tanh',
                name='NSP-Dense',
        )(extract_layer)
        if TASK == 'sts-b':
            nsp_pred_layer = keras.layers.Dense(
                 units=output_dim,
                 name='NSP',
            )(nsp_dense_layer)
        else:
            nsp_pred_layer = keras.layers.Dense(
                 units=output_dim,
                 activation='softmax',
                 name='NSP',
            )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs, outputs=nsp_pred_layer)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
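Finally, a hedged sketch of instantiating get_checkpoint_model above for a two-class task; every value is a placeholder and the retention/extraction arguments are simply left at their defaults.

model = get_checkpoint_model(
    token_num=30000,  # placeholder vocabulary size
    seq_len=128,
    pos_num=128,
    training=False,
    trainable=True,
    output_dim=2,
    TASK='mrpc',  # any task other than 'sts-b' gets the softmax head
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)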