def test_sample(self):
    """Build an encoder-decoder stack and check the model can be constructed
    and compiled, both with dropout disabled and enabled.

    The decoder stack (and the wrapping model) is rebuilt once per dropout
    rate; previously the whole build/compile/summary sequence was duplicated
    verbatim for the two rates.
    """
    encoder_input_layer = keras.layers.Input(shape=(512, 768), name='Encoder-Input')
    decoder_input_layer = keras.layers.Input(shape=(512, 768), name='Decoder-Input')
    encoded_layer = get_encoders(
        encoder_num=2,
        input_layer=encoder_input_layer,
        head_num=12,
        hidden_dim=3072,
        dropout_rate=0.0,
    )
    model = None
    for dropout_rate in (0.0, 0.1):
        output_layer = get_decoders(
            decoder_num=2,
            input_layer=decoder_input_layer,
            encoded_layer=encoded_layer,
            head_num=12,
            hidden_dim=3072,
            dropout_rate=dropout_rate,
        )
        model = keras.models.Model(
            inputs=[encoder_input_layer, decoder_input_layer],
            outputs=output_layer,
        )
        model.compile(optimizer='adam', loss='mse', metrics={})
        model.summary(line_length=160)
    self.assertIsNotNone(model)
def test_mask_result(self):
    """`Masked(return_masked=True)` must echo the mask input unchanged as its
    second output, regardless of what the first (data) input contains.

    Removed a leftover debug `print` of the symbolic output tensors.
    """
    input_layer = keras.layers.Input(
        shape=(None, ),
        name='Input',
    )
    # mask_zero=True so padding positions carry a Keras mask downstream.
    embed_layer = keras.layers.Embedding(
        input_dim=12,
        output_dim=9,
        mask_zero=True,
        name='Embedding',
    )(input_layer)
    transformer_layer = get_encoders(
        encoder_num=1,
        input_layer=embed_layer,
        head_num=1,
        hidden_dim=12,
        attention_activation=None,
        feed_forward_activation=gelu,
        dropout_rate=0.1,
    )
    dense_layer = keras.layers.Dense(
        units=12,
        activation='softmax',
        name='Dense',
    )(transformer_layer)
    mask_layer = keras.layers.Input(
        shape=(None, ),
        name='Mask',
    )
    masked_layer, mask_result = Masked(
        return_masked=True,
        name='Masked',
    )([dense_layer, mask_layer])
    model = keras.models.Model(
        inputs=[input_layer, mask_layer],
        outputs=[masked_layer, mask_result],
    )
    model.compile(
        optimizer='adam',
        loss='mse',
    )
    model.summary(line_length=150)
    predicts = model.predict([
        np.asarray([
            [1, 2, 3, 4, 5, 6, 7, 8, 0, 0],
            [1, 2, 3, 4, 0, 0, 0, 0, 0, 0],
        ]),
        np.asarray([
            [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
            [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
        ]),
    ])
    # The second model output must be exactly the mask that was fed in.
    expect = np.asarray([
        [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
        [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
    ])
    self.assertTrue(np.allclose(expect, predicts[1]))
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation=gelu,
              custom_layers=None,
              training=True,
              lr=1e-4):
    """Build a BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param custom_layers: A function that takes the embedding tensor and returns
        the tensor after feature extraction. Arguments such as `transformer_num`
        and `head_num` are ignored when `custom_layers` is not `None`.
    :param training: When `True` the compiled model is returned; otherwise the
        first two input layers and the last feature-extraction tensor are returned.
    :param lr: Learning rate.
    :return: The compiled model, or `(inputs, features)` when not training.
    """
    input_layers = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        input_layers,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
        trainable=training,
    )
    if custom_layers is not None:
        # A user-supplied feature extractor replaces the transformer stack;
        # it only receives `trainable` if its signature accepts it.
        extractor_kwargs = {}
        if keras.utils.generic_utils.has_arg(custom_layers, 'trainable'):
            extractor_kwargs['trainable'] = training
        features = custom_layers(embed_layer, **extractor_kwargs)
    else:
        features = get_encoders(
            encoder_num=transformer_num,
            input_layer=embed_layer,
            head_num=head_num,
            hidden_dim=feed_forward_dim,
            attention_activation=attention_activation,
            feed_forward_activation=feed_forward_activation,
            dropout_rate=dropout_rate,
            trainable=training,
        )
    if not training:
        # Feature-extraction mode: drop the MLM mask input.
        return input_layers[:2], features
    # Masked-language-model head: project back to embedding space and score
    # against the token embedding matrix.
    mlm_projection = keras.layers.Dense(
        units=embed_dim,
        activation=feed_forward_activation,
        name='MLM-Dense',
    )(features)
    mlm_normalized = LayerNormalization(name='MLM-Norm')(mlm_projection)
    mlm_scores = EmbeddingSimilarity(name='MLM-Sim')(
        [mlm_normalized, embed_weights])
    mlm_output = Masked(name='MLM')([mlm_scores, input_layers[-1]])
    # Next-sentence-prediction head over the first (CLS) position.
    cls_features = Extract(index=0, name='Extract')(features)
    nsp_hidden = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        name='NSP-Dense',
    )(cls_features)
    nsp_output = keras.layers.Dense(
        units=2,
        activation='softmax',
        name='NSP',
    )(nsp_hidden)
    model = keras.models.Model(inputs=input_layers,
                               outputs=[mlm_output, nsp_output])
    model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.sparse_categorical_crossentropy,
        metrics=[],
    )
    return model
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              use_task_embed=False,
              task_num=10,
              use_adapter=False,
              adapter_units=None):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if
        it is `True`, otherwise the input layers and the last feature
        extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be
        concatenated as a single output. Only available when `training` is
        `False`.
    :param use_task_embed: Whether to add task embeddings to existed embeddings.
    :param task_num: The number of tasks.
    :param use_adapter: Whether to use feed-forward adapters before each
        residual connections.
    :param adapter_units: The dimension of the first transformation in
        feed-forward adapter.
    :return: The built model.
    """
    # Resolve the 'gelu' string shorthand to the callable implementation.
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    # By default the layers are trainable exactly when a training model is built.
    if trainable is None:
        trainable = training
    if adapter_units is None:
        adapter_units = max(1, embed_dim // 100)

    def _trainable(_layer):
        # `trainable` may be a collection of layer-name prefixes instead of a
        # bool; in that case only layers matching a prefix are trainable.
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    x, s, m = inputs  # token ids, segment ids, MLM mask
    # Collapse any extra leading dimensions so each input is (batch, pos_num).
    x = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Token-Reshape')(x)
    s = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Segment-Reshape')(s)
    m = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Mention-Reshape')(m)
    embed_layer, embed_weights = get_embedding(
        [x, s, m],
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if use_task_embed:
        # The task id becomes an extra model input, inserted before the MLM
        # mask input in the model's input list.
        task_input = keras.layers.Input(
            shape=(1, ),
            name='Input-Task',
        )
        embed_layer = TaskEmbedding(
            input_dim=task_num,
            output_dim=embed_dim,
            mask_zero=False,
            name='Embedding-Task',
        )([embed_layer, task_input])
        inputs = inputs[:2] + [task_input, inputs[-1]]
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=gelu,
    )
    if training:
        # MLM head: project back to embedding space and score each position
        # against the token embedding matrix.
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
            [mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        # NSP head over the first (CLS) position.
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    else:
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        if isinstance(output_layer_num, int):
            # An int selects the last `output_layer_num` encoder outputs.
            output_layer_num = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, output_layer_num + 1)]
        outputs = []
        for layer_index in output_layer_num:
            if layer_index < 0:
                layer_index = transformer_num + layer_index
            layer_index += 1  # encoder layer names are 1-based
            layer = model.get_layer(
                name='Encoder-{}-FeedForward-Norm'.format(layer_index))
            outputs.append(layer.output)
        if len(outputs) > 1:
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        else:
            transformed = outputs[0]
        return inputs, transformed
def get_model_from_embedding(inputs,
                             embed_layer,
                             transformer_num=12,
                             head_num=12,
                             feed_forward_dim=3072,
                             dropout_rate=0.1,
                             attention_activation=None,
                             feed_forward_activation='gelu',
                             trainable=None,
                             output_layer_num=1):
    """Build a BERT-style encoder stack on top of precomputed embeddings.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param inputs: raw inputs
    :param embed_layer: input embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be
        concatenated as a single output. Only available when `training` is `False`.
    :return: `(output_tensor, model)`.
    """
    from keras_transformer import get_encoders, gelu
    from keras_layer_normalization import LayerNormalization

    # Resolve the 'gelu' string shorthand to the callable implementation.
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = True

    def _is_trainable(layer):
        # `trainable` may be a collection of layer-name prefixes instead of
        # a plain bool.
        if isinstance(trainable, (list, tuple, set)):
            return any(layer.name.startswith(prefix) for prefix in trainable)
        return trainable

    # Optional dropout followed by layer normalization over the embeddings.
    normalized = embed_layer
    if dropout_rate > 0.0:
        normalized = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(normalized)
    normalized = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(normalized)
    encoded = get_encoders(
        encoder_num=transformer_num,
        input_layer=normalized,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
    )
    model = keras.models.Model(inputs=inputs, outputs=encoded)
    for layer in model.layers:
        layer.trainable = _is_trainable(layer)

    # Collect the requested encoder layer outputs; an int means "last n".
    if isinstance(output_layer_num, int):
        count = min(output_layer_num, transformer_num)
        output_layer_num = [-i for i in range(1, count + 1)]
    picked = []
    for index in output_layer_num:
        if index < 0:
            index += transformer_num
        # Encoder layer names are 1-based.
        picked.append(
            model.get_layer(
                name='Encoder-{}-FeedForward-Norm'.format(index + 1)).output)
    if len(picked) > 1:
        encoded = keras.layers.Concatenate(name='Encoder-Output')(
            list(reversed(picked)))
    else:
        encoded = picked[0]
    return encoded, model
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              weight_decay=0.01,
              attention_activation=None,
              feed_forward_activation=gelu,
              custom_layers=None,
              training=True,
              trainable=None,
              output_layer_num=1,
              decay_steps=100000,
              warmup_steps=10000,
              lr=1e-4):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param weight_decay: Weight decay rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param custom_layers: A function that takes the embedding tensor and
        returns the tensor after feature extraction. Arguments such as
        `transformer_num` and `head_num` will be ignored if `custom_layers`
        is not `None`.
    :param training: The built model will be returned if it is `True`,
        otherwise the input layers and the last feature extraction layer will
        be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be
        concatenated as a single output. Only available when `training` is
        `False`.
    :param decay_steps: Learning rate will decay linearly to zero in decay steps.
    :param warmup_steps: Learning rate will increase linearly to lr in first
        warmup steps.
    :param lr: Learning rate.
    :return: The compiled model.
    """
    # By default the layers are trainable exactly when a training model is built.
    if trainable is None:
        trainable = training
    inputs = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
        trainable=trainable,
    )
    transformed = embed_layer
    if custom_layers is not None:
        # A user-supplied feature extractor replaces the transformer stack;
        # it only receives `trainable` if its signature accepts it.
        kwargs = {}
        if keras.utils.generic_utils.has_arg(custom_layers, 'trainable'):
            kwargs['trainable'] = trainable
        transformed = custom_layers(transformed, **kwargs)
    else:
        transformed = get_encoders(
            encoder_num=transformer_num,
            input_layer=transformed,
            head_num=head_num,
            hidden_dim=feed_forward_dim,
            attention_activation=attention_activation,
            feed_forward_activation=feed_forward_activation,
            dropout_rate=dropout_rate,
            trainable=trainable,
        )
    if not training:
        # Feature-extraction mode: optionally concatenate the outputs of the
        # last `output_layer_num` encoder layers (newest last).
        if output_layer_num > 1:
            if output_layer_num > transformer_num:
                output_layer_num = transformer_num
            model = keras.models.Model(inputs=inputs[:2], outputs=transformed)
            outputs = []
            for i in range(output_layer_num):
                layer = model.get_layer(
                    name='Encoder-{}-FeedForward-Norm'.format(transformer_num -
                                                              i))
                outputs.append(layer.output)
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        return inputs[:2], transformed
    # MLM head: project back to embedding space and score each position
    # against the token embedding matrix.
    mlm_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation=feed_forward_activation,
        trainable=trainable,
        name='MLM-Dense',
    )(transformed)
    mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
    mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
        [mlm_norm_layer, embed_weights])
    masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
    # NSP head over the first (CLS) position.
    extract_layer = Extract(index=0, name='Extract')(transformed)
    nsp_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        trainable=trainable,
        name='NSP-Dense',
    )(extract_layer)
    nsp_pred_layer = keras.layers.Dense(
        units=2,
        activation='softmax',
        trainable=trainable,
        name='NSP',
    )(nsp_dense_layer)
    model = keras.models.Model(inputs=inputs,
                               outputs=[masked_layer, nsp_pred_layer])
    if weight_decay:
        # Halved, presumably to match the (lambda/2)*||w||^2 weight-decay
        # convention since keras l2(l) computes l*sum(w^2) — TODO confirm.
        weight_decay *= 0.5
        # NOTE(review): assigning regularizer attributes after the layers are
        # already built may not register the penalty losses with the model in
        # all Keras versions — verify this actually takes effect.
        for layer in model.layers:
            if hasattr(layer, 'embeddings_regularizer'):
                layer.embeddings_regularizer = keras.regularizers.l2(
                    weight_decay)
            if hasattr(layer, 'kernel_regularizer'):
                layer.kernel_regularizer = keras.regularizers.l2(weight_decay)
    model.compile(
        optimizer=AdamWarmup(decay_steps=decay_steps,
                             warmup_steps=warmup_steps,
                             lr=lr),
        loss=keras.losses.sparse_categorical_crossentropy,
    )
    return model
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              weight_decay=0.01,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              decay_steps=100000,
              warmup_steps=10000,
              lr=1e-4):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param weight_decay: Weight decay rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if
        it is `True`, otherwise the input layers and the last feature
        extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be
        concatenated as a single output. Only available when `training` is
        `False`.
    :param decay_steps: Learning rate will decay linearly to zero in decay steps.
    :param warmup_steps: Learning rate will increase linearly to lr in first
        warmup steps.
    :param lr: Learning rate.
    :return: The compiled model.
    """
    # Resolve the 'gelu' string shorthand to the callable implementation.
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    # By default the layers are trainable exactly when a training model is built.
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        # `trainable` may be a collection of layer-name prefixes instead of a
        # bool; in that case only layers matching a prefix are trainable.
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
    )
    if training:
        # MLM head: project back to embedding space and score each position
        # against the token embedding matrix.
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
            [mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        # NSP head over the first (CLS) position.
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        # Decoupled weight decay is applied inside the optimizer, restricted
        # to weights whose names match the listed patterns.
        model.compile(
            optimizer=AdamWarmup(
                decay_steps=decay_steps,
                warmup_steps=warmup_steps,
                lr=lr,
                weight_decay=weight_decay,
                weight_decay_pattern=[
                    'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
                ],
            ),
            loss=keras.losses.sparse_categorical_crossentropy,
        )
        return model
    else:
        # Feature-extraction mode: drop the MLM mask input and optionally
        # concatenate the outputs of the last `output_layer_num` encoders.
        inputs = inputs[:2]
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        output_layer_num = min(output_layer_num, transformer_num)
        if output_layer_num > 1:
            outputs = []
            for i in range(output_layer_num):
                layer = model.get_layer(
                    name='Encoder-{}-FeedForward-Norm'.format(transformer_num -
                                                              i))
                outputs.append(layer.output)
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        return inputs, transformed
def transformer_models(trainX, trainy, valX, valy, embedding, vocab, maxlen,
                       head_num, encoder_num, hidden_dim, project_name):
    """Train a transformer-encoder + multi-width CNN classifier and evaluate
    it on the validation split.

    BUG FIX: the transformer encoder output was previously assigned and then
    discarded — all three Conv1D branches read the raw embeddings, so the
    encoder stack was dead code. The convolutions now consume the encoded
    features. Also removed a debug `print` and the shadowing of the builtin
    `input`.

    :param trainX: training token-id matrix, shape (samples, maxlen).
    :param trainy: training labels (integer classes 0/1).
    :param valX: validation token-id matrix.
    :param valy: validation labels.
    :param embedding: pretrained embedding matrix, shape (len(vocab) + 1, 300).
    :param vocab: vocabulary; only its length is used here.
    :param maxlen: padded input sequence length.
    :param head_num: number of attention heads per encoder.
    :param encoder_num: number of stacked encoders.
    :param hidden_dim: feed-forward dimension inside each encoder (also used
        in the checkpoint filename).
    :param project_name: prefix for the checkpoint filename.
    :return: (f1, precision, recall) on the validation split.
    """
    # Import from keras instead of tensorflow.keras to avoid the runtime
    # problem in this model.
    from keras.layers import Input, MaxPooling1D, Flatten, Dense, Embedding, SpatialDropout1D, Dropout, Conv1D
    from keras.models import Model
    from keras.utils import to_categorical
    from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
    import keras

    input_layer = Input(shape=(maxlen, ), dtype="int32")
    # Frozen pretrained word embeddings plus additive position embeddings.
    x = Embedding(len(vocab) + 1, 300, weights=[embedding],
                  trainable=False)(input_layer)
    x = PositionEmbedding(120, 300, "add")(x)
    encoded = get_encoders(encoder_num=encoder_num,
                           input_layer=x,
                           head_num=head_num,
                           hidden_dim=hidden_dim,
                           attention_activation="relu",
                           dropout_rate=0.1)
    # Three parallel convolution branches with filter widths 2, 3 and 4,
    # each followed by max pooling, applied to the ENCODED features.
    cnn1 = Conv1D(32, 2, padding="same", strides=1, activation="relu")(encoded)
    cnn1 = MaxPooling1D()(cnn1)
    cnn2 = Conv1D(32, 3, padding="same", strides=1, activation="relu")(encoded)
    cnn2 = MaxPooling1D()(cnn2)
    cnn3 = Conv1D(32, 4, padding="same", strides=1, activation="relu")(encoded)
    cnn3 = MaxPooling1D()(cnn3)
    features = keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
    output = Flatten()(features)
    output = Dropout(0.2)(output)
    output = Dense(2, activation="softmax")(output)

    train_y = to_categorical(trainy, 2)
    val_y = to_categorical(valy, 2)
    model = Model(inputs=input_layer, outputs=output)
    path = "../data/experiment_results/RQ5/model/" + project_name + str(
        hidden_dim) + ".h5"
    # Keep only the best checkpoint (by validation loss) on disk.
    callbacks = [EarlyStopping(), ModelCheckpoint(path, save_best_only=True)]
    model.compile(optimizer=Adam(0.005),
                  loss="binary_crossentropy",
                  metrics=[km.recall(), km.precision(), km.f1_score()])
    model.fit(trainX,
              train_y,
              batch_size=64,
              epochs=5,
              callbacks=callbacks,
              validation_data=(valX, val_y))
    # Evaluate with sklearn-style metrics on hard class predictions.
    pred = model.predict(valX).argmax(-1)
    f1 = f1_score(valy, pred)
    precision = precision_score(valy, pred)
    recall = recall_score(valy, pred)
    return f1, precision, recall
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1):
    """Build the BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: When `True` a model with MLM and NSP outputs is returned;
        otherwise the input layers and the last feature-extraction tensor.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be
        concatenated as a single output. Only available when `training` is `False`.
    :return: The built model, or `(inputs, output_tensor)` when not training.
    """
    # Resolve the 'gelu' string shorthand to the callable implementation.
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _is_layer_trainable(layer):
        # A collection means "trainable iff the layer name matches a prefix".
        if isinstance(trainable, (list, tuple, set)):
            return any(layer.name.startswith(prefix) for prefix in trainable)
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    encoded = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
    )
    if not training:
        # Feature-extraction mode: drop the MLM mask input and expose the
        # requested encoder layer outputs (an int means "last n").
        inputs = inputs[:2]
        model = keras.models.Model(inputs=inputs, outputs=encoded)
        for layer in model.layers:
            layer.trainable = _is_layer_trainable(layer)
        if isinstance(output_layer_num, int):
            count = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, count + 1)]
        picked = []
        for index in output_layer_num:
            if index < 0:
                index += transformer_num
            # Encoder layer names are 1-based.
            picked.append(
                model.get_layer(
                    name='Encoder-{}-FeedForward-Norm'.format(index +
                                                              1)).output)
        if len(picked) > 1:
            merged = keras.layers.Concatenate(name='Encoder-Output')(
                list(reversed(picked)))
        else:
            merged = picked[0]
        return inputs, merged
    # Training mode: attach MLM and NSP heads.
    mlm_projection = keras.layers.Dense(
        units=embed_dim,
        activation=feed_forward_activation,
        name='MLM-Dense',
    )(encoded)
    mlm_normalized = LayerNormalization(name='MLM-Norm')(mlm_projection)
    mlm_scores = EmbeddingSimilarity(name='MLM-Sim')(
        [mlm_normalized, embed_weights])
    mlm_output = Masked(name='MLM')([mlm_scores, inputs[-1]])
    cls_vector = Extract(index=0, name='Extract')(encoded)
    nsp_hidden = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        name='NSP-Dense',
    )(cls_vector)
    nsp_output = keras.layers.Dense(
        units=2,
        activation='softmax',
        name='NSP',
    )(nsp_hidden)
    model = keras.models.Model(inputs=inputs, outputs=[mlm_output, nsp_output])
    for layer in model.layers:
        layer.trainable = _is_layer_trainable(layer)
    return model