Example #1
    def __init__(self,
                 num_layers,
                 d_model,
                 num_heads,
                 dff,
                 target_vocab_size,
                 maximum_position_encoding,
                 rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.maximum_position_encoding = maximum_position_encoding

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = PositionEmbedding(
            use_dynamic_slicing=True,
            max_sequence_length=self.maximum_position_encoding,
            name="decoder/position_embedding")

        self.dec_layers = [
            DecoderLayer(d_model, num_heads, dff, rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(rate)
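Example #1 only shows the constructor of this Transformer-style Decoder. As a minimal sketch (not from the original source; the argument values are arbitrary, and DecoderLayer and PositionEmbedding are assumed to be defined elsewhere in the same codebase), constructing it could look like:

# Hypothetical instantiation of the Decoder shown above; values are illustrative only.
decoder = Decoder(num_layers=4,
                  d_model=128,
                  num_heads=8,
                  dff=512,
                  target_vocab_size=8000,
                  maximum_position_encoding=1000,
                  rate=0.1)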
Example #2
def QANet(config, word_mat=None, char_mat=None, cove_model=None):
    ## Data preprocessing: each question is one example. When several questions share the same
    ## context, the context is duplicated so that every copy is paired with exactly one question.
    ## The context and the question are then fed into the model separately and combined through
    ## the various attention computations, just like the data built in the fit_demo example.
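    ## For illustration only (hypothetical token ids): one context with two questions becomes two
    ## separate examples, (context_tokens, question_1_tokens) and (context_tokens, question_2_tokens),
    ## so the same context rows appear more than once in a batch.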
    # parameters
    word_dim = config['word_dim']
    char_dim = config['char_dim']
    cont_limit = config['cont_limit']
    char_limit = config['char_limit']
    ans_limit = config['ans_limit']
    filters = config['filters']
    num_head = config['num_head']
    dropout = config['dropout']

    # Input Embedding Layer
    contw_input_ = Input((None, ))
    quesw_input_ = Input((None, ))
    contc_input_ = Input((None, char_limit))
    quesc_input_ = Input((None, char_limit))

    # get mask
    c_mask = Lambda(lambda x: tf.cast(x, tf.bool))(contw_input_)  # [bs, c_len]
    q_mask = Lambda(lambda x: tf.cast(x, tf.bool))(quesw_input_)
    cont_len = Lambda(lambda x: tf.expand_dims(
        tf.reduce_sum(tf.cast(x, tf.int32), axis=1), axis=1))(c_mask)
    ques_len = Lambda(lambda x: tf.expand_dims(
        tf.reduce_sum(tf.cast(x, tf.int32), axis=1), axis=1))(q_mask)

    # slice
    contw_input = BatchSlice(dim=2)([contw_input_, cont_len])
    quesw_input = BatchSlice(dim=2)([quesw_input_, ques_len])
    contc_input = BatchSlice(dim=3)([contc_input_, cont_len])
    quesc_input = BatchSlice(dim=3)([quesc_input_, ques_len])
    c_mask = BatchSlice(dim=2)([c_mask, cont_len])
    q_mask = BatchSlice(dim=2)([q_mask, ques_len])
    c_maxlen = tf.cast(tf.reduce_max(cont_len), tf.int32)
    q_maxlen = tf.cast(tf.reduce_max(ques_len), tf.int32)

    # embedding word
    WordEmbedding = Embedding(word_mat.shape[0],
                              word_dim,
                              weights=[word_mat],
                              trainable=False,
                              name='word_embedding')
    xw_cont = WordEmbedding(contw_input)
    xw_ques = WordEmbedding(quesw_input)

    # cove
    if cove_model is not None:
        x_cont_cove = cove_model(xw_cont)
        x_ques_cove = cove_model(xw_ques)
        xw_cont = Concatenate()([xw_cont, x_cont_cove])
        xw_ques = Concatenate()([xw_ques, x_ques_cove])

    # embedding char
    CharEmbedding = Embedding(char_mat.shape[0],
                              char_dim,
                              weights=[char_mat],
                              name='char_embedding')
    xc_cont = CharEmbedding(contc_input)
    xc_ques = CharEmbedding(quesc_input)
    char_conv = Conv1D(filters,
                       5,
                       activation='relu',
                       kernel_initializer=init_relu,
                       kernel_regularizer=regularizer,
                       name='char_conv')
    xc_cont = Lambda(lambda x: tf.reshape(x, (-1, char_limit, char_dim)))(
        xc_cont)
    xc_ques = Lambda(lambda x: tf.reshape(x, (-1, char_limit, char_dim)))(
        xc_ques)
    xc_cont = char_conv(xc_cont)
    xc_ques = char_conv(xc_ques)
    xc_cont = GlobalMaxPooling1D()(xc_cont)
    xc_ques = GlobalMaxPooling1D()(xc_ques)
    xc_cont = Lambda(lambda x: tf.reshape(x, (-1, c_maxlen, filters)))(xc_cont)
    xc_ques = Lambda(lambda x: tf.reshape(x, (-1, q_maxlen, filters)))(xc_ques)

    # highwayNet
    x_cont = Concatenate()([xw_cont, xc_cont])
    x_ques = Concatenate()([xw_ques, xc_ques])

    # highway shared layers
    highway_layers = [
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               name='highway_input_projection')
    ]
    for i in range(2):
        highway_layers.append(
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='sigmoid',
                   name='highway' + str(i) + '_gate'))
        highway_layers.append(
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='linear',
                   name='highway' + str(i) + '_linear'))
    x_cont = highway(highway_layers, x_cont, num_layers=2, dropout=dropout)
    x_ques = highway(highway_layers, x_ques, num_layers=2, dropout=dropout)

    # build shared layers
    # shared convs
    Encoder_DepthwiseConv1 = []
    for i in range(4):
        Encoder_DepthwiseConv1.append(DepthwiseConv1D(7, filters))

    # shared attention
    Encoder_SelfAttention1 = [
        Conv1D(2 * filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer),
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer),
        MultiHeadAttention(filters, num_head, dropout=dropout, bias=False)
    ]
    # shared feed-forward
    Encoder_FeedForward1 = []
    Encoder_FeedForward1.append(
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               activation='relu'))
    Encoder_FeedForward1.append(
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               activation='linear'))

    # Context Embedding Encoder Layer
    x_cont = PositionEmbedding()(x_cont)
    x_cont = conv_block(Encoder_DepthwiseConv1, x_cont, 4, dropout)
    x_cont = attention_block(Encoder_SelfAttention1, x_cont, c_mask, dropout)
    x_cont = feed_forward_block(Encoder_FeedForward1, x_cont, dropout)

    # Question Embedding Encoder Layer
    x_ques = PositionEmbedding()(x_ques)
    x_ques = conv_block(Encoder_DepthwiseConv1, x_ques, 4, dropout)
    x_ques = attention_block(Encoder_SelfAttention1, x_ques, q_mask, dropout)
    x_ques = feed_forward_block(Encoder_FeedForward1, x_ques, dropout)

    # Context_to_Query_Attention_Layer
    # context2query_attention(output_dim, c_maxlen, q_maxlen, dropout) is called on
    # [x_cont, x_ques, c_mask, q_mask]; its output x has shape (batch_size, context_length, 512)
    x = context2query_attention(512, c_maxlen, q_maxlen,
                                dropout)([x_cont, x_ques, c_mask, q_mask])

    x = Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               activation='linear')(x)

    # Model_Encoder_Layer
    # shared layers
    Encoder_DepthwiseConv2 = []
    Encoder_SelfAttention2 = []
    Encoder_FeedForward2 = []
    for i in range(7):
        DepthwiseConv_share_2_temp = []
        for _ in range(2):
            DepthwiseConv_share_2_temp.append(DepthwiseConv1D(5, filters))

        Encoder_DepthwiseConv2.append(DepthwiseConv_share_2_temp)
        Encoder_SelfAttention2.append([
            Conv1D(2 * filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer),
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer),
            MultiHeadAttention(filters, num_head, dropout=dropout, bias=False)
        ])
        Encoder_FeedForward2.append([
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='relu'),
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='linear')
        ])

    ## x is the context-to-query attention output after the 1x1 Conv1D projection above
    outputs = [x]
    for i in range(3):
        x = outputs[-1]
        for j in range(7):
            x = PositionEmbedding()(x)
            x = conv_block(Encoder_DepthwiseConv2[j], x, 2, dropout, l=j, L=7)
            x = attention_block(Encoder_SelfAttention2[j],
                                x,
                                c_mask,
                                dropout,
                                l=j,
                                L=7)
            x = feed_forward_block(Encoder_FeedForward2[j],
                                   x,
                                   dropout,
                                   l=j,
                                   L=7)
        outputs.append(x)

    # Output_Layer
    x_start = Concatenate()([outputs[1], outputs[2]])
    x_start = Conv1D(1,
                     1,
                     kernel_initializer=init,
                     kernel_regularizer=regularizer,
                     activation='linear')(x_start)
    x_start = Lambda(lambda x: tf.squeeze(x, axis=-1))(x_start)
    x_start = Lambda(lambda x: mask_logits(x[0], x[1]))([x_start, c_mask])

    x_start = Lambda(lambda x: K.softmax(x),
                     name='start')(x_start)  # [bs, len]

    x_end = Concatenate()([outputs[1], outputs[3]])
    x_end = Conv1D(1,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='linear')(x_end)
    x_end = Lambda(lambda x: tf.squeeze(x, axis=-1))(x_end)
    x_end = Lambda(lambda x: mask_logits(x[0], x[1]))([x_end, c_mask])
    x_end = Lambda(lambda x: K.softmax(x), name='end')(x_end)  # [bs, len]

    x_start_fin, x_end_fin = QAoutputBlock(ans_limit,
                                           name='qa_output')([x_start, x_end])

    # if using model.fit, the output must be padded to the maximum context length
    x_start = LabelPadding(cont_limit, name='start_pos')(x_start)
    x_end = LabelPadding(cont_limit, name='end_pos')(x_end)

    return Model(
        inputs=[contw_input_, quesw_input_, contc_input_, quesc_input_],
        outputs=[x_start, x_end, x_start_fin, x_end_fin])
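A minimal usage sketch (not part of the original code) of how the returned model might be compiled and trained with model.fit; the optimizer, the categorical cross-entropy losses on the padded 'start_pos'/'end_pos' outputs, and the random toy arrays are assumptions for illustration only:

# Hypothetical usage; QANet, config, word_mat and char_mat are assumed to be
# available exactly as in the example above.
import numpy as np

model = QANet(config, word_mat=word_mat, char_mat=char_mat)
model.compile(optimizer='adam',
              loss={'start_pos': 'categorical_crossentropy',
                    'end_pos': 'categorical_crossentropy'})

# Toy batch: word ids padded to cont_limit, char ids padded to char_limit.
bs = 2
contw = np.random.randint(1, word_mat.shape[0], (bs, config['cont_limit']))
quesw = np.random.randint(1, word_mat.shape[0], (bs, 50))
contc = np.random.randint(1, char_mat.shape[0], (bs, config['cont_limit'], config['char_limit']))
quesc = np.random.randint(1, char_mat.shape[0], (bs, 50, config['char_limit']))

# One-hot start/end targets padded to cont_limit, matching the LabelPadding outputs.
start = np.zeros((bs, config['cont_limit'])); start[:, 0] = 1.0
end = np.zeros((bs, config['cont_limit'])); end[:, 1] = 1.0

model.fit([contw, quesw, contc, quesc],
          {'start_pos': start, 'end_pos': end},
          epochs=1)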
Example #3
def QANet(config, word_mat=None, char_mat=None, cove_model=None):
    # parameters
    word_dim = config['word_dim']
    char_dim = config['char_dim']
    cont_limit = config['cont_limit']
    char_limit = config['char_limit']
    ans_limit = config['ans_limit']
    filters = config['filters']
    num_head = config['num_head']
    dropout = config['dropout']

    # Input Embedding Layer
    # `Input()` is used to instantiate a Keras tensor.
    contw_input_ = Input((None, ))
    quesw_input_ = Input((None, ))
    contc_input_ = Input((None, char_limit))
    quesc_input_ = Input((None, char_limit))

    # get mask
    c_mask = Lambda(lambda x: tf.cast(x, tf.bool))(contw_input_)  # [bs, c_len]
    q_mask = Lambda(lambda x: tf.cast(x, tf.bool))(quesw_input_)
    cont_len = Lambda(lambda x: tf.expand_dims(
        tf.reduce_sum(tf.cast(x, tf.int32), axis=1), axis=1))(c_mask)
    ques_len = Lambda(lambda x: tf.expand_dims(
        tf.reduce_sum(tf.cast(x, tf.int32), axis=1), axis=1))(q_mask)

    # slice
    contw_input = BatchSlice(dim=2)([contw_input_, cont_len])
    quesw_input = BatchSlice(dim=2)([quesw_input_, ques_len])
    contc_input = BatchSlice(dim=3)([contc_input_, cont_len])
    quesc_input = BatchSlice(dim=3)([quesc_input_, ques_len])
    c_mask = BatchSlice(dim=2)([c_mask, cont_len])
    q_mask = BatchSlice(dim=2)([q_mask, ques_len])
    c_maxlen = tf.cast(tf.reduce_max(cont_len), tf.int32)
    q_maxlen = tf.cast(tf.reduce_max(ques_len), tf.int32)

    # embedding word
    WordEmbedding = Embedding(word_mat.shape[0],
                              word_dim,
                              weights=[word_mat],
                              trainable=False,
                              name='word_embedding')
    xw_cont = WordEmbedding(contw_input)
    xw_ques = WordEmbedding(quesw_input)

    # cove
    if cove_model is not None:
        x_cont_cove = cove_model(xw_cont)
        x_ques_cove = cove_model(xw_ques)
        xw_cont = Concatenate()([xw_cont, x_cont_cove])
        xw_ques = Concatenate()([xw_ques, x_ques_cove])

    # embedding char
    CharEmbedding = Embedding(char_mat.shape[0],
                              char_dim,
                              weights=[char_mat],
                              name='char_embedding')
    xc_cont = CharEmbedding(contc_input)
    xc_ques = CharEmbedding(quesc_input)
    char_conv = Conv1D(filters,
                       5,
                       activation='relu',
                       kernel_initializer=init_relu,
                       kernel_regularizer=regularizer,
                       name='char_conv')
    xc_cont = Lambda(lambda x: tf.reshape(x, (-1, char_limit, char_dim)))(
        xc_cont)
    xc_ques = Lambda(lambda x: tf.reshape(x, (-1, char_limit, char_dim)))(
        xc_ques)
    xc_cont = char_conv(xc_cont)
    xc_ques = char_conv(xc_ques)
    xc_cont = GlobalMaxPooling1D()(xc_cont)
    xc_ques = GlobalMaxPooling1D()(xc_ques)
    xc_cont = Lambda(lambda x: tf.reshape(x, (-1, c_maxlen, filters)))(xc_cont)
    xc_ques = Lambda(lambda x: tf.reshape(x, (-1, q_maxlen, filters)))(xc_ques)

    # highwayNet
    x_cont = Concatenate()([xw_cont, xc_cont])
    x_ques = Concatenate()([xw_ques, xc_ques])

    # highway shared layers
    highway_layers = [
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               name='highway_input_projection')
    ]
    for i in range(2):
        highway_layers.append(
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='sigmoid',
                   name='highway' + str(i) + '_gate'))
        highway_layers.append(
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='linear',
                   name='highway' + str(i) + '_linear'))
    x_cont = highway(highway_layers, x_cont, num_layers=2, dropout=dropout)
    x_ques = highway(highway_layers, x_ques, num_layers=2, dropout=dropout)

    # build shared layers
    # shared convs
    Encoder_DepthwiseConv1 = []
    for i in range(4):
        Encoder_DepthwiseConv1.append(DepthwiseConv1D(7, filters))

    # shared attention
    Encoder_SelfAttention1 = [
        Conv1D(2 * filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer),
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer),
        MultiHeadAttention(filters, num_head, dropout=dropout, bias=False)
    ]
    # shared feed-forward
    Encoder_FeedForward1 = []
    Encoder_FeedForward1.append(
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               activation='relu'))
    Encoder_FeedForward1.append(
        Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               activation='linear'))

    # Context Embedding Encoder Layer
    x_cont = PositionEmbedding()(x_cont)
    x_cont = conv_block(Encoder_DepthwiseConv1, x_cont, 4, dropout)
    x_cont = attention_block(Encoder_SelfAttention1, x_cont, c_mask, dropout)
    x_cont = feed_forward_block(Encoder_FeedForward1, x_cont, dropout)

    # Question Embedding Encoder Layer
    x_ques = PositionEmbedding()(x_ques)
    x_ques = conv_block(Encoder_DepthwiseConv1, x_ques, 4, dropout)
    x_ques = attention_block(Encoder_SelfAttention1, x_ques, q_mask, dropout)
    x_ques = feed_forward_block(Encoder_FeedForward1, x_ques, dropout)

    print('x_cont={}\n  x_ques={}\n  c_mask={}\n  q_mask={}\n'.format(
        x_cont, x_ques, c_mask, q_mask))

    # Context_to_Query_Attention_Layer
    ## the layer is constructed with 512, c_maxlen, q_maxlen and dropout, and is called on [x_cont, x_ques, c_mask, q_mask]
    #x_shape=(batch_size, context_length, 512)
    x = context2query_attention(512, c_maxlen, q_maxlen,
                                dropout)([x_cont, x_ques, c_mask, q_mask])

    print('Context_to_Query_Attention_Layer x', x)
    x = Conv1D(filters,
               1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               activation='linear')(x)

    print('conv1d x', x)
    # Model_Encoder_Layer
    # shared layers
    Encoder_DepthwiseConv2 = []
    Encoder_SelfAttention2 = []
    Encoder_FeedForward2 = []
    for i in range(7):
        DepthwiseConv_share_2_temp = []
        for _ in range(2):
            DepthwiseConv_share_2_temp.append(DepthwiseConv1D(5, filters))

        Encoder_DepthwiseConv2.append(DepthwiseConv_share_2_temp)
        Encoder_SelfAttention2.append([
            Conv1D(2 * filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer),
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer),
            MultiHeadAttention(filters, num_head, dropout=dropout, bias=False)
        ])
        Encoder_FeedForward2.append([
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='relu'),
            Conv1D(filters,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='linear')
        ])

    outputs = [x]
    for i in range(3):
        x = outputs[-1]
        for j in range(7):
            x = PositionEmbedding()(x)
            x = conv_block(Encoder_DepthwiseConv2[j], x, 2, dropout, l=j, L=7)
            x = attention_block(Encoder_SelfAttention2[j],
                                x,
                                c_mask,
                                dropout,
                                l=j,
                                L=7)
            x = feed_forward_block(Encoder_FeedForward2[j],
                                   x,
                                   dropout,
                                   l=j,
                                   L=7)
        outputs.append(x)

    print('outputs', outputs)
    # Output_Layer
    x_start = Concatenate()([outputs[1], outputs[2]])
    print('output_layer x_start', x_start)
    '''
    keras.layers.Conv1D(filters, kernel_size, strides=1, padding='valid',
                        data_format='channels_last', dilation_rate=1, activation=None,
                        use_bias=True, kernel_initializer='glorot_uniform',
                        bias_initializer='zeros', kernel_regularizer=None,
                        bias_regularizer=None, activity_regularizer=None,
                        kernel_constraint=None, bias_constraint=None)

    Input shape:
        3D tensor with shape: `(batch_size, time_steps, input_dim)`

    Output shape:
        3D tensor with shape: `(batch_size, new_steps, filters)`
        `steps` may change because of padding or strides.

    This also explains why Conv1D can be used for natural language processing in Keras.
    Suppose a sequence has 600 words and each word vector is 300-dimensional, so one sequence
    enters the network as (600, 300). Conv1D then convolves directly along the sequence: with
    kernel_size=3 the effective kernel is (3, 300), and since every row is a word vector,
    Conv1D(kernel_size=3) amounts to extracting n-gram (n=3) features with the network. That is
    why convolutional networks are fast and effective on text.

    Conv1D(kernel_size=3) is effectively Conv2D(kernel_size=(3, 300)) with the input reshaped to
    (600, 300, 1), so the 2D convolution slides over the rows. The kernel_size=1 used here is
    therefore the Conv2D case of (1, embedding_dim). (See the shape-check sketch after this
    example.)
    '''
    x_start = Conv1D(1,
                     1,
                     kernel_initializer=init,
                     kernel_regularizer=regularizer,
                     activation='linear')(x_start)
    print('conv1D x_start', x_start)

    # squeeze out the trailing size-1 dimension (axis=-1) of the tensor
    x_start = Lambda(lambda x: tf.squeeze(x, axis=-1))(x_start)
    print('squeeze x_start', x_start)

    ## mask_logits returns a tensor with the same shape as its input
    x_start = Lambda(lambda x: mask_logits(x[0], x[1]))([x_start, c_mask])
    print('mask_logits x_start', x_start)

    ## the x_start produced below has already been passed through softmax
    '''
    softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
    Returns:
    A `Tensor`. Has the same type and shape as `logits`.
    '''
    x_start = Lambda(lambda x: K.softmax(x),
                     name='start')(x_start)  # [bs, len]
    print('x_start softmax', x_start)

    x_end = Concatenate()([outputs[1], outputs[3]])
    x_end = Conv1D(1,
                   1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='linear')(x_end)
    x_end = Lambda(lambda x: tf.squeeze(x, axis=-1))(x_end)
    x_end = Lambda(lambda x: mask_logits(x[0], x[1]))([x_end, c_mask])
    x_end = Lambda(lambda x: K.softmax(x), name='end')(x_end)  # [bs, len]

    x_start_fin, x_end_fin = QAoutputBlock(ans_limit,
                                           name='qa_output')([x_start, x_end])

    # if using model.fit, the output must be padded to the maximum context length
    x_start = LabelPadding(cont_limit, name='start_pos')(x_start)
    x_end = LabelPadding(cont_limit, name='end_pos')(x_end)
    print('x_start  x_start_fin x_end x_end_fin ', x_start, x_start_fin, x_end,
          x_end_fin)
    return Model(
        inputs=[contw_input_, quesw_input_, contc_input_, quesc_input_],
        outputs=[x_start, x_end, x_start_fin, x_end_fin])
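The following small, self-contained shape check (not part of the original model) illustrates the point made in the docstring above: Conv1D(kernel_size=3) over a (600, 300) word sequence behaves like an n-gram (n=3) feature extractor, and a Conv2D with a (3, 300) kernel over the reshaped input yields the same number of output steps. The 600/300/128 sizes are arbitrary illustration values:

import tensorflow as tf

# One sequence of 600 words, each a 300-dim word vector (batch size 1).
seq = tf.random.normal((1, 600, 300))

# Conv1D with kernel_size=3 slides over the word axis: each output step mixes
# 3 consecutive word vectors, i.e. an n-gram (n=3) feature.
out1d = tf.keras.layers.Conv1D(128, 3, padding='valid')(seq)
print(out1d.shape)  # (1, 598, 128)

# The equivalent Conv2D view: reshape to (600, 300, 1) and use a (3, 300) kernel,
# which also collapses the embedding axis in a single step.
seq2d = tf.reshape(seq, (1, 600, 300, 1))
out2d = tf.keras.layers.Conv2D(128, (3, 300), padding='valid')(seq2d)
print(out2d.shape)  # (1, 598, 1, 128)

# kernel_size=1, as used in the model above, is the (1, embedding_dim) case: a
# per-position projection of each word vector, with no mixing across neighbours.
out_pointwise = tf.keras.layers.Conv1D(128, 1)(seq)
print(out_pointwise.shape)  # (1, 600, 128)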
Example #4
  def __init__(
      self,
      vocab_size,
      output_dim,
      hidden_size=128,
      num_layers=12,
      num_attention_heads=4,
      max_sequence_length=150,
      inner_dim=512,
      inner_activation=lambda x: gelu(x, approximate=True),
      output_dropout=0.1,
      attention_dropout=0.1,
      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      output_range=None,
      embedding_width=None,
      embedding_layer=None,
      **kwargs):
    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')

    if embedding_width is None:
      embedding_width = hidden_size

    if embedding_layer is None:
      embedding_layer_inst = OnDeviceEmbedding(
          vocab_size=vocab_size,
          embedding_width=embedding_width,
          initializer=initializer,
          name='word_embeddings')
    else:
      embedding_layer_inst = embedding_layer
    word_embeddings = embedding_layer_inst(word_ids)

    # Always uses dynamic slicing for simplicity.
    position_embedding_layer = PositionEmbedding(
        initializer=initializer,
        max_length=max_sequence_length,
        name='position_embedding')
    position_embeddings = position_embedding_layer(word_embeddings)

    embeddings = tf.keras.layers.Add()(
        [word_embeddings, position_embeddings])

    embedding_norm_layer = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)

    embeddings = embedding_norm_layer(embeddings)
    embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    if embedding_width != hidden_size:
      embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')
      embeddings = embedding_projection(embeddings)
    else:
      embedding_projection = None

    transformer_layers = []
    data = embeddings
    attention_mask = SelfAttentionMask()(data, mask)
    encoder_outputs = []
    for i in range(num_layers):
      if i == num_layers - 1 and output_range is not None:
        transformer_output_range = output_range
      else:
        transformer_output_range = None
      layer = TransformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='transformer/layer_%d' % i)
      transformer_layers.append(layer)
      data = layer([data, attention_mask])
      encoder_outputs.append(data)

    last_encoder_output = encoder_outputs[-1]
    # Applying a tf.slice op (through subscript notation) to a Keras tensor
    # like this will create a SliceOpLambda layer. This is better than a Lambda
    # layer with Python code, because that is fundamentally less portable.
    first_token_tensor = last_encoder_output[:, 0, :]
    pooler_layer = tf.keras.layers.Dense(
        units=hidden_size,
        activation='tanh',
        kernel_initializer=initializer,
        name='pooler_transform')
    cls_output = pooler_layer(first_token_tensor)

    dense_logits = tf.keras.layers.Dense(output_dim, activation='linear')
    logits = dense_logits(last_encoder_output)

    outputs = dict(
        sequence_output=encoder_outputs[-1],
        pooled_output=cls_output,
        encoder_outputs=encoder_outputs,
        logits=logits
    )

    # Once we've created the network using the Functional API, we call
    # super().__init__ as though we were invoking the Functional API Model
    # constructor, resulting in this object having all the properties of a model
    # created using the Functional API. Once super().__init__ is called, we
    # can assign attributes to `self` - note that all `self` assignments are
    # below this line.
    super(BertEncoder, self).__init__(
        inputs=[word_ids, mask], outputs=outputs, **kwargs)

    config_dict = {
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'num_attention_heads': num_attention_heads,
        'max_sequence_length': max_sequence_length,
        'inner_dim': inner_dim,
        'inner_activation': tf.keras.activations.serialize(activation),
        'output_dropout': output_dropout,
        'attention_dropout': attention_dropout,
        'initializer': tf.keras.initializers.serialize(initializer),
        'output_range': output_range,
        'embedding_width': embedding_width,
        'embedding_layer': embedding_layer,
    }

    # We are storing the config dict as a namedtuple here to ensure checkpoint
    # compatibility with an earlier version of this model which did not track
    # the config dict attribute. TF does not track immutable attrs which
    # do not contain Trackables, so by creating a config namedtuple instead of
    # a dict we avoid tracking it.
    config_cls = collections.namedtuple('Config', config_dict.keys())
    self._config = config_cls(**config_dict)
    self._pooler_layer = pooler_layer
    self._transformer_layers = transformer_layers
    self._embedding_norm_layer = embedding_norm_layer
    self._embedding_layer = embedding_layer_inst
    self._position_embedding_layer = position_embedding_layer
    if embedding_projection is not None:
      self._embedding_projection = embedding_projection
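A minimal sketch (not from the original source) of how this encoder might be instantiated and run on a dummy batch. It assumes the class is named BertEncoder, as in the super() call above, and that gelu, OnDeviceEmbedding, PositionEmbedding, SelfAttentionMask and TransformerEncoderBlock are importable from the same codebase; the vocabulary size, batch and sequence sizes are illustrative only:

import numpy as np

# Hypothetical usage; BertEncoder is the class whose __init__ is shown above.
encoder = BertEncoder(vocab_size=30522, output_dim=2, hidden_size=128, num_layers=2)

word_ids = np.random.randint(1, 30522, size=(4, 16)).astype(np.int32)  # [batch, seq_len]
mask = np.ones_like(word_ids)                                          # all tokens are real

outputs = encoder([word_ids, mask])
print(outputs['sequence_output'].shape)  # (4, 16, 128)
print(outputs['pooled_output'].shape)    # (4, 128)
print(outputs['logits'].shape)           # (4, 16, 2)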