def test_mask_loss(self):
        def _loss(y_true, _):
            return K.sum(y_true, axis=-1)

        inputs = [keras.layers.Input((5, )), keras.layers.Input((5, ))]
        embed = keras.layers.Embedding(input_dim=2,
                                       output_dim=3,
                                       mask_zero=True)(inputs[0])
        masked = Masked()([embed, inputs[1]])

        model = keras.models.Model(inputs, masked)
        model.compile(
            optimizer='sgd',
            loss=_loss,
        )

        token_input = np.array([
            [1, 1, 1, 0, 0],
            [1, 1, 1, 1, 0],
        ])
        mask_input = np.array([
            [0, 1, 0, 0, 0],
            [1, 0, 0, 0, 0],
        ])
        outputs = np.arange(30).reshape((2, 5, 3))
        if TF_KERAS:
            expected = 6.0
        else:
            expected = 30.0
        self.assertAlmostEqual(expected,
                               model.evaluate([token_input, mask_input],
                                              outputs),
                               places=3)
Example #2
    def test_mask_loss(self):
        def _loss(y_true, _):
            return K.sum(y_true, axis=-1)

        inputs = [keras.layers.Input((5,)), keras.layers.Input((5,))]
        embed = keras.layers.Embedding(input_dim=2, output_dim=3, mask_zero=True)(inputs[0])
        masked = Masked()([embed, inputs[1]])

        model = keras.models.Model(inputs, masked)
        model.compile(
            optimizer='sgd',
            loss=_loss,
        )

        token_input = np.array([
            [1, 1, 1, 0, 0],
            [1, 1, 1, 1, 0],
        ])
        mask_input = np.array([
            [0, 1, 0, 0, 0],
            [1, 0, 0, 0, 0],
        ])
        outputs = np.arange(30, dtype=K.floatx()).reshape((2, 5, 3))
        actual = model.evaluate([token_input, mask_input], outputs)
        self.assertTrue(np.abs(actual - 6.0) < 1e-6 or np.abs(actual - 30.0) < 1e-6, actual)
Example #3
 def test_mask_result(self):
     input_layer = keras.layers.Input(
         shape=(None, ),
         name='Input',
     )
     embed_layer = keras.layers.Embedding(
         input_dim=12,
         output_dim=9,
         mask_zero=True,
         name='Embedding',
     )(input_layer)
     transformer_layer = get_encoders(
         encoder_num=1,
         input_layer=embed_layer,
         head_num=1,
         hidden_dim=12,
         attention_activation=None,
         feed_forward_activation=gelu,
         dropout_rate=0.1,
     )
     dense_layer = keras.layers.Dense(
         units=12,
         activation='softmax',
         name='Dense',
     )(transformer_layer)
     mask_layer = keras.layers.Input(
         shape=(None, ),
         name='Mask',
     )
     masked_layer, mask_result = Masked(
         return_masked=True,
         name='Masked',
     )([dense_layer, mask_layer])
     print([masked_layer, mask_result])
     model = keras.models.Model(
         inputs=[input_layer, mask_layer],
         outputs=[masked_layer, mask_result],
     )
     model.compile(
         optimizer='adam',
         loss='mse',
     )
     model.summary(line_length=150)
     predicts = model.predict([
         np.asarray([
             [1, 2, 3, 4, 5, 6, 7, 8, 0, 0],
             [1, 2, 3, 4, 0, 0, 0, 0, 0, 0],
         ]),
         np.asarray([
             [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
             [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
         ]),
     ])
     expect = np.asarray([
         [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
         [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
     ])
     self.assertTrue(np.allclose(expect, predicts[1]))
Example #4
 def test_sample(self):
     inputs = get_inputs(seq_len=512)
     embed_layer, _ = get_embedding(inputs, token_num=12, embed_dim=768, pos_num=512)
     masked_layer = Masked(name='Masked')([embed_layer, inputs[-1]])
     model = keras.models.Model(inputs=inputs, outputs=masked_layer)
     model.compile(
         optimizer='adam',
         loss='mse',
         metrics={},
     )
     model.summary()
     model.predict([
         np.asarray([[1] + [0] * 511]),
         np.asarray([[0] * 512]),
         np.asarray([[1] + [0] * 511]),
     ])
     self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
Example #5
    def test_fit(self):
        input_layer = keras.layers.Input(
            shape=(15, ),
            name='Input',
        )
        embed_layer = keras.layers.Embedding(
            input_dim=12,
            output_dim=24,
            mask_zero=True,
            name='Embedding',
        )(input_layer)
        rnn_layer = keras.layers.Bidirectional(
            keras.layers.LSTM(units=100, return_sequences=True),
            name='Bi-LSTM',
        )(embed_layer)
        dense_layer = keras.layers.Dense(
            units=12,
            activation='softmax',
            name='Dense',
        )(rnn_layer)
        mask_layer = keras.layers.Input(
            shape=(None, ),
            name='Mask',
        )
        masked_layer = Masked(name='Masked')([dense_layer, mask_layer])
        model = keras.models.Model(
            inputs=[input_layer, mask_layer],
            outputs=masked_layer,
        )
        model.compile(
            optimizer=keras.optimizers.Adam(lr=1e-4),
            loss=keras.losses.sparse_categorical_crossentropy,
            metrics=[keras.metrics.sparse_categorical_crossentropy],
        )
        model.summary(line_length=150)

        def _generator(batch_size=32):
            while True:
                inputs, masked, outputs = [], [], []
                for _ in range(batch_size):
                    inputs.append([])
                    masked.append([])
                    outputs.append([])
                    has_mask = False
                    for i in range(1, 11):
                        inputs[-1].append(i)
                        outputs[-1].append([i])
                        if random.random() < 0.3:
                            has_mask = True
                            inputs[-1][-1] = 11
                            masked[-1].append(1)
                        else:
                            masked[-1].append(0)
                    if not has_mask:
                        masked[-1][0] = 1
                    inputs[-1] += [0] * (15 - len(inputs[-1]))
                    masked[-1] += [0] * (15 - len(masked[-1]))
                    outputs[-1] += [[0]] * (15 - len(outputs[-1]))
                yield [np.asarray(inputs),
                       np.asarray(masked)], np.asarray(outputs)

        model.fit_generator(
            generator=_generator(),
            steps_per_epoch=1000,
            epochs=10,
            validation_data=_generator(),
            validation_steps=100,
            callbacks=[
                keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
            ],
        )
        for inputs, outputs in _generator(batch_size=32):
            predicts = model.predict(inputs)
            actual = np.argmax(predicts, axis=-1)
            for i in range(32):
                for j in range(15):
                    if inputs[1][i][j]:
                        self.assertEqual(j + 1, actual[i][j])
            break
Example #6
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.
    See: https://arxiv.org/pdf/1909.11942.pdf
    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention
                    in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer
                             in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and SOP outputs will be returned
                     if it is `True`, otherwise the input layers and the last
                     feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len, ), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len, ), name='Input-Segment')
    inputs = [input_token, input_segment]

    # Build embeddings
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')(
        [embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)

    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    # Build shared transformer
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(units=feed_forward_dim,
                                     activation=feed_forward_activation,
                                     name='Feed-Forward')
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')

    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1),
        )([attention_input, transformed])
        transformed = attention_normal(transformed)

        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1),
        )([feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)

    if training:
        # Build tasks
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        if isinstance(output_layers, list):
            output_layers = [
                transformed_layers[index] for index in output_layers
            ]
            output = keras.layers.Concatenate(name='Output')(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
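
A minimal usage sketch (not part of the original snippet) showing both return modes of build_albert. It assumes the definitions above are in scope, including the `keras` backend they use; `token_num=30000` is only an illustrative value.

# Training mode: returns a model whose outputs are the MLM and SOP heads.
albert_train = build_albert(token_num=30000, training=True)
albert_train.summary()

# Feature-extraction mode: returns the input layers and the last encoder output,
# which can be wrapped into a model for downstream tasks.
albert_inputs, albert_output = build_albert(token_num=30000, training=False)
albert_features = keras.models.Model(inputs=albert_inputs, outputs=albert_output)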
Example #7
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              use_task_embed=False,
              task_num=10,
              use_adapter=False,
              adapter_units=None):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :param use_task_embed: Whether to add task embeddings to the existing embeddings.
    :param task_num: The number of tasks.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in feed-forward adapter.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training
    if adapter_units is None:
        adapter_units = max(1, embed_dim // 100)

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    x, s, m = inputs
    x = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Token-Reshape')(x)
    s = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Segment-Reshape')(s)
    m = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Mention-Reshape')(m)

    embed_layer, embed_weights = get_embedding(
        [x, s, m],
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if use_task_embed:
        task_input = keras.layers.Input(
            shape=(1, ),
            name='Input-Task',
        )
        embed_layer = TaskEmbedding(
            input_dim=task_num,
            output_dim=embed_dim,
            mask_zero=False,
            name='Embedding-Task',
        )([embed_layer, task_input])
        inputs = inputs[:2] + [task_input, inputs[-1]]
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=gelu,
    )
    if training:
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
            [mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    else:
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        if isinstance(output_layer_num, int):
            output_layer_num = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, output_layer_num + 1)]
        outputs = []
        for layer_index in output_layer_num:
            if layer_index < 0:
                layer_index = transformer_num + layer_index
            layer_index += 1
            layer = model.get_layer(
                name='Encoder-{}-FeedForward-Norm'.format(layer_index))
            outputs.append(layer.output)
        if len(outputs) > 1:
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(
                reversed(outputs)))
        else:
            transformed = outputs[0]
        return inputs, transformed
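
A minimal usage sketch (not part of the original snippet) for this get_model variant. It assumes the definitions above are in scope; `token_num=30000` and `output_layer_num=2` are illustrative values.

# Training mode: returns a model with the MLM and NSP heads as outputs.
bert_train = get_model(token_num=30000, training=True)
bert_train.summary()

# Feature-extraction mode: returns the input layers and the (concatenated)
# outputs of the last `output_layer_num` encoder blocks.
bert_inputs, bert_output = get_model(token_num=30000, training=False, output_layer_num=2)
bert_features = keras.models.Model(inputs=bert_inputs, outputs=bert_output)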