def _feed_forward_builder(x):
    # Closure used as `build_func` for `_wrap_layer`; `hidden_dim`, `activation`,
    # `trainable` and `name` are captured from the enclosing scope.
    return FeedForward(
        units=hidden_dim,
        activation=activation,
        trainable=trainable,
        name=name,
    )(x)
def _wrap_layer(name,
                input_layer,
                build_func,
                dropout_rate=0.0,
                trainable=True,
                use_adapter=False,
                adapter_units=None,
                adapter_activation='relu'):

    build_output = build_func(input_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='%s-Dropout' % name,
        )(build_output)
    else:
        dropout_layer = build_output
    if isinstance(input_layer, list):
        input_layer = input_layer[0]
    if use_adapter:
        adapter = FeedForward(
            units=adapter_units,
            activation=adapter_activation,
            kernel_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001),
            name='%s-Adapter' % name,
        )(dropout_layer)
        dropout_layer = keras.layers.Add(name='%s-Adapter-Add' % name)([dropout_layer, adapter])
    add_layer = keras.layers.Add(name='%s-Add' % name)([input_layer, dropout_layer])
    normal_layer = LayerNormalization(
        trainable=trainable,
        name='%s-Norm' % name,
    )(add_layer)
    return normal_layer
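For context, a minimal usage sketch (hypothetical dimensions and names, assuming the CyberZHG keras-position-wise-feed-forward and keras-layer-normalization packages are already imported as FeedForward and LayerNormalization) showing how a builder such as _feed_forward_builder is handed to _wrap_layer to form one residual sub-block:

import keras

# Hypothetical values for the names captured by _feed_forward_builder.
hidden_dim, activation, trainable, name = 768, 'relu', True, 'Encoder-1-FeedForward'

seq_input = keras.layers.Input(shape=(None, hidden_dim))
ff_block = _wrap_layer(
    name=name,
    input_layer=seq_input,
    build_func=_feed_forward_builder,
    dropout_rate=0.1,
    trainable=trainable,
)
sketch_model = keras.models.Model(inputs=seq_input, outputs=ff_block)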
def build_model(emb_cid, emb_advid):

    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))

    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)

    sdrop = layers.SpatialDropout1D(rate=0.2)

    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)

    content = layers.Concatenate()([emb1, emb2])

    mha = MultiHeadAttention(head_num=16)(content)
    mha = layers.Dropout(0.01)(mha)
    mha = layers.Add()([content, mha])
    mha = LayerNormalization()(mha)
    mha = layers.Dropout(0.01)(mha)
    mha_ff = FeedForward(256)(mha)
    mha_out = layers.Add()([mha, mha_ff])
    mha_out = LayerNormalization()(mha_out)

    lstm = layers.Bidirectional(layers.LSTM(128,
                                            return_sequences=True))(mha_out)

    avg_pool = layers.GlobalAveragePooling1D()(lstm)
    max_pool = layers.GlobalMaxPool1D()(lstm)

    x = layers.Concatenate()([avg_pool, max_pool])

    x = layers.Dense(128, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dropout(0.1)(x)

    out = layers.Dense(10, activation='softmax')(x)
    model = keras.Model(inputs=[inp1, inp2], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-3),
                  metrics=['accuracy'])

    return model
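A hedged usage sketch of the two-input classifier above; max_len and the embedding matrices are illustrative assumptions standing in for the real pre-trained embeddings:

import numpy as np

# Hypothetical globals assumed by build_model.
max_len = 100
emb_cid = np.random.rand(5000, 128).astype('float32')
emb_advid = np.random.rand(1000, 128).astype('float32')

model = build_model(emb_cid, emb_advid)
x1 = np.random.randint(1, 5000, size=(32, max_len))
x2 = np.random.randint(1, 1000, size=(32, max_len))
y = keras.utils.to_categorical(np.random.randint(0, 10, size=32), num_classes=10)
model.fit([x1, x2], y, batch_size=16, epochs=1)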
    def test_sample(self):
        input_layer = keras.layers.Input(
            shape=(1, 3),
            name='Input',
        )
        feed_forward_layer = FeedForward(
            units=4,
            activation=self._leaky_relu,
            weights=[
                np.asarray([
                    [0.1, 0.2, 0.3, 0.4],
                    [-0.1, 0.2, -0.3, 0.4],
                    [0.1, -0.2, 0.3, -0.4],
                ]),
                np.asarray([
                    0.0,
                    -0.1,
                    0.2,
                    -0.3,
                ]),
                np.asarray([
                    [0.1, 0.2, 0.3],
                    [-0.1, 0.2, -0.3],
                    [0.1, -0.2, 0.3],
                    [-0.1, 0.2, 0.3],
                ]),
                np.asarray([
                    0.0,
                    0.1,
                    -0.2,
                ]),
            ],
            name='FeedForward',
        )(input_layer)
        model = keras.models.Model(
            inputs=input_layer,
            outputs=feed_forward_layer,
        )
        model.compile(
            optimizer='adam',
            loss='mse',
            metrics={},
        )
        model.summary()
        inputs = np.array([[[0.2, 0.1, 0.3]]])
        predict = model.predict(inputs)
        expected = np.asarray([[[0.0364, 0.0432, -0.0926]]])
        self.assertTrue(np.allclose(expected, predict), predict)
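For reference, FeedForward here computes activation(x · W1 + b1) · W2 + b2; a small NumPy sketch of that computation follows. The leaky-ReLU slope is an assumption, since self._leaky_relu is defined elsewhere in the test class, but the expected values above are consistent with a slope of 0.01.

import numpy as np

def reference_feed_forward(x, w1, b1, w2, b2, alpha=0.01):
    # Position-wise feed-forward: activation(x @ W1 + b1) @ W2 + b2,
    # with a leaky ReLU of assumed slope `alpha` as the activation.
    hidden = x @ w1 + b1
    hidden = np.where(hidden >= 0, hidden, alpha * hidden)
    return hidden @ w2 + b2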
Example #5
def encoder(seq_len, m_features, d_model, n_heads, dff, rate=0.1, encoder=None):
    """Basic attention encoder. It can be chained after a previous encoder by passing that encoder as the `encoder` argument."""
    if encoder is None:
        in_seq = keras.layers.Input(shape=(seq_len, m_features))
        norm_0 = LayerNormalization()(in_seq)
    else:
        norm_0 = encoder.output
    linear = keras.layers.Dense(units=d_model)(norm_0)
    pos = TrigPosEmbedding(mode=TrigPosEmbedding.MODE_ADD)(linear)
    mha = MultiHeadAttention(head_num=n_heads)(pos)
    mha_drop = keras.layers.Dropout(rate=rate)(mha)
    add_1 = keras.layers.Add()([pos, mha_drop])
    norm_1 = LayerNormalization()(add_1)
    ff = FeedForward(dff)(norm_1)
    ff_drop = keras.layers.Dropout(rate=rate)(ff)
    add_2 = keras.layers.Add()([ff_drop, norm_1])
    out = LayerNormalization()(add_2)
    return keras.Model(in_seq, out) if encoder is None else keras.Model(encoder.input, out)
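A brief usage sketch (hypothetical hyper-parameters) of chaining two of these encoders by passing the first as the encoder argument:

# Stack two encoder blocks: the second reuses the first model's input and output.
enc1 = encoder(seq_len=64, m_features=16, d_model=128, n_heads=4, dff=256)
enc2 = encoder(seq_len=64, m_features=16, d_model=128, n_heads=4, dff=256, encoder=enc1)
enc2.summary()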
Example #6
def _wrap_layer(name,
                input_layer,
                build_func,
                dropout_rate=0.0,
                trainable=True,
                use_adapter=False,
                adapter_units=None,
                adapter_activation='relu'):
    """Wrap layers with residual, normalization and dropout.

    :param name: Prefix of names for internal layers.
    :param input_layer: Input layer.
    :param build_func: A callable that takes the input tensor and generates the output tensor.
    :param dropout_rate: Dropout rate.
    :param trainable: Whether the layers are trainable.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in feed-forward adapter.
    :param adapter_activation: The activation after the first transformation in feed-forward adapter.
    :return: Output layer.
    """
    build_output = build_func(input_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='%s-Dropout' % name,
        )(build_output)
    else:
        dropout_layer = build_output
    if isinstance(input_layer, list):
        input_layer = input_layer[0]
    if use_adapter:
        adapter = FeedForward(
            units=adapter_units,
            activation=adapter_activation,
            kernel_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001),
            name='%s-Adapter' % name,
        )(dropout_layer)
        dropout_layer = keras.layers.Add(name='%s-Adapter-Add' % name)([dropout_layer, adapter])
    add_layer = keras.layers.Add(name='%s-Add' % name)([input_layer, dropout_layer])
    normal_layer = LayerNormalization(
        trainable=trainable,
        name='%s-Norm' % name,
    )(add_layer)
    return normal_layer
Example #7
def block(attention_input, head_num: int, feed_forward_units: int,
          dropout_rate: float) -> Tensor:

    attention_x = MultiHeadAttention(
        head_num=head_num,
        activation=None,
        use_bias=False,
        history_only=True,
        trainable=True,
    )(attention_input)
    attention_x = Dropout(dropout_rate)(attention_x)
    attention_x = Add()([attention_input, attention_x])
    feed_forward_input = LayerNormalization(trainable=True)(attention_x)

    feed_forward_x = FeedForward(units=feed_forward_units,
                                 activation='relu',
                                 trainable=True)(feed_forward_input)
    feed_forward_x = Dropout(dropout_rate)(feed_forward_x)
    feed_forward_x = Add()([feed_forward_input, feed_forward_x])
    block_output = LayerNormalization(trainable=True)(feed_forward_x)

    return block_output
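A minimal sketch (hypothetical dimensions) of stacking this decoder-style block, assuming the same layer classes are in scope and that keras.layers.Dropout and keras.layers.Add are imported as Dropout and Add:

import keras

embed_dim, n_blocks = 64, 2
x = keras.layers.Input(shape=(None, embed_dim))
h = x
for _ in range(n_blocks):
    # history_only=True inside `block` gives causal (GPT-style) self-attention.
    h = block(h, head_num=8, feed_forward_units=256, dropout_rate=0.1)
decoder = keras.models.Model(inputs=x, outputs=h)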
Example #8
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.
    See: https://arxiv.org/pdf/1909.11942.pdf
    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention
                    in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer
                             in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and SOP outputs will be returned
                     if it is `True`, otherwise the input layers and the last
                     feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len, ), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len, ), name='Input-Segment')
    inputs = [input_token, input_segment]

    # Build embeddings
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')(
        [embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)

    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    # Build shared transformer
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(units=feed_forward_dim,
                                     activation=feed_forward_activation,
                                     name='Feed-Forward')
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')

    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1), )(
                [attention_input, transformed])
        transformed = attention_normal(transformed)

        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1), )(
                [feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)

    if training:
        # Build tasks
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        if isinstance(output_layers, list):
            output_layers = [
                transformed_layers[index] for index in output_layers
            ]
            output = keras.layers.Concatenate(name='Output', )(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
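A hedged sketch of calling build_albert for feature extraction; the hyper-parameters are illustrative, not taken from a released checkpoint:

# training=False returns the input layers and the last feature layer,
# which can be wrapped into a feature-extraction model.
inputs, features = build_albert(
    token_num=30000,
    seq_len=128,
    embed_dim=128,
    hidden_dim=312,
    transformer_num=4,
    head_num=12,
    feed_forward_dim=1248,
    training=False,
)
feature_model = keras.models.Model(inputs=inputs, outputs=features)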
def build_transformer_xl(units,
                         embed_dim,
                         hidden_dim,
                         num_token,
                         num_block,
                         num_head,
                         batch_size,
                         memory_len,
                         target_len,
                         dropout=0.0,
                         attention_dropout=0.0,
                         cutoffs=None,
                         div_val=1,
                         force_projection=None,
                         bind_embeddings=True,
                         bind_projections=True,
                         clamp_len=None,
                         share_biases=True):
    """Build transformer-XL model.

    :param units: Units inside the transformer.
    :param embed_dim: Dimension of embeddings.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param num_token: Number of distinct input tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param cutoffs: Cutoffs of adaptive embedding.
    :param div_val: Scale factor of adaptive embedding.
    :param force_projection: Add projection when the dimensions are equal.
    :param bind_embeddings: Whether to bind embeddings to adaptive softmax.
    :param bind_projections: Whether to bind projections to adaptive softmax.
    :param clamp_len: The maximum value of relative position.
    :param share_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    token_input = keras.layers.Input(shape=(target_len,), name='Input-Token')
    memory_length_input = keras.layers.Input(shape=(1,), name='Input-Memory-Length')
    inputs = [token_input, memory_length_input]

    results = AdaptiveEmbedding(
        input_dim=num_token,
        output_dim=units,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        mask_zero=True,
        force_projection=force_projection,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(token_input)
    token_embed, embedding_weights = results[0], results[1:]
    token_embed = Scale(scale=np.sqrt(units), name='Embed-Token-Scaled')(token_embed)
    last_memory = Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])

    position_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        name='Embed-Position',
    )([token_input, last_memory])

    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(rate=dropout, name='Embed-Token-Dropped')(token_embed)
        position_embed = keras.layers.Dropout(rate=dropout, name='Embed-Position-Dropped')(position_embed)

    context_bias, relative_bias = None, None
    if share_biases:
        context_bias, relative_bias = RelativeBias(units=units, name='Biases')(last_memory)

    outputs = [token_embed]
    for i in range(num_block):
        block_input, block_output = outputs[-1], outputs[-1]
        if not share_biases:
            context_bias, relative_bias = RelativeBias(units=units, name='Biases-{}'.format(i + 1))(last_memory)
        block_output = RelativePartialMultiHeadSelfAttention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )([block_output, position_embed, last_memory, context_bias, relative_bias])
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(rate=dropout, name='Attention-Dropped-{}'.format(i + 1))(block_output)
        block_output = keras.layers.Add(name='Attention-Res-{}'.format(i + 1))([block_input, block_output])
        block_output = LayerNormalization(name='Attention-Norm-{}'.format(i + 1))(block_output)

        block_input = block_output
        block_output = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            name='FeedForward-{}'.format(i + 1),
        )(block_output)
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(rate=dropout, name='FeedForward-Dropped-{}'.format(i + 1))(block_output)
        block_output = keras.layers.Add(name='FeedForward-Res-{}'.format(i + 1))([block_input, block_output])
        block_output = LayerNormalization(name='FeedForward-Norm-{}'.format(i + 1))(block_output)

        if i < num_block - 1:
            last_memory = Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([block_output, memory_length_input])

        outputs.append(block_output)

    if 0.0 < dropout < 1.0:
        outputs[-1] = keras.layers.Dropout(rate=dropout, name='Output-Dropped')(outputs[-1])
    softmax = AdaptiveSoftmax(
        input_dim=units,
        output_dim=num_token,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        force_projection=force_projection,
        bind_embeddings=bind_embeddings,
        bind_projections=bind_projections,
        name='Softmax',
    )(outputs[-1:] + embedding_weights)

    model = keras.models.Model(inputs=inputs, outputs=softmax)
    return model
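A hedged sketch of building the Transformer-XL model with illustrative hyper-parameters; the loss choice is an assumption, based on the model emitting a probability distribution over tokens at each target position:

model = build_transformer_xl(
    units=128,
    embed_dim=128,
    hidden_dim=512,
    num_token=10000,
    num_block=3,
    num_head=4,
    batch_size=16,
    memory_len=64,
    target_len=32,
    dropout=0.1,
    attention_dropout=0.1,
)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')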
Example #10
def _wrap_layer(name,
                input_layer,
                build_func,
                dropout_rate=0.0,
                trainable=True,
                use_adapter=False,
                adapter_units=None,
                adapter_activation='relu',
                attention_mask=None,
                SEQ_LEN=None,
                retention_configuration=None,
                LAMBDA=None,
                FLAG_EXTRACT_LAYER=None,
                layer_idx=None,
                word_vector_elimination=None):
        """Wrap layers with residual, normalization and dropout.

        :param name: Prefix of names for internal layers.
        :param input_layer: Input layer.
        :param build_func: A callable that takes the input tensor and uenerates the output tensor.
        :param dropout_rate: Dropout rate.
        :param trainable: Whether the layers are trainable.
        :param use_adapter: Whether to use feed-forward adapters before each residual connections.
        :param adapter_units: The dimension of the first transformation in feed-forward adapter.
        :param adapter_activation: The activation after the first transformation in feed-forward adapter.
        :return: Output layer.
        """
        if word_vector_elimination:
            [build_output, atten] = build_func(input_layer)
        else:
            build_output = build_func(input_layer)

        if dropout_rate > 0.0:
                dropout_layer = keras.layers.Dropout(
                        rate=dropout_rate,
                        name='%s-Dropout' % name,
                )(build_output)
        else:
                dropout_layer = build_output
        if isinstance(input_layer, list):
                input_layer = input_layer[0]
        if use_adapter:
                adapter = FeedForward(
                    units=adapter_units,
                    activation=adapter_activation,
                    kernel_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001),
                    name='%s-Adapter' % name,
                )(dropout_layer)
                dropout_layer = keras.layers.Add(name='%s-Adapter-Add' % name)([dropout_layer, adapter])
        add_layer = keras.layers.Add(name='%s-Add' % name)([input_layer, dropout_layer])
        normal_layer = LayerNormalization(
                trainable=trainable,
                name='%s-Norm' % name,
                )(add_layer)

        if word_vector_elimination:
        
                if FLAG_EXTRACT_LAYER == 1:
                
                        extract_layer = Soft_Extract(atten=atten, LAMBDA=LAMBDA*(layer_idx**1.0), name='%s-Soft-Extract' % name)(normal_layer)
                        return extract_layer, attention_mask

                elif FLAG_EXTRACT_LAYER == 2:
        
                        extract_layer = Hard_Extract(atten=atten, index=retention_configuration[layer_idx-1], name='%s-Extract' % name)(normal_layer)
                        attention_mask = attention_mask[:,:retention_configuration[layer_idx-1]]
                        return extract_layer, attention_mask
        
        return normal_layer, attention_mask
Example #11
def build_model(emb_cid, emb_advid, emb_aid):

    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp3 = layers.Input(shape=(max_len, ))

    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    emb3 = layers.Embedding(input_dim=emb_aid.shape[0],
                            output_dim=emb_aid.shape[1],
                            input_length=max_len,
                            weights=[emb_aid],
                            trainable=False)(inp3)

    sdrop = layers.SpatialDropout1D(rate=0.2)

    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    emb3 = sdrop(emb3)

    content = layers.Concatenate()([emb1, emb2, emb3])

    mha1 = MultiHeadAttention(head_num=32)(content)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([content, mha1])
    mha1 = LayerNormalization()(mha1)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(256)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)

    mha2 = MultiHeadAttention(head_num=32)(mha1_out)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2 = layers.Add()([mha1_out, mha2])
    mha2 = LayerNormalization()(mha2)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2_ff = FeedForward(256)(mha2)
    mha2_out = layers.Add()([mha2, mha2_ff])
    mha2_out = LayerNormalization()(mha2_out)

    mha3 = MultiHeadAttention(head_num=32)(mha2_out)
    mha3 = layers.Dropout(0.01)(mha3)
    mha3 = layers.Add()([mha2_out, mha3])
    mha3 = LayerNormalization()(mha3)
    mha3 = layers.Dropout(0.01)(mha3)
    mha3_ff = FeedForward(256)(mha3)
    mha3_out = layers.Add()([mha3, mha3_ff])
    mha3_out = LayerNormalization()(mha3_out)

    avg_pool = layers.GlobalAveragePooling1D()(mha3_out)
    max_pool = layers.GlobalMaxPool1D()(mha3_out)

    x = layers.Concatenate()([avg_pool, max_pool])

    x = layers.Dense(256)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)

    x = layers.Dense(64)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)

    out = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs=[inp1, inp2, inp3], outputs=out)
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(2e-4),
                  metrics=['accuracy'])

    return model
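A hedged training sketch for the three-input binary classifier above; max_len and the embedding matrices are hypothetical stand-ins whose concatenated width (3 x 128 = 384) is divisible by the 32 attention heads:

import numpy as np

# Hypothetical globals and data shapes assumed by build_model.
max_len = 100
emb_cid = np.random.rand(5000, 128).astype('float32')
emb_advid = np.random.rand(1000, 128).astype('float32')
emb_aid = np.random.rand(8000, 128).astype('float32')

model = build_model(emb_cid, emb_advid, emb_aid)
x = [np.random.randint(1, n, size=(64, max_len)) for n in (5000, 1000, 8000)]
y = np.random.randint(0, 2, size=(64, 1))
model.fit(x, y, batch_size=32, epochs=1)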
    def test_fit(self):
        input_layer = keras.layers.Input(
            shape=(1, 3),
            name='Input',
        )
        att_layer = MultiHeadAttention(
            head_num=3,
            activation=self._leaky_relu,
            name='Multi-Head-Attention-1')(input_layer)
        normal_layer = LayerNormalization(
            name='Layer-Normalization-1', )(att_layer)
        feed_forward_layer = FeedForward(
            units=12,
            activation=self._leaky_relu,
            name='FeedForward',
        )(normal_layer)
        normal_layer = LayerNormalization(
            name='Layer-Normalization-2', )(feed_forward_layer)
        output_layer = keras.layers.Add(name='Add')(
            [input_layer, normal_layer])
        model = keras.models.Model(
            inputs=input_layer,
            outputs=output_layer,
        )
        model.compile(
            optimizer='adam',
            loss='mse',
            metrics={},
        )

        def _generator(batch_size=32):
            while True:
                batch_inputs = np.random.random((batch_size, 1, 3))
                batch_outputs = batch_inputs + 0.2
                yield batch_inputs, batch_outputs

        model.fit_generator(
            generator=_generator(),
            steps_per_epoch=1000,
            epochs=10,
            validation_data=_generator(),
            validation_steps=100,
            callbacks=[
                keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
            ],
        )
        model_path = os.path.join(
            tempfile.gettempdir(),
            'keras_feed_forward_%f.h5' % np.random.random())
        model.save(model_path)
        model = keras.models.load_model(
            model_path,
            custom_objects={
                '_leaky_relu': self._leaky_relu,
                'MultiHeadAttention': MultiHeadAttention,
                'LayerNormalization': LayerNormalization,
                'FeedForward': FeedForward,
            },
        )
        for inputs, _ in _generator(batch_size=3):
            predicts = model.predict(inputs)
            expect = inputs + 0.2
            for i in range(3):
                for j in range(3):
                    self.assertTrue(
                        np.abs(expect[i, 0, j] - predicts[i, 0, j]) < 0.1,
                        (expect, predicts))
            break
Example #13
def build_xlnet(units,
                training,
                num_token,
                num_block,
                num_head,
                hidden_dim,
                batch_size,
                memory_len,
                target_len,
                permute=None,
                mask_index=Tokenizer.SYM_PAD,
                dropout=0.0,
                attention_dropout=0.0,
                attention_type=ATTENTION_TYPE_BI,
                clamp_len=None,
                shared_biases=True):
    """Build XLNet.

    :param units: Hidden dimensions throughout the model.
    :param training: Whether in training mode.
    :param num_token: Number of distinct tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param permute: Whether to enable permutation.
    :param mask_index: The index of padding.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param attention_type: 'uni' or 'bi'.
    :param clamp_len: The maximum value of relative position.
    :param shared_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    if permute is None:
        permute = training

    token_input = keras.layers.Input(
        shape=(target_len,),
        name='Input-Token',
    )
    seg_input = keras.layers.Input(
        shape=(target_len,),
        name='Input-Segment',
    )
    memory_length_input = keras.layers.Input(
        shape=(1,),
        name='Input-Memory-Length',
    )
    inputs = [token_input, seg_input, memory_length_input]
    if training:
        query_input = keras.layers.Input(
            shape=(target_len,),
            name='Input-Mask',
        )
        inputs.append(query_input)
    else:
        query_input = None
    token_embed, embed_weights = EmbeddingRet(
        input_dim=num_token,
        output_dim=units,
        mask_zero=mask_index == 0,
        name='Embed-Token',
    )(token_input)
    if mask_index is not None and mask_index != 0:
        masking = CreateMask(
            mask_value=mask_index,
            name='Masking',
        )(token_input)
        token_embed = RestoreMask(name='Embed-Token-Masked')([token_embed, masking])
    if training:
        mask_embed = MaskEmbedding(
            units=units,
            name='Embed-Mask'
        )([token_embed, query_input])
    else:
        mask_embed = None
    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(
            rate=dropout,
            name='Embed-Token-Dropout'
        )(token_embed)
        if training:
            mask_embed = keras.layers.Dropout(
                rate=dropout,
                name='Embed-Mask-Dropout'
            )(mask_embed)

    memories = [Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])]

    pos_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        directional=attention_type == 'uni',
        name='Embed-Pos',
    )([token_embed, memories[0]])

    content_mask, query_mask = PermutationMask(
        enabled=permute,
        directional=attention_type == 'uni',
        name='Permutation',
    )([token_embed, memories[0]])

    context_bias, relative_bias, segment_bias = None, None, None
    if shared_biases:
        context_bias, relative_bias = RelativeBias(
            units,
            name='Relative-Bias',
        )(memories[0])
        segment_bias = SegmentBias(
            units,
            name='Segment-Bias',
        )(memories[0])

    content_output, query_output = token_embed, None
    if training:
        query_output = mask_embed

    for i in range(num_block):
        if not shared_biases:
            context_bias, relative_bias = RelativeBias(
                units,
                name='Relative-Bias-{}'.format(i + 1),
            )(memories[i])
            segment_bias = SegmentBias(
                units,
                name='Segment-Bias-{}'.format(i + 1),
            )(memories[i])

        segment_mat, segment_embed = RelativeSegmentEmbedding(
            units=units,
            name='Embed-Segment-{}'.format(i + 1),
        )([seg_input, memories[i]])

        attention = Attention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )
        if 0.0 < dropout < 1.0:
            attention_dropout_layer = keras.layers.Dropout(
                rate=dropout,
                name='Attention-Dropout-{}'.format(i + 1),
            )
        else:
            attention_dropout_layer = None
        attention_add = keras.layers.Add(name='Attention-Residual-{}'.format(i + 1))
        attention_layer_norm = LayerNormalization(name='Attention-Normal-{}'.format(i + 1))

        feed_forward = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            activation=gelu,
            name='FeedForward-{}'.format(i + 1),
        )
        if 0.0 < dropout < 1.0:
            feed_forward_dropout = keras.layers.Dropout(
                rate=dropout,
                name='FeedForward-Dropout-{}'.format(i + 1),
            )
        else:
            feed_forward_dropout = None
        feed_forward_add = keras.layers.Add(name='FeedForward-Residual-{}'.format(i + 1))
        feed_forward_layer_norm = LayerNormalization(name='FeedForward-Normal-{}'.format(i + 1))

        content = content_output

        def _build_output(query, mask):
            attention_input = query
            _output = attention([
                query, content, memories[i],
                segment_mat, segment_embed, pos_embed,
                context_bias, relative_bias, segment_bias,
                mask,
            ])
            if attention_dropout_layer is not None:
                _output = attention_dropout_layer(_output)
            _output = attention_add([attention_input, _output])
            _output = attention_layer_norm(_output)

            feed_forward_input = _output
            _output = feed_forward(_output)
            if feed_forward_dropout is not None:
                _output = feed_forward_dropout(_output)
            _output = feed_forward_add([feed_forward_input, _output])
            _output = feed_forward_layer_norm(_output)
            return _output

        content_output = _build_output(content_output, content_mask)
        if training:
            query_output = _build_output(query_output, query_mask)

        if i < num_block - 1:
            memories.append(Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([content_output, memory_length_input]))

    if training:
        output = EmbeddingSim(name='Softmax')([query_output, embed_weights])
    else:
        output = content_output
    model = keras.models.Model(
        inputs=inputs,
        outputs=output
    )
    return model
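A hedged sketch of building the XLNet feature extractor with illustrative hyper-parameters; ATTENTION_TYPE_BI is the bidirectional attention constant referenced in the signature above:

# training=False returns the feature-extraction model, whose inputs are
# token ids, segment ids and the memory-length indicator.
model = build_xlnet(
    units=256,
    training=False,
    num_token=32000,
    num_block=6,
    num_head=4,
    hidden_dim=1024,
    batch_size=16,
    memory_len=64,
    target_len=128,
    dropout=0.1,
    attention_dropout=0.1,
    attention_type=ATTENTION_TYPE_BI,
)
model.summary()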
Example #14
def build_model(emb_cid, emb_advid, emb_aid):

    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp3 = layers.Input(shape=(max_len, ))

    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    emb3 = layers.Embedding(input_dim=emb_aid.shape[0],
                            output_dim=emb_aid.shape[1],
                            input_length=max_len,
                            weights=[emb_aid],
                            trainable=False)(inp3)

    sdrop = layers.SpatialDropout1D(rate=0.2)

    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    emb3 = sdrop(emb3)

    id_c = emb1
    id_adv_ad = layers.Concatenate()([emb2, emb3])

    mha1 = MultiHeadAttention(head_num=16)(id_adv_ad)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([id_adv_ad, mha1])
    mha1 = LayerNormalization()(mha1)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(128)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)
    id_adv_ad_lstm = layers.Bidirectional(
        layers.LSTM(200, return_sequences=True))(mha1_out)
    id_adv_ad_max_pool = layers.GlobalMaxPool1D()(id_adv_ad_lstm)

    mha2 = MultiHeadAttention(head_num=16)(id_c)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2 = layers.Add()([id_c, mha2])
    mha2 = LayerNormalization()(mha2)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2_ff = FeedForward(128)(mha2)
    mha2_out = layers.Add()([mha2, mha2_ff])
    mha2_out = LayerNormalization()(mha2_out)
    id_c_lstm = layers.Bidirectional(layers.LSTM(
        200, return_sequences=True))(mha2_out)
    id_c_max_pool = layers.GlobalMaxPool1D()(id_c_lstm)

    x = layers.Add()([id_c_max_pool, id_adv_ad_max_pool])

    x = layers.Dense(256, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dropout(0.15)(x)

    out = layers.Dense(10, activation='softmax')(x)
    model = keras.Model(inputs=[inp1, inp2, inp3], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-3),
                  metrics=['accuracy'])

    return model
def build_model(emb_cid, emb_advid):

    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp_stacking = layers.Input(shape=(stacking_shape, ))

    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)

    sdrop = layers.SpatialDropout1D(rate=0.1)

    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)

    content = layers.Concatenate()([emb1, emb2])

    mha1 = MultiHeadAttention(head_num=8)(content)
    #     mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([content, mha1])
    mha1 = LayerNormalization()(mha1)
    #     mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(128)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)

    #     mha2 = MultiHeadAttention(head_num=8)(mha1_out)
    #     mha2 = layers.Dropout(0.01)(mha2)
    #     mha2 = layers.Add()([mha1_out, mha2])
    #     mha2 = LayerNormalization()(mha2)
    #     mha2 = layers.Dropout(0.01)(mha2)
    #     mha2_ff = FeedForward(128)(mha2)
    #     mha2_out = layers.Add()([mha2, mha2_ff])
    #     mha2_out = LayerNormalization()(mha2_out)

    #     mha3 = MultiHeadAttention(head_num=8)(mha2_out)
    #     mha3 = layers.Dropout(0.01)(mha3)
    #     mha3 = layers.Add()([mha2_out, mha3])
    #     mha3 = LayerNormalization()(mha3)
    #     mha3 = layers.Dropout(0.01)(mha3)
    #     mha3_ff = FeedForward(128)(mha3)
    #     mha3_out = layers.Add()([mha3, mha3_ff])
    #     mha3_out = LayerNormalization()(mha3_out)

    #     avg_pool = layers.GlobalAveragePooling1D()(mha3_out)
    max_pool = layers.GlobalMaxPool1D()(mha1_out)

    x = layers.Concatenate()([max_pool, inp_stacking])

    x = layers.Dense(128, activation='relu')(x)
    #     x = layers.BatchNormalization()(x)

    x = layers.Dense(64, activation='relu')(x)
    #     x = layers.BatchNormalization()(x)

    x = layers.Dense(32, activation='relu')(x)
    #     x = layers.BatchNormalization()(x)

    #     x = layers.Dropout(0.1)(x)

    out = layers.Dense(10, activation='softmax')(x)
    model = keras.Model(inputs=[inp1, inp2, inp_stacking], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(5e-4),
                  metrics=['accuracy'])

    return model
import keras
from keras_position_wise_feed_forward import FeedForward
import keras2onnx

input_layer = keras.layers.Input(shape=(None, 32))
feed_forward_layer = FeedForward(units=128)(input_layer)
model = keras.models.Model(inputs=input_layer, outputs=feed_forward_layer)
model.compile(optimizer='adam', loss='mse')
model.summary()

#keras.backend.set_learning_phase(0)

onnx_model = keras2onnx.convert_keras(model, 'feed_forward', debug_mode=1)
keras2onnx.save_model(onnx_model, 'foo.onnx')
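A possible follow-up check, assuming the onnxruntime package is installed and the exported graph keeps the dynamic sequence dimension, comparing the ONNX export with the Keras model on random data:

import numpy as np
import onnxruntime as ort

data = np.random.rand(2, 7, 32).astype(np.float32)
keras_out = model.predict(data)

session = ort.InferenceSession('foo.onnx')
input_name = session.get_inputs()[0].name
onnx_out = session.run(None, {input_name: data})[0]

print('max abs diff:', np.abs(keras_out - onnx_out).max())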