Example 1
 def __init__(self, model_dim=512, num_heads=8, ffn_dim=204, dropout=0.0):
     super(DecoderLayer, self).__init__()
     self.self_attention = MultiHeadAttention(model_dim, num_heads, dropout)
     self.joint_attention = MultiHeadAttention(model_dim, num_heads,
                                               dropout)
     self.feed_forward = PositionWiseFeedForward(model_dim, ffn_dim,
                                                 dropout)
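Example 2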
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
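
Only the constructor is shown above. For context, here is a minimal sketch of the call method that usually accompanies such a decoder layer, assuming a tutorial-style custom MultiHeadAttention that is called as (value, key, query, mask) and returns (output, attention_weights); treat it as an illustration, not the source's own code.

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # sketch only: assumes mha takes (value, key, query, mask) and returns (output, weights)
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        # cross-attention over the encoder output; queries come from the decoder
        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        # position-wise feed-forward sub-layer
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2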
Example 3
 def _attention_builder(x):
     return MultiHeadAttention(
         head_num=head_num,
         activation=activation,
         history_only=history_only,
         trainable=trainable,
         name=name,
     )(x)
Example 4
 def build(self, input_shape):
     self.embeddings = self.add_weight(shape=(self._vocab_size,
                                              self._model_dim),
                                       initializer='glorot_uniform',
                                       trainable=True,
                                       name="embeddings")
     self.EncoderPositionEncoding = PositionEncoding(self._model_dim)
     self.EncoderMultiHeadAttetions = [
         MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
         for _ in range(self._encoder_stack)
     ]
     self.EncoderLayerNorms0 = [
         LayerNormalization() for _ in range(self._encoder_stack)
     ]
     self.EncoderPositionWiseFeedForwards = [
         PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
         for _ in range(self._encoder_stack)
     ]
     self.EncoderLayerNorms1 = [
         LayerNormalization() for _ in range(self._encoder_stack)
     ]
     self.DecoderPositionEncoding = PositionEncoding(self._model_dim)
     self.DecoderMultiHeadAttetions0 = [
         MultiHeadAttention(self._n_heads,
                            self._model_dim // self._n_heads,
                            future=True) for _ in range(self._decoder_stack)
     ]
     self.DecoderLayerNorms0 = [
         LayerNormalization() for _ in range(self._decoder_stack)
     ]
     self.DecoderMultiHeadAttetions1 = [
         MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
         for _ in range(self._decoder_stack)
     ]
     self.DecoderLayerNorms1 = [
         LayerNormalization() for _ in range(self._decoder_stack)
     ]
     self.DecoderPositionWiseFeedForwards = [
         PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
         for _ in range(self._decoder_stack)
     ]
     self.DecoderLayerNorms2 = [
         LayerNormalization() for _ in range(self._decoder_stack)
     ]
     super(Transformer, self).build(input_shape)
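
The build method above only creates the sub-layers. As an illustration, a minimal sketch of how the encoder half of such a stack is typically wired; the conventions assumed here (integer token ids as input, a [queries, keys, values] list for the custom MultiHeadAttention, additive PositionEncoding, import tensorflow as tf) are assumptions for the sketch, not taken from the source.

 def encoder(self, inputs):
     # sketch only: assumed call conventions for the custom layers above
     # inputs are assumed to be integer token ids of shape [batch, seq_len]
     embeddings = tf.gather(self.embeddings, inputs) * (self._model_dim ** 0.5)
     encodings = embeddings + self.EncoderPositionEncoding(embeddings)
     for i in range(self._encoder_stack):
         # self-attention sub-layer with residual connection and layer norm
         attention = self.EncoderMultiHeadAttetions[i]([encodings, encodings, encodings])
         encodings = self.EncoderLayerNorms0[i](encodings + attention)
         # position-wise feed-forward sub-layer with residual connection and layer norm
         ff = self.EncoderPositionWiseFeedForwards[i](encodings)
         encodings = self.EncoderLayerNorms1[i](encodings + ff)
     return encodings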
Example 5
 def build(self, input_shape):
     self.d_model = input_shape[-1]
     
     # Self multi head attention
     self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
     self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
     self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
     
     # Multi-head attention combined with the encoder output
     self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
     self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
     self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
     
     # Feed forward
     self.dense_1 = layers.Dense(units=self.FFN_units,
                                 activation="relu")
     self.dense_2 = layers.Dense(units=self.d_model)
     self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
     self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
Example 6
    def build(self, input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
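
Only build is shown here as well; a minimal sketch of the matching call for this encoder block follows, assuming the course-style MultiHeadAttention is invoked with (queries, keys, values, mask). It is an illustration rather than the source's code.

    def call(self, inputs, mask, training):
        # sketch only: assumed MultiHeadAttention call signature (queries, keys, values, mask)
        attention = self.multi_head_attention(inputs, inputs, inputs, mask)
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)

        # feed-forward sub-layer with residual connection and layer norm
        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training=training)
        outputs = self.norm_2(outputs + attention)

        return outputs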
Example 7
 def sub_layer_multi_head_attention(self, layer_index, Q, K_s, type, mask=None, is_training=None, dropout_keep_prob=None):  # COMMON FUNCTION
     """
     Multi-head attention as a sub-layer.
     :param layer_index: index of the layer
     :param Q: shape should be [batch_size, sequence_length, embed_size]
     :param K_s: shape should be [batch_size, sequence_length, embed_size]
     :param type: 'encoder', 'decoder' or 'encoder_decoder_attention'
     :param mask: when a mask is used, illegal connections are filled with a large negative value, so their attention probability becomes zero.
     :return: output of multi-head attention, shape [batch_size, sequence_length, d_model]
     """
     with tf.variable_scope("base_mode_sub_layer_multi_head_attention_" + type + str(layer_index)):
         # below handles attention for encoder and decoder with different lengths:
         # length = self.decoder_sent_length if (type != 'encoder' and self.sequence_length != self.decoder_sent_length) else self.sequence_length  # TODO this may be useful
         length = self.sequence_length
         # 1. get V as learned parameters
         V_s = tf.get_variable("V_s", shape=(self.batch_size, length, self.d_model), initializer=self.initializer)
         # 2. run multi-head attention to get the result
         multi_head_attention_class = MultiHeadAttention(Q, K_s, V_s, self.d_model, self.d_k, self.d_v, self.sequence_length,
                                                         self.h, type=type, is_training=is_training, mask=mask, dropout_rate=(1.0 - dropout_keep_prob))
         sub_layer_multi_head_attention_output = multi_head_attention_class.multi_head_attention_fn()  # [batch_size, sequence_length, d_model]
     return sub_layer_multi_head_attention_output  # [batch_size, sequence_length, d_model]
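Example 8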
    def __init__(self,
                 dim,
                 src_n_vocab,
                 n_encod_layer,
                 tgt_n_vocab,
                 n_decode_layer,
                 max_len=512):
        self.src_emb = EmbeddingWithPositionalEncoding(dim, src_n_vocab,
                                                       max_len)
        self.tgt_emb = EmbeddingWithLearnedPositionalEncoding(
            dim, tgt_n_vocab, max_len)

        enc_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1),
                                     None, nn.Linear(dim, dim), 0.1)
        self.encoder = Encoder(enc_layer, n_encod_layer)

        dec_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1),
                                     MultiHeadAttention(6, dim, 0.1),
                                     nn.Linear(dim, dim), 0.1)
        self.decoder = Decoder(dec_layer, n_decode_layer)

        self.encoder_decoder = EncoderDecoder(self.encoder, self.decoder,
                                              self.src_emb, self.tgt_emb)
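Example 9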
 def __init__(
     self,
     dim_model: int = 512,
     num_heads: int = 6,
     dim_feedforward: int = 2048,
     dropout: float = 0.1,
 ):
     super().__init__()
     dim_k = dim_v = dim_model // num_heads
     self.attention_1 = Residual(
         MultiHeadAttention(num_heads, dim_model, dim_k, dim_v),
         dimension=dim_model,
         dropout=dropout,
     )
     self.attention_2 = Residual(
         MultiHeadAttention(num_heads, dim_model, dim_k, dim_v),
         dimension=dim_model,
         dropout=dropout,
     )
     self.feed_forward = Residual(
         feed_forward(dim_model, dim_feedforward),
         dimension=dim_model,
         dropout=dropout,
     )
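
The constructor above wraps each sub-layer in a Residual block; a minimal sketch of a matching forward pass follows. It assumes Residual applies dropout and layer normalization around its wrapped module and that the query tensor is passed first, which may differ from the original implementation.

 def forward(self, tgt, memory):
     # sketch only: self-attention on the target, then attention over the encoder memory
     tgt = self.attention_1(tgt, tgt, tgt)
     tgt = self.attention_2(tgt, memory, memory)
     return self.feed_forward(tgt)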
Example 10
def make_attention_cell(dec_cell, 
                        rnn_size, 
                        enc_output, 
                        bias_output, 
                        lengths, 
                        att_type, 
                        att_type_bias, 
                        bias_lengths, 
                        args):
    """Wraps the given cell with Attention.
    Args:
      dec_cell: the RNNCell for decoder.
      rnn_size: Integer. Number of hidden units to use for
            rnn cell.
      inputs: Array of input points.
      enc_output: encoder outputs in erery step.
      bias_output: bias representations.
      lengths: Array of integers. Sequence lengths of the
            input points.
      att_type: attention type for encoder.
      att_type_bias: attention for bias.
      bias_lengths: number of the bias words.

    Returns: a new Cell wrapped with attention.

    """
    if att_type == 'BahdanauAttention':
        # bias attention is not implemented for BahdanauAttention
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                                      num_units=rnn_size,
                                      memory=enc_output,
                                      memory_sequence_length=lengths,
                                      name='BahdanauAttention')

        return tf.contrib.seq2seq.AttentionWrapper(cell=dec_cell,
                              attention_mechanism=attention_mechanism,
                              attention_layer_size=None,
                              output_attention=False)
    
    elif att_type == 'MultiHeadAttention' and att_type_bias == 'MultiHeadAttention':
        # multi-head attention for both the encoder memory and the bias memory
        size_per_head = int(rnn_size/args.num_heads)
        my_attention_mechanism = MyAttentionMechanism(num_heads=args.num_heads,
                              size_per_head=size_per_head,
                              memory=enc_output,
                              memory_sequence_length=lengths,
                              name='MultiHeadAttention')
        my_attention_mechanism_bias = MyAttentionMechanism(num_heads=args.num_heads,
                              size_per_head=size_per_head,
                              memory=bias_output,
                              memory_sequence_length=bias_lengths,
                              name='MultiHeadAttentionBias')
        
        attention_mechanisms = []
        attention_mechanisms_for_bias = []
        for i in range(args.num_heads):
            attention_mechanism = MultiHeadAttention(num_units=rnn_size,
                                          memory=enc_output,
                                          memory_sequence_length=lengths,
                                          name='MultiHeadAttention')
            attention_mechanism_for_bias = MultiHeadAttention(num_units=rnn_size,
                                          memory=bias_output,
                                          memory_sequence_length=bias_lengths,
                                          name='MultiHeadAttentionBias')
            attention_mechanisms.append(attention_mechanism)
            attention_mechanisms_for_bias.append(attention_mechanism_for_bias)
        
        return AttentionWrapper(cell=dec_cell,
                    attention_mechanism=attention_mechanisms,
                    attention_mechanism_for_bias=attention_mechanisms_for_bias,
                    my_attention_mechanism=my_attention_mechanism,
                    my_attention_mechanism_bias=my_attention_mechanism_bias,
                    attention_layer_size=None,
                    output_attention=False)
    
    elif att_type == 'MultiHeadAttention' and att_type_bias == 'BahdanauAttention':
        if args.num_heads > 1:
            raise ValueError("num_heads > 1 is not supported when att_type_bias is "
                             "BahdanauAttention; set num_heads to 1 or set "
                             "att_type_bias to MultiHeadAttention")
        size_per_head = int(rnn_size/args.num_heads)
        my_attention_mechanism = MyAttentionMechanism(num_heads=args.num_heads,
                                                size_per_head=size_per_head,
                                                memory=enc_output,
                                                memory_sequence_length=lengths,
                                                name='MultiHeadAttention')
        
        attention_mechanisms_for_bias = tf.contrib.seq2seq.BahdanauAttention(
                                        num_units=rnn_size,
                                        memory=bias_output,
                                        memory_sequence_length=bias_lengths,
                                        name='BahdanauAttention')
        attention_mechanisms = []
        for i in range(args.num_heads):
            attention_mechanism = MultiHeadAttention(num_units=rnn_size,
                                                    memory=enc_output,
                                                    memory_sequence_length=lengths,
                                                    name='MultiHeadAttention')
            attention_mechanisms.append(attention_mechanism)
        
        return AttentionWrapper(cell=dec_cell,
                    attention_mechanism=attention_mechanisms,
                    attention_mechanism_for_bias=attention_mechanisms_for_bias,
                    my_attention_mechanism=my_attention_mechanism,
                    my_attention_mechanism_bias=None,
                    attention_layer_size=None,
                    output_attention=False)
def build_Model(input_shape, n_states=2, n_speaker=40):
    '''
    Architecture:
        1. 4-layer CNN (dilated convolutions)
        2. multi-head attention (head num: 2)
        3. fully connected output layers
    '''
    # Input layer
    inputs = Input(name='the_input', shape=input_shape, dtype='float32')

    # Convolution layer (VGG)
    inner = Conv2D(32, (3, 3),
                   padding='same',
                   name='conv1',
                   dilation_rate=2,
                   kernel_initializer='he_normal')(inputs)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=(2, 2), name='max1')(inner)

    inner = Conv2D(64, (3, 3),
                   padding='same',
                   name='conv2',
                   dilation_rate=2,
                   kernel_initializer='he_normal')(inner)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=(2, 2), name='max2')(inner)

    inner = Conv2D(256, (3, 3),
                   padding='same',
                   name='conv3',
                   dilation_rate=2,
                   kernel_initializer='he_normal')(inner)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = Conv2D(256, (3, 3),
                   padding='same',
                   name='conv4',
                   dilation_rate=2,
                   kernel_initializer='he_normal')(inner)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)

    # CNN reshape
    inner = Reshape(target_shape=(125, 2560), name='reshape')(inner)
    inner = Dense(256,
                  activation='relu',
                  kernel_initializer='he_normal',
                  name='dense1')(inner)

    # Multi-head attention layer
    inner = MultiHeadAttention(head_num=2, name='Multi-Head')(inner)
    inner = Lambda(lambda xin: K.sum(xin, axis=1))(inner)

    # Gradient Reversal Layer
    Flip = GradientReversal(hp_lambda=0.31)
    dann_in = Flip(inner)
    dann_out = Dense(units=n_speaker,
                     activation='softmax',
                     name='gradient_reversal')(dann_in)

    # transform the pooled attention output into state activations:
    predictions = Dense(units=n_states,
                        activation='softmax',
                        name='output_layer')(inner)  # (None, n_states)

    model = Model(inputs=inputs, outputs=[predictions, dann_out])
    adam = optimizers.Adam(lr=0.00001)
    model.compile(optimizer=adam,
                  loss={
                      'output_layer': 'categorical_crossentropy',
                      'gradient_reversal': 'categorical_crossentropy'
                  },
                  loss_weights={
                      'output_layer': 0.997,
                      'gradient_reversal': 0.003
                  },
                  metrics=['accuracy'])
    model.summary()
    return model
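
As a usage note, the hard-coded Reshape to (125, 2560) pins down the expected input size: two 2x2 max-pools divide the time axis by 4 (125 implies 500 input frames), and the 256 conv4 channels times a frequency axis of 10 give 2560 (implying 40 frequency bins). A hedged usage sketch under that reading; the concrete feature dimensions are illustrative, not taken from the source.

# hypothetical input: 500 frames x 40 filterbank bins x 1 channel, consistent
# with the Reshape to (125, 2560) after two 2x2 max-pools and 256 conv4 channels
model = build_Model(input_shape=(500, 40, 1), n_states=2, n_speaker=40)

# training supplies labels for both output heads, e.g.:
# model.fit(x, {'output_layer': y_states, 'gradient_reversal': y_speaker}, batch_size=32)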