Example #1
    def decode(self, ys, memory, training=True):
        '''
        ys: tuple of (decoder_inputs, y, seqlens, sents2)
        memory: encoder outputs. (N, T1, hidden_units)
        training: boolean. Dropout is applied only when this is True.

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, hidden_units)
            dec *= self.hp.num_units ** 0.5  # scale embeddings by sqrt(num_units)

            dec += positional_encoding(dec, self.hp.maxlen)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (note that causality must be True here)
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=True,
                                              scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False,
                                              scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.num_units])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (hidden_units, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights) # (N, T2, vocab_size)
        # zero out the logits at vocab index 1 (assumed to be <unk>)
        logits_first = tf.expand_dims(logits[:,:,0], 2)
        zeros = tf.zeros_like(logits_first)
        logits = tf.concat([logits_first, zeros, logits[:,:,2:]], axis=2)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
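
Both the decoder above and the encoder in the next example add `positional_encoding(...)` to the scaled embeddings. The helper itself is not shown on this page; the snippet below is a minimal NumPy sketch of the standard sinusoidal encoding such a helper conventionally computes (the function name and signature are illustrative assumptions, not the repository's code).

    import numpy as np

    def sinusoidal_positional_encoding(maxlen, d_model):
        # PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        # PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
        pos = np.arange(maxlen)[:, None]             # (maxlen, 1)
        i = np.arange(d_model)[None, :]              # (1, d_model)
        angle = pos / np.power(10000.0, (2 * (i // 2)) / d_model)
        pe = np.zeros((maxlen, d_model))
        pe[:, 0::2] = np.sin(angle[:, 0::2])         # even dimensions
        pe[:, 1::2] = np.cos(angle[:, 1::2])         # odd dimensions
        return pe                                    # broadcast-added to (N, T, d_model) embeddings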
Example #2
    def encode(self, xs, training=True):
        '''
        xs: tuple of (x, seqlens, sents1)
        training: boolean. Dropout is applied only when this is True.

        Returns
        memory: encoder outputs. (N, T1, hidden_units)
        sents1: (N,). string.
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, hidden_units)
            enc *= self.hp.num_units**0.5  # scale embeddings by sqrt(num_units)

            enc += positional_encoding(enc, self.hp.maxlen)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.num_units])
        memory = enc
        return memory, sents1
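
The decoder's first attention call differs from the encoder's only in `causality=True`, which masks out future positions so each step can attend only to itself and earlier steps. Below is a single-head NumPy sketch of scaled dot-product attention with an optional causal mask (a hypothetical helper for illustration, not the repository's `multihead_attention`):

    import numpy as np

    def scaled_dot_product_attention(q, k, v, causal=False):
        # q: (T_q, d), k and v: (T_k, d); single head, no dropout
        d = q.shape[-1]
        scores = q @ k.T / np.sqrt(d)                     # (T_q, T_k) similarity logits
        if causal:
            # forbid attending to future positions (self-attention, so T_q == T_k)
            mask = np.triu(np.ones_like(scores), k=1).astype(bool)
            scores = np.where(mask, -1e9, scores)
        weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
        weights /= weights.sum(axis=-1, keepdims=True)    # softmax over keys
        return weights @ v                                # (T_q, d)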
Example #3
    def __init__(self, hidden_size, num_heads, dropout_rate=0.5):
        super(TransformerLayer, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate
        # masked (causal) self-attention sub-layer
        self.SelfAttention = multihead_attention(
            num_units=self.hidden_size,
            num_heads=self.num_heads,
            dropout_rate=self.dropout_rate,
            causality=True,
            with_qk=False,
            hidden_size=self.hidden_size)
        # position-wise feed-forward sub-layer
        self.ff = feedforward(num_units=[self.hidden_size, self.hidden_size],
                              dropout_rate=self.dropout_rate)
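
Every block in all three examples ends with a position-wise feed-forward sub-layer (`ff` / `feedforward`). Assuming the standard two-layer ReLU form with a residual connection (layer normalization and dropout omitted), a minimal NumPy sketch looks like this; the function and weight names are illustrative, not the repository's:

    import numpy as np

    def position_wise_ff(x, w1, b1, w2, b2):
        # x: (N, T, d_model), w1: (d_model, d_ff), w2: (d_ff, d_model)
        # the same two dense layers are applied independently at every time step
        inner = np.maximum(0.0, x @ w1 + b1)   # (N, T, d_ff), ReLU
        out = inner @ w2 + b2                  # (N, T, d_model)
        return x + out                         # residual connection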