def decode(self, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, hidden_units)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, hidden_units)
        dec *= self.hp.num_units ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True here)
                dec = multihead_attention(queries=dec,
                                          keys=dec,
                                          values=dec,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=True,
                                          scope="self_attention")

                # Vanilla attention over the encoder outputs
                dec = multihead_attention(queries=dec,
                                          keys=memory,
                                          values=memory,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False,
                                          scope="vanilla_attention")

                # Feed forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.num_units])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (hidden_units, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)

        # zero out the logits at index 1 (unk) so that slot carries no score
        logits_first = tf.expand_dims(logits[:, :, 0], 2)
        zeros = tf.zeros_like(logits_first)
        logits = tf.concat([logits_first, zeros, logits[:, :, 2:]], axis=2)

        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
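# Hedged sketch, not part of the original file: one plausible way the logits returned
# by decode() could feed a training objective (label-smoothed cross entropy with the
# padding positions masked out). The pad id of 0, the smoothing factor, and the helper
# name _example_loss are assumptions, not taken from this repo. Assumes `tensorflow as tf`
# is already imported, as in the surrounding file.
def _example_loss(logits, y, vocab_size, pad_id=0, epsilon=0.1):
    # one-hot targets with label smoothing: (1 - eps) * one_hot + eps / V
    y_one_hot = tf.one_hot(y, depth=vocab_size)
    y_smoothed = (1.0 - epsilon) * y_one_hot + epsilon / vocab_size
    # per-position cross entropy, shape (N, T2)
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_smoothed, logits=logits)
    # mask out padding positions so they do not contribute to the loss
    nonpadding = tf.to_float(tf.not_equal(y, pad_id))  # (N, T2)
    return tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)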
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, hidden_units)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, hidden_units)
        enc *= self.hp.num_units ** 0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.num_units])

        memory = enc
        return memory, sents1
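# Hedged sketch, an assumption rather than this repo's own helper: a standard sinusoidal
# positional encoding with the signature used above, i.e. it takes the embedded inputs
# and a maximum length and returns encodings of shape (N, T, E). Assumes `numpy as np`
# and `tensorflow as tf` are imported and that the last dimension of `inputs` is static.
def positional_encoding_sketch(inputs, maxlen, scope="positional_encoding"):
    E = inputs.get_shape().as_list()[-1]              # static hidden size
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]   # dynamic batch and time
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # position indices, tiled over the batch: (N, T)
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])
        # PE[pos, 2i]   = sin(pos / 10000^(2i/E))
        # PE[pos, 2i+1] = cos(pos / 10000^(2i/E))
        position_enc = np.array(
            [[pos / np.power(10000, (i - i % 2) / E) for i in range(E)]
             for pos in range(maxlen)])
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])
        position_enc = tf.convert_to_tensor(position_enc, tf.float32)  # (maxlen, E)
        # gather the encoding for each position in the batch: (N, T, E)
        outputs = tf.nn.embedding_lookup(position_enc, position_ind)
        return outputs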
def __init__(self, hidden_size, num_heads, dropout_rate=0.5):
    super(TransformerLayer, self).__init__()
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.dropout_rate = dropout_rate
    self.SelfAttention = multihead_attention(num_units=self.hidden_size,
                                             num_heads=self.num_heads,
                                             dropout_rate=self.dropout_rate,
                                             causality=True,
                                             with_qk=False,
                                             hidden_size=self.hidden_size)
    self.ff = feedforward(num_units=[self.hidden_size, self.hidden_size],
                          dropout_rate=self.dropout_rate)
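# Hedged usage sketch (assumption, not from this snippet): constructing the layer with
# illustrative hyperparameters. hidden_size should be divisible by num_heads so the
# multi-head attention can split the representation evenly across heads; the forward
# pass that chains self.SelfAttention and self.ff is defined elsewhere in the class.
# layer = TransformerLayer(hidden_size=128, num_heads=4, dropout_rate=0.2)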