def _encoder_decoder_attention(self, q, k, v):
    # Decoder attends over the encoder output: q comes from the decoder,
    # while k and v come from the encoder, so no causal mask is needed.
    with tf.variable_scope("encoder-decoder-attention"):
        attention = Attention(num_heads=self.num_heads,
                              masked=False,
                              linear_key_dim=self.linear_key_dim,
                              linear_value_dim=self.linear_value_dim,
                              model_dim=self.model_dim,
                              dropout=self.dropout)
        return attention.multi_head(q, k, v)
def _masked_self_attention(self, q, k, v):
    # Decoder self-attention; masked=True requests a causal mask so a
    # position cannot attend to later positions.
    with tf.variable_scope("masked-self-attention"):
        attention = Attention(num_heads=self.num_heads,
                              masked=True,  # Not implemented yet
                              linear_key_dim=self.linear_key_dim,
                              linear_value_dim=self.linear_value_dim,
                              model_dim=self.model_dim,
                              dropout=self.dropout)
        return attention.multi_head(q, k, v)
def _self_attention(self, q, k, v, key_masks):
    # Encoder self-attention: q, k, and v are all derived from the same
    # sequence; key_masks blocks attention to padding positions.
    with tf.variable_scope("self-attention"):
        attention = Attention(num_heads=self.num_heads,
                              masked=False,
                              linear_key_dim=self.linear_key_dim,
                              linear_value_dim=self.linear_value_dim,
                              model_dim=self.model_dim,
                              max_seq_len=self.max_seq_len,
                              dropout=self.dropout)
        return attention.multi_head(q, k, v, key_masks)
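# For context, a minimal sketch of the Attention class these wrappers
# assume -- a hypothetical reconstruction in TF 1.x style, not the
# repository's actual implementation, which may differ. It illustrates
# what the constructor arguments above would control: projections to
# linear_key_dim/linear_value_dim split across num_heads, an optional
# causal mask when masked=True (sketched here even though the snippet
# above notes it is not implemented yet), padding masks via key_masks,
# dropout on the attention weights, and a final projection back to
# model_dim. max_seq_len is accepted for interface compatibility but
# unused in this sketch.
import tensorflow as tf


class Attention:
    def __init__(self, num_heads=8, masked=False, linear_key_dim=512,
                 linear_value_dim=512, model_dim=512, max_seq_len=None,
                 dropout=0.1):
        assert linear_key_dim % num_heads == 0
        assert linear_value_dim % num_heads == 0
        self.num_heads = num_heads
        self.masked = masked
        self.linear_key_dim = linear_key_dim
        self.linear_value_dim = linear_value_dim
        self.model_dim = model_dim
        self.dropout = dropout

    def multi_head(self, q, k, v, key_masks=None):
        # Linear projections, then split into heads: [B, h, T, d].
        q = tf.layers.dense(q, self.linear_key_dim, use_bias=False)
        k = tf.layers.dense(k, self.linear_key_dim, use_bias=False)
        v = tf.layers.dense(v, self.linear_value_dim, use_bias=False)
        q, k, v = (self._split_heads(x) for x in (q, k, v))

        # Scaled dot-product attention logits: [B, h, T_q, T_k].
        d_k = self.linear_key_dim // self.num_heads
        logits = tf.matmul(q, k, transpose_b=True) / (d_k ** 0.5)

        if self.masked:
            # Causal mask: position i may only attend to j <= i
            # (assumes T_q == T_k, true for self-attention).
            t = tf.shape(logits)[-1]
            causal = tf.linalg.band_part(tf.ones([t, t]), -1, 0)
            logits += (1.0 - causal) * -1e9
        if key_masks is not None:
            # key_masks: [B, T_k], 1 for real tokens, 0 for padding.
            pad = tf.cast(key_masks, tf.float32)[:, tf.newaxis, tf.newaxis, :]
            logits += (1.0 - pad) * -1e9

        weights = tf.nn.dropout(tf.nn.softmax(logits),
                                keep_prob=1.0 - self.dropout)
        out = self._merge_heads(tf.matmul(weights, v))  # [B, T_q, d_v]
        return tf.layers.dense(out, self.model_dim)     # [B, T_q, model_dim]

    def _split_heads(self, x):
        # [B, T, D] -> [B, h, T, D/h]
        s = tf.shape(x)
        depth = x.get_shape().as_list()[-1] // self.num_heads
        x = tf.reshape(x, [s[0], s[1], self.num_heads, depth])
        return tf.transpose(x, [0, 2, 1, 3])

    def _merge_heads(self, x):
        # [B, h, T, d] -> [B, T, h*d]
        x = tf.transpose(x, [0, 2, 1, 3])
        s = tf.shape(x)
        depth = x.get_shape().as_list()[-1] * self.num_heads
        return tf.reshape(x, [s[0], s[1], depth])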