def global_attention(query):
    # Linear map of the decoder query into the attention vector space.
    y = Linear(query, global_attention_vec_size, True)
    y = y.view(-1, 1, 1, global_attention_vec_size)
    # Attention mask is a softmax of v_g^{\top} * tanh(...).
    s = torch.sum(global_v * torch.tanh(global_hidden_features + y), dim=[1, 3])
    a = torch.softmax(s, dim=-1)
    return a
def local_attention(query):
    # Linear map of the decoder query into the attention vector space.
    y = Linear(query, local_attention_vec_size, True)
    y = y.view(-1, 1, 1, local_attention_vec_size)
    # Attention mask is a softmax of v_l^{\top} * tanh(...).
    s = torch.sum(local_v * torch.tanh(local_hidden_features + y), dim=[1, 3])
    # Attention weights, i.e., alpha in eq. [2].
    a = torch.softmax(s, dim=-1)
    return a
def attention(query):
    # Linear map of the decoder query into the attention vector space.
    y = Linear(query, attention_vec_size, True)
    y = y.view(-1, 1, 1, attention_vec_size)
    # Attention mask is a softmax of v_d^{\top} * tanh(...).
    s = torch.sum(v * torch.tanh(hidden_features + y), dim=[1, 3])
    # Attention weights, i.e., gamma in eq. [7].
    a = torch.softmax(s, dim=-1)
    # Attention-weighted context vector, eq. [8].
    d = torch.sum(a.view(-1, 1, attn_length, 1) * hidden, dim=[2, 3])
    return d.view(-1, attn_size)
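# All three functions above share the additive (Bahdanau-style) scoring pattern
# score = v^T tanh(W_h * h + W_q * q), applied against precomputed tensors
# (hidden, hidden_features, v, Linear, attn_length, attn_size, ...) that are
# defined elsewhere in this file. The sketch below is a minimal, self-contained
# illustration of the same pattern with explicit shapes; the class and variable
# names (AdditiveAttention, encoder_outputs, ...) are illustrative only and not
# part of the original code.
import torch
import torch.nn as nn


class AdditiveAttention(nn.Module):
    def __init__(self, query_size, memory_size, attention_vec_size):
        super().__init__()
        self.query_proj = nn.Linear(query_size, attention_vec_size, bias=True)
        self.memory_proj = nn.Linear(memory_size, attention_vec_size, bias=False)
        self.v = nn.Parameter(torch.randn(attention_vec_size))

    def forward(self, query, encoder_outputs):
        # query:           [batch, query_size]
        # encoder_outputs: [batch, attn_length, memory_size]
        y = self.query_proj(query).unsqueeze(1)           # [batch, 1, vec_size]
        features = self.memory_proj(encoder_outputs)      # [batch, attn_length, vec_size]
        s = torch.sum(self.v * torch.tanh(features + y), dim=-1)  # [batch, attn_length]
        a = torch.softmax(s, dim=-1)                      # attention mask
        # Attention-weighted context vector.
        d = torch.sum(a.unsqueeze(-1) * encoder_outputs, dim=1)   # [batch, memory_size]
        return a, d


# Example usage with toy shapes.
attn = AdditiveAttention(query_size=16, memory_size=32, attention_vec_size=24)
mask, context = attn(torch.randn(4, 16), torch.randn(4, 10, 32))
print(mask.shape, context.shape)  # torch.Size([4, 10]) torch.Size([4, 32])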