Example #1
    def forward(self, query, key, value, mask=None):
        """
        :param query: [batch_size, query_seq_len, query_dim]
        :param key: [batch_size, key_seq_len, key_dim]
        :param value: [batch_size, key_seq_len, value_dim]
        :param mask: [batch_size, key_seq_len]
            binary mask where the padding entries are set to 1
        :return: attn_vec: [batch_size, query_seq_len, value_dim] and attn_weights
            if self.return_attn_vec is set; otherwise attn_weights alone
        """
        # Raw attention scores: [batch_size, query_seq_len, key_seq_len]
        attn_weights = ops.matmul(query, key.transpose(1, 2))
        # Scale before masking so masked entries keep their full -HUGE_INT magnitude
        attn_weights /= np.sqrt(key.size(-1))
        if (query.size(1) == key.size(1)) and self.causal:
            # Upper-triangular mask blocks attention to future positions
            causal_mask = ops.fill_var_cuda((query.size(1), key.size(1)), 1).triu(1)
            attn_weights -= causal_mask.unsqueeze(0) * ops.HUGE_INT
        if mask is not None:
            # Fill padding positions with a large negative value so softmax zeroes them;
            # out-of-place masked_fill avoids mutating .data, which bypasses autograd
            attn_weights = attn_weights.masked_fill(
                mask.unsqueeze(1).expand_as(attn_weights).bool(), -ops.HUGE_INT)
        if self.return_normalized_weights:
            attn_weights = F.softmax(attn_weights, -1)

        if self.return_attn_vec:
            assert self.return_normalized_weights, 'attention vector requires softmax-normalized weights'
            # [batch_size, query_seq_len, value_dim]
            attn_vec = ops.matmul(attn_weights, self.dropout(value))
            return attn_vec, attn_weights
        else:
            return attn_weights
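
The example above implements scaled dot-product attention; ops.matmul, ops.fill_var_cuda, and ops.HUGE_INT belong to the surrounding project. A minimal self-contained sketch of the same pattern in plain PyTorch (the function name, the -inf mask value, and the test shapes below are illustrative assumptions, not part of the source):

import torch
import torch.nn.functional as F

def scaled_dot_product_attention(query, key, value, mask=None, causal=False):
    # query: [batch, q_len, dim]; key/value: [batch, k_len, dim]
    scores = torch.matmul(query, key.transpose(1, 2)) / key.size(-1) ** 0.5
    if causal and query.size(1) == key.size(1):
        # Upper triangle (excluding the diagonal) marks future positions
        future = torch.ones(query.size(1), key.size(1), device=query.device).triu(1).bool()
        scores = scores.masked_fill(future.unsqueeze(0), float('-inf'))
    if mask is not None:
        # mask: [batch, k_len], nonzero at padding positions
        scores = scores.masked_fill(mask.unsqueeze(1).bool(), float('-inf'))
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, value), weights

batch, q_len, k_len, dim = 2, 3, 4, 8
q, k, v = (torch.randn(batch, l, dim) for l in (q_len, k_len, k_len))
pad = torch.tensor([[0, 0, 0, 1], [0, 0, 1, 1]])  # 1 marks padding
vec, w = scaled_dot_product_attention(q, k, v, mask=pad)
print(vec.shape, w.shape)  # torch.Size([2, 3, 8]) torch.Size([2, 3, 4])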
Example #2
    def forward(self, query, key, value, mask=None):
        """
        :param query: [batch_size, query_seq_len, query_dim]
        :param key: [batch_size, key_seq_len, key_dim]
        :param value: [batch_size, key_seq_len, value_dim]
        :param mask: [batch_size, key_seq_len]
            binary mask where the padding entries are set to 1
        :return: attn_vec: [batch_size, query_seq_len, value_dim] and attn_weights
            if self.return_attn_vec is set; otherwise attn_weights alone
        """
        batch_size = query.size(0)
        query_seq_len = query.size(1)
        key_seq_len = key.size(1)
        tiled_seq_len = query_seq_len * key_seq_len
        # Enumerate every (query_i, key_j) pair: row i * key_seq_len + j of the
        # tiled tensors holds query_i alongside key_j
        tiled_query = query.unsqueeze(2).repeat(1, 1, key_seq_len, 1).view(batch_size, tiled_seq_len, -1)
        tiled_key = key.repeat(1, query_seq_len, 1)
        # Score each pair with a feed-forward network and reshape the scores back
        # to [batch_size, query_seq_len, key_seq_len]
        attn_weights = self.ffn(torch.cat([tiled_query, tiled_key], dim=2)).view(
            batch_size, query_seq_len, key_seq_len)

        # Scale before masking so masked entries keep their full -HUGE_INT magnitude
        attn_weights /= np.sqrt(key.size(-1))
        if (query.size(1) == key.size(1)) and self.causal:
            # Upper-triangular mask blocks attention to future positions
            causal_mask = ops.fill_var_cuda((query.size(1), key.size(1)), 1).triu(1)
            attn_weights -= causal_mask.unsqueeze(0) * ops.HUGE_INT
        if mask is not None:
            # Out-of-place masked_fill avoids mutating .data, which bypasses autograd
            attn_weights = attn_weights.masked_fill(
                mask.unsqueeze(1).expand_as(attn_weights).bool(), -ops.HUGE_INT)
        attn_weights = F.softmax(attn_weights, -1)

        if self.return_attn_vec:
            # [batch_size, query_seq_len, value_dim]
            attn_vec = ops.matmul(attn_weights, self.dropout(value))
            return attn_vec, attn_weights
        else:
            return attn_weights
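
Example #2 swaps the dot product for a feed-forward scorer over concatenated (query, key) pairs, in the spirit of additive/MLP attention. The tiling step is the non-obvious part; the sketch below (shapes are illustrative assumptions) checks that it enumerates exactly the pairs (query_i, key_j) by comparing against a broadcast-and-concatenate formulation:

import torch

batch, q_len, k_len, dim = 2, 3, 4, 5
query = torch.randn(batch, q_len, dim)
key = torch.randn(batch, k_len, dim)

# Row i * k_len + j of the tiled tensors pairs query_i with key_j
tiled_query = query.unsqueeze(2).repeat(1, 1, k_len, 1).view(batch, q_len * k_len, -1)
tiled_key = key.repeat(1, q_len, 1)
pairs = torch.cat([tiled_query, tiled_key], dim=2)  # [batch, q_len * k_len, 2 * dim]

# Same pairing via broadcasting, without the intermediate repeat copies
q_exp = query.unsqueeze(2).expand(batch, q_len, k_len, dim)
k_exp = key.unsqueeze(1).expand(batch, q_len, k_len, dim)
pairs_b = torch.cat([q_exp, k_exp], dim=3).view(batch, q_len * k_len, -1)

assert torch.equal(pairs, pairs_b)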