    def loss(self,
             input: Tensor,
             target: Tensor,
             mask: Tensor = None) -> Tuple[Tensor, Tensor]:
        """
        Args:
            input: Tensor
                the input tensor with shape = [batch, length, input_size]
            target: Tensor
                the tensor of target labels with shape [batch, length]
            mask: Tensor or None
                the mask tensor with shape = [batch, length]

        Returns: Tensor
                A 1D tensor for minus log likelihood loss
        """
        batch, length, _ = input.size()
        energy = self.forward(input, mask=mask)
        # shape = [length, batch, num_label, num_label]
        energy_transpose = energy.transpose(0, 1)
        # shape = [length, batch]
        target_transpose = target.transpose(0, 1)

        # shape = [batch, num_label]
        partition = None

        # shape = [batch]
        batch_index = torch.arange(0, batch).type_as(input).long()
        prev_label = input.new_full((batch, ), self.index_bos).long()
        tgt_energy = input.new_zeros(batch)

        for t in range(length):
            # shape = [batch, num_label, num_label]
            curr_energy = energy_transpose[t]
            if t == 0:
                partition = curr_energy[:, self.index_bos, :]
            else:
                # shape = [batch, num_label]
                partition = logsumexp(curr_energy + partition.unsqueeze(2),
                                      dim=1)
            label = target_transpose[t]
            tgt_energy += curr_energy[batch_index, prev_label, label]
            prev_label = label

        return logsumexp(
            self.trans_matrix.data[:, self.index_eos].unsqueeze(0) + partition,
            dim=1) - tgt_energy, energy
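The loss above relies on a logsumexp helper for the forward-algorithm recursion over partition and for the final termination step. Below is a minimal, numerically stable sketch; the name and signature are inferred from the call sites above, not taken from the original source.
from torch import Tensor


def logsumexp(x: Tensor, dim: int) -> Tensor:
    # Numerically stable log-sum-exp: subtract the per-slice maximum before
    # exponentiating so exp() cannot overflow, then add it back after the log.
    max_x, _ = x.max(dim=dim, keepdim=True)
    return (x - max_x).exp().sum(dim=dim).log() + max_x.squeeze(dim)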
Example #2
import math

import torch
import torch.nn.functional as F
from torch import Tensor


def attention(query: Tensor,
              key: Tensor,
              value: Tensor,
              mask: Tensor = None,
              dropout=None):
    """
    scaled dot production attention
    @param query shape -> batch_size, head_count, max_length, model_dim_size/head_count
    @param key shape -> batch_size, head_count, max_length, model_dim_size/head_count
    @param value shape -> batch_size, head_count, max_length, model_dim_size/head_count

    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # masked_fill is not in-place; keep the returned tensor
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attention = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attention = dropout(p_attention)
    return torch.matmul(p_attention, value), p_attention
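A small usage sketch under assumed shapes (the sizes below are made up purely for illustration), showing the tensor layout the docstring describes:
batch_size, head_count, max_length, d_k = 2, 8, 10, 64
query = torch.randn(batch_size, head_count, max_length, d_k)
key = torch.randn(batch_size, head_count, max_length, d_k)
value = torch.randn(batch_size, head_count, max_length, d_k)
# A padding mask that broadcasts against the score tensor of shape
# [batch_size, head_count, max_length, max_length]; zeros mark padded keys.
pad_mask = torch.ones(batch_size, 1, 1, max_length, dtype=torch.long)
context, weights = attention(query, key, value, mask=pad_mask)
# context: [batch_size, head_count, max_length, d_k]
# weights: [batch_size, head_count, max_length, max_length]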
    def forward(self, input: Tensor) -> Tensor:
        # Swap dims 1 and 2, apply 2D (channel-wise) dropout, and swap back,
        # so each dropped channel is zeroed across the whole sequence.
        return f.dropout2d(input.transpose(1, 2), self.p, self.training,
                           self.inplace).transpose(1, 2)
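This forward reads self.p and self.inplace, so it presumably sits inside a small dropout module. A hypothetical wrapper, just to make the snippet self-contained; the class name and constructor are assumptions, and torch.nn.functional is imported as f to match the call above.
import torch.nn as nn
import torch.nn.functional as f
from torch import Tensor


class ChannelDropout(nn.Module):
    # Hypothetical container: it only stores the p and inplace attributes
    # that forward() reads from self.
    def __init__(self, p: float = 0.5, inplace: bool = False):
        super().__init__()
        self.p = p
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return f.dropout2d(input.transpose(1, 2), self.p, self.training,
                           self.inplace).transpose(1, 2)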