def loss(self, input: Tensor, target: Tensor, mask: Tensor = None) -> Tuple[Tensor, Tensor]:
    """
    Compute the negative log-likelihood loss of the CRF.

    Args:
        input: Tensor
            the input tensor with shape = [batch, length, input_size]
        target: Tensor
            the tensor of target labels with shape = [batch, length]
        mask: Tensor or None
            the mask tensor with shape = [batch, length]

    Returns: Tuple[Tensor, Tensor]
        a 1D tensor of shape [batch] with the negative log-likelihood loss,
        and the energy tensor with shape = [batch, length, num_label, num_label]
    """
    batch, length, _ = input.size()
    energy = self.forward(input, mask=mask)
    # shape = [length, batch, num_label, num_label]
    energy_transpose = energy.transpose(0, 1)
    # shape = [length, batch]
    target_transpose = target.transpose(0, 1)

    # shape = [batch, num_label]
    partition = None
    # shape = [batch]
    batch_index = torch.arange(0, batch).type_as(input).long()
    prev_label = input.new_full((batch, ), self.index_bos).long()
    tgt_energy = input.new_zeros(batch)

    for t in range(length):
        # shape = [batch, num_label, num_label]
        curr_energy = energy_transpose[t]
        if t == 0:
            # transitions out of the BOS label
            partition = curr_energy[:, self.index_bos, :]
        else:
            # forward recursion over the previous labels
            # shape = [batch, num_label]
            partition = logsumexp(curr_energy + partition.unsqueeze(2), dim=1)
        label = target_transpose[t]
        # accumulate the score of the gold transition (prev_label -> label) at step t
        tgt_energy += curr_energy[batch_index, prev_label, label]
        prev_label = label

    # add the transition into EOS, then loss = log Z - gold score
    # (note: .data detaches the EOS transition column from autograd)
    return logsumexp(self.trans_matrix.data[:, self.index_eos].unsqueeze(0) + partition, dim=1) - tgt_energy, energy
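# A minimal, self-contained sketch of the forward recursion used in loss() to build
# the log-partition term, checked against brute-force enumeration of all label paths.
# It uses torch.logsumexp in place of the module's logsumexp helper, drops the batch
# dimension and the EOS transition, and the tiny sizes are made up for illustration.
import itertools
import torch

num_label, length, index_bos = 3, 4, 0
# energy[t, i, j]: score of being in label j at step t after label i at step t - 1
energy = torch.randn(length, num_label, num_label)

# forward recursion: partition[j] = log-sum of the scores of all prefixes ending in j
partition = energy[0, index_bos, :]
for t in range(1, length):
    partition = torch.logsumexp(energy[t] + partition.unsqueeze(1), dim=0)
log_z = torch.logsumexp(partition, dim=0)

# brute force over every label sequence of length `length`, starting from BOS
scores = []
for path in itertools.product(range(num_label), repeat=length):
    s = energy[0, index_bos, path[0]]
    for t in range(1, length):
        s = s + energy[t, path[t - 1], path[t]]
    scores.append(s)
assert torch.allclose(log_z, torch.logsumexp(torch.stack(scores), dim=0))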
def attention(query: Tensor, key: Tensor, value: Tensor, mask: Tensor = None, dropout=None):
    """
    scaled dot-product attention
    @param query   shape -> [batch_size, head_count, max_length, model_dim_size / head_count]
    @param key     shape -> [batch_size, head_count, max_length, model_dim_size / head_count]
    @param value   shape -> [batch_size, head_count, max_length, model_dim_size / head_count]
    @param mask    optional mask; positions where mask == 0 are excluded from attention
    @param dropout optional dropout module applied to the attention weights
    @return the attended values and the attention weights
    """
    d_k = query.size(-1)
    # shape = [batch_size, head_count, max_length, max_length]
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # masked_fill is not in-place, so the result must be reassigned
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attention = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attention = dropout(p_attention)
    return torch.matmul(p_attention, value), p_attention
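# A minimal sketch of calling attention() directly with the usual multi-head layout
# [batch_size, head_count, max_length, d_k]; the concrete sizes, the padding-mask
# shape, and the nn.Dropout module are illustrative assumptions, not taken from this file.
import torch
import torch.nn as nn

q = torch.randn(2, 8, 16, 64)        # [batch_size, head_count, max_length, d_k]
k = torch.randn(2, 8, 16, 64)
v = torch.randn(2, 8, 16, 64)
pad_mask = torch.ones(2, 1, 1, 16)   # broadcasts over heads and query positions
out, weights = attention(q, k, v, mask=pad_mask, dropout=nn.Dropout(0.1))
# out: [2, 8, 16, 64], weights: [2, 8, 16, 16]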
def forward(self, input: Tensor) -> Tensor:
    # input shape = [batch, length, hidden_size]
    # move the hidden dimension into dropout2d's channel slot so whole feature
    # channels are dropped for every time step, then transpose back
    return f.dropout2d(input.transpose(1, 2), self.p, self.training, self.inplace).transpose(1, 2)
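# A minimal sketch of a module built around the forward above ("feature dropout":
# zeroing whole hidden channels for every time step of a sequence). The class name
# FeatureDropout and its constructor are assumptions; unlike the forward above, the
# sketch adds a trailing singleton dimension so dropout2d always receives a 4-D
# tensor (recent PyTorch versions warn on 3-D inputs and treat them as unbatched).
import torch
import torch.nn as nn
import torch.nn.functional as f


class FeatureDropout(nn.Module):
    def __init__(self, p: float = 0.5, inplace: bool = False):
        super().__init__()
        self.p = p
        self.inplace = inplace

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # [batch, length, hidden] -> [batch, hidden, length, 1] so the hidden
        # dimension sits in dropout2d's channel slot, then undo the reshaping
        x = input.transpose(1, 2).unsqueeze(-1)
        x = f.dropout2d(x, self.p, self.training, self.inplace)
        return x.squeeze(-1).transpose(1, 2)


x = torch.randn(4, 20, 128)      # [batch, length, hidden]
y = FeatureDropout(p=0.33)(x)    # each zeroed channel is zero at all 20 time steps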