def forward(self, input: Tensor, target: LongTensor) -> Tensor:  # type: ignore
    """
    hidden :: [len*bsz x d_proj]
    target :: [len*bsz]
    """
    input_shape = input.size()
    input = input.contiguous().view(-1, input_shape[-1])
    target = target.contiguous().view(-1)
    if input.size(0) != target.size(0):
        raise RuntimeError('Input and target should have the same size '
                           'in the batch dimension.')

    if self.n_clusters == 0:
        # Single softmax over the full vocabulary. The logits are unnormalized,
        # so use cross_entropy (log_softmax + NLL), matching the cluster branch below.
        logits = self._compute_logits(input, self.out_layers[0].weight,
                                      self.out_layers[0].bias, self.out_projs[0])
        nll = F.cross_entropy(logits, target, reduction='none')
    else:
        weights, biases = self._construct_weights()
        head_weight, head_bias = weights[0], biases[0]
        head_proj = self.out_projs[0] if len(self.out_projs) > 0 else None

        head_logits = self._compute_logits(input, head_weight, head_bias, head_proj)
        head_log_probs = F.log_softmax(head_logits, dim=1)

        # Positions whose target falls into each tail cluster.
        nonzero_indices: List[torch.LongTensor] = [
            ((target >= l) & (target < r)).nonzero().squeeze(-1)
            for l, r in zip(self.cutoffs[:-1], self.cutoffs[1:])
        ]

        # Remap tail-cluster targets to their cluster token in the head softmax.
        head_indices: LongTensor = target.clone()
        for idx, indices in enumerate(nonzero_indices):
            if indices.numel() == 0:
                continue
            index = self.shortlist_size + self.n_clusters - 1 - idx
            head_indices.index_fill_(0, indices, index)

        head_nll = F.nll_loss(head_log_probs, head_indices, reduction='none')

        # Add the within-cluster NLL for every position routed to a tail cluster.
        for idx, indices in enumerate(nonzero_indices):
            if indices.numel() == 0:
                continue
            weight_i, bias_i = weights[idx + 1], biases[idx + 1]
            proj_i = self.out_projs[idx + 1] if len(self.out_projs) > idx + 1 else None

            cluster_hidden = input.index_select(0, indices)
            cluster_target = target.index_select(0, indices) - self.cutoffs[idx]
            cluster_logits = self._compute_logits(cluster_hidden, weight_i, bias_i, proj_i)
            cluster_nll = F.cross_entropy(cluster_logits, cluster_target, reduction='none')

            tail_nll = torch.zeros_like(head_nll)
            tail_nll.index_copy_(0, indices, cluster_nll)
            head_nll = head_nll + tail_nll

        nll = head_nll

    nll = nll.view(input_shape[:-1])
    return nll
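# Usage sketch for the adaptive-softmax forward above (a minimal example, not from the
# original code): `ProjectedAdaptiveLogSoftmax` stands in for whatever the enclosing
# module is called, and its constructor arguments are assumptions made for illustration.
import torch

d_proj, vocab_size, seq_len, bsz = 512, 20000, 8, 4
criterion = ProjectedAdaptiveLogSoftmax(vocab_size, d_proj, cutoffs=[2000, 10000])  # hypothetical ctor
hidden = torch.randn(seq_len * bsz, d_proj)              # [len*bsz x d_proj] decoder states
target = torch.randint(0, vocab_size, (seq_len * bsz,))  # [len*bsz] gold token ids
nll = criterion(hidden, target)                          # per-token negative log-likelihood
loss = nll.mean()                                        # scalar for backprop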
def _get_loss(logits: torch.FloatTensor,
              targets: torch.LongTensor,
              target_mask: torch.LongTensor) -> torch.Tensor:
    # Drop the leading start-of-sequence position from targets and mask.
    relevant_targets = targets[:, 1:].contiguous()
    relevant_mask = target_mask[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
    # return my_sequence_cross_entropy_with_logits(logits.contiguous(), relevant_targets, relevant_mask)
    return util.sequence_cross_entropy_with_logits(logits.contiguous(), relevant_targets, relevant_mask)
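# Usage sketch for _get_loss above (illustrative only): shapes follow AllenNLP's
# util.sequence_cross_entropy_with_logits, where logits cover each decoding step and
# targets / target_mask carry one extra leading position that the [:, 1:] slice drops.
import torch

batch_size, num_decoding_steps, num_classes = 4, 12, 3000
logits = torch.randn(batch_size, num_decoding_steps, num_classes)
targets = torch.randint(0, num_classes, (batch_size, num_decoding_steps + 1))
target_mask = torch.ones(batch_size, num_decoding_steps + 1)
loss = _get_loss(logits, targets, target_mask)  # scalar loss averaged over the batch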
def wrap(b: torch.LongTensor):
    if b is None:
        return b
    # Stack a list of per-example tensors into a single batch tensor.
    if isinstance(b, list):
        b = torch.stack(b, 0)
    b = b.contiguous()
    if self.cuda:
        b = b.cuda()
    # Variable/volatile are legacy (pre-0.4) PyTorch; kept to match the surrounding code.
    b = Variable(b, volatile=self.volatile, requires_grad=False)
    return b
def _get_loss(
    logits: torch.FloatTensor, targets: torch.LongTensor, target_mask: torch.FloatTensor
) -> torch.Tensor:
    logits = logits.contiguous()
    # shape: (batch_size, num_decoding_steps)
    relevant_targets = targets.contiguous()
    # shape: (batch_size, num_decoding_steps)
    relevant_mask = target_mask.contiguous()
    return util.sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
def _get_loss_custom(logits: torch.FloatTensor,
                     targets: torch.LongTensor,
                     target_mask: torch.LongTensor,
                     training: bool = True) -> torch.Tensor:
    """
    As opposed to get_loss, logits and targets are of the same size.
    """
    relevant_targets = targets.contiguous()   # (batch_size, num_decoding_steps)
    relevant_mask = target_mask.contiguous()  # (batch_size, num_decoding_steps)
    # loss = util.sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
    if training:
        loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
    else:
        # During evaluation, keep the per-example loss instead of a batch average.
        loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask,
                                                  average=None)
    return loss
def get_rseq(self, rel: torch.LongTensor, tem: torch.LongTensor):
    # Relation embedding: (bs, 1, emb_dim) so it can be prepended to the token sequence.
    r_e = self.embedding['rel'](rel)
    r_e = r_e.unsqueeze(0).transpose(0, 1)

    # Temporal token embeddings: (bs, tem_len, emb_dim).
    bs = tem.size(0)
    tem_len = tem.size(1)
    tem = tem.contiguous()
    tem = tem.view(bs * tem_len)
    token_e = self.embedding['tem'](tem)
    token_e = token_e.view(bs, tem_len, self.emb_dim)

    # Run the LSTM over [relation; temporal tokens] and use its output as the
    # time-aware relation-sequence embedding.
    seq_e = torch.cat((r_e, token_e), 1)
    hidden_tem = self.lstm(seq_e)
    hidden_tem = hidden_tem[0, :, :]
    rseq_e = hidden_tem

    return rseq_e
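# Usage sketch for get_rseq above (illustrative): `model` stands for an instance of the
# enclosing, unnamed temporal-KG embedding module; the vocabulary sizes and sequence
# length below are arbitrary assumptions, only the tensor shapes follow from the code.
import torch

batch_size, tem_len = 32, 8
rel = torch.randint(0, 100, (batch_size,))         # relation ids: (bs,)
tem = torch.randint(0, 32, (batch_size, tem_len))  # temporal token ids: (bs, tem_len)
rseq_e = model.get_rseq(rel, tem)                  # time-aware relation embedding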
def _get_loss(self, scores: torch.Tensor, targets: torch.LongTensor,
              generate_mask: torch.LongTensor, copy_mask: torch.LongTensor,
              target_mask: torch.LongTensor) -> torch.Tensor:
    """
    :param scores: (batch_size, decode_length, num_class + encode_length); expected to
        already be normalized probabilities, since the loss takes their log directly
    :param targets: (batch_size, decode_length + 1)
    :param generate_mask: (batch_size, decode_length + 1), where 1.0 indicates the target
        word is selected from the target vocabulary and 0.0 indicates it is copied from
        the entity candidates
    :param copy_mask: (batch_size, decode_length + 1, encode_length), where 1.0 indicates
        that the target word is copied from this source word
    :param target_mask: (batch_size, decode_length)
    :return: scalar loss averaged over the batch
    """
    batch_size, decode_length, _ = scores.size()
    # (batch_size, decode_length, num_class)
    generate_scores = scores[:, :, :self._num_classes]
    # (batch_size, decode_length, encode_length)
    copy_scores = scores[:, :, self._num_classes:]

    # shape: (batch_size * decode_length, 1)
    relevant_targets = targets[:, 1:].contiguous().view(-1, 1)
    target_generate_scores = torch.gather(generate_scores.view(-1, self._num_classes),
                                          dim=1, index=relevant_targets)
    target_scores = target_generate_scores.view(batch_size, decode_length)
    # Keep the generation probability where the target is generated, and add the total
    # copy probability mass assigned to the correct source positions otherwise.
    target_scores = target_scores * generate_mask[:, 1:]
    target_scores += (copy_scores * copy_mask[:, 1:, :].float()).sum(dim=-1)

    # shape: (batch_size, decode_length)
    relevant_mask = target_mask.contiguous().float()
    loss = -target_scores.log() * relevant_mask
    loss = loss.sum(dim=-1) / relevant_mask.sum(dim=-1)
    loss = loss.sum() / batch_size
    return loss
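# Usage sketch for the copy-mechanism loss above (illustrative): `model` is a hypothetical
# instance of the enclosing decoder; scores are assumed to already be probabilities over
# the joint generate+copy space, since the loss takes their log directly.
import torch

batch_size, decode_length, num_class, encode_length = 4, 10, 5000, 30
scores = torch.softmax(torch.randn(batch_size, decode_length, num_class + encode_length), dim=-1)
targets = torch.randint(0, num_class, (batch_size, decode_length + 1))
generate_mask = torch.ones(batch_size, decode_length + 1)              # every target generated here
copy_mask = torch.zeros(batch_size, decode_length + 1, encode_length)  # nothing copied here
target_mask = torch.ones(batch_size, decode_length)
loss = model._get_loss(scores, targets, generate_mask, copy_mask, target_mask)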