def weighted_smooth_l1_loss(input, target, weights, size_average=None, reduce=None, reduction="mean"):
    # type: (Tensor, Tensor, Tensor, Optional[bool], Optional[bool], str) -> Tensor
    r"""Function that uses a squared term if the absolute
    element-wise error falls below 1 and an L1 term otherwise.

    See :class:`~torch.nn.SmoothL1Loss` for details.
    """
    if target.size() != input.size():
        warnings.warn(
            "Using a target size ({}) that is different to the input size ({}). "
            "This will likely lead to incorrect results due to broadcasting. "
            "Please ensure they have the same size.".format(target.size(), input.size()),
            stacklevel=2,
        )
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    # The native smooth_l1_loss kernel has no per-element weighting, so the
    # Python implementation is used regardless of target.requires_grad.
    ret = _weighted_smooth_l1_loss(input, target, weights)
    if reduction != "none":
        ret = torch.mean(ret) if reduction == "mean" else torch.sum(ret)
    return ret
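# `_weighted_smooth_l1_loss` is referenced above but not defined in this
# section. A minimal sketch, assuming the weights are per-element multipliers
# applied to the usual smooth-L1 terms (the helper's semantics are an
# assumption, not confirmed by the source):
def _weighted_smooth_l1_loss(input, target, weights):
    # type: (Tensor, Tensor, Tensor) -> Tensor
    t = torch.abs(input - target)
    # Quadratic below 1, linear above, scaled element-wise by `weights`.
    return weights * torch.where(t < 1, 0.5 * t ** 2, t - 0.5)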
def __init__(self, kernel_size: int = 11, kernel_sigma: float = 1.5, k1: float = 0.01, k2: float = 0.03,
             scale_weights: Optional[Union[Tuple[float, ...], List[float]]] = None,
             size_average: Optional[bool] = None, reduce: Optional[bool] = None,
             reduction: str = 'mean', data_range: Union[int, float] = 1.) -> None:
    super(MultiScaleSSIMLoss, self).__init__(size_average, reduce, reduction)

    # Generic loss parameters.
    self.size_average = size_average
    self.reduce = reduce
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    self.reduction = reduction

    # Loss-specific parameters.
    if scale_weights is None:
        scale_weights_from_ms_ssim_paper = [0.0448, 0.2856, 0.3001, 0.2363, 0.1333]
        scale_weights = scale_weights_from_ms_ssim_paper
    self.scale_weights_tensor = torch.tensor(scale_weights)
    self.kernel_size = kernel_size
    self.kernel_sigma = kernel_sigma
    self.k1 = k1
    self.k2 = k2
    self.data_range = data_range

    # Cache the kernel between calls.
    self.kernel = _fspecial_gauss_1d(kernel_size, kernel_sigma)
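# `_fspecial_gauss_1d` is used above but not defined in this section. A
# minimal sketch, assuming it builds the common normalized 1-D Gaussian
# window used for separable (M)S-SSIM convolutions (the name and the returned
# shape are assumptions):
def _fspecial_gauss_1d(size, sigma):
    # type: (int, float) -> Tensor
    coords = torch.arange(size, dtype=torch.float) - (size - 1) / 2.0
    g = torch.exp(-(coords ** 2) / (2 * sigma ** 2))
    g /= g.sum()  # normalize so the window sums to 1
    return g.reshape(1, 1, size)  # (out_channels, in_channels / groups, width)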
def ssim(input: Tensor, target: Tensor, max_val: float, filter_size: int = 11,
         k1: float = 0.01, k2: float = 0.03, sigma: float = 1.5,
         size_average=None, reduce=None, reduction: str = 'mean') -> Tensor:
    """Measures the structural similarity index (SSIM) error."""
    dim = input.dim()
    if dim != 4:
        raise ValueError('Expected 4 dimensions (got {})'.format(dim))

    if input.size() != target.size():
        raise ValueError(
            'Expected input size ({}) to match target size ({}).'.format(
                input.size(), target.size()))

    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)

    channel = input.size(1)
    kernel = _fspecial_gaussian(filter_size, channel, sigma,
                                device=input.device, dtype=input.dtype,
                                max_size=input.shape[-2:])
    ret, _ = _ssim(input, target, max_val, k1, k2, channel, kernel)

    if reduction != 'none':
        ret = torch.mean(ret) if reduction == 'mean' else torch.sum(ret)
    return ret
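# `_fspecial_gaussian` is used throughout this section but never defined. A
# minimal sketch of a per-channel 2-D Gaussian window; the `max_size` clamp
# (shrinking the window so it never exceeds the spatial extent) and the
# default arguments are assumptions inferred from the call sites:
def _fspecial_gaussian(size, channel, sigma, device=None, dtype=None, max_size=None):
    if max_size is not None:
        size = min(size, *max_size)  # keep the window within the image
    coords = torch.arange(size, device=device, dtype=dtype or torch.float) - (size - 1) / 2.0
    g = torch.exp(-(coords ** 2) / (2 * sigma ** 2))
    kernel = torch.outer(g, g)
    kernel /= kernel.sum()  # normalize so the window sums to 1
    # Depthwise-conv weight layout: (channel, 1, size, size).
    return kernel.expand(channel, 1, size, size)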
def cross_entropy(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean'):
    # type: (Tensor, Tensor, Optional[Tensor], Optional[bool], int, Optional[bool], str) -> Tensor
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
def SmoothL1Loss_custom(input, target, sigma=3.0, size_average=None, reduce=None, reduction='mean'):
    # type: (Tensor, Tensor, float, Optional[bool], Optional[bool], str) -> Tensor
    # With `reduce` left at None, the legacy branch below only fires when a
    # caller actually passes the deprecated args; a default of True would
    # silently override `reduction` on every call.

    def _smooth_l1_loss(input, target, sigma=3.0):
        # type: (Tensor, Tensor, float) -> Tensor
        # `sigma` follows the Fast R-CNN convention (assumed intent; it was
        # previously accepted but unused): the quadratic/linear transition
        # point sits at beta = 1 / sigma**2.
        beta = 1. / (sigma ** 2)
        t = torch.abs(input - target)
        return torch.where(t < beta, 0.5 * t ** 2 / beta, t - 0.5 * beta)

    if target.size() != input.size():
        warnings.warn(
            "Using a target size ({}) that is different to the input size ({}). "
            "This will likely lead to incorrect results due to broadcasting. "
            "Please ensure they have the same size.".format(target.size(), input.size()),
            stacklevel=2)
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    if target.requires_grad or sigma != 1.0:
        # The native kernel has a fixed transition point of 1, so any other
        # sigma must go through the Python implementation.
        ret = _smooth_l1_loss(input, target, sigma)
        if reduction != 'none':
            ret = torch.mean(ret) if reduction == 'mean' else torch.sum(ret)
    else:
        expanded_input, expanded_target = torch.broadcast_tensors(input, target)
        ret = torch._C._nn.smooth_l1_loss(expanded_input, expanded_target,
                                          _Reduction.get_enum(reduction))
    return ret
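# Hand-worked sanity check (values chosen here, not from the source). With
# sigma=1.0 the transition point is 1, so the loss is 0.5 * x**2 below 1 and
# |x| - 0.5 above it:
#   inp = torch.tensor([0.5, 2.0])
#   tgt = torch.zeros(2)
#   SmoothL1Loss_custom(inp, tgt, sigma=1.0, reduction='none')
#   # -> tensor([0.1250, 1.5000])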
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
    super(_Loss, self).__init__()
    if size_average is not None or reduce is not None:
        self.reduction = _Reduction.legacy_get_string(size_average, reduce)
    else:
        self.reduction = reduction
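# For reference, the legacy mapping used throughout this section: the
# deprecated `size_average`/`reduce` pair (both defaulting to True when None)
# collapses to a reduction string as follows:
#   _Reduction.legacy_get_string(None, None)   # -> 'mean'
#   _Reduction.legacy_get_string(False, None)  # -> 'sum'
#   _Reduction.legacy_get_string(None, False)  # -> 'none'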
def nll_loss(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean'):
    # type: (Tensor, Tensor, Optional[Tensor], Optional[bool], int, Optional[bool], str) -> Tensor
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    dim = input.dim()
    if dim < 2:
        raise ValueError('Expected 2 or more dimensions (got {})'.format(dim))

    if input.size(0) != target.size(0):
        raise ValueError(
            'Expected input batch_size ({}) to match target batch_size ({}).'.format(
                input.size(0), target.size(0)))
    if dim == 2:
        ret = torch._C._nn.nll_loss(input, target, weight,
                                    _Reduction.get_enum(reduction), ignore_index)
    elif dim == 4:
        ret = torch._C._nn.nll_loss2d(input, target, weight,
                                      _Reduction.get_enum(reduction), ignore_index)
    else:
        # dim == 3 or dim > 4
        n = input.size(0)
        c = input.size(1)
        out_size = (n,) + input.size()[2:]
        if target.size()[1:] != input.size()[2:]:
            raise ValueError('Expected target size {}, got {}'.format(out_size, target.size()))
        input = input.contiguous()
        target = target.contiguous()
        # Support empty batches, see #15870.
        if input.numel() > 0:
            input = input.view(n, c, 1, -1)
        else:
            input = input.view(n, c, 0, 0)
        if target.numel() > 0:
            target = target.view(n, 1, -1)
        else:
            target = target.view(n, 0, 0)
        reduction_enum = _Reduction.get_enum(reduction)
        if reduction != 'none':
            ret = torch._C._nn.nll_loss2d(input, target, weight, reduction_enum, ignore_index)
        else:
            out = torch._C._nn.nll_loss2d(input, target, weight, reduction_enum, ignore_index)
            ret = out.view(out_size)
    return ret
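# Hand-written example exercising the K-dimensional branch (dim == 3), where
# the inputs are flattened into a fake 2D problem for nll_loss2d:
#   inp = torch.randn(2, 5, 7).log_softmax(dim=1)  # batch 2, 5 classes, length 7
#   tgt = torch.randint(5, (2, 7))
#   nll_loss(inp, tgt)  # scalar mean loss over all 14 positions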
def ms_ssim(input: Tensor, target: Tensor, max_val: float, filter_size: int = 11,
            k1: float = 0.01, k2: float = 0.03, sigma: float = 1.5,
            size_average=None, reduce=None, reduction: str = 'mean') -> Tensor:
    """Measures the multi-scale structural similarity index (MS-SSIM) error."""
    dim = input.dim()
    if dim != 4:
        raise ValueError('Expected 4 dimensions (got {}) from input'.format(dim))

    if input.size() != target.size():
        raise ValueError(
            'Expected input size ({}) to match target size ({}).'.format(
                input.size(), target.size()))

    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)

    channel = input.size(1)
    kernel = _fspecial_gaussian(filter_size, channel, sigma,
                                device=input.device, dtype=input.dtype,
                                max_size=input.shape[-2:])
    weights = ms_weights(input.device).unsqueeze(-1).unsqueeze(-1)
    levels = weights.size(0)
    mssim = []
    mcs = []
    for i in range(levels):
        if i:
            # Downsample by 2 between scales.
            input = avg_pool2d(input, kernel_size=2, ceil_mode=True)
            target = avg_pool2d(target, kernel_size=2, ceil_mode=True)
            if min(size := input.shape[-2:]) <= filter_size:
                # Shrink the window once the image becomes smaller than it.
                kernel = _fspecial_gaussian(filter_size, channel, sigma,
                                            device=input.device, dtype=input.dtype,
                                            max_size=size)
        ssim, cs = _ssim(input, target, max_val, k1, k2, channel, kernel)
        ssim = ssim.mean((2, 3))
        cs = cs.mean((2, 3))
        mssim.append(ssim)
        mcs.append(cs)

    # The original snippet ended here without combining the scales; the
    # standard MS-SSIM combination is assumed: the contrast terms of the
    # coarser scales times the SSIM term of the final scale, each raised to
    # its per-scale weight.
    mssim = torch.stack(mssim)  # (levels, N, C)
    mcs = torch.stack(mcs)
    ret = torch.prod(mcs[:-1] ** weights[:-1], dim=0) * (mssim[-1] ** weights[-1])

    if reduction != 'none':
        ret = torch.mean(ret) if reduction == 'mean' else torch.sum(ret)
    return ret
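# `ms_weights` is referenced above but not defined in this section. A minimal
# sketch, assuming it returns the five per-scale weights from the original
# MS-SSIM paper (Wang et al., 2003), matching the defaults used by
# MultiScaleSSIMLoss earlier in this section:
def ms_weights(device):
    # type: (torch.device) -> Tensor
    return torch.tensor([0.0448, 0.2856, 0.3001, 0.2363, 0.1333], device=device)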
def smooth_ex_loss(input, target, size_average=None, reduce=None, reduction='mean'):
    if target.size() != input.size():
        warnings.warn("Using a target size ({}) that is different to the input size ({}). "
                      "This will likely lead to incorrect results due to broadcasting. "
                      "Please ensure they have the same size.".format(target.size(), input.size()),
                      stacklevel=2)
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    ret = _smooth_ex_loss(input, target)
    if reduction != 'none':
        ret = torch.mean(ret) if reduction == 'mean' else torch.sum(ret)
    return ret
def forward(self, input: torch.Tensor, target):
    reduction = self.reduction
    if self.size_average is not None or self.reduce is not None:
        reduction = _Reduction.legacy_get_string(self.size_average, self.reduce)
    # Compute log-probabilities directly and exponentiate, rather than taking
    # log(softmax(...)), which underflows for small probabilities.
    log_pt = F.log_softmax(input, dim=1)
    pt = log_pt.exp()
    # Focal weighting: down-weight well-classified examples by (1 - p_t) ** gamma.
    w = torch.pow(1 - pt, self.gamma)
    f_l = F.nll_loss(w * log_pt, target, self.weight, None, self.ignore_index, None, reduction)
    return f_l
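# Hand-worked check of the focal weighting (values chosen here, not from the
# source): with gamma = 2 and p_t = 0.9 for the true class, the example
# contributes (1 - 0.9)**2 * (-log(0.9)) ~= 0.01 * 0.105 ~= 0.00105, so
# well-classified examples are strongly down-weighted relative to plain
# cross-entropy, which would contribute the full 0.105.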
def smoothed_nll_loss(inputs, target, weight=None, ignore_index=-100, reduction='mean',
                      smooth_eps=None, smooth_dist=None, size_average=None, reduce=None):
    """Cross-entropy loss, with support for target distributions and label
    smoothing (https://arxiv.org/abs/1512.00567)."""
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    smooth_eps = smooth_eps or 0

    # Ordinary log-likelihood: defer to nll_loss. `reduction` already
    # reflects the legacy args, so they are not forwarded a second time.
    if _is_long(target) and smooth_eps == 0:
        return F.nll_loss(inputs, target, weight,
                          ignore_index=ignore_index, reduction=reduction)

    lsm = inputs
    masked_indices = None
    num_classes = inputs.size(-1)

    if _is_long(target) and ignore_index >= 0:
        masked_indices = target.eq(ignore_index)

    if smooth_eps > 0 and smooth_dist is not None:
        if _is_long(target):
            target = onehot(target, num_classes).type_as(inputs)
        if smooth_dist.dim() < target.dim():
            smooth_dist = smooth_dist.unsqueeze(0)
        target.lerp_(smooth_dist, smooth_eps)

    if weight is not None:
        lsm = lsm * weight.unsqueeze(0)

    if _is_long(target):
        eps_sum = smooth_eps / num_classes
        eps_nll = 1. - eps_sum - smooth_eps
        likelihood = lsm.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
        loss = -(eps_nll * likelihood + eps_sum * lsm.sum(-1))
    else:
        loss = -(target * lsm).sum(-1)

    if masked_indices is not None:
        loss.masked_fill_(masked_indices, 0)

    if reduction == 'sum':
        loss = loss.sum()
    elif reduction == 'mean':
        if masked_indices is None:
            loss = loss.mean()
        else:
            loss = loss.sum() / float(loss.size(0) - masked_indices.sum())
    return loss
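# `onehot` is used above but not defined in this section. A minimal sketch
# (the name and signature are assumptions inferred from the call site):
def onehot(indices, num_classes):
    # type: (Tensor, int) -> Tensor
    # Scatter 1.0 at each target index along the trailing class dimension.
    out = torch.zeros(*indices.size(), num_classes, device=indices.device)
    return out.scatter_(-1, indices.unsqueeze(-1), 1.0)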
def __init__(self, weight=None, smooth=True, size_average=None, reduce=None, reduction='mean'):
    super(DiceLoss, self).__init__()
    if size_average is not None or reduce is not None:
        self.reduction = _Reduction.legacy_get_string(size_average, reduce)
    else:
        self.reduction = reduction
    # Additive smoothing term that avoids division by zero for empty masks.
    if smooth:
        self.smooth = 1.
    else:
        self.smooth = 0.
    self.weight = weight
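# The class above only defines __init__; a minimal forward() sketch for a
# soft Dice loss that uses self.smooth as the usual additive term (this
# method is an assumption, not part of the source, and omits per-class
# weighting via self.weight):
def forward(self, input, target):
    # Flatten per sample and compute the soft Dice coefficient.
    input = input.contiguous().view(input.size(0), -1)
    target = target.contiguous().view(target.size(0), -1).float()
    intersection = (input * target).sum(dim=1)
    dice = (2. * intersection + self.smooth) / (
        input.sum(dim=1) + target.sum(dim=1) + self.smooth)
    loss = 1. - dice
    if self.reduction == 'mean':
        return loss.mean()
    if self.reduction == 'sum':
        return loss.sum()
    return loss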
def ssim_loss(input, target, max_val, filter_size=11, k1=0.01, k2=0.03,
              sigma=1.5, kernel=None, size_average=None, reduce=None, reduction='mean'):
    r"""ssim_loss(input, target, max_val, filter_size, k1, k2, sigma,
    kernel=None, size_average=None, reduce=None, reduction='mean') -> Tensor

    Measures the structural similarity index (SSIM) error.

    See :class:`~torch.nn.SSIMLoss` for details.
    """
    if input.size() != target.size():
        raise ValueError(
            'Expected input size ({}) to match target size ({}).'.format(
                input.size(), target.size()))

    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)

    dim = input.dim()
    if dim == 2:
        # Promote (H, W) to (1, 1, H, W).
        input = input.expand(1, 1, input.size(-2), input.size(-1))
        target = target.expand(1, 1, target.size(-2), target.size(-1))
    elif dim == 3:
        # Promote (C, H, W) to (1, C, H, W).
        input = input.expand(1, input.size(-3), input.size(-2), input.size(-1))
        target = target.expand(1, target.size(-3), target.size(-2), target.size(-1))
    elif dim != 4:
        raise ValueError('Expected 2, 3, or 4 dimensions (got {})'.format(dim))

    _, channel, _, _ = input.size()
    if kernel is None:
        kernel = _fspecial_gaussian(filter_size, channel, sigma)
    kernel = kernel.to(device=input.device)

    ret, _ = _ssim(input, target, max_val, k1, k2, channel, kernel)
    if reduction != 'none':
        ret = torch.mean(ret) if reduction == 'mean' else torch.sum(ret)
    return ret
def __init__(self, kernel_size: int = 11, kernel_sigma: float = 1.5, k1: float = 0.01,
             k2: float = 0.03, size_average: Optional[bool] = None,
             reduce: Optional[bool] = None, reduction: str = 'mean',
             data_range: Union[int, float] = 1.) -> None:
    super(SSIMLoss, self).__init__(size_average, reduce, reduction)

    # Generic loss parameters.
    self.size_average = size_average
    self.reduce = reduce
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    self.reduction = reduction

    # Loss-specific parameters.
    self.kernel_size = kernel_size
    self.kernel_sigma = kernel_sigma
    self.k1 = k1
    self.k2 = k2
    self.data_range = data_range

    # Cache the kernel between calls.
    self.kernel = _fspecial_gauss_1d(kernel_size, kernel_sigma)
def cross_entropy_without_softmax(input, target, weight=None, size_average=None,
                                  ignore_index=-100, reduce=None, reduction='mean'):
    # `input` is expected to already contain probabilities (e.g. the output
    # of a softmax elsewhere in the model), so only a log is applied here.
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    return F.nll_loss(torch.log(input), target, weight, None, ignore_index, None, reduction)
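# Hand-worked check (values chosen here, not from the source): log() turns
# the probabilities back into log-probabilities for nll_loss.
#   probs = torch.tensor([[0.7, 0.2, 0.1]])
#   tgt = torch.tensor([0])
#   cross_entropy_without_softmax(probs, tgt)  # -> -log(0.7) ~= 0.3567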
def re_bow_loss(input, target, prior, kl_annealing=1., weight=None, size_average=None,
                ignore_index=0, reduce=None, reduction='mean', _DEBUG=False):
    # type: (Tensor, Tensor, Distribution, float, Optional[Tensor], Optional[bool], int, Optional[bool], str, bool) -> Tuple[Tensor, ...]
    r"""This criterion combines `log` and word-wise (at sentence level) likelihood in a
    single function.

    See :class:`~torch.nn.CrossEntropyLoss` for details.

    # TODO: This needs to be re-written.

    Args:
        input (Tensor): :math:`(N, C)` where `C = number of classes` or :math:`(N, C, H, W)`
            in case of 2D loss, or :math:`(N, C, d_1, d_2, ..., d_K)` where :math:`K > 1`
            in the case of K-dimensional loss.
        target (Tensor): :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`,
            or :math:`(N, d_1, d_2, ..., d_K)` where :math:`K \geq 1` for K-dimensional loss.
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size `C`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged over each loss element in the batch. Note that for some
            losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when :attr:`reduce` is ``False``. Default: ``True``
        ignore_index (int, optional): Specifies a target value that is ignored and does not
            contribute to the input gradient. When :attr:`size_average` is ``True``, the
            loss is averaged over non-ignored targets. Default: 0
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the losses
            are averaged or summed over observations for each minibatch depending on
            :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            'none' | 'mean' | 'sum'. 'none': no reduction will be applied, 'mean': the sum
            of the output will be divided by the number of elements in the output, 'sum':
            the output will be summed. Note: :attr:`size_average` and :attr:`reduce` are
            in the process of being deprecated, and in the meantime, specifying either of
            those two args will override :attr:`reduction`. Default: 'mean'

    Examples::

        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.randint(5, (3,), dtype=torch.int64)
        >>> loss = bag_of_words_log_loss(input, target)
        >>> loss.backward()
    """
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)

    relation_probs = input[0]
    word_probs = input[1]
    word_labels = target[1]

    # Create a masked target tensor; the label indices are assumed to start at 1.
    mask = (word_labels != ignore_index).long()
    masked_word_labels = word_labels * mask

    # Compute the logarithm of the estimated word probabilities.
    log_word_probs = torch.log(word_probs)
    # Add a column of zeros at the beginning of log_word_probs. This allows batching the
    # computation while ignoring the 'ignore_index'.
    masked_log_word_probs = torch.cat(
        (torch.zeros(word_probs.shape[0], 1, device=mask.device), log_word_probs), dim=1)

    # Compute the expectation, with respect to the predicted relation probabilities, of
    # the logarithm of the predicted word probabilities.
    masked_expected_log_word_probs = torch.matmul(relation_probs, masked_log_word_probs)
    # Get the probabilities of the words in each sentence (including the ignore_index
    # ones, but those won't contribute to the result).
    expected_log_words_in_sentences_probs = torch.gather(
        masked_expected_log_word_probs, 1, masked_word_labels)
    # Compute the log probability of each sequence.
    expected_sequence_log_prob = torch.sum(expected_log_words_in_sentences_probs, dim=1)

    # Compute the KL divergence of the predicted relation probabilities against the prior.
    kls = kl_divergence(Categorical(probs=relation_probs), prior)

    # When debugging, i.e. tracking KL and P(X) values, compute those values here.
    if _DEBUG:
        # We compute P(x) = sum_{r in R} P(r) * P(x|r) = 1/|R| * sum_{r in R} P(x|r),
        # working with the log of that quantity.
        # First expand the word log-probabilities so each training instance can select
        # from each relation.
        ex_word_probs = masked_log_word_probs.unsqueeze(0).expand(
            masked_word_labels.shape[0], -1, -1)
        ex_word_probs = ex_word_probs.reshape(
            ex_word_probs.shape[0] * ex_word_probs.shape[1], ex_word_probs.shape[2])
        # Expand the labels for the same reason.
        ex_labels = masked_word_labels.unsqueeze(1).expand(-1, word_probs.shape[0], -1)
        ex_labels = ex_labels.reshape(ex_labels.shape[0] * ex_labels.shape[1],
                                      ex_labels.shape[2])
        # Gather the relevant log-probabilities.
        log_p_x = torch.gather(ex_word_probs, 1, ex_labels)
        # Reshape log(P(X)).
        log_p_x = log_p_x.reshape(int(log_p_x.shape[0] / word_probs.shape[0]),
                                  word_probs.shape[0], log_p_x.shape[1])
        # Compute the log probability of each sentence, per relation.
        summed_log_p_x = log_p_x.sum(dim=-1)
        # Compute the per-instance final value of log(P(x)) via the log-sum-exp trick.
        log_p_x = torch.logsumexp(summed_log_p_x, dim=-1) - torch.log(
            torch.tensor(word_probs.shape[0]).float())
        # Apply whatever reduction is necessary.
        if reduction in ('mean', 'sum'):
            batch_log_p_x = torch.sum(log_p_x)
            batch_kls = torch.sum(kls)
            if reduction == 'mean':
                batch_log_p_x /= log_p_x.shape[0]
                batch_kls /= kls.shape[0]

    # Compute the instance-wise loss.
    instance_wise_loss = expected_sequence_log_prob - kl_annealing * kls

    # Apply whatever reduction is necessary.
    if reduction in ('mean', 'sum'):
        batch_log_prob = torch.sum(instance_wise_loss)
        if reduction == 'mean':
            batch_log_prob /= instance_wise_loss.shape[0]
        if _DEBUG:
            return (-batch_log_prob, batch_kls, -batch_log_p_x)
        return (-batch_log_prob,)

    if _DEBUG:
        return (-instance_wise_loss, kls, -log_p_x)
    return (-instance_wise_loss,)
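# For reference, the instance-wise objective above is a KL-annealed evidence
# lower bound (ELBO) with the relation distribution as the variational
# posterior:
#   L(x) = E_{r ~ q(r|x)}[ log p(x | r) ] - kl_annealing * KL(q(r|x) || prior),
# and the function returns its negation so it can be minimized directly.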
def bag_of_words_log_loss(input, target, weight=None, size_average=None, ignore_index=0,
                          reduce=None, reduction='mean'):
    # type: (Tensor, Tensor, Optional[Tensor], Optional[bool], int, Optional[bool], str) -> Tensor
    r"""This criterion combines `log` and word-wise (at sentence level) likelihood in a
    single function.

    See :class:`~torch.nn.CrossEntropyLoss` for details.

    # TODO: This needs to be re-written.

    Args:
        input (Tensor): :math:`(N, C)` where `C = number of classes` or :math:`(N, C, H, W)`
            in case of 2D loss, or :math:`(N, C, d_1, d_2, ..., d_K)` where :math:`K > 1`
            in the case of K-dimensional loss.
        target (Tensor): :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`,
            or :math:`(N, d_1, d_2, ..., d_K)` where :math:`K \geq 1` for K-dimensional loss.
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size `C`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged over each loss element in the batch. Note that for some
            losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when :attr:`reduce` is ``False``. Default: ``True``
        ignore_index (int, optional): Specifies a target value that is ignored and does not
            contribute to the input gradient. When :attr:`size_average` is ``True``, the
            loss is averaged over non-ignored targets. Default: 0
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the losses
            are averaged or summed over observations for each minibatch depending on
            :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            'none' | 'mean' | 'sum'. 'none': no reduction will be applied, 'mean': the sum
            of the output will be divided by the number of elements in the output, 'sum':
            the output will be summed. Note: :attr:`size_average` and :attr:`reduce` are
            in the process of being deprecated, and in the meantime, specifying either of
            those two args will override :attr:`reduction`. Default: 'mean'

    Examples::

        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.randint(5, (3,), dtype=torch.int64)
        >>> loss = bag_of_words_log_loss(input, target)
        >>> loss.backward()
    """
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)

    word_probs = input[0]
    word_labels = target[0]

    # Create a masked target tensor; the label indices are assumed to start at 1.
    mask = (word_labels != ignore_index).long()
    masked_word_labels = word_labels * mask

    # Add a column of ones at the beginning of the input. This allows batching the
    # computation while ignoring the 'ignore_index', since log(1) = 0 contributes nothing.
    input_accounting_for_masking = torch.cat(
        (torch.ones(word_probs.shape[0], 1, device=mask.device), word_probs), dim=1)

    # Get the probabilities of the words in each sentence (including the ignore_index
    # ones, but those won't contribute to the result).
    word_log_probs = torch.log(
        torch.gather(input_accounting_for_masking, 1, masked_word_labels))
    # Compute the log probability of each sequence.
    sequence_log_prob = torch.sum(word_log_probs, dim=1)

    # Apply whatever reduction is necessary.
    if reduction in ('mean', 'sum'):
        batch_log_prob = torch.sum(sequence_log_prob)
        if reduction == 'mean':
            batch_log_prob /= sequence_log_prob.shape[0]
        return -batch_log_prob
    return -sequence_log_prob
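# Hand-worked example (shapes inferred from the indexing above; values chosen
# here, not from the source). `input` and `target` are 1-tuples, and word
# labels are shifted by one so index 0 can serve as the padding/ignore index:
#   word_probs = torch.tensor([[0.5, 0.3, 0.2]])  # (batch, vocab)
#   word_labels = torch.tensor([[1, 2, 0]])       # third position is padding
#   bag_of_words_log_loss((word_probs,), (word_labels,))
#   # -> -(log(0.5) + log(0.3)) ~= 1.897; the padded position adds log(1) = 0.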