def __call__(self, logits, targets, reduce=True):
    pos_similarity, neg_similarity, batch_size = self.get_similarities(logits)
    targets_local = FloatTensor(batch_size)
    targets_local.fill_(1)  # 1: pos_similarity should be higher than neg_similarity
    return F.margin_ranking_loss(
        pos_similarity, neg_similarity, targets_local, self.config.margin
    )
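# Quick, self-contained illustration of F.margin_ranking_loss as used above
# (values are made up): with target = 1 the loss penalizes pairs where the
# positive similarity is not at least `margin` above the negative one.
import torch
import torch.nn.functional as F

pos = torch.tensor([0.9, 0.2])
neg = torch.tensor([0.1, 0.4])
target = torch.ones(2)
loss = F.margin_ranking_loss(pos, neg, target, margin=0.3)
# per-pair loss = max(0, -(pos - neg) + 0.3) -> [0.0, 0.5]; mean = 0.25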
def __init__(self, config, weights=None, *args, **kwargs):
    """
    Args:
        config: Config containing `precision_range_lower`, `precision_range_upper`,
            `num_classes`, `num_anchors`
    """
    nn.Module.__init__(self)
    Loss.__init__(self, config)

    self.num_classes = self.config.num_classes
    self.num_anchors = self.config.num_anchors
    self.precision_range = (
        self.config.precision_range_lower,
        self.config.precision_range_upper,
    )

    # Create precision anchor values and the distance between anchors,
    # corresponding to [alpha_t] and [delta_t] in the paper.
    # precision_values: 1D `Tensor` of shape [K], where `K = num_anchors`
    # delta: scalar (since we use equal distance between anchors)
    self.precision_values, self.delta = loss_utils.range_to_anchors_and_delta(
        self.precision_range, self.num_anchors
    )

    # Notation is [b_k] in the paper; Parameter of shape [C, K],
    # where `C = num_classes` and `K = num_anchors`.
    self.biases = nn.Parameter(
        FloatTensor(self.config.num_classes, self.config.num_anchors).zero_()
    )
    self.lambdas = nn.Parameter(
        FloatTensor(self.config.num_classes, self.config.num_anchors).data.fill_(
            1.0
        )
    )
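# Hedged sketch only (not the actual loss_utils implementation): one plausible
# way to build K equally spaced precision anchors [alpha_t] and their spacing
# [delta] from a (lower, upper) precision range, placing anchors at bin midpoints.
import torch

def sketch_range_to_anchors_and_delta(precision_range, num_anchors):
    lower, upper = precision_range
    # Width of each precision bin; anchors sit at the bin midpoints.
    delta = (upper - lower) / num_anchors
    anchors = lower + delta * (torch.arange(num_anchors, dtype=torch.float) + 0.5)
    return anchors, delta

# Example: sketch_range_to_anchors_and_delta((0.0, 1.0), 4)
# -> (tensor([0.1250, 0.3750, 0.6250, 0.8750]), 0.25)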
def __init__(self, lstm: nn.LSTM):
    """
    Shapes:
        initial_state: (lstm_layers, 1, lstm_hidden_dim) each
    """
    self.lstm = lstm
    initial_state = (
        FloatTensor(lstm.num_layers, 1, lstm.hidden_size).fill_(0),
        FloatTensor(lstm.num_layers, 1, lstm.hidden_size).fill_(0),
    )
    # Stack of (state, (embedding, element))
    self.stack = [
        (initial_state, (self._lstm_output(initial_state), Element("Root")))
    ]
def __call__(self, logits, targets, reduce=True):
    """
    Computes Kullback-Leibler divergence loss for multiclass classification
    with the probability distribution computed by BinaryCrossEntropyLoss.
    """
    hard_targets, _, soft_targets_logits = targets
    # Clamp the probabilities to (1e-20, 1 - 1e-20) to avoid log(0)
    # in the KL-divergence computation.
    soft_targets = F.sigmoid(FloatTensor(soft_targets_logits) / self.t).clamp(
        1e-20, 1 - 1e-20
    )
    probs = F.sigmoid(logits / self.t).clamp(1e-20, 1 - 1e-20)
    probs_neg = probs.neg().add(1).clamp(1e-20, 1 - 1e-20)
    soft_targets_neg = soft_targets.neg().add(1).clamp(1e-20, 1 - 1e-20)
    if self.weight is not None:
        soft_loss = (
            F.kl_div(probs.log(), soft_targets, reduction="none") * self.weight
            + F.kl_div(probs_neg.log(), soft_targets_neg, reduction="none")
            * self.weight
        )
        if reduce:
            soft_loss = soft_loss.mean()
    else:
        soft_loss = F.kl_div(
            probs.log(), soft_targets, reduction="mean" if reduce else "none"
        ) + F.kl_div(
            probs_neg.log(),
            soft_targets_neg,
            reduction="mean" if reduce else "none",
        )
    soft_loss *= self.t ** 2  # see https://arxiv.org/pdf/1503.02531.pdf

    hard_loss = 0.0
    if self.hard_weight > 0.0:
        one_hot_targets = (
            FloatTensor(hard_targets.size(0), logits.size(1))
            .zero_()
            .scatter_(1, hard_targets.unsqueeze(1).data, 1)
        )
        hard_loss = F.binary_cross_entropy_with_logits(
            logits,
            one_hot_targets,
            reduction="mean" if reduce else "none",
            weight=self.weight,
        )
    return (1.0 - self.hard_weight) * soft_loss + self.hard_weight * hard_loss
def __call__(self, m_out, targets, reduce=True):
    """
    Computes 1-vs-all binary cross-entropy loss for multiclass classification.
    """
    # Converts targets to one-hot representation. Dim: [batch, n_classes]
    one_hot_targets = (
        FloatTensor(targets.size(0), m_out.size(1))
        .zero_()
        .scatter_(1, targets.unsqueeze(1).data, 1)
    )

    """
    `F.binary_cross_entropy` or `torch.nn.BCELoss` requires the output of
    the previous function to already be a FloatTensor.
    """

    # This weighting applies uniform class weights.
    # examples_per_class = one_hot_targets.sum(0).clamp(min=1)
    # total_positive = examples_per_class.sum()
    # weights = total_positive.unsqueeze(0) / examples_per_class

    loss = F.binary_cross_entropy_with_logits(
        precision.maybe_float(m_out), one_hot_targets, reduction="none"
    )

    if self.config.reweight_negative:
        # This makes sure the negative classes together carry the same weight as
        # the single positive class: weight is 1 for the correct class and
        # 1 / (n - 1) for the other ones.
        weights = one_hot_targets + (1.0 - one_hot_targets) / max(
            1, one_hot_targets.size(1) - 1.0
        )
        loss = loss * weights

    return loss.sum(1).mean() if reduce else loss.sum(1)
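# Illustrative-only check of the negative-class reweighting above (numbers are
# made up): with C classes, the positive class keeps weight 1 and each negative
# class gets 1 / (C - 1), so positives and negatives contribute equally per row.
import torch

targets = torch.tensor([2, 0])                       # two examples, C = 4 classes
one_hot = torch.zeros(2, 4).scatter_(1, targets.unsqueeze(1), 1)
weights = one_hot + (1.0 - one_hot) / max(1, one_hot.size(1) - 1.0)
# weights:
# tensor([[0.3333, 0.3333, 1.0000, 0.3333],
#         [1.0000, 0.3333, 0.3333, 0.3333]])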
def __call__(self, logits, targets, reduce=True):
    """
    Computes soft and hard losses for knowledge distillation.
    """
    hard_targets, _, _ = targets

    # Hard targets, one-hot encoded.
    one_hot_targets = (
        FloatTensor(hard_targets.size(0), logits.size(1))
        .zero_()
        .scatter_(1, hard_targets.unsqueeze(1).data, 1)
    )

    prob_loss = KLDivergenceBCELoss(self.config, weight=self.weight)
    if self.weight is not None:
        hard_loss = (
            F.binary_cross_entropy_with_logits(
                logits, one_hot_targets, reduction="none"
            )
            * self.weight
        )
        if reduce:
            hard_loss = hard_loss.mean()
    else:
        hard_loss = F.binary_cross_entropy_with_logits(
            logits, one_hot_targets, reduction="mean" if reduce else "none"
        )

    return self.t * self.t * prob_loss(logits, targets, reduce=reduce) + hard_loss
def __call__(self, logits, targets, reduce=True):
    """
    Computes Kullback-Leibler divergence loss for multiclass classification
    with the probability distribution computed by CrossEntropyLoss.
    """
    hard_targets, _, soft_targets_logits = targets
    soft_targets = F.softmax(FloatTensor(soft_targets_logits) / self.t, dim=1)
    soft_targets = soft_targets.clamp(1e-10, 1 - 1e-10)
    log_probs = F.log_softmax(logits / self.t, 1)
    if self.weight is not None:
        soft_loss = (
            F.kl_div(log_probs, soft_targets, reduction="none") * self.weight
        )
        if reduce:
            soft_loss = soft_loss.mean()
    else:
        soft_loss = F.kl_div(
            log_probs, soft_targets, reduction="mean" if reduce else "none"
        )
    soft_loss *= self.t ** 2  # see https://arxiv.org/pdf/1503.02531.pdf

    hard_loss = 0.0
    if self.hard_weight > 0.0:
        hard_loss = F.cross_entropy(
            logits,
            hard_targets,
            reduction="mean" if reduce else "none",
            weight=self.weight,
        )
    return (1.0 - self.hard_weight) * soft_loss + self.hard_weight * hard_loss
def __call__(self, logits, targets, reduce=True):
    """
    Computes Kullback-Leibler divergence loss for multiclass classification
    with the probability distribution computed by BinaryCrossEntropyLoss.
    """
    hard_targets, _, soft_targets_logits = targets
    # Clamp the probabilities to (1e-20, 1 - 1e-20) to avoid log(0)
    # in the KL-divergence computation.
    soft_targets = F.sigmoid(FloatTensor(soft_targets_logits) / self.t).clamp(
        1e-20, 1 - 1e-20
    )
    probs = F.sigmoid(logits / self.t).clamp(1e-20, 1 - 1e-20)
    probs_neg = probs.neg().add(1).clamp(1e-20, 1 - 1e-20)
    soft_targets_neg = soft_targets.neg().add(1).clamp(1e-20, 1 - 1e-20)
    if self.weight is not None:
        loss = (
            F.kl_div(probs.log(), soft_targets, reduction="none") * self.weight
            + F.kl_div(probs_neg.log(), soft_targets_neg, reduction="none")
            * self.weight
        )
        if reduce:
            loss = loss.mean()
    else:
        loss = F.kl_div(
            probs.log(), soft_targets, reduction="mean" if reduce else "none"
        ) + F.kl_div(
            probs_neg.log(),
            soft_targets_neg,
            reduction="mean" if reduce else "none",
        )
    return loss
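# Self-contained illustration of the per-label (binary) KL term above, with
# made-up logits: both the "positive" and "negative" probabilities are clamped
# before taking logs so the KL computation never sees log(0).
import torch
import torch.nn.functional as F

t = 1.0
student_logits = torch.randn(8, 5)
teacher_logits = torch.randn(8, 5)
soft = torch.sigmoid(teacher_logits / t).clamp(1e-20, 1 - 1e-20)
probs = torch.sigmoid(student_logits / t).clamp(1e-20, 1 - 1e-20)
loss = F.kl_div(probs.log(), soft, reduction="mean") + F.kl_div(
    (1 - probs).clamp(1e-20, 1 - 1e-20).log(),
    (1 - soft).clamp(1e-20, 1 - 1e-20),
    reduction="mean",
)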
def __call__(self, logits, targets, reduce=True):
    """
    Computes Kullback-Leibler divergence loss for multiclass classification
    with the probability distribution computed by CrossEntropyLoss.
    """
    hard_targets, _, soft_targets_logits = targets
    soft_targets = F.softmax(FloatTensor(soft_targets_logits) / self.t, dim=1)
    soft_targets = soft_targets.clamp(1e-10, 1 - 1e-10)
    log_probs = F.log_softmax(logits / self.t, 1)
    soft_loss = F.kl_div(log_probs, soft_targets, reduction="none")
    if self.weight is not None:
        soft_loss = soft_loss * self.weight
    if reduce:
        soft_loss = soft_loss.mean()
    else:
        # soft_loss has dim batch_size x num_labels, while hard_loss has dim
        # batch_size only, so we still reduce soft_loss over the labels
        # dimension in order to be able to add the two losses.
        soft_loss = soft_loss.mean(1)
    soft_loss *= self.t ** 2  # see https://arxiv.org/pdf/1503.02531.pdf

    hard_loss = 0.0
    if self.hard_weight > 0.0:
        hard_loss = F.cross_entropy(
            logits,
            hard_targets,
            reduction="mean" if reduce else "none",
            weight=self.weight,
        )
    return (1.0 - self.hard_weight) * soft_loss + self.hard_weight * hard_loss
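# Small, runnable illustration of the temperature-scaled soft loss computed
# above; the tensors are made up, and the class/config wiring (t, hard_weight)
# that the __call__ relies on is assumed rather than shown.
import torch
import torch.nn.functional as F

t = 2.0
student_logits = torch.randn(8, 5)
teacher_logits = torch.randn(8, 5)
soft_targets = F.softmax(teacher_logits / t, dim=1).clamp(1e-10, 1 - 1e-10)
log_probs = F.log_softmax(student_logits / t, dim=1)
# Scaling by t**2 keeps gradient magnitudes comparable across temperatures
# (see https://arxiv.org/pdf/1503.02531.pdf).
soft_loss = F.kl_div(log_probs, soft_targets, reduction="none").mean() * t ** 2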
def from_config(cls, config: Config, metadata: FieldMeta):
    label_weights = getattr(metadata, "label_weights", None)
    if label_weights is not None:
        label_weights = FloatTensor(label_weights)
    return cls(
        metadata.vocab.itos, create_loss(config.loss, weight=label_weights), config
    )
def from_config(cls, config: Config, metadata: FieldMeta = None, labels=None):
    label_weights = getattr(metadata, "label_weights", None)
    if label_weights is not None:
        label_weights = FloatTensor(label_weights)
    vocab = metadata.vocab.itos if metadata else labels
    loss = create_loss(config.loss, weight=label_weights)
    cls = (
        BinaryClassificationOutputLayer
        if isinstance(loss, BinaryCrossEntropyLoss)
        else MulticlassOutputLayer
    )
    return cls(vocab, loss, config)
def _prepare_labels_weights(logits, targets, weights=None):
    """
    Args:
        logits: Variable :math:`(N, C)` where `C = number of classes`
        targets: Variable :math:`(N)` where each value is `0 <= targets[i] <= C-1`
        weights: Coefficients for the loss. Must be a `Tensor` of shape [N] or
            [N, C], where `N = batch_size` and `C = number of classes`.
    Returns:
        labels: Tensor of shape [N, C], one-hot representation
        weights: Tensor of shape broadcastable to labels
    """
    N, C = logits.size()
    # Converts targets to one-hot representation. Dim: [N, C]
    labels = FloatTensor(N, C).zero_().scatter(1, targets.unsqueeze(1).data, 1)
    if weights is None:
        weights = FloatTensor(N).data.fill_(1.0)
    if weights.dim() == 1:
        weights.unsqueeze_(-1)
    return labels, weights
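# Hypothetical usage of _prepare_labels_weights above, with made-up inputs; the
# torch-only lines mirror what the helper returns when no weights are passed
# (one-hot labels of shape [N, C] and a weights tensor of shape [N, 1]).
import torch

logits = torch.randn(4, 3)
targets = torch.tensor([0, 2, 1, 2])
labels = torch.zeros(4, 3).scatter(1, targets.unsqueeze(1), 1)
weights = torch.ones(4).unsqueeze(-1)
# labels, weights = _prepare_labels_weights(logits, targets)  # same shapes/values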
def __call__(self, m_out, targets, reduce=True):
    """
    Computes multi-label classification loss.
    See details in torch.nn.MultiLabelSoftMarginLoss.
    """
    num_classes = m_out.size()[1]
    target_labels = targets[0]
    # Each label list is padded with -1 so that every example has a label list
    # of the same length. Since -1 is out of the index range, temporarily add 1
    # to target_labels.
    tmp_target_labels = target_labels + 1
    # The idea is similar to one_hot_targets, but the following encoding supports
    # the multi-label task. The first column is dropped afterwards since it
    # encodes the padded label -1.
    n_hot_targets = (
        FloatTensor(target_labels.size(0), num_classes + 1)
        .zero_()
        .scatter_(1, tmp_target_labels, 1)
    )[:, 1:]

    """
    `F.multilabel_soft_margin_loss` or `torch.nn.MultiLabelSoftMarginLoss`
    requires the output of the previous function to already be a FloatTensor.
    """

    # Default: equal weight for each class; the losses are averaged over
    # observations for each mini-batch.
    loss = F.multilabel_soft_margin_loss(
        precision.maybe_float(m_out), n_hot_targets, reduction="mean"
    )

    return loss
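# Illustration of the "-1 padding shifted by +1" n-hot encoding used above, with
# made-up label lists: padded rows become n-hot rows after dropping column 0.
import torch

num_classes = 4
target_labels = torch.tensor([[0, 2, -1], [3, -1, -1]])  # -1 pads ragged label lists
tmp = target_labels + 1
n_hot = torch.zeros(2, num_classes + 1).scatter_(1, tmp, 1)[:, 1:]
# n_hot:
# tensor([[1., 0., 1., 0.],
#         [0., 0., 0., 1.]])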