import torch
import torch.nn.functional as F


def lambdaRank_loss_diagonal(batch_preds=None, batch_stds=None, sigma=None):
    '''
    This method imposes an explicit bias toward highly ranked documents that are essentially ties.
    :param batch_preds: [batch, ranking_size] each row represents the relevance predictions for documents within a ltr_adhoc
    :param batch_stds:  [batch, ranking_size] each row represents the standard relevance grades for documents within a ltr_adhoc
    :return:
    '''
    # sort documents according to the predicted relevance
    batch_preds_sorted, batch_preds_sorted_inds = torch.sort(batch_preds, dim=1, descending=True)
    # reorder batch_stds correspondingly so as to make it consistent.
    # BTW, batch_stds[batch_preds_sorted_inds] only works with 1-D tensors
    batch_stds_sorted_via_preds = torch.gather(batch_stds, dim=1, index=batch_preds_sorted_inds)

    # get unique document pairs, which differ dynamically per training iteration
    pair_row_inds, pair_col_inds = torch_triu_indice(k=1, pair_type='NoTies', batch_labels=batch_stds_sorted_via_preds)

    # standard pairwise differences, i.e., S_{ij}
    batch_std_diffs = torch.unsqueeze(batch_stds_sorted_via_preds, dim=2) - torch.unsqueeze(batch_stds_sorted_via_preds, dim=1)
    batch_std_Sij = torch.clamp(batch_std_diffs, min=-1.0, max=1.0)  # ensuring S_{ij} \in {-1, 0, 1}
    batch_std_Sij = batch_std_Sij[:, pair_row_inds, pair_col_inds]   # necessary S_{ij}

    # computing pairwise differences, i.e., s_i - s_j
    batch_pred_diffs = torch.unsqueeze(batch_preds_sorted, dim=2) - torch.unsqueeze(batch_preds_sorted, dim=1)
    batch_pred_s_ij = batch_pred_diffs[:, pair_row_inds, pair_col_inds]  # unique pairwise comparisons according to a ltr_adhoc of documents

    batch_delta_ndcg = get_delta_ndcg(batch_stds, batch_stds_sorted_via_preds)
    batch_delta_ndcg = batch_delta_ndcg[:, pair_row_inds, pair_col_inds]

    batch_loss_1st = 0.5 * sigma * batch_pred_s_ij * (1.0 - batch_std_Sij)  # cf. the first equation on page 3
    batch_loss_2nd = torch.log(torch.exp(-sigma * batch_pred_s_ij) + 1.0)   # cf. the first equation on page 3
    batch_loss = torch.sum((batch_loss_1st + batch_loss_2nd) * batch_delta_ndcg)  # weighting with delta-nDCG

    return batch_loss
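# The helper torch_triu_indice used above is not shown; the following is a
# minimal sketch of what it plausibly does, not the library's actual
# implementation: return the upper-triangular (i < j) index pairs, dropping
# tied pairs when pair_type='NoTies'. It presumes the batch holds a single
# query, since the returned indices are shared across the batch dimension.
def torch_triu_indice(k=1, pair_type='NoTies', batch_labels=None):
    num_docs = batch_labels.size(1)
    pair_inds = torch.triu_indices(num_docs, num_docs, offset=k, device=batch_labels.device)  # [2, num_pairs]
    row_inds, col_inds = pair_inds[0], pair_inds[1]
    if 'NoTies' == pair_type:
        labels = batch_labels[0]  # single-query assumption
        keep = labels[row_inds] != labels[col_inds]  # drop pairs with equal grades
        row_inds, col_inds = row_inds[keep], col_inds[keep]
    return row_inds, col_inds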
def lambdaRank_loss_full_soft(batch_preds=None, batch_stds=None, sigma=None):
    '''
    Instead of strictly taking the upper-diagonal entries, here we compute the lambda loss by fully exploiting the following properties:
    (1) the loss over the full pairwise difference matrix is twice the loss over merely the upper-diagonal entries
    (2) for ties, the delta-nDCG is zero, thus there is no need to explicitly remove pairs of ties
    '''
    # sort documents according to the predicted relevance
    batch_preds_sorted, batch_preds_sorted_inds = torch.sort(batch_preds, dim=1, descending=True)
    # reorder batch_stds correspondingly so as to make it consistent.
    # BTW, batch_stds[batch_preds_sorted_inds] only works with 1-D tensors
    batch_stds_sorted_via_preds = torch.gather(batch_stds, dim=1, index=batch_preds_sorted_inds)

    # standard pairwise differences, i.e., S_{ij}
    batch_std_diffs = torch.unsqueeze(batch_stds_sorted_via_preds, dim=2) - torch.unsqueeze(batch_stds_sorted_via_preds, dim=1)
    batch_std_Sij = torch.clamp(batch_std_diffs, min=-1.0, max=1.0)  # ensuring S_{ij} \in {-1, 0, 1}

    # computing pairwise differences, i.e., s_i - s_j
    batch_pred_s_ij = torch.unsqueeze(batch_preds_sorted, dim=2) - torch.unsqueeze(batch_preds_sorted, dim=1)

    batch_delta_ndcg = get_delta_ndcg(batch_stds, batch_stds_sorted_via_preds)

    batch_loss = torch.sum(sigma * (F.softplus(batch_pred_s_ij, beta=sigma) - batch_std_Sij * batch_pred_s_ij) * batch_delta_ndcg)

    return batch_loss
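# A note on the softplus form: with F.softplus(x, beta) = (1/beta) * log(1 + exp(beta*x)),
# we have sigma * softplus(s_ij, beta=sigma) = sigma * s_ij + log(1 + exp(-sigma * s_ij)).
# In particular, for a pair with S_ij = 1, the summand sigma * (softplus(s_ij, beta=sigma) - s_ij)
# reduces exactly to log(1 + exp(-sigma * s_ij)), the familiar RankNet cost, computed in a
# numerically stabler way. A quick standalone check of that identity (illustrative values only):
def _softplus_identity_check():
    sigma = 1.5
    s = torch.linspace(-5.0, 5.0, steps=11)
    lhs = sigma * (F.softplus(s, beta=sigma) - s)          # softplus form with S_ij = 1
    rhs = torch.log(torch.exp(-sigma * s) + 1.0)           # direct RankNet cost
    assert torch.allclose(lhs, rhs, atol=1e-5)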
def inner_train(self, batch_preds, batch_stds, **kwargs):
    '''
    :param batch_preds: [batch, ranking_size] each row represents the relevance predictions for documents within a ltr_adhoc
    :param batch_stds:  [batch, ranking_size] each row represents the standard relevance grades for documents within a ltr_adhoc
    '''
    label_type = kwargs['label_type']
    assert LABEL_TYPE.MultiLabel == label_type
    assert 'presort' in kwargs and kwargs['presort'] is True  # aiming for direct usage of the ideal ranking

    # sort documents according to the predicted relevance
    batch_preds_sorted, batch_preds_sorted_inds = torch.sort(batch_preds, dim=1, descending=True)
    # reorder batch_stds correspondingly so as to make it consistent.
    # BTW, batch_stds[batch_preds_sorted_inds] only works with 1-D tensors
    batch_stds_sorted_via_preds = torch.gather(batch_stds, dim=1, index=batch_preds_sorted_inds)

    # standard pairwise differences, i.e., S_{ij}
    batch_std_diffs = torch.unsqueeze(batch_stds_sorted_via_preds, dim=2) - torch.unsqueeze(batch_stds_sorted_via_preds, dim=1)
    batch_std_Sij = torch.clamp(batch_std_diffs, min=-1.0, max=1.0)  # ensuring S_{ij} \in {-1, 0, 1}
    batch_std_p_ij = 0.5 * (1.0 + batch_std_Sij)

    # computing pairwise differences, i.e., s_i - s_j
    batch_s_ij = torch.unsqueeze(batch_preds_sorted, dim=2) - torch.unsqueeze(batch_preds_sorted, dim=1)
    batch_p_ij = 1.0 / (torch.exp(-self.sigma * batch_s_ij) + 1.0)

    batch_delta_ndcg = get_delta_ndcg(batch_ideally_sorted_stds=batch_stds,
                                      batch_stds_sorted_via_preds=batch_stds_sorted_via_preds,
                                      label_type=label_type, gpu=self.gpu)

    # Regarding reduction: 'mean' leads to poor performance; a probable reason is that
    # the per-pair values, already small after weighting with lambda_weight, shrink
    # further under averaging.
    batch_loss = F.binary_cross_entropy(input=torch.triu(batch_p_ij, diagonal=1),
                                        target=torch.triu(batch_std_p_ij, diagonal=1),
                                        weight=torch.triu(batch_delta_ndcg, diagonal=1),
                                        reduction='sum')

    self.optimizer.zero_grad()
    batch_loss.backward()
    self.optimizer.step()

    return batch_loss
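# How the torch.triu masking above behaves: the `weight` argument of
# F.binary_cross_entropy zeroes out the diagonal and lower-triangle entries, so
# under reduction='sum' only the i < j pairs contribute. A standalone
# illustration with hypothetical values:
def _triu_bce_masking_demo():
    p = torch.tensor([[[0.7, 0.6], [0.4, 0.3]]])  # [batch=1, 2, 2] predicted pair probs
    t = torch.tensor([[[1.0, 1.0], [0.0, 0.0]]])  # target pair probs
    masked = F.binary_cross_entropy(input=torch.triu(p, diagonal=1),
                                    target=torch.triu(t, diagonal=1),
                                    weight=torch.triu(torch.ones_like(p), diagonal=1),
                                    reduction='sum')
    upper_only = F.binary_cross_entropy(input=p[:, 0:1, 1:2], target=t[:, 0:1, 1:2],
                                        reduction='sum')
    assert torch.allclose(masked, upper_only)  # only the (0, 1) entry contributes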
def custom_loss_function(self, batch_preds, batch_std_labels, **kwargs):
    '''
    @param batch_preds: [batch, ranking_size] each row represents the relevance predictions for documents associated with the same query
    @param batch_std_labels: [batch, ranking_size] each row represents the standard relevance grades for documents associated with the same query
    @param kwargs:
    @return:
    '''
    assert 'label_type' in kwargs and LABEL_TYPE.MultiLabel == kwargs['label_type']
    label_type = kwargs['label_type']
    assert 'presort' in kwargs and kwargs['presort'] is True  # aiming for direct usage of the ideal ranking

    # sort documents according to the predicted relevance
    batch_descending_preds, batch_pred_desc_inds = torch.sort(batch_preds, dim=1, descending=True)
    # reorder batch_stds correspondingly so as to make it consistent.
    # BTW, batch_stds[batch_preds_sorted_inds] only works with 1-D tensors
    batch_predict_rankings = torch.gather(batch_std_labels, dim=1, index=batch_pred_desc_inds)

    batch_p_ij, batch_std_p_ij = get_pairwise_comp_probs(batch_preds=batch_descending_preds,
                                                         batch_std_labels=batch_predict_rankings,
                                                         sigma=self.sigma)

    batch_delta_ndcg = get_delta_ndcg(batch_ideal_rankings=batch_std_labels,
                                      batch_predict_rankings=batch_predict_rankings,
                                      label_type=label_type, device=self.device)

    _batch_loss = F.binary_cross_entropy(input=torch.triu(batch_p_ij, diagonal=1),
                                         target=torch.triu(batch_std_p_ij, diagonal=1),
                                         weight=torch.triu(batch_delta_ndcg, diagonal=1),
                                         reduction='none')
    batch_loss = torch.sum(torch.sum(_batch_loss, dim=(2, 1)))

    self.optimizer.zero_grad()
    batch_loss.backward()
    self.optimizer.step()

    return batch_loss
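# get_pairwise_comp_probs is assumed above; the following sketch reconstructs it
# from the inline computation in inner_train (the library's actual version may
# differ): predicted pair probabilities via a sigmoid over score differences,
# and target pair probabilities 0.5 * (1 + S_ij).
def get_pairwise_comp_probs(batch_preds, batch_std_labels, sigma=None):
    batch_s_ij = torch.unsqueeze(batch_preds, dim=2) - torch.unsqueeze(batch_preds, dim=1)
    batch_p_ij = torch.sigmoid(sigma * batch_s_ij)  # equals 1 / (exp(-sigma * s_ij) + 1)
    batch_std_diffs = torch.unsqueeze(batch_std_labels, dim=2) - torch.unsqueeze(batch_std_labels, dim=1)
    batch_std_Sij = torch.clamp(batch_std_diffs, min=-1.0, max=1.0)
    batch_std_p_ij = 0.5 * (1.0 + batch_std_Sij)
    return batch_p_ij, batch_std_p_ij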
def forward(ctx, batch_preds, batch_stds, sigma, gpu):
    # sort documents according to the predicted relevance
    batch_preds_sorted, batch_preds_sorted_inds = torch.sort(batch_preds, dim=1, descending=True)
    # reorder batch_stds correspondingly so as to make it consistent.
    # BTW, batch_stds[batch_preds_sorted_inds] only works with 1-D tensors
    batch_stds_sorted_via_preds = torch.gather(batch_stds, dim=1, index=batch_preds_sorted_inds)

    # standard pairwise differences, i.e., S_{ij}
    batch_std_diffs = torch.unsqueeze(batch_stds_sorted_via_preds, dim=2) - torch.unsqueeze(batch_stds_sorted_via_preds, dim=1)
    batch_std_Sij = torch.clamp(batch_std_diffs, min=-1.0, max=1.0)  # ensuring S_{ij} \in {-1, 0, 1}

    # computing pairwise differences, i.e., s_i - s_j
    batch_pred_s_ij = torch.unsqueeze(batch_preds_sorted, dim=2) - torch.unsqueeze(batch_preds_sorted, dim=1)

    batch_delta_ndcg = get_delta_ndcg(batch_stds, batch_stds_sorted_via_preds)

    batch_loss_1st = 0.5 * sigma * batch_pred_s_ij * (1.0 - batch_std_Sij)             # cf. the first equation on page 3
    batch_loss_2nd = log_1_add_exp_minus_sigma(batch_pred_s_ij, sigma=sigma, gpu=gpu)  # cf. the first equation on page 3
    # weighting with delta-nDCG; the factor '0.5' compensates for counting every pair twice (symmetric property)
    batch_loss = torch.sum((batch_loss_1st + batch_loss_2nd) * batch_delta_ndcg * 0.5)

    #- gradient -#
    batch_grad = sigma * (0.5 * (1 - batch_std_Sij) - reciprocal_1_add_exp_sigma(batch_pred_s_ij, sigma=sigma, gpu=gpu))
    batch_grad = batch_grad * batch_delta_ndcg
    # relying on the symmetric property, the i-th row-sum corresponds to the cumulative gradient w.r.t. the i-th document
    batch_grad = torch.sum(batch_grad, dim=1, keepdim=True)

    ctx.save_for_backward(batch_grad)

    return batch_loss
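# The forward above is evidently the forward half of a torch.autograd.Function
# with a hand-coded gradient. Its numeric helpers are not shown; below is a
# hedged sketch of plausible implementations plus the paired backward, not the
# library's actual code.
def log_1_add_exp_minus_sigma(batch_pred_s_ij, sigma=None, gpu=False):
    # numerically stable log(1 + exp(-sigma * s_ij)) via softplus
    return sigma * F.softplus(-batch_pred_s_ij, beta=sigma)


def reciprocal_1_add_exp_sigma(batch_pred_s_ij, sigma=None, gpu=False):
    # 1 / (1 + exp(sigma * s_ij)), i.e., sigmoid(-sigma * s_ij)
    return torch.sigmoid(-sigma * batch_pred_s_ij)


def backward(ctx, grad_output):
    # Paired with forward: the per-document lambdas were precomputed and saved,
    # so backward only rescales them by the incoming gradient; sigma and gpu
    # receive no gradient. Note the saved tensor is laid out in the *predicted*
    # sort order, which matches batch_preds only if the caller sorts beforehand.
    batch_grad, = ctx.saved_tensors
    return torch.squeeze(batch_grad, dim=1) * grad_output, None, None, None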
def lambdaRank_loss_full(batch_preds=None, batch_stds=None, sigma=None, label_type=None, gpu=False):
    '''
    Instead of strictly taking the upper-diagonal entries, here we compute the lambda loss by fully exploiting the following properties:
    (1) the loss over the full pairwise difference matrix is twice the loss over merely the upper-diagonal entries
    (2) for ties, the delta-nDCG is zero, thus there is no need to explicitly remove pairs of ties
    '''
    # sort documents according to the predicted relevance
    batch_preds_sorted, batch_preds_sorted_inds = torch.sort(batch_preds, dim=1, descending=True)
    # reorder batch_stds correspondingly so as to make it consistent.
    # BTW, batch_stds[batch_preds_sorted_inds] only works with 1-D tensors
    batch_stds_sorted_via_preds = torch.gather(batch_stds, dim=1, index=batch_preds_sorted_inds)

    # standard pairwise differences, i.e., S_{ij}
    batch_std_diffs = torch.unsqueeze(batch_stds_sorted_via_preds, dim=2) - torch.unsqueeze(batch_stds_sorted_via_preds, dim=1)
    batch_std_Sij = torch.clamp(batch_std_diffs, min=-1.0, max=1.0)  # ensuring S_{ij} \in {-1, 0, 1}

    # computing pairwise differences, i.e., s_i - s_j
    batch_pred_s_ij = torch.unsqueeze(batch_preds_sorted, dim=2) - torch.unsqueeze(batch_preds_sorted, dim=1)

    batch_delta_ndcg = get_delta_ndcg(batch_stds, batch_stds_sorted_via_preds, label_type=label_type, gpu=gpu)

    batch_loss_1st = 0.5 * sigma * batch_pred_s_ij * (1.0 - batch_std_Sij)  # cf. the first equation on page 3
    batch_loss_2nd = torch.log(torch.exp(-sigma * batch_pred_s_ij) + 1.0)   # cf. the first equation on page 3

    # the coefficient 0.5 is added because all pairs are used (each pair counted twice)
    batch_loss = torch.sum(0.5 * (batch_loss_1st + batch_loss_2nd) * batch_delta_ndcg)  # weighting with delta-nDCG

    return batch_loss
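# get_delta_ndcg is the shared dependency of all the losses above, and the
# snippets call it with slightly different signatures. The following is a hedged
# reconstruction of its likely behavior (following the keyword style used in
# custom_loss_function), assuming the exponential gain 2^label - 1 for
# multi-level labels: |delta nDCG| incurred by swapping documents i and j in the
# current predicted order.
def get_delta_ndcg(batch_ideal_rankings, batch_predict_rankings, label_type=None, device='cpu'):
    batch_gains = torch.pow(2.0, batch_predict_rankings) - 1.0
    batch_ideal_gains = torch.pow(2.0, batch_ideal_rankings) - 1.0
    ranks = torch.arange(batch_predict_rankings.size(1), dtype=torch.float, device=device)
    discounts = 1.0 / torch.log2(ranks + 2.0)  # 1 / log2(position + 1) with 1-based positions
    batch_idcg = torch.sum(batch_ideal_gains * discounts, dim=1, keepdim=True)  # ideal DCG per query
    batch_n_gains = batch_gains / batch_idcg  # gains normalized by the ideal DCG
    # |G_i - G_j| * |1/log2(r_i + 1) - 1/log2(r_j + 1)|, broadcast over the batch
    gain_diffs = torch.unsqueeze(batch_n_gains, dim=2) - torch.unsqueeze(batch_n_gains, dim=1)
    discount_diffs = torch.unsqueeze(discounts, dim=1) - torch.unsqueeze(discounts, dim=0)
    return torch.abs(gain_diffs) * torch.abs(torch.unsqueeze(discount_diffs, dim=0))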