import torch
import torch.nn.functional as F


def get_tsa_thresh(schedule, global_step, num_train_steps, start, end):
    # Training Signal Annealing: anneal the confidence threshold from `start`
    # to `end` as training progresses, per the UDA paper's three schedules.
    training_progress = torch.tensor(float(global_step) / float(num_train_steps))
    if schedule == 'linear_schedule':
        threshold = training_progress
    elif schedule == 'exp_schedule':
        scale = 5
        threshold = torch.exp((training_progress - 1) * scale)
    elif schedule == 'log_schedule':
        scale = 5
        threshold = 1 - torch.exp((-training_progress) * scale)
    else:
        raise ValueError('Unknown TSA schedule: %s' % schedule)
    output = threshold * (end - start) + start
    return output.to(_get_device())
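# Illustrative only, not part of the original script: a quick comparison of the
# three TSA schedules at the start, middle, and end of a 100-step run for a
# 2-class problem (start = 1/2, end = 1). Relies on the same _get_device helper
# the functions in this file already assume.
def _demo_tsa_schedules():
    for schedule in ('linear_schedule', 'exp_schedule', 'log_schedule'):
        thresholds = [get_tsa_thresh(schedule, step, 100, start=0.5, end=1.).item()
                      for step in (0, 50, 100)]
        print(schedule, [round(t, 3) for t in thresholds])
    # linear_schedule rises steadily (0.5 -> 0.75 -> 1.0); exp_schedule stays
    # low until late in training; log_schedule jumps early and saturates near 1.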
def get_loss(model, sup_batch, unsup_batch, global_step):
    # logits -> prob (softmax) -> log_prob (log_softmax)

    # batch
    input_ids, segment_ids, input_mask, label_ids = sup_batch
    if unsup_batch:
        ori_input_ids, ori_segment_ids, ori_input_mask, \
        aug_input_ids, aug_segment_ids, aug_input_mask = unsup_batch

        # run supervised and augmented unsupervised examples in one forward pass
        input_ids = torch.cat((input_ids, aug_input_ids), dim=0)
        segment_ids = torch.cat((segment_ids, aug_segment_ids), dim=0)
        input_mask = torch.cat((input_mask, aug_input_mask), dim=0)

    # logits
    logits = model(input_ids, segment_ids, input_mask)

    # sup loss
    sup_size = label_ids.shape[0]
    sup_loss = sup_criterion(logits[:sup_size], label_ids)  # shape : train_batch_size
    if cfg.tsa:
        tsa_thresh = get_tsa_thresh(cfg.tsa, global_step, cfg.total_steps,
                                    start=1. / logits.shape[-1], end=1)
        # prob = exp(log_prob); mask out examples the model already predicts
        # with probability above the TSA threshold
        larger_than_threshold = torch.exp(-sup_loss) > tsa_thresh
        # larger_than_threshold = torch.sum(F.softmax(pred[:sup_size]) * torch.eye(num_labels)[sup_label_ids], dim=-1) > tsa_threshold
        loss_mask = torch.ones_like(label_ids, dtype=torch.float32) * (1 - larger_than_threshold.type(torch.float32))
        sup_loss = torch.sum(sup_loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), torch_device_one())
    else:
        sup_loss = torch.mean(sup_loss)

    # unsup loss
    if unsup_batch:
        # ori: predictions on the original (unaugmented) examples, no gradient
        with torch.no_grad():
            ori_logits = model(ori_input_ids, ori_segment_ids, ori_input_mask)
            ori_prob = F.softmax(ori_logits, dim=-1)    # KLdiv target
            # ori_log_prob = F.log_softmax(ori_logits, dim=-1)

            # confidence-based masking
            if cfg.uda_confidence_thresh != -1:
                unsup_loss_mask = torch.max(ori_prob, dim=-1)[0] > cfg.uda_confidence_thresh
                unsup_loss_mask = unsup_loss_mask.type(torch.float32)
            else:
                unsup_loss_mask = torch.ones(len(logits) - sup_size, dtype=torch.float32)
            unsup_loss_mask = unsup_loss_mask.to(_get_device())

        # aug: sharpen the augmented predictions with a softmax temperature
        uda_softmax_temp = cfg.uda_softmax_temp if cfg.uda_softmax_temp > 0 else 1.
        aug_log_prob = F.log_softmax(logits[sup_size:] / uda_softmax_temp, dim=-1)

        # KLdiv loss
        """
            nn.KLDivLoss (kl_div)
            input : log_prob (log_softmax)
            target : prob (softmax)
            https://pytorch.org/docs/stable/nn.html

            unsup_loss is divided by the number of unmasked examples
            (the sum of unsup_loss_mask), which differs from the official
            Google UDA implementation, where unsup_loss is divided by the
            total number of unsupervised examples:
            https://github.com/google-research/uda/blob/master/text/uda.py#L175
        """
        unsup_loss = torch.sum(unsup_criterion(aug_log_prob, ori_prob), dim=-1)
        unsup_loss = torch.sum(unsup_loss * unsup_loss_mask, dim=-1) / torch.max(torch.sum(unsup_loss_mask, dim=-1), torch_device_one())
        final_loss = sup_loss + cfg.uda_coeff * unsup_loss

        return final_loss, sup_loss, unsup_loss
    return sup_loss, None, None
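# A minimal sketch of the module-level names the functions above rely on
# (sup_criterion, unsup_criterion, torch_device_one, _get_device). These are
# assumptions mirroring the usual UDA_pytorch setup, not definitions taken
# from this file; the real ones live elsewhere in the repo.
import torch.nn as nn

def _get_device():
    # assumed helper: prefer the GPU when one is available
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def torch_device_one():
    # scalar 1.0 on the active device; keeps the masked-mean denominators >= 1
    return torch.tensor(1., device=_get_device())

# reduction='none' keeps per-example losses so the TSA and confidence masks
# can be applied before averaging
sup_criterion = nn.CrossEntropyLoss(reduction='none')
unsup_criterion = nn.KLDivLoss(reduction='none')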
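# Illustrative usage (hypothetical optimizer and batches, not from the repo):
# one UDA training step drives both losses through a single backward pass.
def _demo_train_step(model, optimizer, sup_batch, unsup_batch, global_step):
    # assumes both a supervised and an unsupervised batch are available
    final_loss, sup_loss, unsup_loss = get_loss(model, sup_batch, unsup_batch, global_step)
    optimizer.zero_grad()
    final_loss.backward()
    optimizer.step()
    return final_loss.item(), sup_loss.item(), unsup_loss.item()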