def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        tgt = batch.tgt
        segs = batch.segs
        clss = batch.clss
        alignment = batch.alignment
        mask_src = batch.mask_src
        mask_tgt = batch.mask_tgt
        mask_cls = batch.mask_cls
        mask_alg = batch.mask_alg

        outputs, model_state, src_mem_bank = self.model(src, tgt, segs, clss,
                                                        mask_src, mask_tgt, mask_cls)
        batch_stats = self.loss.sharded_compute_loss(
            batch, outputs, self.args.generator_shard_size, normalization,
            src_mem_bank=src_mem_bank, last_attn=model_state.last_attn,
            alignment=alignment, mask_alg=mask_alg, mask_tgt=mask_tgt)

        batch_stats.n_docs = int(src.size(0))

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
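# All of the variants in this file follow the same accumulate-then-step pattern:
# zero the gradients, backpropagate the loss of each micro-batch, optionally
# all-reduce gradients across GPUs, then take one optimizer step. A minimal,
# self-contained sketch of that pattern (hypothetical names, plain single-process
# PyTorch, no distributed all-reduce) is shown below for reference.
#
# import torch
# import torch.nn as nn
#
# def gradient_accumulation_step(model, optimizer, loss_fn, micro_batches, grad_accum_count):
#     """Accumulate gradients over several micro-batches before one optimizer step."""
#     model.zero_grad()
#     for inputs, targets in micro_batches:
#         loss = loss_fn(model(inputs), targets)
#         # Normalize so the accumulated gradient matches one large batch.
#         (loss / grad_accum_count).backward()
#     optimizer.step()
#
# # Usage (hypothetical toy model and data):
# model = nn.Linear(4, 1)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# micro_batches = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(4)]
# gradient_accumulation_step(model, optimizer, nn.MSELoss(), micro_batches, grad_accum_count=4)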
def _gradient_calculation(self, true_batchs, examples, total_stats,
                          report_stats, step):
    self.model.zero_grad()

    for batch in true_batchs:
        loss = self.model(batch)

        # Topic Model loss
        topic_stats = Statistics(topic_loss=loss.clone().item() / float(examples))
        loss.div(float(examples)).backward(retain_graph=False)
        total_stats.update(topic_stats)
        report_stats.update(topic_stats)

        if step % 1000 == 0:
            for k in range(self.args.topic_num):
                logger.info(','.join([
                    self.model.voc_id_wrapper.i2w(i)
                    for i in self.model.topic_model.tm1.beta.topk(20, dim=-1)[1][k].tolist()
                ]))

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.n_gpu > 1:
        grads = [p.grad.data for p in self.model.parameters()
                 if p.requires_grad and p.grad is not None]
        distributed.all_reduce_and_rescale_tensors(grads, float(1))
    for o in self.optims:
        o.step()
def _gradient_calculation(self, true_batchs, normalization, total_stats,
                          report_stats, step):
    self.model.zero_grad()

    for batch in true_batchs:
        decode_output, _, attn = self.model(batch)
        tgt_tokens, src_tokens, tgt_labels, sents, examples = normalization

        # Generation loss
        abs_stats = self.abs_loss(batch, decode_output,
                                  self.args.generator_shard_size,
                                  tgt_tokens, attns=attn)
        abs_stats.n_docs = len(batch)
        total_stats.update(abs_stats)
        report_stats.update(abs_stats)

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.n_gpu > 1:
        grads = [p.grad.data for p in self.model.parameters()
                 if p.requires_grad and p.grad is not None]
        distributed.all_reduce_and_rescale_tensors(grads, float(1))
    for o in self.optims:
        o.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        batch_stats, _, _ = self._main(batch, normalization)

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        labels = batch.src_sent_labels
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask_src
        mask_cls = batch.mask_cls
        src_txt = batch.src_txt

        sent_scores, mask = self.model(src, segs, clss, mask, mask_cls, src_txt)

        # pad = sent_scores.size()[1] - labels.size()[1]
        # labels_padded = torch.tensor([d + [0] * pad for d in labels])
        loss = self.loss(sent_scores, labels.float())
        loss = (loss * mask_cls.float()).sum()
        (loss / loss.numel()).backward()
        # loss.div(float(normalization)).backward()

        batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)
        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        tgt = batch.tgt
        segs = batch.segs
        clss = batch.clss
        mask_src = batch.mask_src
        mask_tgt = batch.mask_tgt
        mask_cls = batch.mask_cls
        z = batch.z
        mask_z = batch.mask_z
        z_segs = batch.z_segs

        outputs, scores, copy_prob = self.model(src, tgt, segs, clss, mask_src,
                                                mask_tgt, mask_cls, z, mask_z,
                                                z_segs)
        batch_stats = self.loss.sharded_compute_loss(
            batch, outputs, self.args.generator_shard_size, normalization)

        batch_stats.n_docs = int(src.size(0))

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        labels = batch.src_sent_labels.float()
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask_src
        mask_cls = batch.mask_cls

        sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

        loss = self.loss(sent_scores, labels)
        loss = (loss * mask.float()).sum() / mask.float().sum()
        loss.backward()

        # report accuracy
        abs_scores, abs_ids = torch.topk(sent_scores, 3, dim=1)
        abs_mask = (abs_scores > 0).float()
        n_sents = abs_mask.sum().item()
        n_correct = torch.sum(torch.gather(labels, 1, abs_ids) * abs_mask).item()

        batch_stats = Statistics(loss.item() * batch.batch_size,
                                 batch.batch_size, n_sents, n_correct)

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        labels = batch.src_sent_labels
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask_src
        mask_cls = batch.mask_cls

        sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

        if self.args.pairwise:
            loss = self.loss(sent_scores, labels.float(), mask)
            loss = loss.sum()
        else:
            loss = self.loss(sent_scores, labels.float())
            loss = (loss * mask.float()).sum()
        (loss / loss.numel()).backward()

        batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    # Clear old grads from last step.
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    # Iterate over true batches.
    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        labels = batch.labels
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask
        mask_cls = batch.mask_cls

        sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

        # Calculate loss and propagate backwards
        loss = self.loss(sent_scores, labels.float())
        loss = (loss * mask.float()).sum()
        (loss / loss.numel()).backward()

        # Report batch statistics
        batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)
        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # Update the parameters and statistics
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # In case of multi step gradient accumulation, update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_calculation(self, true_batchs, normalization, total_stats,
                          report_stats, step):
    self.model.zero_grad()

    for batch in true_batchs:
        outputs, _, topic_loss = self.model(batch)
        tgt_tokens, src_tokens, sents, examples = normalization

        if self.args.topic_model:
            # Topic Model loss
            topic_stats = Statistics(
                topic_loss=topic_loss.clone().item() / float(examples))
            topic_loss.div(float(examples)).backward(retain_graph=True)
            total_stats.update(topic_stats)
            report_stats.update(topic_stats)

        # Auto-encoder loss
        abs_stats = self.abs_loss(batch, outputs,
                                  self.args.generator_shard_size,
                                  tgt_tokens, retain_graph=False)
        abs_stats.n_docs = len(batch)
        total_stats.update(abs_stats)
        report_stats.update(abs_stats)

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.n_gpu > 1:
        grads = [p.grad.data for p in self.model.parameters()
                 if p.requires_grad and p.grad is not None]
        distributed.all_reduce_and_rescale_tensors(grads, float(1))
    for o in self.optims:
        o.step()
def _gradient_calculation(self, true_batchs, normalization, total_stats,
                          report_stats, step):
    self.model.zero_grad()

    for batch in true_batchs:
        cup_score, context_outputs, doc_data = self.model(batch)
        tokens, sents, summ_sents = normalization
        norm_sent_context = (tokens * self.args.sample_ratio *
                             (2 * self.args.win_size + 1) *
                             (1 + self.args.expand_ratio * (1 - self.args.pr)))
        norm_cup = sents * self.args.win_size * (self.args.negative_sample_num + 1) * 2.

        # Auto-encoder loss
        ae_context_stats = self.abs_loss(batch.context_tgt, context_outputs,
                                         self.args.generator_shard_size,
                                         norm_sent_context, retain_graph=True)
        ae_context_stats.n_docs = int(batch.src.size(0))
        total_stats.update(ae_context_stats)
        report_stats.update(ae_context_stats)

        # CUP loss
        cup_stats = self.cup_loss(batch.cup_tgt, cup_score,
                                  self.args.generator_shard_size,
                                  norm_cup, retain_graph=False)
        total_stats.update(cup_stats)
        report_stats.update(cup_stats)

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.n_gpu > 1:
        grads = [p.grad.data for p in self.model.parameters()
                 if p.requires_grad and p.grad is not None]
        distributed.all_reduce_and_rescale_tensors(grads, float(1))
    for o in self.optims:
        o.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats, step):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        # print(self.gpu_rank)
        src = batch.src
        tgt = batch.tgt
        segs = batch.segs
        clss = batch.clss
        mask_src = batch.mask_src
        mask_tgt = batch.mask_tgt
        mask_cls = batch.mask_cls

        # TODO modify
        # outputs, scores = self.model(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)
        outputs, scores, src_context, graph_context, top_vec, ent_top_vec, emask = self.model(
            src, tgt, segs, clss, mask_src, mask_tgt, mask_cls, batch)

        # ent_src
        batch_stats, copy_v = self.loss.sharded_compute_loss(
            batch, outputs, self.args.generator_shard_size, normalization,
            src_context, graph_context, batch.ent_src, ent_top_vec, self.copy)
        # source_src
        # batch_stats, copy_v = self.loss.sharded_compute_loss(
        #     batch, outputs, self.args.generator_shard_size, normalization,
        #     src_context, graph_context, batch.src, top_vec, self.copy)

        for name, parms in self.model.generator.named_parameters():
            if name == 'copy_ff.weight':
                out_grad = parms.grad

        batch_stats.n_docs = int(src.size(0))

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()
            # self.report_manager.tensorboard_writer.add_histogram('copy__distribution', copy_v, step)
            # self.report_manager.tensorboard_writer.add_histogram('score__distribution', out_grad, step)
            # print('write down')

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        labels = batch.src_sent_labels
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask_src
        mask_cls = batch.mask_cls

        if self.args.ext_sum_dec:
            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls, labels)
            tgt_len = 3
            _, labels_id = torch.topk(labels, k=tgt_len)  # B, tgt_len
            labels_id, _ = torch.sort(labels_id)
            # nsent 100, weight_up 20
            weight = torch.linspace(start=1, end=self.args.weight_up,
                                    steps=self.args.max_src_nsents).type_as(sent_scores)
            # global max_class
            # max_class = max(max_class, torch.max(labels_id + 1).item())
            weight = weight[:sent_scores.size(-1)]
            # weight = torch.ones(self.args.max_src_nsents)
            loss = F.nll_loss(
                F.log_softmax(
                    sent_scores.view(-1, sent_scores.size(-1)),
                    dim=-1,
                    dtype=torch.float32,
                ),
                labels_id.view(-1),  # bsz * sent
                weight=weight,
                reduction='sum',
                ignore_index=-1,
            )
            prediction = torch.argmax(sent_scores, dim=-1)
            if (self.optim._step + 1) % self.args.print_every == 0:
                logger.info('train prediction: %s |label %s '
                            % (str(prediction), str(labels_id)))
            # element-wise comparison, averaged over the target length
            accuracy = torch.div(torch.sum(torch.eq(prediction, labels_id).float()), tgt_len)
        else:
            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)
            loss = self.loss(sent_scores, labels.float())
            loss = (loss * mask.float()).sum()
            tgt_len = 3
            _, labels_id = torch.topk(labels, k=tgt_len)  # B, tgt_len
            labels_id, _ = torch.sort(labels_id)
            _, prediction = torch.topk(sent_scores, k=tgt_len)
            prediction, _ = torch.sort(prediction)
            if (self.optim._step + 1) % self.args.print_every == 0:
                logger.info('train prediction: %s |label %s '
                            % (str(prediction), str(labels_id)))
            accuracy = torch.div(torch.sum(torch.eq(prediction, labels_id).float()), tgt_len)

        (loss / loss.numel()).backward()
        # with amp.scale_loss((loss / loss.numel()), self.optim.optimizer) as scaled_loss:
        #     scaled_loss.backward()
        # loss.div(float(normalization)).backward()

        if self.args.acc_reporter:
            batch_stats = acc_reporter(float(loss.cpu().data.numpy()), accuracy, normalization)
        else:
            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        tgt = batch.tgt
        segs = batch.segs
        clss = batch.clss
        mask_src = batch.mask_src
        mask_tgt = batch.mask_tgt
        mask_cls = batch.mask_cls

        if self.args.task == 'hybrid':
            if self.args.oracle or self.args.hybrid_loss:
                labels = batch.src_sent_labels
                outputs, scores, copy_params = self.model(
                    src, tgt, segs, clss, mask_src, mask_tgt, mask_cls, labels)
            else:
                outputs, scores, copy_params = self.model(
                    src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)
            batch_stats = self.loss.sharded_compute_loss(
                batch, outputs, self.args.generator_shard_size, normalization,
                copy_params)
            paramss = list(self.model.named_parameters())
            for each in paramss:
                try:
                    if torch.isnan(each[1].grad.sum()):
                        exit()
                except:
                    continue
        else:
            outputs, scores = self.model(src, tgt, segs, clss, mask_src,
                                         mask_tgt, mask_cls)
            batch_stats = self.loss.sharded_compute_loss(
                batch, outputs, self.args.generator_shard_size, normalization)

        batch_stats.n_docs = int(src.size(0))

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
def _gradient_accumulation(self, true_batchs, total_stats, report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        if self.args.mode == 'train':
            src = batch.src
            tgt = batch.tgt
            pmt_msk = batch.pmt_msk
            states = batch.states
            ex_idx = batch.ex_idx
            tgt_idx = batch.tgt_idx
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt

            outputs, _ = self.model(src, tgt, mask_src, pmt_msk, ex_idx)
            init_logps, trans_logps = self.model.trans_logprobs()
            ext_logps = self.model.external_logprobs()
            batch_stats = self.loss.compute_loss(batch, outputs, states,
                                                 ex_idx, tgt_idx, mask_tgt,
                                                 init_logps, trans_logps,
                                                 ext_logps)
        else:
            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt

            outputs, scores = self.model(src, tgt, segs, mask_src, mask_tgt)
            batch_stats = self.loss.sharded_compute_loss(
                batch, outputs, self.args.generator_shard_size)

        batch_stats.n_docs = int(src.size(0))

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        labels = batch.src_sent_labels
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask_src
        mask_cls = batch.mask_cls

        sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

        if self.args.pairwise:
            loss = self.loss(sent_scores, labels.float(), mask)
            loss = loss.sum()
        else:
            loss = self.loss(sent_scores, labels.float())
            loss = (loss * mask.float()).sum()

        # averaged over elements; numel() returns the number of elements
        (loss / loss.numel()).backward()
        # loss.div(float(normalization)).backward()

        batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        sent_rg_scores = batch.src_sent_labels
        sent_sect_labels = batch.sent_sect_labels
        sent_bin_labels = batch.sent_labels
        # if self.rg_predictor:
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask_src
        mask_cls = batch.mask_cls

        if self.is_joint:
            if not self.rg_predictor:
                sent_scores, sent_sect_scores, mask, loss, loss_sent, loss_sect = self.model(
                    src, segs, clss, mask, mask_cls, sent_bin_labels, sent_sect_labels)
            else:
                sent_scores, sent_sect_scores, mask, loss, loss_sent, loss_sect = self.model(
                    src, segs, clss, mask, mask_cls, sent_rg_scores, sent_sect_labels)
            try:
                acc, pred = self._get_mertrics(sent_sect_scores, sent_sect_labels,
                                               mask=mask, task='sent_sect')
            except:
                logger.info("Accuracy cannot be computed due to some errors in "
                            "loading appropriate files...")

            batch_stats = Statistics(
                loss=float(loss.cpu().data.numpy().sum()),
                loss_sect=float(loss_sect.cpu().data.numpy().sum()),
                loss_sent=float(loss_sent.cpu().data.numpy().sum()),
                n_docs=normalization,
                n_acc=batch.batch_size,
                RMSE=self._get_mertrics(sent_scores, sent_rg_scores, mask=mask, task='sent'),
                accuracy=acc,
                a1=self.model.uncertainty_loss._sigmas_sq[0].item(),
                a2=self.model.uncertainty_loss._sigmas_sq[1].item())
        else:  # simple
            if not self.rg_predictor:
                sent_scores, mask, loss, _, _ = self.model(src, segs, clss, mask, mask_cls,
                                                           sent_bin_labels=sent_bin_labels,
                                                           sent_sect_labels=None)
            else:
                sent_scores, mask, loss, _, _ = self.model(src, segs, clss, mask, mask_cls,
                                                           sent_bin_labels=sent_rg_scores,
                                                           sent_sect_labels=None)
            # loss = self.loss(sent_scores, sent_rg_scores.float())
            batch_stats = Statistics(
                loss=float(loss.cpu().data.numpy().sum()),
                RMSE=self._get_mertrics(sent_scores, sent_rg_scores, mask=mask, task='sent'),
                n_acc=batch.batch_size,
                n_docs=normalization,
                a1=self.model.uncertainty_loss._sigmas_sq[0] if self.is_joint else 0,
                a2=self.model.uncertainty_loss._sigmas_sq[1] if self.is_joint else 0)

        loss.backward()

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            # self.optim.step(report_stats=report_stats)

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step(report_stats)
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        labels = batch.labels
        segs = batch.segs
        clss = batch.clss
        mask = batch.mask
        mask_cls = batch.mask_cls
        group_idxs = batch.groups
        # they need to have these two attributes
        sel_sent_idxs = batch.sel_sent_idxs
        sel_sent_masks = batch.sel_sent_masks
        candi_masks = batch.candi_masks
        # pair_masks = batch.pair_masks
        src_str, tgt_str = batch.src_str, batch.tgt_str
        soft_labels = batch.soft_labels

        if self.args.model_name == 'seq':
            sent_scores, _ = self.model(src, mask, segs, clss, mask_cls, group_idxs,
                                        sel_sent_idxs=sel_sent_idxs,
                                        sel_sent_masks=sel_sent_masks,
                                        candi_sent_masks=candi_masks)
            # batch, seq_len, sent_count
            pred = sent_scores.contiguous().view(-1, sent_scores.size(2))
            gold = batch.label_seq.contiguous().view(-1)
            if self.args.use_rouge_label:
                soft_labels = soft_labels.contiguous().view(-1, soft_labels.size(2))  # batch*seq_len, sent_count
                log_prb = F.log_softmax(pred, dim=1)
                non_pad_mask = gold.ne(-1)  # padding value
                sent_mask = mask_cls.unsqueeze(1).expand(-1, sent_scores.size(1), -1)
                sent_mask = sent_mask.contiguous().view(-1, sent_scores.size(2))
                loss = -((soft_labels * log_prb) * sent_mask.float()).sum(dim=1)
                loss = loss.masked_select(non_pad_mask).sum()  # average later
            else:
                loss = F.cross_entropy(pred, gold, ignore_index=-1, reduction='sum')
        else:
            sent_scores, _ = self.model(src, mask, segs, clss, mask_cls, group_idxs,
                                        sel_sent_idxs=sel_sent_idxs,
                                        sel_sent_masks=sel_sent_masks,
                                        candi_sent_masks=candi_masks,
                                        sel_sent_hit_map=batch.hit_map)
            if self.args.use_rouge_label:
                labels = soft_labels
            if self.args.loss == "bce":
                loss = self.bce_logits_loss(sent_scores, labels.float())  # pointwise
            elif self.args.loss == "wsoftmax":
                loss = -self.logsoftmax(sent_scores) * labels.float()  # batch_size, max_sent_count
            loss = (loss * candi_masks.float()).sum()

        # print("loss_sum", loss)
        (loss / loss.numel()).backward()

        batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)
        # print([p for p in self.model.parameters() if p.requires_grad])

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _maml_outter_gradient_accumulation(self, true_batchs, normalization,
                                       report_stats, step, inner_step,
                                       task_accum):
    """Outer loop training.

    NOTE: At the end of function, the adapters will be set to vars mode.

    Args:
        true_batchs (list[data.data_loader.Batch])
        normalization (int): the number of non-padding tokens in the batch.
        report_stats (models.reporter.Statistics)
        step (int): current outer loop step.
        inner_step (int): current inner loop step.
        task_accum (int): current task.
    """
    if self.grad_accum_count > 1 and task_accum == 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1 and task_accum == 1:
            self.model.zero_grad()

        src = batch.src
        tgt = batch.tgt
        segs = batch.segs
        clss = batch.clss
        mask_src = batch.mask_src
        mask_tgt = batch.mask_tgt
        mask_cls = batch.mask_cls

        outputs, scores = self.model(src, tgt, segs, clss, mask_src, mask_tgt,
                                     mask_cls)
        batch_stats = self.loss.monolithic_compute_loss_backprop(
            batch, outputs, normalization)

        batch_stats.n_docs = int(src.size(0))
        report_stats.update(batch_stats)

        if self.grad_accum_count == 1 and task_accum == self.args.num_task:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()

    # Update only after accum batches
    if self.grad_accum_count > 1 and task_accum == self.args.num_task:
        # Multi GPU gradient gather
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()

    # NOTE: Clean fast weight
    self.model._clean_fast_weights_mode()
def _maml_inner_gradient_accumulation(self, true_batchs, normalization,
                                      report_stats, inner_step, task_accum,
                                      inference_mode=False):
    """Inner loop training.

    NOTE:
        1. At the end of function, the adapter will be set to fast weights mode.
        2. This function does not require self.model.zero_grad(), since it
           does not use .backward()

    Args:
        true_batchs (list[data.data_loader.Batch])
        normalization (int): the number of non-padding tokens in the batch.
        report_stats (models.reporter.Statistics)
        inner_step (int): current inner loop step.
        task_accum (int): current task.
    """
    grad = None
    for batch in true_batchs:
        src = batch.src
        tgt = batch.tgt
        segs = batch.segs
        clss = batch.clss
        mask_src = batch.mask_src
        mask_tgt = batch.mask_tgt
        mask_cls = batch.mask_cls

        outputs, scores = self.model(src, tgt, segs, clss, mask_src, mask_tgt,
                                     mask_cls)
        loss, batch_stats = self.loss.monolithic_compute_loss_return(
            batch, outputs)

        # Compute gradient for adapter modules
        if grad is None or self.grad_accum_count == 1:
            if inner_step == 1:
                grad = torch.autograd.grad(loss.div(normalization),
                                           self.model._adapter_vars())
            else:
                grad = torch.autograd.grad(loss.div(normalization),
                                           self.model._adapter_fast_weights())
        else:
            if inner_step == 1:
                next_grad = torch.autograd.grad(loss.div(normalization),
                                                self.model._adapter_vars())
            else:
                next_grad = torch.autograd.grad(loss.div(normalization),
                                                self.model._adapter_fast_weights())
            grad = tuple([sum(x) for x in zip(grad, next_grad)])

        batch_stats.n_docs = int(src.size(0))
        report_stats.update(batch_stats)

        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                distributed.all_reduce_and_rescale_tensors(grad, float(1))

            if inner_step == 1:
                # Compute update values with Adam
                _, update_values_bert = self.optims_inner[task_accum - 1][0].step(
                    self.model._adapter_vars_bert(), grad, inner_step=inner_step)
                _, update_values_dec = self.optims_inner[task_accum - 1][1].step(
                    self.model._adapter_vars_dec(),
                    grad[len(update_values_bert):], inner_step=inner_step)
                update_values = update_values_bert + update_values_dec
                # Compute new weights that maintain a differentiable path to previous weights
                fast_weights = list(
                    map(lambda p: p[1] + p[0],
                        zip(update_values, self.model._adapter_vars())))
            else:
                # Compute update values with Adam
                _, update_values_bert = self.optims_inner[task_accum - 1][0].step(
                    self.model._adapter_fast_weights_bert(), grad,
                    inner_step=inner_step)
                _, update_values_dec = self.optims_inner[task_accum - 1][1].step(
                    self.model._adapter_fast_weights_dec(),
                    grad[len(update_values_bert):], inner_step=inner_step)
                update_values = update_values_bert + update_values_dec
                # Compute new weights that maintain a differentiable path to previous weights
                fast_weights = list(
                    map(lambda p: p[1] + p[0],
                        zip(update_values, self.model._adapter_fast_weights())))

    # update only after accum batches
    if self.grad_accum_count > 1:
        # Multi GPU gradient gather
        if self.n_gpu > 1:
            distributed.all_reduce_and_rescale_tensors(grad, float(1))

        if inner_step == 1:
            # Compute update values with Adam
            _, update_values_bert = self.optims_inner[task_accum - 1][0].step(
                self.model._adapter_vars_bert(), grad, inner_step=inner_step)
            _, update_values_dec = self.optims_inner[task_accum - 1][1].step(
                self.model._adapter_vars_dec(),
                grad[len(update_values_bert):], inner_step=inner_step)
            update_values = update_values_bert + update_values_dec
            # Compute new weights that maintain a differentiable path to previous weights
            fast_weights = list(
                map(lambda p: p[1] + p[0],
                    zip(update_values, self.model._adapter_vars())))
        else:
            # Compute update values with Adam
            _, update_values_bert = self.optims_inner[task_accum - 1][0].step(
                self.model._adapter_fast_weights_bert(), grad,
                inner_step=inner_step)
            _, update_values_dec = self.optims_inner[task_accum - 1][1].step(
                self.model._adapter_fast_weights_dec(),
                grad[len(update_values_bert):], inner_step=inner_step)
            update_values = update_values_bert + update_values_dec
            # Compute new weights that maintain a differentiable path to previous weights
            fast_weights = list(
                map(lambda p: p[1] + p[0],
                    zip(update_values, self.model._adapter_fast_weights())))

    # Do not accumulate gradient in inference mode
    if inference_mode:
        fast_weights = [w.data for w in fast_weights]
        for w in fast_weights:
            w.requires_grad = True

    # NOTE: Use the new weights for the following computation; the derivative path is still maintained
    self.model._cascade_fast_weights_grad(fast_weights)
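# The inner loop above keeps "fast weights" that are functions of the original
# adapter parameters, so the outer loop can backpropagate through the inner
# update. A minimal sketch of that idea with hypothetical tensors is shown
# below (plain SGD instead of the inner Adam used above, and create_graph=True
# to keep the update differentiable):
#
# import torch
#
# w = torch.randn(3, requires_grad=True)            # adapter parameter ("vars" mode)
# x, y = torch.randn(5, 3), torch.randn(5)
#
# inner_loss = ((x @ w - y) ** 2).mean()
# (g,) = torch.autograd.grad(inner_loss, w, create_graph=True)
# fast_w = w - 0.1 * g                              # fast weight, still a function of w
#
# outer_loss = ((x @ fast_w - y) ** 2).mean()
# outer_loss.backward()                             # gradient flows back to the original w
# print(w.grad.shape)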
def _gradient_calculation(self, true_batchs, normalization, total_stats,
                          report_stats, step):
    self.model.zero_grad()

    for batch in true_batchs:
        if self.args.pretrain:
            pn_output, decode_output, topic_loss, _ = self.model.pretrain(batch)
        else:
            rl_loss, decode_output, topic_loss, _, _ = self.model(batch)
        tgt_tokens, src_tokens, tgt_labels, sents, examples = normalization

        if self.args.pretrain:
            if self.args.topic_model:
                # Topic Model loss
                topic_stats = Statistics(
                    topic_loss=topic_loss.clone().item() / float(examples))
                topic_loss.div(float(examples)).backward(retain_graph=True)
                total_stats.update(topic_stats)
                report_stats.update(topic_stats)

            # Extraction loss
            pn_stats = self.pn_loss(batch.pn_tgt, pn_output,
                                    self.args.generator_shard_size,
                                    tgt_labels, retain_graph=True)
            total_stats.update(pn_stats)
            report_stats.update(pn_stats)

            # Generation loss
            abs_stats = self.abs_loss(batch, decode_output,
                                      self.args.generator_shard_size,
                                      tgt_tokens, retain_graph=False)
            abs_stats.n_docs = len(batch)
            total_stats.update(abs_stats)
            report_stats.update(abs_stats)
        else:
            if self.args.topic_model:
                # Topic Model loss
                topic_stats = Statistics(
                    topic_loss=topic_loss.clone().item() / float(examples))
                topic_loss.div(float(examples)).backward(retain_graph=True)
                total_stats.update(topic_stats)
                report_stats.update(topic_stats)

            # RL loss
            rl_stats = Statistics(rl_loss=rl_loss.clone().item() / float(examples))
            # critic_stats = Statistics(ct_loss=critic_loss.clone().item() / float(examples))
            rl_loss.div(float(examples)).backward(retain_graph=True)
            total_stats.update(rl_stats)
            # total_stats.update(critic_stats)
            report_stats.update(rl_stats)
            # report_stats.update(critic_stats)

            # Generation loss
            abs_stats = self.abs_loss(batch, decode_output,
                                      self.args.generator_shard_size,
                                      tgt_tokens, retain_graph=False)
            abs_stats.n_docs = len(batch)
            total_stats.update(abs_stats)
            report_stats.update(abs_stats)

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.n_gpu > 1:
        grads = [p.grad.data for p in self.model.parameters()
                 if p.requires_grad and p.grad is not None]
        distributed.all_reduce_and_rescale_tensors(grads, float(1))
    for o in self.optims:
        o.step()
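# Several of the variants above backpropagate multiple losses that share one
# computation graph, which is why every backward call except the last passes
# retain_graph=True. A minimal sketch of that pattern with hypothetical
# tensors:
#
# import torch
#
# w = torch.randn(3, requires_grad=True)
# x = torch.randn(5, 3)
# hidden = x @ w                       # shared computation
#
# loss_a = hidden.pow(2).mean()        # e.g. topic-model loss
# loss_b = hidden.abs().mean()         # e.g. generation loss
#
# loss_a.backward(retain_graph=True)   # keep the graph for the next backward
# loss_b.backward()                    # the last backward may free the graph
# print(w.grad.norm())                 # gradients from both losses are accumulated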
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        # How the batch fields were built (kept for reference):
        # src = torch.tensor(self._pad(pre_src, 0))
        # segs = torch.tensor(self._pad(pre_segs, 0))
        # mask_src = torch.logical_not(src == 0)
        # clss = torch.tensor(self._pad(pre_clss, -1))
        # src_sent_labels = torch.tensor(self._pad(pre_src_sent_labels, 0))
        # mask_cls = torch.logical_not(clss == -1)
        # clss[clss == -1] = 0
        # setattr(self, 'clss' + postfix, clss.to(device))
        # setattr(self, 'mask_cls' + postfix, mask_cls.to(device))
        # setattr(self, 'src_sent_labels' + postfix, src_sent_labels.to(device))
        # setattr(self, 'src' + postfix, src.to(device))
        # setattr(self, 'segs' + postfix, segs.to(device))
        # setattr(self, 'mask_src' + postfix, mask_src.to(device))
        # The prediction targets below are padded with -1: loss computation
        # stops at -1, so no extra mask is needed (masks are only required for inputs).
        # org_sent_labels = torch.tensor(self._pad(org_sent_labels, -1))
        # setattr(self, 'org_sent_labels' + postfix, org_sent_labels.to(device))
        # poss = torch.tensor(self._pad(poss, -1))
        # setattr(self, 'poss' + postfix, poss.to(device))

        if self.args.jigsaw == 'jigsaw_lab':
            # jigsaw_lab: each position predicted independently (a failed attempt)
            logits = self.model(batch.src_s, batch.segs_s, batch.clss_s,
                                batch.mask_src_s, batch.mask_cls_s)  # bsz, tgt_len, nsent
            # mask = batch.mask_cls_s[:, :, None].float()
            # loss = self.loss(sent_scores, batch.poss_s.float())
            loss = F.nll_loss(
                F.log_softmax(
                    logits.view(-1, logits.size(-1)),
                    dim=-1,
                    dtype=torch.float32,
                ),
                batch.poss_s.view(-1),  # bsz * sent
                reduction='sum',
                ignore_index=-1,
            )
            prediction = torch.argmax(logits, dim=-1)
            if (self.optim._step + 1) % self.args.print_every == 0:
                logger.info('train prediction: %s |label %s '
                            % (str(prediction), str(batch.poss_s)))
            # element-wise comparison restricted to non-padded sentences
            accuracy = torch.div(
                torch.sum(torch.eq(prediction, batch.poss_s) * batch.mask_cls_s),
                torch.sum(batch.mask_cls_s)) * len(logits)
            # loss = (loss * batch.mask_cls_s.float()).sum()
            # (loss / loss.numel()).backward()
        else:  # self.args.jigsaw == 'jigsaw_dec': jigsaw decoder
            poss_s = batch.poss_s
            mask_poss = torch.eq(poss_s, -1)
            poss_s.masked_fill_(mask_poss, 1e4)
            # poss_s[i] [5,1,4,0,2,3,-1,-1] -> [5,1,4,0,2,3,1e4,1e4]; dec_labels[i] [3,1,...,6,7]
            dec_labels = torch.argsort(poss_s, dim=1)
            logits, _ = self.model(batch.src_s, batch.segs_s, batch.clss_s,
                                   batch.mask_src_s, batch.mask_cls_s, dec_labels)
            final_dec_labels = dec_labels.masked_fill(mask_poss, -1)
            loss = F.nll_loss(
                F.log_softmax(
                    logits.view(-1, logits.size(-1)),
                    dim=-1,
                    dtype=torch.float32,
                ),
                final_dec_labels.view(-1),  # bsz * sent
                reduction='sum',
                ignore_index=-1,
            )
            # loss = (loss * batch.mask_cls_s.float()).sum()
            # (loss / loss.numel()).backward()
            prediction = torch.argmax(logits, dim=-1)
            if (self.optim._step + 1) % self.args.print_every == 0:
                logger.info('train prediction: %s |label %s '
                            % (str(prediction), str(batch.poss_s)))
            accuracy = torch.div(
                torch.sum(torch.eq(prediction, batch.poss_s) * batch.mask_cls_s),
                torch.sum(batch.mask_cls_s)) * len(logits)

        with amp.scale_loss((loss / loss.numel()), self.optim.optimizer) as scaled_loss:
            scaled_loss.backward()
        # loss.div(float(normalization)).backward()

        if self.args.acc_reporter:
            batch_stats = acc_reporter.Statistics(float(loss.cpu().data.numpy()),
                                                  accuracy, normalization)
        else:
            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()

    for batch in true_batchs:
        if self.grad_accum_count == 1:
            self.model.zero_grad()

        src = batch.src
        tgt = batch.tgt
        segs = batch.segs
        clss = batch.clss
        tgt_eng = batch.tgt_eng

        # the tgt_segs idea has been deprecated
        if not hasattr(batch, 'tgt_segs'):
            tgt_segs = torch.ones(tgt.size()).long().cuda()
        else:
            tgt_segs = batch.tgt_segs

        if self.args.batch_verification:
            self.verification(batch)

        mask_src = batch.mask_src
        mask_tgt = batch.mask_tgt
        mask_cls = batch.mask_cls

        outputs, scores, mono_outputs = self.model(src, tgt, segs, clss,
                                                   mask_src, mask_tgt, mask_cls,
                                                   tgt_eng=tgt_eng,
                                                   tgt_segs=tgt_segs)

        # With two languages, simply concatenate the outputs with the targets.
        # Calculate the multi-task loss: concatenate monolingual outputs and cross-lingual outputs.
        if self.args.multi_task:
            # Labels are concatenated from the second token on (the first cls token is not included).
            batch.tgt = torch.cat((tgt, tgt_eng[:, 1:]), dim=1)
            outputs = torch.cat((outputs, mono_outputs), dim=1)

        batch_stats = self.loss.sharded_compute_loss(
            batch, outputs, self.args.generator_shard_size, normalization)

        batch_stats.n_docs = int(src.size(0))

        total_stats.update(batch_stats)
        report_stats.update(batch_stats)

        # 4. Update the parameters and statistics.
        if self.grad_accum_count == 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()