Example no. 1
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            clss = batch.clss
            alignment = batch.alignment

            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt
            mask_cls = batch.mask_cls
            mask_alg = batch.mask_alg

            # forward pass: decoder outputs, the final model state (for its last attention),
            # and the encoder memory bank, both of which the alignment-aware loss below needs
            outputs, model_state, src_mem_bank = self.model(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)
            batch_stats = self.loss.sharded_compute_loss(
                batch,
                outputs,
                self.args.generator_shard_size,
                normalization,
                src_mem_bank=src_mem_bank,
                last_attn=model_state.last_attn,
                alignment=alignment,
                mask_alg=mask_alg,
                mask_tgt=mask_tgt)

            batch_stats.n_docs = int(src.size(0))

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))

                for o in self.optims:
                    o.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            for o in self.optims:
                o.step()
Example no. 2
    def _gradient_calculation(self, true_batchs, examples, total_stats,
                              report_stats, step):
        self.model.zero_grad()

        for batch in true_batchs:
            loss = self.model(batch)

            # Topic Model loss
            topic_stats = Statistics(topic_loss=loss.clone().item() /
                                     float(examples))
            loss.div(float(examples)).backward(retain_graph=False)
            total_stats.update(topic_stats)
            report_stats.update(topic_stats)

        if step % 1000 == 0:
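            # log the 20 highest-weighted vocabulary words of each topic (indices taken from the topic-word matrix beta)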
            for k in range(self.args.topic_num):
                logger.info(','.join([
                    self.model.voc_id_wrapper.i2w(i)
                    for i in self.model.topic_model.tm1.beta.topk(20, dim=-1)
                    [1][k].tolist()
                ]))
        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.n_gpu > 1:
            grads = [
                p.grad.data for p in self.model.parameters()
                if p.requires_grad and p.grad is not None
            ]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
Example no. 3
    def _gradient_calculation(self, true_batchs, normalization, total_stats,
                              report_stats, step):
        self.model.zero_grad()

        for batch in true_batchs:
            decode_output, _, attn = self.model(batch)

            tgt_tokens, src_tokens, tgt_labels, sents, examples = normalization

            # Generation loss
            abs_stats = self.abs_loss(batch, decode_output, self.args.generator_shard_size, tgt_tokens, attns=attn)
            abs_stats.n_docs = len(batch)
            total_stats.update(abs_stats)
            report_stats.update(abs_stats)

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad
                     and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(
                grads, float(1))
        for o in self.optims:
            o.step()
Example no. 4
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            batch_stats, _, _ = self._main(batch, normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()
Example no. 5
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.src_sent_labels

            segs = batch.segs
            clss = batch.clss
            mask = batch.mask_src

            mask_cls = batch.mask_cls
            src_txt = batch.src_txt

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls, src_txt)
            loss = self.loss(sent_scores, labels.float())
            loss = (loss * mask_cls.float()).sum()
            (loss / loss.numel()).backward()
            # loss.div(float(normalization)).backward()

            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            
            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            self.optim.step()
Example no. 6
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            clss = batch.clss
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt
            mask_cls = batch.mask_cls
            z = batch.z
            mask_z = batch.mask_z
            z_segs = batch.z_segs

            outputs, scores, copy_prob = self.model(src, tgt, segs, clss,
                                                    mask_src, mask_tgt,
                                                    mask_cls, z, mask_z,
                                                    z_segs)
            batch_stats = self.loss.sharded_compute_loss(
                batch, outputs, self.args.generator_shard_size, normalization)

            batch_stats.n_docs = int(src.size(0))

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))

                for o in self.optims:
                    o.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()
Example no. 7
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.src_sent_labels.float()
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask_src
            mask_cls = batch.mask_cls

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)
            loss = self.loss(sent_scores, labels)
            loss = (loss * mask.float()).sum() / mask.float().sum()
            loss.backward()

            # report accuracy
            abs_scores, abs_ids = torch.topk(sent_scores, 3, dim=1)
            abs_mask = (abs_scores > 0).float()
            n_sents = abs_mask.sum().item()
            n_correct = torch.sum(torch.gather(labels, 1, abs_ids) *
                                  abs_mask).item()
            batch_stats = Statistics(loss.item() * batch.batch_size,
                                     batch.batch_size, n_sents, n_correct)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()
Example no. 8
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.src_sent_labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask_src
            mask_cls = batch.mask_cls

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

            if self.args.pairwise:
                loss = self.loss(sent_scores, labels.float(), mask)
                loss = loss.sum()
            else:
                loss = self.loss(sent_scores, labels.float())
                loss = (loss * mask.float()).sum()
            (loss / loss.numel()).backward()

            batch_stats = Statistics(float(loss.cpu().data.numpy()),
                                     normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()
Example no. 9
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        # Clear old grads from last step.
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        # Iterate over true batches.
        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask
            mask_cls = batch.mask_cls

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

            # Calculate loss and propagate backwards
            loss = self.loss(sent_scores, labels.float())
            loss = (loss * mask.float()).sum()
            (loss / loss.numel()).backward()

            # Report batch statistics
            batch_stats = Statistics(float(loss.cpu().data.numpy()),
                                     normalization)
            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # Update the parameters and statistics
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))
                self.optim.step()

        # In case of multi step gradient accumulation, update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()
Example no. 10
    def _gradient_calculation(self, true_batchs, normalization, total_stats,
                              report_stats, step):
        self.model.zero_grad()

        for batch in true_batchs:
            outputs, _, topic_loss = self.model(batch)

            tgt_tokens, src_tokens, sents, examples = normalization

            if self.args.topic_model:
                # Topic Model loss
                topic_stats = Statistics(topic_loss=topic_loss.clone().item() /
                                         float(examples))
                topic_loss.div(float(examples)).backward(retain_graph=True)
                total_stats.update(topic_stats)
                report_stats.update(topic_stats)

            # Auto-encoder loss
            abs_stats = self.abs_loss(batch,
                                      outputs,
                                      self.args.generator_shard_size,
                                      tgt_tokens,
                                      retain_graph=False)
            abs_stats.n_docs = len(batch)
            total_stats.update(abs_stats)
            report_stats.update(abs_stats)

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.n_gpu > 1:
            grads = [
                p.grad.data for p in self.model.parameters()
                if p.requires_grad and p.grad is not None
            ]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
Example no. 11
    def _gradient_calculation(self, true_batchs, normalization, total_stats,
                              report_stats, step):
        self.model.zero_grad()

        for batch in true_batchs:
            cup_score, context_outputs, doc_data = self.model(batch)

            tokens, sents, summ_sents = normalization

            norm_sent_context = tokens*self.args.sample_ratio*(2*self.args.win_size+1) *\
                (1+self.args.expand_ratio*(1-self.args.pr))
            norm_cup = sents*self.args.win_size*(self.args.negative_sample_num+1)*2.

            # Auto-encoder loss
            ae_context_stats = self.abs_loss(batch.context_tgt, context_outputs, self.args.generator_shard_size,
                                             norm_sent_context, retain_graph=True)
            ae_context_stats.n_docs = int(batch.src.size(0))
            total_stats.update(ae_context_stats)
            report_stats.update(ae_context_stats)

            # CUP loss
            cup_stats = self.cup_loss(batch.cup_tgt, cup_score, self.args.generator_shard_size,
                                      norm_cup, retain_graph=False)
            total_stats.update(cup_stats)
            report_stats.update(cup_stats)

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad
                     and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(
                grads, float(1))
        for o in self.optims:
            o.step()
Example no. 12
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats, step):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()
            # print(self.gpu_rank)
            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            clss = batch.clss
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt
            mask_cls = batch.mask_cls
            # TODO modify
            # outputs, scores = self.model(src, tgt,segs, clss, mask_src, mask_tgt, mask_cls)

            outputs, scores, src_context, graph_context, top_vec, ent_top_vec, emask = self.model(
                src, tgt, segs, clss, mask_src, mask_tgt, mask_cls, batch)
            #
            # # ent_src
            batch_stats, copy_v = self.loss.sharded_compute_loss(
                batch, outputs, self.args.generator_shard_size, normalization,
                src_context, graph_context, batch.ent_src, ent_top_vec,
                self.copy)
            # source_src
            # batch_stats, copy_v = self.loss.sharded_compute_loss(batch, outputs, self.args.generator_shard_size, normalization,
            #                                              src_context, graph_context, batch.src, top_vec, self.copy)
            for name, parms in self.model.generator.named_parameters():
                if name == 'copy_ff.weight':
                    out_grad = parms.grad
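                    # out_grad is only consumed by the commented-out TensorBoard histogram logging below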
            batch_stats.n_docs = int(src.size(0))

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))

                for o in self.optims:
                    o.step()
        # self.report_manager.tensorboard_writer.add_histogram('copy__distribution', copy_v, step)
        # self.report_manager.tensorboard_writer.add_histogram('score__distribution', out_grad, step)
        # print('write down')
        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()
Example no. 13
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.src_sent_labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask_src
            mask_cls = batch.mask_cls

            if self.args.ext_sum_dec:
                sent_scores, mask = self.model(src, segs, clss, mask, mask_cls, labels)
                tgt_len = 3
                _, labels_id = torch.topk(labels, k=tgt_len)  # B, tgt_len
                labels_id, _ = torch.sort(labels_id)
                # position-dependent class weights, rising linearly from 1 to weight_up
                # (original note: nsent 100, weight_up 20)
                weight = torch.linspace(start=1, end=self.args.weight_up, steps=self.args.max_src_nsents).type_as(
                    sent_scores)
                # global max_class
                # max_class = max(max_class, torch.max(labels_id + 1).item())
                weight = weight[:sent_scores.size(-1)]
                # weight = torch.ones(self.args.max_src_nsents)
                loss = F.nll_loss(
                    F.log_softmax(
                        sent_scores.view(-1, sent_scores.size(-1)),
                        dim=-1,
                        dtype=torch.float32,
                    ),
                    labels_id.view(-1),  # bsz sent
                    weight=weight,
                    reduction='sum',
                    ignore_index=-1,
                )
                prediction = torch.argmax(sent_scores, dim=-1)
                if (self.optim._step + 1) % self.args.print_every == 0:
                    logger.info(
                        'train prediction: %s |label %s ' % (str(prediction), str(labels_id)))
                # element-wise comparison of predicted and gold sentence indices
                accuracy = torch.div(torch.sum(torch.eq(prediction, labels_id).float()), tgt_len)
            else:
                sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)
                loss = self.loss(sent_scores, labels.float())
                loss = (loss * mask.float()).sum()
                tgt_len = 3
                _, labels_id = torch.topk(labels, k=tgt_len)  # B, tgt_len
                labels_id, _ = torch.sort(labels_id)
                _, prediction = torch.topk(sent_scores, k=tgt_len)
                prediction, _ = torch.sort(prediction)
                if (self.optim._step + 1) % self.args.print_every == 0:
                    logger.info(
                        'train prediction: %s |label %s ' % (str(prediction), str(labels_id)))
                accuracy = torch.div(torch.sum(torch.eq(prediction, labels_id).float()), tgt_len)
            (loss / loss.numel()).backward()
            # with amp.scale_loss((loss / loss.numel()), self.optim.optimizer) as scaled_loss:
            #     scaled_loss.backward()
            # loss.div(float(normalization)).backward()
            if self.args.acc_reporter:
                batch_stats = acc_reporter(float(loss.cpu().data.numpy()), accuracy, normalization)
            else:
                batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            self.optim.step()
Example no. 14
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            clss = batch.clss
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt
            mask_cls = batch.mask_cls

            if self.args.task == 'hybrid':
                if self.args.oracle or self.args.hybrid_loss:
                    labels = batch.src_sent_labels
                    outputs, scores, copy_params = self.model(
                        src, tgt, segs, clss, mask_src, mask_tgt, mask_cls,
                        labels)
                else:
                    outputs, scores, copy_params = self.model(
                        src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)

                batch_stats = self.loss.sharded_compute_loss(
                    batch, outputs, self.args.generator_shard_size,
                    normalization, copy_params)
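                # sanity check: stop training if any parameter gradient has become NaN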
                paramss = list(self.model.named_parameters())
                for each in paramss:
                    try:
                        if torch.isnan(each[1].grad.sum()):
                            exit()
                    except AttributeError:
                        # skip parameters whose grad is still None
                        continue

            else:
                outputs, scores = self.model(src, tgt, segs, clss, mask_src,
                                             mask_tgt, mask_cls)

                batch_stats = self.loss.sharded_compute_loss(
                    batch, outputs, self.args.generator_shard_size,
                    normalization)

            batch_stats.n_docs = int(src.size(0))

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))

                for o in self.optims:
                    o.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()
Example no. 15
    def _gradient_accumulation(self, true_batchs, total_stats, report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            if self.args.mode == 'train':
                src = batch.src
                tgt = batch.tgt
                pmt_msk = batch.pmt_msk
                states = batch.states
                ex_idx = batch.ex_idx
                tgt_idx = batch.tgt_idx

                mask_src = batch.mask_src
                mask_tgt = batch.mask_tgt

                outputs, _ = self.model(src, tgt, mask_src, pmt_msk, ex_idx)
                init_logps, trans_logps = self.model.trans_logprobs()
                ext_logps = self.model.external_logprobs()
                batch_stats = self.loss.compute_loss(batch, outputs, states,
                                                    ex_idx, tgt_idx, mask_tgt,
                                                    init_logps, trans_logps,
                                                    ext_logps)
            else:
                src = batch.src
                tgt = batch.tgt
                segs = batch.segs
                mask_src = batch.mask_src
                mask_tgt = batch.mask_tgt

                outputs, scores = self.model(src, tgt, segs, mask_src, mask_tgt)
                batch_stats = self.loss.sharded_compute_loss(batch, outputs, self.args.generator_shard_size)

            batch_stats.n_docs = int(src.size(0))
            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))

                for o in self.optims:
                    o.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            for o in self.optims:
                o.step()
Example no. 16
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.src_sent_labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask_src
            mask_cls = batch.mask_cls

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)



            if self.args.pairwise:
                loss = self.loss(sent_scores, labels.float(), mask)
                loss = loss.sum()
            else:
                loss = self.loss(sent_scores, labels.float())
                loss = (loss * mask.float()).sum()
            # take the mean; numel() returns the number of elements
            (loss / loss.numel()).backward()


            # loss.div(float(normalization)).backward()

            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            self.optim.step()
Example no. 17
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):

        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            sent_rg_scores = batch.src_sent_labels

            sent_sect_labels = batch.sent_sect_labels
            sent_bin_labels = batch.sent_labels
            # if self.rg_predictor:
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask_src
            mask_cls = batch.mask_cls

            if self.is_joint:
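                # joint setting: the model returns sentence scores plus section-classification scores, combined with an uncertainty-weighted loss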
                if not self.rg_predictor:
                    sent_scores, sent_sect_scores, mask, loss, loss_sent, loss_sect = self.model(src, segs, clss, mask,
                                                                                                 mask_cls,
                                                                                                 sent_bin_labels,
                                                                                                 sent_sect_labels)
                else:
                    sent_scores, sent_sect_scores, mask, loss, loss_sent, loss_sect = self.model(src, segs, clss, mask,
                                                                                                 mask_cls,
                                                                                                 sent_rg_scores,
                                                                                                 sent_sect_labels)
                try:
                    acc, pred = self._get_mertrics(sent_sect_scores, sent_sect_labels, mask=mask, task='sent_sect')
                except Exception:
                    logger.info("Accuracy cannot be computed due to some errors in loading appropriate files...")
                    acc = 0  # fall back to zero so the Statistics call below does not fail

                batch_stats = Statistics(loss=float(loss.cpu().data.numpy().sum()),
                                         loss_sect=float(loss_sect.cpu().data.numpy().sum()),
                                         loss_sent=float(loss_sent.cpu().data.numpy().sum()), n_docs=normalization,
                                         n_acc=batch.batch_size,
                                         RMSE=self._get_mertrics(sent_scores, sent_rg_scores, mask=mask, task='sent'),
                                         accuracy=acc,
                                         a1=self.model.uncertainty_loss._sigmas_sq[0].item(),
                                         a2=self.model.uncertainty_loss._sigmas_sq[1].item()
                                         )


            else:  # simple

                if not self.rg_predictor:
                    sent_scores, mask, loss, _, _ = self.model(src, segs, clss, mask, mask_cls,
                                                               sent_bin_labels=sent_bin_labels, sent_sect_labels=None)
                else:
                    sent_scores, mask, loss, _, _ = self.model(src, segs, clss, mask, mask_cls,
                                                               sent_bin_labels=sent_rg_scores, sent_sect_labels=None)

                # loss = self.loss(sent_scores, sent_rg_scores.float())

                batch_stats = Statistics(loss=float(loss.cpu().data.numpy().sum()),
                                         RMSE=self._get_mertrics(sent_scores, sent_rg_scores, mask=mask,
                                                                 task='sent'),
                                         n_acc=batch.batch_size,
                                         n_docs=normalization,
                                         a1=self.model.uncertainty_loss._sigmas_sq[0] if self.is_joint else 0,
                                         a2=self.model.uncertainty_loss._sigmas_sq[1] if self.is_joint else 0)

            loss.backward()
            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                # self.optim.step(report_stats=report_stats)

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            self.optim.step(report_stats)
Example no. 18
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask
            mask_cls = batch.mask_cls
            group_idxs = batch.groups
            #they need to have these two attributes
            sel_sent_idxs = batch.sel_sent_idxs
            sel_sent_masks = batch.sel_sent_masks
            candi_masks = batch.candi_masks
            #pair_masks = batch.pair_masks
            src_str, tgt_str = batch.src_str, batch.tgt_str
            soft_labels = batch.soft_labels

            if self.args.model_name == 'seq':
                sent_scores, _ = self.model(src, mask, segs, clss, mask_cls, group_idxs,
                        sel_sent_idxs=sel_sent_idxs, sel_sent_masks=sel_sent_masks,
                        candi_sent_masks=candi_masks)
                #batch, seq_len, sent_count
                pred = sent_scores.contiguous().view(-1, sent_scores.size(2))
                gold = batch.label_seq.contiguous().view(-1)
                if self.args.use_rouge_label:
                    soft_labels = soft_labels.contiguous().view(-1, soft_labels.size(2))
                    #batch*seq_len, sent_count
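                    # soft-label cross-entropy: -(soft_labels * log p), masked over padded sentences; positions where gold == -1 are dropped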
                    log_prb = F.log_softmax(pred, dim=1)
                    non_pad_mask = gold.ne(-1) # padding value
                    sent_mask = mask_cls.unsqueeze(1).expand(-1,sent_scores.size(1),-1)
                    sent_mask = sent_mask.contiguous().view(-1, sent_scores.size(2))
                    loss = -((soft_labels * log_prb) * sent_mask.float()).sum(dim=1)
                    loss = loss.masked_select(non_pad_mask).sum()  # average later
                else:
                    loss = F.cross_entropy(pred, gold, ignore_index=-1, reduction='sum')
            else:
                sent_scores, _ = self.model(src, mask, segs, clss, mask_cls, group_idxs,
                        sel_sent_idxs=sel_sent_idxs, sel_sent_masks=sel_sent_masks,
                        candi_sent_masks=candi_masks,
                        sel_sent_hit_map=batch.hit_map)
                if self.args.use_rouge_label:
                    labels = soft_labels
                if self.args.loss == "bce":
                    loss = self.bce_logits_loss(sent_scores, labels.float()) #pointwise
                elif self.args.loss == "wsoftmax":
                    loss = -self.logsoftmax(sent_scores) * labels.float()

                #batch_size, max_sent_count
                loss = (loss*candi_masks.float()).sum()
                #print("loss_sum", loss)

            (loss/loss.numel()).backward()

            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)
            total_stats.update(batch_stats)
            report_stats.update(batch_stats)
            #print([p for p in self.model.parameters() if p.requires_grad])

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            self.optim.step()
Example no. 19
    def _maml_outter_gradient_accumulation(self, true_batchs, normalization,
                                           report_stats, step, inner_step,
                                           task_accum):
        """Outer loop training.

        NOTE: At the end of this function, the adapters will be set to vars mode.

        Args:
            true_batchs (list[data.data_loader.Batch])
            normalization (int):
                the number of non-padding tokens in the batch.
            report_stats (models.reporter.Statistics)
            step (int):
                current outer loop step.
            inner_step (int):
                current inner loop step.
            task_accum (int):
                current task.
        """
        if self.grad_accum_count > 1 and task_accum == 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1 and task_accum == 1:
                self.model.zero_grad()

            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            clss = batch.clss
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt
            mask_cls = batch.mask_cls

            outputs, scores = self.model(src, tgt, segs, clss, mask_src,
                                         mask_tgt, mask_cls)
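            # the loss call below is monolithic (non-sharded) and, as the method name indicates, performs the backward pass itself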
            batch_stats = self.loss.monolithic_compute_loss_backprop(
                batch, outputs, normalization)

            batch_stats.n_docs = int(src.size(0))
            report_stats.update(batch_stats)

            if self.grad_accum_count == 1 and task_accum == self.args.num_task:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))

                for o in self.optims:
                    o.step()

        # Update only after accum batches
        if self.grad_accum_count > 1 and task_accum == self.args.num_task:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))

            for o in self.optims:
                o.step()

        # NOTE: Clean fast weight
        self.model._clean_fast_weights_mode()
Example no. 20
    def _maml_inner_gradient_accumulation(self,
                                          true_batchs,
                                          normalization,
                                          report_stats,
                                          inner_step,
                                          task_accum,
                                          inference_mode=False):
        """Inner loop training.

        NOTE: 1. At the end of this function, the adapters will be set to fast-weights mode.
              2. This function does not require self.model.zero_grad(), since it does not call .backward().

        Args:
            true_batchs (list[data.data_loader.Batch])
            normalization (int):
                the number of non-padding tokens in the batch.
            report_stats (models.reporter.Statistics)
            inner_step (int):
                current inner loop step.
            task_accum (int):
                current task.
        """
        grad = None
        for batch in true_batchs:

            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            clss = batch.clss
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt
            mask_cls = batch.mask_cls

            outputs, scores = self.model(src, tgt, segs, clss, mask_src,
                                         mask_tgt, mask_cls)
            loss, batch_stats = self.loss.monolithic_compute_loss_return(
                batch, outputs)

            # Compute gradient for adapter modules
            if grad is None or self.grad_accum_count == 1:
                if inner_step == 1:
                    grad = torch.autograd.grad(loss.div(normalization),
                                               self.model._adapter_vars())
                else:
                    grad = torch.autograd.grad(
                        loss.div(normalization),
                        self.model._adapter_fast_weights())
            else:
                if inner_step == 1:
                    next_grad = torch.autograd.grad(loss.div(normalization),
                                                    self.model._adapter_vars())
                else:
                    next_grad = torch.autograd.grad(
                        loss.div(normalization),
                        self.model._adapter_fast_weights())
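                # accumulate across batches by summing the per-batch adapter gradients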
                grad = tuple([sum(x) for x in zip(grad, next_grad)])

            batch_stats.n_docs = int(src.size(0))
            report_stats.update(batch_stats)

            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    distributed.all_reduce_and_rescale_tensors(grad, float(1))

                if inner_step == 1:
                    # Compute update values with Adam
                    _, update_values_bert = self.optims_inner[
                        task_accum - 1][0].step(
                            self.model._adapter_vars_bert(),
                            grad,
                            inner_step=inner_step)
                    _, update_values_dec = self.optims_inner[
                        task_accum - 1][1].step(self.model._adapter_vars_dec(),
                                                grad[len(update_values_bert):],
                                                inner_step=inner_step)
                    update_values = update_values_bert + update_values_dec
                    # Compute new weights that maintain a differentiable path to the previous weights
                    fast_weights = list(
                        map(lambda p: p[1] + p[0],
                            zip(update_values, self.model._adapter_vars())))
                else:
                    # Compute update values with Adam
                    _, update_values_bert = self.optims_inner[
                        task_accum - 1][0].step(
                            self.model._adapter_fast_weights_bert(),
                            grad,
                            inner_step=inner_step)
                    _, update_values_dec = self.optims_inner[
                        task_accum - 1][1].step(
                            self.model._adapter_fast_weights_dec(),
                            grad[len(update_values_bert):],
                            inner_step=inner_step)
                    update_values = update_values_bert + update_values_dec
                    # Compute new weights that maintain a differentiable path to the previous weights
                    fast_weights = list(
                        map(
                            lambda p: p[1] + p[0],
                            zip(update_values,
                                self.model._adapter_fast_weights())))

        # update only after accum batches
        if self.grad_accum_count > 1:
            # Multi GPU gradient gather
            if self.n_gpu > 1:
                distributed.all_reduce_and_rescale_tensors(grad, float(1))

            if inner_step == 1:
                # Compute update values with Adam
                _, update_values_bert = self.optims_inner[
                    task_accum - 1][0].step(self.model._adapter_vars_bert(),
                                            grad,
                                            inner_step=inner_step)
                _, update_values_dec = self.optims_inner[
                    task_accum - 1][1].step(self.model._adapter_vars_dec(),
                                            grad[len(update_values_bert):],
                                            inner_step=inner_step)
                update_values = update_values_bert + update_values_dec
                # Compute new weights that maintain a differentiable path to the previous weights
                fast_weights = list(
                    map(lambda p: p[1] + p[0],
                        zip(update_values, self.model._adapter_vars())))
            else:
                # Compute update values with Adam
                _, update_values_bert = self.optims_inner[
                    task_accum - 1][0].step(
                        self.model._adapter_fast_weights_bert(),
                        grad,
                        inner_step=inner_step)
                _, update_values_dec = self.optims_inner[
                    task_accum - 1][1].step(
                        self.model._adapter_fast_weights_dec(),
                        grad[len(update_values_bert):],
                        inner_step=inner_step)
                update_values = update_values_bert + update_values_dec
                # Compute new weights that maintain a differentiable path to the previous weights
                fast_weights = list(
                    map(lambda p: p[1] + p[0],
                        zip(update_values,
                            self.model._adapter_fast_weights())))

        # Do not accumulate gradient in inference mode
        if (inference_mode):
            fast_weights = [w.data for w in fast_weights]
            for w in fast_weights:
                w.requires_grad = True

        # NOTE: Use the new weights for the following computation; the derivative path is still maintained
        self.model._cascade_fast_weights_grad(fast_weights)
Example no. 21
    def _gradient_calculation(self, true_batchs, normalization, total_stats,
                              report_stats, step):
        self.model.zero_grad()

        for batch in true_batchs:
            if self.args.pretrain:
                pn_output, decode_output, topic_loss, _ = self.model.pretrain(
                    batch)
            else:
                rl_loss, decode_output, topic_loss, _, _ = self.model(batch)

            tgt_tokens, src_tokens, tgt_labels, sents, examples = normalization

            if self.args.pretrain:
                if self.args.topic_model:
                    # Topic Model loss
                    topic_stats = Statistics(
                        topic_loss=topic_loss.clone().item() / float(examples))
                    topic_loss.div(float(examples)).backward(retain_graph=True)
                    total_stats.update(topic_stats)
                    report_stats.update(topic_stats)

                # Extraction loss
                pn_stats = self.pn_loss(batch.pn_tgt,
                                        pn_output,
                                        self.args.generator_shard_size,
                                        tgt_labels,
                                        retain_graph=True)
                total_stats.update(pn_stats)
                report_stats.update(pn_stats)

                # Generation loss
                abs_stats = self.abs_loss(batch,
                                          decode_output,
                                          self.args.generator_shard_size,
                                          tgt_tokens,
                                          retain_graph=False)
                abs_stats.n_docs = len(batch)
                total_stats.update(abs_stats)
                report_stats.update(abs_stats)

            else:
                if self.args.topic_model:
                    # Topic Model loss
                    topic_stats = Statistics(
                        topic_loss=topic_loss.clone().item() / float(examples))
                    topic_loss.div(float(examples)).backward(retain_graph=True)
                    total_stats.update(topic_stats)
                    report_stats.update(topic_stats)

                # RL loss
                rl_stats = Statistics(rl_loss=rl_loss.clone().item() /
                                      float(examples))
                # critic_stats = Statistics(ct_loss=critic_loss.clone().item() / float(examples))
                rl_loss.div(float(examples)).backward(retain_graph=True)
                total_stats.update(rl_stats)
                # total_stats.update(critic_stats)
                report_stats.update(rl_stats)
                # report_stats.update(critic_stats)

                # Generation loss
                abs_stats = self.abs_loss(batch,
                                          decode_output,
                                          self.args.generator_shard_size,
                                          tgt_tokens,
                                          retain_graph=False)
                abs_stats.n_docs = len(batch)
                total_stats.update(abs_stats)
                report_stats.update(abs_stats)

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.n_gpu > 1:
            grads = [
                p.grad.data for p in self.model.parameters()
                if p.requires_grad and p.grad is not None
            ]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        for o in self.optims:
            o.step()
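
Both branches above push several losses (topic, extraction or RL, generation) through one shared forward pass, so every backward call except the last sets retain_graph=True to keep the shared graph alive. The hypothetical snippet below, with toy modules in place of the trainer's model, illustrates why the flag is needed and the equivalent single-backward alternative:

import torch
import torch.nn as nn

# Toy stand-ins for a shared encoder and two loss heads.
encoder = nn.Linear(8, 8)
head_a = nn.Linear(8, 1)
head_b = nn.Linear(8, 1)
x = torch.randn(4, 8)

# Variant 1: separate backward calls; every call except the last must retain the graph.
h = encoder(x)
head_a(h).mean().backward(retain_graph=True)
head_b(h).mean().backward()

# Variant 2: one backward over the summed losses accumulates the same gradients.
encoder.zero_grad()
head_a.zero_grad()
head_b.zero_grad()
h = encoder(x)
(head_a(h).mean() + head_b(h).mean()).backward()
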
Esempio n. 22
0
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()
            # src = torch.tensor(self._pad(pre_src, 0))
            # segs = torch.tensor(self._pad(pre_segs, 0))
            # mask_src = torch.logical_not(src == 0)
            # clss = torch.tensor(self._pad(pre_clss, -1))
            # src_sent_labels = torch.tensor(self._pad(pre_src_sent_labels, 0))
            # mask_cls = torch.logical_not(clss == -1)
            # clss[clss == -1] = 0
            # setattr(self, 'clss' + postfix, clss.to(device))
            # setattr(self, 'mask_cls' + postfix, mask_cls.to(device))
            # setattr(self, 'src_sent_labels' + postfix, src_sent_labels.to(device))
            # setattr(self, 'src' + postfix, src.to(device))
            # setattr(self, 'segs' + postfix, segs.to(device))
            # setattr(self, 'mask_src' + postfix, mask_src.to(device))
            # # The fields below are prediction targets, so pad them with -1: the loss stops when it sees -1, and no mask is needed (masks are only required for inputs)
            # org_sent_labels = torch.tensor(self._pad(org_sent_labels, -1))
            # setattr(self, 'org_sent_labels' + postfix, org_sent_labels.to(device))
            # poss = torch.tensor(self._pad(poss, -1))
            # setattr(self, 'poss' + postfix, poss.to(device))

            if self.args.jigsaw == 'jigsaw_lab':  # jigsaw_lab: each sentence position predicted independently; an unsuccessful attempt
                logits = self.model(batch.src_s, batch.segs_s, batch.clss_s, batch.mask_src_s, batch.mask_cls_s)  # bsz tgt_len nsent
                # bsz, sent, max-sent_num
                # mask = batch.mask_cls_s[:, :, None].float()
                # loss = self.loss(sent_scores, batch.poss_s.float())
                loss = F.nll_loss(
                    F.log_softmax(
                        logits.view(-1, logits.size(-1)),
                        dim=-1,
                        dtype=torch.float32,
                    ),
                    batch.poss_s.view(-1), # bsz sent
                    reduction='sum',
                    ignore_index=-1,
                )
                prediction = torch.argmax(logits, dim=-1)
                if (self.optim._step + 1) % self.args.print_every == 0:
                    logger.info(
                        'train prediction: %s |label %s ' % (str(prediction), str(batch.poss_s)))
                # torch.equal returns a single bool; use elementwise torch.eq for per-position accuracy
                accuracy = torch.div(torch.sum(torch.eq(prediction, batch.poss_s) * batch.mask_cls_s).float(),
                                     torch.sum(batch.mask_cls_s)) * len(logits)

                # loss = (loss * batch.mask_cls_s.float()).sum()
                # print('train prediction: %s |label %s ' % (str(torch.argmax(logits, dim=-1)[0]), str(batch.poss_s[0])))
                # logger.info('train prediction: %s |label %s ' % (str(torch.argmax(logits, dim=-1)[0]), str(batch.poss_s[0])))
                # (loss / loss.numel()).backward()
            else:  # self.args.jigsaw == 'jigsaw_dec': jigsaw decoder
                poss_s = batch.poss_s
                mask_poss = torch.eq(poss_s, -1)
                poss_s.masked_fill_(mask_poss, 1e4)
                # poss_s[i] [5,1,4,0,2,3,-1,-1]->[5,1,4,0,2,3,1e4,1e4] dec_labels[i] [3,1,xxx,6,7]
                dec_labels = torch.argsort(poss_s, dim=1)
                logits,_ = self.model(batch.src_s, batch.segs_s, batch.clss_s, batch.mask_src_s, batch.mask_cls_s, dec_labels)
                final_dec_labels = dec_labels.masked_fill(mask_poss, -1)
                loss = F.nll_loss(
                    F.log_softmax(
                        logits.view(-1, logits.size(-1)),
                        dim=-1,
                        dtype=torch.float32,
                    ),
                    final_dec_labels.view(-1),  # bsz sent
                    reduction='sum',
                    ignore_index=-1,
                )
                # loss = (loss * batch.mask_cls_s.float()).sum()
                # (loss / loss.numel()).backward()
                prediction = torch.argmax(logits, dim=-1)
                if (self.optim._step + 1) % self.args.print_every == 0:
                    logger.info(
                        'train prediction: %s |label %s ' % (str(prediction), str(batch.poss_s)))
                accuracy = torch.div(torch.sum(torch.eq(prediction, batch.poss_s) * batch.mask_cls_s).float(),
                                     torch.sum(batch.mask_cls_s)) * len(logits)
            with amp.scale_loss((loss / loss.numel()), self.optim.optimizer) as scaled_loss:
                scaled_loss.backward()
            # loss.div(float(normalization)).backward()
            if self.args.acc_reporter:
                batch_stats = acc_reporter.Statistics(float(loss.cpu().data.numpy()), accuracy, normalization)
            else:
                batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            self.optim.step()
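
In the jigsaw-decoder branch above, padded positions (-1) in poss_s are first pushed to a large value so that argsort places them last, and the resulting decoder labels are re-masked to -1 so that nll_loss with ignore_index=-1 skips them. A standalone illustration of that label construction, using a made-up permutation rather than the real batch fields:

import torch

# One shuffled document: sentence i belongs at position poss[i]; -1 marks padding.
poss = torch.tensor([[5, 1, 4, 0, 2, 3, -1, -1]])

mask_pad = torch.eq(poss, -1)
filled = poss.masked_fill(mask_pad, 10000)               # padding sorts last
dec_labels = torch.argsort(filled, dim=1)                # which sentence to emit at each decoding step
final_dec_labels = dec_labels.masked_fill(mask_pad, -1)  # padded steps ignored by the loss

print(dec_labels)        # tensor([[3, 1, 4, 5, 2, 0, 6, 7]]); the order of the two padded slots does not matter
print(final_dec_labels)  # tensor([[3, 1, 4, 5, 2, 0, -1, -1]])
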
Esempio n. 23
0
    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            tgt = batch.tgt
            segs = batch.segs
            clss = batch.clss

            tgt_eng = batch.tgt_eng
            # tgt_segs idea has been deprecated
            if not hasattr(batch, 'tgt_segs'):
                tgt_segs = torch.ones(tgt.size()).long().cuda()
            else:
                tgt_segs = batch.tgt_segs

            if self.args.batch_verification:
                self.verification(batch)
            mask_src = batch.mask_src
            mask_tgt = batch.mask_tgt
            mask_cls = batch.mask_cls

            outputs, scores, mono_outputs = self.model(src,
                                                       tgt,
                                                       segs,
                                                       clss,
                                                       mask_src,
                                                       mask_tgt,
                                                       mask_cls,
                                                       tgt_eng=tgt_eng,
                                                       tgt_segs=tgt_segs)

            # If there are two languages, just concatenate the outputs with the targets directly; very simple.
            # calculate the multi-task loss, concatenate monolingual outputs and cross-lingual outputs
            if self.args.multi_task:
                # Here labels are concatenated from the second token (the first cls token is not included).
                batch.tgt = torch.cat((tgt, tgt_eng[:, 1:]), dim=1)
                outputs = torch.cat((outputs, mono_outputs), dim=1)

            batch_stats = self.loss.sharded_compute_loss(
                batch, outputs, self.args.generator_shard_size, normalization)

            batch_stats.n_docs = int(src.size(0))

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [
                        p.grad.data for p in self.model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    distributed.all_reduce_and_rescale_tensors(grads, float(1))

                for o in self.optims:
                    o.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            for o in self.optims:
                o.step()
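
The multi-task branch in the example above folds monolingual decoding into the same loss call by concatenating targets and decoder outputs along the time dimension, dropping the English BOS token so the lengths line up. The sketch below only illustrates the shape bookkeeping, assuming the common convention that decoder states are aligned with the target shifted by one token; the real alignment depends on the model:

import torch

bsz, d_model = 2, 8
tgt = torch.randint(0, 100, (bsz, 6))       # cross-lingual target, with leading BOS
tgt_eng = torch.randint(0, 100, (bsz, 5))   # monolingual English target, with leading BOS

outputs = torch.randn(bsz, tgt.size(1) - 1, d_model)           # states predicting tgt[:, 1:]
mono_outputs = torch.randn(bsz, tgt_eng.size(1) - 1, d_model)  # states predicting tgt_eng[:, 1:]

combined_tgt = torch.cat((tgt, tgt_eng[:, 1:]), dim=1)
combined_out = torch.cat((outputs, mono_outputs), dim=1)

# A loss that consumes combined_tgt[:, 1:] as gold sees lengths that match combined_out.
assert combined_tgt[:, 1:].size(1) == combined_out.size(1)
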