Example #1
def permute_params(model, to_filters_last, lazy_mode):
    with torch.no_grad():
        for name, param in model.named_parameters():
            if param.ndim == 4:
                if to_filters_last:
                    param.data = param.data.permute((2, 3, 1, 0))  # permute KCRS to RSCK (filters last)
                else:
                    param.data = param.data.permute((3, 2, 0, 1))  # permute RSCK to KCRS

    if lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()
Example #2
def validate(val_loader, model, criterion, device, args):
    # Images per second, including data loading time
    image_time_DL = AverageMeter('imgs/s(Inc. DL)', ':6.3f')
    batch_time = AverageMeter('BatchTime', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':0.2f')
    top5 = AverageMeter('Acc@5', ':0.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, image_time_DL, losses, top1, top5],
        prefix='Test: ')

    print("MODEL EVAL")
    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        data_end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images, target = images.to(device, non_blocking=True), target.to(device, non_blocking=True)
            images = images.contiguous(memory_format=torch.channels_last)
            if args.enable_lazy:
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()

            # compute output
            output = model(images)

            loss = criterion(output, target)

            if i % args.print_interval == 0:
                acc1, acc5 = accuracy_classification(output, target, topk=(1, 5))
                batch_size = images.shape[0]
                losses.update(loss.item())
                top1.update(acc1.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)
                top5.update(acc5.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)

                # measure elapsed time
                image_time_DL.update(batch_size*args.print_interval/(time.time() - data_end), n=args.print_interval)
                batch_time.update(time.time() - data_end, n=1, avoid_warmup=True)
                progress.display(i)
                data_end = time.time()

            if i == args.num_train_steps-1:
                break

    # gather the stats from all processes
    top1.synchronize_between_processes(device)
    top5.synchronize_between_processes(device)

    print(' * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}'.format(top1=top1, top5=top5))

    return top1.global_avg
Example #3
def permute_momentum(optimizer, to_filters_last, lazy_mode):
    # Permute the momentum buffers before saving or restoring a checkpoint
    for group in optimizer.param_groups:
        for p in group['params']:
            param_state = optimizer.state[p]
            if 'momentum_buffer' in param_state:
                buf = param_state['momentum_buffer']
                if buf.ndim == 4:
                    if to_filters_last:
                        buf = buf.permute((2, 3, 1, 0))
                    else:
                        buf = buf.permute((3, 2, 0, 1))
                    param_state['momentum_buffer'] = buf

    if lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()
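
The two helpers above (Example #1 and Example #3) convert convolution weights and SGD momentum buffers between PyTorch's default KCRS layout and the RSCK (filters-last) layout. A minimal usage sketch, assuming a model and SGD optimizer set up as in the surrounding examples and a lazy_mode flag; the checkpoint filename is illustrative only:

# Hedged sketch, not part of the original examples.
# Move weights and momentum buffers to filters-last before running on HPU ...
permute_params(model, to_filters_last=True, lazy_mode=lazy_mode)
permute_momentum(optimizer, to_filters_last=True, lazy_mode=lazy_mode)

# ... train ...

# ... and permute back to KCRS before saving a framework-portable checkpoint.
permute_params(model, to_filters_last=False, lazy_mode=lazy_mode)
permute_momentum(optimizer, to_filters_last=False, lazy_mode=lazy_mode)
torch.save({'model': model.state_dict(),
            'optimizer': optimizer.state_dict()}, 'checkpoint.pth')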
Example #4
def evaluate(model, criterion, data_loader, device, print_freq=100):
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ", device=device)
    header = 'Test:'
    step_count = 0
    with torch.no_grad():
        for image, target in metric_logger.log_every(data_loader, print_freq,
                                                     header):
            image = image.to(device, non_blocking=True)

            if args.channels_last:
                image = image.contiguous(memory_format=torch.channels_last)
                if args.run_lazy_mode:
                    # This mark_step is added so that the lazy kernel can
                    # create and evaluate the graph to infer the resulting tensor
                    # as channels_last
                    import habana_frameworks.torch.core as htcore
                    htcore.mark_step()

            target = target.to(device, non_blocking=True)
            output = model(image)
            loss = criterion(output, target)

            acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = image.shape[0]
            loss_cpu = loss.to('cpu').detach()
            metric_logger.update(loss=loss_cpu.item())
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
            step_count = step_count + 1
            if step_count >= args.num_eval_steps:
                break
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()

    # Return from here if the evaluation phase did not go through any iterations
    # (e.g. the dataset is so small that there is only one eval batch, but it was
    # skipped by the data loader due to drop_last=True)
    if len(metric_logger.meters) == 0:
        return

    print(' * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}'.format(
        top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg
Example #5
    def _inference_with_bleu(self, generator, sample, model):
        import sacrebleu

        def decode(toks, escape_unk=False):
            s = self.tgt_dict.string(
                toks.int().cpu(),
                self.cfg.eval_bleu_remove_bpe,
                # The default unknown string in fairseq is `<unk>`, but
                # this is tokenized by sacrebleu as `< unk >`, inflating
                # BLEU scores. Instead, we use a somewhat more verbose
                # alternative that is unlikely to appear in the real
                # reference, but doesn't get split into multiple tokens.
                unk_string=("UNKNOWNTOKENINREF"
                            if escape_unk else "UNKNOWNTOKENINHYP"),
            )
            if self.tokenizer:
                s = self.tokenizer.decode(s)
            return s

        gen_out = self.inference_step(generator, [model],
                                      sample,
                                      prefix_tokens=None)

        if self.cfg.use_habana and self.cfg.use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        hyps, refs = [], []
        for i in range(len(gen_out)):
            hyps.append(decode(gen_out[i][0]["tokens"]))
            refs.append(
                decode(
                    utils.strip_pad(sample["target"][i], self.tgt_dict.pad()),
                    escape_unk=True,  # don't count <unk> as matches to the hypo
                ))
        if self.cfg.eval_bleu_print_samples:
            logger.info("example hypothesis: " + hyps[0])
            logger.info("example reference: " + refs[0])
        if self.cfg.eval_tokenized_bleu:
            return sacrebleu.corpus_bleu(hyps, [refs], tokenize="none")
        else:
            return sacrebleu.corpus_bleu(hyps, [refs])
Example #6
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        data = data.contiguous(memory_format=torch.channels_last)
        if args.use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        if args.use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        optimizer.step()

        if args.use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data),
                len(train_loader.dataset) / args.world_size,
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
Example #7
def train(train_loader, model, criterion, optimizer, epoch, device, args):
    batch_time = AverageMeter('BatchTime', ':6.3f')
    # Images per second, including data loading time
    image_time_DL = AverageMeter('imgs/s(Inc. DL)', ':6.3f')
    image_time = AverageMeter('imgs/s(Exc. DL)', ':6.3f')
    data_time = AverageMeter('DL Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':0.2f')
    top5 = AverageMeter('Acc@5', ':0.2f')

    if args.print_interval == 1:
        progress = ProgressMeter(
            len(train_loader),
            [batch_time, image_time, image_time_DL, data_time, losses, top1, top5],
            prefix='Epoch: [{}]'.format(epoch))
    else:
        progress = ProgressMeter(
            len(train_loader),
            [batch_time, image_time_DL, data_time, losses, top1, top5],
            prefix='Epoch: [{}]'.format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        images, target = images.to(device, non_blocking=True), target.to(device, non_blocking=True)
        # measure data loading time
        data_end = time.time()
        data_loading_time = data_end - end

        images = images.contiguous(memory_format=torch.channels_last)
        if args.enable_lazy:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        # compute output
        output = model(images)

        if not args.no_aux_logits:
            aux_logits2 = output.aux_logits2
            aux_logits1 = output.aux_logits1
            output = output.logits
            # "Going Deeper with Convolutions" <http://arxiv.org/abs/1409.4842>, Page 6.
            loss = criterion(output, target) + 0.3*(criterion(aux_logits2, target) + criterion(aux_logits1, target))
        else:
            loss = criterion(output, target)

        optimizer.zero_grad()
        if args.device =='gpu' and args.is_amp:
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if args.enable_lazy:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if args.is_hmp:
            from habana_frameworks.torch.hpex import hmp
            with hmp.disable_casts():
                optimizer.step()
        else:
            optimizer.step()

        if args.enable_lazy:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if i % args.print_interval == 0:
            # measure accuracy and record loss
            acc1, acc5 = accuracy_classification(output, target, topk=(1, 5))
            batch_size = images.shape[0]
            losses.update(loss.item(), n=batch_size)
            top1.update(acc1.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)
            top5.update(acc5.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)

            batch_elapsed_time = time.time() - data_end
            total_elapsed_time = time.time() - end
            # measure elapsed time
            if epoch == 0:
                batch_time.update(batch_elapsed_time, n=args.print_interval, skip=2*args.print_interval, avoid_warmup=True)
                data_time.update(data_loading_time, n=args.print_interval, avoid_warmup=True)
                image_time_DL.update(batch_size*args.print_interval/total_elapsed_time, n=args.print_interval, skip=2*args.print_interval, avoid_warmup=True)
                if (args.print_interval == 1):
                    image_time.update(batch_size/batch_elapsed_time, n=args.print_interval, skip=2*args.print_interval, avoid_warmup=True)
            else:
                batch_time.update(batch_elapsed_time, n=args.print_interval)
                data_time.update(data_loading_time,n=args.print_interval)
                image_time_DL.update(batch_size*args.print_interval/total_elapsed_time, n=args.print_interval)
                if (args.print_interval == 1):
                    image_time.update(batch_size/batch_elapsed_time, n=args.print_interval, skip=2*args.print_interval)
            progress.display(i)
            end = time.time()

        if i == args.num_train_steps-1:
            break
Example #8
def mark_step(is_lazy_mode):
    if is_lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()
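
The mark_step helper above wraps the conditional import and htcore.mark_step() call that the training loops in the other examples repeat inline. A minimal training-step sketch using it, assuming model, criterion, optimizer and an is_lazy_mode flag as in the surrounding examples:

def train_step(model, criterion, optimizer, data, target, is_lazy_mode):
    # Illustrative sketch only; mirrors the forward/backward/step pattern above.
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    mark_step(is_lazy_mode)  # flush the accumulated lazy graph after backward
    optimizer.step()
    mark_step(is_lazy_mode)  # flush again after the optimizer update
    return loss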
Example #9
    def test_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
        # Break lazy accumulation of graph after every step
        htcore.mark_step()
        return step_output
Example #10
    def forward_decoder(
        self,
        tokens,
        encoder_outs: List[Dict[str, List[Tensor]]],
        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
        temperature: float = 1.0,
    ):
        log_probs = []
        avg_attn: Optional[Tensor] = None
        encoder_out: Optional[Dict[str, List[Tensor]]] = None
        for i, model in enumerate(self.models):
            if self.has_encoder():
                encoder_out = encoder_outs[i]
            if use_lazy_mode:
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()

            # decode each model
            if self.has_incremental_states():
                decoder_out = model.decoder.forward(
                    tokens,
                    encoder_out=encoder_out,
                    incremental_state=incremental_states[i],
                )
            else:
                if hasattr(model, "decoder"):
                    decoder_out = model.decoder.forward(
                        tokens, encoder_out=encoder_out)
                else:
                    decoder_out = model.forward(tokens)

            if use_lazy_mode:
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()

            attn: Optional[Tensor] = None
            decoder_len = len(decoder_out)
            if decoder_len > 1 and decoder_out[1] is not None:
                if isinstance(decoder_out[1], Tensor):
                    attn = decoder_out[1]
                else:
                    attn_holder = decoder_out[1]["attn"]
                    if isinstance(attn_holder, Tensor):
                        attn = attn_holder
                    elif attn_holder is not None:
                        attn = attn_holder[0]
                if attn is not None:
                    attn = attn[:, -1, :]

            decoder_out_tuple = (
                decoder_out[0][:, -1:, :].div_(temperature),
                None if decoder_len <= 1 else decoder_out[1],
            )
            probs = model.get_normalized_probs(decoder_out_tuple,
                                               log_probs=True,
                                               sample=None)
            probs = probs[:, -1, :]
            if self.models_size == 1:
                return probs, attn

            log_probs.append(probs)
            if attn is not None:
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)

        avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0),
                                    dim=0) - math.log(self.models_size)

        if avg_attn is not None:
            avg_attn.div_(self.models_size)
        return avg_probs, avg_attn
Example #11
    def finalize_hypos(
        self,
        step: int,
        bbsz_idx,
        eos_scores,
        tokens,
        scores,
        finalized: List[List[Dict[str, Tensor]]],
        finished: List[bool],
        beam_size: int,
        attn: Optional[Tensor],
        src_lengths,
        max_len: int,
    ):
        """Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly.
        A sentence is finalized when {beam_size} finished items have been collected for it.

        Returns number of sentences (not beam items) being finalized.
        These will be removed from the batch and not processed further.
        Args:
            bbsz_idx (Tensor):
        """
        assert bbsz_idx.numel() == eos_scores.numel()

        # clone relevant token and attention tensors.
        # tokens is (batch * beam, max_len). So the index_select
        # gets the newly EOS rows, then selects cols 1..{step + 2}
        tokens = tokens.to('cpu')
        bbsz_idx = bbsz_idx.to('cpu')
        tokens_clone = tokens.index_select(
            0, bbsz_idx)[:, 1:step + 2]  # skip the first index, which is EOS
        tokens_clone[:, step] = self.eos
        tokens_clone = tokens_clone.to('hpu')
        tokens = tokens.to('hpu')
        attn = attn.to('cpu')

        attn_clone = (attn.index_select(0, bbsz_idx)[:, :, 1:step + 2]
                      if attn is not None else None)

        attn = attn.to('hpu')
        # compute scores per token position
        scores = scores.to('cpu')
        pos_scores = scores.index_select(0, bbsz_idx)[:, :step + 1]
        scores = scores.to('hpu')
        bbsz_idx = bbsz_idx.to('hpu')

        pos_scores[:, step] = eos_scores
        # convert from cumulative to per-position scores
        pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]

        # normalize sentence-level scores
        if self.normalize_scores:
            eos_scores /= (step + 1)**self.len_penalty

        # cum_unfin records which sentences in the batch are finished.
        # It helps match indexing between (a) the original sentences
        # in the batch and (b) the current, possibly-reduced set of
        # sentences.
        cum_unfin: List[int] = []
        prev = 0
        for f in finished:
            if f:
                prev += 1
            else:
                cum_unfin.append(prev)

        # The keys here are of the form "{sent}_{unfin_idx}", where
        # "unfin_idx" is the index in the current (possibly reduced)
        # list of sentences, and "sent" is the index in the original,
        # unreduced batch
        # set() is not supported in script export
        sents_seen: Dict[str, Optional[Tensor]] = {}

        # For every finished beam item
        for i in range(bbsz_idx.size()[0]):
            idx = bbsz_idx[i]
            score = eos_scores[i]
            # sentence index in the current (possibly reduced) batch
            unfin_idx = idx // beam_size
            # sentence index in the original (unreduced) batch
            sent = unfin_idx + cum_unfin[unfin_idx]
            # Cannot create dict for key type '(int, int)' in torchscript.
            # The workaround is to cast int to string
            seen = str(sent.item()) + "_" + str(unfin_idx.item())
            if seen not in sents_seen:
                sents_seen[seen] = None

            if self.match_source_len and step > src_lengths[unfin_idx]:
                score = torch.tensor(-math.inf).to(score)

            # An input sentence (among those in a batch) is finished when
            # beam_size hypotheses have been collected for it
            if len(finalized[sent]) < beam_size:
                if attn_clone is not None:
                    # remove padding tokens from attn scores
                    hypo_attn = attn_clone[i]
                else:
                    hypo_attn = torch.empty(0)

                finalized[sent].append({
                    "tokens": tokens_clone[i],
                    "score": score,
                    "attention": hypo_attn,  # src_len x tgt_len
                    "alignment": torch.empty(0),
                    "positional_scores": pos_scores[i],
                })

        newly_finished: List[int] = []

        for seen in sents_seen.keys():
            # check termination conditions for this sentence
            sent: int = int(float(seen.split("_")[0]))
            unfin_idx: int = int(float(seen.split("_")[1]))

            if not finished[sent] and self.is_finished(
                    step, unfin_idx, max_len, len(finalized[sent]), beam_size):
                finished[sent] = True
                newly_finished.append(unfin_idx)

            if use_lazy_mode:
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()

        return newly_finished
Example #12
    def _generate(
        self,
        sample: Dict[str, Dict[str, Tensor]],
        prefix_tokens: Optional[Tensor] = None,
        constraints: Optional[Tensor] = None,
        bos_token: Optional[int] = None,
    ):
        incremental_states = torch.jit.annotate(
            List[Dict[str, Dict[str, Optional[Tensor]]]],
            [
                torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
                for i in range(self.model.models_size)
            ],
        )
        net_input = sample["net_input"]

        if "src_tokens" in net_input:
            src_tokens = net_input["src_tokens"]
            # length of the source text being the character length except EndOfSentence and pad
            src_lengths = ((src_tokens.ne(self.eos)
                            & src_tokens.ne(self.pad)).long().sum(dim=1))
        elif "source" in net_input:
            src_tokens = net_input["source"]
            src_lengths = (net_input["padding_mask"].size(-1) -
                           net_input["padding_mask"].sum(-1)
                           if net_input["padding_mask"] is not None else
                           torch.tensor(src_tokens.size(-1)).to(src_tokens))
        elif "features" in net_input:
            src_tokens = net_input["features"]
            src_lengths = (net_input["padding_mask"].size(-1) -
                           net_input["padding_mask"].sum(-1)
                           if net_input["padding_mask"] is not None else
                           torch.tensor(src_tokens.size(-1)).to(src_tokens))
        else:
            raise Exception(
                "expected src_tokens or source in net input. input keys: " +
                str(net_input.keys()))

        # bsz: total number of sentences in beam
        # Note that src_tokens may have more than 2 dimensions (i.e. audio features)
        bsz, src_len = src_tokens.size()[:2]
        beam_size = self.beam_size

        if constraints is not None and not self.search.supports_constraints:
            raise NotImplementedError(
                "Target-side constraints were provided, but search method doesn't support them"
            )

        # Initialize constraints, when active
        self.search.init_constraints(constraints, beam_size)

        max_len: int = -1
        if self.match_source_len:
            max_len = src_lengths.max().item()
        else:
            max_len = min(
                int(self.max_len_a * src_len + self.max_len_b),
                self.max_len - 1,
            )
        assert (
            self.min_len <= max_len
        ), "min_len cannot be larger than max_len, please adjust these!"
        # compute the encoder output for each beam
        with torch.autograd.profiler.record_function(
                "EnsembleModel: forward_encoder"):
            encoder_outs = self.model.forward_encoder(net_input)

        # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores
        new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
        new_order = new_order.to(src_tokens.device).long()
        encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order)
        # ensure encoder_outs is a List.
        assert encoder_outs is not None

        # initialize buffers
        scores = (torch.zeros(bsz * beam_size,
                              max_len + 1).to(src_tokens).float()
                  )  # +1 for eos; pad is never chosen for scoring
        tokens = (torch.zeros(bsz * beam_size,
                              max_len + 2).to(src_tokens).long().fill_(
                                  self.pad))  # +2 for eos and pad

        tokens = tokens.to('cpu')
        tokens[:, 0] = self.eos if bos_token is None else bos_token
        tokens = tokens.to('hpu')
        attn: Optional[Tensor] = None

        # A list that indicates candidates that should be ignored.
        # For example, suppose we're sampling and have already finalized 2/5
        # samples. Then cands_to_ignore would mark 2 positions as being ignored,
        # so that we only finalize the remaining 3 samples.
        cands_to_ignore = (torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
                           )  # forward and backward-compatible False mask

        # list of completed sentences
        finalized = torch.jit.annotate(
            List[List[Dict[str, Tensor]]],
            [
                torch.jit.annotate(List[Dict[str, Tensor]], [])
                for i in range(bsz)
            ],
        )  # contains lists of dictionaries of information about the hypothesis being finalized at each step

        # a boolean array indicating if the sentence at the index is finished or not
        finished = [False for i in range(bsz)]
        num_remaining_sent = bsz  # number of sentences remaining

        # number of candidate hypos per step
        cand_size = 2 * beam_size  # 2 x beam size in case half are EOS

        # offset arrays for converting between different indexing schemes
        bbsz_offsets = ((torch.arange(0, bsz) *
                         beam_size).unsqueeze(1).type_as(tokens).to(
                             src_tokens.device))
        cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(
            src_tokens.device)

        reorder_state: Optional[Tensor] = None
        batch_idxs: Optional[Tensor] = None

        original_batch_idxs: Optional[Tensor] = None
        if "id" in sample and isinstance(sample["id"], Tensor):
            original_batch_idxs = sample["id"]
        else:
            original_batch_idxs = torch.arange(0, bsz).type_as(tokens)

        for step in range(max_len + 1):  # one extra step for EOS marker
            # reorder decoder internal states based on the prev choice of beams
            if reorder_state is not None:
                if batch_idxs is not None:
                    # update beam indices to take into account removed sentences
                    corr = batch_idxs - torch.arange(
                        batch_idxs.numel()).type_as(batch_idxs)
                    corr = corr.to('cpu')
                    reorder_state = reorder_state.to('cpu')
                    reorder_state.view(-1, beam_size).add_(
                        corr.unsqueeze(-1) * beam_size)
                    corr = corr.to('hpu')
                    reorder_state = reorder_state.to('hpu')
                    original_batch_idxs = original_batch_idxs[batch_idxs]
                self.model.reorder_incremental_state(incremental_states,
                                                     reorder_state)
                encoder_outs = self.model.reorder_encoder_out(
                    encoder_outs, reorder_state)
            with torch.autograd.profiler.record_function(
                    "EnsembleModel: forward_decoder"):
                lprobs, avg_attn_scores = self.model.forward_decoder(
                    tokens[:, :step + 1],
                    encoder_outs,
                    incremental_states,
                    self.temperature,
                )

            if self.lm_model is not None:
                lm_out = self.lm_model(tokens[:, :step + 1])
                probs = self.lm_model.get_normalized_probs(lm_out,
                                                           log_probs=True,
                                                           sample=None)
                probs = probs[:, -1, :] * self.lm_weight
                lprobs += probs

            lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)

            lprobs = lprobs.to('cpu')
            lprobs[:, self.pad] = -math.inf  # never select pad
            lprobs[:, self.unk] -= self.unk_penalty  # apply unk penalty

            # handle max length constraint
            if step >= max_len:
                lprobs[:, :self.eos] = -math.inf
                lprobs[:, self.eos + 1:] = -math.inf
            lprobs = lprobs.to('hpu')

            # handle prefix tokens (possibly with different lengths)
            if (prefix_tokens is not None and step < prefix_tokens.size(1)
                    and step < max_len):
                lprobs, tokens, scores = self._prefix_tokens(
                    step, lprobs, scores, tokens, prefix_tokens, beam_size)
            elif step < self.min_len:
                # minimum length constraint (does not apply if using prefix_tokens)
                lprobs[:, self.eos] = -math.inf

            # Record attention scores, only support avg_attn_scores is a Tensor
            if avg_attn_scores is not None:
                if attn is None:
                    attn = torch.empty(bsz * beam_size,
                                       avg_attn_scores.size(1),
                                       max_len + 2).to(scores)
                    attn = attn.fill_(0.0)
                attn[:, :, step + 1].copy_(avg_attn_scores)

            scores = scores.type_as(lprobs)
            eos_bbsz_idx = torch.empty(0).to(
                tokens
            )  # indices of hypothesis ending with eos (finished sentences)
            eos_scores = torch.empty(0).to(
                scores
            )  # scores of hypothesis ending with eos (finished sentences)

            if self.should_set_src_lengths:
                self.search.set_src_lengths(src_lengths)

            if self.repeat_ngram_blocker is not None:
                lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz,
                                                   beam_size, step)

            # Shape: (batch, cand_size)
            tokens = tokens.to("cpu")
            lprobs = lprobs.to("cpu")
            scores = scores.to("cpu")
            cand_scores, cand_indices, cand_beams = self.search.step(
                step,
                lprobs.view(bsz, -1, self.vocab_size),
                scores.view(bsz, beam_size, -1)[:, :, :step],
                tokens[:, :step + 1],
                original_batch_idxs,
            )

            scores = scores.to("hpu")
            tokens = tokens.to("hpu")
            lprobs = lprobs.to("hpu")
            # cand_bbsz_idx contains beam indices for the top candidate
            # hypotheses, with a range of values: [0, bsz*beam_size),
            # and dimensions: [bsz, cand_size]
            cand_beams = cand_beams.to("hpu")
            cand_bbsz_idx = cand_beams.add(bbsz_offsets)
            # finalize hypotheses that end in eos
            # Shape of eos_mask: (batch size, beam size)
            eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
            cands_to_ignore = cands_to_ignore.to('cpu')
            eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(
                eos_mask)

            # only consider eos when it's among the top beam_size indices
            # Now we know what beam item(s) to finish
            # Shape: 1d list of absolute-numbered
            cand_bbsz_idx = cand_bbsz_idx.to("cpu")
            eos_bbsz_idx = torch.masked_select(cand_bbsz_idx[:, :beam_size],
                                               mask=eos_mask[:, :beam_size])
            cand_bbsz_idx = cand_bbsz_idx.to('hpu')
            eos_mask = eos_mask.to('hpu')
            cands_to_ignore = cands_to_ignore.to('hpu')
            eos_bbsz_idx = eos_bbsz_idx.to('hpu')

            finalized_sents: List[int] = []
            if eos_bbsz_idx.numel() > 0:
                eos_scores = torch.masked_select(cand_scores[:, :beam_size],
                                                 mask=eos_mask[:, :beam_size])

                finalized_sents = self.finalize_hypos(
                    step,
                    eos_bbsz_idx,
                    eos_scores,
                    tokens,
                    scores,
                    finalized,
                    finished,
                    beam_size,
                    attn,
                    src_lengths,
                    max_len,
                )
                num_remaining_sent -= len(finalized_sents)

            assert num_remaining_sent >= 0
            if num_remaining_sent == 0:
                break
            if self.search.stop_on_max_len and step >= max_len:
                break
            assert step < max_len, f"{step} < {max_len}"

            # Remove finalized sentences (ones for which {beam_size}
            # finished hypotheses have been generated) from the batch.
            if len(finalized_sents) > 0:
                new_bsz = bsz - len(finalized_sents)

                # construct batch_idxs which holds indices of batches to keep for the next pass
                batch_mask = torch.ones(bsz,
                                        dtype=torch.bool,
                                        device=cand_indices.device)
                batch_mask[finalized_sents] = False
                # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
                batch_idxs = torch.arange(
                    bsz, device=cand_indices.device).masked_select(batch_mask)

                # Choose the subset of the hypothesized constraints that will continue
                self.search.prune_sentences(batch_idxs)

                eos_mask = eos_mask[batch_idxs]
                cand_beams = cand_beams[batch_idxs]
                bbsz_offsets.resize_(new_bsz, 1)
                cand_bbsz_idx = cand_beams.add(bbsz_offsets)
                cand_scores = cand_scores[batch_idxs]
                cand_indices = cand_indices[batch_idxs]

                if prefix_tokens is not None:
                    prefix_tokens = prefix_tokens[batch_idxs]
                src_lengths = src_lengths[batch_idxs]
                cands_to_ignore = cands_to_ignore[batch_idxs]

                scores = scores.to('cpu')
                scores = scores.view(bsz, -1)[batch_idxs].view(
                    new_bsz * beam_size, -1)
                scores = scores.to('hpu')
                tokens = tokens.to('cpu')
                tokens = tokens.view(bsz, -1)[batch_idxs].view(
                    new_bsz * beam_size, -1)
                tokens = tokens.to('hpu')
                if attn is not None:
                    attn = attn.to('cpu')
                    attn = attn.view(bsz, -1)[batch_idxs].view(
                        new_bsz * beam_size, attn.size(1), -1)
                    attn = attn.to('hpu')

                bsz = new_bsz
            else:
                batch_idxs = None

            # Set active_mask so that values > cand_size indicate eos hypos
            # and values < cand_size indicate candidate active hypos.
            # After, the min values per row are the top candidate active hypos

            # Rewrite the operator since the element wise or is not supported in torchscript.

            eos_mask[:, :beam_size] = ~((~cands_to_ignore) &
                                        (~eos_mask[:, :beam_size]))

            active_mask = torch.add(
                eos_mask.type_as(cand_offsets) * cand_size,
                cand_offsets[:eos_mask.size(1)],
            )

            # get the top beam_size active hypotheses, which are just
            # the hypos with the smallest values in active_mask.
            # {active_hypos} indicates which {beam_size} hypotheses
            # from the list of {2 * beam_size} candidates were
            # selected. Shapes: (batch size, beam size)
            new_cands_to_ignore, active_hypos = torch.topk(active_mask,
                                                           k=beam_size,
                                                           dim=1,
                                                           largest=False)

            # update cands_to_ignore to ignore any finalized hypos.
            cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
            # Make sure there is at least one active item for each sentence in the batch.
            assert (~cands_to_ignore).any(dim=1).all()

            # update cands_to_ignore to ignore any finalized hypos

            # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam
            # can be selected more than once).
            active_bbsz_idx = torch.gather(cand_bbsz_idx,
                                           dim=1,
                                           index=active_hypos)
            active_scores = torch.gather(cand_scores,
                                         dim=1,
                                         index=active_hypos)
            cand_scores = cand_scores.to('cpu')
            active_hypos = active_hypos.to('cpu')
            active_hypos = active_hypos.to(dtype=torch.long)
            active_bbsz_idx = active_bbsz_idx.to('cpu')
            active_bbsz_idx = active_bbsz_idx.view(-1)
            active_scores = active_scores.to('cpu')
            active_scores = active_scores.view(-1)

            # copy tokens and scores for active hypotheses

            # Set the tokens for each beam (can select the same row more than once)
            tokens = tokens.to('cpu')
            tokens[:, :step + 1] = torch.index_select(tokens[:, :step + 1],
                                                      dim=0,
                                                      index=active_bbsz_idx)

            cand_indices = cand_indices.to('cpu')
            # Select the next token for each of them
            tokens.view(bsz, beam_size,
                        -1)[:, :, step + 1] = torch.gather(cand_indices,
                                                           dim=1,
                                                           index=active_hypos)
            tokens = tokens.to('hpu')
            cand_indices = cand_indices.to('hpu')

            scores = scores.to('cpu')
            if step > 0:
                scores[:, :step] = torch.index_select(scores[:, :step],
                                                      dim=0,
                                                      index=active_bbsz_idx)
            scores.view(bsz, beam_size,
                        -1)[:, :, step] = torch.gather(cand_scores,
                                                       dim=1,
                                                       index=active_hypos)

            scores = scores.to('hpu')
            cand_scores = cand_scores.to('hpu')
            active_hypos = active_hypos.to('hpu')
            # Update constraints based on which candidates were selected for the next beam
            self.search.update_constraints(active_hypos)

            # copy attention for active hypotheses
            if attn is not None:
                active_bbsz_idx = active_bbsz_idx.to('hpu')
                attn[:, :, :step + 2] = torch.index_select(
                    attn[:, :, :step + 2], dim=0, index=active_bbsz_idx)

            # reorder incremental state in decoder
            reorder_state = active_bbsz_idx

        # sort by score descending
        for sent in range(len(finalized)):
            scores = torch.tensor(
                [float(elem["score"].item()) for elem in finalized[sent]])
            _, sorted_scores_indices = torch.sort(scores, descending=True)
            finalized[sent] = [
                finalized[sent][ssi] for ssi in sorted_scores_indices
            ]
            finalized[sent] = torch.jit.annotate(List[Dict[str, Tensor]],
                                                 finalized[sent])

            if use_lazy_mode:
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()

        return finalized
Example #13
def train_one_epoch(model,
                    criterion,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq,
                    apex=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ", device=device)
    metric_logger.add_meter('lr',
                            utils.SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('img/s',
                            utils.SmoothedValue(window_size=10, fmt='{value}'))

    header = 'Epoch: [{}]'.format(epoch)
    step_count = 0
    last_print_time = time.time()

    for image, target in metric_logger.log_every(data_loader, print_freq,
                                                 header):
        image, target = image.to(device, non_blocking=True), target.to(
            device, non_blocking=True)

        dl_ex_start_time = time.time()

        if args.channels_last:
            image = image.contiguous(memory_format=torch.channels_last)

            if args.run_lazy_mode:
                # This mark_step is added so that the lazy kernel can
                # create and evaluate the graph to infer the resulting tensor
                # as channels_last
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()
        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()

        # We see a performance gain for mobilenet_v2 when this mark_step is added.
        if (args.run_lazy_mode and 'mobilenet_v2' in args.model):
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        optimizer.step()

        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        if step_count % print_freq == 0:
            output_cpu = output.detach().to('cpu')
            acc1, acc5 = utils.accuracy(output_cpu, target, topk=(1, 5))
            batch_size = image.shape[0]
            metric_logger.update(loss=loss.item(),
                                 lr=optimizer.param_groups[0]["lr"])
            metric_logger.meters['acc1'].update(acc1.item(),
                                                n=batch_size * print_freq)
            metric_logger.meters['acc5'].update(acc5.item(),
                                                n=batch_size * print_freq)
            current_time = time.time()
            last_print_time = dl_ex_start_time if args.dl_time_exclude else last_print_time
            metric_logger.meters['img/s'].update(
                batch_size * print_freq / (current_time - last_print_time))
            last_print_time = time.time()

        step_count = step_count + 1
        if step_count >= args.num_train_steps:
            break
def train(args, train_dataset, model, tokenizer, teacher=None):
    """Train the model"""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    if args.hpu and args.optimizer == "FusedAdamW":
        try:
            from habana_frameworks.torch.hpex.optimizers import FusedAdamW
        except ImportError:
            raise ImportError("Please install habana_torch.")
        optimizer = FusedAdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            eps=args.adam_epsilon,
        )
    elif args.optimizer == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.use_lazy_mode:
        try:
            import habana_frameworks.torch.core as htcore
        except ImportError:
            assert False, "Could not import habana_frameworks.torch.core"

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        if args.hpu:
            # Distributed DataParallel for HPU
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                bucket_cap_mb=230,
                find_unused_parameters=False,
                gradient_as_bucket_view=True,
                broadcast_buffers=False)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                find_unused_parameters=True)

    # Train!
    if args.local_rank in [-1, 0]:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d",
                    args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            args.train_batch_size * args.gradient_accumulation_steps *
            (torch.distributed.get_world_size()
             if args.local_rank != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    loss_list = []
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            if args.local_rank in [-1, 0]:
                logger.info(
                    "  Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("  Continuing training from epoch %d",
                            epochs_trained)
                logger.info("  Continuing training from global step %d",
                            global_step)
                logger.info(
                    "  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)
        except ValueError:
            if args.local_rank in [-1, 0]:
                logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    if args.use_lazy_mode:
        htcore.mark_step()

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0],
                              smoothing=1)
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            if teacher is not None:
                teacher.eval()

            try:
                from habana_frameworks.torch.hpex.normalization import FusedClipNorm
            except ImportError:
                raise ImportError("Please install habana_torch.")
            FusedNorm = FusedClipNorm(model.parameters(), args.max_grad_norm)

            if args.hpu:
                batch = [
                    b.type(torch.IntTensor) if b.dtype == torch.int64 else b
                    for b in batch
                ]

            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            if args.model_type != "distilbert":
                inputs[
                    "token_type_ids"] = None if args.model_type == "xlm" else batch[
                        2]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
            outputs = model(**inputs)
            loss, start_logits_stu, end_logits_stu = outputs

            # Distillation loss
            if teacher is not None:
                if "token_type_ids" not in inputs:
                    inputs[
                        "token_type_ids"] = None if args.teacher_type == "xlm" else batch[
                            2]
                with torch.no_grad():
                    start_logits_tea, end_logits_tea = teacher(
                        input_ids=inputs["input_ids"],
                        token_type_ids=inputs["token_type_ids"],
                        attention_mask=inputs["attention_mask"],
                    )
                assert start_logits_tea.size() == start_logits_stu.size()
                assert end_logits_tea.size() == end_logits_stu.size()

                loss_fct = nn.KLDivLoss(reduction="batchmean")
                loss_start = (loss_fct(
                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
                    F.softmax(start_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2))
                loss_end = (loss_fct(
                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
                    F.softmax(end_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2))
                loss_ce = (loss_start + loss_end) / 2.0

                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if args.use_lazy_mode:
                htcore.mark_step()

            loss_list.append(loss)
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    if args.hpu:
                        if args.optimizer == "FusedAdamW":
                            FusedNorm.clip_norm(model.parameters())
                        else:
                            if args.hmp:
                                from habana_frameworks.torch.hpex import hmp
                                with hmp.disable_casts():
                                    torch.nn.utils.clip_grad_norm_(
                                        model.parameters(), args.max_grad_norm)
                            else:
                                torch.nn.utils.clip_grad_norm_(
                                    model.parameters(), args.max_grad_norm)

                if args.hpu and args.hmp and not (args.optimizer
                                                  == "FusedAdamW"):
                    from habana_frameworks.torch.hpex import hmp
                    with hmp.disable_casts():
                        optimizer.step()
                else:
                    optimizer.step()

                if args.use_lazy_mode:
                    htcore.mark_step()

                scheduler.step()  # Update learning rate schedule
                model.zero_grad()

                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    for loss_t in loss_list:
                        tr_loss += loss_t.item()
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss
                    loss_list.clear()

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
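                    # On HPU, copy the model to the CPU before saving so the checkpoint contains plain CPU tensors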
                    if args.hpu:
                        d = next(model_to_save.parameters()).device
                        if (d != torch.device("cpu")):
                            import copy
                            model_to_save_clone = copy.deepcopy(model_to_save)
                            model_to_save_clone.to(torch.device("cpu"))
                            model_to_save_clone.save_pretrained(output_dir)
                            torch.save(model_to_save_clone.state_dict(),
                                       os.path.join(output_dir, "model.pt"))
                        else:
                            model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

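                    # Likewise, move optimizer state tensors to the CPU before serializing the optimizer on HPU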
                    if args.hpu:
                        opt_state = optimizer.state_dict()
                        param_groups_copy = opt_state['param_groups']
                        state_dict_copy = {}
                        for st_key, st_val in opt_state['state'].items():
                            st_val_copy = {}
                            for k, v in st_val.items():
                                st_val_copy[k] = v.to('cpu') if isinstance(
                                    v, torch.Tensor) else v
                            state_dict_copy[st_key] = st_val_copy
                        optim_dict = {}
                        optim_dict['state'] = state_dict_copy
                        optim_dict['param_groups'] = param_groups_copy
                        torch.save(optim_dict,
                                   os.path.join(output_dir, "optimizer.pt"))
                    else:
                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step


def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    if args.use_lazy_mode:
        try:
            import habana_frameworks.torch.core as htcore
        except ImportError:
            raise ImportError("Could not import habana_frameworks.torch.core")

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly, so a SequentialSampler is used for evaluation
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    if args.local_rank in [-1, 0]:
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            if args.model_type != "distilbert":
                # XLM does not use segment_ids
                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
            example_indices = batch[3]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = model(**inputs)

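            # In lazy mode, mark_step() forces the forward graph to execute before the logits are read back below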
            if args.use_lazy_mode:
                htcore.mark_step()

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) return five outputs for their predictions, while
            # simpler models return only two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    if args.local_rank in [-1, 0]:
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet and XLM use a more complex post-processing procedure
        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            model.config.start_n_top,
            model.config.end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Beispiel #16
0
def mark_step(use_hpu=True):
    if not use_hpu:
        return
    import habana_frameworks.torch.core as htcore
    htcore.mark_step()
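
A minimal usage sketch for the helper above, assuming an already-built model, criterion, and optimizer (these names are illustrative placeholders, not part of the original example): in Habana lazy mode, mark_step() is typically called once after backward() and once after optimizer.step(), so each accumulated graph is flushed to the HPU.

def train_step(model, criterion, optimizer, images, target, use_hpu=True):
    # Hypothetical helper showing where mark_step() fits in a lazy-mode training step
    output = model(images)
    loss = criterion(output, target)
    loss.backward()
    mark_step(use_hpu)   # flush the accumulated backward graph
    optimizer.step()
    optimizer.zero_grad()
    mark_step(use_hpu)   # flush the optimizer update
    return loss.item()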