def permute_params(model, to_filters_last, lazy_mode):
    with torch.no_grad():
        for name, param in model.named_parameters():
            if param.ndim == 4:
                if to_filters_last:
                    param.data = param.data.permute((2, 3, 1, 0))
                else:
                    param.data = param.data.permute((3, 2, 0, 1))  # permute RSCK to KCRS
    if lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()

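# Illustrative shape check (added, not from the original source): a Conv2d
# weight of shape (K=64, C=3, R=7, S=7) in KCRS layout becomes RSCK under the
# filters-last permute used above, and the reverse permute restores it.
#
#   w = torch.empty(64, 3, 7, 7)                               # KCRS
#   assert w.permute((2, 3, 1, 0)).shape == (7, 7, 3, 64)      # RSCK
#   assert w.permute((2, 3, 1, 0)).permute((3, 2, 0, 1)).shape == w.shape
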
def validate(val_loader, model, criterion, device, args):
    # Images per second with data loading time
    image_time_DL = AverageMeter('imgs/s(Inc. DL)', ':6.3f')
    batch_time = AverageMeter('BatchTime', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':0.2f')
    top5 = AverageMeter('Acc@5', ':0.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, image_time_DL, losses, top1, top5],
        prefix='Test: ')

    print("MODEL EVAL")
    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        data_end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images, target = images.to(device, non_blocking=True), target.to(device, non_blocking=True)
            images = images.contiguous(memory_format=torch.channels_last)
            if args.enable_lazy:
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()

            # compute output
            output = model(images)
            loss = criterion(output, target)

            if i % args.print_interval == 0:
                acc1, acc5 = accuracy_classification(output, target, topk=(1, 5))
                batch_size = images.shape[0]
                losses.update(loss.item())
                top1.update(acc1.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)
                top5.update(acc5.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)
                # measure elapsed time
                image_time_DL.update(batch_size*args.print_interval/(time.time() - data_end),
                                     n=args.print_interval)
                batch_time.update(time.time() - data_end, n=1, avoid_warmup=True)
                progress.display(i)
            data_end = time.time()
            if i == args.num_train_steps-1:
                break

    # gather the stats from all processes
    top1.synchronize_between_processes(device)
    top5.synchronize_between_processes(device)
    print(' * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}'.format(
        top1=top1, top5=top5))
    return top1.global_avg

def permute_momentum(optimizer, to_filters_last, lazy_mode):
    # Permute the momentum buffer before using for checkpoint
    for group in optimizer.param_groups:
        for p in group['params']:
            param_state = optimizer.state[p]
            if 'momentum_buffer' in param_state:
                buf = param_state['momentum_buffer']
                if buf.ndim == 4:
                    if to_filters_last:
                        buf = buf.permute((2, 3, 1, 0))
                    else:
                        buf = buf.permute((3, 2, 0, 1))
                    param_state['momentum_buffer'] = buf
    if lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()

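# Hedged usage sketch (an assumption, not code from the original source):
# permute_params and permute_momentum are typically paired around checkpoint
# serialization, so tensors are saved in PyTorch's native KCRS layout while
# training continues in the HPU-friendly filters-last (RSCK) layout.
# `model`, `optimizer`, `path`, and `lazy_mode` are assumed caller-side names.
def save_checkpoint_sketch(model, optimizer, path, lazy_mode):
    # convert weights and momentum buffers back to KCRS before serializing
    permute_params(model, to_filters_last=False, lazy_mode=lazy_mode)
    permute_momentum(optimizer, to_filters_last=False, lazy_mode=lazy_mode)
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)
    # restore the filters-last layout so training can continue on HPU
    permute_params(model, to_filters_last=True, lazy_mode=lazy_mode)
    permute_momentum(optimizer, to_filters_last=True, lazy_mode=lazy_mode)
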
def evaluate(model, criterion, data_loader, device, print_freq=100):
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ", device=device)
    header = 'Test:'
    step_count = 0
    with torch.no_grad():
        for image, target in metric_logger.log_every(data_loader, print_freq, header):
            image = image.to(device, non_blocking=True)
            if args.channels_last:
                image = image.contiguous(memory_format=torch.channels_last)
                if args.run_lazy_mode:
                    # This mark_step is added so that the lazy kernel can
                    # create and evaluate the graph to infer the resulting
                    # tensor as channels_last
                    import habana_frameworks.torch.core as htcore
                    htcore.mark_step()
            target = target.to(device, non_blocking=True)
            output = model(image)
            loss = criterion(output, target)

            acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = image.shape[0]
            loss_cpu = loss.to('cpu').detach()
            metric_logger.update(loss=loss_cpu.item())
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
            step_count = step_count + 1
            if step_count >= args.num_eval_steps:
                break
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()

    # Return from here if the evaluation phase did not go through any
    # iterations (e.g. the dataset is so small that there is only one eval
    # batch, but it was skipped in the data loader due to drop_last=True).
    if len(metric_logger.meters) == 0:
        return

    print(' * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}'.format(
        top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg

def _inference_with_bleu(self, generator, sample, model):
    import sacrebleu

    def decode(toks, escape_unk=False):
        s = self.tgt_dict.string(
            toks.int().cpu(),
            self.cfg.eval_bleu_remove_bpe,
            # The default unknown string in fairseq is `<unk>`, but
            # this is tokenized by sacrebleu as `< unk >`, inflating
            # BLEU scores. Instead, we use a somewhat more verbose
            # alternative that is unlikely to appear in the real
            # reference, but doesn't get split into multiple tokens.
            unk_string=("UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"),
        )
        if self.tokenizer:
            s = self.tokenizer.decode(s)
        return s

    gen_out = self.inference_step(generator, [model], sample, prefix_tokens=None)
    if self.cfg.use_habana and self.cfg.use_lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()
    hyps, refs = [], []
    for i in range(len(gen_out)):
        hyps.append(decode(gen_out[i][0]["tokens"]))
        refs.append(
            decode(
                utils.strip_pad(sample["target"][i], self.tgt_dict.pad()),
                escape_unk=True,  # don't count <unk> as matches to the hypo
            ))
    if self.cfg.eval_bleu_print_samples:
        logger.info("example hypothesis: " + hyps[0])
        logger.info("example reference: " + refs[0])
    if self.cfg.eval_tokenized_bleu:
        return sacrebleu.corpus_bleu(hyps, [refs], tokenize="none")
    else:
        return sacrebleu.corpus_bleu(hyps, [refs])

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        data = data.contiguous(memory_format=torch.channels_last)
        if args.use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        if args.use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        optimizer.step()
        if args.use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data),
                len(train_loader.dataset) / args.world_size,
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break

def train(train_loader, model, criterion, optimizer, epoch, device, args):
    batch_time = AverageMeter('BatchTime', ':6.3f')
    # Images per second with data loading time
    image_time_DL = AverageMeter('imgs/s(Inc. DL)', ':6.3f')
    image_time = AverageMeter('imgs/s(Exc. DL)', ':6.3f')
    data_time = AverageMeter('DL Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':0.2f')
    top5 = AverageMeter('Acc@5', ':0.2f')
    if args.print_interval == 1:
        progress = ProgressMeter(
            len(train_loader),
            [batch_time, image_time, image_time_DL, data_time, losses, top1, top5],
            prefix='Epoch: [{}]'.format(epoch))
    else:
        progress = ProgressMeter(
            len(train_loader),
            [batch_time, image_time_DL, data_time, losses, top1, top5],
            prefix='Epoch: [{}]'.format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        images, target = images.to(device, non_blocking=True), target.to(device, non_blocking=True)
        # measure data loading time
        data_end = time.time()
        data_loading_time = data_end - end
        images = images.contiguous(memory_format=torch.channels_last)
        if args.enable_lazy:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        # compute output
        output = model(images)
        if not args.no_aux_logits:
            aux_logits2 = output.aux_logits2
            aux_logits1 = output.aux_logits1
            output = output.logits
            # "Going Deeper with Convolutions" <http://arxiv.org/abs/1409.4842>, Page 6.
            loss = criterion(output, target) + 0.3*(criterion(aux_logits2, target) +
                                                    criterion(aux_logits1, target))
        else:
            loss = criterion(output, target)

        optimizer.zero_grad()
        if args.device == 'gpu' and args.is_amp:
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if args.enable_lazy:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        if args.is_hmp:
            from habana_frameworks.torch.hpex import hmp
            with hmp.disable_casts():
                optimizer.step()
        else:
            optimizer.step()
        if args.enable_lazy:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if i % args.print_interval == 0:
            # measure accuracy and record loss
            acc1, acc5 = accuracy_classification(output, target, topk=(1, 5))
            batch_size = images.shape[0]
            losses.update(loss.item(), n=batch_size)
            top1.update(acc1.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)
            top5.update(acc5.to(torch.device('cpu'))[0], n=batch_size*args.print_interval)
            batch_elapsed_time = time.time() - data_end
            total_elapsed_time = time.time() - end
            # measure elapsed time
            if epoch == 0:
                batch_time.update(batch_elapsed_time, n=args.print_interval,
                                  skip=2*args.print_interval, avoid_warmup=True)
                data_time.update(data_loading_time, n=args.print_interval, avoid_warmup=True)
                image_time_DL.update(batch_size*args.print_interval/total_elapsed_time,
                                     n=args.print_interval, skip=2*args.print_interval,
                                     avoid_warmup=True)
                if args.print_interval == 1:
                    image_time.update(batch_size/batch_elapsed_time, n=args.print_interval,
                                      skip=2*args.print_interval, avoid_warmup=True)
            else:
                batch_time.update(batch_elapsed_time, n=args.print_interval)
                data_time.update(data_loading_time, n=args.print_interval)
                image_time_DL.update(batch_size*args.print_interval/total_elapsed_time,
                                     n=args.print_interval)
                if args.print_interval == 1:
                    image_time.update(batch_size/batch_elapsed_time, n=args.print_interval,
                                      skip=2*args.print_interval)
            progress.display(i)
        end = time.time()
        if i == args.num_train_steps-1:
            break

def mark_step(is_lazy_mode):
    if is_lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()

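# Hedged usage sketch (an assumption, not from the original source): with the
# helper above, the repeated lazy-mode boilerplate in the training loops can
# collapse to single calls. All argument names here are illustrative.
def train_step_sketch(model, criterion, optimizer, images, target, is_lazy_mode):
    mark_step(is_lazy_mode)   # flush pending input-side ops (e.g. channels_last)
    output = model(images)
    loss = criterion(output, target)
    optimizer.zero_grad()
    loss.backward()
    mark_step(is_lazy_mode)   # break the graph between backward and optimizer step
    optimizer.step()
    mark_step(is_lazy_mode)   # break the graph after the optimizer update
    return loss
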
def test_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
    # Break lazy accumulation of graph after every step
    htcore.mark_step()
    return step_output

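# Hedged companion sketch (an assumption, not from the original source): the
# same per-step graph break is usually wanted on the training path of a
# PyTorch Lightning module running on HPU.
def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
    # Break lazy accumulation of graph after every training step
    htcore.mark_step()
    return step_output
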
def forward_decoder(
    self,
    tokens,
    encoder_outs: List[Dict[str, List[Tensor]]],
    incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
    temperature: float = 1.0,
):
    log_probs = []
    avg_attn: Optional[Tensor] = None
    encoder_out: Optional[Dict[str, List[Tensor]]] = None
    for i, model in enumerate(self.models):
        if self.has_encoder():
            encoder_out = encoder_outs[i]
        if use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        # decode each model
        if self.has_incremental_states():
            decoder_out = model.decoder.forward(
                tokens,
                encoder_out=encoder_out,
                incremental_state=incremental_states[i],
            )
        else:
            if hasattr(model, "decoder"):
                decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out)
            else:
                decoder_out = model.forward(tokens)
        if use_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        attn: Optional[Tensor] = None
        decoder_len = len(decoder_out)
        if decoder_len > 1 and decoder_out[1] is not None:
            if isinstance(decoder_out[1], Tensor):
                attn = decoder_out[1]
            else:
                attn_holder = decoder_out[1]["attn"]
                if isinstance(attn_holder, Tensor):
                    attn = attn_holder
                elif attn_holder is not None:
                    attn = attn_holder[0]
            if attn is not None:
                attn = attn[:, -1, :]

        decoder_out_tuple = (
            decoder_out[0][:, -1:, :].div_(temperature),
            None if decoder_len <= 1 else decoder_out[1],
        )
        probs = model.get_normalized_probs(decoder_out_tuple,
                                           log_probs=True,
                                           sample=None)
        probs = probs[:, -1, :]
        if self.models_size == 1:
            return probs, attn

        log_probs.append(probs)
        if attn is not None:
            if avg_attn is None:
                avg_attn = attn
            else:
                avg_attn.add_(attn)

    avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0),
                                dim=0) - math.log(self.models_size)
    if avg_attn is not None:
        avg_attn.div_(self.models_size)
    return avg_probs, avg_attn

def finalize_hypos(
    self,
    step: int,
    bbsz_idx,
    eos_scores,
    tokens,
    scores,
    finalized: List[List[Dict[str, Tensor]]],
    finished: List[bool],
    beam_size: int,
    attn: Optional[Tensor],
    src_lengths,
    max_len: int,
):
    """Finalize hypothesis, store finalized information in `finalized`, and
    change `finished` accordingly. A sentence is finalized when {beam_size}
    finished items have been collected for it.

    Returns number of sentences (not beam items) being finalized.
    These will be removed from the batch and not processed further.

    Args:
        bbsz_idx (Tensor):
    """
    assert bbsz_idx.numel() == eos_scores.numel()

    # clone relevant token and attention tensors.
    # tokens is (batch * beam, max_len). So the index_select
    # gets the newly EOS rows, then selects cols 1..{step + 2}
    tokens = tokens.to('cpu')
    bbsz_idx = bbsz_idx.to('cpu')
    tokens_clone = tokens.index_select(
        0, bbsz_idx)[:, 1:step + 2]  # skip the first index, which is EOS
    tokens_clone[:, step] = self.eos
    tokens_clone = tokens_clone.to('hpu')
    tokens = tokens.to('hpu')
    # guard the device moves: `attn` may be None when the models expose no attention
    if attn is not None:
        attn = attn.to('cpu')
        attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step + 2]
        attn = attn.to('hpu')
    else:
        attn_clone = None

    # compute scores per token position
    scores = scores.to('cpu')
    pos_scores = scores.index_select(0, bbsz_idx)[:, :step + 1]
    scores = scores.to('hpu')
    bbsz_idx = bbsz_idx.to('hpu')
    pos_scores[:, step] = eos_scores
    # convert from cumulative to per-position scores
    pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]

    # normalize sentence-level scores
    if self.normalize_scores:
        eos_scores /= (step + 1)**self.len_penalty

    # cum_unfin records which sentences in the batch are finished.
    # It helps match indexing between (a) the original sentences
    # in the batch and (b) the current, possibly-reduced set of
    # sentences.
    cum_unfin: List[int] = []
    prev = 0
    for f in finished:
        if f:
            prev += 1
        else:
            cum_unfin.append(prev)

    # The keys here are of the form "{sent}_{unfin_idx}", where
    # "unfin_idx" is the index in the current (possibly reduced)
    # list of sentences, and "sent" is the index in the original,
    # unreduced batch
    # set() is not supported in script export
    sents_seen: Dict[str, Optional[Tensor]] = {}

    # For every finished beam item
    for i in range(bbsz_idx.size()[0]):
        idx = bbsz_idx[i]
        score = eos_scores[i]
        # sentence index in the current (possibly reduced) batch
        unfin_idx = idx // beam_size
        # sentence index in the original (unreduced) batch
        sent = unfin_idx + cum_unfin[unfin_idx]
        # Cannot create dict for key type '(int, int)' in torchscript.
        # The workaround is to cast int to string
        seen = str(sent.item()) + "_" + str(unfin_idx.item())
        if seen not in sents_seen:
            sents_seen[seen] = None

        if self.match_source_len and step > src_lengths[unfin_idx]:
            score = torch.tensor(-math.inf).to(score)

        # An input sentence (among those in a batch) is finished when
        # beam_size hypotheses have been collected for it
        if len(finalized[sent]) < beam_size:
            if attn_clone is not None:
                # remove padding tokens from attn scores
                hypo_attn = attn_clone[i]
            else:
                hypo_attn = torch.empty(0)

            finalized[sent].append({
                "tokens": tokens_clone[i],
                "score": score,
                "attention": hypo_attn,  # src_len x tgt_len
                "alignment": torch.empty(0),
                "positional_scores": pos_scores[i],
            })

    newly_finished: List[int] = []
    for seen in sents_seen.keys():
        # check termination conditions for this sentence
        sent: int = int(float(seen.split("_")[0]))
        unfin_idx: int = int(float(seen.split("_")[1]))

        if not finished[sent] and self.is_finished(
                step, unfin_idx, max_len, len(finalized[sent]), beam_size):
            finished[sent] = True
            newly_finished.append(unfin_idx)
    if use_lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()
    return newly_finished

def _generate(
    self,
    sample: Dict[str, Dict[str, Tensor]],
    prefix_tokens: Optional[Tensor] = None,
    constraints: Optional[Tensor] = None,
    bos_token: Optional[int] = None,
):
    incremental_states = torch.jit.annotate(
        List[Dict[str, Dict[str, Optional[Tensor]]]],
        [
            torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
            for i in range(self.model.models_size)
        ],
    )
    net_input = sample["net_input"]

    if "src_tokens" in net_input:
        src_tokens = net_input["src_tokens"]
        # length of the source text being the character length except EndOfSentence and pad
        src_lengths = ((src_tokens.ne(self.eos)
                        & src_tokens.ne(self.pad)).long().sum(dim=1))
    elif "source" in net_input:
        src_tokens = net_input["source"]
        src_lengths = (net_input["padding_mask"].size(-1) -
                       net_input["padding_mask"].sum(-1)
                       if net_input["padding_mask"] is not None else
                       torch.tensor(src_tokens.size(-1)).to(src_tokens))
    elif "features" in net_input:
        src_tokens = net_input["features"]
        src_lengths = (net_input["padding_mask"].size(-1) -
                       net_input["padding_mask"].sum(-1)
                       if net_input["padding_mask"] is not None else
                       torch.tensor(src_tokens.size(-1)).to(src_tokens))
    else:
        raise Exception(
            "expected src_tokens or source in net input. input keys: " +
            str(net_input.keys()))

    # bsz: total number of sentences in beam
    # Note that src_tokens may have more than 2 dimensions (i.e. audio features)
    bsz, src_len = src_tokens.size()[:2]
    beam_size = self.beam_size

    if constraints is not None and not self.search.supports_constraints:
        raise NotImplementedError(
            "Target-side constraints were provided, but search method doesn't support them"
        )

    # Initialize constraints, when active
    self.search.init_constraints(constraints, beam_size)

    max_len: int = -1
    if self.match_source_len:
        max_len = src_lengths.max().item()
    else:
        max_len = min(
            int(self.max_len_a * src_len + self.max_len_b),
            self.max_len - 1,
        )
    assert (
        self.min_len <= max_len
    ), "min_len cannot be larger than max_len, please adjust these!"

    # compute the encoder output for each beam
    with torch.autograd.profiler.record_function(
            "EnsembleModel: forward_encoder"):
        encoder_outs = self.model.forward_encoder(net_input)

    # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores
    new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
    new_order = new_order.to(src_tokens.device).long()
    encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order)
    # ensure encoder_outs is a List.
    assert encoder_outs is not None

    # initialize buffers
    scores = (torch.zeros(bsz * beam_size,
                          max_len + 1).to(src_tokens).float()
              )  # +1 for eos; pad is never chosen for scoring
    tokens = (torch.zeros(bsz * beam_size,
                          max_len + 2).to(src_tokens).long().fill_(
                              self.pad))  # +2 for eos and pad
    tokens = tokens.to('cpu')
    tokens[:, 0] = self.eos if bos_token is None else bos_token
    tokens = tokens.to('hpu')
    attn: Optional[Tensor] = None

    # A list that indicates candidates that should be ignored.
    # For example, suppose we're sampling and have already finalized 2/5
    # samples. Then cands_to_ignore would mark 2 positions as being ignored,
    # so that we only finalize the remaining 3 samples.
    cands_to_ignore = (torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
                       )  # forward and backward-compatible False mask

    # list of completed sentences
    finalized = torch.jit.annotate(
        List[List[Dict[str, Tensor]]],
        [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)],
    )  # contains lists of dictionaries of information about the hypothesis being finalized at each step

    # a boolean array indicating if the sentence at the index is finished or not
    finished = [False for i in range(bsz)]
    num_remaining_sent = bsz  # number of sentences remaining

    # number of candidate hypos per step
    cand_size = 2 * beam_size  # 2 x beam size in case half are EOS

    # offset arrays for converting between different indexing schemes
    bbsz_offsets = ((torch.arange(0, bsz) *
                     beam_size).unsqueeze(1).type_as(tokens).to(src_tokens.device))
    cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device)

    reorder_state: Optional[Tensor] = None
    batch_idxs: Optional[Tensor] = None

    original_batch_idxs: Optional[Tensor] = None
    if "id" in sample and isinstance(sample["id"], Tensor):
        original_batch_idxs = sample["id"]
    else:
        original_batch_idxs = torch.arange(0, bsz).type_as(tokens)

    for step in range(max_len + 1):  # one extra step for EOS marker
        # reorder decoder internal states based on the prev choice of beams
        if reorder_state is not None:
            if batch_idxs is not None:
                # update beam indices to take into account removed sentences
                corr = batch_idxs - torch.arange(
                    batch_idxs.numel()).type_as(batch_idxs)
                corr = corr.to('cpu')
                reorder_state = reorder_state.to('cpu')
                reorder_state.view(-1, beam_size).add_(
                    corr.unsqueeze(-1) * beam_size)
                corr = corr.to('hpu')
                reorder_state = reorder_state.to('hpu')
                original_batch_idxs = original_batch_idxs[batch_idxs]
            self.model.reorder_incremental_state(incremental_states,
                                                 reorder_state)
            encoder_outs = self.model.reorder_encoder_out(
                encoder_outs, reorder_state)

        with torch.autograd.profiler.record_function(
                "EnsembleModel: forward_decoder"):
            lprobs, avg_attn_scores = self.model.forward_decoder(
                tokens[:, :step + 1],
                encoder_outs,
                incremental_states,
                self.temperature,
            )

        if self.lm_model is not None:
            lm_out = self.lm_model(tokens[:, :step + 1])
            probs = self.lm_model.get_normalized_probs(lm_out,
                                                       log_probs=True,
                                                       sample=None)
            probs = probs[:, -1, :] * self.lm_weight
            lprobs += probs

        lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)

        lprobs = lprobs.to('cpu')
        lprobs[:, self.pad] = -math.inf  # never select pad
        lprobs[:, self.unk] -= self.unk_penalty  # apply unk penalty

        # handle max length constraint
        if step >= max_len:
            lprobs[:, :self.eos] = -math.inf
            lprobs[:, self.eos + 1:] = -math.inf
        lprobs = lprobs.to('hpu')

        # handle prefix tokens (possibly with different lengths)
        if (prefix_tokens is not None and step < prefix_tokens.size(1)
                and step < max_len):
            lprobs, tokens, scores = self._prefix_tokens(
                step, lprobs, scores, tokens, prefix_tokens, beam_size)
        elif step < self.min_len:
            # minimum length constraint (does not apply if using prefix_tokens)
            lprobs[:, self.eos] = -math.inf

        # Record attention scores, only support avg_attn_scores is a Tensor
        if avg_attn_scores is not None:
            if attn is None:
                attn = torch.empty(bsz * beam_size,
                                   avg_attn_scores.size(1),
                                   max_len + 2).to(scores)
                attn = attn.fill_(0.0)
            attn[:, :, step + 1].copy_(avg_attn_scores)

        scores = scores.type_as(lprobs)
        eos_bbsz_idx = torch.empty(0).to(
            tokens
        )  # indices of hypothesis ending with eos (finished sentences)
        eos_scores = torch.empty(0).to(
            scores
        )  # scores of hypothesis ending with eos (finished sentences)

        if self.should_set_src_lengths:
            self.search.set_src_lengths(src_lengths)

        if self.repeat_ngram_blocker is not None:
            lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step)

        # Shape: (batch, cand_size)
        tokens = tokens.to("cpu")
        lprobs = lprobs.to("cpu")
        scores = scores.to("cpu")
        cand_scores, cand_indices, cand_beams = self.search.step(
            step,
            lprobs.view(bsz, -1, self.vocab_size),
            scores.view(bsz, beam_size, -1)[:, :, :step],
            tokens[:, :step + 1],
            original_batch_idxs,
        )
        scores = scores.to("hpu")
        tokens = tokens.to("hpu")
        lprobs = lprobs.to("hpu")

        # cand_bbsz_idx contains beam indices for the top candidate
        # hypotheses, with a range of values: [0, bsz*beam_size),
        # and dimensions: [bsz, cand_size]
        cand_beams = cand_beams.to("hpu")
        cand_bbsz_idx = cand_beams.add(bbsz_offsets)

        # finalize hypotheses that end in eos
        # Shape of eos_mask: (batch size, beam size)
        eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
        cands_to_ignore = cands_to_ignore.to('cpu')
        eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask)

        # only consider eos when it's among the top beam_size indices
        # Now we know what beam item(s) to finish
        # Shape: 1d list of absolute-numbered
        cand_bbsz_idx = cand_bbsz_idx.to("cpu")
        eos_bbsz_idx = torch.masked_select(cand_bbsz_idx[:, :beam_size],
                                           mask=eos_mask[:, :beam_size])
        cand_bbsz_idx = cand_bbsz_idx.to('hpu')
        eos_mask = eos_mask.to('hpu')
        cands_to_ignore = cands_to_ignore.to('hpu')
        eos_bbsz_idx = eos_bbsz_idx.to('hpu')

        finalized_sents: List[int] = []
        if eos_bbsz_idx.numel() > 0:
            eos_scores = torch.masked_select(cand_scores[:, :beam_size],
                                             mask=eos_mask[:, :beam_size])

            finalized_sents = self.finalize_hypos(
                step,
                eos_bbsz_idx,
                eos_scores,
                tokens,
                scores,
                finalized,
                finished,
                beam_size,
                attn,
                src_lengths,
                max_len,
            )
            num_remaining_sent -= len(finalized_sents)

        assert num_remaining_sent >= 0
        if num_remaining_sent == 0:
            break
        if self.search.stop_on_max_len and step >= max_len:
            break
        assert step < max_len, f"{step} < {max_len}"

        # Remove finalized sentences (ones for which {beam_size}
        # finished hypotheses have been generated) from the batch.
        if len(finalized_sents) > 0:
            new_bsz = bsz - len(finalized_sents)

            # construct batch_idxs which holds indices of batches to keep for the next pass
            batch_mask = torch.ones(bsz,
                                    dtype=torch.bool,
                                    device=cand_indices.device)
            batch_mask[finalized_sents] = False
            # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
            batch_idxs = torch.arange(
                bsz, device=cand_indices.device).masked_select(batch_mask)

            # Choose the subset of the hypothesized constraints that will continue
            self.search.prune_sentences(batch_idxs)

            eos_mask = eos_mask[batch_idxs]
            cand_beams = cand_beams[batch_idxs]
            bbsz_offsets.resize_(new_bsz, 1)
            cand_bbsz_idx = cand_beams.add(bbsz_offsets)
            cand_scores = cand_scores[batch_idxs]
            cand_indices = cand_indices[batch_idxs]

            if prefix_tokens is not None:
                prefix_tokens = prefix_tokens[batch_idxs]
            src_lengths = src_lengths[batch_idxs]
            cands_to_ignore = cands_to_ignore[batch_idxs]

            scores = scores.to('cpu')
            scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
            scores = scores.to('hpu')
            tokens = tokens.to('cpu')
            tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
            tokens = tokens.to('hpu')
            if attn is not None:
                attn = attn.to('cpu')
                attn = attn.view(bsz, -1)[batch_idxs].view(
                    new_bsz * beam_size, attn.size(1), -1)
                attn = attn.to('hpu')
            bsz = new_bsz
        else:
            batch_idxs = None

        # Set active_mask so that values > cand_size indicate eos hypos
        # and values < cand_size indicate candidate active hypos.
        # After, the min values per row are the top candidate active hypos
        # Rewrite the operator since the element wise or is not supported in torchscript.
        eos_mask[:, :beam_size] = ~((~cands_to_ignore) &
                                    (~eos_mask[:, :beam_size]))
        active_mask = torch.add(
            eos_mask.type_as(cand_offsets) * cand_size,
            cand_offsets[:eos_mask.size(1)],
        )

        # get the top beam_size active hypotheses, which are just
        # the hypos with the smallest values in active_mask.
        # {active_hypos} indicates which {beam_size} hypotheses
        # from the list of {2 * beam_size} candidates were
        # selected. Shapes: (batch size, beam size)
        new_cands_to_ignore, active_hypos = torch.topk(active_mask,
                                                       k=beam_size,
                                                       dim=1,
                                                       largest=False)

        # update cands_to_ignore to ignore any finalized hypos.
        cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
        # Make sure there is at least one active item for each sentence in the batch.
        assert (~cands_to_ignore).any(dim=1).all()

        # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis
        # (a beam can be selected more than once).
        active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos)
        active_scores = torch.gather(cand_scores, dim=1, index=active_hypos)

        cand_scores = cand_scores.to('cpu')
        active_hypos = active_hypos.to('cpu')
        active_hypos = active_hypos.to(dtype=torch.long)
        active_bbsz_idx = active_bbsz_idx.to('cpu')
        active_bbsz_idx = active_bbsz_idx.view(-1)
        active_scores = active_scores.to('cpu')
        active_scores = active_scores.view(-1)

        # copy tokens and scores for active hypotheses

        # Set the tokens for each beam (can select the same row more than once)
        tokens = tokens.to('cpu')
        tokens[:, :step + 1] = torch.index_select(tokens[:, :step + 1],
                                                  dim=0,
                                                  index=active_bbsz_idx)
        cand_indices = cand_indices.to('cpu')
        # Select the next token for each of them
        tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(
            cand_indices, dim=1, index=active_hypos)
        tokens = tokens.to('hpu')
        cand_indices = cand_indices.to('hpu')

        scores = scores.to('cpu')
        if step > 0:
            scores[:, :step] = torch.index_select(scores[:, :step],
                                                  dim=0,
                                                  index=active_bbsz_idx)
        scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(
            cand_scores, dim=1, index=active_hypos)
        scores = scores.to('hpu')
        cand_scores = cand_scores.to('hpu')
        active_hypos = active_hypos.to('hpu')

        # Update constraints based on which candidates were selected for the next beam
        self.search.update_constraints(active_hypos)

        # copy attention for active hypotheses
        if attn is not None:
            active_bbsz_idx = active_bbsz_idx.to('hpu')
            attn[:, :, :step + 2] = torch.index_select(
                attn[:, :, :step + 2], dim=0, index=active_bbsz_idx)

        # reorder incremental state in decoder
        reorder_state = active_bbsz_idx

    # sort by score descending
    for sent in range(len(finalized)):
        scores = torch.tensor(
            [float(elem["score"].item()) for elem in finalized[sent]])
        _, sorted_scores_indices = torch.sort(scores, descending=True)
        finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices]
        finalized[sent] = torch.jit.annotate(List[Dict[str, Tensor]],
                                             finalized[sent])
    if use_lazy_mode:
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()
    return finalized

def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch,
                    print_freq, apex=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ", device=device)
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('img/s', utils.SmoothedValue(window_size=10, fmt='{value}'))

    header = 'Epoch: [{}]'.format(epoch)
    step_count = 0
    last_print_time = time.time()
    for image, target in metric_logger.log_every(data_loader, print_freq, header):
        image, target = image.to(device, non_blocking=True), target.to(device, non_blocking=True)
        dl_ex_start_time = time.time()
        if args.channels_last:
            image = image.contiguous(memory_format=torch.channels_last)
            if args.run_lazy_mode:
                # This mark_step is added so that the lazy kernel can
                # create and evaluate the graph to infer the resulting
                # tensor as channels_last
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()
        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()
        # This extra mark_step yields a measurable performance gain for mobilenet_v2.
        if args.run_lazy_mode and 'mobilenet_v2' in args.model:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        if apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        optimizer.step()
        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if step_count % print_freq == 0:
            output_cpu = output.detach().to('cpu')
            acc1, acc5 = utils.accuracy(output_cpu, target, topk=(1, 5))
            batch_size = image.shape[0]
            metric_logger.update(loss=loss.item(),
                                 lr=optimizer.param_groups[0]["lr"])
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size * print_freq)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size * print_freq)
            current_time = time.time()
            last_print_time = dl_ex_start_time if args.dl_time_exclude else last_print_time
            metric_logger.meters['img/s'].update(
                batch_size * print_freq / (current_time - last_print_time))
            last_print_time = time.time()

        step_count = step_count + 1
        if step_count >= args.num_train_steps:
            break

def train(args, train_dataset, model, tokenizer, teacher=None):
    """Train the model"""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    if args.hpu and args.optimizer == "FusedAdamW":
        try:
            from habana_frameworks.torch.hpex.optimizers import FusedAdamW
        except ImportError:
            raise ImportError("Please install habana_torch.")
        optimizer = FusedAdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            eps=args.adam_epsilon,
        )
    elif args.optimizer == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.use_lazy_mode:
        try:
            import habana_frameworks.torch.core as htcore
        except ImportError:
            assert False, "Could not import habana_frameworks.torch.core"

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        if args.hpu:
            # Distributed DataParallel for HPU
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                bucket_cap_mb=230,
                find_unused_parameters=False,
                gradient_as_bucket_view=True,
                broadcast_buffers=False)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                find_unused_parameters=True)

    # Train!
    if args.local_rank in [-1, 0]:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d",
                    args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            args.train_batch_size * args.gradient_accumulation_steps *
            (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    loss_list = []
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)
            if args.local_rank in [-1, 0]:
                logger.info(
                    "  Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("  Continuing training from epoch %d", epochs_trained)
                logger.info("  Continuing training from global step %d", global_step)
                logger.info("  Will skip the first %d steps in the first epoch",
                            steps_trained_in_current_epoch)
        except ValueError:
            if args.local_rank in [-1, 0]:
                logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)
    if args.use_lazy_mode:
        htcore.mark_step()

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0],
                              smoothing=1)
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            if teacher is not None:
                teacher.eval()
            try:
                from habana_frameworks.torch.hpex.normalization import FusedClipNorm
            except ImportError:
                raise ImportError("Please install habana_torch.")
            FusedNorm = FusedClipNorm(model.parameters(), args.max_grad_norm)
            if args.hpu:
                batch = [
                    b.type(torch.IntTensor) if b.dtype == torch.int64 else b
                    for b in batch
                ]
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
            outputs = model(**inputs)
            loss, start_logits_stu, end_logits_stu = outputs

            # Distillation loss
            if teacher is not None:
                if "token_type_ids" not in inputs:
                    inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2]
                with torch.no_grad():
                    start_logits_tea, end_logits_tea = teacher(
                        input_ids=inputs["input_ids"],
                        token_type_ids=inputs["token_type_ids"],
                        attention_mask=inputs["attention_mask"],
                    )
                assert start_logits_tea.size() == start_logits_stu.size()
                assert end_logits_tea.size() == end_logits_stu.size()

                loss_fct = nn.KLDivLoss(reduction="batchmean")
                loss_start = (loss_fct(
                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
                    F.softmax(start_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2))
                loss_end = (loss_fct(
                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
                    F.softmax(end_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2))
                loss_ce = (loss_start + loss_end) / 2.0

                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if args.use_lazy_mode:
                htcore.mark_step()

            loss_list.append(loss)
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    if args.hpu:
                        if args.optimizer == "FusedAdamW":
                            FusedNorm.clip_norm(model.parameters())
                        else:
                            if args.hmp:
                                from habana_frameworks.torch.hpex import hmp
                                with hmp.disable_casts():
                                    torch.nn.utils.clip_grad_norm_(
                                        model.parameters(), args.max_grad_norm)
                            else:
                                torch.nn.utils.clip_grad_norm_(
                                    model.parameters(), args.max_grad_norm)

                if args.hpu and args.hmp and not (args.optimizer == "FusedAdamW"):
                    from habana_frameworks.torch.hpex import hmp
                    with hmp.disable_casts():
                        optimizer.step()
                else:
                    optimizer.step()
                if args.use_lazy_mode:
                    htcore.mark_step()

                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    for loss_t in loss_list:
                        tr_loss += loss_t.item()
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss",
                                         (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss
                    loss_list.clear()

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir,
                                              "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    if args.hpu:
                        d = next(model_to_save.parameters()).device
                        if d != torch.device("cpu"):
                            import copy
                            model_to_save_clone = copy.deepcopy(model_to_save)
                            model_to_save_clone.to(torch.device("cpu"))
                            model_to_save_clone.save_pretrained(output_dir)
                            torch.save(model_to_save_clone.state_dict(),
                                       os.path.join(output_dir, "model.pt"))
                    else:
                        model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    if args.hpu:
                        param_groups_copy = optimizer.state_dict()['param_groups']
                        state_dict_copy = {}
                        for st_key, st_val in optimizer.state_dict()['state'].items():
                            st_val_copy = {}
                            for k, v in st_val.items():
                                st_val_copy[k] = v.to('cpu') if isinstance(v, torch.Tensor) else v
                            state_dict_copy[st_key] = st_val_copy
                        optim_dict = {}
                        optim_dict['state'] = state_dict_copy
                        optim_dict['param_groups'] = param_groups_copy
                        torch.save(optim_dict, os.path.join(output_dir, "optimizer.pt"))
                    else:
                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) if args.use_lazy_mode: try: import habana_frameworks.torch.core as htcore except ImportError: assert False, "Could Not import habana_frameworks.torch.core" args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! if args.local_rank in [-1, 0]: logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = {"input_ids": batch[0], "attention_mask": batch[1]} if args.model_type != "distilbert": inputs[ "token_type_ids"] = None if args.model_type == "xlm" else batch[ 2] # XLM don't use segment_ids example_indices = batch[3] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) if args.use_lazy_mode: htcore.mark_step() for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. 
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    if args.local_rank in [-1, 0]:
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir,
                                     "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            model.config.start_n_top,
            model.config.end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results

def mark_step(use_hpu=True):
    if not use_hpu:
        return
    import habana_frameworks.torch.core as htcore
    htcore.mark_step()

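# Hedged usage sketch (an assumption, not from the original source): because
# the guard above makes mark_step a no-op off-HPU, call sites can stay
# device-agnostic. `use_hpu` would come from the caller's configuration.
def optimizer_step_sketch(loss, optimizer, use_hpu):
    loss.backward()
    mark_step(use_hpu)   # flush the backward graph before the optimizer update
    optimizer.step()
    mark_step(use_hpu)   # flush the graph built by the optimizer update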