def validation(self, val_loader):
    """Run one validation pass on a TPU core and return running meters.

    Batches are consumed as-is (no explicit device transfer here — the
    loader is presumably an XLA parallel loader that already placed the
    tensors on the device; confirm against the caller).

    Args:
        val_loader: iterable yielding ``(images, targets)`` batches.

    Returns:
        tuple: ``(summary_loss, final_scores)`` — an AverageMeter with the
        running loss and a RocAucMeter with the running AUC.
    """
    self.model.eval()
    summary_loss = AverageMeter()
    final_scores = RocAucMeter()
    t = time.time()
    for step, (images, targets) in enumerate(val_loader):
        with torch.no_grad():
            batch_size = images.shape[0]
            outputs = self.model(images)
            loss = self.criterion(outputs, targets)
            try:
                final_scores.update(targets, outputs)
            except Exception:
                # Best-effort AUC update (presumably fails when a batch
                # contains a single class — TODO confirm). Was a bare
                # except:, which would also swallow KeyboardInterrupt.
                if step % (self.config.verbose_step * 2) == 0:
                    xm.master_print("final_scores update failed...")
            summary_loss.update(loss.detach().item(), batch_size)
            if self.config.verbose:
                if step % (self.config.verbose_step * 2) == 0:
                    xm.master_print(
                        f"::: Valid Step({step}/{len(val_loader)}) | Loss: {summary_loss.avg:.4f} | AUC: {final_scores.avg:.4f} | Time: {int((time.time() - t))}s"
                    )
    return summary_loss, final_scores
def validation(self, val_loader):
    """Run one validation pass on ``self.device`` and return running meters.

    Unlike the xla variant, this one moves each batch to ``self.device``
    itself and logs with plain ``print``.

    Args:
        val_loader: iterable yielding ``(images, targets)`` batches.

    Returns:
        tuple: ``(summary_loss, final_scores)`` — an AverageMeter with the
        running loss and a RocAucMeter with the running AUC.
    """
    self.model.eval()
    summary_loss = AverageMeter()
    final_scores = RocAucMeter()
    t = time.time()
    for step, (images, targets) in enumerate(val_loader):
        # Progress is printed before processing the batch, so the meters
        # shown reflect batches completed so far.
        if self.config.verbose:
            if step % self.config.verbose_step == 0:
                print(
                    f"::: Valid Step({step}/{len(val_loader)}) | Loss: {summary_loss.avg:.4f} | AUC: {final_scores.avg:.4f} | Time: {int((time.time() - t))}s"
                )
        with torch.no_grad():
            targets = targets.to(self.device)
            batch_size = images.shape[0]
            images = images.to(self.device).float()
            outputs = self.model(images)
            loss = self.criterion(outputs, targets)
            try:
                final_scores.update(targets, outputs)
            except Exception:
                # Best-effort AUC update (presumably fails when a batch
                # contains a single class — TODO confirm). Was a bare
                # except:, which would also swallow KeyboardInterrupt.
                pass
            summary_loss.update(loss.detach().item(), batch_size)
    return summary_loss, final_scores
def train_one_epoch(self, train_loader):
    """Train for one epoch on a TPU core and return running meters.

    Uses ``xm.optimizer_step`` (which also marks the xla step) instead of
    ``optimizer.step()``. Gradient clipping is intentionally disabled here
    (see the commented call in the original).

    Args:
        train_loader: iterable yielding ``(images, targets)`` batches.

    Returns:
        tuple: ``(summary_loss, final_scores)`` — an AverageMeter with the
        running loss and a RocAucMeter with the running AUC.
    """
    self.model.train()
    summary_loss = AverageMeter()
    final_scores = RocAucMeter()
    t = time.time()
    for step, (images, targets) in enumerate(train_loader):
        t0 = time.time()
        batch_size = images.shape[0]
        outputs = self.model(images)
        self.optimizer.zero_grad()
        loss = self.criterion(outputs, targets)
        loss.backward()  # compute and sum gradients on params
        # torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=global_config.CLIP_GRAD_NORM)
        xm.optimizer_step(self.optimizer)
        if self.config.step_scheduler:
            self.scheduler.step()
        try:
            final_scores.update(targets, outputs)
        except Exception:
            # Best-effort AUC update (presumably fails when a batch
            # contains a single class — TODO confirm). Was a bare except:.
            pass
        summary_loss.update(loss.detach().item(), batch_size)
        if self.config.verbose:
            if step % self.config.verbose_step == 0:
                t1 = time.time()
                effNet_lr = np.format_float_scientific(
                    self.optimizer.param_groups[0]['lr'],
                    unique=False, precision=1)
                # Fixed copy-paste bug: head_lr previously read
                # param_groups[0], same as effNet_lr. param_groups[-1] is
                # identical when there is a single group and reports the
                # head's group when the head params are registered last.
                head_lr = np.format_float_scientific(
                    self.optimizer.param_groups[-1]['lr'],
                    unique=False, precision=1)
                xm.master_print(
                    f":::({str(step).rjust(4, ' ')}/{len(train_loader)}) | Loss: {summary_loss.avg:.4f} | AUC: {final_scores.avg:.5f} | LR: {effNet_lr}/{head_lr} | BTime: {t1-t0 :.2f}s | ETime: {int((t1-t0)*(len(train_loader)-step)//60)}m"
                )
    return summary_loss, final_scores
def train_one_epoch(self, train_loader):
    """Train for one epoch on ``self.device`` and return running meters.

    Supports two modes, chosen by ``global_config.ACCUMULATION_STEP``:

    * accumulation > 1: gradients are summed over ACCUMULATION_STEP
      batches, then a single ``optimizer.step()``/``zero_grad()`` is taken.
      NOTE(review): the loss is NOT divided by ACCUMULATION_STEP (the
      normalizing line is commented out in the original), so accumulated
      gradients are scaled by the number of accumulated batches — confirm
      this is intended. Gradient clipping is disabled in this mode.
    * accumulation <= 1: a plain step per batch, with gradient clipping to
      ``global_config.CLIP_GRAD_NORM``.

    In both modes, apex mixed precision is used when ``global_config.FP16``
    (``amp.scale_loss`` replaces the plain ``loss.backward()``; clipping
    operates on ``amp.master_params`` per the apex docs).

    Args:
        train_loader: iterable yielding ``(images, targets)`` batches.

    Returns:
        tuple: ``(summary_loss, final_scores)`` — an AverageMeter with the
        running loss and a RocAucMeter with the running AUC.
    """
    self.model.train()
    summary_loss = AverageMeter()
    final_scores = RocAucMeter()
    t = time.time()
    for step, (images, targets) in enumerate(train_loader):
        t0 = time.time()
        targets = targets.to(self.device)
        images = images.to(self.device).float()
        batch_size = images.shape[0]
        outputs = self.model(images)
        if global_config.ACCUMULATION_STEP > 1:
            loss = self.criterion(outputs, targets)
            # loss = loss / global_config.ACCUMULATION_STEP  # normalize loss (if averaged)
            if global_config.FP16:
                # https://nvidia.github.io/apex/advanced.html#gradient-clipping
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()  # in apex, loss.backward() becomes this
            else:
                loss.backward()  # compute and sum gradients on params
            if (step + 1) % global_config.ACCUMULATION_STEP == 0:
                print(f"Step: {step} accum_optimizing")
                # Clipping (between backward() and step()) is disabled here:
                # https://nvidia.github.io/apex/advanced.html#gradient-clipping
                # if config.FP16:
                #     torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=config.CLIP_GRAD_NORM)
                # else:
                #     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.CLIP_GRAD_NORM)
                self.optimizer.step()  # backprop according to accumulated losses
                self.optimizer.zero_grad()  # clear gradients
                if self.config.step_scheduler:
                    self.scheduler.step()  # scheduler.step() after opt.step() -> update LR
        else:
            self.optimizer.zero_grad()
            loss = self.criterion(outputs, targets)
            if global_config.FP16:
                # https://nvidia.github.io/apex/advanced.html#gradient-clipping
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()  # in apex, loss.backward() becomes this
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(self.optimizer),
                    max_norm=global_config.CLIP_GRAD_NORM)
            else:
                loss.backward()  # compute and sum gradients on params
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(),
                    max_norm=global_config.CLIP_GRAD_NORM)
            self.optimizer.step()
            if self.config.step_scheduler:
                self.scheduler.step()
        try:
            final_scores.update(targets, outputs)
        except Exception:
            # Best-effort AUC update (presumably fails when a batch
            # contains a single class — TODO confirm). Was a bare except:.
            pass
        summary_loss.update(loss.detach().item(), batch_size)
        if self.config.verbose:
            if step % self.config.verbose_step == 0:
                t1 = time.time()
                effNet_lr = np.format_float_scientific(
                    self.optimizer.param_groups[0]['lr'],
                    unique=False, precision=1)
                # Fixed copy-paste bug: head_lr previously read
                # param_groups[0], same as effNet_lr. param_groups[-1] is
                # identical when there is a single group and reports the
                # head's group when the head params are registered last.
                head_lr = np.format_float_scientific(
                    self.optimizer.param_groups[-1]['lr'],
                    unique=False, precision=1)
                print(
                    f":::({str(step).rjust(4, ' ')}/{len(train_loader)}) | Loss: {summary_loss.avg:.4f} | AUC: {final_scores.avg:.5f} | LR: {effNet_lr}/{head_lr} | BTime: {t1-t0 :.2f}s | ETime: {int((t1-t0)*(len(train_loader)-step)//60)}m"
                )
    return summary_loss, final_scores