# Assumed imports for the methods below; Timer, LossLogger, MetricLogger,
# reduce_dict, get_current_lr, and logger are project-local helpers whose
# module paths are not shown in this snippet.
import os

import torch
from torch.cuda import amp  # used by the GradScaler-based train_epoch


def train_epoch(self, scaler, epoch, model, dataloader, optimizer, prefix="train"):
    model.train()

    _timer = Timer()
    lossLogger = LossLogger()
    performanceLogger = MetricLogger(self.dictionary, self.cfg)

    for i, sample in enumerate(dataloader):
        imgs, targets = sample['image'], sample['target']
        _timer.tic()

        # zero the parameter gradients
        optimizer.zero_grad()

        # move the batch to the GPU; detection-style batches arrive as lists
        imgs = list(img.cuda() for img in imgs) if isinstance(imgs, list) else imgs.cuda()
        if isinstance(targets, list):
            if isinstance(targets[0], torch.Tensor):
                targets = [t.cuda() for t in targets]
            else:
                targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
        else:
            targets = targets.cuda()

        # Autocast: run the forward pass in mixed precision
        with amp.autocast(enabled=True):
            out = model(imgs, targets, prefix)
        if not isinstance(out, tuple):
            losses, predicts = out, None
        else:
            losses, predicts = out

        self.n_iters_elapsed += 1

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(losses["loss"]).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called;
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for the next iteration.
        scaler.update()

        # torch.cuda.synchronize()
        _timer.toc()

        if (i + 1) % self.cfg.N_ITERS_TO_DISPLAY_STATUS == 0:
            if self.cfg.distributed:
                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = reduce_dict(losses)
                lossLogger.update(**loss_dict_reduced)
                del loss_dict_reduced
            else:
                lossLogger.update(**losses)

            if predicts is not None:
                if self.cfg.distributed:
                    # reduce predictions over all GPUs for logging purposes
                    predicts_dict_reduced = reduce_dict(predicts)
                    performanceLogger.update(targets, predicts_dict_reduced)
                    del predicts_dict_reduced
                else:
                    # same (targets, predicts) signature as the distributed
                    # branch and as val_epoch below
                    performanceLogger.update(targets, predicts)
                del predicts

            if self.cfg.local_rank == 0:
                template = "[epoch {}/{}, iter {}, lr {}] Total train loss: {:.4f} (ips = {:.2f})\n{}"
                logger.info(
                    template.format(
                        epoch,
                        self.cfg.N_MAX_EPOCHS,
                        i,
                        round(get_current_lr(optimizer), 6),
                        lossLogger.meters["loss"].value,
                        self.batch_size * self.cfg.N_ITERS_TO_DISPLAY_STATUS / _timer.diff,
                        "\n".join(
                            "{}: {:.4f}".format(n, l.value)
                            for n, l in lossLogger.meters.items() if n != "loss"
                        ),
                    ))

        del imgs, targets, losses

    if self.cfg.TENSORBOARD and self.cfg.local_rank == 0:
        # Logging train losses
        for n, l in lossLogger.meters.items():
            self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l.global_avg, epoch)
        performances = performanceLogger.compute()
        if len(performances):
            for k, v in performances.items():
                self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v, epoch)

    if self.cfg.TENSORBOARD_WEIGHT and False:  # histogram logging intentionally disabled
        for name, param in model.named_parameters():
            layer, attr = os.path.splitext(name)
            attr = attr[1:]
            self.tb_writer.add_histogram("{}/{}".format(layer, attr), param, epoch)
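# The autocast / GradScaler choreography in train_epoch above is the standard
# torch.cuda.amp recipe. Below is a minimal, self-contained sketch of the same
# step as a standalone function; model, optimizer, criterion, and the batch
# are placeholders, not names from this repo.

def amp_train_step_sketch(model, optimizer, scaler, imgs, targets, criterion):
    """One mixed-precision step: forward under autocast, backward on the
    scaled loss, conditional optimizer step, then loss-scale update."""
    optimizer.zero_grad()
    with amp.autocast(enabled=True):
        # forward pass (and loss) run in the dtypes autocast selects
        loss = criterion(model(imgs), targets)
    # gradients are produced at the current loss scale
    scaler.scale(loss).backward()
    # step() unscales first and skips optimizer.step() on inf/NaN gradients
    scaler.step(optimizer)
    # grow or shrink the loss scale for the next iteration
    scaler.update()
    return loss.detach()

# The matching scaler is created once per run (scaler = amp.GradScaler())
# and handed to train_epoch as its `scaler` argument.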
def val_epoch(self, epoch, model, dataloader, prefix="val"):
    model.eval()

    lossLogger = LossLogger()
    performanceLogger = MetricLogger(self.dictionary, self.cfg)

    with torch.no_grad():
        for sample in dataloader:
            imgs, targets = sample['image'], sample['target']

            # move the batch to the GPU; detection-style batches arrive as lists
            imgs = list(img.cuda() for img in imgs) if isinstance(imgs, list) else imgs.cuda()
            if isinstance(targets, list):
                if isinstance(targets[0], torch.Tensor):
                    targets = [t.cuda() for t in targets]
                else:
                    targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
            else:
                targets = targets.cuda()

            losses, predicts = model(imgs, targets, prefix)

            if self.cfg.distributed:
                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = reduce_dict(losses)
                lossLogger.update(**loss_dict_reduced)
                del loss_dict_reduced
            else:
                lossLogger.update(**losses)

            if predicts is not None:
                if self.cfg.distributed:
                    # reduce predictions over all GPUs for logging purposes
                    predicts_dict_reduced = reduce_dict(predicts)
                    performanceLogger.update(targets, predicts_dict_reduced)
                    del predicts_dict_reduced
                else:
                    performanceLogger.update(targets, predicts)
                del predicts

            del imgs, targets, losses

    performances = performanceLogger.compute()

    if self.cfg.TENSORBOARD and self.cfg.local_rank == 0:
        # Logging val losses
        for n, l in lossLogger.meters.items():
            self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l.global_avg, epoch)
        if len(performances):
            # Logging val performances
            for k, v in performances.items():
                self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v, epoch)

    if self.cfg.local_rank == 0:
        template = "[epoch {}] Total {} loss: {:.4f}\n{}"
        logger.info(
            template.format(
                epoch,
                prefix,
                lossLogger.meters["loss"].global_avg,
                "\n".join(
                    "{}: {:.4f}".format(n, l.global_avg)
                    for n, l in lossLogger.meters.items() if n != "loss"
                ),
            ))

        perf_log_str = f"\n------------ Performances ({prefix}) ----------\n"
        for k, v in performances.items():
            perf_log_str += "{}: {:.4f}\n".format(k, v)
        perf_log_str += "------------------------------------\n"
        logger.info(perf_log_str)

    acc = performances['performance']
    return acc
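# Both epochs log through reduce_dict to average per-GPU loss/metric dicts.
# Its definition is not part of this snippet; the sketch below follows the
# torchvision detection references, so treat it as an assumption about what
# this repo's helper does, not its actual source.

import torch.distributed as dist

def reduce_dict_sketch(input_dict, average=True):
    """All-reduce a dict of 0-dim tensors across ranks so every process logs
    the same (averaged) values. Keys are sorted so all ranks stack values in
    the same order."""
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[k] for k in names], dim=0)
        dist.all_reduce(values)  # sums the stacked values over all ranks
        if average:
            values /= world_size
        return {k: v for k, v in zip(names, values)}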
def train_epoch(self, epoch, model, dataloader, optimizer, lr_scheduler, grad_normalizer=None, prefix="train"):
    # Older apex-based variant of train_epoch; assumes the model and optimizer
    # were wrapped with apex's amp.initialize at startup (see sketch below).
    model.train()

    _timer = Timer()
    lossLogger = MetricLogger(delimiter=" ")
    performanceLogger = MetricLogger(delimiter=" ")

    for i, (imgs, targets) in enumerate(dataloader):
        _timer.tic()

        # zero the parameter gradients
        optimizer.zero_grad()

        # move the batch to the GPU; detection-style batches arrive as lists
        imgs = list(img.cuda() for img in imgs) if isinstance(imgs, list) else imgs.cuda()
        if isinstance(targets, list):
            if isinstance(targets[0], torch.Tensor):
                targets = [t.cuda() for t in targets]
            else:
                targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
        else:
            targets = targets.cuda()

        out = model(imgs, targets, prefix)
        if not isinstance(out, tuple):
            losses, performances = out, None
        else:
            losses, performances = out

        self.n_iters_elapsed += 1

        # apex amp: scale the loss before backward so gradients carry the loss scale
        with amp.scale_loss(losses["loss"], optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()

        torch.cuda.synchronize()
        _timer.toc()

        if (i + 1) % self.cfg.N_ITERS_TO_DISPLAY_STATUS == 0:
            if self.cfg.distributed:
                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = reduce_dict(losses)
                lossLogger.update(**loss_dict_reduced)
            else:
                lossLogger.update(**losses)

            if performances is not None and all(performances):
                if self.cfg.distributed:
                    # reduce performances over all GPUs for logging purposes
                    performance_dict_reduced = reduce_dict(performances)
                    performanceLogger.update(**performance_dict_reduced)
                else:
                    performanceLogger.update(**performances)

            if self.cfg.local_rank == 0:
                template = "[epoch {}/{}, iter {}, lr {}] Total train loss: {:.4f} (ips = {:.2f})\n{}"
                logger.info(
                    template.format(
                        epoch,
                        self.cfg.N_MAX_EPOCHS,
                        i,
                        round(get_current_lr(optimizer), 6),
                        lossLogger.meters["loss"].value,
                        self.batch_size * self.cfg.N_ITERS_TO_DISPLAY_STATUS / _timer.total_time,
                        "\n".join(
                            "{}: {:.4f}".format(n, l.value)
                            for n, l in lossLogger.meters.items() if n != "loss"
                        ),
                    ))

        del imgs, targets

    if self.cfg.TENSORBOARD and self.cfg.local_rank == 0:
        # Logging train losses
        for n, l in lossLogger.meters.items():
            self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l.global_avg, epoch)
        if len(performanceLogger.meters):
            for k, v in performanceLogger.meters.items():
                self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v.global_avg, epoch)

    if self.cfg.TENSORBOARD_WEIGHT and False:  # histogram logging intentionally disabled
        for name, param in model.named_parameters():
            layer, attr = os.path.splitext(name)
            attr = attr[1:]
            self.tb_writer.add_histogram("{}/{}".format(layer, attr), param, epoch)
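# Unlike the GradScaler version at the top of this file, the train_epoch above
# relies on NVIDIA apex's amp, whose scale_loss context only works after the
# model and optimizer have been wrapped once at startup. A minimal setup
# sketch follows; the function name and opt_level are illustrative, not taken
# from this repo.

from apex import amp as apex_amp  # pre-torch.cuda.amp mixed-precision library

def build_apex_training_sketch(model, optimizer, opt_level="O1"):
    """One-time apex setup assumed by the apex-based train_epoch:
    amp.initialize patches model/optimizer so amp.scale_loss can manage
    loss scaling during backward."""
    model = model.cuda()  # apex requires the model on GPU before initialize
    model, optimizer = apex_amp.initialize(model, optimizer, opt_level=opt_level)
    return model, optimizer

# apex's amp has since been deprecated in favor of torch.cuda.amp, which is
# what the GradScaler-based train_epoch above migrates to.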