import os
import pathlib
import shutil
import time
from pprint import pformat

import cv2
import numpy as np
import torch
import torchvision.utils as vutils
from torch import nn
from torchvision import transforms
from tqdm import tqdm

# Project-local helpers used below (import paths assumed, adjust to this
# repo's layout): setup_logger, patch_replication_callback, runningScore,
# cal_text_score, cal_kernel_score, decode, cal_recall_precison_f1.


class BaseTrainer:
    def __init__(self, config, model, criterion, train_loader, weights_init):
        config['trainer']['output_dir'] = os.path.join(
            str(pathlib.Path(os.path.abspath(__name__)).parent),
            config['trainer']['output_dir'])
        config['name'] = config['name'] + '_' + model.name
        self.save_dir = os.path.join(config['trainer']['output_dir'], config['name'])
        self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint')

        if config['trainer']['resume_checkpoint'] == '' and config['trainer']['finetune_checkpoint'] == '':
            shutil.rmtree(self.save_dir, ignore_errors=True)
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.global_step = 0
        self.start_epoch = 1
        self.config = config
        self.model = model
        self.criterion = criterion
        self.train_loader = train_loader

        # logger and tensorboard
        self.tensorboard_enable = self.config['trainer']['tensorboard']
        self.epochs = self.config['trainer']['epochs']
        self.display_interval = self.config['trainer']['display_interval']
        if self.tensorboard_enable:
            from torch.utils.tensorboard import SummaryWriter
            self.writer = SummaryWriter(self.save_dir)

        self.logger = setup_logger(os.path.join(self.save_dir, 'train_log'))
        self.logger.info(pformat(self.config))

        # device and random seeds
        torch.manual_seed(self.config['trainer']['seed'])  # set random seed for the CPU
        if len(self.config['trainer']['gpus']) > 0 and torch.cuda.is_available():
            self.with_cuda = True
            torch.backends.cudnn.benchmark = True
            self.logger.info('train with gpu {} and pytorch {}'.format(
                self.config['trainer']['gpus'], torch.__version__))
            self.gpus = {i: item for i, item in enumerate(self.config['trainer']['gpus'])}
            self.device = torch.device("cuda:0")
            torch.cuda.manual_seed(self.config['trainer']['seed'])      # set random seed for the current GPU
            torch.cuda.manual_seed_all(self.config['trainer']['seed'])  # set random seed for all GPUs
        else:
            self.with_cuda = False
            self.logger.info('train with cpu and pytorch {}'.format(torch.__version__))
            self.device = torch.device("cpu")
        self.logger.info('device {}'.format(self.device))

        self.metrics = {
            'recall': 0,
            'precision': 0,
            'hmean': 0,
            'train_loss': float('inf'),
            'best_model': ''
        }

        self.optimizer = self._initialize('optimizer', torch.optim, self.model.parameters())

        if self.config['lr_scheduler']['type'] != 'PolynomialLR':
            self.scheduler = self._initialize('lr_scheduler', torch.optim.lr_scheduler, self.optimizer)
        else:
            self.scheduler = PolynomialLR(self.optimizer, self.epochs * len(self.train_loader))

        if self.config['trainer']['resume_checkpoint'] != '':
            self._load_checkpoint(self.config['trainer']['resume_checkpoint'], resume=True)
        elif self.config['trainer']['finetune_checkpoint'] != '':
            self._load_checkpoint(self.config['trainer']['finetune_checkpoint'], resume=False)
        else:
            if weights_init is not None:
                self.model.apply(weights_init)

        # single machine, multiple GPUs
        num_gpus = torch.cuda.device_count()
        if num_gpus > 1:
            self.model = nn.DataParallel(self.model)
            # for sync bn
            patch_replication_callback(self.model)
        self.model.to(self.device)

        if self.tensorboard_enable:
            try:
                # add the model graph
                dummy_input = torch.zeros(
                    1,
                    self.config['data_loader']['args']['dataset']['img_channel'],
                    self.config['data_loader']['args']['dataset']['input_size'],
                    self.config['data_loader']['args']['dataset']['input_size']).to(self.device)
                self.writer.add_graph(self.model, dummy_input)
            except Exception as e:
                self.logger.warn('add graph to tensorboard failed, error [{}]'.format(e))

    def train(self):
        """
        Full training logic
        """
        for epoch in range(self.start_epoch, self.epochs + 1):
            try:
                self.epoch_result = self._train_epoch(epoch)
                if self.config['lr_scheduler']['type'] != 'PolynomialLR':
                    self.scheduler.step()
                self._on_epoch_finish()
            except torch.cuda.CudaError:
                self._log_memory_usage()
        if self.tensorboard_enable:
            self.writer.close()
        self._on_train_finish()

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: current epoch number
        """
        raise NotImplementedError

    def _eval(self):
        """
        Evaluation logic run after an epoch
        """
        raise NotImplementedError

    def _on_epoch_finish(self):
        raise NotImplementedError

    def _on_train_finish(self):
        raise NotImplementedError

    def _log_memory_usage(self):
        if not self.with_cuda:
            return

        template = """Memory Usage: \n{}"""
        usage = []
        for deviceID, device in self.gpus.items():
            deviceID = int(deviceID)
            allocated = torch.cuda.memory_allocated(deviceID) / (1024 * 1024)
            # memory_cached was renamed to memory_reserved in newer PyTorch
            cached = torch.cuda.memory_cached(deviceID) / (1024 * 1024)
            usage.append('    CUDA: {}  Allocated: {} MB  Cached: {} MB \n'.format(device, allocated, cached))
        content = ''.join(usage)
        content = template.format(content)
        self.logger.debug(content)

    def _save_checkpoint(self, epoch, file_name, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param file_name: checkpoint file name
        :param save_best: if True, also copy the checkpoint to 'model_best.pth'
        """
        state = {
            'epoch': epoch,
            'global_step': self.global_step,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'config': self.config,
            'metrics': self.metrics
        }
        filename = os.path.join(self.checkpoint_dir, file_name)
        torch.save(state, filename)
        if save_best:
            shutil.copy(filename, os.path.join(self.checkpoint_dir, 'model_best.pth'))
            self.logger.info("Saving current best: {}".format(file_name))
        else:
            self.logger.info("Saving checkpoint: {}".format(filename))

    def _load_checkpoint(self, checkpoint_path, resume):
        """
        Resume or finetune from a saved checkpoint

        :param checkpoint_path: checkpoint path to be loaded
        :param resume: if True, also restore the training state (step, epoch)
        """
        self.logger.info("Loading checkpoint: {} ...".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        # strip a possible 'module.' prefix left by nn.DataParallel
        state_dict = {}
        for k, v in checkpoint['state_dict'].items():
            state_dict[k.replace('module.', '')] = v
        self.model.load_state_dict(state_dict)
        if resume:
            self.global_step = checkpoint['global_step']
            self.start_epoch = checkpoint['epoch'] + 1
            self.config['lr_scheduler']['args']['last_epoch'] = self.start_epoch
            # self.scheduler.load_state_dict(checkpoint['scheduler'])
            # self.optimizer.load_state_dict(checkpoint['optimizer'])
            # if 'metrics' in checkpoint:
            #     self.metrics = checkpoint['metrics']
            if self.with_cuda:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.to(self.device)
            self.logger.info("resume from checkpoint {} (epoch {})".format(checkpoint_path, self.start_epoch))
        else:
            self.logger.info("finetune from checkpoint {}".format(checkpoint_path))

    def _initialize(self, name, module, *args, **kwargs):
        module_name = self.config[name]['type']
        module_args = self.config[name]['args']
        assert all(k not in module_args for k in kwargs), \
            'Overwriting kwargs given in config file is not allowed'
        module_args.update(kwargs)
        return getattr(module, module_name)(*args, **module_args)
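# ---------------------------------------------------------------------------
# PolynomialLR is referenced above but not defined in this excerpt. Below is a
# minimal sketch compatible with the calls PolynomialLR(optimizer, max_iter)
# and scheduler.step(); the power=0.9 default follows the common "poly" decay
# policy and is an assumption, not necessarily what this repo implements.
# ---------------------------------------------------------------------------
from torch.optim.lr_scheduler import _LRScheduler


class PolynomialLR(_LRScheduler):
    """Poly decay: lr = base_lr * (1 - iter / max_iter) ** power."""

    def __init__(self, optimizer, max_iter, power=0.9, last_epoch=-1):
        self.max_iter = max_iter
        self.power = power
        super(PolynomialLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        # last_epoch counts step() calls; clamp so the factor never goes negative
        factor = (1 - min(self.last_epoch, self.max_iter) / self.max_iter) ** self.power
        return [base_lr * factor for base_lr in self.base_lrs]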
class Trainer(BaseTrainer):
    def __init__(self, config, model, criterion, train_loader, weights_init=None):
        # pass train_loader through; the base __init__ needs it for PolynomialLR
        super(Trainer, self).__init__(config, model, criterion, train_loader, weights_init)
        self.show_images_interval = self.config['trainer']['show_images_interval']
        self.test_path = self.config['data_loader']['args']['dataset']['val_data_path']
        self.train_loader = train_loader
        self.train_loader_len = len(train_loader)
        if self.config['lr_scheduler']['type'] == 'PolynomialLR':
            self.scheduler = PolynomialLR(self.optimizer, self.epochs * self.train_loader_len)
        self.logger.info('train dataset has {} samples, {} in dataloader'.format(
            self.train_loader.dataset_len, self.train_loader_len))

    def _train_epoch(self, epoch):
        self.model.train()
        epoch_start = time.time()
        batch_start = time.time()
        train_loss = 0.
        running_metric_text = runningScore(2)
        running_metric_kernel = runningScore(2)
        lr = self.optimizer.param_groups[0]['lr']
        for i, (images, labels, training_masks) in enumerate(self.train_loader):
            if i >= self.train_loader_len:
                break
            self.global_step += 1
            lr = self.optimizer.param_groups[0]['lr']

            # move the batch to the training device
            cur_batch_size = images.size()[0]
            images = images.to(self.device)
            labels = labels.to(self.device)
            training_masks = training_masks.to(self.device)

            preds = self.model(images)
            loss_all, loss_tex, loss_ker, loss_agg, loss_dis = self.criterion(preds, labels, training_masks)
            # backward
            self.optimizer.zero_grad()
            loss_all.backward()
            self.optimizer.step()
            if self.config['lr_scheduler']['type'] == 'PolynomialLR':
                self.scheduler.step()

            # acc and iou
            score_text = cal_text_score(preds[:, 0, :, :], labels[:, 0, :, :],
                                        training_masks, running_metric_text)
            score_kernel = cal_kernel_score(preds[:, 1, :, :], labels[:, 1, :, :],
                                            labels[:, 0, :, :], training_masks, running_metric_kernel)

            # record loss and acc in the log
            loss_all = loss_all.item()
            loss_tex = loss_tex.item()
            loss_ker = loss_ker.item()
            loss_agg = loss_agg.item()
            loss_dis = loss_dis.item()
            train_loss += loss_all

            acc = score_text['Mean Acc']
            iou_text = score_text['Mean IoU']
            iou_kernel = score_kernel['Mean IoU']

            if (i + 1) % self.display_interval == 0:
                batch_time = time.time() - batch_start
                self.logger.info(
                    '[{}/{}], [{}/{}], global_step: {}, Speed: {:.1f} samples/sec, acc: {:.4f}, iou_text: {:.4f}, iou_kernel: {:.4f}, loss_all: {:.4f}, loss_tex: {:.4f}, loss_ker: {:.4f}, loss_agg: {:.4f}, loss_dis: {:.4f}, lr: {:.6}, time: {:.2f}'.format(
                        epoch, self.epochs, i + 1, self.train_loader_len, self.global_step,
                        self.display_interval * cur_batch_size / batch_time, acc, iou_text,
                        iou_kernel, loss_all, loss_tex, loss_ker, loss_agg, loss_dis, lr, batch_time))
                batch_start = time.time()

            if self.tensorboard_enable:
                # write scalars to tensorboard
                self.writer.add_scalar('TRAIN/LOSS/loss_all', loss_all, self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_tex', loss_tex, self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_ker', loss_ker, self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_agg', loss_agg, self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_dis', loss_dis, self.global_step)
                self.writer.add_scalar('TRAIN/ACC_IOU/acc', acc, self.global_step)
                self.writer.add_scalar('TRAIN/ACC_IOU/iou_text', iou_text, self.global_step)
                self.writer.add_scalar('TRAIN/ACC_IOU/iou_kernel', iou_kernel, self.global_step)
                self.writer.add_scalar('TRAIN/lr', lr, self.global_step)
                if i % self.show_images_interval == 0:
                    # show images on tensorboard
                    self.writer.add_images('TRAIN/imgs', images, self.global_step)
                    # text, kernel and training masks
                    gt_texts, gt_kernels = labels[:, 0, :, :], labels[:, 1, :, :]
                    gt_texts[gt_texts <= 0.5] = 0
                    gt_texts[gt_texts > 0.5] = 1
                    gt_kernels[gt_kernels <= 0.5] = 0
                    gt_kernels[gt_kernels > 0.5] = 1
                    show_label = torch.cat([gt_texts, gt_kernels, training_masks.float()])
                    show_label = vutils.make_grid(show_label.unsqueeze(1), nrow=cur_batch_size,
                                                  normalize=False, padding=20, pad_value=1)
                    self.writer.add_image('TRAIN/gt', show_label, self.global_step)
                    # model output
                    preds[:, :2, :, :] = torch.sigmoid(preds[:, :2, :, :])
                    show_pred = torch.cat([preds[:, 0, :, :], preds[:, 1, :, :]])
                    show_pred = vutils.make_grid(show_pred.unsqueeze(1), nrow=cur_batch_size,
                                                 normalize=False, padding=20, pad_value=1)
                    self.writer.add_image('TRAIN/preds', show_pred, self.global_step)

        return {
            'train_loss': train_loss / self.train_loader_len,
            'lr': lr,
            'time': time.time() - epoch_start,
            'epoch': epoch
        }

    def _eval(self):
        self.model.eval()
        # torch.cuda.empty_cache()  # speed up evaluating after training finished
        img_path = os.path.join(self.test_path, 'img')
        gt_path = os.path.join(self.test_path, 'gt')
        result_save_path = os.path.join(self.save_dir, 'result')
        if os.path.exists(result_save_path):
            shutil.rmtree(result_save_path, ignore_errors=True)
        if not os.path.exists(result_save_path):
            os.makedirs(result_save_path)

        short_size = 736
        # run prediction on all test images
        img_paths = [os.path.join(img_path, x) for x in os.listdir(img_path)]
        for img_path in tqdm(img_paths, desc='test models'):
            img_name = os.path.basename(img_path).split('.')[0]
            save_name = os.path.join(result_save_path, 'res_' + img_name + '.txt')

            assert os.path.exists(img_path), 'file does not exist'
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            h, w = img.shape[:2]
            scale = short_size / min(h, w)
            img = cv2.resize(img, None, fx=scale, fy=scale)
            # turn the (h, w, c) image into a (1, img_channel, h, w) tensor
            tensor = transforms.ToTensor()(img)
            tensor = tensor.unsqueeze_(0)
            tensor = tensor.to(self.device)
            with torch.no_grad():
                torch.cuda.synchronize(self.device)
                preds = self.model(tensor)[0]
                torch.cuda.synchronize(self.device)
                preds, boxes_list = decode(preds)
                scale = (preds.shape[1] / w, preds.shape[0] / h)
                if len(boxes_list):
                    boxes_list = boxes_list / scale
            np.savetxt(save_name, boxes_list.reshape(-1, 8), delimiter=',', fmt='%d')
        # compute recall, precision and f1
        result_dict = cal_recall_precison_f1(gt_path=gt_path, result_path=result_save_path)
        return result_dict['recall'], result_dict['precision'], result_dict['hmean']

    def _on_epoch_finish(self):
        self.logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
            self.epoch_result['epoch'], self.epochs, self.epoch_result['train_loss'],
            self.epoch_result['time'], self.epoch_result['lr']))
        # net_save_path is absolute, so os.path.join() in _save_checkpoint returns it unchanged
        net_save_path = '{}/PANNet_latest.pth'.format(self.checkpoint_dir)

        save_best = False
        if self.config['trainer']['metrics'] == 'hmean':  # use f1 as the best-model metric
            recall, precision, hmean = self._eval()

            if self.tensorboard_enable:
                self.writer.add_scalar('EVAL/recall', recall, self.global_step)
                self.writer.add_scalar('EVAL/precision', precision, self.global_step)
                self.writer.add_scalar('EVAL/hmean', hmean, self.global_step)
            self.logger.info('test: recall: {:.6f}, precision: {:.6f}, f1: {:.6f}'.format(
                recall, precision, hmean))

            if hmean > self.metrics['hmean']:
                save_best = True
                self.metrics['train_loss'] = self.epoch_result['train_loss']
                self.metrics['hmean'] = hmean
                self.metrics['precision'] = precision
                self.metrics['recall'] = recall
                self.metrics['best_model'] = net_save_path
        else:
            if self.epoch_result['train_loss'] < self.metrics['train_loss']:
                save_best = True
                self.metrics['train_loss'] = self.epoch_result['train_loss']
                self.metrics['best_model'] = net_save_path
        self._save_checkpoint(self.epoch_result['epoch'], net_save_path, save_best)

    def _on_train_finish(self):
        for k, v in self.metrics.items():
            self.logger.info('{}: {}'.format(k, v))
        self.logger.info('finish train')
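# ---------------------------------------------------------------------------
# Hedged usage sketch: the factories below (PANNet, PANLoss, get_dataloader)
# are assumed names standing in for this repo's actual constructors, not its
# real API; the config must provide every key accessed in the classes above.
# ---------------------------------------------------------------------------
# import anyconfig
# config = anyconfig.load('config.json')
# model = PANNet(config['arch'])                         # hypothetical model factory
# criterion = PANLoss(**config['loss'])                  # hypothetical; must return 5 loss terms
# train_loader = get_dataloader(config['data_loader'])   # hypothetical loader factory
# trainer = Trainer(config, model, criterion, train_loader)
# trainer.train()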
# Loss
iter_loss = criterion(logits, labels.to(device))

# Propagate backward (just compute gradients wrt the loss)
iter_loss /= CONFIG.SOLVER.ITER_SIZE
iter_loss.backward()
loss += float(iter_loss)

average_loss.add(loss)

# Update weights with accumulated gradients
optimizer.step()

# Update learning rate
scheduler.step(epoch=iteration)

# TensorBoard
if iteration % CONFIG.SOLVER.ITER_TB == 0:
    writer.add_scalar("loss/train", average_loss.value()[0], iteration)
    for i, o in enumerate(optimizer.param_groups):
        writer.add_scalar("lr/group_{}".format(i), o["lr"], iteration)
    for i in range(torch.cuda.device_count()):
        writer.add_scalar(
            "gpu/device_{}/memory_cached".format(i),
            torch.cuda.memory_cached(i) / 1024**3,
            iteration,
        )

# Save a model
if iteration % CONFIG.SOLVER.ITER_SAVE == 0:
    # body missing from the excerpt; a typical implementation would call
    # torch.save(model.state_dict(), ...) with an iteration-stamped filename
    pass
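# ---------------------------------------------------------------------------
# The fragment above sits inside an iteration loop that is not shown. Given
# `iter_loss /= CONFIG.SOLVER.ITER_SIZE`, it accumulates gradients over
# ITER_SIZE sub-batches before a single optimizer step. A sketch of that
# enclosing structure (names follow the fragment; the loader handling and
# ITER_MAX bound are assumptions):
#
# for iteration in range(1, CONFIG.SOLVER.ITER_MAX + 1):
#     optimizer.zero_grad()   # clear gradients before accumulating
#     loss = 0
#     for _ in range(CONFIG.SOLVER.ITER_SIZE):
#         images, labels = next(loader_iter)
#         logits = model(images.to(device))
#         ...                 # the "# Loss" block above runs here
#     ...                     # the step / logging / saving blocks follow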