type=int, help='every iter to save the model.') args = parser.parse_args() return args args = parse_command() print(args) # if setting gpu id, the using single GPU if args.gpu: print('Single GPU Mode.') os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu best_result = Result() best_result.set_to_worst() def create_loader(args): if args.dataset == 'vocaug': composed_transforms_tr = transforms.Compose([ tr.RandomSized(512), tr.RandomRotate(15), tr.RandomHorizontalFlip(), tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), tr.ToTensor() ]) composed_transforms_ts = transforms.Compose([ tr.FixedResize(size=(512, 512)),
class trainer(object):
    """Iteration-based train/eval driver.

    Runs training iterations ``start_iter .. opt.max_iter``; every
    ``len(train_loader)`` iterations it evaluates on the validation loader,
    logs the metrics, updates ``best.txt`` when ``absrel`` improves and saves
    a checkpoint.

    Args:
        opt: Option object. This class reads ``max_iter``, ``print_freq`` and
            ``modality`` from it and calls ``opt.write_config(directory)``.
        model: Network to train; it is moved to the GPU with ``.cuda()``.
        optimizer: Optimizer that updates ``model``.
        start_iter: First iteration number of the run.
        best_result: Optional previous best ``Result``. When falsy a fresh
            ``Result`` set to its worst values is used.
    """

    def __init__(self, opt, model, optimizer, start_iter, best_result=None):
        self.opt = opt
        self.model = model.cuda()
        self.optimizer = optimizer
        self.scheduler = get_schedular(optimizer, self.opt)
        self.criterion = get_criteria(self.opt)

        self.output_directory = utils.get_save_path(self.opt)
        self.best_txt = os.path.join(self.output_directory, 'best.txt')
        self.logger = utils.get_logger(self.output_directory)
        opt.write_config(self.output_directory)

        self.st_iter, self.ed_iter = start_iter, self.opt.max_iter

        # data loader
        from dataloaders import create_loader
        self.train_loader = create_loader(self.opt, mode='train')
        self.eval_loader = create_loader(self.opt, mode='val')
        # Persistent iterator over train_loader; created lazily on first use.
        self._loader_iter = None

        if best_result:
            self.best_result = best_result
        else:
            self.best_result = Result()
            self.best_result.set_to_worst()

        # train parameters
        # Evaluate / checkpoint once per pass over the training loader.
        self.iter_save = len(self.train_loader)
        self.train_meter = AverageMeter()
        self.eval_meter = AverageMeter()
        self.metric = self.best_result.absrel
        self.result = Result()

    def _next_train_batch(self):
        """Return the next training batch, restarting the loader when exhausted."""
        if getattr(self, '_loader_iter', None) is None:
            self._loader_iter = iter(self.train_loader)
        try:
            return next(self._loader_iter)
        except StopIteration:
            # One full pass is done: start a new pass over the loader.
            self._loader_iter = iter(self.train_loader)
            return next(self._loader_iter)

    def train_iter(self, it):
        """Run one optimisation step and record its metrics.

        Args:
            it: Current iteration number; used for the ``print_freq`` check
                and as the step value of the logged scalars.
        """
        # Clear gradients (ready to accumulate)
        self.optimizer.zero_grad()

        end = time.time()
        # The iterator is kept on the instance: a function-local iterator
        # would be lost between calls, forcing a new iterator every step.
        input, target = self._next_train_batch()
        input, target = input.cuda(), target.cuda()
        data_time = time.time() - end

        # compute pred
        end = time.time()
        pred = self.model(input)  # @wx: pay attention to the output
        loss = self.criterion(pred, target)
        loss.backward()  # compute gradient and do SGD step
        self.optimizer.step()
        gpu_time = time.time() - end

        # measure accuracy and record loss in each GPU
        self.result.set_to_worst()
        # Only the first element of the model output is evaluated.
        self.result.evaluate(pred[0], target, loss.item())
        self.train_meter.update(self.result, gpu_time, data_time,
                                input.size(0))
        avg = self.train_meter.average()

        if it % self.opt.print_freq == 0:
            print('=> output: {}'.format(self.output_directory))
            print('Train Iter: [{0}/{1}]\t'
                  't_Data={data_time:.3f}({average.data_time:.3f}) '
                  't_GPU={gpu_time:.3f}({average.gpu_time:.3f})\n\t'
                  'Loss={Loss:.5f}({average.loss:.5f}) '
                  'RMSE={result.rmse:.2f}({average.rmse:.2f}) '
                  'REL={result.absrel:.2f}({average.absrel:.2f}) '
                  'Log10={result.lg10:.3f}({average.lg10:.3f}) '
                  'Delta1={result.delta1:.3f}({average.delta1:.3f}) '
                  'Delta2={result.delta2:.3f}({average.delta2:.3f}) '
                  'Delta3={result.delta3:.3f}({average.delta3:.3f})'.format(
                      it, self.opt.max_iter, data_time=data_time,
                      gpu_time=gpu_time, Loss=loss.item(),
                      result=self.result, average=avg))
            self.logger.add_scalar('Train/Loss', avg.loss, it)
            self.logger.add_scalar('Train/RMSE', avg.rmse, it)
            self.logger.add_scalar('Train/rel', avg.absrel, it)
            self.logger.add_scalar('Train/Log10', avg.lg10, it)
            self.logger.add_scalar('Train/Delta1', avg.delta1, it)
            self.logger.add_scalar('Train/Delta2', avg.delta2, it)
            self.logger.add_scalar('Train/Delta3', avg.delta3, it)

    def eval(self, it):
        """Evaluate on the validation loader and log the averaged metrics.

        Along the way up to eight sample rows are merged into a comparison
        image that is written to ``comparison_<it>.png`` once batch
        ``8 * skip`` is reached.

        Args:
            it: Training iteration the evaluation belongs to; used as the
                step of the logged scalars and in the image file name.
        """
        # Sample an image every `skip` batches. Clamp to 1 so that loaders
        # with fewer than 9 batches do not make `i % skip` divide by zero.
        skip = max(len(self.eval_loader) // 9, 1)
        self.eval_meter.reset()

        for i, (input, target) in enumerate(self.eval_loader):
            end = time.time()
            input, target = input.cuda(), target.cuda()
            data_time = time.time() - end

            # compute output
            end = time.time()
            with torch.no_grad():
                pred = self.model(input)
            gpu_time = time.time() - end

            # measure accuracy and record loss
            self.result.set_to_worst()
            self.result.evaluate(pred[0], target)
            self.eval_meter.update(self.result, gpu_time, data_time,
                                   input.size(0))

            if i % skip == 0:
                pred = pred[0]
                # save 8 images for visualization
                h, w = target.size(2), target.size(3)
                if h != pred.size(2) or w != pred.size(3):
                    # Bring the prediction to the target resolution.
                    pred = F.interpolate(input=pred, size=(h, w),
                                         mode='bilinear', align_corners=True)

                # Visualise only the first sample of the batch.
                data = input[0]
                target = target[0]
                pred = pred[0]

                if self.opt.modality == 'd':
                    img_merge = None
                else:
                    if self.opt.modality == 'rgb':
                        rgb = data
                    elif self.opt.modality == 'rgbd':
                        rgb = data[:3, :, :]
                        depth = data[3:, :, :]

                    if i == 0:
                        if self.opt.modality == 'rgbd':
                            img_merge = utils.merge_into_row_with_gt(
                                rgb, depth, target, pred)
                        else:
                            img_merge = utils.merge_into_row(
                                rgb, target, pred)
                    elif (i < 8 * skip) and (i % skip == 0):
                        if self.opt.modality == 'rgbd':
                            row = utils.merge_into_row_with_gt(
                                rgb, depth, target, pred)
                        else:
                            row = utils.merge_into_row(rgb, target, pred)
                        img_merge = utils.add_row(img_merge, row)
                    elif i == 8 * skip:
                        filename = self.output_directory + '/comparison_' + str(
                            it) + '.png'
                        utils.save_image(img_merge, filename)

            if (i + 1) % self.opt.print_freq == 0:
                print(
                    'Test: [{0}/{1}]\t'
                    't_GPU={gpu_time:.3f}({average.gpu_time:.3f})\n\t'
                    'RMSE={result.rmse:.2f}({average.rmse:.2f}) '
                    'REL={result.absrel:.2f}({average.absrel:.2f}) '
                    'Log10={result.lg10:.3f}({average.lg10:.3f}) '
                    'Delta1={result.delta1:.3f}({average.delta1:.3f}) '
                    'Delta2={result.delta2:.3f}({average.delta2:.3f}) '
                    'Delta3={result.delta3:.3f}({average.delta3:.3f}) '.format(
                        i + 1, len(self.eval_loader), gpu_time=gpu_time,
                        result=self.result,
                        average=self.eval_meter.average()))

        avg = self.eval_meter.average()
        self.logger.add_scalar('Test/RMSE', avg.rmse, it)
        self.logger.add_scalar('Test/rel', avg.absrel, it)
        self.logger.add_scalar('Test/Log10', avg.lg10, it)
        self.logger.add_scalar('Test/Delta1', avg.delta1, it)
        self.logger.add_scalar('Test/Delta2', avg.delta2, it)
        self.logger.add_scalar('Test/Delta3', avg.delta3, it)

        print('\n*\n'
              'RMSE={average.rmse:.3f}\n'
              'Rel={average.absrel:.3f}\n'
              'Log10={average.lg10:.3f}\n'
              'Delta1={average.delta1:.3f}\n'
              'Delta2={average.delta2:.3f}\n'
              'Delta3={average.delta3:.3f}\n'
              't_GPU={time:.3f}\n'.format(average=avg, time=avg.gpu_time))

    def train_eval(self):
        """Run the full loop: train, periodically evaluate and checkpoint."""
        for it in tqdm(range(self.st_iter, self.ed_iter + 1),
                       total=self.ed_iter - self.st_iter + 1,
                       leave=False, dynamic_ncols=True):
            self.model.train()
            self.train_iter(it)

            # save the change of learning_rate
            for i, param_group in enumerate(self.optimizer.param_groups):
                old_lr = float(param_group['lr'])
                self.logger.add_scalar('Lr/lr_' + str(i), old_lr, it)

            if it % self.iter_save == 0:
                self.model.eval()
                self.eval(it)

                self.metric = self.eval_meter.average().absrel
                train_avg = self.train_meter.average()
                eval_avg = self.eval_meter.average()

                # Train-vs-test curves. Each sub-key names the metric it
                # actually carries (rel / lg10 were labelled 'test_rmse').
                self.logger.add_scalars('TrainVal/rmse', {
                    'train_rmse': train_avg.rmse,
                    'test_rmse': eval_avg.rmse
                }, it)
                self.logger.add_scalars('TrainVal/rel', {
                    'train_rel': train_avg.absrel,
                    'test_rel': eval_avg.absrel
                }, it)
                self.logger.add_scalars('TrainVal/lg10', {
                    'train_lg10': train_avg.lg10,
                    'test_lg10': eval_avg.lg10
                }, it)
                self.logger.add_scalars('TrainVal/Delta1', {
                    'train_d1': train_avg.delta1,
                    'test_d1': eval_avg.delta1
                }, it)
                self.logger.add_scalars('TrainVal/Delta2', {
                    'train_d2': train_avg.delta2,
                    'test_d2': eval_avg.delta2
                }, it)
                self.logger.add_scalars('TrainVal/Delta3', {
                    'train_d3': train_avg.delta3,
                    'test_d3': eval_avg.delta3
                }, it)
                self.train_meter.reset()

                # remember the best result (lowest absrel) and save checkpoint
                is_best = eval_avg.absrel < self.best_result.absrel
                if is_best:
                    self.best_result = eval_avg
                    with open(self.best_txt, 'w') as txtfile:
                        txtfile.write(
                            "Iter={}, rmse={:.3f}, rel={:.3f}, log10={:.3f}, "
                            "d1={:.3f}, d2={:.3f}, d3={:.3f}, "
                            "t_gpu={:.4f}".format(
                                it, eval_avg.rmse, eval_avg.absrel,
                                eval_avg.lg10, eval_avg.delta1,
                                eval_avg.delta2, eval_avg.delta3,
                                eval_avg.gpu_time))

                # save checkpoint for each epoch
                utils.save_checkpoint(
                    {
                        'args': self.opt,
                        'epoch': it,
                        'state_dict': self.model.state_dict(),
                        'best_result': self.best_result,
                        'optimizer': self.optimizer,
                    }, is_best, it, self.output_directory)

            # Update learning rate
            # NOTE(review): called every iteration with `it` and the epoch
            # length; presumably do_schedule decides when to step - confirm.
            do_schedule(self.opt, self.scheduler, it=it, len=self.iter_save,
                        metrics=self.metric)

        self.logger.close()
def main():
    """Train a segmentation model, validating and checkpointing every epoch.

    Parses the command line, builds the data loaders, then either resumes
    from ``args.resume`` or creates a fresh model and SGD optimizer. The
    loop is iteration-based; each iteration accumulates gradients over
    ``args.iter_size`` batches. After every ``len(train_loader)`` iterations
    the model is validated, the LR scheduler is stepped on the validation
    mean IoU, the best result is recorded and a checkpoint is saved.
    """
    args = parse_command()
    print(args)

    # if setting gpu id, the using single GPU
    if args.gpu:
        print('Single GPU Mode.')
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    best_result = Result()
    best_result.set_to_worst()

    # set random seed
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)
    np.random.seed(args.manual_seed)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        args.batch_size = args.batch_size * torch.cuda.device_count()
    else:
        print("Let's use GPU ", torch.cuda.current_device())

    train_loader, val_loader = create_loader(args)

    if args.resume:
        assert os.path.isfile(args.resume), \
            "=> no checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        # NOTE(review): the checkpoint stores whole pickled objects (model,
        # optimizer); only load checkpoint files from a trusted source.
        checkpoint = torch.load(args.resume)

        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        optimizer = checkpoint['optimizer']

        # solve 'out of memory'
        # The stored model is the object that was being trained (already
        # wrapped in DataParallel when saved below), so it is used as-is.
        model = checkpoint['model']

        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

        # clear memory
        del checkpoint
        torch.cuda.empty_cache()
    else:
        print("=> creating Model")
        model = get_models(args)
        print("=> model created.")
        start_epoch = 0

        # different modules have different learning rate
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]

        optimizer = torch.optim.SGD(train_params, lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

        # You can use DataParallel() whether you use Multi-GPUs or not
        model = nn.DataParallel(model).cuda()

    # when training, use reduceLROnPlateau to reduce learning rate.
    # The monitored quantity is the validation mean IoU (higher is better),
    # so the scheduler must run in 'max' mode.
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'max',
                                               patience=args.lr_patience)

    # loss function
    criterion = criteria._CrossEntropyLoss2d(size_average=True,
                                             batch_average=True)

    # create directory path
    output_directory = utils.get_output_directory(args)
    os.makedirs(output_directory, exist_ok=True)
    best_txt = os.path.join(output_directory, 'best.txt')
    config_txt = os.path.join(output_directory, 'config.txt')

    # write training parameters to config file
    if not os.path.exists(config_txt):
        with open(config_txt, 'w') as txtfile:
            txtfile.write(''.join(
                str(k) + ':' + str(v) + ',\t\n'
                for k, v in vars(args).items()))

    # create log
    log_path = os.path.join(
        output_directory, 'logs',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    logger = SummaryWriter(log_path)

    start_iter = len(train_loader) * start_epoch + 1
    # NOTE(review): when resuming (start_epoch > 0) this bound shrinks with
    # start_epoch instead of staying at the end of training - confirm intent.
    max_iter = len(train_loader) * (args.epochs - start_epoch + 1) + 1
    iter_save = len(train_loader)

    # train
    model.train()
    if args.freeze:
        model.module.freeze_backbone_bn()
    output_directory = utils.get_output_directory(args, check=True)

    average_meter = AverageMeter()  # running average over the whole run
    train_meter = AverageMeter()    # running average since last validation

    # Explicit iterator so that only loader exhaustion restarts it; a bare
    # `except:` here would also swallow real data-loading errors.
    loader_iter = iter(train_loader)

    for it in tqdm(range(start_iter, max_iter + 1), total=max_iter,
                   leave=False, dynamic_ncols=True):
        optimizer.zero_grad()

        loss = 0
        data_time = 0
        gpu_time = 0

        for _ in range(args.iter_size):
            end = time.time()
            try:
                samples = next(loader_iter)
            except StopIteration:
                loader_iter = iter(train_loader)
                samples = next(loader_iter)

            input = samples['image'].cuda()
            target = samples['label'].cuda()

            torch.cuda.synchronize()
            data_time_ = time.time()
            data_time += data_time_ - end

            with torch.autograd.detect_anomaly():
                preds = model(input)  # @wx: pay attention to the output

                iter_loss = 0
                if args.msc:
                    for pred in preds:
                        # Resize labels for {100%, 75%, 50%, Max} logits
                        target_ = utils.resize_labels(
                            target,
                            shape=(pred.size()[-2], pred.size()[-1]))
                        iter_loss += criterion(pred, target_)
                else:
                    pred = preds
                    target_ = utils.resize_labels(
                        target, shape=(pred.size()[-2], pred.size()[-1]))
                    iter_loss += criterion(pred, target_)

                # Backpropagate (just compute gradients wrt the loss)
                iter_loss /= args.iter_size
                iter_loss.backward()

            loss += float(iter_loss)
            gpu_time += time.time() - data_time_

        torch.cuda.synchronize()

        # Update weights with accumulated gradients
        optimizer.step()

        # measure accuracy and record loss
        # (uses the prediction / target of the last accumulated batch)
        train_result = Result()
        pred = F.softmax(pred, dim=1)
        train_result.evaluate(pred.data.cpu().numpy(),
                              target.data.cpu().numpy(), n_class=21)
        average_meter.update(train_result, gpu_time, data_time,
                             input.size(0))
        train_meter.update(train_result, gpu_time, data_time, input.size(0))

        if it % args.print_freq == 0:
            print('=> output: {}'.format(output_directory))
            print('Train Iter: [{0}/{1}]\t'
                  't_Data={data_time:.3f}({average.data_time:.3f}) '
                  't_GPU={gpu_time:.3f}({average.gpu_time:.3f})\n\t'
                  'Loss={Loss:.5f} '
                  'MeanAcc={result.mean_acc:.3f}({average.mean_acc:.3f}) '
                  'MIOU={result.mean_iou:.3f}({average.mean_iou:.3f}) '.format(
                      it, max_iter, data_time=data_time, gpu_time=gpu_time,
                      Loss=loss, result=train_result,
                      average=average_meter.average()))
            logger.add_scalar('Train/Loss', loss, it)
            # Each tag carries its own metric (they were swapped before).
            logger.add_scalar('Train/mean_acc', train_result.mean_acc, it)
            logger.add_scalar('Train/mean_iou', train_result.mean_iou, it)

        if it % iter_save == 0:
            epoch = it // iter_save
            # Everything below must use the validation result, not the
            # result of the last training batch.
            val_result, img_merge = validate(args, val_loader, model,
                                             epoch=epoch, logger=logger)

            # when validation mean IoU stops improving, reduce learning rate
            scheduler.step(val_result.mean_iou)

            # save the change of learning_rate
            for i, param_group in enumerate(optimizer.param_groups):
                old_lr = float(param_group['lr'])
                logger.add_scalar('Lr/lr_' + str(i), old_lr, it)

            # vis the change between train and test
            train_avg = train_meter.average()
            logger.add_scalars(
                'TrainVal/mean_acc', {
                    'train_mean_acc': train_avg.mean_acc,
                    'test_mean_acc': val_result.mean_acc
                }, epoch)
            logger.add_scalars(
                'TrainVal/mean_iou', {
                    'train_mean_iou': train_avg.mean_iou,
                    'test_mean_iou': val_result.mean_iou
                }, epoch)
            train_meter.reset()

            # remember the best mean IoU (higher is better) and save checkpoint
            is_best = val_result.mean_iou > best_result.mean_iou
            if is_best:
                best_result = val_result
                with open(best_txt, 'w') as txtfile:
                    txtfile.write(
                        "epoch={}, mean_iou={:.3f}, mean_acc={:.3f}, "
                        "t_gpu={:.4f}".format(
                            epoch, val_result.mean_iou, val_result.mean_acc,
                            val_result.gpu_time))
                if img_merge is not None:
                    img_filename = output_directory + '/comparison_best.png'
                    utils.save_image(img_merge, img_filename)

            # save checkpoint for each epoch
            utils.save_checkpoint(
                {
                    'args': args,
                    'epoch': epoch,
                    'model': model,
                    'best_result': best_result,
                    'optimizer': optimizer,
                }, is_best, it, output_directory)

            # change to train mode
            model.train()
            if args.freeze:
                model.module.freeze_backbone_bn()

    logger.close()