def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, default='options/test/test_KPSAGAN.yml', help='Path to option YMAL file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=False) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: # util.mkdir_and_rename( #opt['path']['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) #### create model model = create_model(opt) avg_psnr = 0.0 idx = 0 dataset_dir = '/srv/wuyichao/Super-Resolution/KPSAGAN/BasicSR-master/BasicSR-master-c/result_600000/' util.mkdir(dataset_dir) for val_data in val_loader: idx += 1 img_name = os.path.splitext(os.path.basename( val_data['LQ_path'][0]))[0] logger.info(img_name) #img_dir = os.path.join(opt['path']['val_images'], img_name) #util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # save images suffix = 'cut' #opt['suffix'] if suffix: save_img_path = osp.join(dataset_dir, img_name + suffix + '.png') else: save_img_path = osp.join(dataset_dir, img_name + '.png') util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger_val = logging.getLogger('val') # validation logger logger_val.info('psnr: {:.4e}'.format(avg_psnr))
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YMAL file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### distributed training settings opt['dist'] = False rank = -1 print('Disabled distributed training.') #### loading resume state if exists if opt['path'].get('resume_state', None): resume_state_path, _ = get_resume_paths(opt) print('\n\t************************') print('resume_state_path: ', resume_state_path) print('\n\t************************') # distributed resuming: all load into default GPU if resume_state_path is None: resume_state = None else: device_id = torch.cuda.current_device() resume_state = torch.load( resume_state_path, map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt.get('use_tb_logger', False) and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter conf_name = basename(args.opt).replace(".yml", "") exp_dir = opt['path']['experiments_root'] log_dir_train = os.path.join(exp_dir, 'tb', conf_name, 'train') log_dir_valid = os.path.join(exp_dir, 'tb', conf_name, 'valid') tb_logger_train = SummaryWriter(log_dir=log_dir_train) tb_logger_valid = SummaryWriter(log_dir=log_dir_valid) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) print('Dataset created') train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model current_step = 0 if resume_state is None else resume_state['iter'] model = create_model(opt, current_step) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training timer = Timer() logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) timerData = TickTock() for epoch in range(start_epoch, total_epochs + 1): if opt['dist']: train_sampler.set_epoch(epoch) timerData.tick() for _, train_data in enumerate(train_loader): timerData.tock() current_step += 1 if current_step > total_iters: break #### training model.feed_data(train_data) #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) nll = model.optimize_parameters(current_step) """ try: nll = model.optimize_parameters(current_step) except RuntimeError as e: nll = None print("Skipping ERROR caught in nll = model.optimize_parameters(current_step): ") print(e) """ if nll is None: nll = 0 #### log def eta(t_iter): return (t_iter * (opt['train']['niter'] - current_step)) / 3600 if current_step % opt['logger']['print_freq'] == 0 \ or current_step - (resume_state['iter'] if resume_state else 0) < 25: avg_time = timer.get_average_and_reset() avg_data_time = timerData.get_average_and_reset() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}, t:{:.2e}, td:{:.2e}, eta:{:.2e}, nll:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate(), avg_time, avg_data_time, eta(avg_time), nll) print(message) timer.tick() # Reduce number of logs if current_step % 5 == 0: tb_logger_train.add_scalar('loss/nll', nll, current_step) tb_logger_train.add_scalar('lr/base', model.get_current_learning_rate(), current_step) tb_logger_train.add_scalar('time/iteration', timer.get_last_iteration(), current_step) tb_logger_train.add_scalar('time/data', timerData.get_last_iteration(), current_step) tb_logger_train.add_scalar('time/eta', eta(timer.get_last_iteration()), current_step) for k, v in model.get_current_log().items(): tb_logger_train.add_scalar(k, v, current_step) # validation if current_step % opt['train']['val_freq'] == 0 and rank <= 0: avg_psnr = 0.0 idx = 0 nlls = [] for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) nll = model.test() if nll is None: nll = 0 nlls.append(nll) visuals = model.get_current_visuals() sr_img = None # Save SR images for reference if hasattr(model, 'heats'): for heat in model.heats: for i in range(model.n_sample): sr_img = util.tensor2img( visuals['SR', heat, i]) # uint8 save_img_path = os.path.join( img_dir, '{:s}_{:09d}_h{:03d}_s{:d}.png'.format( img_name, current_step, int(heat * 100), i)) util.save_img(sr_img, save_img_path) else: sr_img = util.tensor2img(visuals['SR']) # uint8 save_img_path = os.path.join( img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) assert sr_img is not None # Save LQ images for reference save_img_path_lq = os.path.join( img_dir, '{:s}_LQ.png'.format(img_name)) if not os.path.isfile(save_img_path_lq): lq_img = util.tensor2img(visuals['LQ']) # uint8 util.save_img( cv2.resize(lq_img, dsize=None, fx=opt['scale'], fy=opt['scale'], interpolation=cv2.INTER_NEAREST), save_img_path_lq) # Save GT images for reference gt_img = util.tensor2img(visuals['GT']) # uint8 save_img_path_gt = os.path.join( img_dir, '{:s}_GT.png'.format(img_name)) if not os.path.isfile(save_img_path_gt): util.save_img(gt_img, save_img_path_gt) # calculate PSNR crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx avg_nll = sum(nlls) / len(nlls) # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger_val = logging.getLogger('val') # validation logger logger_val.info( '<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format( epoch, current_step, avg_psnr)) # tensorboard logger tb_logger_valid.add_scalar('loss/psnr', avg_psnr, current_step) tb_logger_valid.add_scalar('loss/nll', avg_nll, current_step) tb_logger_train.flush() tb_logger_valid.flush() #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) timerData.tick() with open(os.path.join(opt['path']['root'], "TRAIN_DONE"), 'w') as f: f.write("TRAIN_DONE") if rank <= 0: logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('--opt', type=str, help='Path to option YAML file.') args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### loading resume state if exists if 'resume_latest' in opt and opt['resume_latest'] == True: if os.path.isdir(opt['path']['training_state']): name_state_files = os.listdir(opt['path']['training_state']) if len(name_state_files) > 0: latest_state_num = 0 for name_state_file in name_state_files: state_num = int(name_state_file.split('.')[0]) if state_num > latest_state_num: latest_state_num = state_num opt['path']['resume_state'] = os.path.join( opt['path']['training_state'], str(latest_state_num)+'.state') else: raise ValueError if opt['path'].get('resume_state', None): device_id = torch.cuda.current_device() resume_state = torch.load(opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if resume_state is None: util.mkdir_and_rename( opt['path']['experiments_root']) # rename experiment folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]'.format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'] + '_{}'.format(util.get_timestamp())) # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) train_sampler = None train_loader = create_dataloader( train_set, dataset_opt, opt, train_sampler) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model model = create_model(opt) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training is_time = False logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) if is_time: batch_time = AverageMeter('Time', ':6.3f') data_time = AverageMeter('Data', ':6.3f') for epoch in range(start_epoch, total_epochs + 1): if current_step > total_iters: break if is_time: torch.cuda.synchronize() end = time.time() for _, train_data in enumerate(train_loader): if 'adv_train' in opt: current_step += opt['adv_train']['m'] else: current_step += 1 if current_step > total_iters: break #### training model.feed_data(train_data) if is_time: torch.cuda.synchronize() data_time.update(time.time() - end) model.optimize_parameters(current_step) #### update learning rate model.update_learning_rate( current_step, warmup_iter=opt['train']['warmup_iter']) if is_time: torch.cuda.synchronize() batch_time.update(time.time() - end) #### log if current_step % opt['logger']['print_freq'] == 0: # FIXME remove debug debug = True if debug: torch.cuda.empty_cache() logs = model.get_current_log() message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format( epoch, current_step) for v in model.get_current_learning_rate(): message += '{:.3e},'.format(v) message += ')] ' for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v, current_step) logger.info(message) if is_time: logger.info(str(data_time)) logger.info(str(batch_time)) #### validation if opt['datasets'].get('val', None) and current_step % opt['train']['val_freq'] == 0: if opt['model'] in ['sr', 'srgan']: # image restoration validation pbar = util.ProgressBar(len(val_loader)) avg_psnr = 0. idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join( opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # Save SR images for reference save_img_path = os.path.join(img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR sr_img, gt_img = util.crop_border( [sr_img, gt_img], opt['scale']) avg_psnr += util.calculate_psnr(sr_img, gt_img) pbar.update('Test {}'.format(img_name)) avg_psnr = avg_psnr / idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) tb_logger.flush() if is_time: torch.cuda.synchronize() end = time.time() logger.info('Saving the final model.') model.save('latest') logger.info('End of training.') tb_logger.close()
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, default='options/train/train_EDVR_woTSA_M.yml', help='Path to option YAML file.') parser.add_argument('--set', dest='set_opt', default=None, nargs=argparse.REMAINDER, help='set options') args = parser.parse_args() opt = option.parse(args.opt, args.set_opt, is_train=True) #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU print('Training from state: {}'.format(opt['path']['resume_state'])) device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options elif opt['auto_resume']: exp_dir = opt['path']['experiments_root'] # first time run: create dirs if not os.path.exists(exp_dir): os.makedirs(exp_dir) os.makedirs(opt['path']['models']) os.makedirs(opt['path']['training_state']) os.makedirs(opt['path']['val_images']) os.makedirs(opt['path']['tb_logger']) resume_state = None else: # detect experiment directory and get the latest state state_dir = opt['path']['training_state'] state_files = [ x for x in os.listdir(state_dir) if x.endswith('state') ] # no valid state detected if len(state_files) < 1: print( 'No previous training state found, train from start state') resume_state = None else: state_files = sorted(state_files, key=lambda x: int(x.split('.')[0])) latest_state = state_files[-1] print('Training from lastest state: {}'.format(latest_state)) latest_state_file = os.path.join(state_dir, latest_state) opt['path']['resume_state'] = latest_state_file device_id = torch.cuda.current_device() resume_state = torch.load( latest_state_file, map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) else: resume_state = None if resume_state is None and not opt['auto_resume'] and not opt['no_log']: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.2: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir=opt['path']['tb_logger']) # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader if opt['datasets']['train']['ratio']: dataset_ratio = opt['datasets']['train']['ratio'] else: dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) if dataset_opt['mode'] in ['MetaREDS', 'MetaREDSOnline']: train_sampler = MetaIterSampler(train_set, dataset_opt['batch_size'], len(opt['scale']), dataset_ratio) elif dataset_opt['mode'] in ['REDS', 'MultiREDS']: train_sampler = IterSampler(train_set, dataset_opt['batch_size'], dataset_ratio) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) #### create model model = create_model(opt) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) #### training model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format( epoch, current_step) for v in model.get_current_learning_rate(): message += '{:.3e},'.format(v) message += ')] ' for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v, current_step) logger.info(message) print("PROGRESS: {:02d}%".format( int(current_step / total_iters * 100))) #### validation if opt['datasets'].get( 'val', None) and current_step % opt['train']['val_freq'] == 0: pbar = util.ProgressBar(len(val_loader)) psnr_rlt = {} # with border and center frames psnr_rlt_avg = {} psnr_total_avg = 0. for val_data in val_loader: folder = val_data['folder'][0] idx_d = val_data['idx'].item() # border = val_data['border'].item() if psnr_rlt.get(folder, None) is None: psnr_rlt[folder] = [] model.feed_data(val_data) model.test() visuals = model.get_current_visuals() rlt_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # calculate PSNR psnr = util.calculate_psnr(rlt_img, gt_img) psnr_rlt[folder].append(psnr) pbar.update('Test {} - {}'.format(folder, idx_d)) for k, v in psnr_rlt.items(): psnr_rlt_avg[k] = sum(v) / len(v) psnr_total_avg += psnr_rlt_avg[k] psnr_total_avg /= len(psnr_rlt) log_s = '# Validation # PSNR: {:.4e}:'.format(psnr_total_avg) for k, v in psnr_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step) for k, v in psnr_rlt_avg.items(): tb_logger.add_scalar(k, v, current_step) #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) logger.info('Saving the final model.') model.save('latest') logger.info('End of training.') tb_logger.close()
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) opt_net = opt['network_G'] which_model = opt_net['which_model_G'] #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model model = create_model(opt) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): if opt['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) #### training #print(train_data) model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format( epoch, current_step) for v in model.get_current_learning_rate(): message += '{:.3e},'.format(v) message += ')] ' for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) ### validation if opt['datasets'].get( 'val', None) and current_step % opt['train']['val_freq'] == 0: # does not support multi-GPU validation pbar = util.ProgressBar(len(val_loader)) avg_psnr = 0. idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() if which_model == "RCAN": sr_img = util.tensor2img(visuals['rlt'], np.uint8, min_max=(0, 255)) # uint8 gt_img = util.tensor2img(visuals['GT'], np.uint8, min_max=(0, 255)) # uint8 else: sr_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # Save SR images for reference save_img_path = os.path.join( img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR sr_img, gt_img = util.crop_border([sr_img, gt_img], opt['scale']) avg_psnr += util.calculate_psnr(sr_img, gt_img) pbar.update('Test {}'.format(img_name)) avg_psnr = avg_psnr / idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model.save('latest') logger.info('End of training.') tb_logger.close()
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument('--exp_name', type=str, default='temp') parser.add_argument('--degradation_type', type=str, default=None) parser.add_argument('--sigma_x', type=float, default=None) parser.add_argument('--sigma_y', type=float, default=None) parser.add_argument('--theta', type=float, default=None) args = parser.parse_args() if args.exp_name == 'temp': opt = option.parse(args.opt, is_train=True) else: opt = option.parse(args.opt, is_train=True, exp_name=args.exp_name) # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) inner_loop_name = opt['train']['maml']['optimizer'][0] + str( opt['train']['maml']['adapt_iter']) + str( math.floor(math.log10(opt['train']['maml']['lr_alpha']))) meta_loop_name = opt['train']['optim'][0] + str( math.floor(math.log10(opt['train']['lr_G']))) if args.degradation_type is not None: if args.degradation_type == 'preset': opt['datasets']['val']['degradation_mode'] = args.degradation_type else: opt['datasets']['val']['degradation_type'] = args.degradation_type if args.sigma_x is not None: opt['datasets']['val']['sigma_x'] = args.sigma_x if args.sigma_y is not None: opt['datasets']['val']['sigma_y'] = args.sigma_y if args.theta is not None: opt['datasets']['val']['theta'] = args.theta if opt['datasets']['val']['degradation_mode'] == 'set': degradation_name = str(opt['datasets']['val']['degradation_type'])\ + '_' + str(opt['datasets']['val']['sigma_x']) \ + '_' + str(opt['datasets']['val']['sigma_y'])\ + '_' + str(opt['datasets']['val']['theta']) else: degradation_name = opt['datasets']['val']['degradation_mode'] patch_name = 'p{}x{}'.format( opt['train']['maml']['patch_size'], opt['train']['maml'] ['num_patch']) if opt['train']['maml']['use_patch'] else 'full' use_real_flag = '_ideal' if opt['train']['use_real'] else '' folder_name = opt[ 'name'] + '_' + degradation_name # + '_' + inner_loop_name + meta_loop_name + '_' + degradation_name + '_' + patch_name + use_real_flag if args.exp_name != 'temp': folder_name = args.exp_name #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: #util.mkdir_and_rename( # opt['path']['experiments_root']) # rename experiment folder if exists #util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' # and 'pretrain_model' not in key and 'resume' not in key)) if not os.path.exists(opt['path']['experiments_root']): os.mkdir(opt['path']['experiments_root']) # raise ValueError('Path does not exists - check path') # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') #logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + folder_name) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': pass elif phase == 'val': if '+' in opt['datasets']['val']['name']: val_set, val_loader = [], [] valname_list = opt['datasets']['val']['name'].split('+') for i in range(len(valname_list)): val_set.append( create_dataset( dataset_opt, scale=opt['scale'], kernel_size=opt['datasets']['train'] ['kernel_size'], model_name=opt['network_E']['which_model_E'], idx=i)) val_loader.append( create_dataloader(val_set[-1], dataset_opt, opt, None)) else: val_set = create_dataset( dataset_opt, scale=opt['scale'], kernel_size=opt['datasets']['train']['kernel_size'], model_name=opt['network_E']['which_model_E']) # val_set = loader.get_dataset(opt, train=False) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) #### create model models = create_model(opt) assert len(models) == 2 model, est_model = models[0], models[1] modelcp, est_modelcp = create_model(opt) _, est_model_fixed = create_model(opt) center_idx = (opt['datasets']['val']['N_frames']) // 2 lr_alpha = opt['train']['maml']['lr_alpha'] update_step = opt['train']['maml']['adapt_iter'] pd_log = pd.DataFrame( columns=['PSNR_Bicubic', 'PSNR_Ours', 'SSIM_Bicubic', 'SSIM_Ours']) def crop(LR_seq, HR, num_patches_for_batch=4, patch_size=44): """ Crop given patches. Args: LR_seq: (B=1) x T x C x H x W HR: (B=1) x C x H x W patch_size (int, optional): Return: B(=batch_size) x T x C x H x W """ # Find the lowest resolution cropped_lr = [] cropped_hr = [] assert HR.size(0) == 1 LR_seq_ = LR_seq[0] HR_ = HR[0] for _ in range(num_patches_for_batch): patch_lr, patch_hr = preprocessing.common_crop( LR_seq_, HR_, patch_size=patch_size // 2) cropped_lr.append(patch_lr) cropped_hr.append(patch_hr) cropped_lr = torch.stack(cropped_lr, dim=0) cropped_hr = torch.stack(cropped_hr, dim=0) return cropped_lr, cropped_hr # Single GPU # PSNR_rlt: psnr_init, psnr_before, psnr_after psnr_rlt = [{}, {}] # SSIM_rlt: ssim_init, ssim_after ssim_rlt = [{}, {}] pbar = util.ProgressBar(len(val_set)) for val_data in val_loader: folder = val_data['folder'][0] idx_d = int(val_data['idx'][0].split('/')[0]) if 'name' in val_data.keys(): name = val_data['name'][0][center_idx][0] else: #name = '{}/{:08d}'.format(folder, idx_d) name = folder train_folder = os.path.join('../results_for_paper', folder_name, name) hr_train_folder = os.path.join(train_folder, 'hr') bic_train_folder = os.path.join(train_folder, 'bic') maml_train_folder = os.path.join(train_folder, 'maml') #slr_train_folder = os.path.join(train_folder, 'slr') # print(train_folder) if not os.path.exists(train_folder): os.makedirs(train_folder, exist_ok=False) if not os.path.exists(hr_train_folder): os.mkdir(hr_train_folder) if not os.path.exists(bic_train_folder): os.mkdir(bic_train_folder) if not os.path.exists(maml_train_folder): os.mkdir(maml_train_folder) #if not os.path.exists(slr_train_folder): # os.mkdir(slr_train_folder) for i in range(len(psnr_rlt)): if psnr_rlt[i].get(folder, None) is None: psnr_rlt[i][folder] = [] for i in range(len(ssim_rlt)): if ssim_rlt[i].get(folder, None) is None: ssim_rlt[i][folder] = [] if idx_d % 10 != 5: #continue pass cropped_meta_train_data = {} meta_train_data = {} meta_test_data = {} # Make SuperLR seq using estimation model meta_train_data['GT'] = val_data['LQs'][:, center_idx] meta_test_data['LQs'] = val_data['LQs'][0:1] meta_test_data['GT'] = val_data['GT'][0:1, center_idx] # Check whether the batch size of each validation data is 1 assert val_data['SuperLQs'].size(0) == 1 if opt['network_G']['which_model_G'] == 'TOF': LQs = meta_test_data['LQs'] B, T, C, H, W = LQs.shape LQs = LQs.reshape(B * T, C, H, W) Bic_LQs = F.interpolate(LQs, scale_factor=opt['scale'], mode='bicubic', align_corners=True) meta_test_data['LQs'] = Bic_LQs.reshape(B, T, C, H * opt['scale'], W * opt['scale']) ## Before start training, first save the bicubic, real outputs # Bicubic modelcp.load_network(opt['path']['bicubic_G'], modelcp.netG) modelcp.feed_data(meta_test_data) modelcp.test() model_start_visuals = modelcp.get_current_visuals(need_GT=True) hr_image = util.tensor2img(model_start_visuals['GT'], mode='rgb') start_image = util.tensor2img(model_start_visuals['rlt'], mode='rgb') #####imageio.imwrite(os.path.join(hr_train_folder, '{:08d}.png'.format(idx_d)), hr_image) #####imageio.imwrite(os.path.join(bic_train_folder, '{:08d}.png'.format(idx_d)), start_image) psnr_rlt[0][folder].append(util.calculate_psnr(start_image, hr_image)) ssim_rlt[0][folder].append(util.calculate_ssim(start_image, hr_image)) modelcp.netG, est_modelcp.netE = deepcopy(model.netG), deepcopy( est_model.netE) ########## SLR LOSS Preparation ############ est_model_fixed.load_network(opt['path']['fixed_E'], est_model_fixed.netE) optim_params = [] for k, v in modelcp.netG.named_parameters(): if v.requires_grad: optim_params.append(v) if not opt['train']['use_real']: for k, v in est_modelcp.netE.named_parameters(): if v.requires_grad: optim_params.append(v) if opt['train']['maml']['optimizer'] == 'Adam': inner_optimizer = torch.optim.Adam( optim_params, lr=lr_alpha, betas=(opt['train']['maml']['beta1'], opt['train']['maml']['beta2'])) elif opt['train']['maml']['optimizer'] == 'SGD': inner_optimizer = torch.optim.SGD(optim_params, lr=lr_alpha) else: raise NotImplementedError() # Inner Loop Update st = time.time() for i in range(update_step): # Make SuperLR seq using UPDATED estimation model if not opt['train']['use_real']: est_modelcp.feed_data(val_data) # est_model.test() est_modelcp.forward_without_optim() superlr_seq = est_modelcp.fake_L meta_train_data['LQs'] = superlr_seq else: meta_train_data['LQs'] = val_data['SuperLQs'] if opt['network_G']['which_model_G'] == 'TOF': # Bicubic upsample to match the size LQs = meta_train_data['LQs'] B, T, C, H, W = LQs.shape LQs = LQs.reshape(B * T, C, H, W) Bic_LQs = F.interpolate(LQs, scale_factor=opt['scale'], mode='bicubic', align_corners=True) meta_train_data['LQs'] = Bic_LQs.reshape( B, T, C, H * opt['scale'], W * opt['scale']) # Update both modelcp + estmodelcp jointly inner_optimizer.zero_grad() if opt['train']['maml']['use_patch']: cropped_meta_train_data['LQs'], cropped_meta_train_data['GT'] = \ crop(meta_train_data['LQs'], meta_train_data['GT'], opt['train']['maml']['num_patch'], opt['train']['maml']['patch_size']) modelcp.feed_data(cropped_meta_train_data) else: modelcp.feed_data(meta_train_data) loss_train = modelcp.calculate_loss() ##################### SLR LOSS ################### est_model_fixed.feed_data(val_data) est_model_fixed.test() slr_initialized = est_model_fixed.fake_L slr_initialized = slr_initialized.to('cuda') if opt['network_G']['which_model_G'] == 'TOF': loss_train += 10 * F.l1_loss( LQs.to('cuda').squeeze(0), slr_initialized) else: loss_train += 10 * F.l1_loss(meta_train_data['LQs'].to('cuda'), slr_initialized) loss_train.backward() inner_optimizer.step() et = time.time() update_time = et - st modelcp.feed_data(meta_test_data) modelcp.test() model_update_visuals = modelcp.get_current_visuals(need_GT=False) update_image = util.tensor2img(model_update_visuals['rlt'], mode='rgb') # Save and calculate final image imageio.imwrite( os.path.join(maml_train_folder, '{:08d}.png'.format(idx_d)), update_image) psnr_rlt[1][folder].append(util.calculate_psnr(update_image, hr_image)) ssim_rlt[1][folder].append(util.calculate_ssim(update_image, hr_image)) name_df = '{}/{:08d}'.format(folder, idx_d) if name_df in pd_log.index: pd_log.at[name_df, 'PSNR_Bicubic'] = psnr_rlt[0][folder][-1] pd_log.at[name_df, 'PSNR_Ours'] = psnr_rlt[1][folder][-1] pd_log.at[name_df, 'SSIM_Bicubic'] = ssim_rlt[0][folder][-1] pd_log.at[name_df, 'SSIM_Ours'] = ssim_rlt[1][folder][-1] else: pd_log.loc[name_df] = [ psnr_rlt[0][folder][-1], psnr_rlt[1][folder][-1], ssim_rlt[0][folder][-1], ssim_rlt[1][folder][-1] ] pd_log.to_csv( os.path.join('../results_for_paper', folder_name, 'psnr_update.csv')) pbar.update( 'Test {} - {}: I: {:.3f}/{:.4f} \tF+: {:.3f}/{:.4f} \tTime: {:.3f}s' .format(folder, idx_d, psnr_rlt[0][folder][-1], ssim_rlt[0][folder][-1], psnr_rlt[1][folder][-1], ssim_rlt[1][folder][-1], update_time)) psnr_rlt_avg = {} psnr_total_avg = 0. # Just calculate the final value of psnr_rlt(i.e. psnr_rlt[2]) for k, v in psnr_rlt[0].items(): psnr_rlt_avg[k] = sum(v) / len(v) psnr_total_avg += psnr_rlt_avg[k] psnr_total_avg /= len(psnr_rlt[0]) log_s = '# Validation # Bic PSNR: {:.4e}:'.format(psnr_total_avg) for k, v in psnr_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) psnr_rlt_avg = {} psnr_total_avg = 0. # Just calculate the final value of psnr_rlt(i.e. psnr_rlt[2]) for k, v in psnr_rlt[1].items(): psnr_rlt_avg[k] = sum(v) / len(v) psnr_total_avg += psnr_rlt_avg[k] psnr_total_avg /= len(psnr_rlt[1]) log_s = '# Validation # PSNR: {:.4e}:'.format(psnr_total_avg) for k, v in psnr_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) ssim_rlt_avg = {} ssim_total_avg = 0. # Just calculate the final value of ssim_rlt(i.e. ssim_rlt[1]) for k, v in ssim_rlt[0].items(): ssim_rlt_avg[k] = sum(v) / len(v) ssim_total_avg += ssim_rlt_avg[k] ssim_total_avg /= len(ssim_rlt[0]) log_s = '# Validation # Bicubic SSIM: {:.4e}:'.format(ssim_total_avg) for k, v in ssim_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) ssim_rlt_avg = {} ssim_total_avg = 0. # Just calculate the final value of ssim_rlt(i.e. ssim_rlt[1]) for k, v in ssim_rlt[1].items(): ssim_rlt_avg[k] = sum(v) / len(v) ssim_total_avg += ssim_rlt_avg[k] ssim_total_avg /= len(ssim_rlt[1]) log_s = '# Validation # SSIM: {:.4e}:'.format(ssim_total_avg) for k, v in ssim_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) logger.info('End of evaluation.')
def main(): #### setup options of three networks parser = argparse.ArgumentParser() parser.add_argument('-opt_P', type=str, help='Path to option YMAL file of Predictor.') parser.add_argument('-opt_C', type=str, help='Path to option YMAL file of Corrector.') parser.add_argument('-opt_F', type=str, help='Path to option YMAL file of SFTMD_Net.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument('--SFM', type=int, default=0, help='0: no SFM, 1: SFM') args = parser.parse_args() opt_P = option.parse(args.opt_P, is_train=True) opt_C = option.parse(args.opt_C, is_train=True) opt_F = option.parse(args.opt_F, is_train=True) # convert to NoneDict, which returns None for missing keys opt_P = option.dict_to_nonedict(opt_P) opt_C = option.dict_to_nonedict(opt_C) opt_F = option.dict_to_nonedict(opt_F) # choose small opt for SFTMD test, fill path of pre-trained model_F opt_F = opt_F['sftmd'] #### set random seed seed = opt_P['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) util.set_random_seed(seed) # load PCA matrix of enough kernel print('load PCA matrix') pca_matrix = torch.load('./pca_matrix.pth', map_location=lambda storage, loc: storage) print('PCA matrix shape: {}'.format(pca_matrix.shape)) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt_P['dist'] = False opt_F['dist'] = False opt_C['dist'] = False rank = -1 print('Disabled distributed training.') else: opt_P['dist'] = True opt_F['dist'] = True opt_C['dist'] = True init_dist() world_size = torch.distributed.get_world_size( ) #Returns the number of processes in the current process group rank = torch.distributed.get_rank( ) #Returns the rank of current process group torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True ###### Predictor&Corrector train ###### #### loading resume state if exists if opt_P['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt_P['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt_P, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0-7) if resume_state is None: # Predictor path util.mkdir_and_rename( opt_P['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt_P['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # Corrector path util.mkdir_and_rename( opt_C['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt_C['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt_P['path']['log'], 'train_' + opt_P['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt_P['path']['log'], 'val_' + opt_P['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt_P)) logger.info(option.dict2str(opt_C)) # tensorboard logger if opt_P['use_tb_logger'] and 'debug' not in opt_P['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt_P['name']) else: util.setup_logger('base', opt_P['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt_P['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt_P['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt_P['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt_P, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt_P, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None assert val_loader is not None #### create model model_F = create_model(opt_F) #load pretrained model of SFTMD model_P = create_model(opt_P) model_C = create_model(opt_C) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model_P.resume_training( resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): if opt_P['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate, schedulers # model.update_learning_rate(current_step, warmup_iter=opt_P['train']['warmup_iter']) #### preprocessing for LR_img and kernel map prepro = util.SRMDPreprocessing(opt_P['scale'], pca_matrix, random=True, para_input=opt_P['code_length'], kernel=opt_P['kernel_size'], noise=False, cuda=True, sig=opt_P['sig'], sig_min=opt_P['sig_min'], sig_max=opt_P['sig_max'], rate_iso=1.0, scaling=3, rate_cln=0.2, noise_high=0.0) if (opt.SFM == 0): LR_img, ker_map = prepro(train_data['GT']) else: img_train_SFM = train_data['GT'].copy() for img_idx in range(train_data['GT'].size()[0]): img_numpy, mask = random_drop( train_data['GT'][img_idx, :, :, :].data.numpy(), mode=0) img_train_SFM[img_idx, :, :, :] = img_numpy LR_img, ker_map = prepro(img_train_SFM) #### training Predictor model_P.feed_data(LR_img, ker_map) model_P.optimize_parameters(current_step) P_visuals = model_P.get_current_visuals() est_ker_map = P_visuals['Batch_est_ker_map'] #### log of model_P if current_step % opt_P['logger']['print_freq'] == 0: logs = model_P.get_current_log() message = 'Predictor <epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model_P.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt_P['use_tb_logger'] and 'debug' not in opt_P['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) #### training Corrector for step in range(opt_C['step']): # test SFTMD for corresponding SR image model_F.feed_data(train_data, LR_img, est_ker_map) model_F.test() F_visuals = model_F.get_current_visuals() SR_img = F_visuals['Batch_SR'] # Test SFTMD to produce SR images # train corrector given SR image and estimated kernel map model_C.feed_data(SR_img, est_ker_map, ker_map) model_C.optimize_parameters(current_step) C_visuals = model_C.get_current_visuals() est_ker_map = C_visuals['Batch_est_ker_map'] #### log of model_C if current_step % opt_C['logger']['print_freq'] == 0: logs = model_C.get_current_log() message = 'Corrector <epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model_C.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt_C['use_tb_logger'] and 'debug' not in opt_C[ 'name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) #### save models and training states if current_step % opt_P['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model_P.save(current_step) model_P.save_training_state(epoch, current_step) model_C.save(current_step) model_C.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model_P.save('latest') model_C.save('latest') logger.info('End of Predictor and Corrector training.') tb_logger.close()
def main(): ymlPath = './options/df2k/train_bicubic_noise.yml' parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, default=ymlPath, help='Path to option YMAL file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() if opt['path'].get('resume_state', None): device_id = torch.cuda.current_device() resume_state = torch.load(opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options print('resume_state=', opt['path']['resume_state']) else: resume_state = None if rank <= 0: if resume_state is None: util.mkdir_and_rename( opt['path']['experiments_root']) # rename experiment folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]'.format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') opt = option.dict_to_nonedict(opt) seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True dataset_ratio = 1 # enlarge the size of each epoch train_loader = None val_loader = None for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None model = create_model(opt) if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): if opt['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break model.feed_data(train_data) model.optimize_parameters(current_step) if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) loss_and_index = epoch-start_epoch if current_step % opt['train']['val_freq'] == 0 and rank <= 0 and val_loader is not None: avg_psnr = val_pix_err_f = val_pix_err_nf = val_mean_color_err = avg_ssim = 0.0 idx = 0 for val_data in val_loader: if idx > 2: break idx += 1 img_name = os.path.splitext(os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 print("!!!Now start to save Image!!!!") save_img_path = os.path.join(img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_ssim += util.calculate_ssim(cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx avg_ssim = avg_ssim / idx val_pix_err_f /= idx val_pix_err_nf /= idx val_mean_color_err /= idx logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger.info('# Validation # SSIM: {:.4e}'.format(avg_ssim)) logger_val = logging.getLogger('val') # validation logger logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format( epoch, current_step, avg_psnr)) if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger.add_scalar('val_pix_err_f', val_pix_err_f, current_step) tb_logger.add_scalar('val_pix_err_nf', val_pix_err_nf, current_step) tb_logger.add_scalar('val_mean_color_err', val_mean_color_err, current_step) if current_step % opt['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option JSON file.') opt = option.parse(parser.parse_args().opt, is_train=True) opt = option.dict_to_nonedict(opt) # Convert to NoneDict, which return None for missing key. # train from scratch OR resume training if opt['path']['resume_state']: if os.path.isdir(opt['path']['resume_state']): import glob resume_state_path = util.sorted_nicely(glob.glob(os.path.normpath(opt['path']['resume_state']) + '/*.state'))[-1] else: resume_state_path = opt['path']['resume_state'] resume_state = torch.load(resume_state_path) else: # training from scratch resume_state = None util.mkdir_and_rename(opt['path']['experiments_root']) # rename old folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True) util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO) logger = logging.getLogger('base') if resume_state: logger.info('Set [resume_state] to ' + resume_state_path) logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: from tensorboardX import SummaryWriter try: tb_logger = SummaryWriter(logdir='../tb_logger/' + opt['name']) #for version tensorboardX >= 1.7 except: tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) #for version tensorboardX < 1.6 # random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) # if the model does not change and input sizes remain the same during training then there may be benefit # from setting torch.backends.cudnn.benchmark = True, otherwise it may stall training torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True # create train and val dataloader val_loader = False for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) batch_size = dataset_opt.get('batch_size', 4) virtual_batch_size = dataset_opt.get('virtual_batch_size', batch_size) virtual_batch_size = virtual_batch_size if virtual_batch_size > batch_size else batch_size train_size = int(math.ceil(len(train_set) / batch_size)) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) train_loader = create_dataloader(train_set, dataset_opt) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt) logger.info('Number of val images in [{:s}]: {:d}'.format(dataset_opt['name'], len(val_set))) else: raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None # create model model = create_model(opt) # resume training if resume_state: start_epoch = resume_state['epoch'] current_step = resume_state['iter'] virtual_step = current_step * virtual_batch_size / batch_size \ if virtual_batch_size and virtual_batch_size > batch_size else current_step model.resume_training(resume_state) # handle optimizers and schedulers model.update_schedulers(opt['train']) # updated schedulers in case JSON configuration has changed del resume_state # start the iteration time when resuming t0 = time.time() else: current_step = 0 virtual_step = 0 start_epoch = 0 # training logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step)) try: for epoch in range(start_epoch, total_epochs*(virtual_batch_size//batch_size)): for n, train_data in enumerate(train_loader,start=1): if virtual_step == 0: # first iteration start time t0 = time.time() virtual_step += 1 take_step = False if virtual_step > 0 and virtual_step * batch_size % virtual_batch_size == 0: current_step += 1 take_step = True if current_step > total_iters: break # training model.feed_data(train_data) model.optimize_parameters(virtual_step) # log if current_step % opt['logger']['print_freq'] == 0 and take_step: # iteration end time t1 = time.time() logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}, i_time: {:.4f} sec.> '.format( epoch, current_step, model.get_current_learning_rate(current_step), (t1 - t0)) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v, current_step) logger.info(message) # # start time for next iteration # t0 = time.time() # update learning rate if model.optGstep and model.optDstep and take_step: model.update_learning_rate(current_step, warmup_iter=opt['train'].get('warmup_iter', -1)) # save models and training states (changed to save models before validation) if current_step % opt['logger']['save_checkpoint_freq'] == 0 and take_step: if model.swa: model.save(current_step, opt['logger']['overwrite_chkp'], loader=train_loader) else: model.save(current_step, opt['logger']['overwrite_chkp']) model.save_training_state(epoch + (n >= len(train_loader)), current_step, opt['logger']['overwrite_chkp']) logger.info('Models and training states saved.') # validation if val_loader and current_step % opt['train']['val_freq'] == 0 and take_step: val_sr_imgs_list = [] val_gt_imgs_list = [] val_metrics = metrics.MetricsDict(metrics=opt['train'].get('metrics', None)) for val_data in val_loader: img_name = os.path.splitext(os.path.basename(val_data['LR_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test(val_data) """ Get Visuals """ visuals = model.get_current_visuals() sr_img = tensor2np(visuals['SR'], denormalize=opt['datasets']['train']['znorm']) gt_img = tensor2np(visuals['HR'], denormalize=opt['datasets']['train']['znorm']) # Save SR images for reference if opt['train']['overwrite_val_imgs']: save_img_path = os.path.join(img_dir, '{:s}.png'.format(\ img_name)) else: save_img_path = os.path.join(img_dir, '{:s}_{:d}.png'.format(\ img_name, current_step)) # save single images or lr / sr comparison if opt['train']['val_comparison']: lr_img = tensor2np(visuals['LR'], denormalize=opt['datasets']['train']['znorm']) util.save_img_comp(lr_img, sr_img, save_img_path) else: util.save_img(sr_img, save_img_path) """ Get Metrics # TODO: test using tensor based metrics (batch) instead of numpy. """ crop_size = opt['scale'] val_metrics.calculate_metrics(sr_img, gt_img, crop_size = crop_size) #, only_y=True) avg_metrics = val_metrics.get_averages() del val_metrics # log logger_m = '' for r in avg_metrics: #print(r) formatted_res = r['name'].upper()+': {:.5g}, '.format(r['average']) logger_m += formatted_res logger.info('# Validation # '+logger_m[:-2]) logger_val = logging.getLogger('val') # validation logger logger_val.info('<epoch:{:3d}, iter:{:8,d}> '.format(epoch, current_step)+logger_m[:-2]) # memory_usage = torch.cuda.memory_allocated()/(1024.0 ** 3) # in GB # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: for r in avg_metrics: tb_logger.add_scalar(r['name'], r['average'], current_step) # # reset time for next iteration to skip the validation time from calculation # t0 = time.time() if current_step % opt['logger']['print_freq'] == 0 and take_step or \ (val_loader and current_step % opt['train']['val_freq'] == 0 and take_step): # reset time for next iteration to skip the validation time from calculation t0 = time.time() logger.info('Saving the final model.') if model.swa: model.save('latest', loader=train_loader) else: model.save('latest') logger.info('End of training.') except KeyboardInterrupt: # catch a KeyboardInterrupt and save the model and state to resume later if model.swa: model.save(current_step, True, loader=train_loader) else: model.save(current_step, True) model.save_training_state(epoch + (n >= len(train_loader)), current_step, True) logger.info('Training interrupted. Latest models and training states saved.')
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YMAL file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model model = create_model(opt) flops, params = get_model_complexity_info(model.netG, (3, 480, 480), as_strings=True, print_per_layer_stat=True, verbose=True) print('{:<30} {:<8}'.format('Computational complexity: ', flops)) print('{:<30} {:<8}'.format('Number of parameters: ', params))
def main(opt, resume_state, logger): # resume if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # Initialize visdom if opt['use_vis_logger'] and 'debug' not in opt['name']: vis, vis_plots = init_vis(opt) else: vis, vis_plots = None, None # random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) # create train and val dataloader train_loader, val_loader, total_iters, total_epochs = get_dataloader( opt, logger) # cudnn torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True # create model model = create_model(opt) # resume training if resume_state: start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 # training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) # highest_psnr, highest_ssim for model saving if opt['dataset_type'] == 'reduced': highest_psnr = highest_ssim = 0 elif opt['dataset_type'] == 'full': lowest_D_labmda = lowest_D_s = 1 highest_qnr = 0 for epoch in range(start_epoch, total_epochs): for _, train_data in enumerate(train_loader): # current step current_step += 1 if current_step > total_iters: break # training train(opt, train_data, current_step, epoch, model, logger, vis, vis_plots) # validation if current_step % opt['train']['val_freq'] == 0: # it will write generated images to disk if opt['dataset_type'] == 'reduced': this_psnr, this_ssim = valid(opt, val_loader, current_step, epoch, model, logger, vis, vis_plots) # storage images take lots of time # storage(opt, # val_loader, # current_step, # model, # store=(this_psnr, highest_psnr, this_ssim, # highest_ssim)) elif opt['dataset_type'] == 'full': this_D_lambda, this_D_s, this_qnr = valid( opt, val_loader, current_step, epoch, model, logger, vis, vis_plots) # storage images take lots of time # storage(opt, # val_loader, # current_step, # model, # store=(this_D_lambda, lowest_D_labmda, this_D_s, # lowest_D_s, this_qnr, highest_qnr)) # save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: if opt['dataset_type'] == 'reduced': if this_psnr > highest_psnr: logger.info( 'Saving models and training states with highest psnr.' ) # remove the old old_model = glob.glob(opt['path']['models'] + '/*_psnr.pth') old_state = glob.glob(opt['path']['training_state'] + '/*_psnr.state') for ele in old_model: os.remove(ele) for ele in old_state: os.remove(ele) # save the new model.save(current_step, this_psnr, 'psnr') model.save_training_state(epoch, current_step, 'psnr') highest_psnr = this_psnr if this_ssim > highest_ssim: logger.info( 'Saving models and training states with highest ssim.' ) # remove the old old_model = glob.glob(opt['path']['models'] + '/*_ssim.pth') old_state = glob.glob(opt['path']['training_state'] + '/*_ssim.state') for ele in old_model: os.remove(ele) for ele in old_state: os.remove(ele) model.save(current_step, this_ssim, 'ssim') model.save_training_state(epoch, current_step, 'ssim') highest_ssim = this_ssim elif opt['dataset_type'] == 'full': if this_D_lambda < lowest_D_labmda: logger.info( 'Saving models and training states with lowest D_lambda.' ) # remove the old old_model = glob.glob(opt['path']['models'] + '/*_D_lambda.pth') old_state = glob.glob(opt['path']['training_state'] + '/*_D_lambda.state') for ele in old_model: os.remove(ele) for ele in old_state: os.remove(ele) # save the new model.save(current_step, this_D_lambda, 'D_lambda') model.save_training_state(epoch, current_step, 'D_lambda') lowest_D_labmda = this_D_lambda if this_D_s < lowest_D_s: logger.info( 'Saving models and training states with lowest D_s.' ) # remove the old old_model = glob.glob(opt['path']['models'] + '/*_D_s.pth') old_state = glob.glob(opt['path']['training_state'] + '/*_D_s.state') for ele in old_model: os.remove(ele) for ele in old_state: os.remove(ele) model.save(current_step, this_D_s, 'D_s') model.save_training_state(epoch, current_step, 'D_s') lowest_D_s = this_D_s if this_qnr > highest_qnr: logger.info( 'Saving models and training states with higest QNR' ) # remove the old old_model = glob.glob(opt['path']['models'] + '/*_qnr.pth') old_state = glob.glob(opt['path']['training_state'] + '/*_qnr.state') for ele in old_model: os.remove(ele) for ele in old_state: os.remove(ele) model.save(current_step, this_qnr, 'qnr') model.save_training_state(epoch, current_step, 'qnr') highest_qnr = this_qnr # save the last state # logger.info('Saving the final model.') # model.save('latest') logger.info('End of training.')
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YMAL file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) util.mkdirs('../experiments/pretrained_models/') # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch total_epochs = 100000 total_iters = int(opt['train']['niter']) # total_epochs = total_iters for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model model = create_model(opt) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): # if epoch % opt['train']['redata_freq'] == 0: # filename = random.choice(trainlist) # try: # [train_loader, _] = switch_data(opt,filename,rank,logger,total_epochs,total_iters,True) # except: # print('Data error') # filename = random.choice(trainlist) # continue if opt['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) # #### test input data # print(train_data['GT'].shape) # print(train_data['LQ'].shape) # LQ_data = train_data['LQ'].numpy() # GT_data = train_data['GT'].numpy() # LQ_data = LQ_data.transpose((0,1,3,4,2)) # GT_data = GT_data.transpose((0,1,3,4,2)) # cv2.imwrite('../01.jpg',LQ_data[0][0]*255) # cv2.imwrite('../02.jpg',LQ_data[0][1]*255) # cv2.imwrite('../03.jpg',LQ_data[0][2]*255) # cv2.imwrite('../04.jpg',LQ_data[0][3]*255) # cv2.imwrite('../05.jpg',LQ_data[0][4]*255) # cv2.imwrite('../GT.jpg',GT_data[0][0]*255) # print(s) #### training model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) # validation if current_step % opt['train']['val_freq'] == 0 and rank <= 0: avg_psnr = 0.0 # lq_psnr = 0.0 dpsnr = 0.0 idx = 0 # filename = random.choice(vallist) # filename = vallist[0] # [_, val_loader] = switch_data(opt,filename,rank,logger,total_epochs,total_iters, False) for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images']) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 lq_img = util.tensor2img(visuals['LQ']) # uint8 # Save SR images for reference save_img_path = os.path.join( img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) if opt['train']['save_test_img']: if opt['train']['save_test_img_num'] > 0: if idx < opt['train']['save_test_img_num']: util.save_img(sr_img, save_img_path) else: util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. lq_img = lq_img / 255. lq_psnr = util.calculate_psnr(lq_img * 255., gt_img * 255.) cur_psnr = util.calculate_psnr(sr_img * 255., gt_img * 255.) avg_psnr += cur_psnr dpsnr += cur_psnr - lq_psnr avg_psnr = avg_psnr / idx lq_psnr = lq_psnr / idx dpsnr = dpsnr / idx # print(avg_psnr) # print(lq_psnr) # print(s) # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger.info('# Validation # DPSNR: {:.4e}'.format(dpsnr)) logger_val = logging.getLogger('val') # validation logger logger_val.info( '<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}, dpsnr: {:.4e}'. format(epoch, current_step, avg_psnr, dpsnr)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger.add_scalar('dpsnr', dpsnr, current_step) #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### distributed training settings opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers #if resume_state is None: # util.mkdir_and_rename(opt['path']['experiments_root']) # rename experiment folder if exists # util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' # and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 1 #200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = TrainDataset() train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = ValidDataset() train_size = int( math.ceil(len(val_set) / dataset_opt['batch_size'])) val_sampler = DistIterSampler(val_set, world_size, rank, dataset_ratio) val_loader = create_dataloader(val_set, dataset_opt, opt, val_sampler) logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model model = create_model(opt) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) #### training model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format( epoch, current_step) for v in model.get_current_learning_rate(): message += '{:.3e},'.format(v) message += ')] ' for k, v in logs.items(): message += '{:s}: {:.3f} '.format(k, v) logger.info(message) print(message) #### save models and training states if current_step % 1000 == 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) #### validation # multi-GPU testing psnr_rlt = [] # with border and center frames for _, val_data in enumerate(val_loader): model.feed_data(val_data) model.test() # calculate PSNR psnr_rlt.append(torch.mean(model.get_psnr()).item()) psnr_total_avg = np.array(psnr_rlt).mean() log_s = '# Validation # PSNR: {:.4f}:'.format(psnr_total_avg) logger.info(log_s) print(log_s) logger.info('Saving the final model.') model.save('latest') logger.info('End of training.') tb_logger.close()
def IKC_train(opt_P, opt_C, opt_F, rank, world_size, pca_matrix): ###### Predictor&Corrector train ###### #### loading resume state if exists if opt_P['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load(opt_P['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt_P, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0-7) if resume_state is None: # Predictor path util.mkdir_and_rename( opt_P['path']['experiments_root']) # rename experiment folder if exists util.mkdirs((path for key, path in opt_P['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # Corrector path util.mkdir_and_rename( opt_C['path']['experiments_root']) # rename experiment folder if exists util.mkdirs((path for key, path in opt_C['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt_P['path']['log'], 'train_' + opt_P['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt_P['path']['log'], 'val_' + opt_P['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt_P)) # tensorboard logger if opt_P['use_tb_logger'] and 'debug' not in opt_P['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]'.format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt_P['name']) else: util.setup_logger('base', opt_P['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') #### random seed seed = opt_P['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt_P['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt_P['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt_P['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt_P, train_sampler) if rank <= 0: logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt_P, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None assert val_loader is not None #### create model model_F = create_model(opt_F) #load pretrained model of SFTMD model_P = create_model(opt_P) model_C = create_model(opt_C) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model_P.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): if opt_P['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate, schedulers # model.update_learning_rate(current_step, warmup_iter=opt_P['train']['warmup_iter']) #### preprocessing for LR_img and kernel map prepro = util.SRMDPreprocessing(opt_P['scale'], pca_matrix, para_input=opt_P['code_length'], kernel=opt_P['kernel_size'], noise=False, cuda=True, sig_min=0.2, sig_max=4.0, rate_iso=1.0, scaling=3, rate_cln=0.2, noise_high=0.0) LR_img, ker_map = prepro(train_data['GT']) #### training Predictor model_P.feed_data(LR_img, ker_map) model_P.optimize_parameters(current_step) P_visuals = model_P.get_current_visuals() est_ker_map = P_visuals['Batch_est_ker_map'] #### log of model_P if current_step % opt_P['logger']['print_freq'] == 0: logs = model_P.get_current_log() message = 'Predictor <epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model_P.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt_P['use_tb_logger'] and 'debug' not in opt_P['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) #### training Corrector for step in range(opt_C['step']): # test SFTMD for corresponding SR image model_F.feed_data(train_data, LR_img, est_ker_map) model_F.test() F_visuals = model_F.get_current_visuals() SR_img = F_visuals['Batch_SR'] # Test SFTMD to produce SR images # train corrector given SR image and estimated kernel map model_C.feed_data(SR_img, est_ker_map, ker_map) model_C.optimize_parameters(current_step) C_visuals = model_C.get_current_visuals() est_ker_map = C_visuals['Batch_est_ker_map'] #### log of model_C if current_step % opt_C['logger']['print_freq'] == 0: logs = model_C.get_current_log() message = 'Corrector <epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model_C.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt_C['use_tb_logger'] and 'debug' not in opt_C['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) # validation, to produce ker_map_list(fake) if current_step % opt_P['train']['val_freq'] == 0 and rank <= 0: avg_psnr = 0.0 idx = 0 for _, val_data in enumerate(val_loader): prepro = util.SRMDPreprocessing(opt_P['scale'], pca_matrix, para_input=opt_P['code_length'], kernel=opt_P['kernel_size'], noise=False, cuda=True, sig_min=0.2, sig_max=4.0, rate_iso=1.0, scaling=3, rate_cln=0.2, noise_high=0.0) LR_img, ker_map = prepro(val_data['GT']) single_img_psnr = 0.0 # valid Predictor model_P.feed_data(LR_img, ker_map) model_P.test() P_visuals = model_P.get_current_visuals() est_ker_map = P_visuals['Batch_est_ker_map'] for step in range(opt_C['step']): step += 1 idx += 1 model_F.feed_data(val_data, LR_img, est_ker_map) model_F.test() F_visuals = model_F.get_current_visuals() SR_img = F_visuals['Batch_SR'] # Test SFTMD to produce SR images model_C.feed_data(SR_img, est_ker_map, ker_map) model_C.test() C_visuals = model_C.get_current_visuals() est_ker_map = C_visuals['Batch_est_ker_map'] sr_img = util.tensor2img(F_visuals['SR']) # uint8 gt_img = util.tensor2img(F_visuals['GT']) # uint8 # Save SR images for reference img_name = os.path.splitext(os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt_P['path']['val_images'], img_name) # img_dir = os.path.join(opt_F['path']['val_images'], str(current_step), '_', str(step)) util.mkdir(img_dir) save_img_path = os.path.join(img_dir, '{:s}_{:d}_{:d}.png'.format(img_name, current_step, step)) util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt_P['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] step_psnr = util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) logger.info( '<epoch:{:3d}, iter:{:8,d}, step:{:3d}> img:{:s}, psnr: {:.4f}'.format(epoch, current_step, step, img_name, step_psnr)) single_img_psnr += step_psnr avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_signle_img_psnr = single_img_psnr / step logger.info( '<epoch:{:3d}, iter:{:8,d}, step:{:3d}> img:{:s}, average psnr: {:.4f}'.format(epoch, current_step, step, img_name, avg_signle_img_psnr)) avg_psnr = avg_psnr / idx # log logger.info('# Validation # PSNR: {:.4f}'.format(avg_psnr)) logger_val = logging.getLogger('val') # validation logger logger_val.info('<epoch:{:3d}, iter:{:8,d}, step:{:3d}> psnr: {:.4f}'.format(epoch, current_step, step, avg_psnr)) # tensorboard logger if opt_P['use_tb_logger'] and 'debug' not in opt_P['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) #### save models and training states if current_step % opt_P['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model_P.save(current_step) model_P.save_training_state(epoch, current_step) model_C.save(current_step) model_C.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model_P.save('latest') model_C.save('latest') logger.info('End of Predictor training.') tb_logger.close()
def main(): ############################################ # # set options # ############################################ parser = argparse.ArgumentParser() parser.add_argument('--opt', type=str, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) ############################################ # # distributed training settings # ############################################ if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() print("Rank:", rank) print("------------------DIST-------------------------") ############################################ # # loading resume state if exists # ############################################ if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None ############################################ # # mkdir and loggers # ############################################ if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('base_val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger_val = logging.getLogger('base_val') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_', level=logging.INFO, screen=True) print("set train log") util.setup_logger('base_val', opt['path']['log'], 'val_', level=logging.INFO, screen=True) print("set val log") logger = logging.getLogger('base') logger_val = logging.getLogger('base_val') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True ############################################ # # create train and val dataloader # ############################################ #### # dataset_ratio = 200 # enlarge the size of each epoch, todo: what it is dataset_ratio = 1 # enlarge the size of each epoch, todo: what it is for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) # total_iters = int(opt['train']['niter']) # total_epochs = int(math.ceil(total_iters / train_size)) total_iters = train_size total_epochs = int(opt['train']['epoch']) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) # total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio))) total_epochs = int(opt['train']['epoch']) if opt['train']['enable'] == False: total_epochs = 1 else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None ############################################ # # create model # ############################################ #### model = create_model(opt) print("Model Created! ") #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 print("Not Resume Training") ############################################ # # training # ############################################ #### #### logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) Avg_train_loss = AverageMeter() # total if (opt['train']['pixel_criterion'] == 'cb+ssim'): Avg_train_loss_pix = AverageMeter() Avg_train_loss_ssim = AverageMeter() elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'): Avg_train_loss_pix = AverageMeter() Avg_train_loss_ssim = AverageMeter() Avg_train_loss_vmaf = AverageMeter() elif (opt['train']['pixel_criterion'] == 'ssim'): Avg_train_loss_ssim = AverageMeter() elif (opt['train']['pixel_criterion'] == 'msssim'): Avg_train_loss_msssim = AverageMeter() elif (opt['train']['pixel_criterion'] == 'cb+msssim'): Avg_train_loss_pix = AverageMeter() Avg_train_loss_msssim = AverageMeter() saved_total_loss = 10e10 saved_total_PSNR = -1 for epoch in range(start_epoch, total_epochs): ############################################ # # Start a new epoch # ############################################ # Turn into training mode #model = model.train() # reset total loss Avg_train_loss.reset() current_step = 0 if (opt['train']['pixel_criterion'] == 'cb+ssim'): Avg_train_loss_pix.reset() Avg_train_loss_ssim.reset() elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'): Avg_train_loss_pix.reset() Avg_train_loss_ssim.reset() Avg_train_loss_vmaf.reset() elif (opt['train']['pixel_criterion'] == 'ssim'): Avg_train_loss_ssim = AverageMeter() elif (opt['train']['pixel_criterion'] == 'msssim'): Avg_train_loss_msssim = AverageMeter() elif (opt['train']['pixel_criterion'] == 'cb+msssim'): Avg_train_loss_pix = AverageMeter() Avg_train_loss_msssim = AverageMeter() if opt['dist']: train_sampler.set_epoch(epoch) for train_idx, train_data in enumerate(train_loader): if 'debug' in opt['name']: img_dir = os.path.join(opt['path']['train_images']) util.mkdir(img_dir) LQ = train_data['LQs'] GT = train_data['GT'] GT_img = util.tensor2img(GT) # uint8 save_img_path = os.path.join( img_dir, '{:4d}_{:s}.png'.format(train_idx, 'debug_GT')) util.save_img(GT_img, save_img_path) for i in range(5): LQ_img = util.tensor2img(LQ[0, i, ...]) # uint8 save_img_path = os.path.join( img_dir, '{:4d}_{:s}_{:1d}.png'.format(train_idx, 'debug_LQ', i)) util.save_img(LQ_img, save_img_path) if (train_idx >= 3): break if opt['train']['enable'] == False: message_train_loss = 'None' break current_step += 1 if current_step > total_iters: print("Total Iteration Reached !") break #### update learning rate if opt['train']['lr_scheme'] == 'ReduceLROnPlateau': pass else: model.update_learning_rate( current_step, warmup_iter=opt['train']['warmup_iter']) #### training model.feed_data(train_data) # if opt['train']['lr_scheme'] == 'ReduceLROnPlateau': # model.optimize_parameters_without_schudlue(current_step) # else: model.optimize_parameters(current_step) if (opt['train']['pixel_criterion'] == 'cb+ssim'): Avg_train_loss.update(model.log_dict['total_loss'], 1) Avg_train_loss_pix.update(model.log_dict['l_pix'], 1) Avg_train_loss_ssim.update(model.log_dict['ssim_loss'], 1) elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'): Avg_train_loss.update(model.log_dict['total_loss'], 1) Avg_train_loss_pix.update(model.log_dict['l_pix'], 1) Avg_train_loss_ssim.update(model.log_dict['ssim_loss'], 1) Avg_train_loss_vmaf.update(model.log_dict['vmaf_loss'], 1) elif (opt['train']['pixel_criterion'] == 'ssim'): Avg_train_loss.update(model.log_dict['total_loss'], 1) Avg_train_loss_ssim.update(model.log_dict['ssim_loss'], 1) elif (opt['train']['pixel_criterion'] == 'msssim'): Avg_train_loss.update(model.log_dict['total_loss'], 1) Avg_train_loss_msssim.update(model.log_dict['msssim_loss'], 1) elif (opt['train']['pixel_criterion'] == 'cb+msssim'): Avg_train_loss.update(model.log_dict['total_loss'], 1) Avg_train_loss_pix.update(model.log_dict['l_pix'], 1) Avg_train_loss_msssim.update(model.log_dict['msssim_loss'], 1) else: Avg_train_loss.update(model.log_dict['l_pix'], 1) # add total train loss if (opt['train']['pixel_criterion'] == 'cb+ssim'): message_train_loss = ' pix_avg_loss: {:.4e}'.format( Avg_train_loss_pix.avg) message_train_loss += ' ssim_avg_loss: {:.4e}'.format( Avg_train_loss_ssim.avg) message_train_loss += ' total_avg_loss: {:.4e}'.format( Avg_train_loss.avg) elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'): message_train_loss = ' pix_avg_loss: {:.4e}'.format( Avg_train_loss_pix.avg) message_train_loss += ' ssim_avg_loss: {:.4e}'.format( Avg_train_loss_ssim.avg) message_train_loss += ' vmaf_avg_loss: {:.4e}'.format( Avg_train_loss_vmaf.avg) message_train_loss += ' total_avg_loss: {:.4e}'.format( Avg_train_loss.avg) elif (opt['train']['pixel_criterion'] == 'ssim'): message_train_loss = ' ssim_avg_loss: {:.4e}'.format( Avg_train_loss_ssim.avg) message_train_loss += ' total_avg_loss: {:.4e}'.format( Avg_train_loss.avg) elif (opt['train']['pixel_criterion'] == 'msssim'): message_train_loss = ' msssim_avg_loss: {:.4e}'.format( Avg_train_loss_msssim.avg) message_train_loss += ' total_avg_loss: {:.4e}'.format( Avg_train_loss.avg) elif (opt['train']['pixel_criterion'] == 'cb+msssim'): message_train_loss = ' pix_avg_loss: {:.4e}'.format( Avg_train_loss_pix.avg) message_train_loss += ' msssim_avg_loss: {:.4e}'.format( Avg_train_loss_msssim.avg) message_train_loss += ' total_avg_loss: {:.4e}'.format( Avg_train_loss.avg) else: message_train_loss = ' train_avg_loss: {:.4e}'.format( Avg_train_loss.avg) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format( epoch, current_step) for v in model.get_current_learning_rate(): message += '{:.3e},'.format(v) message += ')] ' for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) message += message_train_loss if rank <= 0: logger.info(message) ############################################ # # end of one epoch, save epoch model # ############################################ #### save models and training states # if current_step % opt['logger']['save_checkpoint_freq'] == 0: # if rank <= 0: # logger.info('Saving models and training states.') # model.save(current_step) # model.save('latest') # # model.save_training_state(epoch, current_step) # # todo delete previous weights # previous_step = current_step - opt['logger']['save_checkpoint_freq'] # save_filename = '{}_{}.pth'.format(previous_step, 'G') # save_path = os.path.join(opt['path']['models'], save_filename) # if os.path.exists(save_path): # os.remove(save_path) if epoch == 1: save_filename = '{:04d}_{}.pth'.format(0, 'G') save_path = os.path.join(opt['path']['models'], save_filename) if os.path.exists(save_path): os.remove(save_path) save_filename = '{:04d}_{}.pth'.format(epoch - 1, 'G') save_path = os.path.join(opt['path']['models'], save_filename) if os.path.exists(save_path): os.remove(save_path) if rank <= 0: logger.info('Saving models and training states.') save_filename = '{:04d}'.format(epoch) model.save(save_filename) # model.save('latest') # model.save_training_state(epoch, current_step) ############################################ # # end of one epoch, do validation # ############################################ #### validation #if opt['datasets'].get('val', None) and current_step % opt['train']['val_freq'] == 0: if opt['datasets'].get('val', None): if opt['model'] in [ 'sr', 'srgan' ] and rank <= 0: # image restoration validation # does not support multi-GPU validation pbar = util.ProgressBar(len(val_loader)) avg_psnr = 0. idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # Save SR images for reference save_img_path = os.path.join( img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) #util.save_img(sr_img, save_img_path) # calculate PSNR sr_img, gt_img = util.crop_border([sr_img, gt_img], opt['scale']) avg_psnr += util.calculate_psnr(sr_img, gt_img) pbar.update('Test {}'.format(img_name)) avg_psnr = avg_psnr / idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) else: # video restoration validation if opt['dist']: # todo : multi-GPU testing psnr_rlt = {} # with border and center frames psnr_rlt_avg = {} psnr_total_avg = 0. ssim_rlt = {} # with border and center frames ssim_rlt_avg = {} ssim_total_avg = 0. val_loss_rlt = {} val_loss_rlt_avg = {} val_loss_total_avg = 0. if rank == 0: pbar = util.ProgressBar(len(val_set)) for idx in range(rank, len(val_set), world_size): print('idx', idx) if 'debug' in opt['name']: if (idx >= 3): break val_data = val_set[idx] val_data['LQs'].unsqueeze_(0) val_data['GT'].unsqueeze_(0) folder = val_data['folder'] idx_d, max_idx = val_data['idx'].split('/') idx_d, max_idx = int(idx_d), int(max_idx) if psnr_rlt.get(folder, None) is None: psnr_rlt[folder] = torch.zeros(max_idx, dtype=torch.float32, device='cuda') if ssim_rlt.get(folder, None) is None: ssim_rlt[folder] = torch.zeros(max_idx, dtype=torch.float32, device='cuda') if val_loss_rlt.get(folder, None) is None: val_loss_rlt[folder] = torch.zeros( max_idx, dtype=torch.float32, device='cuda') # tmp = torch.zeros(max_idx, dtype=torch.float32, device='cuda') model.feed_data(val_data) # model.test() # model.test_stitch() if opt['stitch'] == True: model.test_stitch() else: model.test() # large GPU memory # visuals = model.get_current_visuals() visuals = model.get_current_visuals( save=True, name='{}_{}'.format(folder, idx), save_path=opt['path']['val_images']) rlt_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # calculate PSNR psnr = util.calculate_psnr(rlt_img, gt_img) psnr_rlt[folder][idx_d] = psnr # calculate SSIM ssim = util.calculate_ssim(rlt_img, gt_img) ssim_rlt[folder][idx_d] = ssim # calculate Val loss val_loss = model.get_loss() val_loss_rlt[folder][idx_d] = val_loss logger.info( '{}_{:02d} PSNR: {:.4f}, SSIM: {:.4f}'.format( folder, idx, psnr, ssim)) if rank == 0: for _ in range(world_size): pbar.update('Test {} - {}/{}'.format( folder, idx_d, max_idx)) # # collect data for _, v in psnr_rlt.items(): dist.reduce(v, 0) for _, v in ssim_rlt.items(): dist.reduce(v, 0) for _, v in val_loss_rlt.items(): dist.reduce(v, 0) dist.barrier() if rank == 0: psnr_rlt_avg = {} psnr_total_avg = 0. for k, v in psnr_rlt.items(): psnr_rlt_avg[k] = torch.mean(v).cpu().item() psnr_total_avg += psnr_rlt_avg[k] psnr_total_avg /= len(psnr_rlt) log_s = '# Validation # PSNR: {:.4e}:'.format( psnr_total_avg) for k, v in psnr_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) # ssim ssim_rlt_avg = {} ssim_total_avg = 0. for k, v in ssim_rlt.items(): ssim_rlt_avg[k] = torch.mean(v).cpu().item() ssim_total_avg += ssim_rlt_avg[k] ssim_total_avg /= len(ssim_rlt) log_s = '# Validation # PSNR: {:.4e}:'.format( ssim_total_avg) for k, v in ssim_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) # added val_loss_rlt_avg = {} val_loss_total_avg = 0. for k, v in val_loss_rlt.items(): val_loss_rlt_avg[k] = torch.mean(v).cpu().item() val_loss_total_avg += val_loss_rlt_avg[k] val_loss_total_avg /= len(val_loss_rlt) log_l = '# Validation # Loss: {:.4e}:'.format( val_loss_total_avg) for k, v in val_loss_rlt_avg.items(): log_l += ' {}: {:.4e}'.format(k, v) logger.info(log_l) message = '' for v in model.get_current_learning_rate(): message += '{:.5e}'.format(v) logger_val.info( 'Epoch {:02d}, LR {:s}, PSNR {:.4f}, SSIM {:.4f} Train {:s}, Val Total Loss {:.4e}' .format(epoch, message, psnr_total_avg, ssim_total_avg, message_train_loss, val_loss_total_avg)) if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step) for k, v in psnr_rlt_avg.items(): tb_logger.add_scalar(k, v, current_step) # add val loss tb_logger.add_scalar('val_loss_avg', val_loss_total_avg, current_step) for k, v in val_loss_rlt_avg.items(): tb_logger.add_scalar(k, v, current_step) else: # Todo: our function One GPU pbar = util.ProgressBar(len(val_loader)) psnr_rlt = {} # with border and center frames psnr_rlt_avg = {} psnr_total_avg = 0. ssim_rlt = {} # with border and center frames ssim_rlt_avg = {} ssim_total_avg = 0. val_loss_rlt = {} val_loss_rlt_avg = {} val_loss_total_avg = 0. for val_inx, val_data in enumerate(val_loader): if 'debug' in opt['name']: if (val_inx >= 5): break folder = val_data['folder'][0] # idx_d = val_data['idx'].item() idx_d = val_data['idx'] # border = val_data['border'].item() if psnr_rlt.get(folder, None) is None: psnr_rlt[folder] = [] if ssim_rlt.get(folder, None) is None: ssim_rlt[folder] = [] if val_loss_rlt.get(folder, None) is None: val_loss_rlt[folder] = [] # process the black blank [B N C H W] print(val_data['LQs'].size()) H_S = val_data['LQs'].size(3) # 540 W_S = val_data['LQs'].size(4) # 960 print(H_S) print(W_S) blank_1_S = 0 blank_2_S = 0 print(val_data['LQs'][0, 2, 0, :, :].size()) for i in range(H_S): if not sum(val_data['LQs'][0, 2, 0, i, :]) == 0: blank_1_S = i - 1 # assert not sum(data_S[:, :, 0][i+1]) == 0 break for i in range(H_S): if not sum(val_data['LQs'][0, 2, 0, :, H_S - i - 1]) == 0: blank_2_S = (H_S - 1) - i - 1 # assert not sum(data_S[:, :, 0][blank_2_S-1]) == 0 break print('LQ :', blank_1_S, blank_2_S) if blank_1_S == -1: print('LQ has no blank') blank_1_S = 0 blank_2_S = H_S # val_data['LQs'] = val_data['LQs'][:,:,:,blank_1_S:blank_2_S,:] print("LQ", val_data['LQs'].size()) # end of process the black blank model.feed_data(val_data) if opt['stitch'] == True: model.test_stitch() else: model.test() # large GPU memory # process blank blank_1_L = blank_1_S << 2 blank_2_L = blank_2_S << 2 print(blank_1_L, blank_2_L) print(model.fake_H.size()) if not blank_1_S == 0: # model.fake_H = model.fake_H[:,:,blank_1_L:blank_2_L,:] model.fake_H[:, :, 0:blank_1_L, :] = 0 model.fake_H[:, :, blank_2_L:H_S, :] = 0 # end of # process blank visuals = model.get_current_visuals( save=True, name='{}_{:02d}'.format(folder, val_inx), save_path=opt['path']['val_images']) rlt_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # calculate PSNR psnr = util.calculate_psnr(rlt_img, gt_img) psnr_rlt[folder].append(psnr) # calculate SSIM ssim = util.calculate_ssim(rlt_img, gt_img) ssim_rlt[folder].append(ssim) # val loss val_loss = model.get_loss() val_loss_rlt[folder].append(val_loss.item()) logger.info( '{}_{:02d} PSNR: {:.4f}, SSIM: {:.4f}'.format( folder, val_inx, psnr, ssim)) pbar.update('Test {} - {}'.format(folder, idx_d)) # average PSNR for k, v in psnr_rlt.items(): psnr_rlt_avg[k] = sum(v) / len(v) psnr_total_avg += psnr_rlt_avg[k] psnr_total_avg /= len(psnr_rlt) log_s = '# Validation # PSNR: {:.4e}:'.format( psnr_total_avg) for k, v in psnr_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) # average SSIM for k, v in ssim_rlt.items(): ssim_rlt_avg[k] = sum(v) / len(v) ssim_total_avg += ssim_rlt_avg[k] ssim_total_avg /= len(ssim_rlt) log_s = '# Validation # SSIM: {:.4e}:'.format( ssim_total_avg) for k, v in ssim_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) # average VMAF # average Val LOSS for k, v in val_loss_rlt.items(): val_loss_rlt_avg[k] = sum(v) / len(v) val_loss_total_avg += val_loss_rlt_avg[k] val_loss_total_avg /= len(val_loss_rlt) log_l = '# Validation # Loss: {:.4e}:'.format( val_loss_total_avg) for k, v in val_loss_rlt_avg.items(): log_l += ' {}: {:.4e}'.format(k, v) logger.info(log_l) # toal validation log message = '' for v in model.get_current_learning_rate(): message += '{:.5e}'.format(v) logger_val.info( 'Epoch {:02d}, LR {:s}, PSNR {:.4f}, SSIM {:.4f} Train {:s}, Val Total Loss {:.4e}' .format(epoch, message, psnr_total_avg, ssim_total_avg, message_train_loss, val_loss_total_avg)) # end add if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step) for k, v in psnr_rlt_avg.items(): tb_logger.add_scalar(k, v, current_step) # tb_logger.add_scalar('ssim_avg', ssim_total_avg, current_step) # for k, v in ssim_rlt_avg.items(): # tb_logger.add_scalar(k, v, current_step) # add val loss tb_logger.add_scalar('val_loss_avg', val_loss_total_avg, current_step) for k, v in val_loss_rlt_avg.items(): tb_logger.add_scalar(k, v, current_step) ############################################ # # end of validation, save model # ############################################ # logger.info("Finished an epoch, Check and Save the model weights") # we check the validation loss instead of training loss. OK~ if saved_total_loss >= val_loss_total_avg: saved_total_loss = val_loss_total_avg #torch.save(model.state_dict(), args.save_path + "/best" + ".pth") model.save('best') logger.info( "Best Weights updated for decreased validation loss") else: logger.info( "Weights Not updated for undecreased validation loss") if saved_total_PSNR <= psnr_total_avg: saved_total_PSNR = psnr_total_avg model.save('bestPSNR') logger.info( "Best Weights updated for increased validation PSNR") else: logger.info( "Weights Not updated for unincreased validation PSNR") ############################################ # # end of one epoch, schedule LR # ############################################ # add scheduler todo if opt['train']['lr_scheme'] == 'ReduceLROnPlateau': for scheduler in model.schedulers: # scheduler.step(val_loss_total_avg) scheduler.step(val_loss_total_avg) if rank <= 0: logger.info('Saving the final model.') model.save('last') logger.info('End of training.') tb_logger.close()
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load(opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path']['experiments_root']) # rename experiment folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( f'You are using PyTorch {version}. Tensorboard will use [tensorboardX]') from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info(f'Random seed: {seed}') util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info(f'Number of train images: {len(train_set):,}, iters: {train_size:,}') logger.info(f'Total epochs needed: {total_epochs} for iters {total_iters:,}') elif phase == 'val': pass '''val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info(f'Number of val images in [{dataset_opt['name']}]: {len(val_set)}')''' else: raise NotImplementedError(f'Phase [{phase}] is not recognized.') assert train_loader is not None #### create model model = create_model(opt) #### resume training if resume_state: logger.info(f"Resuming training from epoch: {resume_state['epoch']}, iter: {resume_state['iter']}.") start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info(f'Start training from epoch: {start_epoch}, iter: {current_step}') for epoch in range(start_epoch, total_epochs + 1): if opt['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) #### training model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = f'<epoch:{epoch:3d}, iter:{current_step:8d}, lr:(' for i, v in enumerate(model.get_current_learning_rate()): if rank <= 0 : tb_logger.add_scalar(f'lr_{i}', v, current_step) message += f'{v:.3e},' message += ')>' loss=float(0) num=float(0) for k, v in logs.items(): if 'l_pix_' in k: num += 1 loss += v if num > 0: loss /= num message += f'loss: {loss:.4e} ' # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar('loss', loss, current_step) if rank <= 0: logger.info(message) #### validation if current_step % opt['train']['val_freq'] == 0 and rank <= 0: pass '''avg_psnr = 0.0 idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext(os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # Save SR images for reference save_img_path = os.path.join(img_dir, f'{img_name}_{current_step}.png') util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx # log logger.info(f'# Validation # PSNR: {avg_psnr:.4e}') logger_val = logging.getLogger('val') # validation logger logger_val.info(f'<epoch:{epoch:3d}, iter:{current_step:8d}> psnr: {avg_psnr:.4e}') # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step)''' #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model.save('latest') logger.info('End of training.') if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.close()
def main(): ###### SFTMD train ###### #### setup options parser = argparse.ArgumentParser() parser.add_argument('-opt_F', type=str, help='Path to option YMAL file of SFTMD_Net.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt_F = option.parse(args.opt_F, is_train=True) # convert to NoneDict, which returns None for missing keys opt_F = option.dict_to_nonedict(opt_F) #### random seed seed = opt_F['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) util.set_random_seed(seed) # create PCA matrix of enough kernel batch_ker = util.random_batch_kernel(batch=30000, l=21, sig_min=0.2, sig_max=4.0, rate_iso=1.0, scaling=3, tensor=False) print('batch kernel shape: {}'.format(batch_ker.shape)) b = np.size(batch_ker, 0) batch_ker = batch_ker.reshape((b, -1)) pca_matrix = util.PCA(batch_ker, k=10).float() print('PCA matrix shape: {}'.format(pca_matrix.shape)) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt_F['dist'] = False rank = -1 print('Disabled distributed training.') else: opt_F['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### loading resume state if exists if opt_F['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load(opt_F['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt_F, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: if resume_state is None: util.mkdir_and_rename( opt_F['path']['experiments_root']) # rename experiment folder if exists util.mkdirs((path for key, path in opt_F['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt_F['path']['log'], 'train_' + opt_F['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt_F['path']['log'], 'val_' + opt_F['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt_F)) # tensorboard logger if opt_F['use_tb_logger'] and 'debug' not in opt_F['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]'.format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt_F['name']) else: util.setup_logger('base', opt_F['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt_F['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt_F['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt_F['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt_F, train_sampler) if rank <= 0: logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt_F, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None assert val_loader is not None #### create model model_F = create_model(opt_F) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model_F.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): if opt_F['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### preprocessing for LR_img and kernel map prepro = util.SRMDPreprocessing(opt_F['scale'], pca_matrix, para_input=10, kernel=21, noise=False, cuda=True, sig_min=0.2, sig_max=4.0, rate_iso=1.0, scaling=3, rate_cln=0.2, noise_high=0.0) LR_img, ker_map = prepro(train_data['GT']) #### update learning rate, schedulers model_F.update_learning_rate(current_step, warmup_iter=opt_F['train']['warmup_iter']) #### training model_F.feed_data(train_data, LR_img, ker_map) model_F.optimize_parameters(current_step) #### log if current_step % opt_F['logger']['print_freq'] == 0: logs = model_F.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model_F.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt_F['use_tb_logger'] and 'debug' not in opt_F['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) # validation if current_step % opt_F['train']['val_freq'] == 0 and rank <= 0: avg_psnr = 0.0 idx = 0 for _, val_data in enumerate(val_loader): idx += 1 #### preprocessing for LR_img and kernel map prepro = util.SRMDPreprocessing(opt_F['scale'], pca_matrix, para_input=15, noise=False, cuda=True, sig_min=0.2, sig_max=4.0, rate_iso=1.0, scaling=3, rate_cln=0.2, noise_high=0.0) LR_img, ker_map = prepro(val_data['GT']) model_F.feed_data(val_data, LR_img, ker_map) model_F.test() visuals = model_F.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # Save SR images for reference img_name = os.path.splitext(os.path.basename(val_data['LQ_path'][0]))[0] #img_dir = os.path.join(opt_F['path']['val_images'], img_name) img_dir = os.path.join(opt_F['path']['val_images'], str(current_step)) util.mkdir(img_dir) save_img_path = os.path.join(img_dir,'{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt_F['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger_val = logging.getLogger('val') # validation logger logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format(epoch, current_step, avg_psnr)) # tensorboard logger if opt_F['use_tb_logger'] and 'debug' not in opt_F['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) #### save models and training states if current_step % opt_F['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model_F.save(current_step) model_F.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model_F.save('latest') logger.info('End of SFTMD training.')
def main(): # options parser = argparse.ArgumentParser() parser.add_argument("-opt", type=str, required=True, help="Path to option JSON file.") opt = option.parse(parser.parse_args().opt, is_train=True) opt = option.dict_to_nonedict( opt) # Convert to NoneDict, which return None for missing key. # train from scratch OR resume training if opt["path"]["resume_state"]: # resuming training resume_state = torch.load(opt["path"]["resume_state"]) else: # training from scratch resume_state = None util.mkdir_and_rename( opt["path"]["experiments_root"]) # rename old folder if exists util.mkdirs((path for key, path in opt["path"].items() if not key == "experiments_root" and "pretrain_model" not in key and "resume" not in key)) # config loggers. Before it, the log will not work util.setup_logger(None, opt["path"]["log"], "train", level=logging.INFO, screen=True) util.setup_logger("val", opt["path"]["log"], "val", level=logging.INFO) logger = logging.getLogger("base") if resume_state: logger.info("Resuming training from epoch: {}, iter: {}.".format( resume_state["epoch"], resume_state["iter"])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # tensorboard logger if opt["use_tb_logger"] and "debug" not in opt["name"]: from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir="../tb_logger/" + opt["name"]) # random seed seed = opt["train"]["manual_seed"] if seed is None: seed = random.randint(1, 10000) logger.info("Random seed: {}".format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True # create train and val dataloader for phase, dataset_opt in opt["datasets"].items(): if phase == "train": train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt["batch_size"])) logger.info("Number of train images: {:,d}, iters: {:,d}".format( len(train_set), train_size)) total_iters = int(opt["train"]["niter"]) total_epochs = int(math.ceil(total_iters / train_size)) logger.info("Total epochs needed: {:d} for iters {:,d}".format( total_epochs, total_iters)) train_loader = create_dataloader(train_set, dataset_opt) elif phase == "val": val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt) logger.info("Number of val images in [{:s}]: {:d}".format( dataset_opt["name"], len(val_set))) else: raise NotImplementedError( "Phase [{:s}] is not recognized.".format(phase)) assert train_loader is not None # create model model = create_model(opt) # resume training if resume_state: start_epoch = resume_state["epoch"] current_step = resume_state["iter"] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 # training logger.info("Start training from epoch: {:d}, iter: {:d}".format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs): for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break # update learning rate model.update_learning_rate() # training model.feed_data(train_data) model.optimize_parameters(current_step) # log if current_step % opt["logger"]["print_freq"] == 0: logs = model.get_current_log() message = "<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> ".format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += "{:s}: {:.4e} ".format(k, v) # tensorboard logger if opt["use_tb_logger"] and "debug" not in opt["name"]: tb_logger.add_scalar(k, v, current_step) logger.info(message) # validation if current_step % opt["train"]["val_freq"] == 0: avg_psnr = 0.0 idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data["LR_path"][0]))[0] img_dir = os.path.join(opt["path"]["val_images"], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals["SR"]) # uint8 gt_img = util.tensor2img(visuals["HR"]) # uint8 # Save SR images for reference save_img_path = os.path.join( img_dir, "{:s}_{:d}.png".format(img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt["scale"] gt_img = gt_img / 255.0 sr_img = sr_img / 255.0 cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx # log logger.info("# Validation # PSNR: {:.4e}".format(avg_psnr)) logger_val = logging.getLogger("val") # validation logger logger_val.info( "<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}".format( epoch, current_step, avg_psnr)) # tensorboard logger if opt["use_tb_logger"] and "debug" not in opt["name"]: tb_logger.add_scalar("psnr", avg_psnr, current_step) # save models and training states if current_step % opt["logger"]["save_checkpoint_freq"] == 0: logger.info("Saving models and training states.") model.save(current_step) model.save_training_state(epoch, current_step) copy_tree( opt["path"]["experiments_root"], "/content/gdrive/My Drive/LVTN/SuperResolution/SR_models/" + "-ESRGAN/experiments/" + opt["name"], ) logger.info("Saving the final model.") model.save("latest") logger.info("End of training.")
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model model = create_model(opt) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): if opt['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) #### training model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format( epoch, current_step) for v in model.get_current_learning_rate(): message += '{:.3e},'.format(v) message += ')] ' for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) #### validation if opt['datasets'].get( 'val', None) and current_step % opt['train']['val_freq'] == 0: if opt['model'] in [ 'sr', 'srgan' ] and rank <= 0: # image restoration validation # does not support multi-GPU validation pbar = util.ProgressBar(len(val_loader)) avg_psnr = 0. avg_ssim = 0. idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) # we don't need GT data to be run with the model gt_img = util.tensor2img(val_data['GT']) # uint8 val_data['GT'] = torch.tensor(0) import sys #print(val_data['LQ'].shape) model.feed_data(val_data, need_GT=False) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['rlt']) # uint8 if "mask" in visuals: mask = util.tensor2img(visuals['mask']) # print(mask) # Save SR mask for reference save_img_path = os.path.join( img_dir, '{:s}_{:d}_mask.png'.format( img_name, current_step)) util.save_img(mask, save_img_path) #gt_mask_dir = os.path.join(opt['path']['gt_mask'], img_name) #gt_mask = util.tensor2img(val_data['GT_mask']) #print(gt_mask) #import cv2 #cv2.imshow("window", gt_mask) #cv2.waitKey() #util.mkdir(gt_mask_dir) #save_img_path = os.path.join(gt_mask_dir, # '{:s}_{:d}_GT_mask.png'.format(img_name, current_step)) #util.save_img(gt_mask, save_img_path) # Save SR images for reference save_img_path = os.path.join( img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR sr_img, gt_img = util.crop_border([sr_img, gt_img], opt['scale']) avg_psnr += util.calculate_psnr(sr_img, gt_img) avg_ssim += util.calculate_ssim(sr_img, gt_img) pbar.update('Test {}'.format(img_name)) avg_psnr = avg_psnr / idx avg_ssim = avg_ssim / idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger.info('# Validation # SSIM: {:.4e}'.format(avg_ssim)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger.add_scalar('ssim', avg_ssim, current_step) else: # video restoration validation if opt['dist']: # multi-GPU testing psnr_rlt = {} # with border and center frames if rank == 0: pbar = util.ProgressBar(len(val_set)) for idx in range(rank, len(val_set), world_size): val_data = val_set[idx] val_data['LQs'].unsqueeze_(0) val_data['GT'].unsqueeze_(0) folder = val_data['folder'] idx_d, max_idx = val_data['idx'].split('/') idx_d, max_idx = int(idx_d), int(max_idx) if psnr_rlt.get(folder, None) is None: psnr_rlt[folder] = torch.zeros( max_idx, dtype=torch.float32, device='cuda') # tmp = torch.zeros(max_idx, dtype=torch.float32, device='cuda') model.feed_data(val_data) model.test() visuals = model.get_current_visuals() rlt_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # calculate PSNR psnr_rlt[folder][idx_d] = util.calculate_psnr( rlt_img, gt_img) if rank == 0: for _ in range(world_size): pbar.update('Test {} - {}/{}'.format( folder, idx_d, max_idx)) # # collect data for _, v in psnr_rlt.items(): dist.reduce(v, 0) dist.barrier() if rank == 0: psnr_rlt_avg = {} psnr_total_avg = 0. for k, v in psnr_rlt.items(): psnr_rlt_avg[k] = torch.mean(v).cpu().item() psnr_total_avg += psnr_rlt_avg[k] psnr_total_avg /= len(psnr_rlt) log_s = '# Validation # PSNR: {:.4e}:'.format( psnr_total_avg) for k, v in psnr_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) if opt['use_tb_logger'] and 'debug' not in opt[ 'name']: tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step) for k, v in psnr_rlt_avg.items(): tb_logger.add_scalar(k, v, current_step) else: pbar = util.ProgressBar(len(val_loader)) psnr_rlt = {} # with border and center frames psnr_rlt_avg = {} psnr_total_avg = 0. for val_data in val_loader: folder = val_data['folder'][0] idx_d = val_data['idx'].item() # border = val_data['border'].item() if psnr_rlt.get(folder, None) is None: psnr_rlt[folder] = [] model.feed_data(val_data) model.test() visuals = model.get_current_visuals() rlt_img = util.tensor2img(visuals['rlt']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # calculate PSNR psnr = util.calculate_psnr(rlt_img, gt_img) psnr_rlt[folder].append(psnr) pbar.update('Test {} - {}'.format(folder, idx_d)) for k, v in psnr_rlt.items(): psnr_rlt_avg[k] = sum(v) / len(v) psnr_total_avg += psnr_rlt_avg[k] psnr_total_avg /= len(psnr_rlt) log_s = '# Validation # PSNR: {:.4e}:'.format( psnr_total_avg) for k, v in psnr_rlt_avg.items(): log_s += ' {}: {:.4e}'.format(k, v) logger.info(log_s) if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step) for k, v in psnr_rlt_avg.items(): tb_logger.add_scalar(k, v, current_step) #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) if rank <= 0: logger.info('Saving the final model.') model.save('latest') logger.info('End of training.') tb_logger.close()
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument("-opt", type=str, help="Path to option YAML file.") parser.add_argument( "--launcher", choices=["none", "pytorch"], default="none", help="job launcher" ) parser.add_argument("--local_rank", type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) #### distributed training settings if args.launcher == "none": # disabled distributed training opt["dist"] = False rank = -1 print("Disabled distributed training.") else: opt["dist"] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt["path"].get("resume_state", None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt["path"]["resume_state"], map_location=lambda storage, loc: storage.cuda(device_id), ) option.check_resume(opt, resume_state["iter"]) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt["path"]["experiments_root"] ) # rename experiment folder if exists util.mkdirs( ( path for key, path in opt["path"].items() if not key == "experiments_root" and "pretrain_model" not in key and "resume" not in key ) ) # config loggers. Before it, the log will not work util.setup_logger( "base", opt["path"]["log"], "train_" + opt["name"], level=logging.INFO, screen=True, tofile=True, ) logger = logging.getLogger("base") logger.info(option.dict2str(opt)) # tensorboard logger if opt["use_tb_logger"] and "debug" not in opt["name"]: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( "You are using PyTorch {}. Tensorboard will use [tensorboardX]".format( version ) ) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir="../tb_logger/" + opt["name"]) else: util.setup_logger( "base", opt["path"]["log"], "train", level=logging.INFO, screen=True ) logger = logging.getLogger("base") # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt["train"]["manual_seed"] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info("Random seed: {}".format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt["datasets"].items(): if phase == "train": train_set = create_dataset(dataset_opt) train_size = int(math.ceil(len(train_set) / dataset_opt["batch_size"])) total_iters = int(opt["train"]["niter"]) total_epochs = int(math.ceil(total_iters / train_size)) if opt["dist"]: train_sampler = DistIterSampler( train_set, world_size, rank, dataset_ratio ) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio)) ) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( "Number of train images: {:,d}, iters: {:,d}".format( len(train_set), train_size ) ) logger.info( "Total epochs needed: {:d} for iters {:,d}".format( total_epochs, total_iters ) ) elif phase == "val": pass # val_set = create_dataset(dataset_opt, isVal=True) # val_loader = create_dataloader(val_set, dataset_opt, opt, None) # if rank <= 0: # logger.info( # "Number of val images in [{:s}]: {:d}".format( # dataset_opt["name"], len(val_set) # ) # ) else: raise NotImplementedError("Phase [{:s}] is not recognized.".format(phase)) assert train_loader is not None #### create model # model_path = opt["path"]["pretrain_model_G"] model = create_model(opt) #### resume training if resume_state: logger.info( "Resuming training from epoch: {}, iter: {}.".format( resume_state["epoch"], resume_state["iter"] ) ) start_epoch = resume_state["epoch"] current_step = resume_state["iter"] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info( "Start training from epoch: {:d}, iter: {:d}".format(start_epoch, current_step) ) for epoch in range(start_epoch, total_epochs + 1): if opt["dist"]: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate( current_step, warmup_iter=opt["train"]["warmup_iter"] ) #### training model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt["logger"]["print_freq"] == 0: logs = model.get_current_log() message = "<epoch:{:3d}, iter:{:8,d}, lr:(".format(epoch, current_step) for v in model.get_current_learning_rate(): message += "{:.3e},".format(v) message += ")>" for k, v in logs.items(): message += "{:s}: {:.4e} ".format(k, v) # tensorboard logger if opt["use_tb_logger"] and "debug" not in opt["name"]: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) #### validation # currently, it does not support validation during training # if current_step % opt["train"]["val_freq"] == 0: # avg_psnr = 0 # idx = 0 # for val_data in val_loader: # idx += 1 # key = ( # val_data["key"][0] # if type(val_data["key"]) is list # else val_data["key"] # ) # imgName = key + ".png" # savePath = os.path.join( # opt["path"]["val_images"], str(current_step), imgName # ) # model.feed_data(val_data) # model.test() # output = model.get_current_visuals() # hr = util.tensor2img(output["GT"]) # sr = util.tensor2img(output["restore"]) # # Cropping to calculate PSNR # hr /= 255.0 # sr /= 255.0 # scale = 4 # H, W, C = hr.shape # H_r, W_r = H % scale, W % scale # cropped_hr = hr[: H - H_r, : W - W_r, :] # cropped_sr = sr[: H - H_r, : W - W_r, :] # avg_psnr += util.calculate_psnr(cropped_sr * 255, cropped_hr * 255) # logger.info("Saving output in {}".format(savePath)) # util.mkdir(savePath) # util.save_img( # output, joinPath(savePath, str(current_step) + ".png") # ) # avg_psnr /= idx # # log # logger.info("# Validation # PSNR: {:.4e}".format(avg_psnr)) # logger_val = logging.getLogger("val") # validation logger # logger_val.info( # "<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}".format( # epoch, current_step, avg_psnr # ) # ) # # tensorboard logger # if opt["use_tb_logger"] and "debug" not in opt["name"]: # tb_logger.add_scalar("psnr", avg_psnr, current_step) #### save models and training states if current_step % opt["logger"]["save_checkpoint_freq"] == 0: if rank <= 0: # Save the experiments in case of Colab is timeout logger.info("Saving models and training states.") model.save(current_step) model.save_training_state(epoch, current_step) copy_tree( "/content/EDVR/experiments", "/content/drive/My Drive/LVTN/SuperResolution/EDVR/experiments", ) copy_tree( "/content/EDVR/tb_logger", "/content/drive/My Drive/LVTN/SuperResolution/EDVR/tb_logger", ) if rank <= 0: logger.info("Saving the final model.") model.save("latest") logger.info("End of training.") tb_logger.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, default='options/train/train_ESRCNN_S2self.json', help='Path to option JSON file.') opt = option.parse(parser.parse_args().opt, is_train=True) opt = option.dict_to_nonedict(opt) if opt['path']['resume_state']: resume_state = torch.load(opt['path']['resume_state']) else: resume_state = None util.mkdir_and_rename(opt['path']['experiments_root']) util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True) util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO) logger = logging.getLogger('base') if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) logger.info(option.dict2str(opt)) if opt['use_tb_logger'] and 'debug' not in opt['name']: from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='./tb_logger/' + opt['name']) seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # Setup TrainDataLoader trainloader = DataLoader(opt['datasets']['train']['dataroot'], split='train') train_size = int( math.ceil(len(trainloader) / opt['datasets']['train']['batch_size'])) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(trainloader), train_size)) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) TrainDataLoader = data.DataLoader( trainloader, batch_size=opt['datasets']['train']['batch_size'], num_workers=12, shuffle=True) #Setup for validate valloader = DataLoader(opt['datasets']['train']['dataroot'], split='val') VALDataLoader = data.DataLoader( valloader, batch_size=opt['datasets']['train']['batch_size'] // 5, num_workers=1, shuffle=True) logger.info('Number of val images:{:d}'.format(len(valloader))) # Setup Model model = get_model('esrcnn_s2self', opt) if resume_state: start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) else: current_step = 0 start_epoch = 0 logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs): for i, train_data in enumerate(TrainDataLoader): current_step += 1 if current_step > total_iters: break model.update_learning_rate() model.feed_data(train_data) model.optimize_parameters(current_step) if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}>'.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v[0]) if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v[0], current_step) logger.info(message) if current_step % opt['train']['val_freq'] == 0: avg_psnr = 0.0 idx = 0 for i_val, val_data in enumerate(VALDataLoader): idx += 1 img_name = val_data[3][0].split('.')[0] model.feed_data(val_data) model.val() visuals = model.get_current_visuals() pred_img = util.tensor2img(visuals['Pred']) gt_img = util.tensor2img(visuals['label']) avg_psnr += util.calculate_psnr(pred_img, gt_img) avg_psnr = avg_psnr / idx logger.info('# Validation #PSNR: {:.4e}'.format(avg_psnr)) logger_val = logging.getLogger('val') logger_val.info( '<epoch:{:3d}, iter:{:8,d}> psnr:{:.4e}'.format( epoch, current_step, avg_psnr)) if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) if current_step % opt['logger']['save_checkpoint_freq'] == 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) logger.info('Saving the final model.') model.save('latest') logger.info('End of training')
def main(): #### options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, help='Path to option YMAL file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) jsondir = parser.parse_args().opt files = set(glob.glob(os.path.join(experiments, "**/*.*"), recursive=True)) os.makedirs(drivebackup, exist_ok=True) os.makedirs(experiments, exist_ok=True) #### distributed training settings if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() #### loading resume state if exists if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None #### mkdir and loggers if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True) logger = logging.getLogger('base') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) # -------------------------------------------- ADDED -------------------------------------------- filter_low = filters.FilterLow(gaussian=False) l1_loss = torch.nn.L1Loss() mse_loss = torch.nn.MSELoss() if torch.cuda.is_available(): filter_low = filter_low.cuda() l1_loss = l1_loss.cuda() mse_loss = mse_loss.cuda() # ----------------------------------------------------------------------------------------------- #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True #### create train and val dataloader dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) total_epochs = int( math.ceil(total_iters / (train_size * dataset_ratio))) else: train_sampler = None train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None #### create model model = create_model(opt) #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 #### training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs + 1): if opt['dist']: train_sampler.set_epoch(epoch) for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break #### update learning rate model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter']) #### training model.feed_data(train_data) model.optimize_parameters(current_step) #### log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_step) if rank <= 0: logger.info(message) # validation if current_step % opt['train']['val_freq'] == 0 and rank <= 0: avg_psnr = val_pix_err_f = val_pix_err_nf = val_mean_color_err = 0.0 idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LQ_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['GT']) # uint8 # Save SR images for reference save_img_path = os.path.join( img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) # ----------------------------------------- ADDED ----------------------------------------- val_pix_err_f += l1_loss(filter_low(visuals['SR']), filter_low(visuals['GT'])) val_pix_err_nf += l1_loss(visuals['SR'], visuals['GT']) val_mean_color_err += mse_loss( visuals['SR'].mean(2).mean(1), visuals['GT'].mean(2).mean(1)) # ----------------------------------------------------------------------------------------- avg_psnr = avg_psnr / idx val_pix_err_f /= idx val_pix_err_nf /= idx val_mean_color_err /= idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger_val = logging.getLogger('val') # validation logger logger_val.info( '<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format( epoch, current_step, avg_psnr)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger.add_scalar('val_pix_err_f', val_pix_err_f, current_step) tb_logger.add_scalar('val_pix_err_nf', val_pix_err_nf, current_step) tb_logger.add_scalar('val_mean_color_err', val_mean_color_err, current_step) #### save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: if rank <= 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) files = movebackups(files, jsondir) if rank <= 0: logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): PreUp = False # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option JSON file.') opt = option.parse(parser.parse_args().opt, is_train=True) opt = option.dict_to_nonedict(opt) # Convert to NoneDict, which return None for missing key. ratio = opt["scale"] if PreUp == True: ratio=5 # train from scratch OR resume training if opt['path']['resume_state']: # resuming training resume_state = torch.load(opt['path']['resume_state']) else: # training from scratch resume_state = None util.mkdir_and_rename(opt['path']['experiments_root']) # rename old folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True) util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO) logger = logging.getLogger('base') if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: from tensorboardX import SummaryWriter tb_logger_train = SummaryWriter(log_dir='/mnt/gpid07/users/luis.salgueiro/git/mnt/BasicSR/tb_logger/' + opt['name'] + "/train") tb_logger_val = SummaryWriter(log_dir='//mnt/gpid07/users/luis.salgueiro/git/mnt/BasicSR/tb_logger/' + opt['name'] + "/val" ) # random seed seed = opt['train']['manual_seed'] if seed is None: seed = 100 #random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True # print("OLAAAA_-...", os.environ['CUDA_VISIBLE_DEVICES']) # ######################################### # ######## DATA LOADER #################### # ######################################### # create train and val dataloader for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': print("Entro DATASET train......") train_set = create_dataset(dataset_opt) print("CREO DATASET train_set ", train_set) train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) train_loader = create_dataloader(train_set, dataset_opt) print("CREO train loader: ", train_loader) elif phase == 'val': print("Entro en phase VAL....") val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt) logger.info('Number of val images in [{:s}]: {:d}'.format(dataset_opt['name'], len(val_set))) # for _,ii in enumerate(val_loader): # print("VAL LOADER:........", ii) # print(val_loader[0]) else: raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None assert val_loader is not None # create model model = create_model(opt) #print("PASO..... MODEL ") # resume training if resume_state: print("RESUMING state") start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 print("PASO..... INIT ") # ######################################### # ######### training ################ # ######################################### # ii=0 logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step)) for epoch in range(start_epoch, total_epochs): # print("Entro EPOCH...", ii) for _, train_data in enumerate(train_loader): # print("Entro TRAIN_LOADER...") current_step += 1 if current_step > total_iters: break # update learning rate model.update_learning_rate() # training #print("....... TRAIN DATA..........", train_data) model.feed_data(train_data) model.optimize_parameters(current_step) # log train if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) # print(".............MESSAGE: ", message) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) #print("MSG: ", message) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: # print("K: ", k) # print("V: ", v) if "test" in k: tb_logger_val.add_scalar(k, v, current_step) else: tb_logger_train.add_scalar(k, v, current_step) logger.info(message) if current_step % opt['train']['val_freq'] == 0: avg_psnr_sr = 0.0 avg_psnr_lr = 0.0 avg_psnr_dif = 0.0 avg_ssim_lr, avg_ssim_sr, avg_ssim_dif = 0.0, 0.0, 0.0 avg_ergas_lr, avg_ergas_sr, avg_ergas_dif = 0.0, 0.0, 0.0 idx = 0 # for val_data in val_loader: for _, val_data in enumerate(val_loader): idx += 1 img_name = os.path.splitext(os.path.basename(val_data['LR_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) # print("Img nameVaL: ", img_name) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2imgNorm(visuals['SR'],out_type=np.uint8, min_max=(0, 1), MinVal=val_data["LR_min"], MaxVal=val_data["LR_max"]) # uint16 gt_img = util.tensor2imgNorm(visuals['HR'],out_type=np.uint8, min_max=(0, 1), MinVal=val_data["HR_min"], MaxVal=val_data["HR_max"]) # uint16 lr_img = util.tensor2imgNorm(visuals['LR'], out_type=np.uint8, min_max=(0, 1), MinVal=val_data["LR_min"], MaxVal=val_data["LR_max"]) # uint16 # Save SR images for reference if idx < 10: # print(idx) util.mkdir(img_dir) save_img_path = os.path.join(img_dir, '{:s}_{:d}'.format(img_name, current_step)) util.save_imgSR(sr_img, save_img_path) util.save_imgHR(gt_img, save_img_path) util.save_imgLR(lr_img, save_img_path) print("SAVING CROPS") util.save_imgCROP(lr_img,gt_img,sr_img , save_img_path, ratio, PreUp=PreUp) if PreUp==False: dim2 = (gt_img.shape[1], gt_img.shape[1]) print("DIM:", dim2) print("LR image shape ", lr_img.shape) print("HR image shape ", gt_img.shape) lr_img = cv2.resize(np.transpose(lr_img,(1,2,0)), dim2, interpolation=cv2.INTER_NEAREST) lr_img = np.transpose(lr_img,(2,0,1)) print("LR image 2 shape ", lr_img.shape) print("LR image 2 shape ", lr_img.shape) avg_psnr_sr += util.calculate_psnr2(sr_img, gt_img) avg_psnr_lr += util.calculate_psnr2(lr_img, gt_img) avg_ssim_lr += util.calculate_ssim2(lr_img, gt_img) avg_ssim_sr += util.calculate_ssim2(sr_img, gt_img) avg_ergas_lr += util.calculate_ergas(lr_img, gt_img, pixratio=ratio) avg_ergas_sr += util.calculate_ergas(sr_img, gt_img, pixratio=ratio) #avg_psnr += util.calculate_psnr2(cropped_sr_img, cropped_gt_img) avg_psnr_sr = avg_psnr_sr / idx avg_psnr_lr = avg_psnr_lr / idx avg_psnr_dif = avg_psnr_lr - avg_psnr_sr avg_ssim_lr = avg_ssim_lr / idx avg_ssim_sr = avg_ssim_sr / idx avg_ssim_dif = avg_ssim_lr - avg_ssim_sr avg_ergas_lr = avg_ergas_lr / idx avg_ergas_sr = avg_ergas_sr / idx avg_ergas_dif = avg_ergas_lr - avg_ergas_sr # print("IDX: ", idx) # log VALIDATION logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr_sr)) logger.info('# Validation # SSIM: {:.4e}'.format(avg_ssim_sr)) logger.info('# Validation # ERGAS: {:.4e}'.format(avg_ergas_sr)) logger_val = logging.getLogger('val') # validation logger logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr_SR: {:.4e}'.format( epoch, current_step, avg_psnr_sr)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr_LR: {:.4e}'.format( epoch, current_step, avg_psnr_lr)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr_DIF: {:.4e}'.format( epoch, current_step, avg_psnr_dif)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> ssim_LR: {:.4e}'.format( epoch, current_step, avg_ssim_lr)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> ssim_SR: {:.4e}'.format( epoch, current_step, avg_ssim_sr)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> ssim_DIF: {:.4e}'.format( epoch, current_step, avg_ssim_dif)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> ergas_LR: {:.4e}'.format( epoch, current_step, avg_ergas_lr)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> ergas_SR: {:.4e}'.format( epoch, current_step, avg_ergas_sr)) logger_val.info('<epoch:{:3d}, iter:{:8,d}> ergas_DIF: {:.4e}'.format( epoch, current_step, avg_ergas_dif)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger_val.add_scalar('dif_PSNR', avg_psnr_dif, current_step) # tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger_val.add_scalar('dif_SSIM', avg_ssim_dif, current_step) tb_logger_val.add_scalar('dif_ERGAS', avg_ergas_dif, current_step) tb_logger_val.add_scalar('psnr_LR', avg_psnr_lr, current_step) # tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger_val.add_scalar('ssim_LR', avg_ssim_lr, current_step) tb_logger_val.add_scalar('ERGAS_LR', avg_ergas_lr, current_step) tb_logger_val.add_scalar('psnr_SR', avg_psnr_sr, current_step) # tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger_val.add_scalar('ssim_SR', avg_ssim_sr, current_step) tb_logger_val.add_scalar('ERGAS_SR', avg_ergas_sr, current_step) print("****** SR_IMG: ", sr_img.shape) print("****** LR_IMG: ", lr_img.shape) print("****** GT_IMG: ", gt_img.shape) fig1,ax1 = ep.plot_rgb(sr_img, rgb=[2, 1, 0], stretch=True) tb_logger_val.add_figure("SR_plt", fig1, current_step,close=True) fig2, ax2 = ep.plot_rgb(gt_img, rgb=[2, 1, 0], stretch=True) tb_logger_val.add_figure("GT_plt", fig2, current_step, close=True) fig3, ax3 = ep.plot_rgb(lr_img, rgb=[2, 1, 0], stretch=True) tb_logger_val.add_figure("LR_plt", fig3, current_step, close=True) # print("TERMINO GUARDAR IMG TB") # save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) # ii=ii+1 logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): ############################################ # # set options # ############################################ parser = argparse.ArgumentParser() parser.add_argument('--opt', type=str, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = option.parse(args.opt, is_train=True) ############################################ # # distributed training settings # ############################################ if args.launcher == 'none': # disabled distributed training opt['dist'] = False rank = -1 print('Disabled distributed training.') else: opt['dist'] = True init_dist() world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() print("Rank:", rank) print("World Size", world_size) print("------------------DIST-------------------------") ############################################ # # loading resume state if exists # ############################################ if opt['path'].get('resume_state', None): # distributed resuming: all load into default GPU device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) option.check_resume(opt, resume_state['iter']) # check resume options else: resume_state = None ############################################ # # mkdir and loggers # ############################################ if 'debug' in opt['name']: debug_mode = True else: debug_mode = False if rank <= 0: # normal training (rank -1) OR distributed training (rank 0) if resume_state is None: util.mkdir_and_rename( opt['path'] ['experiments_root']) # rename experiment folder if exists util.mkdirs( (path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO, screen=True, tofile=True) util.setup_logger('base_val', opt['path']['log'], 'val_' + opt['name'], level=logging.INFO, screen=True, tofile=True) logger = logging.getLogger('base') logger_val = logging.getLogger('base_val') logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: version = float(torch.__version__[0:3]) if version >= 1.1: # PyTorch 1.1 from torch.utils.tensorboard import SummaryWriter else: logger.info( 'You are using PyTorch {}. Tensorboard will use [tensorboardX]' .format(version)) from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) else: # config loggers. Before it, the log will not work util.setup_logger('base', opt['path']['log'], 'train_', level=logging.INFO, screen=True) print("set train log") util.setup_logger('base_val', opt['path']['log'], 'val_', level=logging.INFO, screen=True) print("set val log") logger = logging.getLogger('base') logger_val = logging.getLogger('base_val') # convert to NoneDict, which returns None for missing keys opt = option.dict_to_nonedict(opt) #### random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) if rank <= 0: logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True ############################################ # # create train and val dataloader # ############################################ #### # dataset_ratio = 200 # enlarge the size of each epoch dataset_ratio = 200 # enlarge the size of each epoch for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': if opt['datasets']['train'].get('split', None): train_set, val_set = create_dataset(dataset_opt) else: train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) # total_iters = int(opt['train']['niter']) # total_epochs = int(math.ceil(total_iters / train_size)) total_iters = train_size total_epochs = int(opt['train']['epoch']) if opt['dist']: train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio) # total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio))) total_epochs = int(opt['train']['epoch']) if opt['train']['enable'] == False: total_epochs = 1 else: # train_sampler = None train_sampler = RandomBalancedSampler(train_set, train_size) train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler, vscode_debug=debug_mode) if rank <= 0: logger.info( 'Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) elif phase == 'val': if not opt['datasets']['train'].get('split', None): val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, opt, None, vscode_debug=debug_mode) if rank <= 0: logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None ############################################ # # create model # ############################################ #### model = create_model(opt) print("Model Created! ") #### resume training if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 print("Not Resume Training") ############################################ # # training # ############################################ logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) model.train_AverageMeter() saved_total_loss = 10e10 saved_total_PSNR = -1 saved_total_SSIM = -1 for epoch in range(start_epoch, total_epochs): ############################################ # # Start a new epoch # ############################################ current_step = 0 if opt['dist']: train_sampler.set_epoch(epoch) for train_idx, train_data in enumerate(train_loader): # print('current_step', current_step) if 'debug' in opt['name']: img_dir = os.path.join(opt['path']['train_images']) util.mkdir(img_dir) LQs = train_data['LQs'] # B N C H W if not 'sr' in opt['name']: GTenh = train_data['GTenh'] GTinp = train_data['GTinp'] for imgs, name in zip([LQs, GTenh, GTinp], ['LQs', 'GTenh', 'GTinp']): num = imgs.size(1) for i in range(num): img = util.tensor2img(imgs[0, i, ...]) # uint8 save_img_path = os.path.join( img_dir, '{:4d}_{:s}_{:1d}.png'.format( train_idx, str(name), i)) util.save_img(img, save_img_path) else: if 'GT' in train_data: GT_name = 'GT' elif 'GTs' in train_data: GT_name = 'GTs' GT = train_data[GT_name] for imgs, name in zip([LQs, GT], ['LQs', GT_name]): if name == 'GT': num = imgs.size(0) img = util.tensor2img(imgs[0, ...]) # uint8 save_img_path = os.path.join( img_dir, '{:4d}_{:s}_{:1d}.png'.format( train_idx, str(name), 0)) util.save_img(img, save_img_path) elif name == 'GTs': num = imgs.size(1) for i in range(num): img = util.tensor2img(imgs[:, i, ...]) # uint8 save_img_path = os.path.join( img_dir, '{:4d}_{:s}_{:1d}.png'.format( train_idx, str(name), i)) util.save_img(img, save_img_path) else: num = imgs.size(1) for i in range(num): img = util.tensor2img(imgs[:, i, ...]) # uint8 save_img_path = os.path.join( img_dir, '{:4d}_{:s}_{:1d}.png'.format( train_idx, str(name), i)) util.save_img(img, save_img_path) if (train_idx >= 3): # set to 0, just do validation break # if pre-load weight first do validation and skip the first epoch # if opt['path'].get('pretrain_model_G', None) and epoch == 0: # epoch += 1 # break if opt['train']['enable'] == False: message_train_loss = 'None' break current_step += 1 if current_step > total_iters: print("Total Iteration Reached !") break #### update learning rate if opt['train']['lr_scheme'] == 'ReduceLROnPlateau': pass else: model.update_learning_rate( current_step, warmup_iter=opt['train']['warmup_iter']) #### training model.feed_data(train_data) model.optimize_parameters(current_step) model.train_AverageMeter_update() #### log if current_step % opt['logger']['print_freq'] == 0: logs_inst, logs_avg = model.get_current_log( ) # training loss mode='train' message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format( epoch, current_step) for v in model.get_current_learning_rate(): message += '{:.3e},'.format(v) message += ')] ' # if 'debug' in opt['name']: # debug model print the instant loss # for k, v in logs_inst.items(): # message += '{:s}: {:.4e} '.format(k, v) # # tensorboard logger # if opt['use_tb_logger'] and 'debug' not in opt['name']: # if rank <= 0: # tb_logger.add_scalar(k, v, current_step) # for avg loss current_iters_epoch = epoch * total_iters + current_step for k, v in logs_avg.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: if rank <= 0: tb_logger.add_scalar(k, v, current_iters_epoch) if rank <= 0: logger.info(message) # saving models if epoch == 1: save_filename = '{:04d}_{}.pth'.format(0, 'G') save_path = os.path.join(opt['path']['models'], save_filename) if os.path.exists(save_path): os.remove(save_path) save_filename = '{:04d}_{}.pth'.format(epoch - 1, 'G') save_path = os.path.join(opt['path']['models'], save_filename) if os.path.exists(save_path): os.remove(save_path) if rank <= 0: logger.info('Saving models and training states.') save_filename = '{:04d}'.format(epoch) model.save(save_filename) # ======================================================================= # # Main validation loop # # ======================================================================= # if opt['datasets'].get('val', None): if opt['dist']: # multi-GPU testing psnr_rlt = {} # with border and center frames psnr_rlt_avg = {} psnr_total_avg = 0. ssim_rlt = {} # with border and center frames ssim_rlt_avg = {} ssim_total_avg = 0. val_loss_rlt = {} # the averaged loss val_loss_rlt_avg = {} val_loss_total_avg = 0. if rank == 0: pbar = util.ProgressBar(len(val_set)) for idx in range( rank, len(val_set), world_size): # distributed parallel validation # print('idx', idx) if 'debug' in opt['name']: if (idx >= 3): break if (idx >= 1000): break val_data = val_set[idx] # use idx method to fetch must extend batch dimension val_data['LQs'].unsqueeze_(0) val_data['GTenh'].unsqueeze_(0) val_data['GTinp'].unsqueeze_(0) key = val_data['key'][0] # IMG_0034_00809 max_idx = len(val_set) val_name = 'val_set' num = model.get_info( ) # each model has different number of loss if psnr_rlt.get(val_name, None) is None: psnr_rlt[val_name] = torch.zeros([num, max_idx], dtype=torch.float32, device='cuda') if ssim_rlt.get(val_name, None) is None: ssim_rlt[val_name] = torch.zeros([num, max_idx], dtype=torch.float32, device='cuda') if val_loss_rlt.get(val_name, None) is None: val_loss_rlt[val_name] = torch.zeros( [num, max_idx], dtype=torch.float32, device='cuda') model.feed_data(val_data) model.test() avg_loss, loss_list = model.get_loss(ret=1) save_enable = True if idx >= 100: save_enable = False psnr_list, ssim_list = model.compute_current_psnr_ssim( save=save_enable, name=key, save_path=opt['path']['val_images']) # print('psnr_list',psnr_list) assert len(loss_list) == num assert len(psnr_list) == num for i in range(num): psnr_rlt[val_name][i, idx] = psnr_list[i] ssim_rlt[val_name][i, idx] = ssim_list[i] val_loss_rlt[val_name][i, idx] = loss_list[i] # print('psnr_rlt[val_name][i, idx]',psnr_rlt[val_name][i, idx]) # print('ssim_rlt[val_name][i, idx]',ssim_rlt[val_name][i, idx]) # print('val_loss_rlt[val_name][i, idx] ',val_loss_rlt[val_name][i, idx] ) if rank == 0: for _ in range(world_size): pbar.update('Test {} - {}/{}'.format( key, idx, max_idx)) # # collect data for _, v in psnr_rlt.items(): for i in v: dist.reduce(i, 0) for _, v in ssim_rlt.items(): for i in v: dist.reduce(i, 0) for _, v in val_loss_rlt.items(): for i in v: dist.reduce(i, 0) dist.barrier() if rank == 0: psnr_rlt_avg = {} psnr_total_avg = 0. for k, v in psnr_rlt.items(): # key, value # print('k', k, 'v', v, 'v.shape', v.shape) psnr_rlt_avg[k] = [] for i in range(num): non_zero_idx = v[i, :].nonzero() # logger.info('non_zero_idx {}'.format(non_zero_idx.shape)) # check matrix = v[i, :][non_zero_idx] # print('matrix', matrix) value = torch.mean(matrix).cpu().item() # print('value', value) psnr_rlt_avg[k].append(value) psnr_total_avg += psnr_rlt_avg[k][i] psnr_total_avg = psnr_total_avg / (len(psnr_rlt) * num) log_p = '# Validation # Avg. PSNR: {:.2f},'.format( psnr_total_avg) for k, v in psnr_rlt_avg.items(): for i, it in enumerate(v): log_p += ' {}: {:.2f}'.format(i, it) logger.info(log_p) logger_val.info(log_p) # ssim ssim_rlt_avg = {} ssim_total_avg = 0. for k, v in ssim_rlt.items(): ssim_rlt_avg[k] = [] for i in range(num): non_zero_idx = v[i, :].nonzero() # print('non_zero_idx', non_zero_idx) matrix = v[i, :][non_zero_idx] # print('matrix', matrix) value = torch.mean(matrix).cpu().item() # print('value', value) ssim_rlt_avg[k].append( torch.mean(matrix).cpu().item()) ssim_total_avg += ssim_rlt_avg[k][i] ssim_total_avg /= (len(ssim_rlt) * num) log_s = '# Validation # Avg. SSIM: {:.2f},'.format( ssim_total_avg) for k, v in ssim_rlt_avg.items(): for i, it in enumerate(v): log_s += ' {}: {:.2f}'.format(i, it) logger.info(log_s) logger_val.info(log_s) # added val_loss_rlt_avg = {} val_loss_total_avg = 0. for k, v in val_loss_rlt.items(): # k, key, the folder name # v, value, the torch matrix val_loss_rlt_avg[k] = [] # loss0 - loss_N for i in range(num): non_zero_idx = v[i, :].nonzero() # print('non_zero_idx', non_zero_idx) matrix = v[i, :][non_zero_idx] # print('matrix', matrix) value = torch.mean(matrix).cpu().item() # print('value', value) val_loss_rlt_avg[k].append( torch.mean(matrix).cpu().item()) val_loss_total_avg += val_loss_rlt_avg[k][i] val_loss_total_avg /= (len(val_loss_rlt) * num) log_l = '# Validation # Avg. Loss: {:.4e},'.format( val_loss_total_avg) for k, v in val_loss_rlt_avg.items(): for i, it in enumerate(v): log_l += ' {}: {:.4e}'.format(i, it) logger.info(log_l) logger_val.info(log_l) message = '' for v in model.get_current_learning_rate(): message += '{:.5e}'.format(v) logger_val.info( 'Epoch {:02d}, LR {:s}, PSNR {:.4f}, SSIM {:.4f}, Val Loss {:.4e}' .format(epoch, message, psnr_total_avg, ssim_total_avg, val_loss_total_avg)) else: pbar = util.ProgressBar(len(val_loader)) model.val_loss_AverageMeter() model.val_AverageMeter_para() for val_inx, val_data in enumerate(val_loader): # if 'debug' in opt['name']: # if (val_inx >= 10): # break save_enable = True if val_inx >= 100: save_enable = False if val_inx >= 100: break key = val_data['key'][0] folder = key[:-6] model.feed_data(val_data) model.test() avg_loss, loss_list = model.get_loss(ret=1) model.val_loss_AverageMeter_update(loss_list, avg_loss) psnr_list, ssim_list = model.compute_current_psnr_ssim( save=save_enable, name=key, save_path=opt['path']['val_images']) model.val_AverageMeter_para_update(psnr_list, ssim_list) if 'debug' in opt['name']: msg_psnr = '' msg_ssim = '' for i, psnr in enumerate(psnr_list): msg_psnr += '{} :{:.02f} '.format(i, psnr) for i, ssim in enumerate(ssim_list): msg_ssim += '{} :{:.02f} '.format(i, ssim) logger.info('{}_{:02d} {}'.format( key, val_inx, msg_psnr)) logger.info('{}_{:02d} {}'.format( key, val_inx, msg_ssim)) pbar.update('Test {} - {}'.format(key, val_inx)) # toal validation log lr = '' for v in model.get_current_learning_rate(): lr += '{:.5e}'.format(v) logs_avg, logs_psnr_avg, psnr_total_avg, ssim_total_avg, val_loss_total_avg = model.get_current_log( mode='val') msg_logs_avg = '' for k, v in logs_avg.items(): msg_logs_avg += '{:s}: {:.4e} '.format(k, v) logger_val.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format( epoch, lr, msg_logs_avg)) logger.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format( epoch, lr, msg_logs_avg)) msg_logs_psnr_avg = '' for k, v in logs_psnr_avg.items(): msg_logs_psnr_avg += '{:s}: {:.4e} '.format(k, v) logger_val.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format( epoch, lr, msg_logs_psnr_avg)) logger.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format( epoch, lr, msg_logs_psnr_avg)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('val_psnr', psnr_total_avg, epoch) tb_logger.add_scalar('val_loss', val_loss_total_avg, epoch) ############################################ # # end of validation, save model # ############################################ # if rank <= 0: logger.info("Finished an epoch, Check and Save the model weights") # we check the validation loss instead of training loss. OK~ if saved_total_loss >= val_loss_total_avg: saved_total_loss = val_loss_total_avg #torch.save(model.state_dict(), args.save_path + "/best" + ".pth") model.save('best') logger.info( "Best Weights updated for decreased validation loss") else: logger.info( "Weights Not updated for undecreased validation loss") if saved_total_PSNR <= psnr_total_avg: saved_total_PSNR = psnr_total_avg model.save('bestPSNR') logger.info( "Best Weights updated for increased validation PSNR") else: logger.info( "Weights Not updated for unincreased validation PSNR") ############################################ # # end of one epoch, schedule LR # ############################################ model.train_AverageMeter_reset() # add scheduler todo if opt['train']['lr_scheme'] == 'ReduceLROnPlateau': for scheduler in model.schedulers: # scheduler.step(val_loss_total_avg) scheduler.step(val_loss_total_avg) if rank <= 0: logger.info('Saving the final model.') model.save('last') logger.info('End of training.') tb_logger.close()
def main(): # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option JSON file.') opt = option.parse(parser.parse_args().opt, is_train=True) # Convert to NoneDict, which return None for missing key. opt = option.dict_to_nonedict(opt) pytorch_ver = get_pytorch_ver() # train from scratch OR resume training if opt['path']['resume_state']: if os.path.isdir(opt['path']['resume_state']): import glob resume_state_path = util.sorted_nicely( glob.glob( os.path.normpath(opt['path']['resume_state']) + '/*.state'))[-1] else: resume_state_path = opt['path']['resume_state'] resume_state = torch.load(resume_state_path) else: # training from scratch resume_state = None # rename old folder if exists util.mkdir_and_rename(opt['path']['experiments_root']) util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True) util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO) logger = logging.getLogger('base') if resume_state: logger.info('Set [resume_state] to ' + resume_state_path) logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: from tensorboardX import SummaryWriter try: # for version tensorboardX >= 1.7 tb_logger = SummaryWriter(logdir='../tb_logger/' + opt['name']) except: # for version tensorboardX < 1.6 tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) # random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True # create train and val dataloader for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) train_loader = create_dataloader(train_set, dataset_opt) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt) logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None # create model model = create_model(opt) # resume training if resume_state: start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers # updated schedulers in case JSON configuration has changed model.update_schedulers(opt['train']) else: current_step = 0 start_epoch = 0 # training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs): for n, train_data in enumerate(train_loader, start=1): current_step += 1 if current_step > total_iters: break if pytorch_ver == "pre": # Order for PyTorch ver < 1.1.0 # update learning rate model.update_learning_rate(current_step - 1) # training model.feed_data(train_data) model.optimize_parameters(current_step) elif pytorch_ver == "post": # Order for PyTorch ver > 1.1.0 # training model.feed_data(train_data) model.optimize_parameters(current_step) # update learning rate model.update_learning_rate(current_step - 1) else: print('Error identifying PyTorch version. ', torch.__version__) break # log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v, current_step) logger.info(message) # save models and training states (changed to save models before validation) if current_step % opt['logger']['save_checkpoint_freq'] == 0: model.save(current_step) model.save_training_state(epoch + (n >= len(train_loader)), current_step) logger.info('Models and training states saved.') # validation if current_step % opt['train']['val_freq'] == 0: avg_psnr = 0.0 avg_ssim = 0.0 avg_lpips = 0.0 idx = 0 val_sr_imgs_list = [] val_gt_imgs_list = [] for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LR_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() if opt['datasets']['train'][ 'znorm']: # If the image range is [-1,1] sr_img = util.tensor2img(visuals['SR'], min_max=(-1, 1)) # uint8 gt_img = util.tensor2img(visuals['HR'], min_max=(-1, 1)) # uint8 else: # Default: Image range is [0,1] sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['HR']) # uint8 # sr_img = util.tensor2img(visuals['SR']) # uint8 # gt_img = util.tensor2img(visuals['HR']) # uint8 # print("Min. SR value:",sr_img.min()) # Debug # print("Max. SR value:",sr_img.max()) # Debug # print("Min. GT value:",gt_img.min()) # Debug # print("Max. GT value:",gt_img.max()) # Debug # Save SR images for reference save_img_path = os.path.join( img_dir, '{:s}_{:d}.png'.format(img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR, SSIM and LPIPS distance crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. # For training models with only one channel ndim==2, if RGB ndim==3, etc. if gt_img.ndim == 2: cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size] else: cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] if sr_img.ndim == 2: cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size] else: # Default: RGB images cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] # If calculating only once for all images val_gt_imgs_list.append(cropped_gt_img) # If calculating only once for all images val_sr_imgs_list.append(cropped_sr_img) # LPIPS only works for RGB images avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_ssim += util.calculate_ssim(cropped_sr_img * 255, cropped_gt_img * 255) # If calculating for each image # avg_lpips += lpips.calculate_lpips([cropped_sr_img], [cropped_gt_img]) avg_psnr = avg_psnr / idx avg_ssim = avg_ssim / idx # avg_lpips=avg_lpips / idx # If calculating for each image # If calculating only once for all images avg_lpips = lpips.calculate_lpips(val_sr_imgs_list, val_gt_imgs_list) # log # logger.info('# Validation # PSNR: {:.5g}, SSIM: {:.5g}'.format(avg_psnr, avg_ssim)) logger.info( '# Validation # PSNR: {:.5g}, SSIM: {:.5g}, LPIPS: {:.5g}'. format(avg_psnr, avg_ssim, avg_lpips)) logger_val = logging.getLogger('val') # validation logger # logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr: {:.5g}, ssim: {:.5g}'.format( # epoch, current_step, avg_psnr, avg_ssim)) logger_val.info( '<epoch:{:3d}, iter:{:8,d}> psnr: {:.5g}, ssim: {:.5g}, lpips: {:.5g}' .format(epoch, current_step, avg_psnr, avg_ssim, avg_lpips)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger.add_scalar('ssim', avg_ssim, current_step) tb_logger.add_scalar('lpips', avg_lpips, current_step) logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option JSON file.') opt = option.parse(parser.parse_args().opt, is_train=True) opt = option.dict_to_nonedict( opt) # Convert to NoneDict, which return None for missing key. # train from scratch OR resume training if opt['path']['resume_state']: # resuming training resume_state = torch.load(opt['path']['resume_state']) else: # training from scratch resume_state = None util.mkdir_and_rename( opt['path']['experiments_root']) # rename old folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True) util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO) logger = logging.getLogger('base') if resume_state: # resume_state[] logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: from tensorboardX import SummaryWriter tb_logger = SummaryWriter(logdir='../../SRN_tb_logger/' + opt['name']) # random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True # create train and val dataloader for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int( math.ceil(len(train_set) / dataset_opt['batch_size'])) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) train_loader = create_dataloader(train_set, dataset_opt) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt) logger.info('Number of val images in [{:s}]: {:d}'.format( dataset_opt['name'], len(val_set))) else: raise NotImplementedError( 'Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None # create model model = create_model(opt) # resume training if resume_state: start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state, opt['train']) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 # training logger.info('Start training from epoch: {:d}, iter: {:d}'.format( start_epoch, current_step)) for epoch in range(start_epoch, total_epochs): for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break # update learning rate model.update_learning_rate() # training model.feed_data(train_data, True) model.optimize_parameters(current_step) # log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v, current_step) logger.info(message) # training samples if opt['train']['save_tsamples'] and current_step % opt['train'][ 'save_tsamples'] == 0: fake_LRs = os.listdir( opt['datasets']['train']['dataroot_fake_LR']) real_LRs = os.listdir( opt['datasets']['train']['dataroot_real_LR']) HRs = os.listdir(opt['datasets']['train']['dataroot_HR']) for i in range(5): random_index = np.random.choice(range(len(fake_LRs))) fake_LR_path = os.path.join( opt['datasets']['train']['dataroot_fake_LR'], fake_LRs[random_index]) real_LR_path = os.path.join( opt['datasets']['train']['dataroot_real_LR'], real_LRs[random_index]) HR_path = os.path.join( opt['datasets']['train']['dataroot_HR'], HRs[random_index]) fake_LR = np.array(Image.open(fake_LR_path)) real_LR = np.array(Image.open(real_LR_path)) HR = np.array(Image.open(HR_path)) h, w, _ = fake_LR.shape fake_LR = fake_LR[h // 2 - 64:h // 2 + 64, w // 2 - 64:w // 2 + 64, :] h, w, _ = HR.shape HR = HR[h // 2 - 64 * 4:h // 2 + 64 * 4, w // 2 - 64 * 4:w // 2 + 64 * 4, :] h, w, _ = real_LR.shape real_LR = real_LR[h // 2 - 64:h // 2 + 64, w // 2 - 64:w // 2 + 64, :] fake_LR = torch.from_numpy( np.ascontiguousarray(np.transpose( fake_LR, (2, 0, 1)))).float().unsqueeze(0) / 255 real_LR = torch.from_numpy( np.ascontiguousarray(np.transpose( real_LR, (2, 0, 1)))).float().unsqueeze(0) / 255 HR = torch.from_numpy( np.ascontiguousarray(np.transpose( HR, (2, 0, 1)))).float().unsqueeze(0) / 255 LR = torch.cat([fake_LR, real_LR], dim=0) data = {'LR': LR, 'HR': HR} model.feed_data(data, False) model.test(tsamples=True) visuals = model.get_current_visuals(tsamples=True) fake_SR = visuals['SR'][0] real_SR = visuals['SR'][1] fake_hf = visuals['hf'][0] real_hf = visuals['hf'][1] HR = visuals['HR'] HR_hf = visuals['HR_hf'][0] # image_1 = torch.cat([fake_LR[0], fake_SR[0]], dim=2) # image_2 = torch.cat([real_LR[0], real_SR[0]], dim=2) image_1 = np.clip(torch.cat([fake_SR, HR, real_SR], dim=2), 0, 1) image_2 = np.clip( torch.cat([fake_hf, HR_hf, real_hf], dim=2), 0, 1) image = torch.cat([image_1, image_2], dim=1) tb_logger.add_image( 'train/train_samples_{}'.format(str(i)), image, current_step) logger.info('Saved training Samples') # validation if current_step % opt['train']['val_freq'] == 0: avg_psnr = 0.0 idx = 0 avg_lpips = 0.0 for val_data in val_loader: idx += 1 img_name = os.path.splitext( os.path.basename(val_data['LR_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data, False) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 if 'HR' in opt['datasets']['val']['mode']: gt_img = util.tensor2img(visuals['HR']) # uint8 log_info = '{}'.format( val_data['HR_path'][0].split('/')[-1]) if opt['val_lpips']: lpips = visuals['LPIPS'] avg_lpips += lpips log_info += ' LPIPS:{:.3f}'.format( lpips.numpy()) if opt['use_domain_distance_map']: ada_w = visuals['adaptive_weights'] log_info += ' Adaptive weights:{:.2f}'.format( ada_w.numpy()) # logger.info('{} LPIPS: {:.3f}'.format(val_data['HR_path'][0].split('/')[-1], lpips.numpy())) # print('img:', val_data['HR_path'][0].split('/')[-1], 'LPIPS: %.3f' % lpips.numpy()) # else: # print('img:', val_data['LR_path'][0].split('/')[-1]) logger.info(log_info) # Save SR images for reference save_img_path = os.path.join(img_dir, '{:s}_{:d}.png'.format(\ img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR if 'HR' in opt['datasets']['val']['mode']: crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr( cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx if opt['val_lpips']: avg_lpips = avg_lpips / idx print('Mean LPIPS:', avg_lpips.numpy()) # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger_val = logging.getLogger('val') # validation logger if opt['val_lpips']: logger_val.info( '<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}, LPIPS: {:.4f}' .format(epoch, current_step, avg_psnr, avg_lpips)) else: logger_val.info( '<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format( epoch, current_step, avg_psnr)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) tb_logger.add_scalar('LPIPS', avg_lpips, current_step) # save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option JSON file.') opt = option.parse(parser.parse_args().opt, is_train=True) opt = option.dict_to_nonedict(opt) # Convert to NoneDict, which return None for missing key. # train from scratch OR resume training if opt['path']['resume_state']: # resuming training resume_state = torch.load(opt['path']['resume_state']) else: # training from scratch resume_state = None util.mkdir_and_rename(opt['path']['experiments_root']) # rename old folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True) util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO) logger = logging.getLogger('base') if resume_state: logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: from tensorboardX import SummaryWriter tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) # random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True # create train and val dataloader for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) train_loader = create_dataloader(train_set, dataset_opt) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt) logger.info('Number of val images in [{:s}]: {:d}'.format(dataset_opt['name'], len(val_set))) else: raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None # create model model = create_model(opt) # resume training if resume_state: start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: current_step = 0 start_epoch = 0 # training logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step)) for epoch in range(start_epoch, total_epochs): for _, train_data in enumerate(train_loader): current_step += 1 if current_step > total_iters: break # update learning rate model.update_learning_rate() # training model.feed_data(train_data) model.optimize_parameters(current_step) # log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v, current_step) logger.info(message) # validation if current_step % opt['train']['val_freq'] == 0: avg_psnr = 0.0 idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext(os.path.basename(val_data['LR_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() sr_img = util.tensor2img(visuals['SR']) # uint8 gt_img = util.tensor2img(visuals['HR']) # uint8 # Save SR images for reference save_img_path = os.path.join(img_dir, '{:s}_{:d}.png'.format(\ img_name, current_step)) util.save_img(sr_img, save_img_path) # calculate PSNR crop_size = opt['scale'] gt_img = gt_img / 255. sr_img = sr_img / 255. cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255) avg_psnr = avg_psnr / idx # log logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr)) logger_val = logging.getLogger('val') # validation logger logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format( epoch, current_step, avg_psnr)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr', avg_psnr, current_step) # save models and training states if current_step % opt['logger']['save_checkpoint_freq'] == 0: logger.info('Saving models and training states.') model.save(current_step) model.save_training_state(epoch, current_step) logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')
def main(): # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option JSON file.') opt = option.parse(parser.parse_args().opt, is_train=True) opt = option.dict_to_nonedict(opt) # Convert to NoneDict, which return None for missing key. pytorch_ver = get_pytorch_ver() # train from scratch OR resume training if opt['path']['resume_state']: if os.path.isdir(opt['path']['resume_state']): import glob resume_state_path = util.sorted_nicely(glob.glob(os.path.normpath(opt['path']['resume_state']) + '/*.state'))[-1] else: resume_state_path = opt['path']['resume_state'] resume_state = torch.load(resume_state_path) else: # training from scratch resume_state = None util.mkdir_and_rename(opt['path']['experiments_root']) # rename old folder if exists util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) # config loggers. Before it, the log will not work util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True) util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO) logger = logging.getLogger('base') if resume_state: logger.info('Set [resume_state] to ' + resume_state_path) logger.info('Resuming training from epoch: {}, iter: {}.'.format( resume_state['epoch'], resume_state['iter'])) option.check_resume(opt) # check resume options logger.info(option.dict2str(opt)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: from tensorboardX import SummaryWriter try: tb_logger = SummaryWriter(logdir='../tb_logger/' + opt['name']) #for version tensorboardX >= 1.7 except: tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name']) #for version tensorboardX < 1.6 # random seed seed = opt['train']['manual_seed'] if seed is None: seed = random.randint(1, 10000) logger.info('Random seed: {}'.format(seed)) util.set_random_seed(seed) torch.backends.cudnn.benckmark = True # torch.backends.cudnn.deterministic = True # create train and val dataloader for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': train_set = create_dataset(dataset_opt) train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size'])) logger.info('Number of train images: {:,d}, iters: {:,d}'.format( len(train_set), train_size)) total_iters = int(opt['train']['niter']) total_epochs = int(math.ceil(total_iters / train_size)) logger.info('Total epochs needed: {:d} for iters {:,d}'.format( total_epochs, total_iters)) train_loader = create_dataloader(train_set, dataset_opt) elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt) logger.info('Number of val images in [{:s}]: {:d}'.format(dataset_opt['name'], len(val_set))) else: raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase)) assert train_loader is not None # create model model = create_model(opt) # resume training if resume_state: start_epoch = resume_state['epoch'] current_step = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers model.update_schedulers(opt['train']) # updated schedulers in case JSON configuration has changed else: current_step = 0 start_epoch = 0 # training logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step)) for epoch in range(start_epoch, total_epochs): for n, train_data in enumerate(train_loader,start=1): current_step += 1 if current_step > total_iters: break if pytorch_ver=="pre": #Order for PyTorch ver < 1.1.0 # update learning rate model.update_learning_rate(current_step-1) # training model.feed_data(train_data) model.optimize_parameters(current_step) elif pytorch_ver=="post": #Order for PyTorch ver > 1.1.0 # training model.feed_data(train_data) model.optimize_parameters(current_step) # update learning rate model.update_learning_rate(current_step-1) else: print('Error identifying PyTorch version. ', torch.__version__) break # log if current_step % opt['logger']['print_freq'] == 0: logs = model.get_current_log() message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format( epoch, current_step, model.get_current_learning_rate()) for k, v in logs.items(): message += '{:s}: {:.4e} '.format(k, v) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar(k, v, current_step) logger.info(message) # save models and training states (changed to save models before validation) if current_step % opt['logger']['save_checkpoint_freq'] == 0: model.save(current_step) model.save_training_state(epoch + (n >= len(train_loader)), current_step) logger.info('Models and training states saved.') # validation if current_step % opt['train']['val_freq'] == 0: avg_psnr_c = 0.0 avg_psnr_s = 0.0 avg_psnr_p = 0.0 idx = 0 for val_data in val_loader: idx += 1 img_name = os.path.splitext(os.path.basename(val_data['LR_path'][0]))[0] img_dir = os.path.join(opt['path']['val_images'], img_name) util.mkdir(img_dir) model.feed_data(val_data) model.test() visuals = model.get_current_visuals() img_c = util.tensor2img(visuals['img_c']) # uint8 img_s = util.tensor2img(visuals['img_s']) # uint8 img_p = util.tensor2img(visuals['img_p']) # uint8 gt_img = util.tensor2img(visuals['HR']) # uint8 # Save SR images for reference save_c_img_path = os.path.join(img_dir, '{:s}_{:d}_c.png'.format(img_name, current_step)) save_s_img_path = os.path.join(img_dir, '{:s}_{:d}_s.png'.format(img_name, current_step)) save_p_img_path = os.path.join(img_dir, '{:s}_{:d}_d.png'.format(img_name, current_step)) util.save_img(img_c, save_c_img_path) util.save_img(img_s, save_s_img_path) util.save_img(img_p, save_p_img_path) # calculate PSNR crop_size = opt['scale'] gt_img = gt_img / 255. #sr_img = sr_img / 255. #ESRGAN #PPON #C sr_img_c = img_c / 255. cropped_sr_img_c = sr_img_c[crop_size:-crop_size, crop_size:-crop_size, :] cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr_c += util.calculate_psnr(cropped_sr_img_c * 255, cropped_gt_img * 255) #S sr_img_s = img_s / 255. cropped_sr_img_s = sr_img_s[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr_s += util.calculate_psnr(cropped_sr_img_s * 255, cropped_gt_img * 255) #D sr_img_p = img_p / 255. cropped_sr_img_p = sr_img_p[crop_size:-crop_size, crop_size:-crop_size, :] avg_psnr_p += util.calculate_psnr(cropped_sr_img_p * 255, cropped_gt_img * 255) avg_psnr_c = avg_psnr_c / idx avg_psnr_s = avg_psnr_s / idx avg_psnr_p = avg_psnr_p / idx # log logger.info('# Validation # PSNR_c: {:.5g}'.format(avg_psnr_c)) logger.info('# Validation # PSNR_s: {:.5g}'.format(avg_psnr_s)) logger.info('# Validation # PSNR_p: {:.5g}'.format(avg_psnr_p)) logger_val = logging.getLogger('val') # validation logger logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr_c: {:.5g}, psnr_s: {:.5g}, psnr_p: {:.5g}'.format( epoch, current_step, avg_psnr_c, avg_psnr_s, avg_psnr_p)) # tensorboard logger if opt['use_tb_logger'] and 'debug' not in opt['name']: tb_logger.add_scalar('psnr_c', avg_psnr_c, current_step) tb_logger.add_scalar('psnr_s', avg_psnr_s, current_step) tb_logger.add_scalar('psnr_p', avg_psnr_p, current_step) logger.info('Saving the final model.') model.save('latest') logger.info('End of training.')