def main():
    """Parse the command line, build data loaders and the model, then train.

    Command-line arguments:
        -opt: path to the option YAML file, parsed with ``option.parse``.
        --launcher: ``none`` disables distributed training (rank is set to
            -1); ``pytorch`` calls ``init_dist()`` and takes the world size
            and rank from ``torch.distributed``.
        --local_rank: accepted but not read in this function (presumably
            injected by the distributed launcher).

    Side effects: creates the experiment directories, configures the
    ``base`` logger, optionally creates a TensorBoard writer, periodically
    validates and saves checkpoints, and finally saves the model as
    ``latest``.

    Raises:
        NotImplementedError: if ``opt["datasets"]`` contains a phase other
            than ``train`` or ``val``.
        AssertionError: if no ``train`` dataset was configured.
    """
    # options
    parser = argparse.ArgumentParser()
    parser.add_argument("-opt", type=str, help="Path to option YAML file.")
    parser.add_argument("--launcher", choices=["none", "pytorch"],
                        default="none", help="job launcher")
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    # distributed training settings
    if args.launcher == "none":  # disabled distributed training
        opt["dist"] = False
        rank = -1
        print("Disabled distributed training.")
    else:
        opt["dist"] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    # loading resume state if exists
    if opt["path"].get("resume_state", None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt["path"]["resume_state"],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state["iter"])  # check resume options
    else:
        resume_state = None

    # mkdir and loggers
    # The writer only exists on rank <= 0 with TensorBoard enabled and a
    # non-debug run name; everywhere else it stays None so that later uses
    # (including the final close) can be guarded uniformly.
    tb_logger = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt["path"]["experiments_root"])
            util.mkdirs(
                (path for key, path in opt["path"].items()
                 if not key == "experiments_root"
                 and "pretrain_model" not in key and "resume" not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger("base", opt["path"]["log"], "train_" + opt["name"],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger("base")
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt["use_tb_logger"] and "debug" not in opt["name"]:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    "You are using PyTorch {}. "
                    "Tensorboard will use [tensorboardX]".format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir="../tb_logger/" + opt["name"])
    else:
        util.setup_logger("base", opt["path"]["log"], "train",
                          level=logging.INFO, screen=True)
        logger = logging.getLogger("base")

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    # random seed
    seed = opt["train"]["manual_seed"]
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info("Random seed: {}".format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    # create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    train_loader = None  # lets the assert below fire instead of a NameError
    for phase, dataset_opt in opt["datasets"].items():
        if phase == "train":
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt["batch_size"]))
            total_iters = int(opt["train"]["niter"])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt["dist"]:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    "Number of train images: {:,d}, iters: {:,d}".format(
                        len(train_set), train_size))
                logger.info("Total epochs needed: {:d} for iters {:,d}".format(
                    total_epochs, total_iters))
        elif phase == "val":
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info("Number of val images in [{:s}]: {:d}".format(
                    dataset_opt["name"], len(val_set)))
        else:
            raise NotImplementedError(
                "Phase [{:s}] is not recognized.".format(phase))
    assert train_loader is not None

    # create model
    model = create_model(opt)
    print("Model created!")

    # resume training
    if resume_state:
        logger.info("Resuming training from epoch: {}, iter: {}.".format(
            resume_state["epoch"], resume_state["iter"]))

        start_epoch = resume_state["epoch"]
        current_step = resume_state["iter"]
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    # training
    logger.info("Start training from epoch: {:d}, iter: {:d}".format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt["dist"]:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            # update learning rate
            model.update_learning_rate(
                current_step, warmup_iter=opt["train"]["warmup_iter"])

            # training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            # log
            if current_step % opt["logger"]["print_freq"] == 0:
                logs = model.get_current_log()
                message = "[epoch:{:3d}, iter:{:8,d}, lr:(".format(
                    epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += "{:.3e},".format(v)
                message += ")] "
                for k, v in logs.items():
                    message += "{:s}: {:.4e} ".format(k, v)
                    # tensorboard logger
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            # validation
            if opt["datasets"].get("val", None) and \
                    current_step % opt["train"]["val_freq"] == 0:
                if opt["model"] in ["sr", "srgan"]:
                    # image restoration validation
                    # does not support multi-GPU validation, so only rank -1
                    # or rank 0 runs it; other ranks skip validation instead
                    # of falling through to the video branch below.
                    if rank <= 0:
                        pbar = util.ProgressBar(len(val_loader))
                        avg_psnr = 0.0
                        idx = 0
                        for val_data in val_loader:
                            idx += 1
                            img_name = os.path.splitext(
                                os.path.basename(val_data["LQ_path"][0]))[0]
                            img_dir = os.path.join(opt["path"]["val_images"],
                                                   img_name)
                            util.mkdir(img_dir)

                            model.feed_data(val_data)
                            model.test()

                            visuals = model.get_current_visuals()
                            sr_img = util.tensor2img(visuals["rlt"])  # uint8
                            gt_img = util.tensor2img(visuals["GT"])  # uint8

                            # Save SR images for reference
                            save_img_path = os.path.join(
                                img_dir,
                                "{:s}_{:d}.png".format(img_name, current_step))
                            util.save_img(sr_img, save_img_path)

                            # calculate PSNR
                            sr_img, gt_img = util.crop_border(
                                [sr_img, gt_img], opt["scale"])
                            avg_psnr += util.calculate_psnr(sr_img, gt_img)
                            pbar.update("Test {}".format(img_name))

                        avg_psnr = avg_psnr / idx

                        # log
                        logger.info(
                            "# Validation # PSNR: {:.4e}".format(avg_psnr))
                        # tensorboard logger
                        if tb_logger is not None:
                            tb_logger.add_scalar("psnr", avg_psnr,
                                                 current_step)
                else:  # video restoration validation
                    if opt["dist"]:
                        # multi-GPU testing: each rank handles every
                        # world_size-th sample starting at its own rank.
                        psnr_rlt = {}  # with border and center frames
                        if rank == 0:
                            pbar = util.ProgressBar(len(val_set))
                        for idx in range(rank, len(val_set), world_size):
                            val_data = val_set[idx]
                            val_data["LQs"].unsqueeze_(0)
                            val_data["GT"].unsqueeze_(0)
                            folder = val_data["folder"]
                            idx_d, max_idx = val_data["idx"].split("/")
                            idx_d, max_idx = int(idx_d), int(max_idx)
                            if folder not in psnr_rlt:
                                psnr_rlt[folder] = torch.zeros(
                                    max_idx, dtype=torch.float32,
                                    device="cuda")
                            model.feed_data(val_data)
                            model.test()
                            visuals = model.get_current_visuals()
                            rlt_img = util.tensor2img(visuals["rlt"])  # uint8
                            gt_img = util.tensor2img(visuals["GT"])  # uint8
                            # calculate PSNR
                            psnr_rlt[folder][idx_d] = util.calculate_psnr(
                                rlt_img, gt_img)

                            if rank == 0:
                                # advance once per rank so the bar tracks the
                                # whole validation set
                                for _ in range(world_size):
                                    pbar.update("Test {} - {}/{}".format(
                                        folder, idx_d, max_idx))
                        # collect data
                        for v in psnr_rlt.values():
                            dist.reduce(v, 0)
                        dist.barrier()

                        if rank == 0:
                            psnr_rlt_avg = {}
                            psnr_total_avg = 0.0
                            for k, v in psnr_rlt.items():
                                psnr_rlt_avg[k] = torch.mean(v).cpu().item()
                                psnr_total_avg += psnr_rlt_avg[k]
                            psnr_total_avg /= len(psnr_rlt)
                            log_s = "# Validation # PSNR: {:.4e}:".format(
                                psnr_total_avg)
                            for k, v in psnr_rlt_avg.items():
                                log_s += " {}: {:.4e}".format(k, v)
                            logger.info(log_s)
                            if tb_logger is not None:
                                tb_logger.add_scalar("psnr_avg",
                                                     psnr_total_avg,
                                                     current_step)
                                for k, v in psnr_rlt_avg.items():
                                    tb_logger.add_scalar(k, v, current_step)
                    else:
                        pbar = util.ProgressBar(len(val_loader))
                        psnr_rlt = {}  # with border and center frames
                        psnr_rlt_avg = {}
                        psnr_total_avg = 0.0
                        for val_data in val_loader:
                            folder = val_data["folder"][0]
                            idx_d, _ = val_data["idx"][0].split("/")
                            # border = val_data['border'].item()
                            if folder not in psnr_rlt:
                                psnr_rlt[folder] = []

                            model.feed_data(val_data)
                            model.test()
                            visuals = model.get_current_visuals()
                            rlt_img = util.tensor2img(visuals["rlt"])  # uint8
                            gt_img = util.tensor2img(visuals["GT"])  # uint8
                            lq_img = util.tensor2img(
                                visuals["LQ"][2])  # uint8

                            # save LQ / result / GT side by side
                            img_dir = opt["path"]["val_images"]
                            util.mkdir(img_dir)
                            save_img_path = os.path.join(
                                img_dir, "{}.png".format(idx_d))
                            util.save_img(
                                np.hstack((lq_img, rlt_img, gt_img)),
                                save_img_path)

                            # calculate PSNR
                            psnr = util.calculate_psnr(rlt_img, gt_img)
                            psnr_rlt[folder].append(psnr)

                            pbar.update("Test {} - {}".format(folder, idx_d))
                        for k, v in psnr_rlt.items():
                            psnr_rlt_avg[k] = sum(v) / len(v)
                            psnr_total_avg += psnr_rlt_avg[k]
                        psnr_total_avg /= len(psnr_rlt)
                        log_s = "# Validation # PSNR: {:.4e}:".format(
                            psnr_total_avg)
                        for k, v in psnr_rlt_avg.items():
                            log_s += " {}: {:.4e}".format(k, v)
                        logger.info(log_s)
                        if tb_logger is not None:
                            tb_logger.add_scalar("psnr_avg", psnr_total_avg,
                                                 current_step)
                            for k, v in psnr_rlt_avg.items():
                                tb_logger.add_scalar(k, v, current_step)

            # save models and training states
            if current_step % opt["logger"]["save_checkpoint_freq"] == 0:
                if rank <= 0:
                    logger.info("Saving models and training states.")
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info("Saving the final model.")
        model.save("latest")
        logger.info("End of training.")
        # The writer is absent when TensorBoard logging is disabled or the
        # run name contains "debug"; closing unconditionally raised NameError.
        if tb_logger is not None:
            tb_logger.close()
def main():
    """Train the Predictor and Corrector networks using a SFTMD model.

    Command-line arguments:
        -opt_P / -opt_C / -opt_F: option files for the Predictor, the
            Corrector and SFTMD_Net; each is parsed with ``option.parse``
            and converted with ``option.dict_to_nonedict``. Only the
            ``'sftmd'`` sub-dictionary of the third file is used.
        --launcher: ``none`` disables distributed training (rank is set to
            -1); ``pytorch`` calls ``init_dist()`` and takes the world size
            and rank from ``torch.distributed``.
        --local_rank: accepted but not read in this function (presumably
            injected by the distributed launcher).

    Per training iteration the Predictor is optimised on a degraded batch,
    then for ``opt_C['step']`` rounds the SFTMD model is run in test mode
    and the Corrector is optimised on its output. Validation, logging and
    checkpointing follow the frequencies stored in ``opt_P``.

    Raises:
        NotImplementedError: if ``opt_P['datasets']`` contains a phase other
            than ``train`` or ``val``.
        AssertionError: if the train or val dataset was not configured.
    """
    #### setup options of three networks
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt_P', type=str,
                        help='Path to option YMAL file of Predictor.')
    parser.add_argument('-opt_C', type=str,
                        help='Path to option YMAL file of Corrector.')
    parser.add_argument('-opt_F', type=str,
                        help='Path to option YMAL file of SFTMD_Net.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'],
                        default='none', help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt_P = option.parse(args.opt_P, is_train=True)
    opt_C = option.parse(args.opt_C, is_train=True)
    opt_F = option.parse(args.opt_F, is_train=True)

    # convert to NoneDict, which returns None for missing keys
    opt_P = option.dict_to_nonedict(opt_P)
    opt_C = option.dict_to_nonedict(opt_C)
    opt_F = option.dict_to_nonedict(opt_F)

    # choose small opt for SFTMD test, fill path of pre-trained model_F
    opt_F = opt_F['sftmd']

    # create PCA matrix of enough kernel
    batch_ker = util.random_batch_kernel(batch=30000, l=opt_P['kernel_size'],
                                         sig_min=0.2, sig_max=4.0,
                                         rate_iso=1.0, scaling=3, tensor=False)
    print('batch kernel shape: {}'.format(batch_ker.shape))
    b = np.size(batch_ker, 0)
    batch_ker = batch_ker.reshape((b, -1))  # flatten each kernel to one row
    pca_matrix = util.PCA(batch_ker, k=opt_P['code_length']).float()
    print('PCA matrix shape: {}'.format(pca_matrix.shape))

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt_P['dist'] = False
        opt_F['dist'] = False
        opt_C['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt_P['dist'] = True
        opt_F['dist'] = True
        opt_C['dist'] = True
        init_dist()
        # number of processes in the current process group
        world_size = torch.distributed.get_world_size()
        # rank of the current process in the group
        rank = torch.distributed.get_rank()

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    ###### Predictor&Corrector train ######

    #### loading resume state if exists
    if opt_P['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt_P['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt_P, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    # The writer only exists on rank <= 0 when opt_P enables TensorBoard and
    # the Predictor name is not a debug name; otherwise it stays None so
    # every later use (including the final close) can be guarded.
    tb_logger = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # Predictor path
            # rename experiment folder if exists
            util.mkdir_and_rename(opt_P['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt_P['path'].items()
                 if not key == 'experiments_root'
                 and 'pretrain_model' not in key and 'resume' not in key))
            # Corrector path
            # rename experiment folder if exists
            util.mkdir_and_rename(opt_C['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt_C['path'].items()
                 if not key == 'experiments_root'
                 and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt_P['path']['log'],
                          'train_' + opt_P['name'], level=logging.INFO,
                          screen=True, tofile=True)
        util.setup_logger('val', opt_P['path']['log'], 'val_' + opt_P['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt_P))
        logger.info(option.dict2str(opt_C))
        # tensorboard logger
        if opt_P['use_tb_logger'] and 'debug' not in opt_P['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt_P['name'])
    else:
        util.setup_logger('base', opt_P['path']['log'], 'train',
                          level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    #### random seed
    seed = opt_P['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    # Initialised so the asserts below fail cleanly instead of NameError.
    train_loader = None
    val_loader = None
    for phase, dataset_opt in opt_P['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt_P['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt_P['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt_P,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt_P, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None
    assert val_loader is not None

    #### create model
    model_F = create_model(opt_F)  # load pretrained model of SFTMD
    model_P = create_model(opt_P)
    model_C = create_model(opt_C)

    #### resume training
    # NOTE(review): only model_P is resumed here although training state is
    # saved for model_C as well — confirm whether that is intended.
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model_P.resume_training(
            resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt_P['dist']:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            #### update learning rate, schedulers
            # model.update_learning_rate(current_step, warmup_iter=opt_P['train']['warmup_iter'])

            #### preprocessing for LR_img and kernel map
            prepro = util.SRMDPreprocessing(
                opt_P['scale'], pca_matrix,
                para_input=opt_P['code_length'],
                kernel=opt_P['kernel_size'], noise=False, cuda=True,
                sig_min=0.2, sig_max=4.0, rate_iso=1.0, scaling=3,
                rate_cln=0.2, noise_high=0.0)
            LR_img, ker_map = prepro(train_data['GT'])

            #### training Predictor
            model_P.feed_data(LR_img, ker_map)
            model_P.optimize_parameters(current_step)
            P_visuals = model_P.get_current_visuals()
            est_ker_map = P_visuals['Batch_est_ker_map']

            #### log of model_P
            if current_step % opt_P['logger']['print_freq'] == 0:
                logs = model_P.get_current_log()
                message = 'Predictor <epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format(
                    epoch, current_step, model_P.get_current_learning_rate())
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            #### training Corrector
            for step in range(opt_C['step']):
                # test SFTMD for corresponding SR image
                model_F.feed_data(train_data, LR_img, est_ker_map)
                model_F.test()
                F_visuals = model_F.get_current_visuals()
                SR_img = F_visuals['Batch_SR']
                # Test SFTMD to produce SR images

                # train corrector given SR image and estimated kernel map
                model_C.feed_data(SR_img, est_ker_map, ker_map)
                model_C.optimize_parameters(current_step)
                C_visuals = model_C.get_current_visuals()
                # the corrected estimate feeds the next round
                est_ker_map = C_visuals['Batch_est_ker_map']

                #### log of model_C
                if current_step % opt_C['logger']['print_freq'] == 0:
                    logs = model_C.get_current_log()
                    message = 'Corrector <epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format(
                        epoch, current_step,
                        model_C.get_current_learning_rate())
                    for k, v in logs.items():
                        message += '{:s}: {:.4e} '.format(k, v)
                        # tensorboard logger: the writer is created from
                        # opt_P, so also require that it exists before
                        # honouring the Corrector's own switch.
                        if (tb_logger is not None
                                and opt_C['use_tb_logger']
                                and 'debug' not in opt_C['name']):
                            tb_logger.add_scalar(k, v, current_step)
                    if rank <= 0:
                        logger.info(message)

            # validation, to produce ker_map_list(fake)
            if current_step % opt_P['train']['val_freq'] == 0 and rank <= 0:
                avg_psnr = 0.0
                idx = 0  # counts (image, corrector step) pairs
                for _, val_data in enumerate(val_loader):
                    prepro = util.SRMDPreprocessing(
                        opt_P['scale'], pca_matrix,
                        para_input=opt_P['code_length'],
                        kernel=opt_P['kernel_size'], noise=False, cuda=True,
                        sig_min=0.2, sig_max=4.0, rate_iso=1.0, scaling=3,
                        rate_cln=0.2, noise_high=0.0)
                    LR_img, ker_map = prepro(val_data['GT'])
                    single_img_psnr = 0.0

                    # valid Predictor
                    model_P.feed_data(LR_img, ker_map)
                    model_P.test()
                    P_visuals = model_P.get_current_visuals()
                    est_ker_map = P_visuals['Batch_est_ker_map']

                    # steps are numbered from 1 for file names and logs
                    for step in range(1, opt_C['step'] + 1):
                        idx += 1
                        model_F.feed_data(val_data, LR_img, est_ker_map)
                        model_F.test()
                        F_visuals = model_F.get_current_visuals()
                        SR_img = F_visuals['Batch_SR']
                        # Test SFTMD to produce SR images

                        model_C.feed_data(SR_img, est_ker_map, ker_map)
                        model_C.test()
                        C_visuals = model_C.get_current_visuals()
                        est_ker_map = C_visuals['Batch_est_ker_map']

                        sr_img = util.tensor2img(F_visuals['SR'])  # uint8
                        gt_img = util.tensor2img(F_visuals['GT'])  # uint8

                        # Save SR images for reference
                        img_name = os.path.splitext(
                            os.path.basename(val_data['LQ_path'][0]))[0]
                        img_dir = os.path.join(opt_P['path']['val_images'],
                                               img_name)
                        # img_dir = os.path.join(opt_F['path']['val_images'], str(current_step), '_', str(step))

                        util.mkdir(img_dir)
                        save_img_path = os.path.join(
                            img_dir, '{:s}_{:d}_{:d}.png'.format(
                                img_name, current_step, step))
                        util.save_img(sr_img, save_img_path)

                        # calculate PSNR
                        crop_size = opt_P['scale']
                        gt_img = gt_img / 255.
                        sr_img = sr_img / 255.
                        cropped_sr_img = sr_img[crop_size:-crop_size,
                                                crop_size:-crop_size, :]
                        cropped_gt_img = gt_img[crop_size:-crop_size,
                                                crop_size:-crop_size, :]
                        step_psnr = util.calculate_psnr(
                            cropped_sr_img * 255, cropped_gt_img * 255)
                        logger.info(
                            '<epoch:{:3d}, iter:{:8,d}, step:{:3d}> img:{:s}, psnr: {:.4f}'
                            .format(epoch, current_step, step, img_name,
                                    step_psnr))
                        single_img_psnr += step_psnr
                        # reuse the value computed above rather than running
                        # the same PSNR computation a second time
                        avg_psnr += step_psnr

                    avg_single_img_psnr = single_img_psnr / step
                    logger.info(
                        '<epoch:{:3d}, iter:{:8,d}, step:{:3d}> img:{:s}, average psnr: {:.4f}'
                        .format(epoch, current_step, step, img_name,
                                avg_single_img_psnr))

                avg_psnr = avg_psnr / idx

                # log
                logger.info('# Validation # PSNR: {:.4f}'.format(avg_psnr))
                logger_val = logging.getLogger('val')  # validation logger
                logger_val.info(
                    '<epoch:{:3d}, iter:{:8,d}, step:{:3d}> psnr: {:.4f}'.
                    format(epoch, current_step, step, avg_psnr))
                # tensorboard logger
                if tb_logger is not None:
                    tb_logger.add_scalar('psnr', avg_psnr, current_step)

            #### save models and training states
            if current_step % opt_P['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model_P.save(current_step)
                    model_P.save_training_state(epoch, current_step)
                    model_C.save(current_step)
                    model_C.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model_P.save('latest')
        model_C.save('latest')
        logger.info('End of Predictor and Corrector training.')
    # The writer is absent on non-zero ranks and whenever TensorBoard logging
    # is disabled; closing unconditionally raised NameError in those cases.
    if tb_logger is not None:
        tb_logger.close()
def main():
    """Parse the command line, build data loaders and the model, then train.

    Command-line arguments:
        -opt: path to the option file, parsed with ``option.parse``.
        --launcher: ``none`` disables distributed training (rank is set to
            -1); ``pytorch`` calls ``init_dist()`` and takes the world size
            and rank from ``torch.distributed``.
        --local_rank: accepted but not read in this function (presumably
            injected by the distributed launcher).

    Side effects: creates the experiment directories, configures the
    ``base`` and ``val`` loggers, optionally creates a TensorBoard writer,
    periodically validates (rank <= 0 only, when a ``val`` dataset is
    configured) and saves checkpoints, and finally saves the model as
    ``latest``.

    Raises:
        NotImplementedError: if ``opt['datasets']`` contains a phase other
            than ``train`` or ``val``.
        AssertionError: if no ``train`` dataset was configured.
    """
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YMAL file.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'],
                        default='none', help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    # The writer only exists on rank <= 0 with TensorBoard enabled and a
    # non-debug run name; everywhere else it stays None.
    tb_logger = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if not key == 'experiments_root'
                 and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        util.setup_logger('base', opt['path']['log'], 'train',
                          level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    train_loader = None  # lets the assert below fire instead of a NameError
    val_loader = None  # stays None when no 'val' dataset is configured
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    #### create model
    model = create_model(opt)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            #### update learning rate
            model.update_learning_rate(
                current_step, warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format(
                    epoch, current_step, model.get_current_learning_rate())
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            # validation
            # Skipped when no 'val' dataset was configured; previously this
            # path raised NameError on the undefined val_loader.
            if (val_loader is not None
                    and current_step % opt['train']['val_freq'] == 0
                    and rank <= 0):
                avg_psnr = 0.0
                idx = 0
                for val_data in val_loader:
                    idx += 1
                    img_name = os.path.splitext(
                        os.path.basename(val_data['LQ_path'][0]))[0]
                    img_dir = os.path.join(opt['path']['val_images'], img_name)
                    util.mkdir(img_dir)

                    model.feed_data(val_data)
                    model.test()

                    visuals = model.get_current_visuals()
                    sr_img = util.tensor2img(visuals['SR'])  # uint8
                    gt_img = util.tensor2img(visuals['GT'])  # uint8

                    # Save SR images for reference
                    save_img_path = os.path.join(
                        img_dir,
                        '{:s}_{:d}.png'.format(img_name, current_step))
                    util.save_img(sr_img, save_img_path)

                    # calculate PSNR on images cropped by `scale` pixels
                    # along each border
                    crop_size = opt['scale']
                    gt_img = gt_img / 255.
                    sr_img = sr_img / 255.
                    cropped_sr_img = sr_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    cropped_gt_img = gt_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    avg_psnr += util.calculate_psnr(cropped_sr_img * 255,
                                                    cropped_gt_img * 255)

                avg_psnr = avg_psnr / idx

                # log
                logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
                logger_val = logging.getLogger('val')  # validation logger
                logger_val.info(
                    '<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format(
                        epoch, current_step, avg_psnr))
                # tensorboard logger
                if tb_logger is not None:
                    tb_logger.add_scalar('psnr', avg_psnr, current_step)

            #### save models and training states
            if current_step % opt['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('latest')
        logger.info('End of training.')
        # Close the writer so buffered events are flushed before exit.
        if tb_logger is not None:
            tb_logger.close()
def SFTMD_train(opt_F, rank, world_size, pca_matrix):
    """Train the SFTMD model described by ``opt_F``.

    Args:
        opt_F: option mapping for the SFTMD network. The keys read here are
            ``path``, ``name``, ``use_tb_logger``, ``datasets``, ``train``,
            ``logger``, ``scale`` and ``dist``.
        rank: process rank; values <= 0 own directory creation, file
            logging, validation and checkpoint saving.
        world_size: number of processes, only used to build the distributed
            sampler when ``opt_F['dist']`` is truthy.
        pca_matrix: passed unchanged to ``util.SRMDPreprocessing`` to build
            the kernel maps.

    Raises:
        NotImplementedError: if ``opt_F['datasets']`` contains a phase other
            than ``train`` or ``val``.
        AssertionError: if the train or val dataset was not configured.
    """
    #### loading resume state if exists
    if opt_F['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt_F['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt_F, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    # The writer only exists on rank <= 0 with TensorBoard enabled and a
    # non-debug run name; everywhere else it stays None.
    tb_logger = None
    if rank <= 0:
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt_F['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt_F['path'].items()
                 if not key == 'experiments_root'
                 and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt_F['path']['log'],
                          'train_' + opt_F['name'], level=logging.INFO,
                          screen=True, tofile=True)
        util.setup_logger('val', opt_F['path']['log'], 'val_' + opt_F['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt_F))
        # tensorboard logger
        if opt_F['use_tb_logger'] and 'debug' not in opt_F['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'.format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt_F['name'])
    else:
        util.setup_logger('base', opt_F['path']['log'], 'train',
                          level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    # Initialised so the asserts below fail cleanly instead of NameError.
    train_loader = None
    val_loader = None
    for phase, dataset_opt in opt_F['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt_F['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt_F['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt_F,
                                             train_sampler)
            if rank <= 0:
                logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                    len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt_F, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None
    assert val_loader is not None

    #### create model
    model_F = create_model(opt_F)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model_F.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt_F['dist']:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break

            #### preprocessing for LR_img and kernel map
            # NOTE(review): training passes para_input=10 and kernel=21 while
            # validation below passes para_input=15 and no kernel argument —
            # confirm this asymmetry is intended.
            prepro = util.SRMDPreprocessing(
                opt_F['scale'], pca_matrix, para_input=10, kernel=21,
                noise=False, cuda=True, sig_min=0.2, sig_max=4.0,
                rate_iso=1.0, scaling=3, rate_cln=0.2, noise_high=0.0)
            LR_img, ker_map = prepro(train_data['GT'])

            #### update learning rate, schedulers
            model_F.update_learning_rate(
                current_step, warmup_iter=opt_F['train']['warmup_iter'])

            #### training
            model_F.feed_data(train_data, LR_img, ker_map)
            model_F.optimize_parameters(current_step)

            #### log
            if current_step % opt_F['logger']['print_freq'] == 0:
                logs = model_F.get_current_log()
                message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format(
                    epoch, current_step, model_F.get_current_learning_rate())
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            # validation
            if current_step % opt_F['train']['val_freq'] == 0 and rank <= 0:
                avg_psnr = 0.0
                idx = 0
                for _, val_data in enumerate(val_loader):
                    idx += 1
                    #### preprocessing for LR_img and kernel map
                    prepro = util.SRMDPreprocessing(
                        opt_F['scale'], pca_matrix, para_input=15,
                        noise=False, cuda=True, sig_min=0.2, sig_max=4.0,
                        rate_iso=1.0, scaling=3, rate_cln=0.2,
                        noise_high=0.0)
                    LR_img, ker_map = prepro(val_data['GT'])

                    model_F.feed_data(val_data, LR_img, ker_map)
                    model_F.test()

                    visuals = model_F.get_current_visuals()
                    sr_img = util.tensor2img(visuals['SR'])  # uint8
                    gt_img = util.tensor2img(visuals['GT'])  # uint8

                    # Save SR images for reference, grouped by iteration
                    img_name = os.path.splitext(
                        os.path.basename(val_data['LQ_path'][0]))[0]
                    #img_dir = os.path.join(opt_F['path']['val_images'], img_name)
                    img_dir = os.path.join(opt_F['path']['val_images'],
                                           str(current_step))
                    util.mkdir(img_dir)
                    save_img_path = os.path.join(
                        img_dir,
                        '{:s}_{:d}.png'.format(img_name, current_step))
                    util.save_img(sr_img, save_img_path)

                    # calculate PSNR on images cropped by `scale` pixels
                    # along each border
                    crop_size = opt_F['scale']
                    gt_img = gt_img / 255.
                    sr_img = sr_img / 255.
                    cropped_sr_img = sr_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    cropped_gt_img = gt_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    avg_psnr += util.calculate_psnr(cropped_sr_img * 255,
                                                    cropped_gt_img * 255)

                avg_psnr = avg_psnr / idx

                # log
                logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
                logger_val = logging.getLogger('val')  # validation logger
                logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e}'.format(
                    epoch, current_step, avg_psnr))
                # tensorboard logger
                if tb_logger is not None:
                    tb_logger.add_scalar('psnr', avg_psnr, current_step)

            #### save models and training states
            if current_step % opt_F['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model_F.save(current_step)
                    model_F.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model_F.save('latest')
        logger.info('End of SFTMD training.')
        # Close the writer so buffered events are flushed before returning.
        if tb_logger is not None:
            tb_logger.close()
def _log_video_val_psnr(logger, tb_logger, use_wandb, psnr_rlt_avg, visuals,
                        current_step):
    """Report video-validation PSNR to the text log, TensorBoard and wandb.

    Args:
        logger: logger that receives the one-line summary.
        tb_logger: TensorBoard writer, or None when TensorBoard is disabled.
        use_wandb: whether to mirror the metrics and sample images to wandb.
        psnr_rlt_avg: mapping of clip folder -> mean PSNR of that folder.
        visuals: visuals of the last validated sample; only read when
            ``use_wandb`` is true, in which case it must provide the keys
            'LQ', 'rlt' and 'GT'.
        current_step: training iteration used as the logging step.

    Raises:
        ZeroDivisionError: if ``psnr_rlt_avg`` is empty (no sample validated).
    """
    psnr_total_avg = sum(psnr_rlt_avg.values()) / len(psnr_rlt_avg)

    log_s = '# Validation # PSNR: {:.4e}:'.format(psnr_total_avg)
    log_s += ''.join(
        ' {}: {:.4e}'.format(k, v) for k, v in psnr_rlt_avg.items())
    logger.info(log_s)

    if tb_logger is not None:
        tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step)
        for k, v in psnr_rlt_avg.items():
            tb_logger.add_scalar(k, v, current_step)

    if use_wandb:
        lq_img, rlt_img, gt_img = map(
            util.tensor2img, [visuals['LQ'], visuals['rlt'], visuals['GT']])
        wandb.log({'psnr_avg': psnr_total_avg}, step=current_step)
        wandb.log(psnr_rlt_avg, step=current_step)
        # The channel axis is reversed before upload (presumably BGR -> RGB;
        # TODO confirm against util.tensor2img).
        wandb.log(
            {
                'Validation Image': [
                    wandb.Image(lq_img[:, :, [2, 1, 0]], caption='LQ'),
                    wandb.Image(rlt_img[:, :, [2, 1, 0]], caption='output'),
                    wandb.Image(gt_img[:, :, [2, 1, 0]], caption='GT'),
                ]
            },
            step=current_step)


def main():
    """Train a restoration model, optionally distributed, with tb/wandb logging.

    Parses the command line (``-opt``, ``--launcher``, ``--local_rank``),
    loads the option file, optionally restores a resume state, builds the
    train/val data loaders and the model, then runs the iteration-based
    training loop with periodic logging, validation and checkpointing.

    NOTE: this source was recovered from whitespace-collapsed text; the block
    nesting below is reconstructed from the control-flow keywords.
    """
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch'],
                        default='none',
                        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    # Metric sinks are switched off for debug runs.  `.get` is used because
    # `opt` is not a NoneDict yet at this point: a config without these keys
    # must behave like "disabled" instead of raising KeyError.
    metrics_enabled = 'debug' not in opt['name']
    use_tb = bool(opt.get('use_tb_logger')) and metrics_enabled
    use_wandb = bool(opt.get('use_wandb_logger')) and metrics_enabled
    tb_logger = None  # only created on the master process when use_tb

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if key != 'experiments_root' and 'pretrain_model' not in key
                 and 'resume' not in key
                 and 'wandb_load_run_path' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base',
                          opt['path']['log'],
                          'train_' + opt['name'],
                          level=logging.INFO,
                          screen=True,
                          tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))

        # tensorboard logger
        if use_tb:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])

        # wandb logger; the API key is read from ~/.wandb_api_keys.json
        # when that file exists.
        if use_wandb:
            json_path = os.path.join(os.path.expanduser('~'),
                                     '.wandb_api_keys.json')
            if os.path.exists(json_path):
                with open(json_path, 'r') as j:
                    json_file = json.load(j)
                os.environ['WANDB_API_KEY'] = json_file['ryul99']
            wandb.init(project="mmsr", config=opt, sync_tensorboard=True)
    else:
        util.setup_logger('base',
                          opt['path']['log'],
                          'train',
                          level=logging.INFO,
                          screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
        if use_wandb:  # wandb is only initialised on the master process
            wandb.config.update({'random_seed': seed})
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    train_loader = None  # lets the assert below fail cleanly
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    #### create model
    model = create_model(opt)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    train_noise_mode = opt['datasets']['train']['noise_mode']
    train_noise_rate = opt['datasets']['train']['noise_rate']
    for epoch in range(start_epoch, total_epochs + 1):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
        for train_data in train_loader:
            current_step += 1
            if current_step > total_iters:
                break

            #### update learning rate
            model.update_learning_rate(
                current_step, warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data,
                            noise_mode=train_noise_mode,
                            noise_rate=train_noise_rate)
            model.optimize_parameters(current_step)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(
                    epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tb_logger only exists on the master process
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                    if use_wandb and rank <= 0:
                        wandb.log({k: v}, step=current_step)
                if rank <= 0:
                    logger.info(message)

            #### validation
            if opt['datasets'].get(
                    'val',
                    None) and current_step % opt['train']['val_freq'] == 0:
                if opt['model'] in ['sr', 'srgan']:
                    # image restoration validation
                    # does not support multi-GPU validation: only the master
                    # validates.  The other ranks must skip validation
                    # altogether rather than fall through to the video branch.
                    if rank <= 0:
                        pbar = util.ProgressBar(len(val_loader))
                        avg_psnr = 0.
                        idx = 0
                        for val_data in val_loader:
                            idx += 1
                            img_name = os.path.splitext(
                                os.path.basename(val_data['LQ_path'][0]))[0]
                            img_dir = os.path.join(opt['path']['val_images'],
                                                   img_name)
                            util.mkdir(img_dir)

                            model.feed_data(
                                val_data,
                                noise_mode=opt['datasets']['val']['noise_mode'],
                                noise_rate=opt['datasets']['val']['noise_rate'])
                            model.test()

                            visuals = model.get_current_visuals()
                            sr_img = util.tensor2img(visuals['rlt'])  # uint8
                            gt_img = util.tensor2img(visuals['GT'])  # uint8

                            # Save SR images for reference
                            save_img_path = os.path.join(
                                img_dir,
                                '{:s}_{:d}.png'.format(img_name, current_step))
                            util.save_img(sr_img, save_img_path)

                            # calculate PSNR
                            sr_img, gt_img = util.crop_border(
                                [sr_img, gt_img], opt['scale'])
                            avg_psnr += util.calculate_psnr(sr_img, gt_img)
                            pbar.update('Test {}'.format(img_name))

                        avg_psnr = avg_psnr / idx

                        # log
                        logger.info(
                            '# Validation # PSNR: {:.4e}'.format(avg_psnr))
                        if tb_logger is not None:
                            tb_logger.add_scalar('psnr', avg_psnr,
                                                 current_step)
                        if use_wandb:
                            wandb.log({'psnr': avg_psnr}, step=current_step)
                elif opt['dist']:
                    # video restoration validation, multi-GPU testing:
                    # rank r handles samples r, r + world_size, ...
                    psnr_rlt = {}  # with border and center frames
                    if rank == 0:
                        pbar = util.ProgressBar(len(val_set))
                    for idx in range(rank, len(val_set), world_size):
                        val_data = val_set[idx]
                        val_data['LQs'].unsqueeze_(0)
                        val_data['GT'].unsqueeze_(0)
                        folder = val_data['folder']
                        idx_d, max_idx = val_data['idx'].split('/')
                        idx_d, max_idx = int(idx_d), int(max_idx)
                        if psnr_rlt.get(folder, None) is None:
                            psnr_rlt[folder] = torch.zeros(
                                max_idx, dtype=torch.float32, device='cuda')
                        model.feed_data(
                            val_data,
                            noise_mode=opt['datasets']['val']['noise_mode'],
                            noise_rate=opt['datasets']['val']['noise_rate'])
                        model.test()
                        visuals = model.get_current_visuals()
                        rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                        gt_img = util.tensor2img(visuals['GT'])  # uint8
                        # calculate PSNR
                        psnr_rlt[folder][idx_d] = util.calculate_psnr(
                            rlt_img, gt_img)

                        if rank == 0:
                            # one tick per sample processed across all ranks
                            for _ in range(world_size):
                                pbar.update('Test {} - {}/{}'.format(
                                    folder, idx_d, max_idx))

                    # collect data: sum every rank's per-frame tensor on rank 0
                    for v in psnr_rlt.values():
                        dist.reduce(v, 0)
                    dist.barrier()

                    if rank == 0:
                        psnr_rlt_avg = {
                            k: torch.mean(v).cpu().item()
                            for k, v in psnr_rlt.items()
                        }
                        _log_video_val_psnr(logger, tb_logger, use_wandb,
                                            psnr_rlt_avg, visuals,
                                            current_step)
                else:
                    # video restoration validation, single process
                    pbar = util.ProgressBar(len(val_loader))
                    psnr_rlt = {}  # with border and center frames
                    for val_data in val_loader:
                        folder = val_data['folder'][0]
                        idx_d = val_data['idx'].item()
                        if psnr_rlt.get(folder, None) is None:
                            psnr_rlt[folder] = []

                        model.feed_data(
                            val_data,
                            noise_mode=opt['datasets']['val']['noise_mode'],
                            noise_rate=opt['datasets']['val']['noise_rate'])
                        model.test()
                        visuals = model.get_current_visuals()
                        rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                        gt_img = util.tensor2img(visuals['GT'])  # uint8

                        # calculate PSNR
                        psnr = util.calculate_psnr(rlt_img, gt_img)
                        psnr_rlt[folder].append(psnr)
                        pbar.update('Test {} - {}'.format(folder, idx_d))

                    psnr_rlt_avg = {
                        k: sum(v) / len(v)
                        for k, v in psnr_rlt.items()
                    }
                    _log_video_val_psnr(logger, tb_logger, use_wandb,
                                        psnr_rlt_avg, visuals, current_step)

            #### save models and training states
            if current_step % opt['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('latest')
        logger.info('End of training.')
        if tb_logger is not None:
            tb_logger.close()
def _average_val_by_folder(per_folder, mean_fn):
    """Return ({folder: mean}, mean of the folder means) for one metric.

    Raises:
        ZeroDivisionError: if ``per_folder`` is empty.
    """
    folder_avg = {k: mean_fn(v) for k, v in per_folder.items()}
    return folder_avg, sum(folder_avg.values()) / len(per_folder)


def _format_val_summary(title, total_avg, folder_avg):
    """Format the '# Validation # <title>: ...' summary line for one metric."""
    line = '# Validation # {}: {:.4e}:'.format(title, total_avg)
    return line + ''.join(
        ' {}: {:.4e}'.format(k, v) for k, v in folder_avg.items())


def _format_epoch_train_loss(loss_terms, avg_total, term_meters):
    """Format the running average training losses of the current epoch.

    Args:
        loss_terms: sequence of (label, log_dict key) pairs for a composite
            pixel criterion, or None for a plain single-term criterion.
        avg_total: meter tracking the total loss.
        term_meters: mapping label -> meter for each composite term.
    """
    if loss_terms is None:
        return ' train_avg_loss: {:.4e}'.format(avg_total.avg)
    parts = [
        ' {}_avg_loss: {:.4e}'.format(label, term_meters[label].avg)
        for label, _ in loss_terms
    ]
    parts.append(' total_avg_loss: {:.4e}'.format(avg_total.avg))
    return ''.join(parts)


def _report_epoch_video_validation(logger, logger_val, tb_logger, model, epoch,
                                   current_step, message_train_loss, psnr_rlt,
                                   ssim_rlt, val_loss_rlt, mean_fn):
    """Log PSNR / SSIM / validation loss of one video-validation pass.

    Args:
        logger: training logger receiving the three per-metric summary lines.
        logger_val: validation logger receiving the one-line epoch summary.
        tb_logger: TensorBoard writer, or None when TensorBoard is disabled.
        model: model queried for its current learning rate(s).
        epoch: epoch index written into the epoch summary.
        current_step: step used for the TensorBoard scalars.
        message_train_loss: preformatted training-loss text for the summary.
        psnr_rlt, ssim_rlt, val_loss_rlt: mapping folder -> per-sample values.
        mean_fn: callable reducing one folder's values to a float.

    Returns:
        Tuple ``(psnr_total_avg, ssim_total_avg, val_loss_total_avg)``.
    """
    psnr_rlt_avg, psnr_total_avg = _average_val_by_folder(psnr_rlt, mean_fn)
    logger.info(_format_val_summary('PSNR', psnr_total_avg, psnr_rlt_avg))

    ssim_rlt_avg, ssim_total_avg = _average_val_by_folder(ssim_rlt, mean_fn)
    logger.info(_format_val_summary('SSIM', ssim_total_avg, ssim_rlt_avg))

    val_loss_rlt_avg, val_loss_total_avg = _average_val_by_folder(
        val_loss_rlt, mean_fn)
    logger.info(
        _format_val_summary('Loss', val_loss_total_avg, val_loss_rlt_avg))

    # total validation log
    lr_text = ''.join(
        '{:.5e}'.format(v) for v in model.get_current_learning_rate())
    logger_val.info(
        'Epoch {:02d}, LR {:s}, PSNR {:.4f}, SSIM {:.4f} Train {:s}, Val Total Loss {:.4e}'
        .format(epoch, lr_text, psnr_total_avg, ssim_total_avg,
                message_train_loss, val_loss_total_avg))

    if tb_logger is not None:
        tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step)
        for k, v in psnr_rlt_avg.items():
            tb_logger.add_scalar(k, v, current_step)
        # add val loss
        tb_logger.add_scalar('val_loss_avg', val_loss_total_avg, current_step)
        # NOTE(review): the per-folder loss is written under the same tag as
        # the per-folder PSNR above, so both series share one chart.
        for k, v in val_loss_rlt_avg.items():
            tb_logger.add_scalar(k, v, current_step)

    return psnr_total_avg, ssim_total_avg, val_loss_total_avg


def main():
    """Train a restoration model epoch by epoch with per-epoch validation.

    After every epoch the model is checkpointed, validated (PSNR, SSIM and
    validation loss for video models), the best-loss and best-PSNR weights
    are refreshed, and a ``ReduceLROnPlateau`` schedule, if configured, is
    stepped on the validation loss.

    NOTE: this source was recovered from whitespace-collapsed text; the block
    nesting below is reconstructed from the control-flow keywords.
    """
    #### set options
    parser = argparse.ArgumentParser()
    parser.add_argument('--opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch'],
                        default='none',
                        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        print("Rank:", rank)
        print("------------------DIST-------------------------")

    # TensorBoard is switched off for debug runs.  `.get` is used because
    # `opt` is not a NoneDict yet: a config without the key means "disabled".
    use_tb = bool(opt.get('use_tb_logger')) and 'debug' not in opt['name']
    tb_logger = None  # only created on the master process when use_tb

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if key != 'experiments_root' and 'pretrain_model' not in key
                 and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base',
                          opt['path']['log'],
                          'train_' + opt['name'],
                          level=logging.INFO,
                          screen=True,
                          tofile=True)
        util.setup_logger('base_val',
                          opt['path']['log'],
                          'val_' + opt['name'],
                          level=logging.INFO,
                          screen=True,
                          tofile=True)
        logger = logging.getLogger('base')
        logger_val = logging.getLogger('base_val')
        logger.info(option.dict2str(opt))

        # tensorboard logger
        if use_tb:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        # config loggers. Before it, the log will not work
        util.setup_logger('base',
                          opt['path']['log'],
                          'train_',
                          level=logging.INFO,
                          screen=True)
        print("set train log")
        util.setup_logger('base_val',
                          opt['path']['log'],
                          'val_',
                          level=logging.INFO,
                          screen=True)
        print("set val log")
        logger = logging.getLogger('base')
        logger_val = logging.getLogger('base_val')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    # Only an explicit `enable: false` disables training; a missing key
    # (None from the NoneDict) must keep training on.
    train_disabled = opt['train']['enable'] == False  # noqa: E712

    #### create train and val dataloader
    dataset_ratio = 1  # enlarge the size of each epoch
    train_loader = None  # lets the assert below fail cleanly
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            # One "iteration budget" is one pass over the training set;
            # the number of passes comes from the `epoch` option.
            total_iters = train_size
            total_epochs = int(opt['train']['epoch'])
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                # NOTE(review): the single-epoch shortcut for disabled
                # training only applies to distributed runs, as before.
                if train_disabled:
                    total_epochs = 1
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    #### create model
    model = create_model(opt)
    print("Model Created! ")

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0
        print("Not Resume Training")

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))

    # Composite pixel criteria: (label used in the log, key in model.log_dict)
    # for every term tracked besides the total.  Any other criterion tracks
    # only `l_pix`.
    loss_terms = {
        'cb+ssim': (('pix', 'l_pix'), ('ssim', 'ssim_loss')),
        'cb+ssim+vmaf': (('pix', 'l_pix'), ('ssim', 'ssim_loss'),
                         ('vmaf', 'vmaf_loss')),
        'ssim': (('ssim', 'ssim_loss'), ),
        'msssim': (('msssim', 'msssim_loss'), ),
        'cb+msssim': (('pix', 'l_pix'), ('msssim', 'msssim_loss')),
    }.get(opt['train']['pixel_criterion'])

    saved_total_loss = 10e10
    saved_total_PSNR = -1

    for epoch in range(start_epoch, total_epochs):
        #### start a new epoch: fresh meters, step counter restarts at 0
        avg_total = AverageMeter()
        term_meters = {label: AverageMeter() for label, _ in loss_terms or ()}
        current_step = 0
        message_train_loss = 'None'

        if opt['dist']:
            train_sampler.set_epoch(epoch)

        for train_idx, train_data in enumerate(train_loader):
            if 'debug' in opt['name']:
                # dump the first few batches for visual inspection
                img_dir = os.path.join(opt['path']['train_images'])
                util.mkdir(img_dir)

                LQ = train_data['LQs']
                GT = train_data['GT']

                GT_img = util.tensor2img(GT)  # uint8
                save_img_path = os.path.join(
                    img_dir, '{:4d}_{:s}.png'.format(train_idx, 'debug_GT'))
                util.save_img(GT_img, save_img_path)

                for i in range(5):
                    LQ_img = util.tensor2img(LQ[0, i, ...])  # uint8
                    save_img_path = os.path.join(
                        img_dir,
                        '{:4d}_{:s}_{:1d}.png'.format(train_idx, 'debug_LQ',
                                                      i))
                    util.save_img(LQ_img, save_img_path)

                if train_idx >= 3:
                    break

            if train_disabled:
                break

            current_step += 1
            if current_step > total_iters:
                print("Total Iteration Reached !")
                break

            #### update learning rate
            # ReduceLROnPlateau is stepped once per epoch on the val loss.
            if opt['train']['lr_scheme'] != 'ReduceLROnPlateau':
                model.update_learning_rate(
                    current_step, warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            if loss_terms is None:
                avg_total.update(model.log_dict['l_pix'], 1)
            else:
                avg_total.update(model.log_dict['total_loss'], 1)
                for label, key in loss_terms:
                    term_meters[label].update(model.log_dict[key], 1)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(
                    epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tb_logger only exists on the master process
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)

                message += _format_epoch_train_loss(loss_terms, avg_total,
                                                    term_meters)
                if rank <= 0:
                    logger.info(message)

        if not train_disabled:
            message_train_loss = _format_epoch_train_loss(
                loss_terms, avg_total, term_meters)

        #### end of one epoch, save epoch model
        if rank <= 0:
            logger.info('Saving models and training states.')
            model.save('{:04d}'.format(epoch))

            # Keep only the newest per-epoch generator weights.  Done by the
            # master alone and after the new save, so ranks never race on the
            # file and a failed save cannot leave no checkpoint behind.
            previous_path = os.path.join(
                opt['path']['models'], '{:04d}_{}.pth'.format(epoch - 1, 'G'))
            if os.path.exists(previous_path):
                os.remove(previous_path)

        #### end of one epoch, do validation
        psnr_total_avg = None
        val_loss_total_avg = None

        if opt['datasets'].get('val', None):
            if opt['model'] in ['sr', 'srgan']:
                # image restoration validation
                # does not support multi-GPU validation: only the master
                # validates.  The other ranks must skip validation rather
                # than fall through to the video branch.
                if rank <= 0:
                    pbar = util.ProgressBar(len(val_loader))
                    avg_psnr = 0.
                    idx = 0
                    for val_data in val_loader:
                        idx += 1
                        img_name = os.path.splitext(
                            os.path.basename(val_data['LQ_path'][0]))[0]
                        img_dir = os.path.join(opt['path']['val_images'],
                                               img_name)
                        util.mkdir(img_dir)

                        model.feed_data(val_data)
                        model.test()

                        visuals = model.get_current_visuals()
                        sr_img = util.tensor2img(visuals['rlt'])  # uint8
                        gt_img = util.tensor2img(visuals['GT'])  # uint8

                        # calculate PSNR
                        sr_img, gt_img = util.crop_border([sr_img, gt_img],
                                                          opt['scale'])
                        avg_psnr += util.calculate_psnr(sr_img, gt_img)
                        pbar.update('Test {}'.format(img_name))

                    avg_psnr = avg_psnr / idx

                    # log
                    logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
                    if tb_logger is not None:
                        tb_logger.add_scalar('psnr', avg_psnr, current_step)
            elif opt['dist']:
                # video restoration validation, multi-GPU testing:
                # rank r handles samples r, r + world_size, ...
                psnr_rlt = {}  # with border and center frames
                ssim_rlt = {}
                val_loss_rlt = {}

                if rank == 0:
                    pbar = util.ProgressBar(len(val_set))

                for idx in range(rank, len(val_set), world_size):
                    print('idx', idx)
                    if 'debug' in opt['name'] and idx >= 3:
                        break
                    val_data = val_set[idx]
                    val_data['LQs'].unsqueeze_(0)
                    val_data['GT'].unsqueeze_(0)
                    folder = val_data['folder']
                    idx_d, max_idx = val_data['idx'].split('/')
                    idx_d, max_idx = int(idx_d), int(max_idx)

                    for per_folder in (psnr_rlt, ssim_rlt, val_loss_rlt):
                        if per_folder.get(folder, None) is None:
                            per_folder[folder] = torch.zeros(
                                max_idx, dtype=torch.float32, device='cuda')

                    model.feed_data(val_data)
                    if opt['stitch'] == True:  # noqa: E712
                        model.test_stitch()
                    else:
                        model.test()  # large GPU memory

                    visuals = model.get_current_visuals(
                        save=True,
                        name='{}_{}'.format(folder, idx),
                        save_path=opt['path']['val_images'])
                    rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                    gt_img = util.tensor2img(visuals['GT'])  # uint8

                    # calculate PSNR
                    psnr = util.calculate_psnr(rlt_img, gt_img)
                    psnr_rlt[folder][idx_d] = psnr

                    # calculate SSIM
                    ssim = util.calculate_ssim(rlt_img, gt_img)
                    ssim_rlt[folder][idx_d] = ssim

                    # calculate Val loss
                    val_loss = model.get_loss()
                    val_loss_rlt[folder][idx_d] = val_loss

                    logger.info('{}_{:02d} PSNR: {:.4f}, SSIM: {:.4f}'.format(
                        folder, idx, psnr, ssim))

                    if rank == 0:
                        # one tick per sample processed across all ranks
                        for _ in range(world_size):
                            pbar.update('Test {} - {}/{}'.format(
                                folder, idx_d, max_idx))

                # collect data: sum every rank's per-frame tensors on rank 0
                for per_folder in (psnr_rlt, ssim_rlt, val_loss_rlt):
                    for v in per_folder.values():
                        dist.reduce(v, 0)
                dist.barrier()

                metrics = torch.zeros(3, dtype=torch.float64, device='cuda')
                if rank == 0:
                    metrics = torch.tensor(
                        _report_epoch_video_validation(
                            logger, logger_val, tb_logger, model, epoch,
                            current_step, message_train_loss, psnr_rlt,
                            ssim_rlt, val_loss_rlt,
                            lambda v: torch.mean(v).cpu().item()),
                        dtype=torch.float64,
                        device='cuda')
                # Only rank 0 holds the reduced metrics.  Share them so every
                # rank steps its LR scheduler on the same validation loss
                # instead of a constant 0.
                dist.broadcast(metrics, 0)
                psnr_total_avg, _, val_loss_total_avg = metrics.tolist()
            else:
                # video restoration validation, single process
                pbar = util.ProgressBar(len(val_loader))
                psnr_rlt = {}  # with border and center frames
                ssim_rlt = {}
                val_loss_rlt = {}

                for val_inx, val_data in enumerate(val_loader):
                    if 'debug' in opt['name'] and val_inx >= 5:
                        break
                    folder = val_data['folder'][0]
                    idx_d = val_data['idx']
                    for per_folder in (psnr_rlt, ssim_rlt, val_loss_rlt):
                        if per_folder.get(folder, None) is None:
                            per_folder[folder] = []

                    # process the black blank [B N C H W]
                    print(val_data['LQs'].size())

                    H_S = val_data['LQs'].size(3)  # 540
                    W_S = val_data['LQs'].size(4)  # 960

                    print(H_S)
                    print(W_S)

                    blank_1_S = 0
                    blank_2_S = 0

                    print(val_data['LQs'][0, 2, 0, :, :].size())

                    # first row (from the top) whose pixel sum is non-zero
                    for i in range(H_S):
                        if not sum(val_data['LQs'][0, 2, 0, i, :]) == 0:
                            blank_1_S = i - 1
                            break

                    # NOTE(review): this scan indexes the last (width) axis
                    # with a height-derived index, unlike the top scan above
                    # which walks rows -- confirm whether rows were intended.
                    for i in range(H_S):
                        if not sum(val_data['LQs'][0, 2, 0, :,
                                                   H_S - i - 1]) == 0:
                            blank_2_S = (H_S - 1) - i - 1
                            break
                    print('LQ :', blank_1_S, blank_2_S)

                    if blank_1_S == -1:
                        print('LQ has no blank')
                        blank_1_S = 0
                        blank_2_S = H_S

                    print("LQ", val_data['LQs'].size())
                    # end of process the black blank

                    model.feed_data(val_data)

                    if opt['stitch'] == True:  # noqa: E712
                        model.test_stitch()
                    else:
                        model.test()  # large GPU memory

                    # process blank: map the LQ row bounds to the output
                    # resolution (x4) and zero the output outside them
                    blank_1_L = blank_1_S << 2
                    blank_2_L = blank_2_S << 2
                    print(blank_1_L, blank_2_L)

                    print(model.fake_H.size())

                    if not blank_1_S == 0:
                        model.fake_H[:, :, 0:blank_1_L, :] = 0
                        # NOTE(review): the upper bound is the LQ height, not
                        # the output height -- confirm the intended range.
                        model.fake_H[:, :, blank_2_L:H_S, :] = 0
                    # end of process blank

                    visuals = model.get_current_visuals(
                        save=True,
                        name='{}_{:02d}'.format(folder, val_inx),
                        save_path=opt['path']['val_images'])

                    rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                    gt_img = util.tensor2img(visuals['GT'])  # uint8

                    # calculate PSNR
                    psnr = util.calculate_psnr(rlt_img, gt_img)
                    psnr_rlt[folder].append(psnr)

                    # calculate SSIM
                    ssim = util.calculate_ssim(rlt_img, gt_img)
                    ssim_rlt[folder].append(ssim)

                    # val loss
                    val_loss = model.get_loss()
                    val_loss_rlt[folder].append(val_loss.item())

                    logger.info('{}_{:02d} PSNR: {:.4f}, SSIM: {:.4f}'.format(
                        folder, val_inx, psnr, ssim))

                    pbar.update('Test {} - {}'.format(folder, idx_d))

                psnr_total_avg, _, val_loss_total_avg = (
                    _report_epoch_video_validation(
                        logger, logger_val, tb_logger, model, epoch,
                        current_step, message_train_loss, psnr_rlt, ssim_rlt,
                        val_loss_rlt, lambda v: sum(v) / len(v)))

        #### end of validation, save model
        # The validation loss (not the training loss) selects the best model.
        # Only the master writes checkpoints.
        if val_loss_total_avg is not None and rank <= 0:
            if saved_total_loss >= val_loss_total_avg:
                saved_total_loss = val_loss_total_avg
                model.save('best')
                logger.info(
                    "Best Weights updated for decreased validation loss")
            else:
                logger.info(
                    "Weights Not updated for undecreased validation loss")

            if saved_total_PSNR <= psnr_total_avg:
                saved_total_PSNR = psnr_total_avg
                model.save('bestPSNR')
                logger.info(
                    "Best Weights updated for increased validation PSNR")
            else:
                logger.info(
                    "Weights Not updated for unincreased validation PSNR")

        #### end of one epoch, schedule LR
        if opt['train']['lr_scheme'] == 'ReduceLROnPlateau':
            if val_loss_total_avg is None:
                if rank <= 0:
                    logger.warning(
                        'No validation loss this epoch; '
                        'ReduceLROnPlateau schedulers were not stepped.')
            else:
                for scheduler in model.schedulers:
                    scheduler.step(val_loss_total_avg)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('last')
        logger.info('End of training.')
        # tb_logger is None when TensorBoard is disabled or in debug runs
        if tb_logger is not None:
            tb_logger.close()
def main() -> None:
    """Run Predictor training as configured by the ``-opt`` option file.

    Steps, in order: parse the command line and the option file, seed the
    random generators, load the PCA matrix, optionally initialise
    distributed training, load a resume state if one is configured, create
    the experiment directories and loggers (rank <= 0 only), build the
    train/val dataloaders and the model, then train until
    ``opt["train"]["niter"]`` steps are done.

    During training, losses are logged every ``print_freq`` steps, PSNR
    validation runs every ``val_freq`` steps on rank <= 0, and checkpoints
    are written every ``save_checkpoint_freq`` steps.

    Raises:
        NotImplementedError: if ``opt["datasets"]`` contains a phase other
            than ``"train"`` or ``"val"``.
        AssertionError: if no train or no val dataloader was created.
    """
    #### setup options
    parser = argparse.ArgumentParser()
    parser.add_argument("-opt",
                        type=str,
                        help="Path to option YMAL file of Predictor.")
    parser.add_argument("--launcher",
                        choices=["none", "pytorch"],
                        default="none",
                        help="job launcher")
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### set random seed
    seed = opt["train"]["manual_seed"]
    if seed is None:
        seed = random.randint(1, 10000)
    util.set_random_seed(seed)

    # load PCA matrix; the identity map_location leaves it on the CPU
    print("load PCA matrix")
    pca_matrix = torch.load(opt["pca_matrix_path"],
                            map_location=lambda storage, loc: storage)
    print("PCA matrix shape: {}".format(pca_matrix.shape))

    #### distributed training settings
    if args.launcher == "none":  # disabled distributed training
        opt["dist"] = False
        rank = -1
        print("Disabled distributed training.")
    else:
        opt["dist"] = True
        init_dist()
        # number of processes in the current process group
        world_size = torch.distributed.get_world_size()
        # rank of this process inside the group
        rank = torch.distributed.get_rank()

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### loading resume state if exists
    if opt["path"].get("resume_state", None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt["path"]["resume_state"],
            map_location=lambda storage, loc: storage.cuda(device_id),
        )
        option.check_resume(opt, resume_state["iter"])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    # Stays None unless a SummaryWriter is created below, so every later use
    # (including the final close) can simply test for None.
    tb_logger = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt["path"]["experiments_root"])
            util.mkdirs(
                (path for key, path in opt["path"].items()
                 if not key == "experiments_root"
                 and "pretrain_model" not in key and "resume" not in key))
            # Drop a stale ./log file or link (what `rm ./log` did) without
            # spawning a shell, then point ./log at the experiments parent.
            if os.path.islink("./log") or os.path.isfile("./log"):
                os.remove("./log")
            os.symlink(os.path.join(opt["path"]["experiments_root"], ".."),
                       "./log")

        # config loggers. Before it, the log will not work
        util.setup_logger(
            "base",
            opt["path"]["log"],
            "train_" + opt["name"],
            level=logging.INFO,
            screen=True,
            tofile=True,
        )
        util.setup_logger(
            "val",
            opt["path"]["log"],
            "val_" + opt["name"],
            level=logging.INFO,
            screen=True,
            tofile=True,
        )
        logger = logging.getLogger("base")
        logger.info(option.dict2str(opt))

        # tensorboard logger
        if opt["use_tb_logger"] and "debug" not in opt["name"]:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    "You are using PyTorch {}. Tensorboard will use [tensorboardX]"
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir="log/tb_logger/" + opt["name"])
    else:
        util.setup_logger("base",
                          opt["path"]["log"],
                          "train",
                          level=logging.INFO,
                          screen=True)
        logger = logging.getLogger("base")

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    # Pre-bind so the asserts below fail with AssertionError, not NameError,
    # when a phase is missing from the option file.
    train_loader = None
    val_loader = None
    for phase, dataset_opt in opt["datasets"].items():
        if phase == "train":
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt["batch_size"]))
            total_iters = int(opt["train"]["niter"])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt["dist"]:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    "Number of train images: {:,d}, iters: {:,d}".format(
                        len(train_set), train_size))
                logger.info("Total epochs needed: {:d} for iters {:,d}".format(
                    total_epochs, total_iters))
        elif phase == "val":
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info("Number of val images in [{:s}]: {:d}".format(
                    dataset_opt["name"], len(val_set)))
        else:
            raise NotImplementedError(
                "Phase [{:s}] is not recognized.".format(phase))
    assert train_loader is not None
    assert val_loader is not None

    #### create model
    model = create_model(opt)

    #### resume training
    if resume_state:
        logger.info("Resuming training from epoch: {}, iter: {}.".format(
            resume_state["epoch"], resume_state["iter"]))
        start_epoch = resume_state["epoch"]
        current_step = resume_state["iter"]
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    # Produces the (LR image, kernel map) pair from each GT batch.
    prepro = util.SRMDPreprocessing(
        opt["scale"],
        pca_matrix,
        random=True,
        para_input=opt["code_length"],
        kernel=opt["kernel_size"],
        noise=False,
        cuda=True,
        sig=None,
        sig_min=opt["sig_min"],
        sig_max=opt["sig_max"],
        rate_iso=1.0,
        scaling=3,
        rate_cln=0.2,
        noise_high=0.0,
    )

    #### training
    logger.info("Start training from epoch: {:d}, iter: {:d}".format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt["dist"]:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break

            #### preprocessing for LR_img and kernel map
            LR_img, ker_map = prepro(train_data["GT"])
            # snap the LR image onto the 1/255 grid
            LR_img = (LR_img * 255).round() / 255

            #### training Predictor
            model.feed_data(LR_img, train_data["GT"], ker_map)
            model.optimize_parameters(current_step)
            model.update_learning_rate(
                current_step, warmup_iter=opt["train"]["warmup_iter"])
            # NOTE(review): the result is unused in the training step; the
            # call is kept in case get_current_visuals has side effects.
            visuals = model.get_current_visuals()

            #### log of model_P
            if current_step % opt["logger"]["print_freq"] == 0:
                logs = model.get_current_log()
                message = "Predictor <epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> ".format(
                    epoch, current_step, model.get_current_learning_rate())
                for k, v in logs.items():
                    message += "{:s}: {:.4e} ".format(k, v)
                    # tensorboard logger (writer only exists on rank <= 0)
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            # validation, to produce ker_map_list(fake)
            if current_step % opt["train"]["val_freq"] == 0 and rank <= 0:
                avg_psnr = 0.0
                idx = 0
                for _, val_data in enumerate(val_loader):
                    LR_img = val_data["LQ"]
                    lr_img = util.tensor2img(LR_img)  # LR image for reference

                    # valid Predictor
                    model.feed_data(LR_img, val_data["GT"])
                    model.test()
                    visuals = model.get_current_visuals()

                    # Save images for reference
                    img_name = os.path.splitext(
                        os.path.basename(val_data["LQ_path"][0]))[0]
                    img_dir = os.path.join(opt["path"]["val_images"], img_name)
                    util.mkdir(img_dir)
                    save_lr_path = os.path.join(img_dir,
                                                "{:s}_LR.png".format(img_name))
                    util.save_img(lr_img, save_lr_path)

                    sr_img = util.tensor2img(visuals["SR"])  # uint8
                    gt_img = util.tensor2img(visuals["GT"])  # uint8
                    save_img_path = os.path.join(
                        img_dir,
                        "{:s}_{:d}.png".format(img_name, current_step))
                    util.save_img(sr_img, save_img_path)

                    # calculate PSNR on the images with a border of
                    # `scale` pixels cropped away on every side
                    crop_size = opt["scale"]
                    gt_img = gt_img / 255.0
                    sr_img = sr_img / 255.0
                    cropped_sr_img = sr_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    cropped_gt_img = gt_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    avg_psnr += util.calculate_psnr(cropped_sr_img * 255,
                                                    cropped_gt_img * 255)
                    idx += 1

                if idx == 0:
                    # An empty loader would otherwise divide by zero.
                    logger.warning(
                        "Validation loader yielded no samples; skipping PSNR.")
                else:
                    avg_psnr = avg_psnr / idx

                    # log
                    logger.info("# Validation # PSNR: {:.6f}".format(avg_psnr))
                    logger_val = logging.getLogger("val")  # validation logger
                    logger_val.info(
                        "<epoch:{:3d}, iter:{:8,d}, psnr: {:.6f}".format(
                            epoch, current_step, avg_psnr))
                    # tensorboard logger
                    if tb_logger is not None:
                        tb_logger.add_scalar("psnr", avg_psnr, current_step)

            #### save models and training states
            if current_step % opt["logger"]["save_checkpoint_freq"] == 0:
                if rank <= 0:
                    logger.info("Saving models and training states.")
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info("Saving the final model.")
        model.save("latest")
        logger.info("End of Predictor and Corrector training.")
    # The writer is absent when TensorBoard logging is off, in debug runs,
    # and on ranks > 0; closing unconditionally raised NameError there.
    if tb_logger is not None:
        tb_logger.close()
def main() -> None:
    """Run epoch-based training with per-epoch validation.

    Steps, in order: parse the command line and the option file, optionally
    initialise distributed training, load a resume state if configured,
    create directories and loggers, build the dataloaders and the model,
    then loop over ``opt['train']['epoch']`` epochs.

    After each epoch the previous epoch's generator checkpoint is removed
    and the current one saved (rank <= 0). If a ``val`` dataset is
    configured, validation runs, the ``best`` / ``bestPSNR`` weights are
    updated on rank <= 0, and ReduceLROnPlateau schedulers are stepped with
    the validation loss.

    Raises:
        NotImplementedError: if ``opt['datasets']`` contains a phase other
            than ``'train'`` or ``'val'``.
        AssertionError: if no train dataloader was created.
    """
    # ---------------- set options ----------------
    parser = argparse.ArgumentParser()
    parser.add_argument('--opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch'],
                        default='none',
                        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    # ---------------- distributed training settings ----------------
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        print("Rank:", rank)
        print("World Size", world_size)
        print("------------------DIST-------------------------")

    # ---------------- loading resume state if exists ----------------
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    # ---------------- mkdir and loggers ----------------
    debug_mode = 'debug' in opt['name']

    # Stays None unless a SummaryWriter is created below, so every later use
    # (including the final close) can simply test for None.
    tb_logger = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if not key == 'experiments_root'
                 and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base',
                          opt['path']['log'],
                          'train_' + opt['name'],
                          level=logging.INFO,
                          screen=True,
                          tofile=True)
        util.setup_logger('base_val',
                          opt['path']['log'],
                          'val_' + opt['name'],
                          level=logging.INFO,
                          screen=True,
                          tofile=True)
        logger = logging.getLogger('base')
        logger_val = logging.getLogger('base_val')
        logger.info(option.dict2str(opt))

        # tensorboard logger
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        # config loggers. Before it, the log will not work
        util.setup_logger('base',
                          opt['path']['log'],
                          'train_',
                          level=logging.INFO,
                          screen=True)
        print("set train log")
        util.setup_logger('base_val',
                          opt['path']['log'],
                          'val_',
                          level=logging.INFO,
                          screen=True)
        print("set val log")
        logger = logging.getLogger('base')
        logger_val = logging.getLogger('base_val')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    # ---------------- random seed ----------------
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    # ---------------- create train and val dataloader ----------------
    dataset_ratio = 200  # enlarge the size of each epoch
    # Pre-bind so the assert below fails with AssertionError, not NameError,
    # when the option file has no 'train' phase.
    train_loader = None
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            if opt['datasets']['train'].get('split', None):
                # with 'split' set, create_dataset returns both sets
                train_set, val_set = create_dataset(dataset_opt)
            else:
                train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            # one epoch == one pass of train_size iterations
            total_iters = train_size
            total_epochs = int(opt['train']['epoch'])
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(opt['train']['epoch'])
                # `== False` on purpose: a missing key reads as None from
                # the NoneDict and must keep training enabled.
                if opt['train']['enable'] == False:  # noqa: E712
                    total_epochs = 1
            else:
                train_sampler = RandomBalancedSampler(train_set, train_size)
            train_loader = create_dataloader(train_set,
                                             dataset_opt,
                                             opt,
                                             train_sampler,
                                             vscode_debug=debug_mode)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            if not opt['datasets']['train'].get('split', None):
                val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set,
                                           dataset_opt,
                                           opt,
                                           None,
                                           vscode_debug=debug_mode)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    # ---------------- create model ----------------
    model = create_model(opt)
    print("Model Created! ")

    # ---------------- resume training ----------------
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))
        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0
        print("Not Resume Training")

    # ---------------- training ----------------
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))

    model.train_AverageMeter()
    saved_total_loss = 10e10
    saved_total_PSNR = -1
    has_val = bool(opt['datasets'].get('val', None))

    for epoch in range(start_epoch, total_epochs):
        # ---- start a new epoch; the step counter restarts each epoch ----
        current_step = 0

        if opt['dist']:
            train_sampler.set_epoch(epoch)

        for train_idx, train_data in enumerate(train_loader):
            if debug_mode:
                _save_debug_train_images(opt, train_data, train_idx)
                if train_idx >= 3:  # set to 0, just do validation
                    break

            # `== False` on purpose, see the dataloader section above.
            if opt['train']['enable'] == False:  # noqa: E712
                break

            current_step += 1
            if current_step > total_iters:
                print("Total Iteration Reached !")
                break

            # update learning rate (plateau schedulers step after validation)
            if opt['train']['lr_scheme'] != 'ReduceLROnPlateau':
                model.update_learning_rate(
                    current_step, warmup_iter=opt['train']['warmup_iter'])

            # training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)
            model.train_AverageMeter_update()

            # log
            if current_step % opt['logger']['print_freq'] == 0:
                # only the running averages are reported
                logs_inst, logs_avg = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(
                    epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                # global x-axis for tensorboard across epochs
                current_iters_epoch = epoch * total_iters + current_step
                for k, v in logs_avg.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger (writer only exists on rank <= 0)
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_iters_epoch)
                if rank <= 0:
                    logger.info(message)

        # ---- saving models: keep only the newest per-epoch checkpoint ----
        if rank <= 0:
            # Only one process touches the file, so concurrent ranks cannot
            # race between the existence check and the removal.
            save_filename = '{:04d}_{}.pth'.format(epoch - 1, 'G')
            save_path = os.path.join(opt['path']['models'], save_filename)
            if os.path.exists(save_path):
                os.remove(save_path)

            logger.info('Saving models and training states.')
            save_filename = '{:04d}'.format(epoch)
            model.save(save_filename)

        # =================== main validation loop =================== #
        if has_val:
            if opt['dist']:
                # multi-GPU testing
                psnr_rlt = {}  # with border and center frames
                psnr_total_avg = 0.
                ssim_rlt = {}  # with border and center frames
                ssim_total_avg = 0.
                val_loss_rlt = {}  # the averaged loss
                val_loss_total_avg = 0.

                if rank == 0:
                    pbar = util.ProgressBar(len(val_set))

                # distributed parallel validation: each rank takes every
                # world_size-th sample starting at its own rank
                for idx in range(rank, len(val_set), world_size):
                    if debug_mode and idx >= 3:
                        break
                    if idx >= 1000:
                        break
                    val_data = val_set[idx]
                    # samples fetched by index need an explicit batch dim
                    val_data['LQs'].unsqueeze_(0)
                    val_data['GTenh'].unsqueeze_(0)
                    val_data['GTinp'].unsqueeze_(0)
                    key = val_data['key'][0]
                    max_idx = len(val_set)
                    val_name = 'val_set'
                    num = model.get_info()  # number of losses of this model

                    for table in (psnr_rlt, ssim_rlt, val_loss_rlt):
                        if table.get(val_name, None) is None:
                            table[val_name] = torch.zeros(
                                [num, max_idx],
                                dtype=torch.float32,
                                device='cuda')

                    model.feed_data(val_data)
                    model.test()
                    avg_loss, loss_list = model.get_loss(ret=1)
                    # only the first 100 samples are written to disk
                    save_enable = idx < 100
                    psnr_list, ssim_list = model.compute_current_psnr_ssim(
                        save=save_enable,
                        name=key,
                        save_path=opt['path']['val_images'])

                    assert len(loss_list) == num
                    assert len(psnr_list) == num

                    for i in range(num):
                        psnr_rlt[val_name][i, idx] = psnr_list[i]
                        ssim_rlt[val_name][i, idx] = ssim_list[i]
                        val_loss_rlt[val_name][i, idx] = loss_list[i]

                    if rank == 0:
                        # one tick per process, as each handled one sample
                        for _ in range(world_size):
                            pbar.update('Test {} - {}/{}'.format(
                                key, idx, max_idx))

                # collect data: reduce every row onto rank 0
                for table in (psnr_rlt, ssim_rlt, val_loss_rlt):
                    for _, v in table.items():
                        for row in v:
                            dist.reduce(row, 0)
                dist.barrier()

                if rank == 0:
                    psnr_rlt_avg, psnr_total_avg = _average_nonzero_rows(
                        psnr_rlt, num)
                    log_p = '# Validation # Avg. PSNR: {:.2f},'.format(
                        psnr_total_avg)
                    for k, v in psnr_rlt_avg.items():
                        for i, it in enumerate(v):
                            log_p += ' {}: {:.2f}'.format(i, it)
                    logger.info(log_p)
                    logger_val.info(log_p)

                    # ssim
                    ssim_rlt_avg, ssim_total_avg = _average_nonzero_rows(
                        ssim_rlt, num)
                    log_s = '# Validation # Avg. SSIM: {:.2f},'.format(
                        ssim_total_avg)
                    for k, v in ssim_rlt_avg.items():
                        for i, it in enumerate(v):
                            log_s += ' {}: {:.2f}'.format(i, it)
                    logger.info(log_s)
                    logger_val.info(log_s)

                    # validation loss
                    val_loss_rlt_avg, val_loss_total_avg = (
                        _average_nonzero_rows(val_loss_rlt, num))
                    log_l = '# Validation # Avg. Loss: {:.4e},'.format(
                        val_loss_total_avg)
                    for k, v in val_loss_rlt_avg.items():
                        for i, it in enumerate(v):
                            log_l += ' {}: {:.4e}'.format(i, it)
                    logger.info(log_l)
                    logger_val.info(log_l)

                    message = ''
                    for v in model.get_current_learning_rate():
                        message += '{:.5e}'.format(v)

                    logger_val.info(
                        'Epoch {:02d}, LR {:s}, PSNR {:.4f}, SSIM {:.4f}, Val Loss {:.4e}'
                        .format(epoch, message, psnr_total_avg,
                                ssim_total_avg, val_loss_total_avg))
            else:
                pbar = util.ProgressBar(len(val_loader))

                model.val_loss_AverageMeter()
                model.val_AverageMeter_para()

                for val_inx, val_data in enumerate(val_loader):
                    # validation is capped at the first 100 batches, all of
                    # which have their images saved
                    if val_inx >= 100:
                        break
                    key = val_data['key'][0]

                    model.feed_data(val_data)
                    model.test()
                    avg_loss, loss_list = model.get_loss(ret=1)
                    model.val_loss_AverageMeter_update(loss_list, avg_loss)
                    psnr_list, ssim_list = model.compute_current_psnr_ssim(
                        save=True,
                        name=key,
                        save_path=opt['path']['val_images'])
                    model.val_AverageMeter_para_update(psnr_list, ssim_list)

                    if debug_mode:
                        msg_psnr = ''
                        msg_ssim = ''
                        for i, psnr in enumerate(psnr_list):
                            msg_psnr += '{} :{:.02f} '.format(i, psnr)
                        for i, ssim in enumerate(ssim_list):
                            msg_ssim += '{} :{:.02f} '.format(i, ssim)

                        logger.info('{}_{:02d} {}'.format(
                            key, val_inx, msg_psnr))
                        logger.info('{}_{:02d} {}'.format(
                            key, val_inx, msg_ssim))

                    pbar.update('Test {} - {}'.format(key, val_inx))

                # total validation log
                lr = ''
                for v in model.get_current_learning_rate():
                    lr += '{:.5e}'.format(v)

                (logs_avg, logs_psnr_avg, psnr_total_avg, ssim_total_avg,
                 val_loss_total_avg) = model.get_current_log(mode='val')

                msg_logs_avg = ''
                for k, v in logs_avg.items():
                    msg_logs_avg += '{:s}: {:.4e} '.format(k, v)

                logger_val.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format(
                    epoch, lr, msg_logs_avg))
                logger.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format(
                    epoch, lr, msg_logs_avg))

                msg_logs_psnr_avg = ''
                for k, v in logs_psnr_avg.items():
                    msg_logs_psnr_avg += '{:s}: {:.4e} '.format(k, v)

                logger_val.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format(
                    epoch, lr, msg_logs_psnr_avg))
                logger.info('Val-Epoch {:02d}, LR {:s}, {:s}'.format(
                    epoch, lr, msg_logs_psnr_avg))

                # tensorboard logger
                if tb_logger is not None:
                    tb_logger.add_scalar('val_psnr', psnr_total_avg, epoch)
                    tb_logger.add_scalar('val_loss', val_loss_total_avg,
                                         epoch)

        # ---- end of validation, save model ----
        # The averages only exist when validation ran, hence `has_val`.
        if rank <= 0 and has_val:
            logger.info("Finished an epoch, Check and Save the model weights")
            # we check the validation loss instead of training loss
            if saved_total_loss >= val_loss_total_avg:
                saved_total_loss = val_loss_total_avg
                model.save('best')
                logger.info(
                    "Best Weights updated for decreased validation loss")
            else:
                logger.info(
                    "Weights Not updated for undecreased validation loss")
            if saved_total_PSNR <= psnr_total_avg:
                saved_total_PSNR = psnr_total_avg
                model.save('bestPSNR')
                logger.info(
                    "Best Weights updated for increased validation PSNR")
            else:
                logger.info(
                    "Weights Not updated for unincreased validation PSNR")

        # ---- end of one epoch, schedule LR ----
        model.train_AverageMeter_reset()

        # NOTE(review): in distributed runs the averages are recomputed on
        # rank 0 only, so other ranks step with the initial 0. here;
        # confirm that is intended.
        if has_val and opt['train']['lr_scheme'] == 'ReduceLROnPlateau':
            for scheduler in model.schedulers:
                scheduler.step(val_loss_total_avg)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('last')
        logger.info('End of training.')
    # The writer is absent when TensorBoard logging is off, in debug runs,
    # and on ranks > 0; closing unconditionally raised NameError there.
    if tb_logger is not None:
        tb_logger.close()


def _save_debug_train_images(opt, train_data, train_idx):
    """Write the frames of one training batch to ``opt['path']['train_images']``."""
    img_dir = os.path.join(opt['path']['train_images'])
    util.mkdir(img_dir)

    def _save(frame, name, i):
        img = util.tensor2img(frame)  # uint8
        save_img_path = os.path.join(
            img_dir, '{:4d}_{:s}_{:1d}.png'.format(train_idx, str(name), i))
        util.save_img(img, save_img_path)

    LQs = train_data['LQs']  # B N C H W
    if 'sr' not in opt['name']:
        GTenh = train_data['GTenh']
        GTinp = train_data['GTinp']
        for imgs, name in zip([LQs, GTenh, GTinp], ['LQs', 'GTenh', 'GTinp']):
            for i in range(imgs.size(1)):
                _save(imgs[0, i, ...], name, i)
        return

    if 'GT' in train_data:
        GT_name = 'GT'
    elif 'GTs' in train_data:
        GT_name = 'GTs'
    else:
        raise KeyError("train_data has neither 'GT' nor 'GTs'")
    GT = train_data[GT_name]
    for imgs, name in zip([LQs, GT], ['LQs', GT_name]):
        if name == 'GT':
            # a 'GT' entry is saved as one image, not frame by frame
            _save(imgs[0, ...], name, 0)
        else:
            # 'LQs' and 'GTs' are saved frame by frame along dim 1
            for i in range(imgs.size(1)):
                _save(imgs[:, i, ...], name, i)


def _average_nonzero_rows(results, num):
    """Average the first ``num`` rows of every table over their non-zero entries.

    Entries no process wrote stay at their initial zero and are excluded.

    Returns:
        ``(per_key, total)``: ``per_key[k]`` is the list of ``num`` row
        means for table ``k``; ``total`` is the mean of all row means.
    """
    per_key = {}
    total = 0.
    for k, table in results.items():
        per_key[k] = []
        for i in range(num):
            row = table[i, :]
            value = torch.mean(row[row != 0]).cpu().item()
            per_key[k].append(value)
            total += value
    total /= (len(results) * num)
    return per_key, total
def main() -> None:
    """Run iteration-based training as configured by the ``-opt`` YAML file.

    Steps, in order: parse the command line and the option file, optionally
    initialise distributed training, load a resume state if configured,
    create directories and loggers (rank <= 0 only), build the train
    dataloader and the model, then train until ``opt["train"]["niter"]``
    steps are done.

    Losses are logged every ``print_freq`` steps. Every
    ``save_checkpoint_freq`` steps rank <= 0 saves a checkpoint and mirrors
    the experiment and TensorBoard folders to the hard-coded Drive paths.
    The ``val`` phase is accepted in the option file but ignored, so no
    validation runs during training.

    Raises:
        NotImplementedError: if ``opt["datasets"]`` contains a phase other
            than ``"train"`` or ``"val"``.
        AssertionError: if no train dataloader was created.
    """
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument("-opt", type=str, help="Path to option YAML file.")
    parser.add_argument(
        "--launcher", choices=["none", "pytorch"], default="none", help="job launcher"
    )
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    #### distributed training settings
    if args.launcher == "none":  # disabled distributed training
        opt["dist"] = False
        rank = -1
        print("Disabled distributed training.")
    else:
        opt["dist"] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt["path"].get("resume_state", None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt["path"]["resume_state"],
            map_location=lambda storage, loc: storage.cuda(device_id),
        )
        option.check_resume(opt, resume_state["iter"])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    # Stays None unless a SummaryWriter is created below, so every later use
    # (including the final close) can simply test for None.
    tb_logger = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt["path"]["experiments_root"])
            util.mkdirs(
                (
                    path
                    for key, path in opt["path"].items()
                    if not key == "experiments_root"
                    and "pretrain_model" not in key
                    and "resume" not in key
                )
            )

        # config loggers. Before it, the log will not work
        util.setup_logger(
            "base",
            opt["path"]["log"],
            "train_" + opt["name"],
            level=logging.INFO,
            screen=True,
            tofile=True,
        )
        logger = logging.getLogger("base")
        logger.info(option.dict2str(opt))

        # tensorboard logger
        if opt["use_tb_logger"] and "debug" not in opt["name"]:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    "You are using PyTorch {}. Tensorboard will use [tensorboardX]".format(
                        version
                    )
                )
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir="../tb_logger/" + opt["name"])
    else:
        util.setup_logger(
            "base", opt["path"]["log"], "train", level=logging.INFO, screen=True
        )
        logger = logging.getLogger("base")

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt["train"]["manual_seed"]
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info("Random seed: {}".format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    # Pre-bind so the assert below fails with AssertionError, not NameError,
    # when the option file has no "train" phase.
    train_loader = None
    for phase, dataset_opt in opt["datasets"].items():
        if phase == "train":
            train_set = create_dataset(dataset_opt)
            train_size = int(math.ceil(len(train_set) / dataset_opt["batch_size"]))
            total_iters = int(opt["train"]["niter"])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt["dist"]:
                train_sampler = DistIterSampler(
                    train_set, world_size, rank, dataset_ratio
                )
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio))
                )
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler)
            if rank <= 0:
                logger.info(
                    "Number of train images: {:,d}, iters: {:,d}".format(
                        len(train_set), train_size
                    )
                )
                logger.info(
                    "Total epochs needed: {:d} for iters {:,d}".format(
                        total_epochs, total_iters
                    )
                )
        elif phase == "val":
            # validation during training is not supported here, so the
            # "val" phase is accepted but no dataset/loader is built
            pass
        else:
            raise NotImplementedError("Phase [{:s}] is not recognized.".format(phase))
    assert train_loader is not None

    #### create model
    model = create_model(opt)

    #### resume training
    if resume_state:
        logger.info(
            "Resuming training from epoch: {}, iter: {}.".format(
                resume_state["epoch"], resume_state["iter"]
            )
        )
        start_epoch = resume_state["epoch"]
        current_step = resume_state["iter"]
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info(
        "Start training from epoch: {:d}, iter: {:d}".format(start_epoch, current_step)
    )
    for epoch in range(start_epoch, total_epochs + 1):
        if opt["dist"]:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break

            #### update learning rate
            model.update_learning_rate(
                current_step, warmup_iter=opt["train"]["warmup_iter"]
            )

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            #### log
            if current_step % opt["logger"]["print_freq"] == 0:
                logs = model.get_current_log()
                message = "<epoch:{:3d}, iter:{:8,d}, lr:(".format(epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += "{:.3e},".format(v)
                message += ")>"
                for k, v in logs.items():
                    message += "{:s}: {:.4e} ".format(k, v)
                    # tensorboard logger (writer only exists on rank <= 0)
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            #### validation
            # currently, it does not support validation during training

            #### save models and training states
            if current_step % opt["logger"]["save_checkpoint_freq"] == 0:
                if rank <= 0:
                    # Save the experiments in case of Colab is timeout
                    logger.info("Saving models and training states.")
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)
                    copy_tree(
                        "/content/EDVR/experiments",
                        "/content/drive/My Drive/LVTN/SuperResolution/EDVR/experiments",
                    )
                    copy_tree(
                        "/content/EDVR/tb_logger",
                        "/content/drive/My Drive/LVTN/SuperResolution/EDVR/tb_logger",
                    )

    if rank <= 0:
        logger.info("Saving the final model.")
        model.save("latest")
        logger.info("End of training.")
    # The writer is absent when TensorBoard logging is off, in debug runs,
    # and on ranks > 0; closing unconditionally raised NameError there.
    if tb_logger is not None:
        tb_logger.close()
def main():
    """Train a model from a YAML option file and validate it periodically.

    Command-line arguments:
        -opt:          path to the option YAML file.
        --launcher:    'none' (single process, rank = -1) or 'pytorch'
                       (distributed; rank comes from torch.distributed).
        --local_rank:  parsed but not read inside this function.

    Every ``opt['train']['val_freq']`` steps (and only when a 'val' dataset is
    configured) rank <= 0 writes one predicted score per validation sample to
    ``<val_images>/<step>/predict_score.txt`` and passes that file, together
    with the label file named by ``dataroot_label_file``, to
    ``rank_pair_test``; the three returned accuracies are logged.
    """
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'],
                        default='none', help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    # The label file is only consumed by validation. Look it up without
    # failing when no 'val' dataset is configured, because the training loop
    # below already treats validation as optional.
    val_opt = opt['datasets'].get('val', None)
    label_path = val_opt['dataroot_label_file'] if val_opt else None

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state (checkpoint) if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers (create the experiment directories)
    # Stays None unless a TensorBoard writer is really created, so every later
    # use can test for it instead of raising NameError.
    tb_logger = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename experiment folder if exists
            util.mkdir_and_rename(opt['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if not key == 'experiments_root'
                 and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))

        # tensorboard logger
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        util.setup_logger('base', opt['path']['log'], 'train',
                          level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader (build the datasets)
    dataset_ratio = 200  # enlarge the size of each epoch
    # Pre-bind so the assert below reports a missing 'train' phase instead of
    # the name being undefined.
    train_loader = None
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt, is_train=False)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    #### create model
    model = create_model(opt)

    #### resume training (restore from checkpoint)
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break

            #### -------------------------- training step begins
            #### update learning rate
            model.update_learning_rate(current_step,
                                       warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)
            #### -------------------------- training step ends

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(
                    epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger (writer exists only on rank <= 0
                    # with TensorBoard enabled)
                    if tb_logger is not None:
                        tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            #### validation
            if opt['datasets'].get(
                    'val',
                    None) and current_step % opt['train']['val_freq'] == 0:
                if rank <= 0:  # does not support multi-GPU validation
                    pbar = util.ProgressBar(len(val_loader))
                    # All samples of this validation pass share one output
                    # directory and one score file, so set them up once.
                    img_dir = os.path.join(opt['path']['val_images'],
                                           str(current_step))
                    util.mkdir(img_dir)
                    score_file = os.path.join(img_dir, 'predict_score.txt')

                    # 'with' guarantees the file is closed (and flushed) even
                    # if the model raises part-way through the pass.
                    with open(score_file, 'a') as f:
                        for val_data in val_loader:
                            img_name = os.path.splitext(
                                os.path.basename(
                                    val_data['img1_path'][0]))[0]

                            model.feed_data(val_data)
                            model.test()

                            visuals = model.get_current_visuals()
                            predict_score1 = visuals['predict_score1'].numpy()

                            # Save predict scores
                            f.write('%s %f\n' %
                                    (img_name + '.png', predict_score1))
                            pbar.update('Test {}'.format(img_name))

                    # calculate accuracy
                    (aligned_pair_accuracy, accuracy_esrganbig,
                     accuracy_srganbig) = rank_pair_test(score_file,
                                                         label_path)

                    # log
                    logger.info(
                        '# Validation # Accuracy: {:.4e}, Accuracy_pair1_class1: {:.4e}, Accuracy_pair1_class2: {:.4e} '
                        .format(aligned_pair_accuracy, accuracy_esrganbig,
                                accuracy_srganbig))
                    logger_val = logging.getLogger('val')  # validation logger
                    logger_val.info(
                        '<epoch:{:3d}, iter:{:8,d}> Accuracy: {:.4e}, Accuracy_pair1_class1: {:.4e}, Accuracy_pair1_class2: {:.4e} '
                        .format(epoch, current_step, aligned_pair_accuracy,
                                accuracy_esrganbig, accuracy_srganbig))
                    # tensorboard logger
                    if tb_logger is not None:
                        tb_logger.add_scalar('Accuracy',
                                             aligned_pair_accuracy,
                                             current_step)
                        tb_logger.add_scalar('Accuracy_pair1_class1',
                                             accuracy_esrganbig, current_step)
                        tb_logger.add_scalar('Accuracy_pair1_class2',
                                             accuracy_srganbig, current_step)

            #### save models and training states
            if current_step % opt['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('latest')
        logger.info('End of training.')
        # The writer only exists when TensorBoard logging was enabled.
        if tb_logger is not None:
            tb_logger.close()
class Trainer:
    """Owns the loaders, model, loggers and step counter of one training run.

    Set-up happens in :meth:`init` (a regular method, not ``__init__``).
    ``self.rank`` is read throughout the class and ``self.world_size`` is read
    when ``opt['dist']`` is true, but neither is assigned here: both must be
    set on the instance before :meth:`init` is called.
    """

    # Class-level defaults: a TensorBoard writer is only created on rank <= 0
    # with TensorBoard enabled, so everything else tests these for None.
    tb_logger = None
    tb_logger_path = None
    # Timestamp of the previous profiling mark (used only when _profile is on).
    _t = None
    _profile = False

    def init(self, opt, launcher, all_networks=None):
        """Build loggers, data loaders, the model and evaluators from ``opt``.

        Args:
            opt: parsed option dictionary; it is converted to a NoneDict and
                stored as ``self.opt``.
            launcher: accepted for interface compatibility; not read here.
            all_networks: dictionary forwarded to ``ExtensibleTrainer`` as
                ``cached_networks``. A fresh dictionary is used when omitted.
        """
        # A literal {} default would be one shared object across all calls.
        if all_networks is None:
            all_networks = {}

        self._profile = False
        self._t = None
        self.tb_logger = None
        self.tb_logger_path = None
        self.val_compute_psnr = opt_get(opt, ['eval', 'compute_psnr'], False)
        self.val_compute_fea = opt_get(opt, ['eval', 'compute_fea'], False)

        #### loading resume state if exists
        if opt['path'].get('resume_state', None):
            # distributed resuming: all load into default GPU
            device_id = torch.cuda.current_device()
            resume_state = torch.load(
                opt['path']['resume_state'],
                map_location=lambda storage, loc: storage.cuda(device_id))
            option.check_resume(opt,
                                resume_state['iter'])  # check resume options
        else:
            resume_state = None

        #### mkdir and loggers
        if self.rank <= 0:  # normal training (-1) OR distributed rank 0
            if resume_state is None:
                # rename experiment folder if exists
                util.mkdir_and_rename(opt['path']['experiments_root'])
                util.mkdirs(
                    (path for key, path in opt['path'].items()
                     if not key == 'experiments_root' and path is not None
                     and 'pretrain_model' not in key and 'resume' not in key))

            # config loggers. Before it, the log will not work
            util.setup_logger('base', opt['path']['log'],
                              'train_' + opt['name'], level=logging.INFO,
                              screen=True, tofile=True)
            self.logger = logging.getLogger('base')
            self.logger.info(option.dict2str(opt))
            # tensorboard logger
            if opt['use_tb_logger'] and 'debug' not in opt['name']:
                self.tb_logger_path = os.path.join(
                    opt['path']['experiments_root'], 'tb_logger')
                version = float(torch.__version__[0:3])
                if version >= 1.1:  # PyTorch 1.1
                    from torch.utils.tensorboard import SummaryWriter
                else:
                    self.logger.info(
                        'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                        .format(version))
                    from tensorboardX import SummaryWriter
                self.tb_logger = SummaryWriter(log_dir=self.tb_logger_path)
        else:
            util.setup_logger('base', opt['path']['log'], 'train',
                              level=logging.INFO, screen=True)
            self.logger = logging.getLogger('base')

        # convert to NoneDict, which returns None for missing keys
        opt = option.dict_to_nonedict(opt)
        self.opt = opt

        #### wandb init
        if opt['wandb'] and self.rank <= 0:
            import wandb
            os.makedirs(os.path.join(opt['path']['log'], 'wandb'),
                        exist_ok=True)
            wandb.init(project=opt['name'], dir=opt['path']['log'])

        #### random seed
        seed = opt['train']['manual_seed']
        if seed is None:
            seed = random.randint(1, 10000)
        if self.rank <= 0:
            self.logger.info('Random seed: {}'.format(seed))
        # Different multiprocessing instances should behave differently.
        seed += self.rank
        util.set_random_seed(seed)

        torch.backends.cudnn.benchmark = True
        # torch.backends.cudnn.deterministic = True
        if opt_get(opt, ['anomaly_detection'], False):
            torch.autograd.set_detect_anomaly(True)

        # Save the compiled opt dict to the global loaded_options variable.
        util.loaded_options = opt

        #### create train and val dataloader
        dataset_ratio = 1  # enlarge the size of each epoch
        # Pre-bind so the assert below reports a missing 'train' phase.
        self.train_loader = None
        for phase, dataset_opt in opt['datasets'].items():
            if phase == 'train':
                self.train_set, collate_fn = create_dataset(
                    dataset_opt, return_collate=True)
                train_size = int(
                    math.ceil(
                        len(self.train_set) / dataset_opt['batch_size']))
                total_iters = int(opt['train']['niter'])
                self.total_epochs = int(math.ceil(total_iters / train_size))
                if opt['dist']:
                    self.train_sampler = DistIterSampler(
                        self.train_set, self.world_size, self.rank,
                        dataset_ratio)
                    self.total_epochs = int(
                        math.ceil(total_iters /
                                  (train_size * dataset_ratio)))
                else:
                    self.train_sampler = None
                self.train_loader = create_dataloader(
                    self.train_set, dataset_opt, opt, self.train_sampler,
                    collate_fn=collate_fn)
                if self.rank <= 0:
                    self.logger.info(
                        'Number of train images: {:,d}, iters: {:,d}'.format(
                            len(self.train_set), train_size))
                    self.logger.info(
                        'Total epochs needed: {:d} for iters {:,d}'.format(
                            self.total_epochs, total_iters))
            elif phase == 'val':
                self.val_set, collate_fn = create_dataset(
                    dataset_opt, return_collate=True)
                self.val_loader = create_dataloader(self.val_set,
                                                    dataset_opt, opt, None,
                                                    collate_fn=collate_fn)
                if self.rank <= 0:
                    self.logger.info(
                        'Number of val images in [{:s}]: {:d}'.format(
                            dataset_opt['name'], len(self.val_set)))
            else:
                raise NotImplementedError(
                    'Phase [{:s}] is not recognized.'.format(phase))
        assert self.train_loader is not None

        #### create model
        self.model = ExtensibleTrainer(opt, cached_networks=all_networks)

        ### Evaluators
        self.evaluators = []
        if 'eval' in opt.keys() and 'evaluators' in opt['eval'].keys():
            for ev_key, ev_opt in opt['eval']['evaluators'].items():
                self.evaluators.append(
                    create_evaluator(self.model.networks[ev_opt['for']],
                                     ev_opt, self.model.env))

        #### resume training
        if resume_state:
            self.logger.info(
                'Resuming training from epoch: {}, iter: {}.'.format(
                    resume_state['epoch'], resume_state['iter']))

            self.start_epoch = resume_state['epoch']
            self.current_step = resume_state['iter']
            # handle optimizers and schedulers
            self.model.resume_training(resume_state,
                                       'amp_opt_level' in opt.keys())
        else:
            self.current_step = (-1 if 'start_step' not in opt.keys() else
                                 opt['start_step'])
            self.start_epoch = 0
        if 'force_start_step' in opt.keys():
            self.current_step = opt['force_start_step']
        opt['current_step'] = self.current_step

    def _profile_mark(self, label):
        """Print seconds elapsed since the previous mark, then restart it."""
        now = time()
        if self._t is not None:
            print("%s: %f" % (label, now - self._t))
        self._t = now

    def do_step(self, train_data):
        """Run one optimisation step, then periodic logging/saving/validation.

        Args:
            train_data: one batch, forwarded to ``self.model.feed_data``.
        """
        if self._profile:
            self._profile_mark("Data fetch")

        opt = self.opt
        self.current_step += 1
        #### update learning rate
        self.model.update_learning_rate(
            self.current_step, warmup_iter=opt['train']['warmup_iter'])

        #### training
        if self._profile:
            self._profile_mark("Update LR")
        self.model.feed_data(train_data, self.current_step)
        self.model.optimize_parameters(self.current_step)
        if self._profile:
            self._profile_mark("Model feed + step")

        # The writer only exists on rank <= 0 with TensorBoard enabled.
        tb = self.tb_logger

        #### log
        if (self.current_step % opt['logger']['print_freq'] == 0
                and self.rank <= 0):
            logs = self.model.get_current_log(self.current_step)
            message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(
                self.epoch, self.current_step)
            for v in self.model.get_current_learning_rate():
                message += '{:.3e},'.format(v)
            message += ')] '
            for k, v in logs.items():
                if 'histogram' in k:
                    if tb is not None:
                        tb.add_histogram(k, v, self.current_step)
                elif isinstance(v, dict):
                    if tb is not None:
                        tb.add_scalars(k, v, self.current_step)
                else:
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if (tb is not None and opt['use_tb_logger']
                            and 'debug' not in opt['name']):
                        tb.add_scalar(k, v, self.current_step)
            if opt['wandb'] and self.rank <= 0:
                import wandb
                wandb.log(logs)
            self.logger.info(message)

        #### save models and training states
        if self.current_step % opt['logger']['save_checkpoint_freq'] == 0:
            if self.rank <= 0:
                self.logger.info('Saving models and training states.')
                self.model.save(self.current_step)
                self.model.save_training_state(self.epoch, self.current_step)
                # Mirror the TensorBoard directory only where one exists:
                # tb_logger_path is set solely on the rank that logs.
                if ('alt_path' in opt['path'].keys()
                        and self.tb_logger_path is not None):
                    import shutil
                    print("Synchronizing tb_logger to alt_path..")
                    alt_tblogger = os.path.join(opt['path']['alt_path'],
                                                "tb_logger")
                    shutil.rmtree(alt_tblogger, ignore_errors=True)
                    shutil.copytree(self.tb_logger_path, alt_tblogger)

        #### validation
        if opt['datasets'].get(
                'val',
                None) and self.current_step % opt['train']['val_freq'] == 0:
            if opt['model'] in [
                    'sr', 'srgan', 'corruptgan', 'spsrgan',
                    'extensibletrainer'
            ] and self.rank <= 0:  # image restoration validation
                avg_psnr = 0.
                avg_fea_loss = 0.
                num_batches = 0
                # PSNR is accumulated per image, so it has to be averaged
                # over images, not over batches.
                num_scored = 0
                val_tqdm = tqdm(self.val_loader)
                for val_data in val_tqdm:
                    num_batches += 1
                    for b, hq_path in enumerate(val_data['HQ_path']):
                        img_name = os.path.splitext(
                            os.path.basename(hq_path))[0]
                        img_dir = os.path.join(opt['path']['val_images'],
                                               img_name)
                        util.mkdir(img_dir)

                        # NOTE(review): the whole batch is re-fed and
                        # re-tested for every image in it; kept as-is because
                        # side effects of test() are not visible here.
                        self.model.feed_data(val_data, self.current_step)
                        self.model.test()

                        visuals = self.model.get_current_visuals()
                        if visuals is None:
                            continue

                        sr_img = util.tensor2img(visuals['rlt'][b])  # uint8
                        # calculate PSNR
                        if self.val_compute_psnr:
                            gt_img = util.tensor2img(
                                visuals['hq'][b])  # uint8
                            sr_img, gt_img = util.crop_border(
                                [sr_img, gt_img], opt['scale'])
                            avg_psnr += util.calculate_psnr(sr_img, gt_img)
                            num_scored += 1

                        # Save SR images for reference
                        img_base_name = '{:s}_{:d}.png'.format(
                            img_name, self.current_step)
                        save_img_path = os.path.join(img_dir, img_base_name)
                        util.save_img(sr_img, save_img_path)

                # Guard the divisions so an empty loader reports 0 instead of
                # raising ZeroDivisionError.
                avg_psnr = avg_psnr / num_scored if num_scored else 0.
                avg_fea_loss = (avg_fea_loss / num_batches
                                if num_batches else 0.)

                # log
                self.logger.info(
                    '# Validation # PSNR: {:.4e} Fea: {:.4e}'.format(
                        avg_psnr, avg_fea_loss))

                # tensorboard logger
                if (tb is not None and opt['use_tb_logger']
                        and 'debug' not in opt['name'] and self.rank <= 0):
                    tb.add_scalar('val_psnr', avg_psnr, self.current_step)
                    tb.add_scalar('val_fea', avg_fea_loss, self.current_step)

        if (self.evaluators
                and self.current_step % opt['train']['val_freq'] == 0):
            eval_dict = {}
            for evaluator in self.evaluators:
                if evaluator.uses_all_ddp or self.rank <= 0:
                    eval_dict.update(evaluator.perform_eval())
            if self.rank <= 0:
                print("Evaluator results: ", eval_dict)
                if tb is not None:
                    for ek, ev in eval_dict.items():
                        tb.add_scalar(ek, ev, self.current_step)
                if opt['wandb']:
                    import wandb
                    wandb.log(eval_dict)

    def do_training(self):
        """Run :meth:`do_step` on every batch of every remaining epoch."""
        self.logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
            self.start_epoch, self.current_step))
        for epoch in range(self.start_epoch, self.total_epochs + 1):
            self.epoch = epoch
            if self.opt['dist']:
                self.train_sampler.set_epoch(epoch)
            tq_ldr = tqdm(self.train_loader)

            # Start the profiling clock that do_step reads.
            self._t = time()
            for train_data in tq_ldr:
                self.do_step(train_data)

    def create_training_generator(self, index):
        """Generator form of :meth:`do_training`.

        Yields ``self.model`` before each step so the caller decides when the
        step runs.

        Args:
            index: passed to ``tqdm`` as ``position``.
        """
        self.logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
            self.start_epoch, self.current_step))
        for epoch in range(self.start_epoch, self.total_epochs + 1):
            self.epoch = epoch
            if self.opt['dist']:
                self.train_sampler.set_epoch(epoch)
            tq_ldr = tqdm(self.train_loader, position=index)

            # Start the profiling clock that do_step reads.
            self._t = time()
            for train_data in tq_ldr:
                yield self.model
                self.do_step(train_data)
def main():
    """Run distributed training from a YAML option file.

    Command-line arguments:
        -opt:          path to the option YAML file.
        --launcher:    parsed but not read; distributed mode is always
                       initialised via ``init_dist()``.
        --local_rank:  parsed but not read inside this function.

    Every 1000 steps the model and training state are saved and, when a 'val'
    dataset is configured, the mean of ``model.get_psnr()`` over the
    validation loader is logged and printed.
    """
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'],
                        default='none', help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    #### distributed training settings
    opt['dist'] = True
    init_dist()
    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    #if resume_state is None:
    #    util.mkdir_and_rename(opt['path']['experiments_root'])  # rename experiment folder if exists
    #    util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root'
    #                 and 'pretrain_model' not in key and 'resume' not in key))

    # config loggers. Before it, the log will not work
    util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'],
                      level=logging.INFO, screen=True, tofile=True)
    logger = logging.getLogger('base')
    logger.info(option.dict2str(opt))

    # tensorboard logger
    # Stays None when TensorBoard is disabled so the final close() can be
    # skipped instead of raising NameError.
    tb_logger = None
    if opt['use_tb_logger'] and 'debug' not in opt['name']:
        version = float(torch.__version__[0:3])
        if version >= 1.1:  # PyTorch 1.1
            from torch.utils.tensorboard import SummaryWriter
        else:
            logger.info(
                'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                .format(version))
            from tensorboardX import SummaryWriter
        tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 1  # 200  # enlarge the size of each epoch
    # Pre-bind both loaders: the assert below then reports a missing 'train'
    # phase, and validation can be skipped when no 'val' phase is configured.
    train_loader = None
    val_loader = None
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = TrainDataset()
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            train_sampler = DistIterSampler(train_set, world_size, rank,
                                            dataset_ratio)
            total_epochs = int(
                math.ceil(total_iters / (train_size * dataset_ratio)))
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                len(train_set), train_size))
            logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                total_epochs, total_iters))
        elif phase == 'val':
            # The original also recomputed `train_size` from the validation
            # set here; the value was never read afterwards and the division
            # failed when the val options carried no batch_size, so it is
            # dropped.
            val_set = ValidDataset()
            val_sampler = DistIterSampler(val_set, world_size, rank,
                                          dataset_ratio)
            val_loader = create_dataloader(val_set, dataset_opt, opt,
                                           val_sampler)
            logger.info('Number of val images in [{:s}]: {:d}'.format(
                dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    #### create model
    model = create_model(opt)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            #### update learning rate
            model.update_learning_rate(current_step,
                                       warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(
                    epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.3f} '.format(k, v)
                logger.info(message)
                print(message)

            #### save models and training states
            # NOTE(review): the checkpoint interval is hard-coded and the
            # validation pass is assumed to share it - confirm the nesting
            # against the original file layout.
            if current_step % 1000 == 0:
                logger.info('Saving models and training states.')
                model.save(current_step)
                model.save_training_state(epoch, current_step)

                #### validation
                if val_loader is not None:
                    # multi-GPU testing
                    psnr_rlt = []  # with border and center frames

                    for _, val_data in enumerate(val_loader):
                        model.feed_data(val_data)
                        model.test()
                        # calculate PSNR
                        psnr_rlt.append(torch.mean(model.get_psnr()).item())

                    psnr_total_avg = np.array(psnr_rlt).mean()
                    log_s = '# Validation # PSNR: {:.4f}:'.format(
                        psnr_total_avg)
                    logger.info(log_s)
                    print(log_s)

    logger.info('Saving the final model.')
    model.save('latest')
    logger.info('End of training.')
    # The writer only exists when TensorBoard logging was enabled.
    if tb_logger is not None:
        tb_logger.close()