src_path = opt.src_path
tgt_path = opt.tgt_path

swapper.swap_setup(src_path, tgt_path)

if opt.post_tune:
    print('\n\t\t\tPersonalization: meta cycle finetune...')
    swapper.post_personalize(opt.output_dir, visualizer=None, verbose=False)
    # swapper.post_personalize(opt.output_dir, visualizer=visualizer, verbose=True, bidirection=opt.bidirection)

print('\n\t\t\tPersonalization: completed...')

# if a->b
print('\n\t\t\tSwapping: {} wears the clothes of {}...'.format(src_path, tgt_path))
preds = swapper.swap(src_info=swapper.src_info, tgt_info=swapper.tsf_info,
                     target_part=opt.swap_part, visualizer=visualizer)

if opt.save_res:
    pred_output_dir = mkdir(os.path.join(opt.output_dir, 'swappers'))
    save_results(src_path, tgt_path, pred_output_dir, preds)

# # else b->a
# preds = swapper.swap(src_info=swapper.tgt_info, tgt_info=swapper.src_info,
#                      target_part=opt.swap_part, visualizer=visualizer)
def main(jsonPath):
    # options
    opt = option.parse(jsonPath, is_train=False)
    util.mkdirs((path for key, path in opt["path"].items()
                 if not key == "pretrain_model_G"))
    opt = option.dict_to_nonedict(opt)

    util.setup_logger(None, opt["path"]["log"], "test.log",
                      level=logging.INFO, screen=True)
    logger = logging.getLogger("base")
    logger.info(option.dict2str(opt))

    # Create test dataset and dataloader
    test_loaders = []
    for phase, dataset_opt in sorted(opt["datasets"].items()):
        test_set = create_dataset(dataset_opt)
        test_loader = create_dataloader(test_set, dataset_opt)
        logger.info("Number of test images in [{:s}]: {:d}".format(
            dataset_opt["name"], len(test_set)))
        test_loaders.append(test_loader)

    # Create model
    model = create_model(opt)

    for test_loader in test_loaders:
        test_set_name = test_loader.dataset.opt["name"]
        logger.info("\nTesting [{:s}]...".format(test_set_name))
        # test_start_time = time.time()
        dataset_dir = os.path.join(opt["path"]["results_root"], test_set_name)
        util.mkdir(dataset_dir)

        test_results = OrderedDict()
        test_results["psnr"] = []
        test_results["ssim"] = []
        test_results["psnr_y"] = []
        test_results["ssim_y"] = []

        for data in test_loader:
            need_HR = False if test_loader.dataset.opt["dataroot_HR"] is None else True

            model.feed_data(data, need_HR=need_HR)
            img_path = data["LR_path"][0]
            img_name = os.path.splitext(os.path.basename(img_path))[0]

            model.test()  # test
            visuals = model.get_current_visuals(need_HR=need_HR)

            sr_img = util.tensor2img(visuals["SR"])  # uint8

            # save images
            suffix = opt["suffix"]
            if suffix:
                save_img_path = os.path.join(dataset_dir, img_name + suffix + ".png")
            else:
                save_img_path = os.path.join(dataset_dir, img_name + ".png")
            util.save_img(sr_img, save_img_path)

            # calculate PSNR and SSIM
            if need_HR:
                gt_img = util.tensor2img(visuals["HR"])
                gt_img = gt_img / 255.0
                sr_img = sr_img / 255.0

                crop_border = test_loader.dataset.opt["scale"]
                cropped_sr_img = sr_img[crop_border:-crop_border, crop_border:-crop_border, :]
                cropped_gt_img = gt_img[crop_border:-crop_border, crop_border:-crop_border, :]

                psnr = util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255)
                ssim = util.calculate_ssim(cropped_sr_img * 255, cropped_gt_img * 255)
                test_results["psnr"].append(psnr)
                test_results["ssim"].append(ssim)

                if gt_img.shape[2] == 3:  # RGB image
                    sr_img_y = bgr2ycbcr(sr_img, only_y=True)
                    gt_img_y = bgr2ycbcr(gt_img, only_y=True)
                    cropped_sr_img_y = sr_img_y[crop_border:-crop_border, crop_border:-crop_border]
                    cropped_gt_img_y = gt_img_y[crop_border:-crop_border, crop_border:-crop_border]
                    psnr_y = util.calculate_psnr(cropped_sr_img_y * 255, cropped_gt_img_y * 255)
                    ssim_y = util.calculate_ssim(cropped_sr_img_y * 255, cropped_gt_img_y * 255)
                    test_results["psnr_y"].append(psnr_y)
                    test_results["ssim_y"].append(ssim_y)
                    logger.info(
                        "{:20s} - PSNR: {:.6f} dB; SSIM: {:.6f}; PSNR_Y: {:.6f} dB; SSIM_Y: {:.6f}."
                        .format(img_name, psnr, ssim, psnr_y, ssim_y))
                else:
                    logger.info("{:20s} - PSNR: {:.6f} dB; SSIM: {:.6f}.".format(
                        img_name, psnr, ssim))
            else:
                logger.info(img_name)

        if need_HR:  # metrics
            # Average PSNR/SSIM results
            ave_psnr = sum(test_results["psnr"]) / len(test_results["psnr"])
            ave_ssim = sum(test_results["ssim"]) / len(test_results["ssim"])
            logger.info(
                "----Average PSNR/SSIM results for {}----\n\tPSNR: {:.6f} dB; SSIM: {:.6f}\n"
                .format(test_set_name, ave_psnr, ave_ssim))
            if test_results["psnr_y"] and test_results["ssim_y"]:
                ave_psnr_y = sum(test_results["psnr_y"]) / len(test_results["psnr_y"])
                ave_ssim_y = sum(test_results["ssim_y"]) / len(test_results["ssim_y"])
                logger.info(
                    "----Y channel, average PSNR/SSIM----\n\tPSNR_Y: {:.6f} dB; SSIM_Y: {:.6f}\n"
                    .format(ave_psnr_y, ave_ssim_y))
def main(config_path):

    # Read a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        corpus = config['corpus']
        feature = config['feature']
        param = config['param']

    # TODO: Solve conflict (batch_norm & layer norm)
    if corpus['label_type'] == 'phone61':
        output_size = 61
    elif corpus['label_type'] == 'phone48':
        output_size = 48
    elif corpus['label_type'] == 'phone39':
        output_size = 39
    elif corpus['label_type'] == 'character':
        output_size = 30

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(batch_size=param['batch_size'],
                       input_size=feature['input_size'] * feature['num_stack'],
                       num_cell=param['num_cell'],
                       num_layer=param['num_layer'],
                       output_size=output_size,
                       clip_gradients=param['clip_grad'],
                       clip_activation=param['clip_activation'],
                       dropout_ratio_input=param['dropout_input'],
                       dropout_ratio_hidden=param['dropout_hidden'],
                       num_proj=param['num_proj'],
                       weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_cell'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/timit/ctc/')
    network.model_dir = mkdir_join(network.model_dir, corpus['label_type'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory
    if not isfile(join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError('File exists.')

    # Set process name
    setproctitle('ctc_timit_' + corpus['label_type'] + '_' + param['optimizer'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    sys.stdout = open(join(network.model_dir, 'train.log'), 'w')
    print(network.model_name)

    do_train(network=network,
             optimizer=param['optimizer'],
             learning_rate=param['learning_rate'],
             batch_size=param['batch_size'],
             epoch_num=param['num_epoch'],
             label_type=corpus['label_type'],
             num_stack=feature['num_stack'],
             num_skip=feature['num_skip'])

    sys.stdout = sys.__stdout__
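# For reference, the shape of the config that yaml.load() is expected to return,
# inferred only from the keys the script above reads. All values below are
# illustrative placeholders, not taken from the original project.
example_config = {
    'model_name': 'lstm',
    'corpus': {'label_type': 'phone61'},
    'feature': {'input_size': 123, 'num_stack': 3, 'num_skip': 3},
    'param': {
        'batch_size': 32, 'num_cell': 256, 'num_layer': 2,
        'optimizer': 'adam', 'learning_rate': 1e-3, 'num_epoch': 100,
        'clip_grad': 5.0, 'clip_activation': 50.0,
        'dropout_input': 0.9, 'dropout_hidden': 0.9,
        'num_proj': 0, 'weight_decay': 0.0,
    },
}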
def main():
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none',
                        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)
    label_path = opt['datasets']['val']['dataroot_label_file']

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(opt['path']['resume_state'],
                                  map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            util.mkdir_and_rename(
                opt['path']['experiments_root'])  # rename experiment folder if exists
            util.mkdirs((path for key, path in opt['path'].items()
                         if not key == 'experiments_root' and 'pretrain_model' not in key
                         and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info('You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                            .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio)
                total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler)
            if rank <= 0:
                logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                    len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt, is_train=False)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    #### create model
    model = create_model(opt)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))
        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            #### update learning rate
            model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        if rank <= 0:
                            tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            #### validation
            if opt['datasets'].get('val', None) and current_step % opt['train']['val_freq'] == 0:
                if rank <= 0:  # does not support multi-GPU validation
                    pbar = util.ProgressBar(len(val_loader))
                    idx = 0
                    for val_data in val_loader:
                        idx += 1
                        img_name = os.path.splitext(os.path.basename(val_data['img1_path'][0]))[0]
                        img_dir = os.path.join(opt['path']['val_images'], str(current_step))
                        util.mkdir(img_dir)

                        f = open(os.path.join(img_dir, 'predict_score.txt'), 'a')
                        model.feed_data(val_data)
                        model.test()
                        visuals = model.get_current_visuals()
                        predict_score1 = visuals['predict_score1'].numpy()
                        # Save predict scores
                        f.write('%s %f\n' % (img_name + '.png', predict_score1))
                        f.close()
                        pbar.update('Test {}'.format(img_name))

                    # calculate accuracy
                    aligned_pair_accuracy, accuracy_esrganbig, accuracy_srganbig = rank_pair_test(
                        os.path.join(img_dir, 'predict_score.txt'), label_path)
                    # log
                    logger.info(
                        '# Validation # Accuracy: {:.4e}, Accuracy_pair1_class1: {:.4e}, Accuracy_pair1_class2: {:.4e}'.format(
                            aligned_pair_accuracy, accuracy_esrganbig, accuracy_srganbig))
                    logger_val = logging.getLogger('val')  # validation logger
                    logger_val.info(
                        '<epoch:{:3d}, iter:{:8,d}> Accuracy: {:.4e}, Accuracy_pair1_class1: {:.4e}, Accuracy_pair1_class2: {:.4e}'.format(
                            epoch, current_step, aligned_pair_accuracy,
                            accuracy_esrganbig, accuracy_srganbig))
                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        tb_logger.add_scalar('Accuracy', aligned_pair_accuracy, current_step)
                        tb_logger.add_scalar('Accuracy_pair1_class1', accuracy_esrganbig, current_step)
                        tb_logger.add_scalar('Accuracy_pair1_class2', accuracy_srganbig, current_step)

            #### save models and training states
            if current_step % opt['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('latest')
        logger.info('End of training.')
        tb_logger.close()
def summary_dir(self):
    return ut.mkdir(pj(self.resdir, 'summary'))
if test_opt.ip:
    visualizer = VisdomVisualizer(env=test_opt.name, ip=test_opt.ip, port=test_opt.port)
else:
    visualizer = None

# set imitator
imitator = Imitator(test_opt)

if test_opt.post_tune:
    adaptive_personalize(test_opt, imitator, visualizer)

imitator.personalize(test_opt.src_path, visualizer=visualizer)
print('\n\t\t\tPersonalization: completed...')

if test_opt.save_res:
    pred_output_dir = mkdir(os.path.join(test_opt.output_dir, 'imitators'))
    pred_output_dir = clear_dir(pred_output_dir)
else:
    pred_output_dir = None

print('\n\t\t\tImitating `{}`'.format(test_opt.tgt_path))
tgt_paths = scan_tgt_paths(test_opt.tgt_path, itv=1)
imitator.inference(tgt_paths, tgt_smpls=None, cam_strategy='smooth',
                   output_dir=pred_output_dir, visualizer=visualizer, verbose=True)
def fit(model, opt, dataloaders, steps_states, data_params, loggers):
    # read data_params
    batch_size = data_params['batch_size']
    virtual_batch_size = data_params['virtual_batch_size']
    total_iters = data_params['total_iters']
    total_epochs = data_params['total_epochs']

    # read steps_states
    start_epoch = steps_states["start_epoch"]
    current_step = steps_states["current_step"]
    virtual_step = steps_states["virtual_step"]

    # read loggers
    logger = util.get_root_logger()
    tb_logger = loggers["tb_logger"]

    # training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step))
    try:
        timer = metrics.Timer()  # iteration timer
        timerData = metrics.TickTock()  # data timer
        timerEpoch = metrics.TickTock()  # epoch timer

        # outer loop for different epochs
        for epoch in range(start_epoch, (total_epochs * (virtual_batch_size // batch_size)) + 1):
            timerData.tick()
            timerEpoch.tick()

            # inner iteration loop within one epoch
            for n, train_data in enumerate(dataloaders['train'], start=1):
                timerData.tock()
                virtual_step += 1
                take_step = False
                if virtual_step > 0 and virtual_step * batch_size % virtual_batch_size == 0:
                    current_step += 1
                    take_step = True
                    if current_step > total_iters:
                        break

                # training
                model.feed_data(train_data)  # unpack data from dataset and apply preprocessing
                model.optimize_parameters(virtual_step)  # calculate loss functions, get gradients, update network weights

                # log
                def eta(t_iter):
                    # calculate training ETA in hours
                    return (t_iter * (opt['train']['niter'] - current_step)) / 3600 if t_iter > 0 else 0

                if current_step % opt['logger']['print_freq'] == 0 and take_step:
                    # iteration end time
                    avg_time = timer.get_average_and_reset()
                    avg_data_time = timerData.get_average_and_reset()

                    # print training losses and save logging information to disk
                    message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}, t:{:.4f}s, td:{:.4f}s, eta:{:.4f}h> '.format(
                        epoch, current_step, model.get_current_learning_rate(current_step),
                        avg_time, avg_data_time, eta(avg_time))

                    # tensorboard training logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        if current_step % opt['logger'].get('tb_sample_rate', 1) == 0:  # Reduce rate of tb logs
                            # tb_logger.add_scalar('loss/nll', nll, current_step)
                            tb_logger.add_scalar('lr/base', model.get_current_learning_rate(), current_step)
                            tb_logger.add_scalar('time/iteration', timer.get_last_iteration(), current_step)
                            tb_logger.add_scalar('time/data', timerData.get_last_iteration(), current_step)

                    logs = model.get_current_log()
                    for k, v in logs.items():
                        message += '{:s}: {:.4e} '.format(k, v)
                        # tensorboard loss logger
                        if opt['use_tb_logger'] and 'debug' not in opt['name']:
                            if current_step % opt['logger'].get('tb_sample_rate', 1) == 0:  # Reduce rate of tb logs
                                tb_logger.add_scalar(k, v, current_step)
                    # tb_logger.flush()

                    logger.info(message)

                    # start time for next iteration
                    # TODO: skip the validation time from calculation
                    timer.tick()

                # update learning rate
                if model.optGstep and model.optDstep and take_step:
                    model.update_learning_rate(current_step, warmup_iter=opt['train'].get('warmup_iter', -1))

                # save latest models and training states every <save_checkpoint_freq> iterations
                if current_step % opt['logger']['save_checkpoint_freq'] == 0 and take_step:
                    if model.swa:
                        model.save(current_step, opt['logger']['overwrite_chkp'], loader=dataloaders['train'])
                    else:
                        model.save(current_step, opt['logger']['overwrite_chkp'])
                    model.save_training_state(
                        epoch=epoch + (n >= len(dataloaders['train'])),
                        iter_step=current_step,
                        latest=opt['logger']['overwrite_chkp']
                    )
                    logger.info('Models and training states saved.')

                # validation
                if dataloaders.get('val', None) and current_step % opt['train']['val_freq'] == 0 and take_step:
                    val_metrics = metrics.MetricsDict(metrics=opt['train'].get('metrics', None))
                    nlls = []
                    for val_data in dataloaders['val']:
                        model.feed_data(val_data)  # unpack data from data loader
                        model.test()  # run inference
                        if hasattr(model, 'nll'):
                            nll = model.nll if model.nll else 0
                            nlls.append(nll)

                        """
                        Get Visuals
                        """
                        visuals = model.get_current_visuals()  # get image results
                        img_name = os.path.splitext(os.path.basename(val_data['LR_path'][0]))[0]
                        img_dir = os.path.join(opt['path']['val_images'], img_name)
                        util.mkdir(img_dir)

                        # Save SR images for reference
                        sr_img = None
                        if hasattr(model, 'heats'):  # SRFlow
                            opt['train']['val_comparison'] = False
                            for heat in model.heats:
                                for i in range(model.n_sample):
                                    sr_img = tensor2np(visuals['SR', heat, i],
                                                       denormalize=opt['datasets']['train']['znorm'])
                                    if opt['train']['overwrite_val_imgs']:
                                        save_img_path = os.path.join(
                                            img_dir, '{:s}_h{:03d}_s{:d}.png'.format(
                                                img_name, int(heat * 100), i))
                                    else:
                                        save_img_path = os.path.join(
                                            img_dir, '{:s}_{:09d}_h{:03d}_s{:d}.png'.format(
                                                img_name, current_step, int(heat * 100), i))
                                    util.save_img(sr_img, save_img_path)
                        else:  # regular SR
                            sr_img = tensor2np(visuals['SR'], denormalize=opt['datasets']['train']['znorm'])
                            if opt['train']['overwrite_val_imgs']:
                                save_img_path = os.path.join(img_dir, '{:s}.png'.format(img_name))
                            else:
                                save_img_path = os.path.join(
                                    img_dir, '{:s}_{:d}.png'.format(img_name, current_step))
                            if not opt['train']['val_comparison']:
                                util.save_img(sr_img, save_img_path)
                        assert sr_img is not None

                        # Save GT images for reference
                        gt_img = tensor2np(visuals['HR'], denormalize=opt['datasets']['train']['znorm'])
                        if opt['train']['save_gt']:
                            save_img_path_gt = os.path.join(img_dir, '{:s}_GT.png'.format(img_name))
                            if not os.path.isfile(save_img_path_gt):
                                util.save_img(gt_img, save_img_path_gt)

                        # Save LQ images for reference
                        if opt['train']['save_lr']:
                            save_img_path_lq = os.path.join(img_dir, '{:s}_LQ.png'.format(img_name))
                            if not os.path.isfile(save_img_path_lq):
                                lq_img = tensor2np(visuals['LR'], denormalize=opt['datasets']['train']['znorm'])
                                util.save_img(lq_img, save_img_path_lq, scale=opt['scale'])

                        # save single images or LQ / SR comparison
                        if opt['train']['val_comparison']:
                            lr_img = tensor2np(visuals['LR'], denormalize=opt['datasets']['train']['znorm'])
                            util.save_img_comp([lr_img, sr_img], save_img_path)
                        # else:
                        #     util.save_img(sr_img, save_img_path)

                        """
                        Get Metrics
                        # TODO: test using tensor based metrics (batch) instead of numpy.
                        """
                        val_metrics.calculate_metrics(sr_img, gt_img, crop_size=opt['scale'])  # , only_y=True)

                    avg_metrics = val_metrics.get_averages()
                    if nlls:
                        avg_nll = sum(nlls) / len(nlls)
                    del val_metrics

                    # log
                    logger_m = ''
                    for r in avg_metrics:
                        formatted_res = r['name'].upper() + ': {:.5g}, '.format(r['average'])
                        logger_m += formatted_res
                    if nlls:
                        logger_m += 'avg_nll: {:.4e} '.format(avg_nll)

                    logger.info('# Validation # ' + logger_m[:-2])
                    logger_val = logging.getLogger('val')  # validation logger
                    logger_val.info('<epoch:{:3d}, iter:{:8,d}> '.format(epoch, current_step) + logger_m[:-2])
                    # memory_usage = torch.cuda.memory_allocated()/(1024.0 ** 3)  # in GB

                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        for r in avg_metrics:
                            tb_logger.add_scalar(r['name'], r['average'], current_step)
                        if nlls:
                            tb_logger.add_scalar('average nll', avg_nll, current_step)
                        # tb_logger.flush()
                        # tb_logger_valid.add_scalar(r['name'], r['average'], current_step)
                        # tb_logger_valid.flush()

                timerData.tick()

            timerEpoch.tock()
            logger.info('End of epoch {} / {} \t Time Taken: {:.4f} sec'.format(
                epoch, total_epochs, timerEpoch.get_last_iteration()))

        logger.info('Saving the final model.')
        if model.swa:
            model.save('latest', loader=dataloaders['train'])
        else:
            model.save('latest')
        logger.info('End of training.')

    except KeyboardInterrupt:
        # catch a KeyboardInterrupt and save the model and state to resume later
        if model.swa:
            model.save(current_step, True, loader=dataloaders['train'])
        else:
            model.save(current_step, True)
        model.save_training_state(epoch + (n >= len(dataloaders['train'])), current_step, True)
        logger.info('Training interrupted. Latest models and training states saved.')
def weights_best(self):
    path = "weights/{}/best".format(self.cnf['name'])
    mkdir(path)
    return os.path.join(path, '{epoch}_{timestamp}_{loss}.pkl')
def main():
    ############################################
    #
    # set options
    #
    ############################################
    parser = argparse.ArgumentParser()
    parser.add_argument('--opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none',
                        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    ############################################
    #
    # distributed training settings
    #
    ############################################
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        print("Rank:", rank)
        print("------------------DIST-------------------------")

    ############################################
    #
    # loading resume state if exists
    #
    ############################################
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    ############################################
    #
    # mkdir and loggers
    #
    ############################################
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            util.mkdir_and_rename(
                opt['path']['experiments_root'])  # rename experiment folder if exists
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if not key == 'experiments_root' and 'pretrain_model' not in key
                 and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        util.setup_logger('base_val', opt['path']['log'], 'val_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger_val = logging.getLogger('base_val')
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info('You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                            .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_', level=logging.INFO, screen=True)
        print("set train log")
        util.setup_logger('base_val', opt['path']['log'], 'val_', level=logging.INFO, screen=True)
        print("set val log")
        logger = logging.getLogger('base')
        logger_val = logging.getLogger('base_val')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    ############################################
    #
    # create train and val dataloader
    #
    ############################################
    # dataset_ratio = 200  # enlarge the size of each epoch, todo: what it is
    dataset_ratio = 1  # enlarge the size of each epoch, todo: what it is
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            # total_iters = int(opt['train']['niter'])
            # total_epochs = int(math.ceil(total_iters / train_size))
            total_iters = train_size
            total_epochs = int(opt['train']['epoch'])
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio)
                # total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
                total_epochs = int(opt['train']['epoch'])
                if opt['train']['enable'] == False:
                    total_epochs = 1
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    ############################################
    #
    # create model
    #
    ############################################
    model = create_model(opt)
    print("Model Created!")

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))
        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0
        print("Not Resume Training")

    ############################################
    #
    # training
    #
    ############################################
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))

    Avg_train_loss = AverageMeter()  # total
    if (opt['train']['pixel_criterion'] == 'cb+ssim'):
        Avg_train_loss_pix = AverageMeter()
        Avg_train_loss_ssim = AverageMeter()
    elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'):
        Avg_train_loss_pix = AverageMeter()
        Avg_train_loss_ssim = AverageMeter()
        Avg_train_loss_vmaf = AverageMeter()
    elif (opt['train']['pixel_criterion'] == 'ssim'):
        Avg_train_loss_ssim = AverageMeter()
    elif (opt['train']['pixel_criterion'] == 'msssim'):
        Avg_train_loss_msssim = AverageMeter()
    elif (opt['train']['pixel_criterion'] == 'cb+msssim'):
        Avg_train_loss_pix = AverageMeter()
        Avg_train_loss_msssim = AverageMeter()

    saved_total_loss = 10e10
    saved_total_PSNR = -1

    for epoch in range(start_epoch, total_epochs):
        ############################################
        #
        # Start a new epoch
        #
        ############################################
        # Turn into training mode
        # model = model.train()

        # reset total loss
        Avg_train_loss.reset()
        current_step = 0

        if (opt['train']['pixel_criterion'] == 'cb+ssim'):
            Avg_train_loss_pix.reset()
            Avg_train_loss_ssim.reset()
        elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'):
            Avg_train_loss_pix.reset()
            Avg_train_loss_ssim.reset()
            Avg_train_loss_vmaf.reset()
        elif (opt['train']['pixel_criterion'] == 'ssim'):
            Avg_train_loss_ssim = AverageMeter()
        elif (opt['train']['pixel_criterion'] == 'msssim'):
            Avg_train_loss_msssim = AverageMeter()
        elif (opt['train']['pixel_criterion'] == 'cb+msssim'):
            Avg_train_loss_pix = AverageMeter()
            Avg_train_loss_msssim = AverageMeter()

        if opt['dist']:
            train_sampler.set_epoch(epoch)

        for train_idx, train_data in enumerate(train_loader):

            if 'debug' in opt['name']:
                img_dir = os.path.join(opt['path']['train_images'])
                util.mkdir(img_dir)

                LQ = train_data['LQs']
                GT = train_data['GT']

                GT_img = util.tensor2img(GT)  # uint8
                save_img_path = os.path.join(
                    img_dir, '{:4d}_{:s}.png'.format(train_idx, 'debug_GT'))
                util.save_img(GT_img, save_img_path)

                for i in range(5):
                    LQ_img = util.tensor2img(LQ[0, i, ...])  # uint8
                    save_img_path = os.path.join(
                        img_dir, '{:4d}_{:s}_{:1d}.png'.format(train_idx, 'debug_LQ', i))
                    util.save_img(LQ_img, save_img_path)

                if (train_idx >= 3):
                    break

            if opt['train']['enable'] == False:
                message_train_loss = 'None'
                break

            current_step += 1
            if current_step > total_iters:
                print("Total Iteration Reached !")
                break

            #### update learning rate
            if opt['train']['lr_scheme'] == 'ReduceLROnPlateau':
                pass
            else:
                model.update_learning_rate(
                    current_step, warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)

            # if opt['train']['lr_scheme'] == 'ReduceLROnPlateau':
            #     model.optimize_parameters_without_schudlue(current_step)
            # else:
            model.optimize_parameters(current_step)

            if (opt['train']['pixel_criterion'] == 'cb+ssim'):
                Avg_train_loss.update(model.log_dict['total_loss'], 1)
                Avg_train_loss_pix.update(model.log_dict['l_pix'], 1)
                Avg_train_loss_ssim.update(model.log_dict['ssim_loss'], 1)
            elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'):
                Avg_train_loss.update(model.log_dict['total_loss'], 1)
                Avg_train_loss_pix.update(model.log_dict['l_pix'], 1)
                Avg_train_loss_ssim.update(model.log_dict['ssim_loss'], 1)
                Avg_train_loss_vmaf.update(model.log_dict['vmaf_loss'], 1)
            elif (opt['train']['pixel_criterion'] == 'ssim'):
                Avg_train_loss.update(model.log_dict['total_loss'], 1)
                Avg_train_loss_ssim.update(model.log_dict['ssim_loss'], 1)
            elif (opt['train']['pixel_criterion'] == 'msssim'):
                Avg_train_loss.update(model.log_dict['total_loss'], 1)
                Avg_train_loss_msssim.update(model.log_dict['msssim_loss'], 1)
            elif (opt['train']['pixel_criterion'] == 'cb+msssim'):
                Avg_train_loss.update(model.log_dict['total_loss'], 1)
                Avg_train_loss_pix.update(model.log_dict['l_pix'], 1)
                Avg_train_loss_msssim.update(model.log_dict['msssim_loss'], 1)
            else:
                Avg_train_loss.update(model.log_dict['l_pix'], 1)

            # add total train loss
            if (opt['train']['pixel_criterion'] == 'cb+ssim'):
                message_train_loss = ' pix_avg_loss: {:.4e}'.format(Avg_train_loss_pix.avg)
                message_train_loss += ' ssim_avg_loss: {:.4e}'.format(Avg_train_loss_ssim.avg)
                message_train_loss += ' total_avg_loss: {:.4e}'.format(Avg_train_loss.avg)
            elif (opt['train']['pixel_criterion'] == 'cb+ssim+vmaf'):
                message_train_loss = ' pix_avg_loss: {:.4e}'.format(Avg_train_loss_pix.avg)
                message_train_loss += ' ssim_avg_loss: {:.4e}'.format(Avg_train_loss_ssim.avg)
                message_train_loss += ' vmaf_avg_loss: {:.4e}'.format(Avg_train_loss_vmaf.avg)
                message_train_loss += ' total_avg_loss: {:.4e}'.format(Avg_train_loss.avg)
            elif (opt['train']['pixel_criterion'] == 'ssim'):
                message_train_loss = ' ssim_avg_loss: {:.4e}'.format(Avg_train_loss_ssim.avg)
                message_train_loss += ' total_avg_loss: {:.4e}'.format(Avg_train_loss.avg)
            elif (opt['train']['pixel_criterion'] == 'msssim'):
                message_train_loss = ' msssim_avg_loss: {:.4e}'.format(Avg_train_loss_msssim.avg)
                message_train_loss += ' total_avg_loss: {:.4e}'.format(Avg_train_loss.avg)
            elif (opt['train']['pixel_criterion'] == 'cb+msssim'):
                message_train_loss = ' pix_avg_loss: {:.4e}'.format(Avg_train_loss_pix.avg)
                message_train_loss += ' msssim_avg_loss: {:.4e}'.format(Avg_train_loss_msssim.avg)
                message_train_loss += ' total_avg_loss: {:.4e}'.format(Avg_train_loss.avg)
            else:
                message_train_loss = ' train_avg_loss: {:.4e}'.format(Avg_train_loss.avg)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        if rank <= 0:
                            tb_logger.add_scalar(k, v, current_step)
                message += message_train_loss
                if rank <= 0:
                    logger.info(message)

        ############################################
        #
        # end of one epoch, save epoch model
        #
        ############################################
        #### save models and training states
        # if current_step % opt['logger']['save_checkpoint_freq'] == 0:
        #     if rank <= 0:
        #         logger.info('Saving models and training states.')
        #         model.save(current_step)
        #         model.save('latest')
        #         # model.save_training_state(epoch, current_step)
        #         # todo delete previous weights
        #         previous_step = current_step - opt['logger']['save_checkpoint_freq']
        #         save_filename = '{}_{}.pth'.format(previous_step, 'G')
        #         save_path = os.path.join(opt['path']['models'], save_filename)
        #         if os.path.exists(save_path):
        #             os.remove(save_path)

        if epoch == 1:
            save_filename = '{:04d}_{}.pth'.format(0, 'G')
            save_path = os.path.join(opt['path']['models'], save_filename)
            if os.path.exists(save_path):
                os.remove(save_path)

        save_filename = '{:04d}_{}.pth'.format(epoch - 1, 'G')
        save_path = os.path.join(opt['path']['models'], save_filename)
        if os.path.exists(save_path):
            os.remove(save_path)

        if rank <= 0:
            logger.info('Saving models and training states.')
            save_filename = '{:04d}'.format(epoch)
            model.save(save_filename)
            # model.save('latest')
            # model.save_training_state(epoch, current_step)

        ############################################
        #
        # end of one epoch, do validation
        #
        ############################################
        #### validation
        # if opt['datasets'].get('val', None) and current_step % opt['train']['val_freq'] == 0:
        if opt['datasets'].get('val', None):
            if opt['model'] in ['sr', 'srgan'] and rank <= 0:  # image restoration validation
                # does not support multi-GPU validation
                pbar = util.ProgressBar(len(val_loader))
                avg_psnr = 0.
                idx = 0
                for val_data in val_loader:
                    idx += 1
                    img_name = os.path.splitext(
                        os.path.basename(val_data['LQ_path'][0]))[0]
                    img_dir = os.path.join(opt['path']['val_images'], img_name)
                    util.mkdir(img_dir)

                    model.feed_data(val_data)
                    model.test()

                    visuals = model.get_current_visuals()
                    sr_img = util.tensor2img(visuals['rlt'])  # uint8
                    gt_img = util.tensor2img(visuals['GT'])  # uint8

                    # Save SR images for reference
                    save_img_path = os.path.join(
                        img_dir, '{:s}_{:d}.png'.format(img_name, current_step))
                    # util.save_img(sr_img, save_img_path)

                    # calculate PSNR
                    sr_img, gt_img = util.crop_border([sr_img, gt_img], opt['scale'])
                    avg_psnr += util.calculate_psnr(sr_img, gt_img)
                    pbar.update('Test {}'.format(img_name))

                avg_psnr = avg_psnr / idx

                # log
                logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
                # tensorboard logger
                if opt['use_tb_logger'] and 'debug' not in opt['name']:
                    tb_logger.add_scalar('psnr', avg_psnr, current_step)
            else:  # video restoration validation
                if opt['dist']:
                    # todo : multi-GPU testing
                    psnr_rlt = {}  # with border and center frames
                    psnr_rlt_avg = {}
                    psnr_total_avg = 0.

                    ssim_rlt = {}  # with border and center frames
                    ssim_rlt_avg = {}
                    ssim_total_avg = 0.

                    val_loss_rlt = {}
                    val_loss_rlt_avg = {}
                    val_loss_total_avg = 0.

                    if rank == 0:
                        pbar = util.ProgressBar(len(val_set))

                    for idx in range(rank, len(val_set), world_size):
                        print('idx', idx)
                        if 'debug' in opt['name']:
                            if (idx >= 3):
                                break
                        val_data = val_set[idx]
                        val_data['LQs'].unsqueeze_(0)
                        val_data['GT'].unsqueeze_(0)
                        folder = val_data['folder']
                        idx_d, max_idx = val_data['idx'].split('/')
                        idx_d, max_idx = int(idx_d), int(max_idx)

                        if psnr_rlt.get(folder, None) is None:
                            psnr_rlt[folder] = torch.zeros(max_idx, dtype=torch.float32, device='cuda')
                        if ssim_rlt.get(folder, None) is None:
                            ssim_rlt[folder] = torch.zeros(max_idx, dtype=torch.float32, device='cuda')
                        if val_loss_rlt.get(folder, None) is None:
                            val_loss_rlt[folder] = torch.zeros(max_idx, dtype=torch.float32, device='cuda')
                        # tmp = torch.zeros(max_idx, dtype=torch.float32, device='cuda')

                        model.feed_data(val_data)
                        # model.test()
                        # model.test_stitch()
                        if opt['stitch'] == True:
                            model.test_stitch()
                        else:
                            model.test()  # large GPU memory

                        # visuals = model.get_current_visuals()
                        visuals = model.get_current_visuals(
                            save=True, name='{}_{}'.format(folder, idx),
                            save_path=opt['path']['val_images'])

                        rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                        gt_img = util.tensor2img(visuals['GT'])  # uint8

                        # calculate PSNR
                        psnr = util.calculate_psnr(rlt_img, gt_img)
                        psnr_rlt[folder][idx_d] = psnr

                        # calculate SSIM
                        ssim = util.calculate_ssim(rlt_img, gt_img)
                        ssim_rlt[folder][idx_d] = ssim

                        # calculate Val loss
                        val_loss = model.get_loss()
                        val_loss_rlt[folder][idx_d] = val_loss

                        logger.info('{}_{:02d} PSNR: {:.4f}, SSIM: {:.4f}'.format(
                            folder, idx, psnr, ssim))

                        if rank == 0:
                            for _ in range(world_size):
                                pbar.update('Test {} - {}/{}'.format(folder, idx_d, max_idx))

                    # collect data
                    for _, v in psnr_rlt.items():
                        dist.reduce(v, 0)
                    for _, v in ssim_rlt.items():
                        dist.reduce(v, 0)
                    for _, v in val_loss_rlt.items():
                        dist.reduce(v, 0)
                    dist.barrier()

                    if rank == 0:
                        psnr_rlt_avg = {}
                        psnr_total_avg = 0.
                        for k, v in psnr_rlt.items():
                            psnr_rlt_avg[k] = torch.mean(v).cpu().item()
                            psnr_total_avg += psnr_rlt_avg[k]
                        psnr_total_avg /= len(psnr_rlt)
                        log_s = '# Validation # PSNR: {:.4e}:'.format(psnr_total_avg)
                        for k, v in psnr_rlt_avg.items():
                            log_s += ' {}: {:.4e}'.format(k, v)
                        logger.info(log_s)

                        # ssim
                        ssim_rlt_avg = {}
                        ssim_total_avg = 0.
                        for k, v in ssim_rlt.items():
                            ssim_rlt_avg[k] = torch.mean(v).cpu().item()
                            ssim_total_avg += ssim_rlt_avg[k]
                        ssim_total_avg /= len(ssim_rlt)
                        log_s = '# Validation # SSIM: {:.4e}:'.format(ssim_total_avg)
                        for k, v in ssim_rlt_avg.items():
                            log_s += ' {}: {:.4e}'.format(k, v)
                        logger.info(log_s)

                        # added
                        val_loss_rlt_avg = {}
                        val_loss_total_avg = 0.
                        for k, v in val_loss_rlt.items():
                            val_loss_rlt_avg[k] = torch.mean(v).cpu().item()
                            val_loss_total_avg += val_loss_rlt_avg[k]
                        val_loss_total_avg /= len(val_loss_rlt)
                        log_l = '# Validation # Loss: {:.4e}:'.format(val_loss_total_avg)
                        for k, v in val_loss_rlt_avg.items():
                            log_l += ' {}: {:.4e}'.format(k, v)
                        logger.info(log_l)

                        message = ''
                        for v in model.get_current_learning_rate():
                            message += '{:.5e}'.format(v)

                        logger_val.info(
                            'Epoch {:02d}, LR {:s}, PSNR {:.4f}, SSIM {:.4f} Train {:s}, Val Total Loss {:.4e}'
                            .format(epoch, message, psnr_total_avg, ssim_total_avg,
                                    message_train_loss, val_loss_total_avg))

                        if opt['use_tb_logger'] and 'debug' not in opt['name']:
                            tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step)
                            for k, v in psnr_rlt_avg.items():
                                tb_logger.add_scalar(k, v, current_step)
                            # add val loss
                            tb_logger.add_scalar('val_loss_avg', val_loss_total_avg, current_step)
                            for k, v in val_loss_rlt_avg.items():
                                tb_logger.add_scalar(k, v, current_step)

                else:  # Todo: our function One GPU
                    pbar = util.ProgressBar(len(val_loader))
                    psnr_rlt = {}  # with border and center frames
                    psnr_rlt_avg = {}
                    psnr_total_avg = 0.

                    ssim_rlt = {}  # with border and center frames
                    ssim_rlt_avg = {}
                    ssim_total_avg = 0.

                    val_loss_rlt = {}
                    val_loss_rlt_avg = {}
                    val_loss_total_avg = 0.

                    for val_inx, val_data in enumerate(val_loader):
                        if 'debug' in opt['name']:
                            if (val_inx >= 5):
                                break
                        folder = val_data['folder'][0]
                        # idx_d = val_data['idx'].item()
                        idx_d = val_data['idx']
                        # border = val_data['border'].item()
                        if psnr_rlt.get(folder, None) is None:
                            psnr_rlt[folder] = []
                        if ssim_rlt.get(folder, None) is None:
                            ssim_rlt[folder] = []
                        if val_loss_rlt.get(folder, None) is None:
                            val_loss_rlt[folder] = []

                        # process the black blank [B N C H W]
                        print(val_data['LQs'].size())
                        H_S = val_data['LQs'].size(3)  # 540
                        W_S = val_data['LQs'].size(4)  # 960
                        print(H_S)
                        print(W_S)
                        blank_1_S = 0
                        blank_2_S = 0
                        print(val_data['LQs'][0, 2, 0, :, :].size())
                        for i in range(H_S):
                            if not sum(val_data['LQs'][0, 2, 0, i, :]) == 0:
                                blank_1_S = i - 1
                                # assert not sum(data_S[:, :, 0][i+1]) == 0
                                break
                        for i in range(H_S):
                            if not sum(val_data['LQs'][0, 2, 0, :, H_S - i - 1]) == 0:
                                blank_2_S = (H_S - 1) - i - 1
                                # assert not sum(data_S[:, :, 0][blank_2_S-1]) == 0
                                break
                        print('LQ :', blank_1_S, blank_2_S)

                        if blank_1_S == -1:
                            print('LQ has no blank')
                            blank_1_S = 0
                            blank_2_S = H_S

                        # val_data['LQs'] = val_data['LQs'][:,:,:,blank_1_S:blank_2_S,:]
                        print("LQ", val_data['LQs'].size())
                        # end of process the black blank

                        model.feed_data(val_data)
                        if opt['stitch'] == True:
                            model.test_stitch()
                        else:
                            model.test()  # large GPU memory

                        # process blank
                        blank_1_L = blank_1_S << 2
                        blank_2_L = blank_2_S << 2
                        print(blank_1_L, blank_2_L)
                        print(model.fake_H.size())
                        if not blank_1_S == 0:
                            # model.fake_H = model.fake_H[:,:,blank_1_L:blank_2_L,:]
                            model.fake_H[:, :, 0:blank_1_L, :] = 0
                            model.fake_H[:, :, blank_2_L:H_S, :] = 0
                        # end of process blank

                        visuals = model.get_current_visuals(
                            save=True, name='{}_{:02d}'.format(folder, val_inx),
                            save_path=opt['path']['val_images'])

                        rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                        gt_img = util.tensor2img(visuals['GT'])  # uint8

                        # calculate PSNR
                        psnr = util.calculate_psnr(rlt_img, gt_img)
                        psnr_rlt[folder].append(psnr)

                        # calculate SSIM
                        ssim = util.calculate_ssim(rlt_img, gt_img)
                        ssim_rlt[folder].append(ssim)

                        # val loss
                        val_loss = model.get_loss()
                        val_loss_rlt[folder].append(val_loss.item())

                        logger.info('{}_{:02d} PSNR: {:.4f}, SSIM: {:.4f}'.format(
                            folder, val_inx, psnr, ssim))
                        pbar.update('Test {} - {}'.format(folder, idx_d))

                    # average PSNR
                    for k, v in psnr_rlt.items():
                        psnr_rlt_avg[k] = sum(v) / len(v)
                        psnr_total_avg += psnr_rlt_avg[k]
                    psnr_total_avg /= len(psnr_rlt)
                    log_s = '# Validation # PSNR: {:.4e}:'.format(psnr_total_avg)
                    for k, v in psnr_rlt_avg.items():
                        log_s += ' {}: {:.4e}'.format(k, v)
                    logger.info(log_s)

                    # average SSIM
                    for k, v in ssim_rlt.items():
                        ssim_rlt_avg[k] = sum(v) / len(v)
                        ssim_total_avg += ssim_rlt_avg[k]
                    ssim_total_avg /= len(ssim_rlt)
                    log_s = '# Validation # SSIM: {:.4e}:'.format(ssim_total_avg)
                    for k, v in ssim_rlt_avg.items():
                        log_s += ' {}: {:.4e}'.format(k, v)
                    logger.info(log_s)

                    # average VMAF

                    # average Val LOSS
                    for k, v in val_loss_rlt.items():
                        val_loss_rlt_avg[k] = sum(v) / len(v)
                        val_loss_total_avg += val_loss_rlt_avg[k]
                    val_loss_total_avg /= len(val_loss_rlt)
                    log_l = '# Validation # Loss: {:.4e}:'.format(val_loss_total_avg)
                    for k, v in val_loss_rlt_avg.items():
                        log_l += ' {}: {:.4e}'.format(k, v)
                    logger.info(log_l)

                    # total validation log
                    message = ''
                    for v in model.get_current_learning_rate():
                        message += '{:.5e}'.format(v)

                    logger_val.info(
                        'Epoch {:02d}, LR {:s}, PSNR {:.4f}, SSIM {:.4f} Train {:s}, Val Total Loss {:.4e}'
                        .format(epoch, message, psnr_total_avg, ssim_total_avg,
                                message_train_loss, val_loss_total_avg))

                    # end add
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step)
                        for k, v in psnr_rlt_avg.items():
                            tb_logger.add_scalar(k, v, current_step)
                        # tb_logger.add_scalar('ssim_avg', ssim_total_avg, current_step)
                        # for k, v in ssim_rlt_avg.items():
                        #     tb_logger.add_scalar(k, v, current_step)
                        # add val loss
                        tb_logger.add_scalar('val_loss_avg', val_loss_total_avg, current_step)
                        for k, v in val_loss_rlt_avg.items():
                            tb_logger.add_scalar(k, v, current_step)

            ############################################
            #
            # end of validation, save model
            #
            ############################################
            logger.info("Finished an epoch, Check and Save the model weights")
            # we check the validation loss instead of training loss. OK~
            if saved_total_loss >= val_loss_total_avg:
                saved_total_loss = val_loss_total_avg
                # torch.save(model.state_dict(), args.save_path + "/best" + ".pth")
                model.save('best')
                logger.info("Best Weights updated for decreased validation loss")
            else:
                logger.info("Weights Not updated for undecreased validation loss")

            if saved_total_PSNR <= psnr_total_avg:
                saved_total_PSNR = psnr_total_avg
                model.save('bestPSNR')
                logger.info("Best Weights updated for increased validation PSNR")
            else:
                logger.info("Weights Not updated for unincreased validation PSNR")

        ############################################
        #
        # end of one epoch, schedule LR
        #
        ############################################
        # add scheduler  todo
        if opt['train']['lr_scheme'] == 'ReduceLROnPlateau':
            for scheduler in model.schedulers:
                # scheduler.step(val_loss_total_avg)
                scheduler.step(val_loss_total_avg)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('last')
        logger.info('End of training.')
        tb_logger.close()
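# The training loop above relies on an AverageMeter helper with reset()/update()/avg.
# A minimal sketch of such a running-average tracker, with the interface inferred from
# the calls above (this is illustrative, not the project's own implementation):
class AverageMeter(object):
    """Tracks a running average of a scalar value (e.g. a loss term)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # accumulate `val` observed over `n` samples and refresh the average
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count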
def psnr_main(opt, train_loader, val_loader, train_sampler, logger,
              resume_state=None, tb_logger=None, rank=-1):
    # create model
    model = create_model(opt)

    # resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))
        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    total_epochs = int(opt['train']['nepochs'])
    best_psnr = 0
    patience = 0
    all_results = []
    for epoch in range(start_epoch, total_epochs):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
        for batch_num, train_data in enumerate(train_loader):
            current_step += 1
            # training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)
            progress_bar(batch_num, len(train_loader), msg=None)

        # log
        if epoch % opt['logger']['print_freq'] == 0:
            logs = model.get_current_log()
            message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format(
                epoch, current_step, model.get_current_learning_rate())
            for k, v in logs.items():
                message += '{:s}: {:.4e} '.format(k, v)
                # tensorboard logger
                if opt['use_tb_logger'] and 'debug' not in opt['name']:
                    if rank <= 0:
                        tb_logger.add_scalar(k, v, current_step)
            if rank <= 0:
                logger.info(message)

        # batched validation
        if epoch % opt['train']['val_freq'] == 0 and rank <= 0:
            avg_psnr = 0.0
            idx = 0
            for batch_num, val_data in enumerate(val_loader):
                img_name = os.path.splitext(
                    os.path.basename(val_data['LQ_path'][0]))[0]
                img_dir = os.path.join(opt['path']['val_images'], img_name)
                util.mkdir(img_dir)

                model.feed_data(val_data)
                model.test()

                # calculate PSNR
                item_psnr = util.tensor_psnr(model.real_H, model.fake_H)
                if math.isfinite(item_psnr):
                    avg_psnr += item_psnr
                    idx += 1
                progress_bar(batch_num, len(val_loader), msg=None)

            avg_psnr = avg_psnr / idx
            all_results.append(avg_psnr)
            if avg_psnr < best_psnr:
                patience += 1
                if patience == opt['train']['epoch_patience']:
                    model.update_learning_rate(opt['train']['lr_decay'])
                    print("no improvement, final patience, updating learning rate to {}"
                          .format(model.get_current_learning_rate()))
                    patience = 0
                else:
                    print("no improvement, patience {} out of {}".format(
                        patience, opt['train']['epoch_patience']))
                if model.get_current_learning_rate() < opt['train']['min_lr']:
                    break
            else:
                best_psnr = avg_psnr
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save('latest')

            # log
            logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
            logger_val = logging.getLogger('val')  # validation logger
            logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e} (best: {:.4e})'.format(
                epoch, current_step, avg_psnr, best_psnr))
            # tensorboard logger
            if opt['use_tb_logger'] and 'debug' not in opt['name']:
                tb_logger.add_scalar('psnr', avg_psnr, current_step)

    if rank <= 0:
        logger.info('End of training.')
        json.dump(all_results,
                  open(os.path.join(opt['path']['log'], 'validation_results.json'), 'w'),
                  indent=2)
def main():
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, default='options/test/test_KPSAGAN.yml',
                        help='Path to option YAML file.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none',
                        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=False)

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # util.mkdir_and_rename(
            #     opt['path']['experiments_root'])  # rename experiment folder if exists
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if not key == 'experiments_root' and 'pretrain_model' not in key
                 and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        util.setup_logger('val', opt['path']['log'], 'val_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))
    else:
        util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    for phase, dataset_opt in opt['datasets'].items():
        val_set = create_dataset(dataset_opt)
        val_loader = create_dataloader(val_set, dataset_opt, opt, None)
        if rank <= 0:
            logger.info('Number of val images in [{:s}]: {:d}'.format(
                dataset_opt['name'], len(val_set)))

    #### create model
    model = create_model(opt)

    avg_psnr = 0.0
    idx = 0
    dataset_dir = '/srv/wuyichao/Super-Resolution/KPSAGAN/BasicSR-master/BasicSR-master-c/result_600000/'
    util.mkdir(dataset_dir)
    for val_data in val_loader:
        idx += 1
        img_name = os.path.splitext(os.path.basename(val_data['LQ_path'][0]))[0]
        logger.info(img_name)
        # img_dir = os.path.join(opt['path']['val_images'], img_name)
        # util.mkdir(img_dir)

        model.feed_data(val_data)
        model.test()

        visuals = model.get_current_visuals()
        sr_img = util.tensor2img(visuals['SR'])  # uint8
        gt_img = util.tensor2img(visuals['GT'])  # uint8

        # save images
        suffix = 'cut'  # opt['suffix']
        if suffix:
            save_img_path = osp.join(dataset_dir, img_name + suffix + '.png')
        else:
            save_img_path = osp.join(dataset_dir, img_name + '.png')
        util.save_img(sr_img, save_img_path)

        # calculate PSNR
        crop_size = opt['scale']
        gt_img = gt_img / 255.
        sr_img = sr_img / 255.
        cropped_sr_img = sr_img[crop_size:-crop_size, crop_size:-crop_size, :]
        cropped_gt_img = gt_img[crop_size:-crop_size, crop_size:-crop_size, :]
        avg_psnr += util.calculate_psnr(cropped_sr_img * 255, cropped_gt_img * 255)

    avg_psnr = avg_psnr / idx

    # log
    logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
    logger_val = logging.getLogger('val')  # validation logger
    logger_val.info('psnr: {:.4e}'.format(avg_psnr))
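# For reference, a minimal PSNR computation equivalent to what the util.calculate_psnr
# calls above are assumed to do for 8-bit images (a sketch for illustration, not the
# project's own implementation):
import numpy as np

def psnr_8bit(img1, img2):
    """PSNR in dB between two images with values in the [0, 255] range."""
    mse = np.mean((img1.astype(np.float64) - img2.astype(np.float64)) ** 2)
    if mse == 0:
        return float('inf')  # identical images
    return 20 * np.log10(255.0 / np.sqrt(mse))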
def final_weights_file(self):
    path = "weights/{}".format(self.cnf['name'])
    mkdir(path)
    return os.path.join(path, 'weights_final.pkl')
def retrain_weights_file(self):
    path = "weights/{}/retrain".format(self.cnf['name'])
    mkdir(path)
    return os.path.join(path, 'weights.pkl')
def main():
    dataset = 'REDS'  # REDS | Vimeo90K | DIV2K800_sub
    opt = {}
    opt['dist'] = False
    opt['gpu_ids'] = [0]
    if dataset == 'REDS':
        opt['name'] = 'test_REDS'
        opt['dataroot_GT'] = '../../datasets/REDS/train_sharp_wval.lmdb'
        opt['dataroot_LQ'] = '../../datasets/REDS/train_sharp_bicubic_wval.lmdb'
        opt['mode'] = 'REDS'
        opt['N_frames'] = 5
        opt['phase'] = 'train'
        opt['use_shuffle'] = True
        opt['n_workers'] = 8
        opt['batch_size'] = 16
        opt['GT_size'] = 256
        opt['LQ_size'] = 64
        opt['scale'] = 4
        opt['use_flip'] = True
        opt['use_rot'] = True
        opt['interval_list'] = [1]
        opt['random_reverse'] = False
        opt['border_mode'] = False
        opt['cache_keys'] = None
        opt['data_type'] = 'lmdb'  # img | lmdb | mc
    elif dataset == 'Vimeo90K':
        opt['name'] = 'test_Vimeo90K'
        opt['dataroot_GT'] = '../../datasets/vimeo90k/vimeo90k_train_GT.lmdb'
        opt['dataroot_LQ'] = '../../datasets/vimeo90k/vimeo90k_train_LR7frames.lmdb'
        opt['mode'] = 'Vimeo90K'
        opt['N_frames'] = 7
        opt['phase'] = 'train'
        opt['use_shuffle'] = True
        opt['n_workers'] = 8
        opt['batch_size'] = 16
        opt['GT_size'] = 256
        opt['LQ_size'] = 64
        opt['scale'] = 4
        opt['use_flip'] = True
        opt['use_rot'] = True
        opt['interval_list'] = [1]
        opt['random_reverse'] = False
        opt['border_mode'] = False
        opt['cache_keys'] = None
        opt['data_type'] = 'lmdb'  # img | lmdb | mc
    elif dataset == 'DIV2K800_sub':
        opt['name'] = 'DIV2K800'
        opt['dataroot_GT'] = '../../datasets/DIV2K/DIV2K800_sub.lmdb'
        opt['dataroot_LQ'] = '../../datasets/DIV2K/DIV2K800_sub_bicLRx4.lmdb'
        opt['mode'] = 'LQGT'
        opt['phase'] = 'train'
        opt['use_shuffle'] = True
        opt['n_workers'] = 8
        opt['batch_size'] = 16
        opt['GT_size'] = 128
        opt['scale'] = 4
        opt['use_flip'] = True
        opt['use_rot'] = True
        opt['color'] = 'RGB'
        opt['data_type'] = 'lmdb'  # img | lmdb
    else:
        raise ValueError('Please implement by yourself.')

    util.mkdir('tmp')
    train_set = create_dataset(opt)
    train_loader = create_dataloader(train_set, opt, opt, None)
    nrow = int(math.sqrt(opt['batch_size']))
    padding = 2 if opt['phase'] == 'train' else 0
    print('start...')
    for i, data in enumerate(train_loader):
        if i > 5:
            break
        print(i)
        if dataset == 'REDS' or dataset == 'Vimeo90K':
            LQs = data['LQs']
        else:
            LQ = data['LQ']
        GT = data['GT']

        if dataset == 'REDS' or dataset == 'Vimeo90K':
            for j in range(LQs.size(1)):
                torchvision.utils.save_image(LQs[:, j, :, :, :],
                                             'tmp/LQ_{:03d}_{}.png'.format(i, j),
                                             nrow=nrow, padding=padding, normalize=False)
        else:
            torchvision.utils.save_image(LQ, 'tmp/LQ_{:03d}.png'.format(i),
                                         nrow=nrow, padding=padding, normalize=False)
        torchvision.utils.save_image(GT, 'tmp/GT_{:03d}.png'.format(i),
                                     nrow=nrow, padding=padding, normalize=False)
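# Note on the grids saved above: with the default batch_size of 16,
# nrow = int(math.sqrt(16)) = 4, so each tmp/LQ_*.png and tmp/GT_*.png is a 4x4 grid
# of the batch (with padding=2 between tiles in the train phase).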
def train_dir(self):
    return ut.mkdir(pj(self.resdir, 'training'))
def main(config_path, trained_model_path):
    restore_epoch = None  # if None, restore the final epoch

    # Read a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        corpus = config['corpus']
        feature = config['feature']
        param = config['param']

    if corpus['label_type'] == 'phone':
        if corpus['social_signal_type'] in ['insert', 'insert3']:
            output_size = 41
        elif corpus['social_signal_type'] == 'insert2':
            output_size = 44
        elif corpus['social_signal_type'] == 'remove':
            output_size = 38
    elif corpus['label_type'] == 'character':
        if corpus['social_signal_type'] in ['insert', 'insert3']:
            output_size = 150
        elif corpus['social_signal_type'] == 'insert2':
            output_size = 153
        elif corpus['social_signal_type'] == 'remove':
            output_size = 147

    # Load model
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(batch_size=param['batch_size'],
                       input_size=feature['input_size'] * feature['num_stack'],
                       num_cell=param['num_cell'],
                       num_layer=param['num_layer'],
                       output_size=output_size,
                       clip_gradients=param['clip_grad'],
                       clip_activation=param['clip_activation'],
                       dropout_ratio_input=param['dropout_input'],
                       dropout_ratio_hidden=param['dropout_hidden'],
                       num_proj=param['num_proj'],
                       weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_cell'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    network.model_name += '_transfer_' + corpus['transfer_data_size']

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/csj/dialog/')
    network.model_dir = join(network.model_dir, 'ctc')
    network.model_dir = join(network.model_dir, corpus['label_type'])
    network.model_dir = join(network.model_dir, corpus['social_signal_type'])
    network.model_dir = join(network.model_dir, network.model_name)

    # Reset model directory
    if not os.path.isfile(os.path.join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError('File exists.')

    # Set process name
    setproctitle('ctc_csj_dialog_' + corpus['label_type'] + '_' +
                 param['optimizer'] + '_' + corpus['social_signal_type'] +
                 '_transfer_' + corpus['transfer_data_size'])

    # Save config file
    shutil.copyfile(config_path, os.path.join(network.model_dir, 'config.yml'))

    sys.stdout = open(os.path.join(network.model_dir, 'train.log'), 'w')
    print(network.model_name)

    do_fine_tune(network=network,
                 optimizer=param['optimizer'],
                 learning_rate=param['learning_rate'],
                 batch_size=param['batch_size'],
                 epoch_num=param['num_epoch'],
                 label_type=corpus['label_type'],
                 num_stack=feature['num_stack'],
                 num_skip=feature['num_skip'],
                 social_signal_type=corpus['social_signal_type'],
                 trained_model_path=trained_model_path,
                 restore_epoch=restore_epoch)

    sys.stdout = sys.__stdout__
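# Illustration of the name and path building above, using hypothetical config values
# (model_name='lstm', num_cell=256, num_layer=2, optimizer='adam', learning_rate=0.001,
# num_stack=3, transfer_data_size='default'): the script would produce
#   network.model_name == 'LSTM_256_2_adam_lr0.001_stack3_transfer_default'
# and nest the save directory as
#   /n/sd8/inaguma/result/csj/dialog/ctc/<label_type>/<social_signal_type>/<model_name>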
from datetime import datetime
import pprint
import os
import argparse

import numpy as np
import torchvision.models as models

from utils.util import mkdir
from data import FEATURE_DIR

mkdir(FEATURE_DIR)


class Config(object):
    def __init__(self, layers, cnf=None):
        self.layers = layers
        self.cnf = cnf
        pprint.pprint(cnf)

    def get(self, k, default=None):
        return self.cnf.get(k, default)

    @property
    def weights_epoch(self):
        path = "weights/{}/epochs".format(self.cnf['name'])
        mkdir(path)
        return os.path.join(path, '{epoch}_{timestamp}_{loss}.pkl')

    @property
    def weights_best(self):
        path = "weights/{}/best".format(self.cnf['name'])
        mkdir(path)