def parse_options(is_train=True):
    """Read the command line, load the option file and set up the run.

    The option file given by ``-opt`` is loaded through ``parse``. The
    distributed backend is initialised unless ``--launcher none`` is used,
    and the random seed is fixed (offset by the process rank).

    Args:
        is_train (bool): Forwarded to ``parse``. Default: True.

    Returns:
        The options returned by ``parse``, with the 'dist', 'rank',
        'world_size' and 'manual_seed' entries filled in.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-opt', type=str, required=True, help='Path to option YAML file.')
    cli.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm'],
        default='none',
        help='job launcher')
    cli.add_argument('--local_rank', type=int, default=0)
    args = cli.parse_args()

    opt = parse(args.opt, is_train=is_train)

    # distributed settings
    launcher = args.launcher
    use_dist = launcher != 'none'
    opt['dist'] = use_dist
    if not use_dist:
        print('Disable distributed.', flush=True)
    elif launcher == 'slurm' and 'dist_params' in opt:
        # slurm may carry extra init parameters in the option file
        init_dist(launcher, **opt['dist_params'])
    else:
        init_dist(launcher)

    rank, world_size = get_dist_info()
    opt['rank'] = rank
    opt['world_size'] = world_size

    # random seed: draw one when the option file does not pin it
    base_seed = opt.get('manual_seed')
    if base_seed is None:
        base_seed = random.randint(1, 10000)
        opt['manual_seed'] = base_seed
    # each rank gets its own seed
    set_random_seed(base_seed + opt['rank'])

    return opt
def create_dataloader(dataset, dataset_opt, num_gpu=1, dist=False, sampler=None, seed=None):
    """Build the dataloader for a dataset according to its options.

    Args:
        dataset (torch.utils.data.Dataset): Dataset to load from.
        dataset_opt (dict): Dataset options. Keys read here:
            phase (str): 'train', 'val' or 'test'.
            num_worker_per_gpu (int): Workers per GPU (train phase only).
            batch_size_per_gpu (int): Batch size per GPU (train phase only).
            pin_memory (bool, optional): Defaults to False.
            prefetch_mode (str, optional): 'cpu' selects PrefetchDataLoader.
            num_prefetch_queue (int, optional): Defaults to 1.
        num_gpu (int): Number of GPUs. Used only in the train phase.
            Default: 1.
        dist (bool): Whether in distributed training. Used only in the
            train phase. Default: False.
        sampler (torch.utils.data.sampler): Data sampler. Default: None.
        seed (int | None): Seed. Default: None.

    Returns:
        A ``PrefetchDataLoader`` when ``prefetch_mode`` is 'cpu', otherwise a
        ``torch.utils.data.DataLoader``.

    Raises:
        ValueError: If the phase is not 'train', 'val' or 'test'.
    """
    phase = dataset_opt['phase']
    rank, _ = get_dist_info()

    if phase == 'train':
        per_gpu_batch = dataset_opt['batch_size_per_gpu']
        per_gpu_workers = dataset_opt['num_worker_per_gpu']
        if dist:
            # distributed training: every process serves a single GPU
            batch_size, num_workers = per_gpu_batch, per_gpu_workers
        else:
            # non-distributed training: one loader feeds all GPUs
            factor = num_gpu if num_gpu != 0 else 1
            batch_size = per_gpu_batch * factor
            num_workers = per_gpu_workers * factor

        # workers are only seeded when a seed is given
        init_fn = None
        if seed is not None:
            init_fn = partial(
                worker_init_fn, num_workers=num_workers, rank=rank, seed=seed)

        dataloader_args = {
            'dataset': dataset,
            'batch_size': batch_size,
            # shuffle only when no sampler decides the order
            'shuffle': sampler is None,
            'num_workers': num_workers,
            'sampler': sampler,
            'drop_last': True,
            'worker_init_fn': init_fn,
        }
    elif phase in ('val', 'test'):
        # validation
        dataloader_args = {
            'dataset': dataset,
            'batch_size': 1,
            'shuffle': False,
            'num_workers': 0,
        }
    else:
        raise ValueError(f'Wrong dataset phase: {phase}. '
                         "Supported ones are 'train', 'val' and 'test'.")

    dataloader_args['pin_memory'] = dataset_opt.get('pin_memory', False)

    prefetch_mode = dataset_opt.get('prefetch_mode')
    if prefetch_mode != 'cpu':
        # prefetch_mode=None: Normal dataloader
        # prefetch_mode='cuda': dataloader for CUDAPrefetcher
        return torch.utils.data.DataLoader(**dataloader_args)

    # CPUPrefetcher
    num_prefetch_queue = dataset_opt.get('num_prefetch_queue', 1)
    get_root_logger().info(f'Use {prefetch_mode} prefetch dataloader: '
                           f'num_prefetch_queue = {num_prefetch_queue}')
    return PrefetchDataLoader(
        num_prefetch_queue=num_prefetch_queue, **dataloader_args)
def parse_options(root_path, is_train=True):
    """Parse the command line and the option YAML file into an options dict.

    Besides loading the YAML file this function initialises the distributed
    backend (unless ``--launcher none``), fixes the random seed, applies
    ``--force_yml`` overrides and fills in dataset and path entries.

    Args:
        root_path (str): Root directory under which the 'experiments'
            (training) or 'results' (testing) folders are placed.
        is_train (bool): Whether the options are for training. Default: True.

    Returns:
        tuple: ``(opt, args)`` - the options dict and the parsed
        ``argparse`` namespace.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-opt', type=str, required=True, help='Path to option YAML file.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm'],
        default='none',
        help='job launcher')
    parser.add_argument('--auto_resume', action='store_true')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument(
        '--force_yml',
        nargs='+',
        default=None,
        help='Force to update yml files. Examples: train:ema_decay=0.999')
    args = parser.parse_args()

    # parse yml to dict
    # NOTE(review): the Loader comes from ordered_yaml(); confirm it derives
    # from a safe loader if option files can come from untrusted sources.
    with open(args.opt, mode='r') as f:
        opt = yaml.load(f, Loader=ordered_yaml()[0])

    # distributed settings
    if args.launcher == 'none':
        opt['dist'] = False
        print('Disable distributed.', flush=True)
    else:
        opt['dist'] = True
        if args.launcher == 'slurm' and 'dist_params' in opt:
            init_dist(args.launcher, **opt['dist_params'])
        else:
            init_dist(args.launcher)
    opt['rank'], opt['world_size'] = get_dist_info()

    # random seed
    seed = opt.get('manual_seed')
    if seed is None:
        seed = random.randint(1, 10000)
        opt['manual_seed'] = seed
    # offset by rank so that every process uses a different seed
    set_random_seed(seed + opt['rank'])

    # force to update yml options
    if args.force_yml is not None:
        for entry in args.force_yml:
            # Split on the first '=' only, so the value may itself hold '='.
            keys, value = entry.split('=', 1)
            keys, value = keys.strip(), value.strip()
            value = _postprocess_yml_value(value)
            # Walk the nested dict directly instead of building a source
            # string for exec(): same result, but a crafted key cannot inject
            # code and keys containing quotes no longer break the update.
            # Intermediate keys must already exist (KeyError otherwise).
            *parent_keys, leaf_key = keys.split(':')
            node = opt
            for key in parent_keys:
                node = node[key]
            node[leaf_key] = value

    opt['auto_resume'] = args.auto_resume
    opt['is_train'] = is_train

    # debug setting
    if args.debug and not opt['name'].startswith('debug'):
        opt['name'] = 'debug_' + opt['name']

    if opt['num_gpu'] == 'auto':
        opt['num_gpu'] = torch.cuda.device_count()

    # datasets
    for phase, dataset in opt['datasets'].items():
        # for multiple datasets, e.g., val_1, val_2; test_1, test_2
        phase = phase.split('_')[0]
        dataset['phase'] = phase
        if 'scale' in opt:
            dataset['scale'] = opt['scale']
        if dataset.get('dataroot_gt') is not None:
            dataset['dataroot_gt'] = osp.expanduser(dataset['dataroot_gt'])
        if dataset.get('dataroot_lq') is not None:
            dataset['dataroot_lq'] = osp.expanduser(dataset['dataroot_lq'])

    # paths: expand '~' in resume-state and pretrained-network entries
    for key, val in opt['path'].items():
        if (val is not None) and ('resume_state' in key
                                  or 'pretrain_network' in key):
            opt['path'][key] = osp.expanduser(val)

    if is_train:
        experiments_root = osp.join(root_path, 'experiments', opt['name'])
        opt['path']['experiments_root'] = experiments_root
        opt['path']['models'] = osp.join(experiments_root, 'models')
        opt['path']['training_states'] = osp.join(experiments_root,
                                                  'training_states')
        opt['path']['log'] = experiments_root
        opt['path']['visualization'] = osp.join(experiments_root,
                                                'visualization')

        # change some options for debug mode
        if 'debug' in opt['name']:
            if 'val' in opt:
                opt['val']['val_freq'] = 8
            opt['logger']['print_freq'] = 1
            opt['logger']['save_checkpoint_freq'] = 8
    else:  # test
        results_root = osp.join(root_path, 'results', opt['name'])
        opt['path']['results_root'] = results_root
        opt['path']['log'] = results_root
        opt['path']['visualization'] = osp.join(results_root, 'visualization')

    return opt, args
def dist_validation(self, dataloader, current_iter, tb_logger, save_img):
    """Validate frame by frame, splitting the dataset across ranks.

    Each rank handles the dataset indices ``rank, rank + world_size, ...``.
    Metric values are accumulated in ``self.metric_results``, reduced onto
    rank 0 in distributed mode and logged from rank 0.

    Args:
        dataloader: Loader whose ``dataset`` attribute is iterated directly.
        current_iter: Passed through to the metric logging.
        tb_logger: Passed through to the metric logging.
        save_img (bool): Whether to write the result images to disk.

    Raises:
        NotImplementedError: If ``save_img`` is set while
            ``self.opt['is_train']`` is true.
    """
    dataset = dataloader.dataset
    dataset_name = dataset.opt['name']
    with_metrics = self.opt['val']['metrics'] is not None

    # initialize self.metric_results
    # It is a dict: {
    #    'folder1': tensor (num_frame x len(metrics)),
    #    'folder2': tensor (num_frame x len(metrics))
    # }
    if with_metrics and not hasattr(self, 'metric_results'):
        self.metric_results = {}
        num_frame_each_folder = Counter(dataset.data_info['folder'])
        for folder, num_frame in num_frame_each_folder.items():
            self.metric_results[folder] = torch.zeros(
                num_frame,
                len(self.opt['val']['metrics']),
                dtype=torch.float32,
                device='cuda')

    rank, world_size = get_dist_info()
    # Only touch metric_results when metrics are enabled: without them the
    # attribute may never have been created and accessing it would raise
    # AttributeError.
    if with_metrics:
        for tensor in self.metric_results.values():
            tensor.zero_()

    # record all frames (border and center frames)
    if rank == 0:
        pbar = tqdm(total=len(dataset), unit='frame')
    for idx in range(rank, len(dataset), world_size):
        val_data = dataset[idx]
        val_data['lq'].unsqueeze_(0)
        val_data['gt'].unsqueeze_(0)
        folder = val_data['folder']
        # 'idx' is a string of the form '<frame_idx>/<max_idx>'
        frame_idx, max_idx = val_data['idx'].split('/')
        lq_path = val_data['lq_path']

        self.feed_data(val_data)
        self.test()
        visuals = self.get_current_visuals()
        result_img = tensor2img([visuals['result']])
        if 'gt' in visuals:
            gt_img = tensor2img([visuals['gt']])
            del self.gt

        # tentative for out of GPU memory
        del self.lq
        del self.output
        torch.cuda.empty_cache()

        if save_img:
            if self.opt['is_train']:
                raise NotImplementedError(
                    'saving image is not supported during training.')
            if 'vimeo' in dataset_name.lower():  # vimeo90k dataset
                split_result = lq_path.split('/')
                img_name = (f'{split_result[-3]}_{split_result[-2]}_'
                            f'{split_result[-1].split(".")[0]}')
            else:  # other datasets, e.g., REDS, Vid4
                img_name = osp.splitext(osp.basename(lq_path))[0]

            if self.opt['val']['suffix']:
                print(
                    'self.opt[val][suffix](BasicSR/basicsr/models/video_base_model.py):',
                    self.opt['val']['suffix'])
                save_img_path = osp.join(
                    self.opt['path']['visualization'], dataset_name, folder,
                    f'{img_name}_{self.opt["val"]["suffix"]}.png')
            else:
                save_img_path = osp.join(
                    self.opt['path']['visualization'], dataset_name, folder,
                    f'{img_name}_{self.opt["name"]}.png')
            imwrite(result_img, save_img_path)

        if with_metrics:
            # calculate metrics
            # work on a copy because 'type' is popped from every entry
            opt_metric = deepcopy(self.opt['val']['metrics'])
            for metric_idx, opt_ in enumerate(opt_metric.values()):
                metric_type = opt_.pop('type')
                result = getattr(metric_module, metric_type)(result_img,
                                                             gt_img, **opt_)
                self.metric_results[folder][int(frame_idx),
                                            metric_idx] += result

        # progress bar
        if rank == 0:
            # rank 0 advances the bar on behalf of every rank
            for _ in range(world_size):
                pbar.update(1)
                pbar.set_description(
                    f'Test {folder}:'
                    f'{int(frame_idx) + world_size}/{max_idx}')
    if rank == 0:
        pbar.close()

    if with_metrics:
        if self.opt['dist']:
            # collect data among GPUs
            for tensor in self.metric_results.values():
                dist.reduce(tensor, 0)
            dist.barrier()
        # else: assume use one gpu in non-dist testing

        if rank == 0:
            self._log_validation_metric_values(current_iter, dataset_name,
                                               tb_logger)
def dist_validation(self, dataloader, current_iter, tb_logger, save_img):
    """Run validation on rank 0 only by delegating to nondist_validation.

    A dedicated distributed validation is not implemented here; every rank
    other than 0 returns immediately without doing any work.
    """
    rank, _ = get_dist_info()
    if rank != 0:
        return
    self.nondist_validation(dataloader, current_iter, tb_logger, save_img)
def dist_validation(self, dataloader, current_iter, tb_logger, save_img):
    """Validate folder by folder, splitting the dataset across ranks.

    Every rank runs the same number of iterations: the index range is padded
    to a multiple of ``world_size`` and padded iterations re-run the last
    item without recording anything. Metric values are accumulated in
    ``self.metric_results``, reduced onto rank 0 in distributed mode and
    logged from rank 0.

    Args:
        dataloader: Loader whose ``dataset`` attribute is iterated directly.
        current_iter: Passed through to the metric logging.
        tb_logger: Passed through to the metric logging.
        save_img (bool): Whether to write the result images to disk.

    Raises:
        NotImplementedError: If ``save_img`` is set while
            ``self.opt['is_train']`` is true.
    """
    dataset = dataloader.dataset
    dataset_name = dataset.opt['name']
    metrics_opt = self.opt['val']['metrics']
    with_metrics = metrics_opt is not None

    # self.metric_results is a dict: {
    #    'folder1': tensor (num_frame x len(metrics)),
    #    'folder2': tensor (num_frame x len(metrics))
    # }
    if with_metrics:
        if not hasattr(self, 'metric_results'):
            # only execute in the first run
            self.metric_results = {}
            frames_per_folder = Counter(dataset.data_info['folder'])
            for name, count in frames_per_folder.items():
                self.metric_results[name] = torch.zeros(
                    count,
                    len(self.opt['val']['metrics']),
                    dtype=torch.float32,
                    device='cuda')
        # initialize the best metric results
        self._initialize_best_metric_results(dataset_name)

    rank, world_size = get_dist_info()

    # zero self.metric_results
    if with_metrics:
        for buf in self.metric_results.values():
            buf.zero_()

    metric_data = {}
    num_folders = len(dataset)
    # number of extra iterations needed to reach a multiple of world_size
    num_pad = (world_size - (num_folders % world_size)) % world_size
    if rank == 0:
        pbar = tqdm(total=len(dataset), unit='folder')

    # Will evaluate (num_folders + num_pad) times, but only the first
    # num_folders results will be recorded. (To avoid wait-dead)
    for i in range(rank, num_folders + num_pad, world_size):
        val_data = dataset[min(i, num_folders - 1)]
        folder = val_data['folder']

        # compute outputs: add a leading dim only while feeding the model
        val_data['lq'].unsqueeze_(0)
        val_data['gt'].unsqueeze_(0)
        self.feed_data(val_data)
        val_data['lq'].squeeze_(0)
        val_data['gt'].squeeze_(0)

        self.test()
        visuals = self.get_current_visuals()
        has_gt = 'gt' in visuals

        # tentative for out of GPU memory
        del self.lq
        del self.output
        if has_gt:
            del self.gt
        torch.cuda.empty_cache()

        if self.center_frame_only:
            # insert a dim at position 1 so the per-frame loop below applies
            visuals['result'] = visuals['result'].unsqueeze(1)
            if has_gt:
                visuals['gt'] = visuals['gt'].unsqueeze(1)

        # padded iterations are evaluated but never recorded
        if i >= num_folders:
            continue

        # evaluate
        for frame_no in range(visuals['result'].size(1)):
            result_img = tensor2img(
                [visuals['result'][0, frame_no, :, :, :]])  # uint8, bgr
            metric_data['img'] = result_img
            if has_gt:
                gt_img = tensor2img(
                    [visuals['gt'][0, frame_no, :, :, :]])  # uint8, bgr
                metric_data['img2'] = gt_img

            if save_img:
                if self.opt['is_train']:
                    raise NotImplementedError(
                        'saving image is not supported during training.')
                vis_root = self.opt['path']['visualization']
                if self.center_frame_only:  # vimeo-90k
                    path_parts = val_data['lq_path'].split('/')
                    clip_ = path_parts[-3]
                    seq_ = path_parts[-2]
                    name_ = f'{clip_}_{seq_}'
                    img_path = osp.join(vis_root, dataset_name, folder,
                                        f"{name_}_{self.opt['name']}.png")
                else:  # others
                    # image name only for REDS dataset
                    img_path = osp.join(
                        vis_root, dataset_name, folder,
                        f"{frame_no:08d}_{self.opt['name']}.png")
                imwrite(result_img, img_path)

            # calculate metrics
            if with_metrics:
                metric_opts = self.opt['val']['metrics'].values()
                for metric_idx, opt_ in enumerate(metric_opts):
                    value = calculate_metric(metric_data, opt_)
                    self.metric_results[folder][frame_no,
                                                metric_idx] += value

        # progress bar
        if rank == 0:
            # rank 0 advances the bar on behalf of every rank
            for _ in range(world_size):
                pbar.update(1)
                pbar.set_description(f'Folder: {folder}')

    if rank == 0:
        pbar.close()

    if not with_metrics:
        return

    if self.opt['dist']:
        # collect data among GPUs
        for buf in self.metric_results.values():
            dist.reduce(buf, 0)
        dist.barrier()

    if rank == 0:
        self._log_validation_metric_values(current_iter, dataset_name,
                                           tb_logger)
def dist_validation(self, dataloader, current_iter, tb_logger, save_img):
    """Validate frame by frame, optionally piping the results to ffmpeg.

    Each rank handles the dataset indices ``rank, rank + world_size, ...``.
    When ``self.opt['val']['save_vid']`` is truthy, every result frame is
    written as PNG to the stdin of an ``ffmpeg`` process that encodes
    ``out.avi`` in the visualization folder. Metric values are accumulated
    in ``self.metric_results``, reduced onto rank 0 in distributed mode and
    logged from rank 0.

    Args:
        dataloader: Loader whose ``dataset`` attribute is iterated directly.
        current_iter: Passed through to the metric logging.
        tb_logger: Passed through to the metric logging.
        save_img (bool): Whether to write the result images to disk.

    Raises:
        NotImplementedError: If ``save_img`` is set while
            ``self.opt['is_train']`` is true.
    """
    dataset = dataloader.dataset
    dataset_name = dataset.opt['name']
    # a missing 'save_vid' option is treated as disabled
    save_vid = self.opt['val'].get('save_vid', False)
    with_metrics = self.opt['val']['metrics'] is not None

    # initialize self.metric_results
    # It is a dict: {
    #    'folder1': tensor (num_frame x len(metrics)),
    #    'folder2': tensor (num_frame x len(metrics))
    # }
    if with_metrics and not hasattr(self, 'metric_results'):
        self.metric_results = {}
        num_frame_each_folder = Counter(dataset.data_info['folder'])
        for folder, num_frame in num_frame_each_folder.items():
            self.metric_results[folder] = torch.zeros(
                num_frame,
                len(self.opt['val']['metrics']),
                dtype=torch.float32,
                device='cuda')

    rank, world_size = get_dist_info()
    if with_metrics:
        for tensor in self.metric_results.values():
            tensor.zero_()

    vid = None
    pbar = None
    try:
        if save_vid:
            # NOTE(review): every rank starts its own ffmpeg on the same
            # output path; confirm save_vid is only used with a single
            # process.
            save_path = osp.join(self.opt['path']['visualization'],
                                 'out.avi')
            fps = '30'
            crf = '18'
            # stderr goes to DEVNULL so no file handle of our own is left
            # open.
            vid = sp.Popen([
                'ffmpeg', '-framerate', fps, '-i', '-', '-c:v', 'libx264',
                '-preset', 'veryslow', '-crf', crf, '-y', save_path
            ],
                           stdin=sp.PIPE,
                           stderr=sp.DEVNULL)

        # record all frames (border and center frames)
        if rank == 0:
            pbar = tqdm(total=len(dataset), unit='frame')

        for idx in range(rank, len(dataset), world_size):
            val_data = dataset[idx]
            val_data['lq'].unsqueeze_(0)
            val_data['gt'].unsqueeze_(0)
            # Needed by both the image-saving and the metric branch, so
            # they are read unconditionally (binding them only under
            # save_img made the metric branch fail with NameError).
            folder = val_data['folder']
            # 'idx' is a string of the form '<frame_idx>/<max_idx>'
            frame_idx, _ = val_data['idx'].split('/')

            self.feed_data(val_data)
            self.test()
            visuals = self.get_current_visuals()
            result_img = tensor2img([visuals['result']])
            if 'gt' in visuals:
                gt_img = tensor2img([visuals['gt']])
                del self.gt

            # tentative for out of GPU memory
            del self.lq
            del self.output
            torch.cuda.empty_cache()

            if save_img:
                if self.opt['is_train']:
                    raise NotImplementedError(
                        'saving image is not supported during training.')
                lq_path = val_data['lq_path']
                if 'vimeo' in dataset_name.lower():  # vimeo90k dataset
                    split_result = lq_path.split('/')
                    img_name = (f'{split_result[-3]}_{split_result[-2]}_'
                                f'{split_result[-1].split(".")[0]}')
                else:  # other datasets, e.g., REDS, Vid4
                    img_name = osp.splitext(osp.basename(lq_path))[0]

                if self.opt['val']['suffix']:
                    save_img_path = osp.join(
                        self.opt['path']['visualization'], dataset_name,
                        folder,
                        f'{img_name}_{self.opt["val"]["suffix"]}.png')
                else:
                    save_img_path = osp.join(
                        self.opt['path']['visualization'], dataset_name,
                        folder, f'{img_name}_{self.opt["name"]}.png')
                imwrite(result_img, save_img_path)

            if vid is not None:
                # reverse the last axis before handing the frame to PIL
                # (presumably BGR -> RGB; confirm against tensor2img)
                frame = Image.fromarray(result_img[..., ::-1])
                frame.save(vid.stdin, 'PNG')

            if with_metrics:
                # calculate metrics
                # work on a copy because 'type' is popped from every entry
                opt_metric = deepcopy(self.opt['val']['metrics'])
                for metric_idx, opt_ in enumerate(opt_metric.values()):
                    metric_type = opt_.pop('type')
                    result = getattr(metric_module,
                                     metric_type)(result_img, gt_img, **opt_)
                    self.metric_results[folder][int(frame_idx),
                                                metric_idx] += result

            # progress bar
            if pbar is not None:
                # rank 0 advances the bar on behalf of every rank
                for _ in range(world_size):
                    pbar.update(1)
    finally:
        if pbar is not None:
            pbar.close()
        # Only finish the encoder if one was started; closing stdin
        # signals end of input, then wait for ffmpeg to exit.
        if vid is not None:
            vid.stdin.close()
            vid.wait()

    if with_metrics:
        if self.opt['dist']:
            # collect data among GPUs
            for tensor in self.metric_results.values():
                dist.reduce(tensor, 0)
            dist.barrier()
        # else: assume use one gpu in non-dist testing

        if rank == 0:
            self._log_validation_metric_values(current_iter, dataset_name,
                                               tb_logger)