def main(): parser = argparse.ArgumentParser(description='Benchmark dataloading') parser.add_argument('config', help='train config file path') args = parser.parse_args() cfg = Config.fromfile(args.config) # init logger before other steps logger = get_root_logger() logger.info(f'MMPose Version: {__version__}') logger.info(f'Config: {cfg.text}') dataset = build_dataset(cfg.data.train) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # Start progress bar after first 5 batches prog_bar = mmcv.ProgressBar( len(dataset) - 5 * cfg.data.samples_per_gpu, start=False) for i, data in enumerate(data_loader): if i == 5: prog_bar.start() for _ in data['img']: if i < 5: continue prog_bar.update()
def main(): args = parse_args() cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True args.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test, dict(test_mode=True)) data_loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_posenet(cfg.model) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) _ = load_checkpoint(model, args.checkpoint, map_location='cpu') # for backward compatibility if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() eval_config = cfg.get('eval_config', {}) eval_config = merge_configs(eval_config, dict(metrics=args.eval)) if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) dataset.evaluate(outputs, args.work_dir, **eval_config)
def main(): args = parse_args() cfg = Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # build the dataloader dataset = build_dataset(cfg.data.val) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # build the model and load checkpoint model = build_posenet(cfg.model) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) if args.fuse_conv_bn: model = fuse_conv_bn(model) model = MMDataParallel(model, device_ids=[0]) # the first several iterations may be very slow so skip them num_warmup = 5 pure_inf_time = 0 # benchmark with total batch and take the average for i, data in enumerate(data_loader): torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): model(return_loss=False, **data) torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if i >= num_warmup: pure_inf_time += elapsed if (i + 1) % args.log_interval == 0: its = (i + 1 - num_warmup) / pure_inf_time print(f'Done item [{i + 1:<3}], {its:.2f} items / s') print(f'Overall average: {its:.2f} items / s') print(f'Total time: {pure_inf_time:.2f} s')
def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.options is not None: cfg.merge_from_dict(args.options) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) if args.autoscale_lr: # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed model = build_posenet(cfg.model) datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) val_dataset.pipeline = cfg.data.train.pipeline datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmpose version, config file content # checkpoints as meta data cfg.checkpoint_config.meta = dict( mmpose_version=__version__ + get_git_hash(digits=7), config=cfg.pretty_text, ) train_model( model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (Dataset): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] # step 1: give default values and override (if exist) from cfg.data loader_cfg = { **dict( seed=cfg.get('seed'), drop_last=False, dist=distributed, num_gpus=len(cfg.gpu_ids)), **({} if torch.__version__ != 'parrots' else dict( prefetch_num=2, pin_memory=False, )), **dict((k, cfg.data[k]) for k in [ 'samples_per_gpu', 'workers_per_gpu', 'shuffle', 'seed', 'drop_last', 'prefetch_num', 'pin_memory', 'persistent_workers', ] if k in cfg.data) } # step 2: cfg.data.train_dataloader has highest priority train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {})) data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset] # determine whether use adversarial training precess or not use_adverserial_train = cfg.get('use_adversarial_train', False) # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', True) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel if use_adverserial_train: # Use DistributedDataParallelWrapper for adversarial training model = DistributedDataParallelWrapper( model, device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizers(model, cfg.optimizer) runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp if use_adverserial_train: # The optimizer step process is included in the train_step function # of the model, so the runner should NOT include optimizer hook. optimizer_config = None else: # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( samples_per_gpu=1, workers_per_gpu=cfg.data.get('workers_per_gpu', 1), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, drop_last=False, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
img = model.show_result(img, result, skeleton, radius=radius, pose_kpt_color=pose_kpt_color, pose_limb_color=pose_limb_color, kpt_score_thr=kpt_score_thr, show=show, out_file=out_file) return img cfg = Config.fromfile('configs/top_down/hrnet/cowacar/cowacar.py') dataset = build_dataset(cfg.data.test, dict(test_mode=True)) data_loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # for d in datasets: # img = (255 * d['img'].permute(1, 2, 0).numpy())[:, :, ::-1] # hm = d['target'].max(axis=0, keepdims=True).transpose(1, 2, 0) # hm = cv2.resize(hm, None, fx=4, fy=4, interpolation=cv2.INTER_LINEAR) # img *= 0.8 # img[..., -1] += 255 * 0.2 * hm # cv2.imshow('0', img.astype(np.uint8)) # cv2.waitKey(0)
def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True args.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test, dict(test_mode=True)) dataloader_setting = dict(samples_per_gpu=1, workers_per_gpu=cfg.data.get( 'workers_per_gpu', 1), dist=distributed, shuffle=False, drop_last=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('test_dataloader', {})) data_loader = build_dataloader(dataset, **dataloader_setting) # print(data_loader) # build the model and load checkpoint model = build_posenet(cfg.model) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() eval_config = cfg.get('evaluation', {}) eval_config = merge_configs(eval_config, dict(metric=args.eval)) if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) results = dataset.evaluate(outputs, args.work_dir, **eval_config) for k, v in sorted(results.items()): print(f'{k}: {v}')
def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set multi-process settings setup_multi_processes(cfg) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test, dict(test_mode=True)) # step 1: give default values and override (if exist) from cfg.data loader_cfg = { **dict(seed=cfg.get('seed'), drop_last=False, dist=distributed), **({} if torch.__version__ != 'parrots' else dict( prefetch_num=2, pin_memory=False, )), **dict((k, cfg.data[k]) for k in [ 'seed', 'prefetch_num', 'pin_memory', 'persistent_workers', ] if k in cfg.data) } # step2: cfg.data.test_dataloader has higher priority test_loader_cfg = { **loader_cfg, **dict(shuffle=False, drop_last=False), **dict(workers_per_gpu=cfg.data.get('workers_per_gpu', 1)), **dict(samples_per_gpu=cfg.data.get('samples_per_gpu', 1)), **cfg.data.get('test_dataloader', {}) } data_loader = build_dataloader(dataset, **test_loader_cfg) # build the model and load checkpoint model = build_posenet(cfg.model) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[args.gpu_id]) outputs = single_gpu_test(model, data_loader) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() eval_config = cfg.get('evaluation', {}) eval_config = merge_configs(eval_config, dict(metric=args.eval)) if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) results = dataset.evaluate(outputs, cfg.work_dir, **eval_config) for k, v in sorted(results.items()): print(f'{k}: {v}')
def main(): args = parse_args() if 'cuda' in args.device.lower(): if torch.cuda.is_available(): with_cuda = True else: raise RuntimeError('No CUDA device found, please check it again.') else: with_cuda = False if args.root_work_dir is None: # get the current time stamp now = datetime.now() ts = now.strftime('%Y_%m_%d_%H_%M') args.root_work_dir = f'work_dirs/inference_speed_test_{ts}' mmcv.mkdir_or_exist(osp.abspath(args.root_work_dir)) cfg = mmcv.load(args.config) dummy_datasets = mmcv.load(args.dummy_dataset_config)['dummy_datasets'] results = [] for i in range(args.priority + 1): models = cfg['model_list'][f'P{i}'] for cur_model in models: cfg_file = cur_model['config'] model_cfg = Config.fromfile(cfg_file) test_dataset = model_cfg['data']['test'] dummy_dataset = dummy_datasets[test_dataset['type']] test_dataset.update(dummy_dataset) dataset = build_dataset(test_dataset) data_loader = build_dataloader( dataset, samples_per_gpu=args.batch_size, workers_per_gpu=model_cfg.data.workers_per_gpu, dist=False, shuffle=False) data_loader = IterLoader(data_loader) if 'pretrained' in model_cfg.model.keys(): del model_cfg.model['pretrained'] model = init_pose_model(model_cfg, device=args.device.lower()) fp16_cfg = model_cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) if args.fuse_conv_bn: model = fuse_conv_bn(model) # benchmark with several iterations and take the average pure_inf_time = 0 speed = [] for iteration in range(args.num_iters + args.num_warmup): data = next(data_loader) data['img'] = data['img'].to(args.device.lower()) data['img_metas'] = data['img_metas'].data[0] if with_cuda: torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): model(return_loss=False, **data) if with_cuda: torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if iteration >= args.num_warmup: pure_inf_time += elapsed speed.append(1 / elapsed) speed_mean = np.mean(speed) speed_std = np.std(speed) split_line = '=' * 30 result = f'{split_line}\nModel config:{cfg_file}\n' \ f'Device: {args.device}\n' \ f'Batch size: {args.batch_size}\n' \ f'Overall average speed: {speed_mean:.2f} \u00B1 ' \ f'{speed_std:.2f} items / s\n' \ f'Total iters: {args.num_iters}\n'\ f'Total time: {pure_inf_time:.2f} s \n{split_line}\n'\ print(result) results.append(result) print('!!!Please be cautious if you use the results in papers. ' 'You may need to check if all ops are included and verify that the ' 'speed computation is correct.') with open(osp.join(args.root_work_dir, 'inference_speed.txt'), 'w') as f: for res in results: f.write(res)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (Dataset): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] dataloader_setting = dict( samples_per_gpu=cfg.data.get('samples_per_gpu', {}), workers_per_gpu=cfg.data.get('workers_per_gpu', {}), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) dataloader_setting = dict(dataloader_setting, **cfg.data.get('train_dataloader', {})) data_loaders = [ build_dataloader(ds, **dataloader_setting) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', True) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( # samples_per_gpu=cfg.data.get('samples_per_gpu', {}), samples_per_gpu=1, workers_per_gpu=cfg.data.get('workers_per_gpu', {}), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)