def test_distributed(self, _):
    """Check dataloader construction in a (mocked) distributed environment."""
    # Arguments shared by every dataloader variant built below.
    # ``num_gpus`` is ignored when ``dist=True``.
    base_kwargs = dict(
        dataset=self.data,
        samples_per_gpu=self.samples_per_gpu,
        workers_per_gpu=self.workers_per_gpu,
        num_gpus=2,  # num_gpus will be ignored in distributed environment.
        dist=True)

    # Test default config
    loader = build_dataloader(**base_kwargs)
    # ``persistent_workers`` is only enabled on torch >= 1.8.0.
    if digit_version(torch.__version__) >= digit_version('1.8.0'):
        assert loader.persistent_workers
    elif hasattr(loader, 'persistent_workers'):
        assert not loader.persistent_workers

    assert loader.batch_size == self.samples_per_gpu
    assert loader.num_workers == self.workers_per_gpu
    # Shuffled order should not match this rank's deterministic slice.
    non_expect = torch.tensor(self.data[1::2])
    assert not all(torch.cat(list(iter(loader))) == non_expect)

    # Test without shuffle: rank 1 of 2 sees every second sample, in order.
    loader = build_dataloader(**base_kwargs, shuffle=False)
    expect = torch.tensor(self.data[1::2])
    assert all(torch.cat(list(iter(loader))) == expect)

    # Test with custom sampler_cfg (RepeatAugSampler).
    loader = build_dataloader(
        **base_kwargs,
        sampler_cfg=dict(type='RepeatAugSampler', selected_round=0),
        shuffle=False)
    expect = torch.tensor(
        [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6][1::2])
    assert all(torch.cat(list(iter(loader))) == expect)
def test_single_gpu(self, _):
    """Check dataloader construction in single-GPU (non-distributed) mode."""
    # Arguments shared by every dataloader variant built below.
    base_kwargs = dict(
        dataset=self.data,
        samples_per_gpu=self.samples_per_gpu,
        workers_per_gpu=self.workers_per_gpu,
        dist=False)

    # Test default config
    loader = build_dataloader(**base_kwargs)
    # ``persistent_workers`` is only enabled on torch >= 1.8.0.
    if digit_version(torch.__version__) >= digit_version('1.8.0'):
        assert loader.persistent_workers
    elif hasattr(loader, 'persistent_workers'):
        assert not loader.persistent_workers

    assert loader.batch_size == self.samples_per_gpu
    assert loader.num_workers == self.workers_per_gpu
    # With shuffling enabled the order should differ from the raw data.
    assert not all(
        torch.cat(list(iter(loader))) == torch.tensor(self.data))

    # Test without shuffle: the original order is preserved.
    loader = build_dataloader(**base_kwargs, shuffle=False)
    assert all(
        torch.cat(list(iter(loader))) == torch.tensor(self.data))

    # Test with custom sampler_cfg (RepeatAugSampler).
    loader = build_dataloader(
        **base_kwargs,
        sampler_cfg=dict(type='RepeatAugSampler', selected_round=0),
        shuffle=False)
    expect = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6]
    assert all(torch.cat(list(iter(loader))) == torch.tensor(expect))
def main():
    """Run backend (ONNXRuntime / TensorRT) inference on the test dataset.

    Evaluates the requested metrics, or dumps the raw predictions to
    ``args.out`` when no metrics are specified.
    """
    args = parse_args()
    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = mmcv.Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # build dataset and dataloader
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=cfg.data.samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        shuffle=False,
        round_up=False)

    # build the backend model and run inference.
    if args.backend == 'onnxruntime':
        model = ONNXRuntimeClassifier(
            args.model, class_names=dataset.CLASSES, device_id=0)
    elif args.backend == 'tensorrt':
        model = TensorRTClassifier(
            args.model, class_names=dataset.CLASSES, device_id=0)
    else:
        # BUG FIX: report the unknown backend name; the original printed
        # ``args.model`` (the model file path) here.
        print('Unknown backend: {}.'.format(args.backend))
        exit(1)

    model = MMDataParallel(model, device_ids=[0])
    model.CLASSES = dataset.CLASSES
    outputs = single_gpu_test(model, data_loader, args.show, args.show_dir)

    if args.metrics:
        results = dataset.evaluate(outputs, args.metrics, args.metric_options)
        for k, v in results.items():
            print(f'\n{k} : {v:.2f}')
    else:
        warnings.warn('Evaluation metrics are not specified.')
        scores = np.vstack(outputs)
        pred_score = np.max(scores, axis=1)
        pred_label = np.argmax(scores, axis=1)
        pred_class = [dataset.CLASSES[lb] for lb in pred_label]
        results = {
            'pred_score': pred_score,
            'pred_label': pred_label,
            'pred_class': pred_class
        }
        if not args.out:
            print('\nthe predicted result for the first element is '
                  f'pred_score = {pred_score[0]:.2f}, '
                  f'pred_label = {pred_label[0]} '
                  f'and pred_class = {pred_class[0]}. '
                  'Specify --out to save all results to files.')
    if args.out:
        print(f'\nwriting results to {args.out}')
        mmcv.dump(results, args.out)
def read_random_data(self):
    """Build the training dataloader and return its first batch."""
    train_set = build_dataset(self.cfg.data.train)
    print(f"train sample number: {len(train_set)}")
    loader = build_dataloader(
        dataset=train_set,
        samples_per_gpu=self.cfg.data.samples_per_gpu,
        workers_per_gpu=self.cfg.data.workers_per_gpu,
        num_gpus=1,
        dist=False,
        shuffle=False)
    # Only the first batch is needed.
    return next(iter(loader))
def test_create_dataloader(self):
    """Build a two-GPU training dataloader and inspect its first batch."""
    train_set = build_dataset(self.cfg.data.train)
    print(f"train sample number: {len(train_set)}")
    loader = build_dataloader(
        dataset=train_set,
        samples_per_gpu=self.cfg.data.samples_per_gpu,
        workers_per_gpu=self.cfg.data.workers_per_gpu,
        num_gpus=2,
        dist=False,
        shuffle=False)
    # Print the structure of the first batch only, then stop.
    for batch in loader:
        for key, value in batch.items():
            print(key, type(value.data[0]))
        print(batch["img"].data.shape)
        print(batch["gt_labels"].data)
        print(batch["gt_angles"].data)
        break
def main():
    """Test a classifier: evaluate metrics and/or dump predictions."""
    args = parse_args()

    cfg = mmcv.Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # set multi-process settings
    setup_multi_processes(cfg)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None

    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids[0:1]
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
                      'Because we only support single GPU mode in '
                      'non-distributed testing. Use the first GPU '
                      'in `gpu_ids` now.')
    else:
        cfg.gpu_ids = [args.gpu_id]

    # init distributed env first, since logger depends on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    dataset = build_dataset(cfg.data.test, default_args=dict(test_mode=True))
    # the extra round_up data will be removed during gpu/cpu collect
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=cfg.data.samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
        round_up=True)

    # build the model and load checkpoint
    model = build_classifier(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')

    # Resolve the class names, falling back to ImageNet when the
    # checkpoint's meta does not record them.
    if 'CLASSES' in checkpoint.get('meta', {}):
        CLASSES = checkpoint['meta']['CLASSES']
    else:
        from mmcls.datasets import ImageNet
        warnings.simplefilter('once')
        warnings.warn('Class names are not saved in the checkpoint\'s '
                      'meta data, use imagenet by default.')
        CLASSES = ImageNet.CLASSES

    if not distributed:
        if args.device == 'cpu':
            model = model.cpu()
        else:
            model = MMDataParallel(model, device_ids=cfg.gpu_ids)
            if not model.device_ids:
                assert mmcv.digit_version(mmcv.__version__) >= (1, 4, 4), \
                    'To test with CPU, please confirm your mmcv version ' \
                    'is not lower than v1.4.4'
        model.CLASSES = CLASSES
        show_kwargs = {} if args.show_options is None else args.show_options
        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
                                  **show_kwargs)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        results = {}
        logger = get_root_logger()
        if args.metrics:
            eval_results = dataset.evaluate(
                results=outputs,
                metric=args.metrics,
                metric_options=args.metric_options,
                logger=logger)
            results.update(eval_results)
            for k, v in eval_results.items():
                # Round scalar and array metrics uniformly for display.
                if isinstance(v, np.ndarray):
                    v = [round(out, 2) for out in v.tolist()]
                elif isinstance(v, Number):
                    v = round(v, 2)
                else:
                    raise ValueError(f'Unsupport metric type: {type(v)}')
                print(f'\n{k} : {v}')
        if args.out:
            if 'none' not in args.out_items:
                scores = np.vstack(outputs)
                pred_score = np.max(scores, axis=1)
                pred_label = np.argmax(scores, axis=1)
                pred_class = [CLASSES[lb] for lb in pred_label]
                res_items = {
                    'class_scores': scores,
                    'pred_score': pred_score,
                    'pred_label': pred_label,
                    'pred_class': pred_class
                }
                if 'all' in args.out_items:
                    results.update(res_items)
                else:
                    for key in args.out_items:
                        results[key] = res_items[key]
            print(f'\ndumping results to {args.out}')
            mmcv.dump(results, args.out)
def main():
    """Test a classifier and print metrics or dump predictions.

    BUG FIX: use ``checkpoint.get('meta', {})`` instead of
    ``checkpoint['meta']`` — the latter raised ``KeyError`` for checkpoints
    saved without a ``meta`` dict (the other test scripts in this project
    already use the guarded form).
    """
    args = parse_args()

    cfg = mmcv.Config.fromfile(args.config)
    if args.options is not None:
        cfg.merge_from_dict(args.options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=cfg.data.samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
        round_up=False)

    # build the model and load checkpoint
    model = build_classifier(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        if args.metrics:
            results = dataset.evaluate(outputs, args.metrics,
                                       args.metric_options)
            for k, v in results.items():
                print(f'\n{k} : {v:.2f}')
        else:
            warnings.warn('Evaluation metrics are not specified.')
            scores = np.vstack(outputs)
            pred_score = np.max(scores, axis=1)
            pred_label = np.argmax(scores, axis=1)
            # Guarded lookup: checkpoints without ``meta`` fall through to
            # the ImageNet default instead of raising KeyError.
            if 'CLASSES' in checkpoint.get('meta', {}):
                CLASSES = checkpoint['meta']['CLASSES']
            else:
                from mmcls.datasets import ImageNet
                warnings.simplefilter('once')
                warnings.warn('Class names are not saved in the checkpoint\'s '
                              'meta data, use imagenet by default.')
                CLASSES = ImageNet.CLASSES
            pred_class = [CLASSES[lb] for lb in pred_label]
            results = {
                'pred_score': pred_score,
                'pred_label': pred_label,
                'pred_class': pred_class
            }
            if not args.out:
                print('\nthe predicted result for the first element is '
                      f'pred_score = {pred_score[0]:.2f}, '
                      f'pred_label = {pred_label[0]} '
                      f'and pred_class = {pred_class[0]}. '
                      'Specify --out to save all results to files.')
    if args.out and rank == 0:
        print(f'\nwriting results to {args.out}')
        mmcv.dump(results, args.out)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Build dataloaders and a ``Runner`` from ``cfg``, then train ``model``."""
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders; a single dataset is wrapped into a list
    datasets = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed) for ds in datasets
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta,
        ema_cfg=cfg.get('ema_cfg', None))
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def main():
    """Test a classifier and print weighted-average per-top-k accuracy."""
    args = parse_args()

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    # init distributed env first, since logger depends on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=cfg.data.samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
        round_up=False)

    # build the model and load checkpoint
    model = build_classifier(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    _ = load_checkpoint(model, args.checkpoint, map_location='cpu')

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        # Collect per-batch accuracies and sample counts, then average the
        # accuracies weighted by how many samples each batch contained.
        nums = []
        results = {}
        for output in outputs:
            nums.append(output['num_samples'].item())
            for topk, v in output['accuracy'].items():
                results.setdefault(topk, []).append(v.item())
        assert sum(nums) == len(dataset)
        for topk, accs in results.items():
            avg_acc = np.average(accs, weights=nums)
            print(f'\n{topk} accuracy: {avg_acc:.2f}')
    if args.out and rank == 0:
        print(f'\nwriting results to {args.out}')
        mmcv.dump(outputs, args.out)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                device='cuda',
                meta=None):
    """Build dataloaders and a runner from ``cfg``, then train ``model``."""
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders; a single dataset is wrapped into a list
    datasets = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed) for ds in datasets
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    elif device == 'cuda':
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
    elif device == 'cpu':
        model = model.cpu()
    else:
        raise ValueError(f'unsupported device name {device}.')

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    # Fall back to an epoch-based runner for legacy configs.
    if cfg.get('runner') is None:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    runner = build_runner(
        cfg.runner,
        default_args=dict(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta))

    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get('momentum_config', None),
        custom_hooks_config=cfg.get('custom_hooks', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=True)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                device=None,
                meta=None):
    """Train a model.

    This method will build dataloaders, wrap the model and build a runner
    according to the provided config.

    Args:
        model (:obj:`torch.nn.Module`): The model to be run.
        dataset (:obj:`mmcls.datasets.BaseDataset` | List[BaseDataset]):
            The dataset used to train the model. It can be a single dataset,
            or a list of dataset with the same length as workflow.
        cfg (:obj:`mmcv.utils.Config`): The configs of the experiment.
        distributed (bool): Whether to train the model in a distributed
            environment. Defaults to False.
        validate (bool): Whether to do validation with
            :obj:`mmcv.runner.EvalHook`. Defaults to False.
        timestamp (str, optional): The timestamp string to auto generate the
            name of log files. Defaults to None.
        device (str, optional): Where to put the model. 'cpu' and 'ipu' are
            handled explicitly (the 'cpu' value is deprecated); any other
            value, including None, trains on GPU via ``MMDataParallel``.
            Defaults to None.
        meta (dict, optional): A dict records some import information such as
            environment info and seed, which will be logged in logger hook.
            Defaults to None.
    """
    logger = get_root_logger()

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # The default loader config
    loader_cfg = dict(
        # cfg.gpus will be ignored if distributed
        num_gpus=cfg.ipu_replicas if device == 'ipu' else len(cfg.gpu_ids),
        dist=distributed,
        round_up=True,
        seed=cfg.get('seed'),
        sampler_cfg=cfg.get('sampler', None),
    )
    # The overall dataloader settings
    loader_cfg.update({
        k: v
        for k, v in cfg.data.items() if k not in [
            'train', 'val', 'test', 'train_dataloader', 'val_dataloader',
            'test_dataloader'
        ]
    })
    # The specific dataloader settings
    train_loader_cfg = {**loader_cfg, **cfg.data.get('train_dataloader', {})}
    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        if device == 'cpu':
            warnings.warn(
                'The argument `device` is deprecated. To use cpu to train, '
                'please refers to https://mmclassification.readthedocs.io/en'
                '/latest/getting_started.html#train-a-model')
            model = model.cpu()
        elif device == 'ipu':
            model = model.cpu()
        else:
            model = MMDataParallel(model, device_ids=cfg.gpu_ids)
            if not model.device_ids:
                from mmcv import __version__, digit_version
                assert digit_version(__version__) >= (1, 4, 4), \
                    'To train with CPU, please confirm your mmcv version ' \
                    'is not lower than v1.4.4'

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    # Fall back to an epoch-based runner for legacy configs.
    if cfg.get('runner') is None:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    if device == 'ipu':
        # Promote the runner type to its IPU variant and propagate the
        # replication factor / fp16 settings into the runner config.
        if not cfg.runner['type'].startswith('IPU'):
            cfg.runner['type'] = 'IPU' + cfg.runner['type']
        if 'options_cfg' not in cfg.runner:
            cfg.runner['options_cfg'] = {}
        cfg.runner['options_cfg']['replicationFactor'] = cfg.ipu_replicas
        cfg.runner['fp16_cfg'] = cfg.get('fp16', None)

    runner = build_runner(
        cfg.runner,
        default_args=dict(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta))

    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        if device == 'ipu':
            from mmcv.device.ipu import IPUFp16OptimizerHook
            optimizer_config = IPUFp16OptimizerHook(
                **cfg.optimizer_config,
                loss_scale=fp16_cfg['loss_scale'],
                distributed=distributed)
        else:
            optimizer_config = Fp16OptimizerHook(
                **cfg.optimizer_config,
                loss_scale=fp16_cfg['loss_scale'],
                distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get('momentum_config', None),
        custom_hooks_config=cfg.get('custom_hooks', None))
    if distributed and cfg.runner['type'] == 'EpochBasedRunner':
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        # The specific dataloader settings
        val_loader_cfg = {
            **loader_cfg,
            'shuffle': False,  # Not shuffle by default
            'sampler_cfg': None,  # Not use sampler by default
            **cfg.data.get('val_dataloader', {}),
        }
        val_dataloader = build_dataloader(val_dataset, **val_loader_cfg)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook
        # `EvalHook` needs to be executed after `IterTimerHook`.
        # Otherwise, it will cause a bug if use `IterBasedRunner`.
        # Refers to https://github.com/open-mmlab/mmcv/issues/1261
        runner.register_hook(
            eval_hook(val_dataloader, **eval_cfg), priority='LOW')

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
def main():
    """Test a classifier: evaluate metrics and/or dump predictions."""
    args = parse_args()

    cfg = mmcv.Config.fromfile(args.config)
    if args.options is not None:
        cfg.merge_from_dict(args.options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    assert args.metrics or args.out, \
        'Please specify at least one of output path and evaluation metrics.'

    # init distributed env first, since logger depends on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    # the extra round_up data will be removed during gpu/cpu collect
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=cfg.data.samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
        round_up=True)

    # build the model and load checkpoint
    model = build_classifier(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')

    # Resolve class names, defaulting to ImageNet when missing from meta.
    if 'CLASSES' in checkpoint.get('meta', {}):
        CLASSES = checkpoint['meta']['CLASSES']
    else:
        from mmcls.datasets import ImageNet
        warnings.simplefilter('once')
        warnings.warn('Class names are not saved in the checkpoint\'s '
                      'meta data, use imagenet by default.')
        CLASSES = ImageNet.CLASSES

    if not distributed:
        if args.device == 'cpu':
            model = model.cpu()
        else:
            model = MMDataParallel(model, device_ids=[0])
        model.CLASSES = CLASSES
        show_kwargs = {} if args.show_options is None else args.show_options
        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
                                  **show_kwargs)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        results = {}
        if args.metrics:
            eval_results = dataset.evaluate(outputs, args.metrics,
                                            args.metric_options)
            results.update(eval_results)
            for k, v in eval_results.items():
                print(f'\n{k} : {v:.2f}')
        if args.out:
            scores = np.vstack(outputs)
            pred_score = np.max(scores, axis=1)
            pred_label = np.argmax(scores, axis=1)
            pred_class = [CLASSES[lb] for lb in pred_label]
            results.update({
                'class_scores': scores,
                'pred_score': pred_score,
                'pred_label': pred_label,
                'pred_class': pred_class
            })
            print(f'\ndumping results to {args.out}')
            mmcv.dump(results, args.out)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                device=None,
                meta=None):
    """Build dataloaders and a runner from ``cfg``, then train ``model``."""
    logger = get_root_logger()

    # prepare data loaders; a single dataset is wrapped into a list
    datasets = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    sampler_cfg = cfg.data.get('sampler', None)
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed,
            sampler_cfg=sampler_cfg) for ds in datasets
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    elif device == 'cpu':
        warnings.warn(
            'The argument `device` is deprecated. To use cpu to train, '
            'please refers to https://mmclassification.readthedocs.io/en'
            '/latest/getting_started.html#train-a-model')
        model = model.cpu()
    else:
        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
        if not model.device_ids:
            from mmcv import __version__, digit_version
            assert digit_version(__version__) >= (1, 4, 4), \
                'To train with CPU, please confirm your mmcv version ' \
                'is not lower than v1.4.4'

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    # Fall back to an epoch-based runner for legacy configs.
    if cfg.get('runner') is None:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    runner = build_runner(
        cfg.runner,
        default_args=dict(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta))

    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get('momentum_config', None),
        custom_hooks_config=cfg.get('custom_hooks', None))
    if distributed and cfg.runner['type'] == 'EpochBasedRunner':
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=True)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook
        # `EvalHook` needs to be executed after `IterTimerHook`.
        # Otherwise, it will cause a bug if use `IterBasedRunner`.
        # Refers to https://github.com/open-mmlab/mmcv/issues/1261
        runner.register_hook(
            eval_hook(val_dataloader, **eval_cfg), priority='LOW')

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)