def process_checkpoint(in_file, out_file): checkpoint = torch.load(in_file, map_location='cpu') # remove optimizer for smaller file size if 'optimizer' in checkpoint: del checkpoint['optimizer'] # if it is necessary to remove some sensitive data in checkpoint['meta'], # add the code here. if digit_version(torch.__version__) >= digit_version('1.6'): torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False) else: torch.save(checkpoint, out_file) sha = subprocess.check_output(['sha256sum', out_file]).decode() if out_file.endswith('.pth'): out_file_name = out_file[:-4] else: out_file_name = out_file current_date = datetime.datetime.now().strftime('%Y%m%d') final_file = out_file_name + f'_{current_date}-{sha[:8]}.pth' subprocess.Popen(['mv', out_file, final_file]) print(f'Successfully generated the publish-ckpt as {final_file}.')
def main(): args = parse_args() cfg = mmcv.Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set multi-process settings setup_multi_processes(cfg) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids[0:1] warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 'Because we only support single GPU mode in ' 'non-distributed testing. Use the first GPU ' 'in `gpu_ids` now.') else: cfg.gpu_ids = [args.gpu_id] # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test, default_args=dict(test_mode=True)) # the extra round_up data will be removed during gpu/cpu collect data_loader = build_dataloader(dataset, samples_per_gpu=cfg.data.samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, round_up=True) # build the model and load checkpoint model = build_classifier(cfg.model) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if 'CLASSES' in checkpoint.get('meta', {}): CLASSES = checkpoint['meta']['CLASSES'] else: from mmcls.datasets import ImageNet warnings.simplefilter('once') warnings.warn('Class names are not saved in the checkpoint\'s ' 'meta data, use imagenet by default.') CLASSES = ImageNet.CLASSES if not distributed: if args.device == 'cpu': model = model.cpu() else: model = MMDataParallel(model, device_ids=cfg.gpu_ids) if not model.device_ids: assert mmcv.digit_version(mmcv.__version__) >= (1, 4, 4), \ 'To test with CPU, please confirm your mmcv version ' \ 'is not lower than v1.4.4' model.CLASSES = CLASSES show_kwargs = {} if args.show_options is None else args.show_options outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, **show_kwargs) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: results = {} logger = get_root_logger() if args.metrics: eval_results = dataset.evaluate(results=outputs, metric=args.metrics, metric_options=args.metric_options, logger=logger) results.update(eval_results) for k, v in eval_results.items(): if isinstance(v, np.ndarray): v = [round(out, 2) for out in v.tolist()] elif isinstance(v, Number): v = round(v, 2) else: raise ValueError(f'Unsupport metric type: {type(v)}') print(f'\n{k} : {v}') if args.out: if 'none' not in args.out_items: scores = np.vstack(outputs) pred_score = np.max(scores, axis=1) pred_label = np.argmax(scores, axis=1) pred_class = [CLASSES[lb] for lb in pred_label] res_items = { 'class_scores': scores, 'pred_score': pred_score, 'pred_label': pred_label, 'pred_class': pred_class } if 'all' in args.out_items: results.update(res_items) else: for key in args.out_items: results[key] = res_items[key] print(f'\ndumping results to {args.out}') mmcv.dump(results, args.out)
# Copyright (c) OpenMMLab. All rights reserved. import mmcv from mmcv import digit_version from .version import __version__ mmcv_minimum_version = '1.3.6' mmcv_maximum_version = '1.5.0' mmcv_version = digit_version(mmcv.__version__) assert (digit_version(mmcv_minimum_version) <= mmcv_version <= digit_version(mmcv_maximum_version)), \ f'MMCV=={mmcv.__version__} is used but incompatible. ' \ f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' __all__ = ['__version__']
import mmcv from mmcv import digit_version, parse_version_info from .version import __version__, short_version mmcv_minimum_version = '1.1.3' mmcv_maximum_version = '1.4.0' mmcv_version = digit_version(mmcv.__version__) version_info = parse_version_info(__version__) assert digit_version(mmcv_minimum_version) <= mmcv_version, \ f'MMCV=={mmcv.__version__} is used but incompatible. ' \ f'Please install mmcv>={mmcv_minimum_version}.' assert digit_version(mmcv_maximum_version) > mmcv_version, \ f'MMCV=={mmcv.__version__} is used but incompatible. ' \ f'Please install mmcv<{mmcv_maximum_version}.' __all__ = ['__version__', 'short_version', 'version_info']
def test_digit_version(): assert digit_version('0.2.16') == (0, 2, 16) assert digit_version('1.2.3') == (1, 2, 3) assert digit_version('1.2.3rc0') == (1, 2, 2, 0) assert digit_version('1.2.3rc1') == (1, 2, 2, 1) assert digit_version('1.0rc0') == (1, -1, 0)
def pytorch2onnx(model, input_shape, opset_version=11, dynamic_export=False, show=False, output_file='tmp.onnx', do_simplify=False, verify=False): """Export Pytorch model to ONNX model and verify the outputs are same between Pytorch and ONNX. Args: model (nn.Module): Pytorch model we want to export. input_shape (tuple): Use this input shape to construct the corresponding dummy input and execute the model. opset_version (int): The onnx op version. Default: 11. show (bool): Whether print the computation graph. Default: False. output_file (string): The path to where we store the output ONNX model. Default: `tmp.onnx`. verify (bool): Whether compare the outputs between Pytorch and ONNX. Default: False. """ model.cpu().eval() if hasattr(model.head, 'num_classes'): num_classes = model.head.num_classes # Some backbones use `num_classes=-1` to disable top classifier. elif getattr(model.backbone, 'num_classes', -1) > 0: num_classes = model.backbone.num_classes else: raise AttributeError('Cannot find "num_classes" in both head and ' 'backbone, please check the config file.') mm_inputs = _demo_mm_inputs(input_shape, num_classes) imgs = mm_inputs.pop('imgs') img_list = [img[None, :] for img in imgs] # replace original forward function origin_forward = model.forward model.forward = partial(model.forward, img_metas={}, return_loss=False) register_extra_symbolics(opset_version) # support dynamic shape export if dynamic_export: dynamic_axes = { 'input': { 0: 'batch', 2: 'width', 3: 'height' }, 'probs': { 0: 'batch' } } else: dynamic_axes = {} with torch.no_grad(): torch.onnx.export(model, (img_list, ), output_file, input_names=['input'], output_names=['probs'], export_params=True, keep_initializers_as_inputs=True, dynamic_axes=dynamic_axes, verbose=show, opset_version=opset_version) print(f'Successfully exported ONNX model: {output_file}') model.forward = origin_forward if do_simplify: import onnx import onnxsim from mmcv import digit_version min_required_version = '0.3.0' assert digit_version(mmcv.__version__) >= digit_version( min_required_version ), f'Requires to install onnx-simplify>={min_required_version}' if dynamic_axes: input_shape = (input_shape[0], input_shape[1], input_shape[2] * 2, input_shape[3] * 2) else: input_shape = (input_shape[0], input_shape[1], input_shape[2], input_shape[3]) imgs = _demo_mm_inputs(input_shape, model.head.num_classes).pop('imgs') input_dic = {'input': imgs.detach().cpu().numpy()} input_shape_dic = {'input': list(input_shape)} model_opt, check_ok = onnxsim.simplify( output_file, input_shapes=input_shape_dic, input_data=input_dic, dynamic_input_shape=dynamic_export) if check_ok: onnx.save(model_opt, output_file) print(f'Successfully simplified ONNX model: {output_file}') else: print('Failed to simplify ONNX model.') if verify: # check by onnx import onnx onnx_model = onnx.load(output_file) onnx.checker.check_model(onnx_model) # test the dynamic model if dynamic_export: dynamic_test_inputs = _demo_mm_inputs( (input_shape[0], input_shape[1], input_shape[2] * 2, input_shape[3] * 2), model.head.num_classes) imgs = dynamic_test_inputs.pop('imgs') img_list = [img[None, :] for img in imgs] # check the numerical value # get pytorch output pytorch_result = model(img_list, img_metas={}, return_loss=False)[0] # get onnx output input_all = [node.name for node in onnx_model.graph.input] input_initializer = [ node.name for node in onnx_model.graph.initializer ] net_feed_input = list(set(input_all) - set(input_initializer)) assert (len(net_feed_input) == 1) sess = rt.InferenceSession(output_file) onnx_result = sess.run( None, {net_feed_input[0]: img_list[0].detach().numpy()})[0] if not np.allclose(pytorch_result, onnx_result): raise ValueError( 'The outputs are different between Pytorch and ONNX') print('The outputs are same between Pytorch and ONNX')
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, device=None, meta=None): """Train a model. This method will build dataloaders, wrap the model and build a runner according to the provided config. Args: model (:obj:`torch.nn.Module`): The model to be run. dataset (:obj:`mmcls.datasets.BaseDataset` | List[BaseDataset]): The dataset used to train the model. It can be a single dataset, or a list of dataset with the same length as workflow. cfg (:obj:`mmcv.utils.Config`): The configs of the experiment. distributed (bool): Whether to train the model in a distributed environment. Defaults to False. validate (bool): Whether to do validation with :obj:`mmcv.runner.EvalHook`. Defaults to False. timestamp (str, optional): The timestamp string to auto generate the name of log files. Defaults to None. device (str, optional): TODO meta (dict, optional): A dict records some import information such as environment info and seed, which will be logged in logger hook. Defaults to None. """ logger = get_root_logger() # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] # The default loader config loader_cfg = dict( # cfg.gpus will be ignored if distributed num_gpus=cfg.ipu_replicas if device == 'ipu' else len(cfg.gpu_ids), dist=distributed, round_up=True, seed=cfg.get('seed'), sampler_cfg=cfg.get('sampler', None), ) # The overall dataloader settings loader_cfg.update({ k: v for k, v in cfg.data.items() if k not in [ 'train', 'val', 'test', 'train_dataloader', 'val_dataloader', 'test_dataloader' ] }) # The specific dataloader settings train_loader_cfg = {**loader_cfg, **cfg.data.get('train_dataloader', {})} data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: if device == 'cpu': warnings.warn( 'The argument `device` is deprecated. To use cpu to train, ' 'please refers to https://mmclassification.readthedocs.io/en' '/latest/getting_started.html#train-a-model') model = model.cpu() elif device == 'ipu': model = model.cpu() else: model = MMDataParallel(model, device_ids=cfg.gpu_ids) if not model.device_ids: from mmcv import __version__, digit_version assert digit_version(__version__) >= (1, 4, 4), \ 'To train with CPU, please confirm your mmcv version ' \ 'is not lower than v1.4.4' # build runner optimizer = build_optimizer(model, cfg.optimizer) if cfg.get('runner') is None: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) if device == 'ipu': if not cfg.runner['type'].startswith('IPU'): cfg.runner['type'] = 'IPU' + cfg.runner['type'] if 'options_cfg' not in cfg.runner: cfg.runner['options_cfg'] = {} cfg.runner['options_cfg']['replicationFactor'] = cfg.ipu_replicas cfg.runner['fp16_cfg'] = cfg.get('fp16', None) runner = build_runner(cfg.runner, default_args=dict(model=model, batch_processor=None, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: if device == 'ipu': from mmcv.device.ipu import IPUFp16OptimizerHook optimizer_config = IPUFp16OptimizerHook( **cfg.optimizer_config, loss_scale=fp16_cfg['loss_scale'], distributed=distributed) else: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, loss_scale=fp16_cfg['loss_scale'], distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None), custom_hooks_config=cfg.get( 'custom_hooks', None)) if distributed and cfg.runner['type'] == 'EpochBasedRunner': runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) # The specific dataloader settings val_loader_cfg = { **loader_cfg, 'shuffle': False, # Not shuffle by default 'sampler_cfg': None, # Not use sampler by default **cfg.data.get('val_dataloader', {}), } val_dataloader = build_dataloader(val_dataset, **val_loader_cfg) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook # `EvalHook` needs to be executed after `IterTimerHook`. # Otherwise, it will cause a bug if use `IterBasedRunner`. # Refers to https://github.com/open-mmlab/mmcv/issues/1261 runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, device=None, meta=None): logger = get_root_logger() # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] sampler_cfg = cfg.data.get('sampler', None) data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, round_up=True, seed=cfg.seed, sampler_cfg=sampler_cfg) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: if device == 'cpu': warnings.warn( 'The argument `device` is deprecated. To use cpu to train, ' 'please refers to https://mmclassification.readthedocs.io/en' '/latest/getting_started.html#train-a-model') model = model.cpu() else: model = MMDataParallel(model, device_ids=cfg.gpu_ids) if not model.device_ids: from mmcv import __version__, digit_version assert digit_version(__version__) >= (1, 4, 4), \ 'To train with CPU, please confirm your mmcv version ' \ 'is not lower than v1.4.4' # build runner optimizer = build_optimizer(model, cfg.optimizer) if cfg.get('runner') is None: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) runner = build_runner(cfg.runner, default_args=dict(model=model, batch_processor=None, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None), custom_hooks_config=cfg.get( 'custom_hooks', None)) if distributed and cfg.runner['type'] == 'EpochBasedRunner': runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=cfg.data.samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, round_up=True) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook # `EvalHook` needs to be executed after `IterTimerHook`. # Otherwise, it will cause a bug if use `IterBasedRunner`. # Refers to https://github.com/open-mmlab/mmcv/issues/1261 runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow)