def test_get_root_logger():
    # Reset the initialized log
    mmcv.utils.logging.logger_initialized = {}
    with tempfile.TemporaryDirectory() as tmpdirname:
        log_path = osp.join(tmpdirname, 'test.log')

        logger = get_root_logger(log_file=log_path)
        message1 = 'adhsuadghj'
        logger.info(message1)

        logger2 = get_root_logger()
        message2 = 'm,tkrgmkr'
        logger2.info(message2)

        with open(log_path, 'r') as f:
            lines = f.readlines()
            assert message1 in lines[0]
            assert message2 in lines[1]

        assert logger is logger2

        handlers = list(logger.handlers)
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        os.remove(log_path)
def simulate_train(data_loader, cfg, by_epoch=True):
    # build logger, data_loader, model and optimizer
    logger = get_root_logger()
    data_loaders = [data_loader]
    model = SimpleModel()
    optimizer = build_optimizer(model, cfg.optimizer)

    # build runner
    if by_epoch:
        runner = DummyEpochBasedRunner(
            max_epochs=cfg.runner.max_epochs,
            model=model,
            optimizer=optimizer,
            logger=logger)
    else:
        runner = DummyIterBasedRunner(
            max_iters=cfg.runner.max_iters,
            model=model,
            optimizer=optimizer,
            logger=logger)

    # register hooks
    runner.register_training_hooks(
        lr_config=cfg.lr_config,
        custom_hooks_config=cfg.get('custom_hooks', None),
    )

    # only use the first train workflow
    workflow = cfg.workflow[:1]
    assert workflow[0][0] == 'train'
    return runner.run(data_loaders, workflow)
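# For reference: `simulate_train` only touches a handful of config fields
# (cfg.optimizer, cfg.lr_config, cfg.runner.max_epochs / max_iters,
# cfg.workflow and the optional `custom_hooks`). Below is a minimal sketch of
# a config that satisfies it; the concrete values are illustrative
# assumptions, not taken from any real config file.
from mmcv import Config

minimal_cfg = Config(
    dict(
        optimizer=dict(type='SGD', lr=0.1, momentum=0.9),
        lr_config=dict(policy='step', step=[1]),
        runner=dict(type='EpochBasedRunner', max_epochs=2),
        workflow=[('train', 1)],
    ))

# Hypothetical usage with any iterable data loader, e.g. the fake loader
# built in the lr-visualization `main` function further below:
# lr_list = simulate_train(data_loader, minimal_cfg, by_epoch=True)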
def _prepare_relative_position_bias_table(self, state_dict, prefix, *args,
                                          **kwargs):
    state_dict_model = self.state_dict()
    all_keys = list(state_dict_model.keys())
    for key in all_keys:
        if 'relative_position_bias_table' in key:
            ckpt_key = prefix + key
            if ckpt_key not in state_dict:
                continue

            relative_position_bias_table_pretrained = state_dict[ckpt_key]
            relative_position_bias_table_current = state_dict_model[key]
            L1, nH1 = relative_position_bias_table_pretrained.size()
            L2, nH2 = relative_position_bias_table_current.size()
            if L1 != L2:
                src_size = int(L1**0.5)
                dst_size = int(L2**0.5)
                new_rel_pos_bias = resize_relative_position_bias_table(
                    src_size, dst_size,
                    relative_position_bias_table_pretrained, nH1)

                from mmcls.utils import get_root_logger
                logger = get_root_logger()
                logger.info('Resize the relative_position_bias_table from '
                            f'{state_dict[ckpt_key].shape} to '
                            f'{new_rel_pos_bias.shape}')
                state_dict[ckpt_key] = new_rel_pos_bias

                # The index buffer needs to be re-generated.
                index_buffer = ckpt_key.replace('bias_table', 'index')
                del state_dict[index_buffer]
def forward_train(self, x, gt_label):
    logger = get_root_logger()
    logger.warning("MMClassification doesn't support training the "
                   'distilled version of DeiT.')
    cls_token, dist_token = self.pre_logits(x)
    cls_score = (self.layers.head(cls_token) +
                 self.layers.head_dist(dist_token)) / 2
    losses = self.loss(cls_score, gt_label)
    return losses
def main(args):
    model_index_file = MMCLS_ROOT / 'model-index.yml'
    model_index = Config.fromfile(model_index_file)
    models = OrderedDict()
    for file in model_index.Import:
        metafile = Config.fromfile(MMCLS_ROOT / file)
        models.update({model.Name: model for model in metafile.Models})

    logger = get_root_logger(
        log_file='benchmark_test_image.log', log_level=logging.INFO)

    if args.models:
        patterns = [re.compile(pattern) for pattern in args.models]
        filter_models = {}
        for k, v in models.items():
            if any([re.match(pattern, k) for pattern in patterns]):
                filter_models[k] = v
        if len(filter_models) == 0:
            print('No model found, please specify models in:')
            print('\n'.join(models.keys()))
            return
        models = filter_models

    summary_data = {}
    for model_name, model_info in models.items():
        config = Path(model_info.Config)
        assert config.exists(), f'{model_name}: {config} not found.'
        logger.info(f'Processing: {model_name}')

        http_prefix = 'https://download.openmmlab.com/mmclassification/'
        dataset = model_info.Results[0]['Dataset']
        if args.checkpoint_root is not None:
            root = Path(args.checkpoint_root)
            checkpoint = root / model_info.Weights[len(http_prefix):]
            checkpoint = str(checkpoint)
        else:
            checkpoint = None

        try:
            # build the model from a config file and a checkpoint file
            result = inference(MMCLS_ROOT / config, checkpoint,
                               classes_map[dataset], args)
            result['valid'] = 'PASS'
        except Exception as e:
            logger.error(f'"{config}" : {repr(e)}')
            result = {'valid': 'FAIL'}

        summary_data[model_name] = result
        # show the results
        if args.show:
            imshow_infos(args.img, result, wait_time=args.wait_time)

    show_summary(summary_data)
def pre_logits(self, x):
    if isinstance(x, tuple):
        x = x[-1]
    from mmcls.utils import get_root_logger
    logger = get_root_logger()
    logger.warning(
        'The input of MultiLabelClsHead should be already logits. '
        'Please modify the backbone if you want to get pre-logits feature.')
    return x
def init_weights(self):
    super(Conformer, self).init_weights()
    logger = get_root_logger()

    if (isinstance(self.init_cfg, dict)
            and self.init_cfg['type'] == 'Pretrained'):
        # Suppress default init if a pretrained model is used.
        return
    else:
        logger.info(f'No pre-trained weights for '
                    f'{self.__class__.__name__}, '
                    f'training starts from scratch')
        self.apply(self._init_weights)
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # make sure save_root exists
    if args.save_path and not args.save_path.parent.exists():
        raise Exception(f'The save path is {args.save_path}, and directory '
                        f"'{args.save_path.parent}' does not exist.")

    # init logger
    logger = get_root_logger(log_level=cfg.log_level)
    logger.info('Lr config : \n\n' +
                pformat(cfg.lr_config, sort_dicts=False) + '\n')

    by_epoch = True if cfg.runner.type == 'EpochBasedRunner' else False

    # prepare data loader
    batch_size = cfg.data.samples_per_gpu * args.ngpus

    if args.dataset_size is None and by_epoch:
        from mmcls.datasets.builder import build_dataset
        dataset_size = len(build_dataset(cfg.data.train))
    else:
        dataset_size = args.dataset_size or batch_size

    fake_dataset = list(range(dataset_size))
    data_loader = DataLoader(fake_dataset, batch_size=batch_size)
    dataset_info = (f'\nDataset infos:'
                    f'\n - Dataset size: {dataset_size}'
                    f'\n - Samples per GPU: {cfg.data.samples_per_gpu}'
                    f'\n - Number of GPUs: {args.ngpus}'
                    f'\n - Total batch size: {batch_size}')
    if by_epoch:
        dataset_info += f'\n - Iterations per epoch: {len(data_loader)}'
    logger.info(dataset_info)

    # simulate the training process
    lr_list = simulate_train(data_loader, cfg, by_epoch)

    plot_curve(lr_list, args, len(data_loader), by_epoch)
def init_weights(self, pretrained=None):
    """Initialize the weights in backbone.

    Args:
        pretrained (str, optional): Path to pre-trained weights.
    """
    if isinstance(pretrained, str):
        logger = get_root_logger()
        load_checkpoint(self,
                        pretrained,
                        strict=False,
                        logger=logger,
                        map_location=torch.device('cpu'))
    elif pretrained is None:
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                kaiming_init(m)
            elif isinstance(
                    m, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)):
                constant_init(m, 1)
    else:
        raise TypeError('pretrained must be a str or None')
def _load_checkpoint(self, checkpoint, prefix=None, map_location=None):
    from mmcv.runner import (_load_checkpoint, _load_checkpoint_with_prefix,
                             load_state_dict)
    from mmcv.utils import print_log

    logger = get_root_logger()

    if prefix is None:
        print_log(f'load model from: {checkpoint}', logger=logger)
        checkpoint = _load_checkpoint(checkpoint, map_location, logger)
        # get state_dict from checkpoint
        if 'state_dict' in checkpoint:
            state_dict = checkpoint['state_dict']
        else:
            state_dict = checkpoint
    else:
        print_log(f'load {prefix} in model from: {checkpoint}', logger=logger)
        state_dict = _load_checkpoint_with_prefix(prefix, checkpoint,
                                                  map_location)

    if 'pos_embed' in state_dict.keys():
        ckpt_pos_embed_shape = state_dict['pos_embed'].shape
        if self.pos_embed.shape != ckpt_pos_embed_shape:
            print_log(
                f'Resize the pos_embed shape from {ckpt_pos_embed_shape} '
                f'to {self.pos_embed.shape}.',
                logger=logger)

            ckpt_pos_embed_shape = to_2tuple(
                int(np.sqrt(ckpt_pos_embed_shape[1] - 1)))
            pos_embed_shape = self.patch_embed.patches_resolution

            state_dict['pos_embed'] = self.resize_pos_embed(
                state_dict['pos_embed'], ckpt_pos_embed_shape,
                pos_embed_shape, self.interpolate_mode)

    # load state_dict
    load_state_dict(self, state_dict, strict=False, logger=logger)
def _prepare_pos_embed(self, state_dict, prefix, *args, **kwargs):
    name = prefix + 'pos_embed'
    if name not in state_dict.keys():
        return

    ckpt_pos_embed_shape = state_dict[name].shape
    if self.pos_embed.shape != ckpt_pos_embed_shape:
        from mmcls.utils import get_root_logger
        logger = get_root_logger()
        logger.info(
            f'Resize the pos_embed shape from {ckpt_pos_embed_shape} '
            f'to {self.pos_embed.shape}.')

        ckpt_pos_embed_shape = to_2tuple(
            int(np.sqrt(ckpt_pos_embed_shape[1] - self.num_extra_tokens)))
        pos_embed_shape = self.tokens_to_token.init_out_size

        state_dict[name] = resize_pos_embed(state_dict[name],
                                            ckpt_pos_embed_shape,
                                            pos_embed_shape,
                                            self.interpolate_mode,
                                            self.num_extra_tokens)
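# A hook with the signature above only has an effect if it runs before
# `load_state_dict` copies tensors into the module. The sketch below shows
# the usual wiring via PyTorch's `nn.Module._register_load_state_dict_pre_hook`;
# whether the surrounding backbone registers the hook exactly like this is an
# assumption, and the class itself is hypothetical.
import torch.nn as nn


class BackboneWithResizablePosEmbed(nn.Module):
    """Hypothetical backbone illustrating the pre-hook registration."""

    def __init__(self):
        super().__init__()
        # ... build self.pos_embed and the other layers here ...
        # Resize the incoming `pos_embed` before load_state_dict tries to
        # copy a tensor with a mismatched shape.
        self._register_load_state_dict_pre_hook(self._prepare_pos_embed)

    def _prepare_pos_embed(self, state_dict, prefix, *args, **kwargs):
        # Same logic as above: rewrite state_dict[prefix + 'pos_embed']
        # in place when its shape differs from self.pos_embed.
        ...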
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.options is not None:
        cfg.merge_from_dict(args.options)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))

    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    cfg_pretty_text = cfg.pretty_text
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg_pretty_text}')

    # set random seeds
    if args.seed is not None:
        logger.info(f'Set random seed to {args.seed}, '
                    f'deterministic: {args.deterministic}')
        set_random_seed(args.seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed

    model = build_classifier(cfg.model)

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))
    if cfg.checkpoint_config is not None:
        # save mmcls version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(mmcls_version=__version__,
                                          config=cfg_pretty_text,
                                          CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    model.CLASSES = datasets[0].CLASSES
    train_model(model,
                datasets,
                cfg,
                distributed=distributed,
                validate=(not args.no_validate),
                timestamp=timestamp,
                device='cpu' if args.device == 'cpu' else 'cuda',
                meta=meta)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta,
        ema_cfg=cfg.get('ema_cfg', None))

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                device=None,
                meta=None):
    logger = get_root_logger()

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    sampler_cfg = cfg.data.get('sampler', None)

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed,
            sampler_cfg=sampler_cfg) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        if device == 'cpu':
            warnings.warn(
                'The argument `device` is deprecated. To use cpu to train, '
                'please refer to https://mmclassification.readthedocs.io/en'
                '/latest/getting_started.html#train-a-model')
            model = model.cpu()
        else:
            model = MMDataParallel(model, device_ids=cfg.gpu_ids)
            if not model.device_ids:
                from mmcv import __version__, digit_version
                assert digit_version(__version__) >= (1, 4, 4), \
                    'To train with CPU, please confirm your mmcv version ' \
                    'is not lower than v1.4.4'

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if cfg.get('runner') is None:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    runner = build_runner(cfg.runner,
                          default_args=dict(model=model,
                                            batch_processor=None,
                                            optimizer=optimizer,
                                            work_dir=cfg.work_dir,
                                            logger=logger,
                                            meta=meta))

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config,
                                   optimizer_config,
                                   cfg.checkpoint_config,
                                   cfg.log_config,
                                   cfg.get('momentum_config', None),
                                   custom_hooks_config=cfg.get(
                                       'custom_hooks', None))
    if distributed and cfg.runner['type'] == 'EpochBasedRunner':
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=True)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook
        # `EvalHook` needs to be executed after `IterTimerHook`.
        # Otherwise, it will cause a bug when using `IterBasedRunner`.
        # Refers to https://github.com/open-mmlab/mmcv/issues/1261
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg),
                             priority='LOW')

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
def train_single_fold(args, cfg, fold, distributed, seed):
    # create the work_dir for the fold
    work_dir = osp.join(cfg.work_dir, f'fold{fold}')
    cfg.work_dir = work_dir

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))

    # wrap the dataset cfg
    train_dataset = dict(
        type='KFoldDataset',
        fold=fold,
        dataset=cfg.data.train,
        num_splits=args.num_splits,
        seed=seed,
    )
    val_dataset = dict(
        type='KFoldDataset',
        fold=fold,
        # Use the same dataset with training.
        dataset=copy.deepcopy(cfg.data.train),
        num_splits=args.num_splits,
        seed=seed,
        test_mode=True,
    )
    val_dataset['dataset']['pipeline'] = cfg.data.val.pipeline
    cfg.data.train = train_dataset
    cfg.data.val = val_dataset
    cfg.data.test = val_dataset

    # dump config
    stem, suffix = osp.basename(args.config).rsplit('.', 1)
    cfg.dump(osp.join(cfg.work_dir, f'{stem}_fold{fold}.{suffix}'))

    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')
    logger.info(
        f'-------- Cross-validation: [{fold+1}/{args.num_splits}] -------- ')

    # set random seeds
    # Use different seed in different folds
    logger.info(f'Set random seed to {seed + fold}, '
                f'deterministic: {args.deterministic}')
    set_random_seed(seed + fold, deterministic=args.deterministic)
    cfg.seed = seed + fold
    meta['seed'] = seed + fold

    model = build_classifier(cfg.model)
    model.init_weights()

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))

    meta.update(
        dict(mmcls_version=__version__,
             config=cfg.pretty_text,
             CLASSES=datasets[0].CLASSES,
             kfold=dict(fold=fold, num_splits=args.num_splits)))

    # add an attribute for visualization convenience
    train_model(model,
                datasets,
                cfg,
                distributed=distributed,
                validate=(not args.no_validate),
                timestamp=timestamp,
                device='cpu' if args.device == 'cpu' else 'cuda',
                meta=meta)
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # set multi-process settings
    setup_multi_processes(cfg)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.gpus is not None:
        cfg.gpu_ids = range(1)
        warnings.warn('`--gpus` is deprecated because we only support '
                      'single GPU mode in non-distributed training. '
                      'Use `gpus=1` now.')
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids[0:1]
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
                      'Because we only support single GPU mode in '
                      'non-distributed training. Use the first GPU '
                      'in `gpu_ids` now.')
    if args.gpus is None and args.gpu_ids is None:
        cfg.gpu_ids = [args.gpu_id]

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # dump config
    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    seed = init_random_seed(args.seed)
    logger.info(f'Set random seed to {seed}, '
                f'deterministic: {args.deterministic}')
    set_random_seed(seed, deterministic=args.deterministic)
    cfg.seed = seed
    meta['seed'] = seed

    model = build_classifier(cfg.model)
    model.init_weights()

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))

    # save mmcls version, config file content and class names in
    # runner as meta data
    meta.update(
        dict(mmcls_version=__version__,
             config=cfg.pretty_text,
             CLASSES=datasets[0].CLASSES))

    # add an attribute for visualization convenience
    train_model(model,
                datasets,
                cfg,
                distributed=distributed,
                validate=(not args.no_validate),
                timestamp=timestamp,
                device='cpu' if args.device == 'cpu' else 'cuda',
                meta=meta)
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.options is not None:
        cfg.merge_from_dict(args.options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    if args.net_params:
        tag, input_channels, block1, block2, block3, block4, last_channel = \
            args.net_params.split('-')
        input_channels = [int(item) for item in input_channels.split('_')]
        block1 = [int(item) for item in block1.split('_')]
        block2 = [int(item) for item in block2.split('_')]
        block3 = [int(item) for item in block3.split('_')]
        block4 = [int(item) for item in block4.split('_')]
        last_channel = int(last_channel)

        inverted_residual_setting = []
        for item in [block1, block2, block3, block4]:
            for _ in range(item[0]):
                inverted_residual_setting.append([
                    item[1], item[2:-int(len(item) / 2 - 1)],
                    item[-int(len(item) / 2 - 1):]
                ])

        cfg.model.backbone.input_channel = input_channels
        cfg.model.backbone.inverted_residual_setting = \
            inverted_residual_setting
        cfg.model.backbone.last_channel = last_channel
        cfg.model.head.in_channels = last_channel

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    if args.net_params:
        log_file = osp.join(cfg.work_dir, f'{args.net_params}.log')
    else:
        log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    if args.seed is not None:
        logger.info(f'Set random seed to {args.seed}, '
                    f'deterministic: {args.deterministic}')
        set_random_seed(args.seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed

    model = build_classifier(cfg.model)

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))
    if cfg.checkpoint_config is not None:
        # save mmcls version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmcls_version=__version__,
            config=cfg.pretty_text,
            CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    train_model(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=(not args.no_validate),
        timestamp=timestamp,
        meta=meta)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                device=None,
                meta=None):
    """Train a model.

    This method will build dataloaders, wrap the model and build a runner
    according to the provided config.

    Args:
        model (:obj:`torch.nn.Module`): The model to be run.
        dataset (:obj:`mmcls.datasets.BaseDataset` | List[BaseDataset]):
            The dataset used to train the model. It can be a single dataset,
            or a list of datasets with the same length as workflow.
        cfg (:obj:`mmcv.utils.Config`): The configs of the experiment.
        distributed (bool): Whether to train the model in a distributed
            environment. Defaults to False.
        validate (bool): Whether to do validation with
            :obj:`mmcv.runner.EvalHook`. Defaults to False.
        timestamp (str, optional): The timestamp string to auto generate the
            name of log files. Defaults to None.
        device (str, optional): TODO
        meta (dict, optional): A dict that records some important information,
            such as environment info and seed, which will be logged in the
            logger hook. Defaults to None.
    """
    logger = get_root_logger()

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # The default loader config
    loader_cfg = dict(
        # cfg.gpus will be ignored if distributed
        num_gpus=cfg.ipu_replicas if device == 'ipu' else len(cfg.gpu_ids),
        dist=distributed,
        round_up=True,
        seed=cfg.get('seed'),
        sampler_cfg=cfg.get('sampler', None),
    )
    # The overall dataloader settings
    loader_cfg.update({
        k: v
        for k, v in cfg.data.items() if k not in [
            'train', 'val', 'test', 'train_dataloader', 'val_dataloader',
            'test_dataloader'
        ]
    })
    # The specific dataloader settings
    train_loader_cfg = {**loader_cfg, **cfg.data.get('train_dataloader', {})}
    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        if device == 'cpu':
            warnings.warn(
                'The argument `device` is deprecated. To use cpu to train, '
                'please refer to https://mmclassification.readthedocs.io/en'
                '/latest/getting_started.html#train-a-model')
            model = model.cpu()
        elif device == 'ipu':
            model = model.cpu()
        else:
            model = MMDataParallel(model, device_ids=cfg.gpu_ids)
            if not model.device_ids:
                from mmcv import __version__, digit_version
                assert digit_version(__version__) >= (1, 4, 4), \
                    'To train with CPU, please confirm your mmcv version ' \
                    'is not lower than v1.4.4'

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if cfg.get('runner') is None:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    if device == 'ipu':
        if not cfg.runner['type'].startswith('IPU'):
            cfg.runner['type'] = 'IPU' + cfg.runner['type']
        if 'options_cfg' not in cfg.runner:
            cfg.runner['options_cfg'] = {}
        cfg.runner['options_cfg']['replicationFactor'] = cfg.ipu_replicas
        cfg.runner['fp16_cfg'] = cfg.get('fp16', None)

    runner = build_runner(cfg.runner,
                          default_args=dict(model=model,
                                            batch_processor=None,
                                            optimizer=optimizer,
                                            work_dir=cfg.work_dir,
                                            logger=logger,
                                            meta=meta))

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        if device == 'ipu':
            from mmcv.device.ipu import IPUFp16OptimizerHook
            optimizer_config = IPUFp16OptimizerHook(
                **cfg.optimizer_config,
                loss_scale=fp16_cfg['loss_scale'],
                distributed=distributed)
        else:
            optimizer_config = Fp16OptimizerHook(
                **cfg.optimizer_config,
                loss_scale=fp16_cfg['loss_scale'],
                distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config,
                                   optimizer_config,
                                   cfg.checkpoint_config,
                                   cfg.log_config,
                                   cfg.get('momentum_config', None),
                                   custom_hooks_config=cfg.get(
                                       'custom_hooks', None))
    if distributed and cfg.runner['type'] == 'EpochBasedRunner':
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        # The specific dataloader settings
        val_loader_cfg = {
            **loader_cfg,
            'shuffle': False,  # Not shuffle by default
            'sampler_cfg': None,  # Not use sampler by default
            **cfg.data.get('val_dataloader', {}),
        }
        val_dataloader = build_dataloader(val_dataset, **val_loader_cfg)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook
        # `EvalHook` needs to be executed after `IterTimerHook`.
        # Otherwise, it will cause a bug when using `IterBasedRunner`.
        # Refers to https://github.com/open-mmlab/mmcv/issues/1261
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg),
                             priority='LOW')

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
def main():
    args = parse_args()

    cfg = mmcv.Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # set multi-process settings
    setup_multi_processes(cfg)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None

    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids[0:1]
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
                      'Because we only support single GPU mode in '
                      'non-distributed testing. Use the first GPU '
                      'in `gpu_ids` now.')
    else:
        cfg.gpu_ids = [args.gpu_id]

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    dataset = build_dataset(cfg.data.test, default_args=dict(test_mode=True))
    # the extra round_up data will be removed during gpu/cpu collect
    data_loader = build_dataloader(dataset,
                                   samples_per_gpu=cfg.data.samples_per_gpu,
                                   workers_per_gpu=cfg.data.workers_per_gpu,
                                   dist=distributed,
                                   shuffle=False,
                                   round_up=True)

    # build the model and load checkpoint
    model = build_classifier(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')

    if 'CLASSES' in checkpoint.get('meta', {}):
        CLASSES = checkpoint['meta']['CLASSES']
    else:
        from mmcls.datasets import ImageNet
        warnings.simplefilter('once')
        warnings.warn('Class names are not saved in the checkpoint\'s '
                      'meta data, use imagenet by default.')
        CLASSES = ImageNet.CLASSES

    if not distributed:
        if args.device == 'cpu':
            model = model.cpu()
        else:
            model = MMDataParallel(model, device_ids=cfg.gpu_ids)
            if not model.device_ids:
                assert mmcv.digit_version(mmcv.__version__) >= (1, 4, 4), \
                    'To test with CPU, please confirm your mmcv version ' \
                    'is not lower than v1.4.4'
        model.CLASSES = CLASSES
        show_kwargs = {} if args.show_options is None else args.show_options
        outputs = single_gpu_test(model, data_loader, args.show,
                                  args.show_dir, **show_kwargs)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        results = {}
        logger = get_root_logger()
        if args.metrics:
            eval_results = dataset.evaluate(
                results=outputs,
                metric=args.metrics,
                metric_options=args.metric_options,
                logger=logger)
            results.update(eval_results)
            for k, v in eval_results.items():
                if isinstance(v, np.ndarray):
                    v = [round(out, 2) for out in v.tolist()]
                elif isinstance(v, Number):
                    v = round(v, 2)
                else:
                    raise ValueError(f'Unsupported metric type: {type(v)}')
                print(f'\n{k} : {v}')
        if args.out:
            if 'none' not in args.out_items:
                scores = np.vstack(outputs)
                pred_score = np.max(scores, axis=1)
                pred_label = np.argmax(scores, axis=1)
                pred_class = [CLASSES[lb] for lb in pred_label]
                res_items = {
                    'class_scores': scores,
                    'pred_score': pred_score,
                    'pred_label': pred_label,
                    'pred_class': pred_class
                }
                if 'all' in args.out_items:
                    results.update(res_items)
                else:
                    for key in args.out_items:
                        results[key] = res_items[key]
            print(f'\ndumping results to {args.out}')
            mmcv.dump(results, args.out)
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.options is not None:
        cfg.merge_from_dict(args.options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    if args.aml:
        data_store = os.environ['AZUREML_DATAREFERENCE_{}'.format(
            args.aml_data_store)]
        parse(cfg, data_store)
        if cfg.resume_from is not None:
            cfg.resume_from = os.path.join(data_store,
                                           args.aml_work_dir_prefix,
                                           cfg.resume_from)
        cfg.work_dir = os.path.join(data_store, args.aml_work_dir_prefix,
                                    cfg.work_dir)
        print('work_dir: ', cfg.work_dir)
        if 'data' in cfg.model.pretrained:
            cfg.model.pretrained = os.path.join(data_store,
                                                cfg.model.pretrained)

    # if not args.aml:
    #     # work_dir is determined in this priority:
    #     # CLI > segment in file > filename
    #     if args.work_dir is not None:
    #         # update configs according to CLI args if args.work_dir
    #         # is not None
    #         cfg.work_dir = args.work_dir
    #     elif cfg.get('work_dir', None) is None:
    #         # use config filename as default work_dir if cfg.work_dir is None
    #         cfg.work_dir = osp.join(
    #             './work_dirs', osp.splitext(osp.basename(args.config))[0])

    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # update gpu num
    if dist.is_initialized():
        cfg.gpus = dist.get_world_size()
    else:
        cfg.gpus = args.gpus

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        cfg.optimizer['lr'] = cfg.optimizer[
            'lr'] * cfg.gpus / 8 * cfg.data.samples_per_gpu / 32

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    if args.seed is not None:
        logger.info(f'Set random seed to {args.seed}, '
                    f'deterministic: {args.deterministic}')
        set_random_seed(args.seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed

    model = build_classifier(cfg.model)

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))
    if cfg.checkpoint_config is not None:
        # save mmcls version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(mmcls_version=__version__,
                                          config=cfg.pretty_text,
                                          CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    train_model(model,
                datasets,
                cfg,
                distributed=distributed,
                validate=(not args.no_validate),
                timestamp=timestamp,
                meta=meta)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                device='cuda',
                meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        if device == 'cuda':
            model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                                   device_ids=cfg.gpu_ids)
        elif device == 'cpu':
            model = model.cpu()
        else:
            raise ValueError(f'unsupported device name {device}.')

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if cfg.get('runner') is None:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    runner = build_runner(cfg.runner,
                          default_args=dict(model=model,
                                            batch_processor=None,
                                            optimizer=optimizer,
                                            work_dir=cfg.work_dir,
                                            logger=logger,
                                            meta=meta))

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config,
                                   optimizer_config,
                                   cfg.checkpoint_config,
                                   cfg.log_config,
                                   cfg.get('momentum_config', None),
                                   custom_hooks_config=cfg.get(
                                       'custom_hooks', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=True)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)