def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, runner_type='EpochBasedRunner', persistent_workers=False, class_aware_sampler=None, **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. seed (int, Optional): Seed to be used. Default: None. runner_type (str): Type of runner. Default: `EpochBasedRunner` persistent_workers (bool): If True, the data loader will not shutdown the worker processes after a dataset has been consumed once. This allows to maintain the workers `Dataset` instances alive. This argument is only valid when PyTorch>=1.7.0. Default: False. class_aware_sampler (dict): Whether to use `ClassAwareSampler` during training. Default: None. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. """ rank, world_size = get_dist_info() if dist: # When model is :obj:`DistributedDataParallel`, # `batch_size` of :obj:`dataloader` is the # number of training samples on each GPU. batch_size = samples_per_gpu num_workers = workers_per_gpu else: # When model is obj:`DataParallel` # the batch size is samples on all the GPUS batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu if runner_type == 'IterBasedRunner': # this is a batch sampler, which can yield # a mini-batch indices each time. # it can be used in both `DataParallel` and # `DistributedDataParallel` if shuffle: batch_sampler = InfiniteGroupBatchSampler(dataset, batch_size, world_size, rank, seed=seed) else: batch_sampler = InfiniteBatchSampler(dataset, batch_size, world_size, rank, seed=seed, shuffle=False) batch_size = 1 sampler = None else: if class_aware_sampler is not None: # ClassAwareSampler can be used in both distributed and # non-distributed training. num_sample_class = class_aware_sampler.get('num_sample_class', 1) sampler = ClassAwareSampler(dataset, samples_per_gpu, world_size, rank, seed=seed, num_sample_class=num_sample_class) elif dist: # DistributedGroupSampler will definitely shuffle the data to # satisfy that images on each GPU are in the same group if shuffle: sampler = DistributedGroupSampler(dataset, samples_per_gpu, world_size, rank, seed=seed) else: sampler = DistributedSampler(dataset, world_size, rank, shuffle=False, seed=seed) else: sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None batch_sampler = None init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None if (TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) >= digit_version('1.7.0')): kwargs['persistent_workers'] = persistent_workers elif persistent_workers is True: warnings.warn('persistent_workers is invalid because your pytorch ' 'version is lower than 1.7.0') data_loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=partial( collate, samples_per_gpu=samples_per_gpu), pin_memory=kwargs.pop('pin_memory', False), worker_init_fn=init_fn, **kwargs) return data_loader
def main(): args = parse_args() assert args.out or args.show or args.show_dir, \ ('Please specify at least one operation (save or show the results) ' 'with the argument "--out", "--show" or "show-dir"') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = mmcv.Config.fromfile(args.config) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True if args.workers == 0: args.workers = cfg.data.workers_per_gpu # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # set random seeds if args.seed is not None: set_random_seed(args.seed) if 'all' in args.corruptions: corruptions = [ 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression', 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate' ] elif 'benchmark' in args.corruptions: corruptions = [ 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression' ] elif 'noise' in args.corruptions: corruptions = ['gaussian_noise', 'shot_noise', 'impulse_noise'] elif 'blur' in args.corruptions: corruptions = [ 'defocus_blur', 'glass_blur', 'motion_blur', 'zoom_blur' ] elif 'weather' in args.corruptions: corruptions = ['snow', 'frost', 'fog', 'brightness'] elif 'digital' in args.corruptions: corruptions = [ 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression' ] elif 'holdout' in args.corruptions: corruptions = ['speckle_noise', 'gaussian_blur', 'spatter', 'saturate'] elif 'None' in args.corruptions: corruptions = ['None'] args.severities = [0] else: corruptions = args.corruptions rank, _ = get_dist_info() aggregated_results = {} for corr_i, corruption in enumerate(corruptions): aggregated_results[corruption] = {} for sev_i, corruption_severity in enumerate(args.severities): # evaluate severity 0 (= no corruption) only once if corr_i > 0 and corruption_severity == 0: aggregated_results[corruption][0] = \ aggregated_results[corruptions[0]][0] continue test_data_cfg = copy.deepcopy(cfg.data.test) # assign corruption and severity if corruption_severity > 0: corruption_trans = dict(type='Corrupt', corruption=corruption, severity=corruption_severity) # TODO: hard coded "1", we assume that the first step is # loading images, which needs to be fixed in the future test_data_cfg['pipeline'].insert(1, corruption_trans) # print info print(f'\nTesting {corruption} at severity {corruption_severity}') # build the dataloader # TODO: support multiple images per gpu # (only minor changes are needed) dataset = build_dataset(test_data_cfg) data_loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=args.workers, dist=distributed, shuffle=False) # build the model and load checkpoint cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') # old versions did not save class info in checkpoints, # this walkaround is for backward compatibility if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES if not distributed: model = MMDataParallel(model, device_ids=[0]) show_dir = args.show_dir if show_dir is not None: show_dir = osp.join(show_dir, corruption) show_dir = osp.join(show_dir, str(corruption_severity)) if not osp.exists(show_dir): osp.makedirs(show_dir) outputs = single_gpu_test(model, data_loader, args.show, show_dir, args.show_score_thr) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir) if args.out and rank == 0: eval_results_filename = (osp.splitext(args.out)[0] + '_results' + osp.splitext(args.out)[1]) mmcv.dump(outputs, args.out) eval_types = args.eval if cfg.dataset_type == 'VOCDataset': if eval_types: for eval_type in eval_types: if eval_type == 'bbox': test_dataset = mmcv.runner.obj_from_dict( cfg.data.test, datasets) logger = 'print' if args.summaries else None mean_ap, eval_results = \ voc_eval_with_return( args.out, test_dataset, args.iou_thr, logger) aggregated_results[corruption][ corruption_severity] = eval_results else: print('\nOnly "bbox" evaluation \ is supported for pascal voc') else: if eval_types: print(f'Starting evaluate {" and ".join(eval_types)}') if eval_types == ['proposal_fast']: result_file = args.out else: if not isinstance(outputs[0], dict): result_files = dataset.results2json( outputs, args.out) else: for name in outputs[0]: print(f'\nEvaluating {name}') outputs_ = [out[name] for out in outputs] result_file = args.out + f'.{name}' result_files = dataset.results2json( outputs_, result_file) eval_results = coco_eval_with_return( result_files, eval_types, dataset.coco) aggregated_results[corruption][ corruption_severity] = eval_results else: print('\nNo task was selected for evaluation;' '\nUse --eval to select a task') # save results after each evaluation mmcv.dump(aggregated_results, eval_results_filename) if rank == 0: # print filan results print('\nAggregated results:') prints = args.final_prints aggregate = args.final_prints_aggregate if cfg.dataset_type == 'VOCDataset': get_results(eval_results_filename, dataset='voc', prints=prints, aggregate=aggregate) else: get_results(eval_results_filename, dataset='coco', prints=prints, aggregate=aggregate)
def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') for fold in range(args.folds): cfg = mmcv.Config.fromfile(args.config) cfg.data.train.split = cfg.data.train.split.replace('hsy', f'{fold}') cfg.data.val.split = cfg.data.val.split.replace('hsy', f'{fold}') cfg.data.test.split = cfg.data.test.split.replace('hsy', f'{fold}') if args.options is not None: cfg.merge_from_dict(args.options) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True if args.aug_test: # hard code index cfg.data.test.pipeline[1].img_ratios = [ 0.5, 0.75, 1.0, 1.25, 1.5, 1.75 ] cfg.data.test.pipeline[1].flip = True cfg.model.pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint cfg.model.train_cfg = None model = build_segmentor(cfg.model, test_cfg=cfg.get('test_cfg')) checkpoint = load_checkpoint(model, args.checkpoint.replace('hsy', f'{fold}'), map_location='cpu') model.CLASSES = checkpoint['meta']['CLASSES'] model.PALETTE = checkpoint['meta']['PALETTE'] efficient_test = False if args.eval_options is not None: efficient_test = args.eval_options.get('efficient_test', False) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir.replace('hsy', f'{fold}'), efficient_test) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect, efficient_test) rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: dataset.evaluate(outputs, args.eval, **kwargs)
def main(): args = parse_args() logger = get_root_logger('INFO') for arg in vars(args): logger.info(f"###################### {arg}: {getattr(args, arg)}") assert args.out or args.show or args.json_out, \ ('Please specify at least one operation (save or show the results) ' 'with the argument "--out" or "--show" or "--json_out"') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') if args.json_out is not None and args.json_out.endswith('.json'): args.json_out = args.json_out[:-5] cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # import pdb; pdb.set_trace() cfg.model.pretrained = None cfg.data.test.test_mode = True cfg.test_cfg.rcnn.zsd = args.zsd cfg.test_cfg.rcnn.gzsd = args.gzsd cfg.test_cfg.rcnn.score_thr = 0.05 # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) if cfg.test_cfg.rcnn.gzsd: dataset.cat_to_load = dataset.cat_ids data_loader = build_dataloader(dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') model.CLASSES = dataset.CLASSES logger.info(cfg.data.test.split) dataset_name = args.dataset if args.syn_weights: seen_bg_weight, seen_bg_bias = copy_synthesised_weights( model, args.syn_weights, dataset_name, split=cfg.data.test.split) model.bbox_head.seen_bg_weight = torch.from_numpy( seen_bg_weight).cuda() model.bbox_head.seen_bg_bias = torch.from_numpy(seen_bg_bias).cuda() if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show) else: model = MMDistributedDataParallel(model.cuda()) # if args.syn_weights: # seen_bg_weights, seen_bg_bias = copy_synthesised_weights(model, args.syn_weights, dataset_name, split=cfg.data.test.split) outputs = multi_gpu_test(model, data_loader, args.tmpdir) rank, _ = get_dist_info() if args.out and rank == 0: logger.info('\nwriting results to {}'.format(args.out)) mmcv.dump(outputs, args.out) mean_ap, _ = eval(args.out, dataset, dataset_name=dataset_name, split=cfg.data.test.split) f = open('results.txt', 'a') f.writelines(f"{mean_ap} -------- {args.syn_weights} \n" ) # import pdb; pdb.set_trace() f.close()
def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show, \ ('Please specify at least one operation (save/eval/format/show the ' 'results) with the argument "--out", "--eval", "--format_only" ' 'or "--show"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_module(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print('\nwriting results to {}'.format(args.out)) mmcv.dump(outputs, args.out) kwargs = {} if args.options is None else args.options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: dataset.evaluate(outputs, args.eval, **kwargs)
def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) if args.autoscale_lr: # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') # specify logger name, if we still use 'mmdet', the output info will be # filtered and won't be saved in the log_file # TODO: ugly workaround to judge whether we are training det or seg model if cfg.model.type in ['EncoderDecoder3D']: logger_name = 'mmseg' else: logger_name = 'mmdet' logger = get_root_logger(log_file=log_file, log_level=cfg.log_level, name=logger_name) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed meta['exp_name'] = osp.basename(args.config) model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) model.init_weights() logger.info(f'Model:\n{model}') datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) # in case we use a dataset wrapper if 'dataset' in cfg.data.train: val_dataset.pipeline = cfg.data.train.dataset.pipeline else: val_dataset.pipeline = cfg.data.train.pipeline # set test_mode=False here in deep copied config # which do not affect AP/AR calculation later # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa val_dataset.test_mode = False datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmdet version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict( mmdet_version=mmdet_version, mmseg_version=mmseg_version, mmdet3d_version=mmdet3d_version, config=cfg.pretty_text, CLASSES=datasets[0].CLASSES, PALETTE=datasets[0].PALETTE # for segmentors if hasattr(datasets[0], 'PALETTE') else None) # add an attribute for visualization convenience model.CLASSES = datasets[0].CLASSES train_model(model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta)
def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds seed = init_random_seed(args.seed) logger.info(f'Set random seed to {seed}, ' f'deterministic: {args.deterministic}') set_random_seed(seed, deterministic=args.deterministic) cfg.seed = seed meta['seed'] = seed meta['exp_name'] = osp.basename(args.config) model = build_detector(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) model.init_weights() datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) val_dataset.pipeline = cfg.data.train.pipeline datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmdet version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict(mmdet_version=__version__ + get_git_hash()[:7], CLASSES=datasets[0].CLASSES) # add an attribute for visualization convenience model.CLASSES = datasets[0].CLASSES train_detector(model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta)
help='the weight of the model') args = parser.parse_args() assert osp.isfile(args.weight) if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) init_dist('pytorch', backend='nccl') extractor = DistPlaceExtractor(args.weight) assert osp.isfile(args.listfile) assert osp.isdir(args.img_prefix) imglist = [x.strip() for x in open(args.listfile)] results = extractor.batch_extract( imglist, args.img_prefix, imgs_per_gpu=args.imgs_per_gpu, workers_per_gpu=args.workers_per_gpu) rank, world_size = get_dist_info() if rank == 0: if args.save_path is not None: save_dir = os.path.dirname(args.save_path) if not osp.isdir(save_dir): os.makedirs(save_dir) save_dict = {} for imgname, result in zip(imglist, results): feature = result.numpy() save_dict[imgname] = feature with open(args.save_path, 'wb') as f: pickle.dump(save_dict, f)
def evaluate_model(model_name, paper_arxiv_id, weights_url, weights_name, paper_results, config): print('---') print('Now Evaluating %s' % model_name) evaluator = COCOEvaluator( root='./.data/vision/coco', model_name=model_name, paper_arxiv_id=paper_arxiv_id, paper_results=paper_results) out = 'results.pkl' launcher = 'none' if out is not None and not out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = mmcv.Config.fromfile(config) cfg.data.test['ann_file'] = './.data/vision/coco/annotations/instances_val2017.json' cfg.data.test['img_prefix'] = './.data/vision/coco/val2017/' # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if launcher == 'none': distributed = False else: distributed = True init_dist(launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) print(cfg.data.test) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) local_checkpoint, _ = urllib.request.urlretrieve( weights_url, '%s/.cache/torch/%s' % (str(Path.home()), weights_name)) print(local_checkpoint) # '/home/ubuntu/GCNet/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth' checkpoint = load_checkpoint(model, local_checkpoint, map_location='cpu') # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES evaluator.reset_time() if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs, cache_exists = single_gpu_test(model, data_loader, False, evaluator) else: model = MMDistributedDataParallel(model.cuda()) outputs = multi_gpu_test(model, data_loader, args.tmpdir) if cache_exists: print('Cache exists: %s' % (evaluator.batch_hash)) evaluator.save() else: from mmdetection.mmdet.core import results2json rank, _ = get_dist_info() if out and rank == 0: print('\nwriting results to {}'.format(out)) mmcv.dump(outputs, out) eval_types = ['bbox'] if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = out else: if not isinstance(outputs[0], dict): result_files = results2json(dataset, outputs, out) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = out + '.{}'.format(name) result_files = results2json(dataset, outputs_, result_file) anns = json.load(open(result_files['bbox'])) evaluator.detections = [] evaluator.add(anns) evaluator.save()
def main(): args = parse_args() if args.out is not None and not args.out.endswith((".pkl", ".pickle")): raise ValueError("The output file must be a pkl file.") cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get("cudnn_benchmark", False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == "none": distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = get_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, ) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) load_checkpoint(model, args.checkpoint, map_location="cpu") if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show) else: model = MMDistributedDataParallel(model.cuda()) outputs = multi_gpu_test(model, data_loader, args.tmpdir) rank, _ = get_dist_info() if args.out and rank == 0: print("\nwriting results to {}".format(args.out)) mmcv.dump(outputs, args.out) eval_types = args.eval if eval_types: print("Starting evaluate {}".format(" and ".join(eval_types))) if eval_types == ["proposal_fast"]: result_file = args.out coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_file = args.out + ".json" results2json(dataset, outputs, result_file) coco_eval(result_file, eval_types, dataset.coco) else: for name in outputs[0]: print("\nEvaluating {}".format(name)) outputs_ = [out[name] for out in outputs] result_file = args.out + ".{}.json".format(name) results2json(dataset, outputs_, result_file) coco_eval(result_file, eval_types, dataset.coco)
def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = Config.fromfile(args.config) if cfg.get('USE_MMDET', False): from mmdet.apis import multi_gpu_test, single_gpu_test from mmdet.datasets import build_dataloader from mmdet.models import build_detector as build_model else: from mmtrack.apis import multi_gpu_test, single_gpu_test from mmtrack.datasets import build_dataloader from mmtrack.models import build_model if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # cfg.model.pretrains = None if hasattr(cfg.model, 'detector'): cfg.model.detector.pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test) data_loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) logger = get_logger('ParamsSearcher', log_file=args.log) # get all cases search_params = get_search_params(cfg.model.tracker, logger=logger) combinations = [p for p in product(*search_params.values())] search_cfgs = [] for c in combinations: search_cfg = dotty(cfg.model.tracker.copy()) for i, k in enumerate(search_params.keys()): search_cfg[k] = c[i] search_cfgs.append(dict(search_cfg)) print_log(f'Totally {len(search_cfgs)} cases.', logger) # init with the first one cfg.model.tracker = search_cfgs[0].copy() # build the model and load checkpoint if cfg.get('test_cfg', False): model = build_model(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) else: model = build_model(cfg.model) # We need call `init_weights()` to load pretained weights in MOT task. model.init_weights() fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) if args.checkpoint is not None: checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] if not hasattr(model, 'CLASSES'): model.CLASSES = dataset.CLASSES if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[0]) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) print_log(f'Record {cfg.search_metrics}.', logger) for i, search_cfg in enumerate(search_cfgs): if not distributed: model.module.tracker = build_tracker(search_cfg) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) else: model.module.tracker = build_tracker(search_cfg) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() # hard-code way to remove EvalHook args for key in ['interval', 'tmpdir', 'start', 'gpu_collect']: eval_kwargs.pop(key, None) eval_kwargs.update(dict(metric=args.eval, **kwargs)) results = dataset.evaluate(outputs, **eval_kwargs) _records = [] for k in cfg.search_metrics: if isinstance(results[k], float): _records.append(f'{(results[k]):.3f}') else: _records.append(f'{(results[k])}') print_log(f'{combinations[i]}: {_records}', logger)
def main(): args = parse_args() cfg = mmcv.Config.fromfile(args.config) # Load output_config from cfg output_config = cfg.get('output_config', {}) # Overwrite output_config from args.out output_config = merge_configs(output_config, dict(out=args.out)) # Load eval_config from cfg eval_config = cfg.get('eval_config', {}) # Overwrite eval_config from args.eval eval_config = merge_configs(eval_config, dict(metrics=args.eval)) # Add options from args.option eval_config = merge_configs(eval_config, args.options) assert output_config or eval_config, \ ('Please specify at least one operation (save or eval the ' 'results) with the argument "--out" or "--eval"') # set cudnn benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.data.test.test_mode = True if cfg.test_cfg is None: cfg.test_cfg = dict(average_clips=args.average_clips) else: cfg.test_cfg.average_clips = args.average_clips # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test, dict(test_mode=True)) data_loader = build_dataloader( dataset, videos_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if output_config: out = output_config['out'] print(f'\nwriting results to {out}') dataset.dump_results(outputs, **output_config) if eval_config: eval_res = dataset.evaluate(outputs, **eval_config) for name, val in eval_res.items(): if 'confusion' not in name: print(f'{name}: {val:.04f}') elif output_config: if 'fig' in name: confmat_dir = os.path.dirname(output_config['out']) val.savefig(os.path.join(confmat_dir, name+'.jpg'), format='jpg')
def main(): # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = parse(args.opt, is_train=False) # distributed testing settings if args.launcher == 'none': # non-distributed testing opt['dist'] = False print('Disable distributed testing.', flush=True) else: opt['dist'] = True if args.launcher == 'slurm' and 'dist_params' in opt: init_dist(args.launcher, **opt['dist_params']) else: init_dist(args.launcher) rank, world_size = get_dist_info() opt['rank'] = rank opt['world_size'] = world_size make_exp_dirs(opt) log_file = osp.join(opt['path']['log'], f"test_{opt['name']}_{get_time_str()}.log") logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file) logger.info(get_env_info()) logger.info(dict2str(opt)) # random seed seed = opt['manual_seed'] if seed is None: seed = random.randint(1, 10000) opt['manual_seed'] = seed logger.info(f'Random seed: {seed}') set_random_seed(seed + rank) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True # create test dataset and dataloader test_loaders = [] for phase, dataset_opt in sorted(opt['datasets'].items()): test_set = create_dataset(dataset_opt) test_loader = create_dataloader(test_set, dataset_opt, num_gpu=opt['num_gpu'], dist=opt['dist'], sampler=None, seed=seed) logger.info( f"Number of test images in {dataset_opt['name']}: {len(test_set)}") test_loaders.append(test_loader) # create model model = create_model(opt) for test_loader in test_loaders: test_set_name = test_loader.dataset.opt['name'] logger.info(f'Testing {test_set_name}...') model.validation(test_loader, current_iter=opt['name'], tb_logger=None, save_img=opt['val']['save_img'], save_h5=opt['val']['save_h5'])
def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.options is not None: cfg.merge_from_dict(args.options) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) if distributed: if args.autoscale_lr: old_lr = cfg.optimizer['lr'] samples_per_gpu = cfg.data['samples_per_gpu'] cfg.optimizer['lr'] = old_lr * world_size * samples_per_gpu / 256. logger.info( f"{world_size} gpus, autoscale lr from {old_lr} to {cfg.optimizer['lr']}" ) old_warmup_ratio = cfg.lr_config['warmup_ratio'] warmup_ratio = 0.1 / cfg.optimizer['lr'] cfg.lr_config['warmup_ratio'] = warmup_ratio logger.info( f"warmup ratio {old_warmup_ratio} is changed to {warmup_ratio}" ) #TODO compatible for other datasets warmup_iters = 5005. * 5 * 8 / world_size * 32 / samples_per_gpu old_warmup = cfg.lr_config['warmup_iters'] cfg.lr_config['warmup_iters'] = warmup_iters logger.info( f"warmup iters {old_warmup} is changed to {warmup_iters}") # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed model = build_classifier(cfg.model) datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) val_dataset.pipeline = cfg.data.train.pipeline datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmcls version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict(mmcls_version=__version__, config=cfg.pretty_text, CLASSES=datasets[0].CLASSES) # add an attribute for visualization convenience train_model(model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta)
def load_state_dict(module, state_dict, mapping, logger=None, mb_mapping=None): # noqa: C901 """Load state_dict to a module. This method is modified from :meth:`torch.nn.Module.load_state_dict`. :param module: Module that receives the state_dict :type module: OrderedDict :param state_dict: Weights :type state_dict: Module :param mapping: serail backbone with related pretrained weight name :type mapping: dict :param logger: Logger to log the error :type logger: logging.Logger :param mb_mapping: parallel backbone with related serial backbone :type mb_mapping: dict """ unexpected_keys = [] shape_mismatch_pairs = [] mb_keys = [] own_state = module.state_dict() for name, param in state_dict.items(): if name.find('mb') >= 0: continue if name not in own_state and name not in mapping.keys(): unexpected_keys.append(name) continue if name in mapping.keys(): if mapping[name] not in own_state: unexpected_keys.append(name) continue name = mapping[name] if isinstance(param, torch.nn.Parameter): # backwards compatibility for serialized parameters param = param.data if param.size() != own_state[name].size(): shape_mismatch_pairs.append( [name, own_state[name].size(), param.size()]) continue own_state[name].copy_(param) if mb_mapping is not None and name in mb_mapping.keys(): for mb_name in mb_mapping[name]: own_state[mb_name].copy_(param) mb_keys.append(mb_name) all_missing_keys = set(own_state.keys()) - set(state_dict.keys()) - set( mapping.values()) - set(mb_keys) unexpected_keys = set(unexpected_keys) - set(mb_keys) # ignore "num_batches_tracked" of BN layers missing_keys = [ key for key in all_missing_keys if 'num_batches_tracked' not in key ] rank, _ = get_dist_info() print_err = ['The model and loaded state dict do not match exactly\n'] if rank == 0: if unexpected_keys: print_err.append( 'unexpected key in source state_dict: {}\n'.format( ', '.join(unexpected_keys))) if missing_keys: print_err.append('missing keys in source state_dict: {}\n'.format( ', '.join(missing_keys))) if shape_mismatch_pairs: print_err.append('mismatched shape keys: {}\n'.format( ', '.join(shape_mismatch_pairs))) if logger is not None: logger.warning(print_err) else: print(print_err)
def main(): args = parse_args() cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test) data_loader = build_dataloader(dataset, samples_per_gpu=cfg.data.samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, round_up=False) # build the model and load checkpoint model = build_classifier(cfg.model) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) _ = load_checkpoint(model, args.checkpoint, map_location='cpu') if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: nums = [] results = {} for output in outputs: nums.append(output['num_samples'].item()) for topk, v in output['accuracy'].items(): if topk not in results: results[topk] = [] results[topk].append(v.item()) assert sum(nums) == len(dataset) for topk, accs in results.items(): avg_acc = np.average(accs, weights=nums) print(f'\n{topk} accuracy: {avg_acc:.2f}') if args.out and rank == 0: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out)
def train_model(model, dataset, cfg, distributed=False, validate=False, test=dict(test_best=False, test_last=False), timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (:obj:`Dataset`): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. test (dict): The testing option, with two keys: test_last & test_best. The value is True or False, indicating whether to test the corresponding checkpoint. Default: dict(test_best=False, test_last=False). timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), persistent_workers=cfg.data.get('persistent_workers', False), num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) dataloader_setting = dict(dataloader_setting, **cfg.data.get('train_dataloader', {})) if cfg.omnisource: # The option can override videos_per_gpu train_ratio = cfg.data.get('train_ratio', [1] * len(dataset)) omni_videos_per_gpu = cfg.data.get('omni_videos_per_gpu', None) if omni_videos_per_gpu is None: dataloader_settings = [dataloader_setting] * len(dataset) else: dataloader_settings = [] for videos_per_gpu in omni_videos_per_gpu: this_setting = cp.deepcopy(dataloader_setting) this_setting['videos_per_gpu'] = videos_per_gpu dataloader_settings.append(this_setting) data_loaders = [ build_dataloader(ds, **setting) for ds, setting in zip(dataset, dataloader_settings) ] else: data_loaders = [ build_dataloader(ds, **dataloader_setting) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = build_ddp(model, default_device, default_args=dict( device_ids=[int(os.environ['LOCAL_RANK'])], broadcast_buffers=False, find_unused_parameters=find_unused_parameters)) else: model = build_dp(model, default_device, default_args=dict(device_ids=cfg.gpu_ids)) # build runner optimizer = build_optimizer(model, cfg.optimizer) Runner = OmniSourceRunner if cfg.omnisource else EpochBasedRunner runner = Runner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None), custom_hooks_config=cfg.get( 'custom_hooks', None)) # multigrid setting multigrid_cfg = cfg.get('multigrid', None) if multigrid_cfg is not None: from mmaction.utils.multigrid import LongShortCycleHook multigrid_scheduler = LongShortCycleHook(cfg) runner.register_hook(multigrid_scheduler) logger.info('Finish register multigrid hook') # subbn3d aggregation is HIGH, as it should be done before # saving and evaluation from mmaction.utils.multigrid import SubBatchNorm3dAggregationHook subbn3d_aggre_hook = SubBatchNorm3dAggregationHook() runner.register_hook(subbn3d_aggre_hook, priority='VERY_HIGH') logger.info('Finish register subbn3daggre hook') # precise bn setting if cfg.get('precise_bn', False): precise_bn_dataset = build_dataset(cfg.data.train) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=1, # save memory and time persistent_workers=cfg.data.get('persistent_workers', False), num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) data_loader_precise_bn = build_dataloader(precise_bn_dataset, **dataloader_setting) precise_bn_hook = PreciseBNHook(data_loader_precise_bn, **cfg.get('precise_bn')) runner.register_hook(precise_bn_hook, priority='HIGHEST') logger.info('Finish register precisebn hook') if distributed: if cfg.omnisource: runner.register_hook(OmniSourceDistSamplerSeedHook()) else: runner.register_hook(DistSamplerSeedHook()) if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), persistent_workers=cfg.data.get('persistent_workers', False), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook(val_dataloader, **eval_cfg) if distributed \ else EvalHook(val_dataloader, **eval_cfg) runner.register_hook(eval_hook, priority='LOW') if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner_kwargs = dict() if cfg.omnisource: runner_kwargs = dict(train_ratio=train_ratio) runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs) if distributed: dist.barrier() time.sleep(5) if test['test_last'] or test['test_best']: best_ckpt_path = None if test['test_best']: ckpt_paths = [x for x in os.listdir(cfg.work_dir) if 'best' in x] ckpt_paths = [x for x in ckpt_paths if x.endswith('.pth')] if len(ckpt_paths) == 0: runner.logger.info('Warning: test_best set, but no ckpt found') test['test_best'] = False if not test['test_last']: return elif len(ckpt_paths) > 1: epoch_ids = [ int(x.split('epoch_')[-1][:-4]) for x in ckpt_paths ] best_ckpt_path = ckpt_paths[np.argmax(epoch_ids)] else: best_ckpt_path = ckpt_paths[0] if best_ckpt_path: best_ckpt_path = osp.join(cfg.work_dir, best_ckpt_path) test_dataset = build_dataset(cfg.data.test, dict(test_mode=True)) gpu_collect = cfg.get('evaluation', {}).get('gpu_collect', False) tmpdir = cfg.get('evaluation', {}).get('tmpdir', osp.join(cfg.work_dir, 'tmp')) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), persistent_workers=cfg.data.get('persistent_workers', False), num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('test_dataloader', {})) test_dataloader = build_dataloader(test_dataset, **dataloader_setting) names, ckpts = [], [] if test['test_last']: names.append('last') ckpts.append(None) if test['test_best'] and best_ckpt_path is not None: names.append('best') ckpts.append(best_ckpt_path) for name, ckpt in zip(names, ckpts): if ckpt is not None: runner.load_checkpoint(ckpt) outputs = multi_gpu_test(runner.model, test_dataloader, tmpdir, gpu_collect) rank, _ = get_dist_info() if rank == 0: out = osp.join(cfg.work_dir, f'{name}_pred.pkl') test_dataset.dump_results(outputs, out) eval_cfg = cfg.get('evaluation', {}) for key in [ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule', 'by_epoch', 'broadcast_bn_buffers' ]: eval_cfg.pop(key, None) eval_res = test_dataset.evaluate(outputs, **eval_cfg) runner.logger.info(f'Testing results of the {name} checkpoint') for metric_name, val in eval_res.items(): runner.logger.info(f'{metric_name}: {val:.04f}')
def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, round_up=True, seed=None, **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. round_up (bool): Whether to round up the length of dataset by adding extra samples to make it evenly divisible. Default: True. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. """ rank, world_size = get_dist_info() if dataset.triplet_sampler: assert dist num_instance = 4 assert samples_per_gpu%num_instance==0 triplet_seed = 772299 sampler = NaiveIdentitySampler(dataset.samples, samples_per_gpu, num_instance, rank, world_size, triplet_seed) batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, samples_per_gpu, True) #batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, samples_per_gpu*world_size, True) batch_size = samples_per_gpu num_workers = workers_per_gpu shuffle = False elif dist: sampler = DistributedSampler( dataset, world_size, rank, shuffle=shuffle, round_up=round_up) shuffle = False batch_size = samples_per_gpu num_workers = workers_per_gpu else: sampler = None batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None if dataset.triplet_sampler: data_loader = DataLoader( dataset, #batch_size=batch_size, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=False, #shuffle=shuffle, #worker_init_fn=init_fn, **kwargs) else: data_loader = DataLoader( dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=False, shuffle=shuffle, worker_init_fn=init_fn, **kwargs) return data_loader
def main(): args = parse_args() assert args.out or args.show or args.json_out, \ ('Please specify at least one operation (save or show the results) ' 'with the argument "--out" or "--show" or "--json_out"') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') if args.json_out is not None and args.json_out.endswith('.json'): args.json_out = args.json_out[:-5] cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None # cfg.model.rpn_pretrained = None # cfg.model.rcnn_pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader(dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show) else: model = MMDistributedDataParallel(model.cuda()) outputs = multi_gpu_test(model, data_loader, args.tmpdir) rank, _ = get_dist_info() if args.out and rank == 0: print('\nwriting results to {}'.format(args.out)) mmcv.dump(outputs, args.out) eval_types = args.eval data_name = args.data if data_name == 'coco': if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = args.out coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_files = results2json(dataset, outputs, args.out) coco_eval(result_files, eval_types, dataset.coco) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = args.out + '.{}'.format(name) result_files = results2json( dataset, outputs_, result_file) coco_eval(result_files, eval_types, dataset.coco) elif data_name in ['dota', 'hrsc2016']: eval_kwargs = cfg.get('evaluation', {}).copy() work_dir = osp.dirname(args.out) dataset.evaluate(outputs, work_dir, **eval_kwargs) # Save predictions in the COCO json format if args.json_out and rank == 0: if not isinstance(outputs[0], dict): results2json(dataset, outputs, args.json_out) else: for name in outputs[0]: outputs_ = [out[name] for out in outputs] result_file = args.json_out + '.{}'.format(name) results2json(dataset, outputs_, result_file)
def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show, ( "Please specify at least one operation (save/eval/format/show the " 'results) with the argument "--out", "--eval", "--format_only" ' 'or "--show"') if args.eval and args.format_only: raise ValueError("--eval and --format_only cannot be both specified") if args.out is not None and not args.out.endswith((".pkl", ".pickle")): raise ValueError("The output file must be a pkl file.") cfg = Config.fromfile(args.config) # set cudnn_benchmark if cfg.get("cudnn_benchmark", False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == "none": distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # set random seeds if args.seed is not None: set_random_seed(args.seed, deterministic=args.deterministic) # build the dataloader samples_per_gpu = cfg.data.test.pop("samples_per_gpu", 1) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, ) # build the model and load checkpoint cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get("test_cfg")) parameters = filter(lambda p: p.requires_grad, model.parameters()) model_engine, _, _, _ = initialize( args=args, model=model, model_parameters=parameters, ) _, client_state = model_engine.load_checkpoint(args.checkpoint, "ds") model = model_engine.module fp16_cfg = cfg.get("fp16", None) if fp16_cfg is not None: wrap_fp16_model(model) # checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_module(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if "CLASSES" in client_state: model.CLASSES = client_state["CLASSES"] else: model.CLASSES = dataset.CLASSES if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, ) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print(f"\nwriting results to {args.out}") mmcv.dump(outputs, args.out) kwargs = {} if args.options is None else args.options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: dataset.evaluate(outputs, args.eval, **kwargs)
def main(): args = parse_args() assert ( args.out or args.eval or args.format_only or args.show or args.show_dir), ( 'Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir".') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified.') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) setup_multi_processes(cfg) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True if cfg.model.get('pretrained'): cfg.model.pretrained = None if cfg.model.get('neck'): if isinstance(cfg.model.neck, list): for neck_cfg in cfg.model.neck: if neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated samples_per_gpu = (cfg.data.get('test_dataloader', {})).get( 'samples_per_gpu', cfg.data.get('samples_per_gpu', 1)) if samples_per_gpu > 1: cfg = disable_text_recog_aug_test(cfg) cfg = replace_image_to_tensor(cfg) # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': cfg.gpu_ids = [args.gpu_id] distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test, dict(test_mode=True)) # step 1: give default values and override (if exist) from cfg.data loader_cfg = { **dict(seed=cfg.get('seed'), drop_last=False, dist=distributed), **({} if torch.__version__ != 'parrots' else dict( prefetch_num=2, pin_memory=False, )), **dict((k, cfg.data[k]) for k in [ 'workers_per_gpu', 'seed', 'prefetch_num', 'pin_memory', 'persistent_workers', ] if k in cfg.data) } test_loader_cfg = { **loader_cfg, **dict(shuffle=False, drop_last=False), **cfg.data.get('test_dataloader', {}), **dict(samples_per_gpu=samples_per_gpu) } data_loader = build_dataloader(dataset, **test_loader_cfg) # build the model and load checkpoint cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) model = revert_sync_batchnorm(model) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=cfg.gpu_ids) is_kie = cfg.model.type in ['SDMGR'] outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, is_kie, args.show_score_thr) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() # hard-code way to remove EvalHook args for key in [ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule' ]: eval_kwargs.pop(key, None) eval_kwargs.update(dict(metric=args.eval, **kwargs)) print(dataset.evaluate(outputs, **eval_kwargs))
def main(): # args = ['./DOTA_configs/DOTA_hbb/faster_rcnn_r50_fpn_2x_dota.py', # './results/faster_rcnn_hbb_tv/epoch_24.pth', # '--out', './results/faster_rcnn_hbb_tv/results.pkl', # '--eval', 'bbox' # ] # args = ['./DOTA_configs/DOTA_hbb/faster_rcnn_r50_fpn_2x_dota.py', # './results/faster_rcnn_hbb_tv/epoch_24.pth', # '--out', './results/faster_rcnn_hbb_tv/results.pkl', # ] args = [ './DOTA_configs/DOTA_obb/retinanet_r50_fpn_2x_ad.py', './results/DOTA_retina_ad_obb_tv/epoch_24.pth', '--out', './results/DOTA_retina_ad_obb_tv/results.pkl', ] # # # args = ['./DOTA_configs/DIOR/retinanet_r50_fpn_2x.py', # './results/DIOR_retinanet_full/epoch_24.pth', # '--out', './results/DIOR_retinanet_full/results.pkl', # '--eval', 'bbox' # ] # args = ['./DOTA_configs/DOTA_obb/s2anet_r50_fpn_1x_dota.py', # './results/DOTA_s2anet_obb_tv/epoch_24.pth', # '--out', './results/DOTA_s2anet_obb_tv/results.pkl', # '--eval', 'bbox' # ] # args = ['./DOTA_configs/DIOR_voc_test/retinanet_r50_fpn_2x.py', # './results/DIOR_retinanet_full/epoch_24.pth', # '--out', './results/DIOR_retinanet_full/results.pkl', # '--eval', 'mAP' # ] args = [ './DOTA_configs/DOTA_obb/faster_rcnn_RoITrans_r50_fpn_1x_dota.py', './results/DOTA_faster_rcnn_RoITrans_tv/epoch_12.pth', '--out', './results/DOTA_faster_rcnn_RoITrans_tv/results.pkl', ] args = parse_args(args) print(args) assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = Config.fromfile(args.config) print(cfg.pretty_text) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None if cfg.model.get('neck'): if isinstance(cfg.model.neck, list): for neck_cfg in cfg.model.neck: if neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader(dataset, samples_per_gpu=samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES ########################################################################### if not LOAD_RESULT: if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, args.show_score_thr) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) else: import pickle as pkl with open(str(args.out), 'rb') as f: outputs = pkl.load(f) a = 0 ########################################################################### rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() # hard-code way to remove EvalHook args for key in ['interval', 'tmpdir', 'start', 'gpu_collect']: eval_kwargs.pop(key, None) eval_kwargs.update(dict(metric=args.eval, **kwargs)) ###################################### # add class wise # eval_kwargs['classwise']=True # eval_kwargs['proposal_nums'] = (100, 300, 1000) s = str(dataset.evaluate(outputs, **eval_kwargs)) work_dir = os.path.split(str(args.out))[0] eval_file = os.path.join(work_dir, 'eval_results.txt') with open(eval_file, 'wt+') as f: f.write(str(s)) print(s)
def main(): # options parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() opt = parse(args.opt, is_train=True) # distributed training settings if args.launcher == 'none': # non-distributed training opt['dist'] = False print('Disable distributed training.') else: opt['dist'] = True if args.launcher == 'slurm' and 'dist_params' in opt: init_dist(args.launcher, **opt['dist_params']) else: init_dist(args.launcher) rank, world_size = get_dist_info() opt['rank'] = rank opt['world_size'] = world_size # load resume states if exists if opt['path'].get('resume_state'): device_id = torch.cuda.current_device() resume_state = torch.load( opt['path']['resume_state'], map_location=lambda storage, loc: storage.cuda(device_id)) else: resume_state = None # mkdir and loggers if resume_state is None: make_exp_dirs(opt) log_file = './' + opt['name'] + '.log' logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file) logger.info(get_env_info()) logger.info(dict2str(opt)) # initialize tensorboard logger and wandb logger tb_logger = None if opt['logger'].get('use_tb_logger') and 'debug' not in opt['name']: log_dir = './tb_logger/' + opt['name'] if resume_state is None and opt['rank'] == 0: mkdir_and_rename(log_dir) tb_logger = init_tb_logger(log_dir=log_dir) if (opt['logger'].get('wandb') is not None) and (opt['logger']['wandb'].get('project') is not None) and ('debug' not in opt['name']): assert opt['logger'].get('use_tb_logger') is True, ( 'should turn on tensorboard when using wandb') init_wandb_logger(opt) # random seed # print('1') seed = opt['manual_seed'] # if seed is None: # seed = random.randint(1, 10000) # opt['manual_seed'] = seed set_random_seed(seed + rank) torch.backends.cudnn.benchmark = True # torch.backends.cudnn.deterministic = True # create train and val dataloaders train_loader, val_loader = None, None for phase, dataset_opt in opt['datasets'].items(): if phase == 'train': dataset_enlarge_ratio = dataset_opt.get('dataset_enlarge_ratio', 1) train_set = create_dataset(dataset_opt) train_sampler = EnlargedSampler(train_set, world_size, rank, dataset_enlarge_ratio) train_loader = create_dataloader(train_set, dataset_opt, num_gpu=opt['num_gpu'], dist=opt['dist'], sampler=train_sampler, seed=seed) num_iter_per_epoch = math.ceil( len(train_set) * dataset_enlarge_ratio / (dataset_opt['batch_size_per_gpu'] * opt['world_size'])) total_iters = int(opt['train']['total_iter']) total_epochs = math.ceil(total_iters / (num_iter_per_epoch)) logger.info( 'Training statistics:' f'\n\tNumber of train images: {len(train_set)}' f'\n\tDataset enlarge ratio: {dataset_enlarge_ratio}' f'\n\tBatch size per gpu: {dataset_opt["batch_size_per_gpu"]}' f'\n\tWorld size (gpu number): {opt["world_size"]}' f'\n\tRequire iter number per epoch: {num_iter_per_epoch}' f'\n\tTotal epochs: {total_epochs}; iters: {total_iters}.') elif phase == 'val': val_set = create_dataset(dataset_opt) val_loader = create_dataloader(val_set, dataset_opt, num_gpu=opt['num_gpu'], dist=opt['dist'], sampler=None, seed=seed) logger.info( f'Number of val images/folders in {dataset_opt["name"]}: ' f'{len(val_set)}') else: raise ValueError(f'Dataset phase {phase} is not recognized.') assert train_loader is not None # create model if resume_state: check_resume(opt, resume_state['iter']) # modify pretrain_model paths model = create_model(opt) # resume training if resume_state: logger.info(f"Resuming training from epoch: {resume_state['epoch']}, " f"iter: {resume_state['iter']}.") start_epoch = resume_state['epoch'] current_iter = resume_state['iter'] model.resume_training(resume_state) # handle optimizers and schedulers else: start_epoch = 0 current_iter = 0 # create message logger (formatted outputs) msg_logger = MessageLogger(opt, current_iter, tb_logger) # dataloader prefetcher prefetch_mode = opt['datasets']['train'].get('prefetch_mode') if prefetch_mode is None or prefetch_mode == 'cpu': prefetcher = CPUPrefetcher(train_loader) elif prefetch_mode == 'cuda': prefetcher = CUDAPrefetcher(train_loader, opt) logger.info(f'Use {prefetch_mode} prefetch dataloader') if opt['datasets']['train'].get('pin_memory') is not True: raise ValueError('Please set pin_memory=True for CUDAPrefetcher.') else: raise ValueError(f'Wrong prefetch_mode {prefetch_mode}.' "Supported ones are: None, 'cuda', 'cpu'.") # training logger.info( f'Start training from epoch: {start_epoch}, iter: {current_iter}') data_time, iter_time = time.time(), time.time() start_time = time.time() for epoch in range(start_epoch, total_epochs + 1): train_sampler.set_epoch(epoch) prefetcher.reset() train_data = prefetcher.next() while train_data is not None: data_time = time.time() - data_time current_iter += 1 if current_iter > total_iters: break # update learning rate model.update_learning_rate(current_iter, warmup_iter=opt['train'].get( 'warmup_iter', -1)) # training model.feed_data(train_data) model.optimize_parameters(current_iter) iter_time = time.time() - iter_time # log if current_iter % opt['logger']['print_freq'] == 0: log_vars = {'epoch': epoch, 'iter': current_iter} log_vars.update({'lrs': model.get_current_learning_rate()}) log_vars.update({'time': iter_time, 'data_time': data_time}) log_vars.update(model.get_current_log()) msg_logger(log_vars) # save models and training states if current_iter % opt['logger']['save_checkpoint_freq'] == 0: logger.info('Saving models and training states.') model.save(epoch, current_iter) # validation if (opt['val']['val_freq'] is not None and current_iter % opt['val']['val_freq'] == 0): # or current_iter==1: model.validation(val_loader, current_iter, tb_logger, opt['val']['save_img']) data_time = time.time() iter_time = time.time() train_data = prefetcher.next() # end of iter # end of epoch consumed_time = str( datetime.timedelta(seconds=int(time.time() - start_time))) logger.info(f'End of training. Time consumed: {consumed_time}') logger.info('Save the latest model.') model.save(epoch=-1, current_iter=-1) # -1 stands for the latest if opt['val']['val_freq'] is not None: model.validation(val_loader, current_iter, tb_logger, opt['val']['save_img']) if tb_logger: tb_logger.close()
def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. """ rank, world_size = get_dist_info() if dist: # DistributedGroupSampler will definitely shuffle the data to satisfy # that images on each GPU are in the same group if shuffle: sampler = DistributedGroupSampler(dataset, samples_per_gpu, world_size, rank) else: sampler = DistributedSampler(dataset, world_size, rank, shuffle=False) batch_size = samples_per_gpu num_workers = workers_per_gpu else: sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None data_loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, collate_fn=partial( collate, samples_per_gpu=samples_per_gpu), pin_memory=False, worker_init_fn=init_fn, **kwargs) return data_loader
def main(): args = parse_args() cfg = Config.fromfile(args.config) cfg.merge_from_dict(args.cfg_options) # Load output_config from cfg output_config = cfg.get('output_config', {}) # Overwrite output_config from args.out output_config = Config._merge_a_into_b(dict(out=args.out), output_config) # Load eval_config from cfg eval_config = cfg.get('eval_config', {}) # Overwrite eval_config from args.eval eval_config = Config._merge_a_into_b(dict(metrics=args.eval), eval_config) # Add options from args.eval_options eval_config = Config._merge_a_into_b(args.eval_options, eval_config) assert output_config or eval_config, \ ('Please specify at least one operation (save or eval the ' 'results) with the argument "--out" or "--eval"') if output_config.get('out', None): out = output_config['out'] # make sure the dirname of the output path exists mmcv.mkdir_or_exist(osp.dirname(out)) _, suffix = osp.splitext(out) assert suffix in file_handlers, \ 'The format of the output file should be json, pickle or yaml' # set cudnn benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.data.test.test_mode = True if cfg.test_cfg is None: cfg.test_cfg = dict(average_clips=args.average_clips) else: # You can set average_clips during testing, it will override the # original settting if args.average_clips is not None: cfg.test_cfg.average_clips = args.average_clips # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test, dict(test_mode=True)) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 2), workers_per_gpu=cfg.data.get('workers_per_gpu', 0), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('test_dataloader', {})) data_loader = build_dataloader(dataset, **dataloader_setting) # build the model and load checkpoint model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if output_config.get('out', None): out = output_config['out'] print(f'\nwriting results to {out}') dataset.dump_results(outputs, **output_config) if eval_config: eval_res = dataset.evaluate(outputs, **eval_config) for name, val in eval_res.items(): print(f'{name}: {val:.04f}')
def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = mmcv.Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set multi-process settings setup_multi_processes(cfg) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True if args.aug_test: # hard code index cfg.data.test.pipeline[1].img_ratios = [ 0.5, 0.75, 1.0, 1.25, 1.5, 1.75 ] cfg.data.test.pipeline[1].flip = True cfg.model.pretrained = None cfg.data.test.test_mode = True if args.gpu_id is not None: cfg.gpu_ids = [args.gpu_id] # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': cfg.gpu_ids = [args.gpu_id] distributed = False if len(cfg.gpu_ids) > 1: warnings.warn(f'The gpu-ids is reset from {cfg.gpu_ids} to ' f'{cfg.gpu_ids[0:1]} to avoid potential error in ' 'non-distribute testing time.') cfg.gpu_ids = cfg.gpu_ids[0:1] else: distributed = True init_dist(args.launcher, **cfg.dist_params) rank, _ = get_dist_info() # allows not to create if args.work_dir is not None and rank == 0: mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) if args.aug_test: json_file = osp.join(args.work_dir, f'eval_multi_scale_{timestamp}.json') else: json_file = osp.join(args.work_dir, f'eval_single_scale_{timestamp}.json') elif rank == 0: work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) mmcv.mkdir_or_exist(osp.abspath(work_dir)) timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) if args.aug_test: json_file = osp.join(work_dir, f'eval_multi_scale_{timestamp}.json') else: json_file = osp.join(work_dir, f'eval_single_scale_{timestamp}.json') # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) # The default loader config loader_cfg = dict( # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) # The overall dataloader settings loader_cfg.update({ k: v for k, v in cfg.data.items() if k not in [ 'train', 'val', 'test', 'train_dataloader', 'val_dataloader', 'test_dataloader' ] }) test_loader_cfg = { **loader_cfg, 'samples_per_gpu': 1, 'shuffle': False, # Not shuffle by default **cfg.data.get('test_dataloader', {}) } # build the dataloader data_loader = build_dataloader(dataset, **test_loader_cfg) # build the model and load checkpoint cfg.model.train_cfg = None model = build_segmentor(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if 'CLASSES' in checkpoint.get('meta', {}): model.CLASSES = checkpoint['meta']['CLASSES'] else: print('"CLASSES" not found in meta, use dataset.CLASSES instead') model.CLASSES = dataset.CLASSES if 'PALETTE' in checkpoint.get('meta', {}): model.PALETTE = checkpoint['meta']['PALETTE'] else: print('"PALETTE" not found in meta, use dataset.PALETTE instead') model.PALETTE = dataset.PALETTE # clean gpu memory when starting a new evaluation. torch.cuda.empty_cache() eval_kwargs = {} if args.eval_options is None else args.eval_options # Deprecated efficient_test = eval_kwargs.get('efficient_test', False) if efficient_test: warnings.warn( '``efficient_test=True`` does not have effect in tools/test.py, ' 'the evaluation and format results are CPU memory efficient by ' 'default') eval_on_format_results = (args.eval is not None and 'cityscapes' in args.eval) if eval_on_format_results: assert len(args.eval) == 1, 'eval on format results is not ' \ 'applicable for metrics other than ' \ 'cityscapes' if args.format_only or eval_on_format_results: if 'imgfile_prefix' in eval_kwargs: tmpdir = eval_kwargs['imgfile_prefix'] else: tmpdir = '.format_cityscapes' eval_kwargs.setdefault('imgfile_prefix', tmpdir) mmcv.mkdir_or_exist(tmpdir) else: tmpdir = None if not distributed: warnings.warn( 'SyncBN is only supported with DDP. To be compatible with DP, ' 'we convert SyncBN to BN. Please use dist_train.sh which can ' 'avoid this error.') if not torch.cuda.is_available(): assert digit_version(mmcv.__version__) >= digit_version('1.4.4'), \ 'Please use MMCV >= 1.4.4 for CPU training!' model = revert_sync_batchnorm(model) model = MMDataParallel(model, device_ids=cfg.gpu_ids) results = single_gpu_test(model, data_loader, args.show, args.show_dir, False, args.opacity, pre_eval=args.eval is not None and not eval_on_format_results, format_only=args.format_only or eval_on_format_results, format_args=eval_kwargs) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) results = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect, False, pre_eval=args.eval is not None and not eval_on_format_results, format_only=args.format_only or eval_on_format_results, format_args=eval_kwargs) rank, _ = get_dist_info() if rank == 0: if args.out: warnings.warn( 'The behavior of ``args.out`` has been changed since MMSeg ' 'v0.16, the pickled outputs could be seg map as type of ' 'np.array, pre-eval results or file paths for ' '``dataset.format_results()``.') print(f'\nwriting results to {args.out}') mmcv.dump(results, args.out) if args.eval: eval_kwargs.update(metric=args.eval) metric = dataset.evaluate(results, **eval_kwargs) metric_dict = dict(config=args.config, metric=metric) mmcv.dump(metric_dict, json_file, indent=4) if tmpdir is not None and eval_on_format_results: # remove tmp dir when cityscapes evaluation shutil.rmtree(tmpdir)
def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True args.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader dataset = build_dataset(cfg.data.test, dict(test_mode=True)) dataloader_setting = dict( samples_per_gpu=512, #1 512 workers_per_gpu=cfg.data.get('workers_per_gpu', 1), dist=distributed, shuffle=False, drop_last=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('test_dataloader', {})) data_loader = build_dataloader(dataset, **dataloader_setting) # build the model and load checkpoint model = build_posenet(cfg.model) # print(model) # model=model.cuda() # summary(model,input_size=(3, 256, 192)) # with open('../logs/dark-HRNet-w32.out','w+') as f: # f.write(model) # f.close() fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) load_checkpoint(model, args.checkpoint, map_location='cpu') #结果是因为文件名写错了,只能说mmcv报的错误实在是不够明显。 # model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')['state_dict']) #"meta", "state_dict", "optimizer" if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() eval_config = cfg.get('evaluation', {}) eval_config = merge_configs(eval_config, dict(metric=args.eval)) if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) results = dataset.evaluate(outputs, args.work_dir, **eval_config) for k, v in sorted(results.items()): print(f'{k}: {v}')
def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None if cfg.model.get('neck'): if isinstance(cfg.model.neck, list): for neck_cfg in cfg.model.neck: if neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader(dataset, samples_per_gpu=samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, args.show_score_thr) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() # hard-code way to remove EvalHook args for key in ['interval', 'tmpdir', 'start', 'gpu_collect']: eval_kwargs.pop(key, None) eval_kwargs.update(dict(metric=args.eval, **kwargs)) print(dataset.evaluate(outputs, **eval_kwargs))
def main(): args = parse_args() cfg = Config.fromfile(args.config) cfg_samples_per_gpu = cfg.data.samples_per_gpu if args.update_config is not None: cfg.merge_from_dict(args.update_config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) if args.autoscale_lr: # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True if torch.cuda.is_available(): init_dist(args.launcher, **cfg.dist_params) else: cfg.dist_params['backend'] = 'gloo' init_dist_cpu(args.launcher, **cfg.dist_params) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) if args.tensorboard_dir is not None: hooks = [ hook for hook in cfg.log_config.hooks if hook.type == 'TensorboardLoggerHook' ] if hooks: hooks[0].log_dir = args.tensorboard_dir else: logger.warning('Failed to find TensorboardLoggerHook') # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') if cfg.get('nncf_config'): check_nncf_is_enabled() logger.info('NNCF config: {}'.format(cfg.nncf_config)) meta.update(get_nncf_metadata()) # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) datasets = [build_dataset(cfg.data.train)] dataset_len_per_gpu = sum(len(dataset) for dataset in datasets) if distributed: dataset_len_per_gpu = dataset_len_per_gpu // get_dist_info()[1] assert dataset_len_per_gpu > 0 if cfg.data.samples_per_gpu == 'auto': if torch.cuda.is_available(): logger.info('Auto-selection of samples per gpu (batch size).') cfg.data.samples_per_gpu = determine_max_batch_size( cfg, distributed, dataset_len_per_gpu) logger.info( f'Auto selected batch size: {cfg.data.samples_per_gpu} {dataset_len_per_gpu}' ) cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) else: logger.warning( 'Auto-selection of batch size is not implemented for CPU.') logger.warning( 'Setting batch size to value taken from configuration file.') cfg.data.samples_per_gpu = cfg_samples_per_gpu if dataset_len_per_gpu < cfg.data.samples_per_gpu: cfg.data.samples_per_gpu = dataset_len_per_gpu logger.warning( f'Decreased samples_per_gpu to: {cfg.data.samples_per_gpu} ' f'because of dataset length: {dataset_len_per_gpu} ' f'and gpus number: {get_dist_info()[1]}') if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) val_dataset.pipeline = cfg.data.train.pipeline datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmdet version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict(mmdet_version=__version__, config=cfg.pretty_text, CLASSES=datasets[0].CLASSES) # also save nncf status in the checkpoint -- it is important, # since it is used in wrap_nncf_model for loading NNCF-compressed models if cfg.get('nncf_config'): nncf_metadata = get_nncf_metadata() cfg.checkpoint_config.meta.update(nncf_metadata) else: # cfg.checkpoint_config is None assert not cfg.get('nncf_config'), ( "NNCF is enabled, but checkpoint_config is not set -- " "cannot store NNCF metainfo into checkpoints") # add an attribute for visualization convenience model.CLASSES = datasets[0].CLASSES train_detector(model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta)
def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False, efficient_test=False, pre_eval=False, format_only=False, format_args={}): """Test model with multiple gpus by progressive mode. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (utils.data.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. The same path is used for efficient test. Default: None. gpu_collect (bool): Option to use either gpu or cpu to collect results. Default: False. efficient_test (bool): Whether save the results as local numpy files to save CPU memory during evaluation. Mutually exclusive with pre_eval and format_results. Default: False. pre_eval (bool): Use dataset.pre_eval() function to generate pre_results for metric evaluation. Mutually exclusive with efficient_test and format_results. Default: False. format_only (bool): Only format result for results commit. Mutually exclusive with pre_eval and efficient_test. Default: False. format_args (dict): The args for format_results. Default: {}. Returns: list: list of evaluation pre-results or list of save file names. """ if efficient_test: warnings.warn( 'DeprecationWarning: ``efficient_test`` will be deprecated, the ' 'evaluation is CPU memory friendly with pre_eval=True') mmcv.mkdir_or_exist('.efficient_test') # when none of them is set true, return segmentation results as # a list of np.array. assert [efficient_test, pre_eval, format_only].count(True) <= 1, \ '``efficient_test``, ``pre_eval`` and ``format_only`` are mutually ' \ 'exclusive, only one of them could be true .' model.eval() results = [] dataset = data_loader.dataset # The pipeline about how the data_loader retrieval samples from dataset: # sampler -> batch_sampler -> indices # The indices are passed to dataset_fetcher to get data from dataset. # data_fetcher -> collate_fn(dataset[index]) -> data_sample # we use batch_sampler to get correct data idx # batch_sampler based on DistributedSampler, the indices only point to data # samples of related machine. loader_indices = data_loader.batch_sampler rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) for batch_indices, data in zip(loader_indices, data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) if efficient_test: result = [np2tmp(_, tmpdir='.efficient_test') for _ in result] if format_only: result = dataset.format_results(result, indices=batch_indices, **format_args) if pre_eval: # TODO: adapt samples_per_gpu > 1. # only samples_per_gpu=1 valid now result = dataset.pre_eval(result, indices=batch_indices) results.extend(result) if rank == 0: batch_size = len(result) * world_size for _ in range(batch_size): prog_bar.update() # collect results from all ranks if gpu_collect: results = collect_results_gpu(results, len(dataset)) else: results = collect_results_cpu(results, len(dataset), tmpdir) return results