# Assumed imports (resnet50_person and PersonDataset are project-local
# modules; the rest follow the mmcv/torch APIs used below).
import os.path as osp
import shutil
import tempfile
from functools import partial

import mmcv
import torch
import torch.distributed as dist
from mmcv.parallel import MMDistributedDataParallel, collate
from mmcv.runner import get_dist_info
from torch.utils.data import DataLoader, DistributedSampler


class DistPersonExtractor(object):

    def __init__(self, weight_path):
        # build model
        weights = torch.load(weight_path, map_location='cpu')
        self.model = resnet50_person(weights)
        self.model = MMDistributedDataParallel(
            self.model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        self.model.eval()

    def batch_extract(self, imglist, img_prefix, imgs_per_gpu=1,
                      workers_per_gpu=4):
        # build dataset
        dataset = PersonDataset(imglist, img_prefix=img_prefix)
        # get dist info
        rank, world_size = get_dist_info()
        # build data loader
        sampler = DistributedSampler(dataset, world_size, rank, shuffle=False)
        data_loader = DataLoader(
            dataset,
            batch_size=imgs_per_gpu,
            sampler=sampler,
            num_workers=workers_per_gpu,
            collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
            pin_memory=False)
        results = self.multi_gpu_test(data_loader)
        return results

    def multi_gpu_test(self, data_loader):
        results = []
        dataset = data_loader.dataset
        rank, world_size = get_dist_info()
        if rank == 0:
            prog_bar = mmcv.ProgressBar(len(dataset))
        for i, data in enumerate(data_loader):
            with torch.no_grad():
                result = self.model(data)
            results.append(result.detach().cpu())
            if rank == 0:
                batch_size = data.size(0)
                for _ in range(batch_size * world_size):
                    prog_bar.update()
        # collect results from all ranks
        results = self.collect_results(torch.cat(results), len(dataset))
        return results

    def collect_results(self, result_part, size, tmpdir=None):
        rank, world_size = get_dist_info()
        # create a tmp dir if it is not specified
        if tmpdir is None:
            MAX_LEN = 512
            # 32 is the ASCII code of a space, used as padding
            dir_tensor = torch.full((MAX_LEN, ), 32,
                                    dtype=torch.uint8, device='cuda')
            if rank == 0:
                tmpdir = tempfile.mkdtemp()
                # pylint: disable=not-callable
                tmpdir = torch.tensor(bytearray(tmpdir.encode()),
                                      dtype=torch.uint8, device='cuda')
                # pylint: enable=not-callable
                dir_tensor[:len(tmpdir)] = tmpdir
            dist.broadcast(dir_tensor, 0)
            tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
        else:
            mmcv.mkdir_or_exist(tmpdir)
        # dump the part result to the dir (torch.save instead of mmcv.dump,
        # since the parts here are tensors)
        torch.save(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
        dist.barrier()
        # collect all parts
        if rank != 0:
            return None
        else:
            # load results of all parts from tmp dir
            part_list = []
            for i in range(world_size):
                part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
                part_list.append(torch.load(part_file))
            # interleave the parts to restore the original dataset order
            # used by the distributed sampler
            ordered_results = []
            for res in zip(*part_list):
                ordered_results.append(torch.stack(res))
            # the dataloader may pad some samples
            ordered_results = torch.cat(ordered_results)
            ordered_results = ordered_results[:size]
            # remove tmp dir
            shutil.rmtree(tmpdir)
            return ordered_results
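# A minimal usage sketch (weight path and image list are placeholders).
# MMDistributedDataParallel and the DistributedSampler require an
# initialised process group, e.g. via mmcv's init_dist when launched with
# `python -m torch.distributed.launch`:
if __name__ == '__main__':
    from mmcv.runner import init_dist
    init_dist('pytorch')
    extractor = DistPersonExtractor('weights/resnet50_person.pth')
    feats = extractor.batch_extract(['000001.jpg', '000002.jpg'],
                                    img_prefix='data/images',
                                    imgs_per_gpu=2)
    if feats is not None:  # collect_results returns None on non-zero ranks
        print('gathered features:', tuple(feats.shape))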
# Assumed mmdet 2.x-style imports for the benchmark helper below.
import time

import torch
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDistributedDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector


def measure_inference_speed(cfg, checkpoint, max_iter, log_interval,
                            is_fuse_conv_bn):
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    # build the dataloader
    samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
    if samples_per_gpu > 1:
        # Replace 'ImageToTensor' with 'DefaultFormatBundle'
        cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=1,
        # Because multiple processes will occupy additional CPU resources,
        # FPS statistics will be more unstable when workers_per_gpu is not 0.
        # It is reasonable to set workers_per_gpu to 0.
        workers_per_gpu=0,
        dist=True,
        shuffle=False)

    # build the model and load checkpoint
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    load_checkpoint(model, checkpoint, map_location='cpu')
    if is_fuse_conv_bn:
        model = fuse_conv_bn(model)

    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)
    model.eval()

    # the first several iterations may be very slow, so skip them
    num_warmup = 5
    pure_inf_time = 0
    fps = 0

    # benchmark for max_iter images and take the average
    for i, data in enumerate(data_loader):
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        with torch.no_grad():
            model(return_loss=False, rescale=True, **data)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        if i >= num_warmup:
            pure_inf_time += elapsed
            if (i + 1) % log_interval == 0:
                fps = (i + 1 - num_warmup) / pure_inf_time
                print(
                    f'Done image [{i + 1:<3}/ {max_iter}], '
                    f'fps: {fps:.1f} img / s, '
                    f'time per image: {1000 / fps:.1f} ms / img',
                    flush=True)

        if (i + 1) == max_iter:
            fps = (i + 1 - num_warmup) / pure_inf_time
            print(
                f'Overall fps: {fps:.1f} img / s, '
                f'time per image: {1000 / fps:.1f} ms / img',
                flush=True)
            break
    return fps
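# A minimal usage sketch (paths hypothetical). The loader is built with
# dist=True and the model is wrapped in MMDistributedDataParallel, so the
# process group must be initialised first, e.g. with mmcv's init_dist under
# `python -m torch.distributed.launch`:
if __name__ == '__main__':
    from mmcv import Config
    from mmcv.runner import init_dist
    init_dist('pytorch')
    cfg = Config.fromfile('configs/retinanet_r50_fpn_1x.py')
    fps = measure_inference_speed(cfg, 'checkpoints/latest.pth',
                                  max_iter=400, log_interval=50,
                                  is_fuse_conv_bn=False)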
# Note: parse_args, get_localhost, search_and_delete, nlogger, set_writer,
# set_random_seed, get_root_logger and train_detector are project-local
# helpers assumed to be in scope.
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # update configs according to CLI args
    if args.dir is not None:
        if args.dir.startswith('//'):
            cfg.work_dir = args.dir[2:]
        else:
            localhost = get_localhost().split('.')[0]
            # results from server saved to /private
            if 'gpu' in localhost:
                output_dir = '/private/huangchenxi/mmdet/outputs'
            else:
                output_dir = 'work_dirs'
            if args.dir.endswith('-c'):
                args.dir = args.dir[:-2]
                args.resume_from = search_and_delete(
                    os.path.join(output_dir, args.dir),
                    prefix=cfg.work_dir, suffix=localhost)
            cfg.work_dir += time.strftime("_%m%d_%H%M") + '_' + localhost
            cfg.work_dir = os.path.join(output_dir, args.dir, cfg.work_dir)
    if args.workers_per_gpu != -1:
        cfg.data['workers_per_gpu'] = args.workers_per_gpu
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus
    if args.profiler or args.speed:
        cfg.data.imgs_per_gpu = 1
    if cfg.resume_from or cfg.load_from:
        cfg.model['pretrained'] = None
    if args.test:
        cfg.data.train['ann_file'] = cfg.data.val['ann_file']
        cfg.data.train['img_prefix'] = cfg.data.val['img_prefix']

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
        num_gpus = args.gpus
        rank = 0
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        num_gpus = torch.cuda.device_count()
        rank, _ = get_dist_info()

    # scale the learning rate linearly with the total batch size
    # (SGD base lr is assumed tuned for batch 256; others for 8 GPUs x 2)
    if cfg.optimizer['type'] == 'SGD':
        cfg.optimizer['lr'] *= num_gpus * cfg.data.imgs_per_gpu / 256
    else:
        cfg.optimizer['lr'] *= (num_gpus / 8) * (cfg.data.imgs_per_gpu / 2)

    # init logger before other steps
    logger = get_root_logger(nlogger, cfg.log_level)
    if rank == 0:
        logger.set_logger_dir(cfg.work_dir, 'd')
    logger.info("Config: ------------------------------------------\n" +
                cfg.text)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_detector(cfg.model, train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    if rank == 0:
        # describe_vars(model)
        writer = set_writer(cfg.work_dir)
        # try:
        #     # describe_features(model.backbone)
        #     writer.add_graph(model, torch.zeros((1, 3, 800, 800)))
        # except (NotImplementedError, TypeError):
        #     logger.warn("Add graph failed.")
        # except Exception as e:
        #     logger.warn("Add graph failed:", e)

    if not args.graph and not args.profiler and not args.speed:
        if distributed:
            model = MMDistributedDataParallel(model.cuda())
        else:
            model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

        if isinstance(cfg.data.train, list):
            for t in cfg.data.train:
                logger.info("loading training set: " + str(t.ann_file))
            train_dataset = [build_dataset(t) for t in cfg.data.train]
            CLASSES = train_dataset[0].CLASSES
        else:
            logger.info("loading training set: " +
                        str(cfg.data.train.ann_file))
            train_dataset = build_dataset(cfg.data.train)
            logger.info("{} images loaded!".format(len(train_dataset)))
            CLASSES = train_dataset.CLASSES

        if cfg.checkpoint_config is not None:
            # save mmdet version, config file content and class names in
            # checkpoints as meta data
            cfg.checkpoint_config.meta = dict(mmdet_version=__version__,
                                              config=cfg.text,
                                              CLASSES=CLASSES)
        # add an attribute for visualization convenience
        if hasattr(model, 'module'):
            model.module.CLASSES = CLASSES
        else:
            model.CLASSES = CLASSES
        train_detector(model, train_dataset, cfg,
                       distributed=distributed,
                       validate=args.validate,
                       logger=logger,
                       runner_attr_dict={'task_name': args.dir})
    else:
        from mmcv.runner.checkpoint import load_checkpoint
        from mmdet.datasets import build_dataloader
        from mmdet.core.utils.model_utils import register_hooks
        from mmdet.apis.train import parse_losses

        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
        if args.profiler == 'test' or args.speed == 'test':
            model.eval()
            dataset = build_dataset(cfg.data.test)
        else:
            model.train()
            dataset = build_dataset(cfg.data.train)
        if cfg.load_from and (args.profiler or args.speed):
            logger.info('load checkpoint from %s', cfg.load_from)
            load_checkpoint(model, cfg.load_from, map_location='cpu',
                            strict=True)
        data_loader = build_dataloader(dataset,
                                       cfg.data.imgs_per_gpu,
                                       cfg.data.workers_per_gpu,
                                       cfg.gpus,
                                       dist=False,
                                       shuffle=False)
        if args.graph:
            id_dict = {}
            for name, parameter in model.named_parameters():
                id_dict[id(parameter)] = name
        for i, data_batch in enumerate(data_loader):
            if args.graph:
                outputs = model(**data_batch)
                loss, log_vars = parse_losses(outputs)
                get_dot = register_hooks(loss, id_dict)
                loss.backward()
                dot = get_dot()
                dot.save('graph.dot')
                break
            elif args.profiler:
                with torch.autograd.profiler.profile(use_cuda=True) as prof:
                    if args.profiler == 'train':
                        outputs = model(**data_batch)
                        loss, log_vars = parse_losses(outputs)
                        loss.backward()
                    else:
                        with torch.no_grad():
                            model(**data_batch, return_loss=False)
                if i == 20:
                    prof.export_chrome_trace('./trace.json')
                    logger.info(prof)
                    break
            elif args.speed:
                if args.speed == 'train':
                    start = time.perf_counter()
                    outputs = model(**data_batch)
                    loss, log_vars = parse_losses(outputs)
                    loss.backward()
                    torch.cuda.synchronize()
                    end = time.perf_counter()
                else:
                    start = time.perf_counter()
                    with torch.no_grad():
                        model(**data_batch, return_loss=False)
                    # wait for queued GPU work so the timing is accurate
                    torch.cuda.synchronize()
                    end = time.perf_counter()
                logger.info("{:.3f} s/iter, {:.1f} iters/s".format(
                    end - start, 1. / (end - start)))
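# Entry-point guard (assumed; the original excerpt ends with main() itself).
# As a worked example of the LR scaling above: with 8 GPUs and
# imgs_per_gpu=2, an SGD base lr of 0.1 (tuned for a total batch of 256)
# becomes 0.1 * 8 * 2 / 256 = 0.00625; for other optimizers the factor is
# (8 / 8) * (2 / 2) = 1, i.e. the configured lr is kept.
if __name__ == '__main__':
    main()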
# Assumed imports (RetinaNet, CascadeRCNN and CustomDataset are
# project-local; the rest follow the mmcv/torch APIs used below).
import json
import os.path as osp
import shutil
import tempfile
from functools import partial

import mmcv
import torch
import torch.distributed as dist
from mmcv.parallel import MMDistributedDataParallel, collate
from mmcv.runner import get_dist_info, load_checkpoint
from torch.utils.data import DataLoader, DistributedSampler


class DistPersonDetector(object):

    def __init__(self, arch, cfg_path, weight_path, img_scale=(1333, 800)):
        # build model
        if arch == 'retina':
            self.model = self.build_retinanet(cfg_path, weight_path)
        elif arch == 'rcnn':
            self.model = self.build_cascadercnn(cfg_path, weight_path)
        else:
            raise KeyError('{} is not supported now.'.format(arch))
        self.model = MMDistributedDataParallel(
            self.model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        self.model.eval()
        self.img_scale = img_scale

    def build_retinanet(self, cfg_path, weight_path):
        with open(cfg_path) as f:
            cfg = json.load(f)
        model = RetinaNet(**cfg)
        load_checkpoint(model, weight_path, map_location='cpu')
        return model

    def build_cascadercnn(self, cfg_path, weight_path):
        with open(cfg_path) as f:
            cfg = json.load(f)
        model = CascadeRCNN(**cfg)
        load_checkpoint(model, weight_path, map_location='cpu')
        return model

    def batch_detect(self, imglist, img_prefix, imgs_per_gpu=1,
                     workers_per_gpu=4, conf_thr=0.5):
        # build dataset
        dataset = CustomDataset(imglist, img_scale=self.img_scale,
                                img_prefix=img_prefix)
        # get dist info
        rank, world_size = get_dist_info()
        # build data loader
        sampler = DistributedSampler(dataset, world_size, rank, shuffle=False)
        data_loader = DataLoader(
            dataset,
            batch_size=imgs_per_gpu,
            sampler=sampler,
            num_workers=workers_per_gpu,
            collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
            pin_memory=False)
        results = self.multi_gpu_test(data_loader, conf_thr)
        return results

    def multi_gpu_test(self, data_loader, conf_thr):
        results = []
        dataset = data_loader.dataset
        rank, world_size = get_dist_info()
        if rank == 0:
            prog_bar = mmcv.ProgressBar(len(dataset))
        for i, data in enumerate(data_loader):
            with torch.no_grad():
                result = self.model(rescale=True, **data)
            # keep only detections above the confidence threshold
            result = result[result[:, -1] > conf_thr]
            results.append(result)
            if rank == 0:
                batch_size = (len(data['img_meta']._data)
                              if 'img_meta' in data else data['img'].size(0))
                for _ in range(batch_size * world_size):
                    prog_bar.update()
        # collect results from all ranks
        results = self.collect_results(results, len(dataset))
        return results

    def collect_results(self, result_part, size, tmpdir=None):
        rank, world_size = get_dist_info()
        # create a tmp dir if it is not specified
        if tmpdir is None:
            MAX_LEN = 512
            # 32 is the ASCII code of a space, used as padding
            dir_tensor = torch.full((MAX_LEN, ), 32,
                                    dtype=torch.uint8, device='cuda')
            if rank == 0:
                tmpdir = tempfile.mkdtemp()
                # pylint: disable=not-callable
                tmpdir = torch.tensor(bytearray(tmpdir.encode()),
                                      dtype=torch.uint8, device='cuda')
                # pylint: enable=not-callable
                dir_tensor[:len(tmpdir)] = tmpdir
            dist.broadcast(dir_tensor, 0)
            tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
        else:
            mmcv.mkdir_or_exist(tmpdir)
        # dump the part result to the dir
        mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
        dist.barrier()
        # collect all parts
        if rank != 0:
            return None
        else:
            # load results of all parts from tmp dir
            part_list = []
            for i in range(world_size):
                part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
                part_list.append(mmcv.load(part_file))
            # interleave the parts to restore the original dataset order
            ordered_results = []
            for res in zip(*part_list):
                ordered_results.extend(list(res))
            # the dataloader may pad some samples
            ordered_results = ordered_results[:size]
            # remove tmp dir
            shutil.rmtree(tmpdir)
            return ordered_results
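# A minimal usage sketch (paths hypothetical). As above, an initialised
# process group is required, e.g. via mmcv's init_dist under
# `python -m torch.distributed.launch`:
if __name__ == '__main__':
    from mmcv.runner import init_dist
    init_dist('pytorch')
    detector = DistPersonDetector('retina', 'cfgs/retina_person.json',
                                  'weights/retina_person.pth')
    dets = detector.batch_detect(['000001.jpg', '000002.jpg'],
                                 img_prefix='data/images', conf_thr=0.5)
    if dets is not None:  # only rank 0 receives the merged detections
        print(len(dets), 'images processed')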
# Assumed mmdet 1.x-style imports for the test wrapper below.
import os
import os.path as osp
import shutil
import tempfile

import mmcv
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, load_checkpoint
from mmdet.apis import init_dist
from mmdet.core import coco_eval, results2json, wrap_fp16_model
from mmdet.datasets import build_dataloader, build_dataset
from mmdet.models import build_detector


class TestMMDetModel(object):

    def __init__(self, config, checkpoint, out_pkl, show_result=False,
                 out_json=None, tmpdir=None, launcher='none',
                 eval_types=None):
        """
        :param config: test config file path; the training config file
            can be reused
        :param checkpoint: checkpoint file produced by the training process
        :param out_pkl: output result file
        :param show_result: whether to show results
        :param out_json: save predictions in the COCO json format
        :param tmpdir: tmp dir for writing intermediate results
        :param launcher: job launcher in ['none', 'pytorch', 'slurm', 'mpi']
        :param eval_types: nargs='+', in ['proposal', 'proposal_fast',
            'bbox', 'segm', 'keypoints']
        """
        assert out_pkl or show_result or out_json, \
            ('Please specify at least one operation (save or show the '
             'results) with the argument "out_pkl", "show_result" or '
             '"out_json"')

        self.show = show_result
        self.out_pkl = out_pkl
        self.out_json = out_json
        if out_pkl is not None and not out_pkl.endswith(('.pkl', '.pickle')):
            raise ValueError('The output file must be a pkl file.')
        if out_json is not None and out_json.endswith('.json'):
            out_json = out_json[:-5]

        cfg = mmcv.Config.fromfile(config)
        # set cudnn_benchmark
        if cfg.get('cudnn_benchmark', False):
            torch.backends.cudnn.benchmark = True
        cfg.model.pretrained = None
        cfg.data.test.test_mode = True

        # init distributed env first, since logger depends on the dist info.
        if launcher == 'none':
            self.distributed = False
        else:
            self.distributed = True
            init_dist(launcher, **cfg.dist_params)
        self.tmpdir = tmpdir
        self.eval_types = eval_types

        # build the dataloader
        # TODO: support multiple images per gpu
        #       (only minor changes are needed)
        dataset = build_dataset(cfg.data.test)
        self.dataset = dataset
        self.data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=self.distributed,
            shuffle=False)

        # build the model and load checkpoint
        self.model = build_detector(cfg.model, train_cfg=None,
                                    test_cfg=cfg.test_cfg)
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            wrap_fp16_model(self.model)
        self.checkpoint = load_checkpoint(self.model, checkpoint,
                                          map_location='cpu')
        # old versions did not save class info in checkpoints;
        # this workaround is for backward compatibility
        if 'CLASSES' in self.checkpoint['meta']:
            self.model.CLASSES = self.checkpoint['meta']['CLASSES']
        else:
            self.model.CLASSES = dataset.CLASSES

    def start_testing(self):
        if not self.distributed:
            self.model = MMDataParallel(self.model, device_ids=[0])
            outputs = self.__single_gpu_test()
        else:
            self.model = MMDistributedDataParallel(self.model.cuda())
            outputs = self.__multi_gpu_test()

        rank, _ = get_dist_info()
        if self.out_pkl and rank == 0:
            # make sure the output directory exists before dumping
            dir_name = os.path.dirname(self.out_pkl)
            if dir_name and not os.path.exists(dir_name):
                os.makedirs(dir_name)
            print('\nwriting results to {}'.format(self.out_pkl))
            mmcv.dump(outputs, self.out_pkl)
            eval_types = self.eval_types
            if eval_types:
                print('Starting evaluate {}'.format(' and '.join(eval_types)))
                if eval_types == ['proposal_fast']:
                    result_file = self.out_pkl
                    coco_eval(result_file, eval_types, self.dataset.coco)
                else:
                    if not isinstance(outputs[0], dict):
                        result_files = results2json(self.dataset, outputs,
                                                    self.out_pkl)
                        coco_eval(result_files, eval_types,
                                  self.dataset.coco)
                    else:
                        for name in outputs[0]:
                            print('\nEvaluating {}'.format(name))
                            outputs_ = [out[name] for out in outputs]
                            result_file = self.out_pkl + '.{}'.format(name)
                            result_files = results2json(self.dataset,
                                                        outputs_,
                                                        result_file)
                            coco_eval(result_files, eval_types,
                                      self.dataset.coco)

        # Save predictions in the COCO json format
        if self.out_json and rank == 0:
            if not isinstance(outputs[0], dict):
                results2json(self.dataset, outputs, self.out_json)
            else:
                for name in outputs[0]:
                    outputs_ = [out[name] for out in outputs]
                    result_file = self.out_json + '.{}'.format(name)
                    results2json(self.dataset, outputs_, result_file)

    def __single_gpu_test(self):
        self.model.eval()
        results = []
        dataset = self.data_loader.dataset
        prog_bar = mmcv.ProgressBar(len(dataset))
        for i, data in enumerate(self.data_loader):
            with torch.no_grad():
                result = self.model(return_loss=False,
                                    rescale=not self.show, **data)
            results.append(result)

            if self.show:
                self.model.module.show_result(data, result,
                                              dataset.img_norm_cfg)

            batch_size = data['img'][0].size(0)
            for _ in range(batch_size):
                prog_bar.update()
        return results

    def __multi_gpu_test(self):
        self.model.eval()
        results = []
        dataset = self.data_loader.dataset
        rank, world_size = get_dist_info()
        if rank == 0:
            prog_bar = mmcv.ProgressBar(len(dataset))
        for i, data in enumerate(self.data_loader):
            with torch.no_grad():
                result = self.model(return_loss=False, rescale=True, **data)
            results.append(result)

            if rank == 0:
                batch_size = data['img'][0].size(0)
                for _ in range(batch_size * world_size):
                    prog_bar.update()

        # collect results from all ranks
        results = self.__collect_results(results, len(dataset), self.tmpdir)
        return results

    @staticmethod
    def __collect_results(result_part, size, tmpdir=None):
        rank, world_size = get_dist_info()
        # create a tmp dir if it is not specified
        if tmpdir is None:
            MAX_LEN = 512
            # 32 is the ASCII code of a space, used as padding
            dir_tensor = torch.full((MAX_LEN, ), 32,
                                    dtype=torch.uint8, device='cuda')
            if rank == 0:
                tmpdir = tempfile.mkdtemp()
                tmpdir = torch.tensor(bytearray(tmpdir.encode()),
                                      dtype=torch.uint8, device='cuda')
                dir_tensor[:len(tmpdir)] = tmpdir
            dist.broadcast(dir_tensor, 0)
            tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
        else:
            mmcv.mkdir_or_exist(tmpdir)
        # dump the part result to the dir
        mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
        dist.barrier()
        # collect all parts
        if rank != 0:
            return None
        else:
            # load results of all parts from tmp dir
            part_list = []
            for i in range(world_size):
                part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
                part_list.append(mmcv.load(part_file))
            # interleave the parts to restore the original dataset order
            ordered_results = []
            for res in zip(*part_list):
                ordered_results.extend(list(res))
            # the dataloader may pad some samples
            ordered_results = ordered_results[:size]
            # remove tmp dir
            shutil.rmtree(tmpdir)
            return ordered_results
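# A minimal usage sketch (paths hypothetical). With launcher='none' this
# runs on a single GPU, so no distributed setup is required:
if __name__ == '__main__':
    tester = TestMMDetModel('configs/retinanet_r50_fpn_1x.py',
                            'checkpoints/latest.pth',
                            out_pkl='results/retina_test.pkl',
                            eval_types=['bbox'])
    tester.start_testing()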