def validation(self, epoch):
    """Evaluate ``self.model`` on the first test dataset.

    A validation loader is built with ``make_val_loader(cfg)`` and stored on
    ``self.val_loader``. Results are written below
    ``<cfg.OUTPUT_DIR>/inference/<dataset_name>``.

    Args:
        epoch: Epoch identifier; only used in the progress message.
    """
    print('Validation of epoch {}:'.format(epoch))
    torch.cuda.empty_cache()  # TODO check if it helps

    # Boxes are always evaluated; masks and keypoints only when enabled.
    enabled_types = ["bbox"]
    if cfg.MODEL.MASK_ON:
        enabled_types.append("segm")
    if cfg.MODEL.KEYPOINT_ON:
        enabled_types.append("keypoints")
    iou_types = tuple(enabled_types)

    # Only the first configured test dataset is used.
    dataset_name = cfg.DATASETS.TEST[0]
    output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
    mkdir(output_folder)

    self.val_loader = make_val_loader(cfg)

    if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON:
        box_only = False
    else:
        box_only = cfg.MODEL.RPN_ONLY

    inference(
        self.model,
        self.val_loader,
        dataset_name=dataset_name,
        iou_types=iou_types,
        box_only=box_only,
        device=cfg.MODEL.DEVICE,
        expected_results=cfg.TEST.EXPECTED_RESULTS,
        expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
        output_folder=output_folder,
    )
    synchronize()
def run_test(cfg, model, distributed):
    """Run inference on every dataset listed in ``cfg.DATASETS.TEST``.

    Args:
        cfg: Configuration node read for model flags, datasets and paths.
        model: Model to evaluate; its ``.module`` attribute is used when
            ``distributed`` is true.
        distributed: Whether the run is distributed.
    """
    if distributed:
        model = model.module
    torch.cuda.empty_cache()  # TODO check if it helps

    enabled_types = ["bbox"]
    if cfg.MODEL.MASK_ON:
        enabled_types.append("segm")
    if cfg.MODEL.KEYPOINT_ON:
        enabled_types.append("keypoints")
    iou_types = tuple(enabled_types)

    # One output folder per dataset; entries stay None without an OUTPUT_DIR.
    dataset_names = cfg.DATASETS.TEST
    output_folders = []
    for name in dataset_names:
        folder = None
        if cfg.OUTPUT_DIR:
            folder = os.path.join(cfg.OUTPUT_DIR, "inference", name)
            mkdir(folder)
        output_folders.append(folder)

    data_loaders_val = make_data_loader(
        cfg, is_train=False, is_distributed=distributed)

    for folder, name, loader in zip(
            output_folders, dataset_names, data_loaders_val):
        if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON:
            box_only = False
        else:
            box_only = cfg.MODEL.RPN_ONLY
        inference(
            model,
            loader,
            dataset_name=name,
            iou_types=iou_types,
            box_only=box_only,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=folder,
        )
        synchronize()
def run_test(cfg, model, distributed):
    """Evaluate a backbone/FCOS model pair on the first test dataset.

    Args:
        cfg: Configuration node read for model flags, datasets and paths.
        model: Mapping with ``"backbone"`` and ``"fcos"`` entries. When
            ``distributed`` is true each entry's ``.module`` is used.
        distributed: Whether the run is distributed.

    Returns:
        The result of ``all_gather`` applied to what ``inference`` returned.
    """
    # Unwrap the parallel wrappers only in distributed mode; otherwise pass
    # the modules through as-is. (Previously the dict stayed empty when not
    # distributed, so inference received no model at all.)
    model_test = {
        name: model[name].module if distributed else model[name]
        for name in ("backbone", "fcos")
    }
    torch.cuda.empty_cache()  # TODO check if it helps

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints",)

    # Only the first configured test dataset is evaluated.
    dataset_name = cfg.DATASETS.TEST[0]
    # Stays None when no OUTPUT_DIR is configured (previously the name was
    # unbound in that case and the inference call raised).
    output_folder = None
    if cfg.OUTPUT_DIR:
        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
        mkdir(output_folder)

    data_loaders_val = make_data_loader(
        cfg, is_train=False, is_distributed=distributed)
    results = inference(
        model_test,
        data_loaders_val[0],
        dataset_name=dataset_name,
        iou_types=iou_types,
        box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
        device=cfg.MODEL.DEVICE,
        expected_results=cfg.TEST.EXPECTED_RESULTS,
        expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
        output_folder=output_folder,
    )
    synchronize()
    results = all_gather(results)
    return results
def main():
    """Training entry point: parse arguments, configure, train, then test.

    Distributed mode is enabled when the ``WORLD_SIZE`` environment variable
    is greater than 1. Testing runs after training unless ``--skip-test`` is
    given.
    """
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file", default="", metavar="FILE",
        help="path to config file", type=str)
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test", dest="skip_test",
        help="Do not test the final model", action="store_true")
    parser.add_argument(
        "opts", help="Modify config options using the command-line",
        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    # Config file first, then command-line overrides, then lock the config.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("fcos_core", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    if args.skip_test:
        return
    run_test(cfg, model, args.distributed)
def cache_url(url, model_dir=None, progress=True):
    r"""Download the file at ``url`` into ``model_dir`` and return its local path.

    If the file is already present in `model_dir`, nothing is downloaded.
    Only the main process downloads; every process then waits in
    ``synchronize()`` before the path is returned. The file is not
    deserialized here.

    The filename part of the URL should follow the naming convention
    ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
    digits of the SHA256 hash of the contents of the file. The hash prefix is
    passed on to the download helper; prefixes shorter than 6 characters are
    ignored. If the URL has a fragment, the fragment is used as the filename.

    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.

    Args:
        url (string): URL of the object to download
        model_dir (string, optional): directory in which to save the object
        progress (bool, optional): whether or not to display a progress bar to stderr

    Returns:
        string: path of the cached file inside `model_dir`

    Example:
        >>> cached_file = fcos_core.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    """
    if model_dir is None:
        torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
        model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models'))
    # exist_ok avoids a FileExistsError when several processes reach this
    # point at once and all see the directory as missing.
    os.makedirs(model_dir, exist_ok=True)

    parts = urlparse(url)
    if parts.fragment != "":
        filename = parts.fragment
    else:
        filename = os.path.basename(parts.path)
    if filename == "model_final.pkl":
        # workaround as pre-trained Caffe2 models from Detectron have all the same filename
        # so make the full path the filename by replacing / with _
        filename = parts.path.replace("/", "_")
    cached_file = os.path.join(model_dir, filename)

    if not os.path.exists(cached_file) and is_main_process():
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = HASH_REGEX.search(filename)
        if hash_prefix is not None:
            hash_prefix = hash_prefix.group(1)
            # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
            # which matches the hash PyTorch uses. So we skip the hash matching
            # if the hash_prefix is less than 6 characters
            if len(hash_prefix) < 6:
                hash_prefix = None
        _download_url_to_file(url, cached_file, hash_prefix, progress=progress)

    # Non-main processes wait here until the download has finished.
    synchronize()
    return cached_file
def run_test(cfg, model, distributed, test_epoch=None):
    """Run inference on every test dataset and append a summary per dataset.

    For each dataset with an output folder, a text summary built by
    ``get_neat_inference_result`` is appended to ``summaryStrs.txt`` in that
    folder, prefixed with the epoch.

    Args:
        cfg: Configuration node read for model flags, datasets and paths.
        model: Model to evaluate; its ``.module`` attribute is used when
            ``distributed`` is true.
        distributed: Whether the run is distributed.
        test_epoch: Value written after ``Epoch:`` in the summary file.
    """
    if distributed:
        model = model.module
    torch.cuda.empty_cache()  # TODO check if it helps

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints",)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder

    data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
    for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
        inference_result = inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
        )
        synchronize()

        # Without an OUTPUT_DIR the folder is None and there is nowhere to
        # write the summary (previously this raised a TypeError).
        if output_folder is None:
            continue
        # NOTE(review): inference presumably returns nothing on some ranks in
        # distributed runs -- confirm; skip rather than index into None.
        if inference_result is None:
            continue

        summaryStrs = get_neat_inference_result(inference_result[2][0])
        summaryStrFinal = '\n'.join(summaryStrs)
        summaryStrFinal = '\n\nEpoch: ' + str(test_epoch) + '\n' + summaryStrFinal
        # Append so that summaries from successive epochs accumulate.
        with open(os.path.join(output_folder, 'summaryStrs.txt'), 'a') as f_summaryStrs:
            f_summaryStrs.write(summaryStrFinal)
def main():
    """Entry point for evaluating an ONNX FCOS model on the test datasets.

    Parses the command line, merges the configuration, forces a single
    data-loading worker, wraps the ONNX file in ``ONNX_FCOS`` and runs
    ``inference`` on every dataset in ``cfg.DATASETS.TEST``.
    """
    parser = argparse.ArgumentParser(description="Test onnx models of FCOS")
    parser.add_argument(
        "--config-file",
        default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "--onnx-model",
        default="fcos_imprv_R_50_FPN_1x.onnx",
        metavar="FILE",
        help="path to the onnx model",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    # The onnx model can only be used with DATALOADER.NUM_WORKERS = 0
    cfg.DATALOADER.NUM_WORKERS = 0
    cfg.freeze()

    save_dir = ""
    logger = setup_logger("fcos_core", save_dir, get_rank())
    logger.info(cfg)
    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    model = ONNX_FCOS(args.onnx_model, cfg)
    model.to(cfg.MODEL.DEVICE)

    enabled_types = ["bbox"]
    if cfg.MODEL.MASK_ON:
        enabled_types.append("segm")
    if cfg.MODEL.KEYPOINT_ON:
        enabled_types.append("keypoints")
    iou_types = tuple(enabled_types)

    # One output folder per dataset; entries stay None without an OUTPUT_DIR.
    dataset_names = cfg.DATASETS.TEST
    output_folders = []
    for name in dataset_names:
        folder = None
        if cfg.OUTPUT_DIR:
            folder = os.path.join(cfg.OUTPUT_DIR, "inference", name)
            mkdir(folder)
        output_folders.append(folder)

    data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=False)
    for folder, name, loader in zip(output_folders, dataset_names, data_loaders_val):
        if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON:
            box_only = False
        else:
            box_only = cfg.MODEL.RPN_ONLY
        inference(
            model,
            loader,
            dataset_name=name,
            iou_types=iou_types,
            box_only=box_only,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=folder,
        )
        synchronize()
def main():
    """Inference entry point: load a checkpoint and evaluate the test datasets.

    Distributed mode is enabled when the ``WORLD_SIZE`` environment variable
    is greater than 1. The model is built from the merged configuration,
    weights are loaded from ``cfg.MODEL.WEIGHT`` and ``inference`` is run on
    every dataset in ``cfg.DATASETS.TEST``.
    """
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default=
        "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    save_dir = ""
    logger = setup_logger("fcos_core", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)

    output_dir = cfg.OUTPUT_DIR
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
    _ = checkpointer.load(cfg.MODEL.WEIGHT)

    # Boxes and masks are always evaluated here. MASK_ON is deliberately not
    # consulted: "segm" is already present, and appending it again would put
    # a duplicate entry in iou_types.
    iou_types = ("bbox", "segm")
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder

    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)
    for output_folder, dataset_name, data_loader_val in zip(
            output_folders, dataset_names, data_loaders_val):
        inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.SIPMASK_ON
            or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
        )
        synchronize()
def main():
    """Training entry point: parse arguments, configure, train, then test."""
    # Parse command-line arguments, e.g.
    # --config-file configs/fcos/fcos_imprv_R_50_FPN_1x.yaml
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file", default="", metavar="FILE",
        help="path to config file", type=str)
    # --local_rank is passed in by torch.distributed.launch; it is the index
    # of the GPU used by this process.
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test", dest="skip_test",
        help="Do not test the final model", action="store_true")
    parser.add_argument(
        "opts", help="Modify config options using the command-line",
        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # WORLD_SIZE is set by torch.distributed.launch and equals
    # nproc_per_node * number of nodes (hosts). Distributed training is
    # enabled automatically when it is greater than 1.
    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://")
        synchronize()

    # Defaults live in fcos_core/config/defaults.py; the config file and the
    # command-line opts override them.
    cfg.merge_from_file(args.config_file)  # read options from the yaml file
    cfg.merge_from_list(args.opts)  # command-line overrides
    cfg.freeze()  # lock the config against accidental changes before train()

    # Create the output folder, which holds log information.
    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    # Log the GPU count, system environment, config file contents, etc.
    logger = setup_logger("fcos_core", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # train() is the next entry point to follow.
    model = train(cfg, args.local_rank, args.distributed)

    if args.skip_test:
        return
    run_test(cfg, model, args.distributed)
def main():
    """Training entry point: parse arguments, configure, train, then test."""
    # Parse command-line arguments, e.g. --config-file.
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",  # configuration file
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    # --local_rank is passed in by torch.distributed.launch; it is the index
    # of the GPU used by this process.
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,  # all remaining arguments are collected into a list
    )
    args = parser.parse_args()

    # WORLD_SIZE is set by torch.distributed.launch and equals
    # nproc_per_node * number of nodes (hosts). When the variable is absent
    # a single GPU is assumed; distributed training is used when it is > 1.
    if "WORLD_SIZE" in os.environ:
        num_gpus = int(os.environ["WORLD_SIZE"])
    else:
        num_gpus = 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
        synchronize()

    # Defaults live in fcos_core/config/defaults.py; the config file and the
    # command-line opts override them.
    cfg.merge_from_file(args.config_file)  # e.g. configs/fcos/fcos_R_50_FPN_1x.yaml
    cfg.merge_from_list(args.opts)  # command-line overrides
    cfg.freeze()  # lock the config against accidental changes before train()

    # Output path for the model; also holds log information.
    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    # Log the GPU count, system environment, config file contents, etc.
    logger = setup_logger("fcos_core", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    if args.skip_test:
        return
    run_test(cfg, model, args.distributed)
def main():
    """Training entry point using explicit device ids instead of torch.distributed.

    The number of GPUs is taken from ``CUDA_VISIBLE_DEVICES`` when that
    variable is set, otherwise from ``torch.cuda.device_count()``. Testing
    runs after training unless ``--skip-test`` is given.
    """
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    # The parsed value is always overwritten below, so this flag is accepted
    # only for command-line compatibility.
    parser.add_argument("--device_ids", type=list, default=[0])
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "--use-tensorboard",
        dest="use_tensorboard",
        help="Use tensorboardX logger (Requires tensorboardX installed)",
        action="store_true",
        default=False)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    # set devices_ids according to num gpus
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is None:
        # The variable is optional; without it, count the devices torch sees
        # instead of failing with a KeyError.
        num_gpus = torch.cuda.device_count()
    else:
        num_gpus = len(visible_devices.split(","))
    args.device_ids = [str(i) for i in range(num_gpus)]

    # do not use torch.distributed
    args.distributed = False

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("fad_core", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg,
                  args.local_rank,
                  args.distributed,
                  args.device_ids,
                  use_tensorboard=args.use_tensorboard)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
def main():
    """Train with label encoding, optionally test, then save selected weights.

    The distance-loss warm-up iterations are added to the solver's maximum
    iteration count and to every step milestone before the config is frozen.
    After training, the ``label_encoding_function``, ``rpn`` and (when
    present) ``roi_heads`` state dicts are saved to
    ``<cfg.OUTPUT_DIR>/label_encoding_function.pth`` by the non-distributed
    process or by rank 0.
    """
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file", default="", metavar="FILE",
        help="path to config file", type=str)
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test", dest="skip_test",
        help="Do not test the final model", action="store_true")
    parser.add_argument(
        "opts", help="Modify config options using the command-line",
        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # add distance loss warmup iters
    warmup_iters = cfg.MODEL.LABELENC.DISTANCE_LOSS_WARMUP_ITERS
    cfg.SOLVER.MAX_ITER += warmup_iters
    cfg.SOLVER.STEPS = tuple(step + warmup_iters for step in cfg.SOLVER.STEPS)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("fcos_core", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)

    # Use the wrapped module when training was distributed.
    if args.distributed:
        model = model.module

    # Only one process writes the weights file.
    if not args.distributed or dist.get_rank() == 0:
        saved_weights = {
            'label_encoding_function': model.label_encoding_function.state_dict(),
            'rpn': model.rpn.state_dict(),
        }
        if model.roi_heads:
            saved_weights['roi_heads'] = model.roi_heads.state_dict()
        save_path = os.path.join(cfg.OUTPUT_DIR, "label_encoding_function.pth")
        torch.save(saved_weights, save_path)
        logger.info("Successfully save label encoding function weights to " + save_path)

    synchronize()