def main():
    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # np.random.seed(0)

    args = parse_args()

    cfg = Config.fromfile(args.config)

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir

    distributed = torch.cuda.device_count() > 1

    if distributed:
        if args.launcher == "pytorch":
            torch.cuda.set_device(args.local_rank)
            torch.distributed.init_process_group(backend="nccl", init_method="env://")
            cfg.local_rank = args.local_rank
        elif args.launcher == "slurm":
            proc_id = int(os.environ["SLURM_PROCID"])
            ntasks = int(os.environ["SLURM_NTASKS"])
            node_list = os.environ["SLURM_NODELIST"]
            num_gpus = torch.cuda.device_count()
            cfg.gpus = num_gpus
            torch.cuda.set_device(proc_id % num_gpus)
            addr = subprocess.getoutput(
                f"scontrol show hostname {node_list} | head -n1")

            # specify master port
            port = None
            if port is not None:
                os.environ["MASTER_PORT"] = str(port)
            elif "MASTER_PORT" in os.environ:
                pass  # use MASTER_PORT in the environment variable
            else:
                # 29500 is torch.distributed default port
                os.environ["MASTER_PORT"] = "29501"

            # use MASTER_ADDR in the environment variable if it already exists
            if "MASTER_ADDR" not in os.environ:
                os.environ["MASTER_ADDR"] = addr
            os.environ["WORLD_SIZE"] = str(ntasks)
            os.environ["LOCAL_RANK"] = str(proc_id % num_gpus)
            os.environ["RANK"] = str(proc_id)

            dist.init_process_group(backend="nccl")
            cfg.local_rank = int(os.environ["LOCAL_RANK"])

        cfg.gpus = torch.distributed.get_world_size()
    else:
        cfg.gpus = args.gpus

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info("Distributed testing: {}".format(distributed))
    logger.info(f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

    if args.testset:
        print("Use Test Set")
        dataset = build_dataset(cfg.data.test)
    else:
        print("Use Val Set")
        dataset = build_dataset(cfg.data.val)

    data_loader = build_dataloader(
        dataset,
        batch_size=cfg.data.samples_per_gpu if not args.speed_test else 1,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
    )

    checkpoint = load_checkpoint(model, args.checkpoint, map_location="cpu")

    # put model on gpus
    if distributed:
        # model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        # model = fuse_bn_recursively(model)
        model = model.cuda()

    model.eval()
    mode = "val"
    prog_bar = None

    logger.info(f"work dir: {args.work_dir}")
    if cfg.local_rank == 0:
        prog_bar = torchie.ProgressBar(len(data_loader.dataset) // cfg.gpus)

    detections = {}
    cpu_device = torch.device("cpu")

    start = time.time()

    # time only the middle third of the dataset so warm-up iterations are excluded
    start = int(len(dataset) / 3)
    end = int(len(dataset) * 2 / 3)

    time_start = 0
    time_end = 0

    for i, data_batch in enumerate(data_loader):
        if i == start:
            torch.cuda.synchronize()
            time_start = time.time()

        if i == end:
            torch.cuda.synchronize()
            time_end = time.time()

        with torch.no_grad():
            outputs = batch_processor(
                model, data_batch, train_mode=False, local_rank=args.local_rank,
            )
        for output in outputs:
            token = output["metadata"]["token"]
            for k, v in output.items():
                if k not in [
                    "metadata",
                ]:
                    output[k] = v.to(cpu_device)
            detections.update(
                {token: output,}
            )
            if args.local_rank == 0:
                if prog_bar is not None:
                    prog_bar.update()

    synchronize()

    all_predictions = all_gather(detections)

    print("\n Total time per frame: ", (time_end - time_start) / (end - start))

    if args.local_rank != 0:
        return

    predictions = {}
    for p in all_predictions:
        predictions.update(p)

    if not os.path.exists(args.work_dir):
        os.makedirs(args.work_dir)

    save_pred(predictions, args.work_dir)

    result_dict, _ = dataset.evaluation(
        copy.deepcopy(predictions), output_dir=args.work_dir, testset=args.testset
    )

    if result_dict is not None:
        for k, v in result_dict["results"].items():
            print(f"Evaluation {k}: {v}")

    if args.txt_result:
        assert False, "No longer support kitti"
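The save_pred helper used above is not defined in this section; a minimal sketch, assuming it simply pickles the merged prediction dict into the work directory. The file name prediction.pkl is inferred from the second variant below, which reads it back before evaluation; the exact implementation may differ.

import os
import pickle


def save_pred(pred, root):
    # Assumed helper: dump the merged {token: detection} dict to
    # <root>/prediction.pkl, the file the next variant loads back.
    with open(os.path.join(root, "prediction.pkl"), "wb") as f:
        pickle.dump(pred, f)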
def main():
    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # np.random.seed(0)

    args = parse_args()

    cfg = Config.fromfile(args.config)
    cfg.local_rank = args.local_rank

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir

    distributed = False
    if "WORLD_SIZE" in os.environ:
        distributed = int(os.environ["WORLD_SIZE"]) > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        cfg.gpus = torch.distributed.get_world_size()
    else:
        cfg.gpus = args.gpus

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info("Distributed testing: {}".format(distributed))
    logger.info(
        f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

    if args.testset:
        print("Use Test Set")
        dataset = build_dataset(cfg.data.test)
    else:
        print("Use Val Set")
        dataset = build_dataset(cfg.data.val)

    data_loader = build_dataloader(
        dataset,
        batch_size=cfg.data.samples_per_gpu if not args.speed_test else 1,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
    )

    checkpoint = load_checkpoint(model, args.checkpoint, map_location="cpu")

    # put model on gpus
    if distributed:
        model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        # model = fuse_bn_recursively(model)
        model = model.cuda()

    model.eval()
    mode = "val"

    logger.info(f"work dir: {args.work_dir}")
    if cfg.local_rank == 0:
        prog_bar = torchie.ProgressBar(len(data_loader.dataset) // cfg.gpus)

    detections = {}
    cpu_device = torch.device("cpu")

    start = time.time()

    # time only the middle third of the dataset so warm-up iterations are excluded
    start = int(len(dataset) / 3)
    end = int(len(dataset) * 2 / 3)

    time_start = 0
    time_end = 0

    for i, data_batch in enumerate(data_loader):
        if i == start:
            torch.cuda.synchronize()
            time_start = time.time()

        if i == end:
            torch.cuda.synchronize()
            time_end = time.time()

        with torch.no_grad():
            outputs = batch_processor(
                model, data_batch, train_mode=False, local_rank=args.local_rank,
            )
        for output in outputs:
            token = output["metadata"]["token"]
            for k, v in output.items():
                if k not in [
                    "metadata",
                ]:
                    output[k] = v.to(cpu_device)
            detections.update({
                token: output,
            })
            if args.local_rank == 0:
                prog_bar.update()

    synchronize()

    all_predictions = all_gather(detections)

    print("\n Total time per frame: ", (time_end - time_start) / (end - start))

    if args.local_rank != 0:
        return

    predictions = {}
    for p in all_predictions:
        predictions.update(p)

    if not os.path.exists(args.work_dir):
        os.makedirs(args.work_dir)

    save_pred(predictions, args.work_dir)

    # reload the just-saved predictions so evaluation runs on the serialized copy
    with open(os.path.join(args.work_dir, 'prediction.pkl'), 'rb') as f:
        predictions = pickle.load(f)

    result_dict, _ = dataset.evaluation(
        copy.deepcopy(predictions), output_dir=args.work_dir, testset=args.testset
    )

    if result_dict is not None:
        for k, v in result_dict["results"].items():
            print(f"Evaluation {k}: {v}")

    if args.txt_result:
        assert False, "No longer support kitti"
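Each variant merges per-rank results with synchronize() and all_gather(), neither of which is shown here. Below is a minimal sketch of an all_gather for arbitrary picklable objects, assuming the common pickle-over-byte-tensor approach used in det3d-style codebases; names and details are illustrative, not the project's exact implementation.

import pickle

import torch
import torch.distributed as dist


def all_gather(data):
    # Collect an arbitrary picklable object from every rank onto every rank.
    if not dist.is_available() or not dist.is_initialized():
        return [data]
    world_size = dist.get_world_size()
    if world_size == 1:
        return [data]

    # serialize this rank's object into a CUDA byte tensor
    buffer = pickle.dumps(data)
    tensor = torch.ByteTensor(bytearray(buffer)).to("cuda")

    # exchange payload sizes, then pad to a common length so all_gather
    # can work with equally sized tensors
    local_size = torch.tensor([tensor.numel()], dtype=torch.long, device="cuda")
    size_list = [torch.zeros_like(local_size) for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    max_size = int(max(s.item() for s in size_list))

    if tensor.numel() < max_size:
        padding = torch.zeros(max_size - tensor.numel(), dtype=torch.uint8, device="cuda")
        tensor = torch.cat([tensor, padding])

    tensor_list = [
        torch.empty(max_size, dtype=torch.uint8, device="cuda") for _ in range(world_size)
    ]
    dist.all_gather(tensor_list, tensor)

    # strip the padding and unpickle each rank's payload
    return [
        pickle.loads(t[: int(size.item())].cpu().numpy().tobytes())
        for t, size in zip(tensor_list, size_list)
    ]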
def main():
    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # np.random.seed(0)

    args = parse_args()

    cfg = Config.fromfile(args.config)
    cfg.local_rank = args.local_rank

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir

    distributed = False
    if "WORLD_SIZE" in os.environ:
        distributed = int(os.environ["WORLD_SIZE"]) > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        cfg.gpus = torch.distributed.get_world_size()
    else:
        cfg.gpus = args.gpus

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info("Distributed testing: {}".format(distributed))
    logger.info(
        f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

    dataset = build_dataset(cfg.data.val)
    data_loader = build_dataloader(
        dataset,
        batch_size=cfg.data.samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
    )

    checkpoint = load_checkpoint(model, args.checkpoint, map_location="cpu")

    # put model on gpus
    if distributed:
        model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()

    model.eval()
    mode = "val"

    logger.info(f"work dir: {args.work_dir}")
    if cfg.local_rank == 0:
        prog_bar = torchie.ProgressBar(len(data_loader.dataset) // cfg.gpus)

    detections = {}
    cpu_device = torch.device("cpu")

    for i, data_batch in enumerate(data_loader):
        with torch.no_grad():
            outputs = batch_processor(
                model, data_batch, train_mode=False, local_rank=args.local_rank,
            )
        for output in outputs:
            token = output["metadata"]["token"]
            for k, v in output.items():
                if k not in [
                    "metadata",
                ]:
                    output[k] = v.to(cpu_device)
            detections.update({
                token: output,
            })
            if args.local_rank == 0:
                prog_bar.update()

    synchronize()

    all_predictions = all_gather(detections)

    if args.local_rank != 0:
        return

    predictions = {}
    for p in all_predictions:
        predictions.update(p)

    result_dict, _ = dataset.evaluation(predictions, output_dir=args.work_dir)

    for k, v in result_dict["results"].items():
        print(f"Evaluation {k}: {v}")

    if args.txt_result:
        # dump per-frame KITTI-format label files, then score them offline
        res_dir = os.path.join(os.getcwd(), "predictions")
        os.makedirs(res_dir, exist_ok=True)
        for k, dt in predictions.items():
            with open(
                    os.path.join(res_dir, "%06d.txt" % int(dt["metadata"]["token"])),
                    "w") as fout:
                lines = kitti.annos_to_kitti_label(dt)
                for line in lines:
                    fout.write(line + "\n")

        ap_result_str, ap_dict = kitti_evaluate(
            "/data/Datasets/KITTI/Kitti/object/training/label_2",
            res_dir,
            label_split_file="/data/Datasets/KITTI/Kitti/ImageSets/val.txt",
            current_class=0,
        )

        print(ap_result_str)
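The synchronize() barrier called before all_gather() in each variant is likewise an external utility. A minimal sketch, assuming the usual torch.distributed-based implementation; illustrative only, not necessarily the project's version.

import torch.distributed as dist


def synchronize():
    # Barrier across all ranks; a no-op when not running distributed.
    if not dist.is_available() or not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()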
def main():
    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # np.random.seed(0)

    args = parse_args()

    cfg = Config.fromfile(args.config)
    cfg.local_rank = args.local_rank

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir

    distributed = False
    if "WORLD_SIZE" in os.environ:
        distributed = int(os.environ["WORLD_SIZE"]) > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        cfg.gpus = torch.distributed.get_world_size()
    else:
        cfg.gpus = args.gpus

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info("Distributed testing: {}".format(distributed))
    logger.info(
        f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

    torch.cuda.empty_cache()

    # note: this variant builds cfg.nohead_model rather than cfg.model
    model = build_detector(cfg.nohead_model, train_cfg=None, test_cfg=cfg.test_cfg)

    if args.testset:
        print("Use Test Set")
        dataset = build_dataset(cfg.data.test)
    else:
        print("Use Val Set")
        dataset = build_dataset(cfg.data.val)

    data_loader = build_dataloader(
        dataset,
        batch_size=cfg.data.samples_per_gpu if not args.speed_test else 1,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
    )

    checkpoint = load_checkpoint(model, args.checkpoint, map_location="cpu")

    # put model on gpus
    # model = fuse_bn_recursively(model)
    model = model.cuda()

    model.eval()
    mode = "val"

    logger.info(f"work dir: {args.work_dir}")
    if cfg.local_rank == 0:
        prog_bar = torchie.ProgressBar(len(data_loader.dataset) // cfg.gpus)

    detections = {}
    cpu_device = torch.device("cpu")

    start = time.time()

    # time only the middle third of the dataset so warm-up iterations are excluded
    start = int(len(dataset) / 3)
    end = int(len(dataset) * 2 / 3)

    time_start = 0
    time_end = 0

    device = torch.device(args.local_rank)

    POINTS_NUM = 2

    for i, data_batch in enumerate(data_loader):
        if i == start:
            torch.cuda.synchronize()
            time_start = time.time()

        if i == end:
            torch.cuda.synchronize()
            time_end = time.time()

        with torch.no_grad():
            sample = example_to_device(data_batch, device=device)
            for j in range(len(sample["metadata"])):
                sample["metadata"][j]['image_prefix'] = None

            # strip everything that is not a plain tensor input
            del sample["metadata"]
            del sample["points"]
            # del sample["shape"]
            sample["shape"] = torch.tensor(sample["shape"])

            # truncate the voxel inputs to the first POINTS_NUM entries so the
            # exported graph gets a small, fixed-size example input
            sample["voxels"] = sample["voxels"][0:POINTS_NUM, :, :]
            sample["num_points"] = sample["num_points"][0:POINTS_NUM]
            sample["coordinates"] = sample["coordinates"][0:POINTS_NUM, :]

            outputs = model(sample, return_loss=False)
            # outputs = batch_processor(
            #     model, data_batch, train_mode=False, local_rank=args.local_rank,
            # )

        for k, t in sample.items():
            print("====", k)
            print(t.shape)

        print("============== start =============")
        register_custom_op_symbolic(
            "spconv::get_indice_pairs_3d", symbolic_get_indice_pairs_3d, 11)
        torch.onnx.export(
            model,                                # model being run
            sample,                               # model input (or a tuple for multiple inputs)
            "/workspace/data/center_point.onnx",  # where to save the model (can be a file or file-like object)
            export_params=True,                   # store the trained parameter weights inside the model file
            opset_version=11,                     # the ONNX version to export the model to
            do_constant_folding=True,             # whether to execute constant folding for optimization
        )
        print("============== finish =============")
        break

        # unreachable while the unconditional break above is in place
        for output in outputs:
            token = output["metadata"]["token"]
            for k, v in output.items():
                if k not in [
                    "metadata",
                ]:
                    output[k] = v.to(cpu_device)
            detections.update({
                token: output,
            })
            if args.local_rank == 0:
                prog_bar.update()

    synchronize()

    all_predictions = all_gather(detections)

    print("\n Total time per frame: ", (time_end - time_start) / (end - start))

    if args.local_rank != 0:
        return

    predictions = {}
    for p in all_predictions:
        predictions.update(p)

    if not os.path.exists(args.work_dir):
        os.makedirs(args.work_dir)

    save_pred(predictions, args.work_dir)

    result_dict, _ = dataset.evaluation(
        copy.deepcopy(predictions), output_dir=args.work_dir, testset=args.testset
    )

    if result_dict is not None:
        for k, v in result_dict["results"].items():
            print(f"Evaluation {k}: {v}")

    if args.txt_result:
        assert False, "No longer support kitti"
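Once the export in the last variant has run, the saved graph can be inspected offline. A minimal check, assuming the onnx Python package is installed; the custom spconv::get_indice_pairs_3d node will not execute in stock onnxruntime, so this only loads and prints the graph.

import onnx

# Load the exported file and print its node list; execution and shape
# inference are not expected to work because of the custom spconv domain op.
onnx_model = onnx.load("/workspace/data/center_point.onnx")
print(onnx.helper.printable_graph(onnx_model.graph))
for node in onnx_model.graph.node:
    print(node.op_type, node.name)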