def main():
    """Entry point: parse CLI args, set up (optionally distributed) training,
    build the detector and dataset(s), and hand off to ``train_detector``.

    Distributed mode is detected from the ``WORLD_SIZE`` environment variable
    (set by ``torch.distributed.launch``); rank 0 also backs up the source
    tree into the work directory.
    """
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.local_rank = args.local_rank

    # CLI arguments take precedence over the values in the config file.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    # WORLD_SIZE is exported by the torch distributed launcher; absent or ==1
    # means single-process training.
    world_size = os.environ.get("WORLD_SIZE")
    distributed = world_size is not None and int(world_size) > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        cfg.gpus = torch.distributed.get_world_size()

    # Linear LR scaling rule: scale the peak LR by the number of GPUs.
    if args.autoscale_lr:
        cfg.lr_config.lr_max = cfg.lr_config.lr_max * cfg.gpus

    # Logger must exist before any other step that wants to report progress.
    logger = get_root_logger(cfg.log_level)
    logger.info("Distributed training: {}".format(distributed))
    logger.info(f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

    # Only rank 0 snapshots the source tree alongside the experiment outputs.
    if args.local_rank == 0:
        code_backup = os.path.join(cfg.work_dir, "det3d")
        os.makedirs(code_backup, exist_ok=True)
        for tree in ("./det3d", "./tools", "./examples"):
            os.system("cp -r %s %s/" % (tree, code_backup))
        logger.info(f"Backup source files to {cfg.work_dir}/det3d")

    if args.seed is not None:
        logger.info("Set random seed to {}".format(args.seed))
        set_random_seed(args.seed)

    model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

    datasets = [build_dataset(cfg.data.train)]
    # A two-stage workflow (e.g. [('train', ...), ('val', ...)]) also needs
    # the validation dataset.
    if len(cfg.workflow) == 2:
        datasets.append(build_dataset(cfg.data.val))

    if cfg.checkpoint_config is not None:
        # Embed det3d version, config text and class names into every
        # checkpoint so it is self-describing.
        cfg.checkpoint_config.meta = dict(
            det3d_version=__version__, config=cfg.text, CLASSES=datasets[0].CLASSES
        )

    # Convenience attribute used by visualization code.
    model.CLASSES = datasets[0].CLASSES

    train_detector(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=args.validate,
        logger=logger,
    )
# NOTE(review): this is the second `def main()` in the visible chunk; at
# import time it shadows the earlier definition — confirm whether both are
# meant to coexist in one file or belong to separate scripts.
def main():
    """Entry point: parse CLI args, configure (optionally distributed)
    training for either the ``pytorch`` or ``slurm`` launcher, build the
    detector and dataset(s), and hand off to ``train_detector``.

    Distributed mode is enabled whenever more than one CUDA device is
    visible; rendezvous details are derived from the launcher-specific
    environment variables.
    """
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # CLI arguments take precedence over the values in the config file.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    distributed = torch.cuda.device_count() > 1
    if distributed:
        if args.launcher == "pytorch":
            # torch.distributed.launch exports MASTER_ADDR/PORT/RANK etc.,
            # so env:// rendezvous works out of the box.
            torch.cuda.set_device(args.local_rank)
            torch.distributed.init_process_group(backend="nccl", init_method="env://")
            cfg.local_rank = args.local_rank
        elif args.launcher == "slurm":
            # Derive rank/world-size/device from the SLURM environment.
            proc_id = int(os.environ["SLURM_PROCID"])
            ntasks = int(os.environ["SLURM_NTASKS"])
            node_list = os.environ["SLURM_NODELIST"]
            num_gpus = torch.cuda.device_count()
            torch.cuda.set_device(proc_id % num_gpus)
            # First hostname in the allocation acts as the rendezvous master.
            addr = subprocess.getoutput(
                f"scontrol show hostname {node_list} | head -n1")
            # Respect MASTER_PORT if the environment already provides one;
            # otherwise fall back to 29501 (torch.distributed's own default
            # is 29500 — 29501 avoids colliding with it).
            if "MASTER_PORT" not in os.environ:
                os.environ["MASTER_PORT"] = "29501"
            # Likewise, only fill MASTER_ADDR when it is not already set.
            if "MASTER_ADDR" not in os.environ:
                os.environ["MASTER_ADDR"] = addr
            os.environ["WORLD_SIZE"] = str(ntasks)
            os.environ["LOCAL_RANK"] = str(proc_id % num_gpus)
            os.environ["RANK"] = str(proc_id)
            dist.init_process_group(backend="nccl")
            cfg.local_rank = int(os.environ["LOCAL_RANK"])

        cfg.gpus = dist.get_world_size()
    else:
        cfg.local_rank = 0
        cfg.gpus = 1

    # Linear LR scaling rule: scale the peak LR by the number of GPUs.
    if args.autoscale_lr:
        cfg.lr_config.lr_max = cfg.lr_config.lr_max * cfg.gpus

    # Logger must exist before any other step that wants to report progress.
    logger = get_root_logger(cfg.log_level)
    logger.info("Distributed training: {}".format(distributed))
    logger.info(
        f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

    model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

    datasets = [build_dataset(cfg.data.train)]
    # A two-stage workflow (e.g. [('train', ...), ('val', ...)]) also needs
    # the validation dataset.
    if len(cfg.workflow) == 2:
        datasets.append(build_dataset(cfg.data.val))

    if cfg.checkpoint_config is not None:
        # Embed det3d version, config text and class names into every
        # checkpoint so it is self-describing.
        cfg.checkpoint_config.meta = dict(
            det3d_version=__version__, config=cfg.text, CLASSES=datasets[0].CLASSES)

    # Convenience attribute used by visualization code.
    model.CLASSES = datasets[0].CLASSES

    train_detector(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=args.validate,
        logger=logger,
    )