def my_default_setup(cfg, args):
    """Perform some basic common setups at the beginning of a job, including:

    1. Set up the detectron2 logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (CfgNode): the full config to be used
        args (argparse.Namespace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        mmcv.mkdir_or_exist(output_dir)

    rank = comm.get_rank()
    setup_logger(output_dir, distributed_rank=rank)
    for _mod in ["PIL", "chardet"]:  # disable DEBUG logs
        logging.getLogger(_mod).setLevel(logging.INFO)

    logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file, PathManager.open(args.config_file, "r").read()))

    logger.info("Running with full config:\n{}".format(cfg))
    if comm.is_main_process() and output_dir:
        # Note: some of our scripts may expect the existence of
        # config.yaml in output directory
        # path = os.path.join(output_dir, "config.yaml")
        # with PathManager.open(path, "w") as f:
        #     f.write(cfg.dump())
        path = osp.join(output_dir, osp.basename(args.config_file))
        cfg.dump(path)
        logger.info("Full config saved to {}".format(path))

    assert (
        args.num_gpus <= torch.cuda.device_count() and args.num_gpus >= 1
    ), f"args.num_gpus: {args.num_gpus}, available num gpus: {torch.cuda.device_count()}"

    # make sure each worker has a different, yet deterministic seed if specified
    seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
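# Toy sketch (not from the repo) of the per-rank seeding rule applied above: a
# non-negative cfg.SEED is offset by the process rank, so each worker gets a different
# but deterministic seed, while a negative cfg.SEED lets seed_all_rng pick a random one.
def per_rank_seed(cfg_seed, rank):
    return None if cfg_seed < 0 else cfg_seed + rank

print([per_rank_seed(1000, r) for r in range(4)])  # [1000, 1001, 1002, 1003]
print(per_rank_seed(-1, 0))  # None -> seed_all_rng falls back to a random seed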
def evaluate(self):
    # bop toolkit eval in subprocess, no return value
    if self._distributed:
        synchronize()
        self._predictions = all_gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not is_main_process():
            return
    return self._eval_predictions()
def evaluate(self):
    # bop toolkit eval in subprocess, no return value
    if self._distributed:
        synchronize()
        _predictions = all_gather(self._predictions)  # NOTE: gather list of OrderedDict
        self._predictions = OrderedDict()
        for preds in _predictions:
            for _k, _v in preds.items():
                self._predictions[_k] = _v
        # self._predictions = list(itertools.chain(*_predictions))
        if not is_main_process():
            return

    if self.eval_precision:
        return self._eval_predictions_precision()
    return self._eval_predictions()
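# Toy sketch (not from the repo) contrasting the two gather-merge strategies used by the
# evaluate() variants above, with fake per-rank outputs standing in for all_gather results.
import itertools
from collections import OrderedDict

# variant 1: each rank contributes a list of prediction dicts -> flatten into one list
per_rank_lists = [[{"im_id": 0}], [{"im_id": 1}, {"im_id": 2}]]
merged_list = list(itertools.chain(*per_rank_lists))
print(merged_list)  # [{'im_id': 0}, {'im_id': 1}, {'im_id': 2}]

# variant 2: each rank contributes an OrderedDict keyed by scene/image -> merge the keys
per_rank_dicts = [OrderedDict(scene_1=["est_a"]), OrderedDict(scene_2=["est_b"])]
merged_dict = OrderedDict()
for preds in per_rank_dicts:
    for _k, _v in preds.items():
        merged_dict[_k] = _v  # a later rank silently overwrites a duplicate key
print(merged_dict)  # OrderedDict([('scene_1', ['est_a']), ('scene_2', ['est_b'])])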
def do_train(cfg, args, model, optimizer, resume=False):
    model.train()

    # some basic settings =========================
    dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
    data_ref = ref.__dict__[dataset_meta.ref_key]
    obj_names = dataset_meta.objs

    # load data ===================================
    train_dset_names = cfg.DATASETS.TRAIN
    data_loader = build_gdrn_train_loader(cfg, train_dset_names)
    data_loader_iter = iter(data_loader)

    # load 2nd train dataloader if needed
    train_2_dset_names = cfg.DATASETS.get("TRAIN2", ())
    train_2_ratio = cfg.DATASETS.get("TRAIN2_RATIO", 0.0)
    if train_2_ratio > 0.0 and len(train_2_dset_names) > 0:
        data_loader_2 = build_gdrn_train_loader(cfg, train_2_dset_names)
        data_loader_2_iter = iter(data_loader_2)
    else:
        data_loader_2 = None
        data_loader_2_iter = None

    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    if isinstance(data_loader, AspectRatioGroupedDataset):
        dataset_len = len(data_loader.dataset.dataset)
        if data_loader_2 is not None:
            dataset_len += len(data_loader_2.dataset.dataset)
        iters_per_epoch = dataset_len // images_per_batch
    else:
        dataset_len = len(data_loader.dataset)
        if data_loader_2 is not None:
            dataset_len += len(data_loader_2.dataset)
        iters_per_epoch = dataset_len // images_per_batch
    max_iter = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch
    dprint("images_per_batch: ", images_per_batch)
    dprint("dataset length: ", dataset_len)
    dprint("iters per epoch: ", iters_per_epoch)
    dprint("total iters: ", max_iter)

    scheduler = solver_utils.build_lr_scheduler(cfg, optimizer, total_iters=max_iter)

    AMP_ON = cfg.SOLVER.AMP.ENABLED
    logger.info(f"AMP enabled: {AMP_ON}")
    grad_scaler = GradScaler()

    # resume or load model ===================================
    checkpointer = MyCheckpointer(
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=scheduler,
        gradscaler=grad_scaler,
        save_to_disk=comm.is_main_process(),
    )
    start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1

    if comm._USE_HVD:  # hvd may not be available, so do not use the one in args
        # not needed
        # start_iter = hvd.broadcast(torch.tensor(start_iter), root_rank=0, name="start_iter").item()

        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Horovod: (optional) compression algorithm.
        compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            op=hvd.Adasum if args.use_adasum else hvd.Average,
            compression=compression,
        )  # device_dense='/cpu:0'

    if cfg.SOLVER.CHECKPOINT_BY_EPOCH:
        ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD * iters_per_epoch
    else:
        ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, ckpt_period, max_iter=max_iter, max_to_keep=cfg.SOLVER.MAX_TO_KEEP
    )

    # build writers ==============================================
    tbx_event_writer = get_tbx_event_writer(cfg.OUTPUT_DIR, backup=not cfg.get("RESUME", False))
    tbx_writer = tbx_event_writer._writer  # NOTE: we want to write some non-scalar data
    writers = (
        [
            MyCommonMetricPrinter(max_iter),
            MyJSONWriter(osp.join(cfg.OUTPUT_DIR, "metrics.json")),
            tbx_event_writer,
        ]
        if comm.is_main_process()
        else []
    )

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    logger.info("Starting training from iteration {}".format(start_iter))
    iter_time = None
    with EventStorage(start_iter) as storage:
        # for data, iteration in zip(data_loader, range(start_iter, max_iter)):
        for iteration in range(start_iter, max_iter):
            storage.iter = iteration
            epoch = iteration // dataset_len + 1

            if np.random.rand() < train_2_ratio:
                data = next(data_loader_2_iter)
            else:
                data = next(data_loader_iter)

            if iter_time is not None:
                storage.put_scalar("time", time.perf_counter() - iter_time)
            iter_time = time.perf_counter()

            # forward ============================================================
            batch = batch_data(cfg, data)
            with autocast(enabled=AMP_ON):
                out_dict, loss_dict = model(
                    batch["roi_img"],
                    gt_xyz=batch.get("roi_xyz", None),
                    gt_xyz_bin=batch.get("roi_xyz_bin", None),
                    gt_mask_trunc=batch["roi_mask_trunc"],
                    gt_mask_visib=batch["roi_mask_visib"],
                    gt_mask_obj=batch["roi_mask_obj"],
                    gt_region=batch.get("roi_region", None),
                    gt_allo_quat=batch.get("allo_quat", None),
                    gt_ego_quat=batch.get("ego_quat", None),
                    gt_allo_rot6d=batch.get("allo_rot6d", None),
                    gt_ego_rot6d=batch.get("ego_rot6d", None),
                    gt_ego_rot=batch.get("ego_rot", None),
                    gt_trans=batch.get("trans", None),
                    gt_trans_ratio=batch["roi_trans_ratio"],
                    gt_points=batch.get("roi_points", None),
                    sym_infos=batch.get("sym_info", None),
                    roi_classes=batch["roi_cls"],
                    roi_cams=batch["roi_cam"],
                    roi_whs=batch["roi_wh"],
                    roi_centers=batch["roi_center"],
                    resize_ratios=batch["resize_ratio"],
                    roi_coord_2d=batch.get("roi_coord_2d", None),
                    roi_extents=batch.get("roi_extent", None),
                    do_loss=True,
                )
                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            if AMP_ON:
                grad_scaler.scale(losses).backward()

                # # Unscales the gradients of optimizer's assigned params in-place
                # grad_scaler.unscale_(optimizer)
                # # Since the gradients of optimizer's assigned params are unscaled, clips as usual:
                # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

                if comm._USE_HVD:
                    optimizer.synchronize()
                    with optimizer.skip_synchronize():
                        grad_scaler.step(optimizer)
                        grad_scaler.update()
                else:
                    grad_scaler.step(optimizer)
                    grad_scaler.update()
            else:
                losses.backward()
                optimizer.step()

            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if cfg.TEST.EVAL_PERIOD > 0 and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1:
                do_test(cfg, model, epoch=epoch, iteration=iteration)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (
                (iteration + 1) % cfg.TRAIN.PRINT_FREQ == 0 or iteration == max_iter - 1 or iteration < 100
            ):
                for writer in writers:
                    writer.write()
                # visualize some images ========================================
                if cfg.TRAIN.VIS_IMG:
                    with torch.no_grad():
                        vis_i = 0
                        roi_img_vis = batch["roi_img"][vis_i].cpu().numpy()
                        roi_img_vis = denormalize_image(roi_img_vis, cfg).transpose(1, 2, 0).astype("uint8")
                        tbx_writer.add_image("input_image", roi_img_vis, iteration)

                        out_coor_x = out_dict["coor_x"].detach()
                        out_coor_y = out_dict["coor_y"].detach()
                        out_coor_z = out_dict["coor_z"].detach()
                        out_xyz = get_out_coor(cfg, out_coor_x, out_coor_y, out_coor_z)

                        out_xyz_vis = out_xyz[vis_i].cpu().numpy().transpose(1, 2, 0)
                        out_xyz_vis = get_emb_show(out_xyz_vis)
                        tbx_writer.add_image("out_xyz", out_xyz_vis, iteration)

                        gt_xyz_vis = batch["roi_xyz"][vis_i].cpu().numpy().transpose(1, 2, 0)
                        gt_xyz_vis = get_emb_show(gt_xyz_vis)
                        tbx_writer.add_image("gt_xyz", gt_xyz_vis, iteration)

                        out_mask = out_dict["mask"].detach()
                        out_mask = get_out_mask(cfg, out_mask)
                        out_mask_vis = out_mask[vis_i, 0].cpu().numpy()
                        tbx_writer.add_image("out_mask", out_mask_vis, iteration)

                        gt_mask_vis = batch["roi_mask"][vis_i].detach().cpu().numpy()
                        tbx_writer.add_image("gt_mask", gt_mask_vis, iteration)

            periodic_checkpointer.step(iteration, epoch=epoch)
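# Toy sketch (not from the repo) of the TRAIN2_RATIO mixing used in the training loop above:
# each iteration draws a uniform random number and takes the batch from the second dataloader
# when the draw is below the ratio, so roughly TRAIN2_RATIO of all batches come from TRAIN2.
import numpy as np

train_2_ratio = 0.25
draws = np.random.rand(10_000) < train_2_ratio
print("fraction of batches from loader 2:", draws.mean())  # ~0.25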
def setup(args):
    """Create configs and perform basic setups."""
    cfg = Config.fromfile(args.config_file)
    if args.opts is not None:
        cfg.merge_from_dict(args.opts)

    ############## pre-process some cfg options ######################
    # NOTE: check if need to set OUTPUT_DIR automatically
    if cfg.OUTPUT_DIR.lower() == "auto":
        cfg.OUTPUT_DIR = osp.join(cfg.OUTPUT_ROOT, osp.splitext(args.config_file)[0].split("configs/")[1])
        iprint(f"OUTPUT_DIR was automatically set to: {cfg.OUTPUT_DIR}")

    if cfg.get("EXP_NAME", "") == "":
        setproctitle("{}.{}".format(osp.splitext(osp.basename(args.config_file))[0], get_time_str()))
    else:
        setproctitle("{}.{}".format(cfg.EXP_NAME, get_time_str()))

    if cfg.SOLVER.AMP.ENABLED:
        if torch.cuda.get_device_capability() <= (6, 1):
            iprint("Disable AMP for older GPUs")
            cfg.SOLVER.AMP.ENABLED = False

    # NOTE: pop some unwanted configs in detectron2
    cfg.SOLVER.pop("STEPS", None)
    cfg.SOLVER.pop("MAX_ITER", None)

    # NOTE: get optimizer from string cfg dict
    if cfg.SOLVER.OPTIMIZER_CFG != "":
        if isinstance(cfg.SOLVER.OPTIMIZER_CFG, str):
            optim_cfg = eval(cfg.SOLVER.OPTIMIZER_CFG)
        else:
            optim_cfg = cfg.SOLVER.OPTIMIZER_CFG
        iprint("optimizer_cfg:", optim_cfg)
        cfg.SOLVER.OPTIMIZER_NAME = optim_cfg["type"]
        cfg.SOLVER.BASE_LR = optim_cfg["lr"]
        cfg.SOLVER.MOMENTUM = optim_cfg.get("momentum", 0.9)
        cfg.SOLVER.WEIGHT_DECAY = optim_cfg.get("weight_decay", 1e-4)

    if cfg.get("DEBUG", False):
        iprint("DEBUG")
        args.num_gpus = 1
        args.num_machines = 1
        cfg.DATALOADER.NUM_WORKERS = 0
        cfg.TRAIN.PRINT_FREQ = 1

    # register datasets
    register_datasets_in_cfg(cfg)

    exp_id = "{}".format(osp.splitext(osp.basename(args.config_file))[0])
    if args.eval_only:
        if cfg.TEST.USE_PNP:
            # NOTE: need to keep _test at last
            exp_id += "_{}_test".format(cfg.TEST.PNP_TYPE.upper())
        else:
            exp_id += "_test"
    cfg.EXP_ID = exp_id
    cfg.RESUME = args.resume
    ####################################

    my_default_setup(cfg, args)
    # Setup logger
    setup_for_distributed(is_master=comm.is_main_process())
    setup_my_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="core")
    setup_my_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="lib")
    return cfg
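# A hedged sketch (not part of the original file) of how setup() and do_train() are
# typically wired together in a single-process entry point. The argument names mirror what
# setup()/my_default_setup() read; build_model and build_optimizer are hypothetical
# placeholders for the repo's real builders, and the real parser likely turns --opts into
# a dict before cfg.merge_from_dict is called.
def main(args):
    cfg = setup(args)
    model = build_model(cfg)                 # hypothetical model builder
    optimizer = build_optimizer(cfg, model)  # hypothetical optimizer builder
    return do_train(cfg, args, model, optimizer, resume=args.resume)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", default="configs/example.py")
    parser.add_argument("--opts", default=None, help="dict-style config overrides")
    parser.add_argument("--num-gpus", type=int, default=1)
    parser.add_argument("--num-machines", type=int, default=1)
    parser.add_argument("--eval-only", action="store_true")
    parser.add_argument("--resume", action="store_true")
    main(parser.parse_args())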