def __init__(self, node_name=''):
    self.rosparam_(node_name)
    self.create_logger_(node_name)
    self.logger.info('ROS parameters loaded successfully')
    self.logger.info('********************** Start logging **********************')
    save_config_to_file(cfg, logger=self.logger)

    # echo the node's runtime settings into the log
    self.logger.info('ROS buffer_len: {}'.format(self.buffer_len))
    self.logger.info('ROS lidar_topic: {}'.format(self.lidar_topic))
    self.logger.info('ROS debug_flag: {}'.format(self.debug_flag))
    self.logger.info('ROS depth_threshold: {}'.format(self.depth_threshold))
    self.logger.info('ROS score_threshold: {}'.format(self.score_threshold))
    self.logger.info('ROS model_checkpoint: {}'.format(self.model_checkpoint))

    # build the PointRCNN detector in TEST mode and load its weights for inference
    self.model = PointRCNN(num_classes=self.num_class, use_xyz=True, mode='TEST')
    self.model.cuda()
    load_checkpoint(self.model,
                    filename=str(self.base_dir / self.model_checkpoint),
                    logger=self.logger)
    self.model.eval()
    self.logger.info('Model initialization complete')
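# `rosparam_` and `create_logger_` are helpers of this node class and are not
# shown here. A minimal sketch of what `rosparam_` might look like, assuming
# ROS 1 / rospy with node-private parameters; the parameter names mirror the
# attributes logged above, and every default value is a placeholder, not the
# node's real default.
import rospy

def rosparam_(self, node_name=''):
    self.buffer_len = rospy.get_param('~buffer_len', 1)
    self.lidar_topic = rospy.get_param('~lidar_topic', '/velodyne_points')
    self.debug_flag = rospy.get_param('~debug_flag', False)
    self.depth_threshold = rospy.get_param('~depth_threshold', 0.0)
    self.score_threshold = rospy.get_param('~score_threshold', 0.0)
    self.model_checkpoint = rospy.get_param('~model_checkpoint', '')
    self.num_class = rospy.get_param('~num_class', 2)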
def repeat_eval_ckpt(root_result_dir, ckpt_dir):
    root_result_dir = os.path.join(root_result_dir, 'eval', 'eval_all_' + args.extra_tag)
    os.makedirs(root_result_dir, exist_ok=True)

    log_file = os.path.join(root_result_dir, 'log_eval_all_%s.txt' % cfg.TEST.SPLIT)
    logger = create_logger(log_file)
    logger.info('**********************Start logging**********************')

    # save config
    for key, val in vars(args).items():
        logger.info("{:16} {}".format(key, val))
    save_config_to_file(cfg, logger=logger)

    # create dataloader & network
    test_loader = create_dataloader(logger)
    model = PointRCNN(num_classes=test_loader.dataset.num_class, use_xyz=True, mode='TEST')
    model.cuda()

    # copy important files to backup
    backup_dir = os.path.join(root_result_dir, 'backup_files')
    os.makedirs(backup_dir, exist_ok=True)
    os.system('cp *.py %s/' % backup_dir)
    os.system('cp ../lib/net/*.py %s/' % backup_dir)
    os.system('cp ../lib/datasets/kitti_rcnn_dataset.py %s/' % backup_dir)

    # evaluated ckpt record
    ckpt_record_file = os.path.join(root_result_dir, 'eval_list_%s.txt' % cfg.TEST.SPLIT)
    with open(ckpt_record_file, 'a'):
        pass

    # tensorboard log
    tb_log = SummaryWriter(log_dir=os.path.join(root_result_dir, 'tensorboard_%s' % cfg.TEST.SPLIT))

    while True:
        # check whether there is a checkpoint which is not evaluated
        cur_epoch_id, cur_ckpt = get_no_evaluated_ckpt(ckpt_dir, ckpt_record_file)
        if cur_epoch_id == -1 or int(float(cur_epoch_id)) < args.start_epoch:
            wait_second = 30
            print('Wait %s second for next check: %s' % (wait_second, ckpt_dir))
            time.sleep(wait_second)
            continue

        # load checkpoint
        train_utils.load_checkpoint(model, filename=cur_ckpt)

        # start evaluation
        cur_result_dir = os.path.join(root_result_dir, 'epoch_%s' % cur_epoch_id, cfg.TEST.SPLIT)
        tb_dict = eval_one_epoch(model, test_loader, cur_epoch_id, cur_result_dir, logger)

        step = int(float(cur_epoch_id))
        if step == float(cur_epoch_id):
            for key, val in tb_dict.items():
                tb_log.add_scalar(key, val, step)

        # record this epoch which has been evaluated
        with open(ckpt_record_file, 'a') as f:
            print('%s' % cur_epoch_id, file=f)
        logger.info('Epoch %s has been evaluated' % cur_epoch_id)
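# `get_no_evaluated_ckpt` is referenced above but not shown. A minimal sketch of
# one plausible implementation (an assumption, not the repo's verified code):
# scan ckpt_dir for checkpoint files, skip epochs already listed in the record
# file, and return (-1, None) when everything has been evaluated. The filename
# pattern 'checkpoint_epoch_<id>.pth' is assumed.
import glob
import os
import re

def get_no_evaluated_ckpt(ckpt_dir, ckpt_record_file):
    ckpt_list = glob.glob(os.path.join(ckpt_dir, '*checkpoint_epoch_*.pth'))
    ckpt_list.sort(key=os.path.getmtime)
    with open(ckpt_record_file, 'r') as f:
        evaluated_ckpt_list = [x.strip() for x in f.readlines()]

    for cur_ckpt in ckpt_list:
        num_list = re.findall(r'checkpoint_epoch_(.*)\.pth', cur_ckpt)
        if not num_list:
            continue
        epoch_id = num_list[-1]
        if epoch_id not in evaluated_ckpt_list:
            return epoch_id, cur_ckpt
    return -1, None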
def load_ckpt_based_on_cfg(config, model, logger):
    if config['ckpt'] is not None:
        train_utils.load_checkpoint(model, filename=config['ckpt'], logger=logger)

    total_keys = len(model.state_dict().keys())
    if cfg.RPN.ENABLED and config['rpn_ckpt'] is not None:
        load_part_ckpt(model, filename=config['rpn_ckpt'], logger=logger,
                       total_keys=total_keys)

    if cfg.RCNN.ENABLED and config['rcnn_ckpt'] is not None:
        load_part_ckpt(model, filename=config['rcnn_ckpt'], logger=logger,
                       total_keys=total_keys)
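# Hypothetical usage of load_ckpt_based_on_cfg. The config keys are the ones the
# function reads above; the checkpoint paths are placeholders, not files shipped
# with the repo.
config = {
    'ckpt': None,                       # full-model checkpoint, loaded first if set
    'rpn_ckpt': 'ckpt/rpn_best.pth',    # stage-1 (RPN) weights only
    'rcnn_ckpt': 'ckpt/rcnn_best.pth',  # stage-2 (RCNN) weights only
}
load_ckpt_based_on_cfg(config, model, logger)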
def load_ckpt_based_on_args(model, logger):
    """
    Input: model and logger instance
    Output: None
    Task: Loads ckpt based on the args --ckpt, --rpn_ckpt and --rcnn_ckpt
    """
    if args.ckpt is not None:
        train_utils.load_checkpoint(model, filename=args.ckpt, logger=logger)

    total_keys = len(model.state_dict().keys())
    if cfg.RPN.ENABLED and args.rpn_ckpt is not None:
        load_part_ckpt(model, filename=args.rpn_ckpt, logger=logger,
                       total_keys=total_keys)

    if cfg.RCNN.ENABLED and args.rcnn_ckpt is not None:
        load_part_ckpt(model, filename=args.rcnn_ckpt, logger=logger,
                       total_keys=total_keys)
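# `load_part_ckpt` loads only the subset of weights that a stage checkpoint
# provides, leaving the rest of the model untouched. A minimal sketch of the
# idea, assuming the checkpoint stores its weights under a 'model_state' key
# (an assumption, not the verified checkpoint format):
import torch

def load_part_ckpt_sketch(model, filename, logger, total_keys=-1):
    checkpoint = torch.load(filename)
    model_state = checkpoint['model_state']

    # keep only the entries whose names exist in the current model
    update_model_state = {k: v for k, v in model_state.items()
                          if k in model.state_dict()}
    state_dict = model.state_dict()
    state_dict.update(update_model_state)
    model.load_state_dict(state_dict)

    logger.info('==> Done (loaded %d/%d keys)'
                % (len(update_model_state), total_keys))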
def train():
    print(args.local_rank)
    torch.cuda.set_device(args.local_rank)

    # create dataloader & network & optimizer
    model, model_fn_decorator, net_func = create_model(cfg)
    init_weights(model, init_type='kaiming')
    model.cuda()

    root_result_dir = args.output_dir
    os.makedirs(root_result_dir, exist_ok=True)

    log_file = os.path.join(root_result_dir, "log_train.txt")
    logger = create_logger(log_file, get_rank())
    logger.info("**********************Start logging**********************")
    logger.info('TRAINED MODEL: {}'.format(net_func))

    # log to file
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info("CUDA_VISIBLE_DEVICES=%s" % gpu_list)
    for key, val in vars(args).items():
        logger.info("{:16} {}".format(key, val))

    logger.info("***********************config infos**********************")
    for key, val in vars(cfg).items():
        logger.info("{:16} {}".format(key, val))

    # log tensorboard
    if get_rank() == 0:
        tb_log = SummaryWriter(log_dir=os.path.join(root_result_dir, "tensorboard"))
    else:
        tb_log = None

    train_loader, test_loader = create_dataloader()
    # train_loader, test_loader = create_dataloader_Insensee()
    optimizer = create_optimizer(model)

    # load checkpoint if it is possible
    start_epoch = it = best_res = 0
    last_epoch = -1
    if args.ckpt is not None:
        pure_model = model
        it, start_epoch, best_res = load_checkpoint(pure_model, optimizer, args.ckpt, logger)
        last_epoch = start_epoch + 1

    lr_scheduler = create_scheduler(optimizer, last_epoch=last_epoch)
    # lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.98, last_epoch=-1)
    criterion = None

    # start training
    logger.info('**********************Start training**********************')
    ckpt_dir = os.path.join(root_result_dir, "ckpt")
    os.makedirs(ckpt_dir, exist_ok=True)
    trainer = train_utils.Trainer(model,
                                  model_fn=model_fn_decorator(),
                                  criterion=criterion,
                                  optimizer=optimizer,
                                  ckpt_dir=ckpt_dir,
                                  lr_scheduler=lr_scheduler,
                                  model_fn_eval=model_fn_decorator(),
                                  tb_log=tb_log,
                                  logger=logger,
                                  eval_frequency=1,
                                  cfg=cfg)

    trainer.train(start_it=it,
                  start_epoch=start_epoch,
                  n_epochs=args.epochs,
                  train_loader=train_loader,
                  test_loader=test_loader,
                  ckpt_save_interval=args.ckpt_save_interval,
                  best_res=best_res)

    logger.info('**********************End training**********************')
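# `create_logger(log_file, get_rank())` suggests a rank-aware logger so that only
# rank 0 produces full output in distributed runs. A minimal sketch of that idea
# (an assumption, not the repo's actual helper):
import logging

def create_logger_sketch(log_file, rank=0):
    logger = logging.getLogger(log_file)
    # non-zero ranks log errors only, so multi-GPU runs do not duplicate output
    logger.setLevel(logging.INFO if rank == 0 else logging.ERROR)
    formatter = logging.Formatter('%(asctime)s %(levelname)5s %(message)s')

    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)
    return logger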
model = PointRCNN(num_classes=train_loader.dataset.num_class,
                  use_xyz=True, mode='TRAIN')
optimizer = create_optimizer(model)

if args.mgpus:
    model = nn.DataParallel(model)
model.cuda()

# load checkpoint if it is possible
start_epoch = it = 0
last_epoch = -1
if args.ckpt is not None:
    pure_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    it, start_epoch = train_utils.load_checkpoint(pure_model, optimizer,
                                                  filename=args.ckpt, logger=logger)
    last_epoch = start_epoch + 1

lr_scheduler, bnm_scheduler = create_scheduler(optimizer,
                                               total_steps=len(train_loader) * args.epochs,
                                               last_epoch=last_epoch)

if args.rpn_ckpt is not None:
    pure_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    total_keys = len(pure_model.state_dict().keys())
    train_utils.load_part_ckpt(pure_model, filename=args.rpn_ckpt, logger=logger,
                               total_keys=total_keys)
# create dataloader & network & optimizer
train_loader, test_loader = create_dataloader(logger)
model = PointRCNN(num_classes=train_loader.dataset.num_class,
                  use_xyz=True, mode='TRAIN')
optimizer = create_optimizer(model)

if args.mgpus:
    model = nn.DataParallel(model)
model.cuda()

# load checkpoint if it is possible
start_iter = it = 0
last_iter = -1
if args.pretrain_ckpt is not None:
    pure_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    _, _ = train_utils.load_checkpoint(pure_model, None,
                                       filename=args.pretrain_ckpt, logger=logger)
    # fast-forward the iteration counter to 9/30 of the full schedule after pre-training
    it = int(args.total_iters * 9 / 30)

if args.ckpt is not None:
    pure_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    it, _ = train_utils.load_checkpoint(pure_model, optimizer,
                                        filename=args.ckpt, logger=logger)
    last_iter = it + 1

lr_scheduler, bnm_scheduler = create_scheduler(optimizer,
                                               total_steps=args.total_iters,
                                               last_iter=last_iter)
lr_warmup_scheduler = None

# start training
logger.info('**********************Start training**********************')
logger.info('experiment ID: %s/%s/%s' % (root_result_dir.split('/')[-3],
                                         root_result_dir.split('/')[-2],
                                         root_result_dir.split('/')[-1]))
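# A minimal sketch of how an iteration-indexed learning-rate schedule like the
# one consumed above can be built with stock PyTorch. This is an assumption, not
# the repo's actual create_scheduler (which also returns a batch-norm momentum
# scheduler); the milestones and decay factor below are illustrative only.
import torch.optim.lr_scheduler as lr_sched

def create_lr_scheduler_sketch(optimizer, total_steps, last_iter=-1):
    # when resuming (last_iter != -1), PyTorch schedulers expect 'initial_lr'
    # in every param group; derive it from the current lr if missing
    for group in optimizer.param_groups:
        group.setdefault('initial_lr', group['lr'])

    def lr_lambda(cur_step):
        # decay the learning rate by 10x at 60% and 80% of training
        factor = 1.0
        for milestone in (0.6, 0.8):
            if cur_step >= milestone * total_steps:
                factor *= 0.1
        return factor

    # LambdaLR's last_epoch counts steps here, so passing last_iter resumes
    # the decay curve at the right point
    return lr_sched.LambdaLR(optimizer, lr_lambda, last_epoch=last_iter)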