def main():
    """Train a pose-estimation network on one or more datasets.

    Parses the experiment config, builds the (possibly multi-dataset)
    train/valid data pipelines, constructs the network and optimizer, then
    runs the train/valid/eval loop — checkpointing the lowest-validation-loss
    model plus ~10 periodic snapshots.

    Relies on module-level names: s_config, s_config_file, s_args, and the
    project helpers (create_logger, get_pose_net, trainNet, validNet, ...).
    """
    # parsing specific config
    config = copy.deepcopy(s_config)
    config.network = get_default_network_config()
    config.loss = get_default_loss_config()
    config = update_config_from_file(config, s_config_file, check_necessity=True)
    config = update_config_from_args(config, s_args)

    # create log and path
    final_output_path, final_log_path, logger = create_logger(
        s_config_file, config.dataset.train_image_set,
        config.pytorch.output_path, config.pytorch.log_path)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # define devices, create multi-GPU context
    os.environ["CUDA_VISIBLE_DEVICES"] = config.pytorch.gpus  # a safer method
    devices = [int(i) for i in config.pytorch.gpus.split(',')]
    logger.info("Using Devices: {}".format(str(devices)))

    # label, loss, metric and result helpers
    logger.info("Defining label, loss, metric and result")
    label_func = get_label_func(config.loss)
    loss_func = get_loss_func(config.loss)
    loss_func = DataParallelCriterion(loss_func)
    result_func = get_result_func(config.loss)
    merge_flip_func = get_merge_func(config.loss)

    # dataset, basic imdb
    # NOTE(review): eval() instantiates classes named in the config file —
    # config files must be trusted input.
    logger.info("Creating dataset")
    train_imdbs = []
    valid_imdbs = []
    for n_db in range(len(config.dataset.name)):
        train_imdbs.append(
            eval(config.dataset.name[n_db])(
                config.dataset.train_image_set[n_db], config.dataset.path[n_db],
                config.train.patch_width, config.train.patch_height,
                config.train.rect_3d_width, config.train.rect_3d_height))
        valid_imdbs.append(
            eval(config.dataset.name[n_db])(
                config.dataset.test_image_set[n_db], config.dataset.path[n_db],
                config.train.patch_width, config.train.patch_height,
                config.train.rect_3d_width, config.train.rect_3d_height))

    batch_size = len(devices) * config.dataiter.batch_images_per_ctx

    # basic data_loader unit; the combined dataset class name is the
    # concatenation of every dataset name, e.g. "A_B_Dataset"
    dataset_name = ""
    for n_db in range(len(config.dataset.name)):
        dataset_name = dataset_name + config.dataset.name[n_db] + "_"
    dataset_train = \
        eval(dataset_name + "Dataset")(train_imdbs, True, '',
                                       config.train.patch_width, config.train.patch_height,
                                       config.train.rect_3d_width, config.train.rect_3d_height,
                                       batch_size, config.dataiter.mean, config.dataiter.std,
                                       config.aug, label_func, config.loss)
    dataset_valid = \
        eval(config.dataset.name[config.dataiter.target_id] + "_Dataset")(
            [valid_imdbs[config.dataiter.target_id]], False,
            config.train.patch_width, config.train.patch_height,
            config.train.rect_3d_width, config.train.rect_3d_height,
            batch_size, config.dataiter.mean, config.dataiter.std,
            config.aug, label_func, config.loss)

    train_data_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True,
                                   num_workers=config.dataiter.threads, drop_last=True)
    valid_data_loader = DataLoader(dataset=dataset_valid, batch_size=batch_size, shuffle=False,
                                   num_workers=config.dataiter.threads, drop_last=False)

    # prepare network
    logger.info("Creating network")
    joint_num = dataset_train.joint_num
    assert dataset_train.joint_num == dataset_valid.joint_num
    net = get_pose_net(config.network, joint_num)
    init_pose_net(net, config.network)
    net = DataParallelModel(net).cuda()
    model_prefix = os.path.join(final_output_path, config.train.model_prefix)
    logger.info("Net total params: {:.2f}M".format(
        sum(p.numel() for p in net.parameters()) / 1000000.0))

    # Optimizer
    logger.info("Creating optimizer")
    optimizer, scheduler = get_optimizer(config.optimizer, net)

    # train and valid
    vloss_min = float('inf')  # FIX: robust sentinel (was 10000000.0)
    train_loss = []
    valid_loss = []
    # FIX: guard the snapshot interval — the original
    # `epoch % (config.train.end_epoch // 10)` raised ZeroDivisionError
    # whenever end_epoch < 10.
    save_interval = max(config.train.end_epoch // 10, 1)
    logger.info("Train DB size: {}; Valid DB size: {}.".format(
        int(len(dataset_train)), int(len(dataset_valid))))
    for epoch in range(config.train.begin_epoch, config.train.end_epoch + 1):
        # NOTE(review): since PyTorch 1.1, scheduler.step() should follow
        # optimizer.step(); kept first here to preserve the original schedule.
        scheduler.step()
        logger.info("Working on {}/{} epoch || LearningRate:{} ".format(
            epoch, config.train.end_epoch, scheduler.get_lr()[0]))

        speedometer = Speedometer(train_data_loader.batch_size,
                                  config.pytorch.frequent, auto_reset=False)

        # train
        beginT = time.time()
        tloss = trainNet(epoch, train_data_loader, net, optimizer,
                         config.loss, loss_func, speedometer)
        endt1 = time.time() - beginT

        # validate
        beginT = time.time()
        preds_in_patch_with_score, vloss = \
            validNet(valid_data_loader, net, config.loss, result_func, loss_func,
                     merge_flip_func, config.train.patch_width, config.train.patch_height,
                     devices, valid_imdbs[config.dataiter.target_id].flip_pairs,
                     flip_test=False)
        endt2 = time.time() - beginT

        # evaluate
        beginT = time.time()
        evalNet(epoch, preds_in_patch_with_score, valid_data_loader,
                valid_imdbs[config.dataiter.target_id],
                config.train.patch_width, config.train.patch_height,
                config.train.rect_3d_width, config.train.rect_3d_height,
                final_output_path)
        endt3 = time.time() - beginT
        logger.info('One epoch training %.1fs, validation %.1fs, evaluation %.1fs '
                    % (endt1, endt2, endt3))

        train_loss.append(tloss)
        valid_loss.append(vloss)

        # keep the checkpoint with the lowest validation loss seen so far
        if vloss < vloss_min:
            vloss_min = vloss
            save_lowest_vloss_model({
                'epoch': epoch,
                'network': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'train_loss': train_loss,
                'valid_loss': valid_loss
            }, model_prefix, logger)

        # periodic snapshot (~10 over the run) plus the first and last epoch
        if epoch % save_interval == 0 \
                or epoch == config.train.begin_epoch \
                or epoch == config.train.end_epoch:
            save_model({
                'epoch': epoch,
                'network': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'train_loss': train_loss,
                'valid_loss': valid_loss
            }, model_prefix, logger, epoch)
def main():
    """Train the facade corner-detection network with optional auto-resume.

    Parses the experiment config, builds facade train/test datasets and
    loaders, constructs the network and optimizer, optionally resumes from
    the latest checkpoint, then runs the train/valid loop (evaluating only
    the last 3 epochs) and saves a model + learning curve each epoch.

    Relies on module-level names: s_config, s_config_file, s_args,
    flip_pairs, num_corners, and the project helpers (facade,
    facade_Dataset, trainNet, validNet, ...).
    """
    # parsing specific config
    config = copy.deepcopy(s_config)
    config.network = get_default_network_config()
    config.loss = get_default_loss_config()
    config = update_config_from_file(config, s_config_file, check_necessity=True)
    config = update_config_from_args(config, s_args)
    et = config.dataset.eval_target

    # create log and path; keep a copy of the config file next to the outputs
    final_output_path, final_log_path, logger = create_logger(
        s_config_file, config.dataset.benchmark[et],
        config.pytorch.output_path, config.pytorch.log_path)
    logger.info('Train config:{}\n'.format(pprint.pformat(config)))
    shutil.copy2(s_args.cfg, final_output_path)

    # define devices, create multi-GPU context
    os.environ["CUDA_VISIBLE_DEVICES"] = config.pytorch.gpus  # a safer method
    devices = [int(i) for i in config.pytorch.gpus.split(',')]
    logger.info("Using Devices: {}".format(str(devices)))

    # label, loss, metric and result helpers
    logger.info("Defining label, loss, metric and result")
    label_func = get_label_func(config.loss)
    loss_func = get_loss_func(config.loss)
    merge_hm_flip_func, merge_tag_flip_func = get_merge_func(config.loss)
    loss_func = DataParallelCriterion(loss_func)  # advanced parallel

    # dataset, basic imdb
    batch_size = len(devices) * config.dataiter.batch_images_per_ctx
    logger.info("Creating dataset")
    train_imdbs = []
    for bmk_name in ['JSON', 'XML']:
        train_imdbs += [facade(bmk_name, 'TRAIN', config.dataset.path)]
    test_imdbs = [facade('TEST', 'TEST', config.dataset.path)]

    # basic data_loader unit
    dataset_train = facade_Dataset(train_imdbs, True,
                                   config.train.patch_width, config.train.patch_height,
                                   label_func, config.aug, config.loss)
    dataset_test = facade_Dataset(test_imdbs, False,
                                  config.train.patch_width, config.train.patch_height,
                                  label_func, config.aug, config.loss)
    train_data_loader = DataLoader(dataset=dataset_train, batch_size=batch_size,
                                   shuffle=True, num_workers=config.dataiter.threads)
    valid_data_loader = DataLoader(dataset=dataset_test, batch_size=batch_size,
                                   shuffle=False, num_workers=config.dataiter.threads)

    # prepare network; one extra output channel when CenterNet-style center is on
    logger.info("Creating network")
    net = get_pose_net(config.network, config.loss.ae_feat_dim,
                       num_corners if not config.loss.useCenterNet else num_corners + 1)
    init_pose_net(net, config.network)
    net = DataParallelModel(net).cuda()  # advanced parallel
    model_prefix = os.path.join(final_output_path, config.train.model_prefix)
    logger.info("Net total params: {:.2f}M".format(
        sum(p.numel() for p in net.parameters()) / 1000000.0))

    # Optimizer
    logger.info("Creating optimizer")
    optimizer, scheduler = get_optimizer(config.optimizer, net)

    # resume from model
    train_loss = []
    valid_loss = []
    latest_model = '{}_latest.pth.tar'.format(model_prefix)
    if s_args.autoresume and os.path.exists(latest_model):
        # FIX: the original re-tested os.path.exists(latest_model) here and
        # fell back to s_args.model in an unreachable else-branch, then
        # asserted existence again — both redundant given the guard above.
        model_path = latest_model
        logger.info('Load checkpoint from {}'.format(model_path))

        # load state from ckpt
        ckpt = torch.load(model_path)
        config.train.begin_epoch = ckpt['epoch'] + 1
        net.load_state_dict(ckpt['network'])
        optimizer.load_state_dict(ckpt['optimizer'])
        scheduler.load_state_dict(ckpt['scheduler'])
        train_loss.extend(ckpt['train_loss'])
        valid_loss.extend(ckpt['valid_loss'])

        assert config.train.begin_epoch >= 2, 'resume error. begin_epoch should no less than 2'
        logger.info('continue training the {0}th epoch, init from the {1}th epoch'.
                    format(config.train.begin_epoch, config.train.begin_epoch - 1))

    # train and valid
    logger.info("Train DB size: {}; Valid DB size: {}.".format(
        int(len(dataset_train)), int(len(dataset_test))))
    for epoch in range(config.train.begin_epoch, config.train.end_epoch + 1):
        logger.info("\nWorking on {}/{} epoch || LearningRate:{} ".format(
            epoch, config.train.end_epoch, scheduler.get_lr()[0]))
        speedometer = Speedometer(train_data_loader.batch_size,
                                  config.pytorch.frequent, auto_reset=False)

        # train
        beginT = time.time()
        tloss = trainNet(epoch, train_data_loader, net, optimizer,
                         config.loss, loss_func, speedometer)
        endt1 = time.time() - beginT

        # validate
        beginT = time.time()
        heatmaps, tagmaps, vloss = validNet(valid_data_loader, net, loss_func,
                                            merge_hm_flip_func, merge_tag_flip_func,
                                            devices, flip_pairs, flip_test=False)
        endt2 = time.time() - beginT

        # only eval late model, because evaluation takes too much time
        beginT = time.time()
        if epoch > config.train.end_epoch - 3:
            evalNet(epoch, heatmaps, tagmaps, valid_data_loader, config.loss,
                    config.test, config.train.patch_width, config.train.patch_height,
                    final_output_path)
        endt3 = time.time() - beginT

        logger.info('This Epoch Train %.1fs, Valid %.1fs, Eval %.1fs '
                    % (endt1, endt2, endt3))
        logger.info('Train Loss:%.4f, Valid Loss:%.4f' % (tloss, vloss))
        train_loss.append(tloss)
        valid_loss.append(vloss)

        scheduler.step()

        # save model and refresh the learning-curve plot every epoch
        state = {
            'epoch': epoch,
            'network': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'train_loss': train_loss,
            'valid_loss': valid_loss
        }
        save_all_model(epoch, model_prefix, state, vloss, config, logger)
        plot_LearningCurve(train_loss, valid_loss, final_log_path, "LearningCurve")
def main():
    """Evaluate a trained facade corner-detection model on the valid split.

    Loads the experiment config and a checkpoint given by s_args.model,
    builds the validation dataset/loader, runs validNet with flip testing,
    and writes evaluation results next to the config file.

    Relies on module-level names: s_config, s_config_file, s_args,
    flip_pairs, num_corners, and the project helpers.
    """
    # parsing specific config
    config = copy.deepcopy(s_config)
    config.network = get_default_network_config()
    config.loss = get_default_loss_config()
    config = update_config_from_file(config, s_config_file, check_necessity=True)
    config = update_config_from_args(config, s_args)
    et = config.dataset.eval_target  # index of the benchmark to evaluate

    # create log and path: log file is named after the model checkpoint and
    # written into the config file's directory
    output_path = os.path.dirname(s_config_file)
    log_name = os.path.basename(s_args.model)
    logging.basicConfig(filename=os.path.join(output_path, '{}_test.log'.format(log_name)),
                        format='%(asctime)-15s %(message)s', level=logging.INFO)
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())  # also echo to console
    logger.info('Test config:{}\n'.format(pprint.pformat(config)))

    # define devices create multi-GPU context
    os.environ["CUDA_VISIBLE_DEVICES"] = config.pytorch.gpus  # a safer method
    devices = [int(i) for i in config.pytorch.gpus.split(',')]
    logger.info("Using Devices: {}".format(str(devices)))

    # label, loss, metric and result
    logger.info("Defining lable, loss, metric and result")
    label_func = get_label_func(config.loss)
    loss_func = get_loss_func(config.loss)
    loss_func = DataParallelCriterion(loss_func)
    merge_hm_flip_func, merge_tag_flip_func = get_merge_func(config.loss)

    # dataset, basic imdb
    batch_size = len(devices) * config.dataiter.batch_images_per_ctx
    logger.info("Creating dataset")
    valid_imdbs = [
        facade(config.dataset.benchmark[et], 'valid', config.dataset.path[et])
    ]
    dataset_valid = facade_Dataset(valid_imdbs, False,
                                   config.train.patch_width, config.train.patch_height,
                                   label_func, config.aug, config.loss)
    # here disable multi-process num_workers, because limit of GPU server
    valid_data_loader = DataLoader(dataset=dataset_valid, batch_size=batch_size)

    # prepare network; one extra output channel when CenterNet-style center is on
    assert os.path.exists(s_args.model), 'Cannot find model!'
    logger.info("Loading model from %s" % s_args.model)
    net = get_pose_net(
        config.network, config.loss.ae_feat_dim,
        num_corners if not config.loss.useCenterNet else num_corners + 1)
    net = DataParallelModel(net).cuda()
    ckpt = torch.load(s_args.model)  # or other path/to/model
    net.load_state_dict(ckpt['network'])
    logger.info("Net total params: {:.2f}M".format(
        sum(p.numel() for p in net.parameters()) / 1000000.0))

    # test
    logger.info("Test DB size: {}.".format(len(dataset_valid)))
    print("------TestUseCenter:%s, centerT:%.1f, windowT:%.1f ----------" %
          (config.test.useCenter, config.test.centerT, config.test.windowT))

    # validate with flip-test enabled
    # NOTE(review): flip_pairs is not defined in this function — presumably a
    # module-level constant; verify it exists in the importing script.
    beginT = time.time()
    heatmaps, tagmaps, vloss = \
        validNet(valid_data_loader, net, loss_func, merge_hm_flip_func,
                 merge_tag_flip_func, devices, flip_pairs, flip_test=True)
    endt1 = time.time() - beginT
    logger.info('Valid Loss:%.4f' % vloss)

    # evaluate predictions (epoch id 0: single stand-alone evaluation)
    beginT = time.time()
    evalNet(0, heatmaps, tagmaps, valid_data_loader, config.loss, config.test,
            config.train.patch_width, config.train.patch_height, output_path)
    endt2 = time.time() - beginT
    logger.info('This Epoch Valid %.3fs, Eval %.3fs ' % (endt1, endt2))
def main():
    """Evaluate a trained pose-estimation model on the test split.

    Loads the experiment config and a checkpoint given by s_args.model,
    builds the target test dataset/loader, runs validNet with flip testing,
    and writes evaluation results next to the checkpoint.

    Relies on module-level names: s_config, s_config_file, s_args, and the
    project helpers (get_pose_net, validNet, evalNet, ...).
    """
    # parsing specific config
    config = copy.deepcopy(s_config)
    config.network = get_default_network_config()
    config.loss = get_default_loss_config()
    config = update_config_from_file(config, s_config_file, check_necessity=True)
    config = update_config_from_args(config, s_args)

    # create log and path: log file is named after the model checkpoint and
    # written into the checkpoint's directory
    final_log_path = os.path.dirname(s_args.model)
    log_name = os.path.basename(s_args.model)
    logging.basicConfig(filename=os.path.join(final_log_path, '{}_test.log'.format(log_name)),
                        format='%(asctime)-15s %(message)s', level=logging.INFO)
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())  # also echo to console
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # define devices create multi-GPU context
    os.environ["CUDA_VISIBLE_DEVICES"] = config.pytorch.gpus
    devices = [int(i) for i in config.pytorch.gpus.split(',')]
    logger.info("Using Devices: {}".format(str(devices)))

    # lable, loss, metric, result and flip function
    logger.info("Defining lable, loss, metric, result and flip function")
    label_func = get_label_func(config.loss)
    loss_func = get_loss_func(config.loss)
    loss_func = DataParallelCriterion(loss_func)
    result_func = get_result_func(config.loss)
    merge_flip_func = get_merge_func(config.loss)

    # dataset
    # NOTE(review): eval() instantiates classes named in the config file —
    # config files must be trusted input.
    logger.info("Creating dataset")
    test_imdbs = []
    for n_db in range(0, len(config.dataset.name)):
        test_imdbs.append(
            eval(config.dataset.name[n_db])(
                config.dataset.test_image_set[n_db], config.dataset.path[n_db],
                config.train.patch_width, config.train.patch_height,
                config.train.rect_3d_width, config.train.rect_3d_height))
    batch_size = len(devices) * config.dataiter.batch_images_per_ctx
    # only the config.dataiter.target_id dataset is evaluated
    dataset_test = eval(config.dataset.name[config.dataiter.target_id] +
                        "_Dataset")([test_imdbs[config.dataiter.target_id]], False,
                                    config.train.patch_width, config.train.patch_height,
                                    config.train.rect_3d_width, config.train.rect_3d_height,
                                    batch_size, config.dataiter.mean, config.dataiter.std,
                                    config.aug, label_func, config.loss)
    test_data_loader = DataLoader(dataset=dataset_test, batch_size=batch_size,
                                  shuffle=False, num_workers=config.dataiter.threads,
                                  drop_last=False)

    # prepare network
    assert os.path.exists(s_args.model), 'Cannot find model!'
    logger.info('Load checkpoint from {}'.format(s_args.model))
    joint_num = dataset_test.joint_num
    net = get_pose_net(config.network, joint_num)
    net = DataParallelModel(
        net).cuda()  # claim multi-gpu in CUDA_VISIBLE_DEVICES
    ckpt = torch.load(s_args.model)  # or other path/to/model
    net.load_state_dict(ckpt['network'])
    logger.info("Net total params: {:.2f}M".format(
        sum(p.numel() for p in net.parameters()) / 1000000.0))

    # test: validate with flip-test enabled, then evaluate (epoch id 0:
    # single stand-alone evaluation)
    logger.info("Test DB size: {}.".format(int(len(dataset_test))))
    beginT = time.time()
    preds_in_patch, _ = validNet(
        test_data_loader, net, config.loss, result_func, loss_func,
        merge_flip_func, config.train.patch_width, config.train.patch_height,
        devices, test_imdbs[config.dataiter.target_id].flip_pairs,
        flip_test=True, flip_fea_merge=False)
    evalNet(0, preds_in_patch, test_data_loader,
            test_imdbs[config.dataiter.target_id],
            config.train.patch_width, config.train.patch_height,
            config.train.rect_3d_width, config.train.rect_3d_height,
            final_log_path)
    print('Testing %.2f seconds.....' % (time.time() - beginT))