def main():
    logger.info('******************************')
    logger.info(opt)
    logger.info('******************************')
    logger.info(cfg)
    logger.info('******************************')

    # Model Initialize
    m = preset_model(cfg)
    m = nn.DataParallel(m).cuda()

    criterion = builder.build_loss(cfg.LOSS).cuda()

    if cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(m.parameters(), lr=cfg.TRAIN.LR)
    elif cfg.TRAIN.OPTIMIZER == 'rmsprop':
        optimizer = torch.optim.RMSprop(m.parameters(), lr=cfg.TRAIN.LR)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=cfg.TRAIN.LR_STEP, gamma=cfg.TRAIN.LR_FACTOR)

    writer = SummaryWriter('.tensorboard/{}-{}'.format(opt.exp_id, cfg.FILE_NAME))

    train_dataset = builder.build_dataset(cfg.DATASET.TRAIN, preset_cfg=cfg.DATA_PRESET, train=True)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE * num_gpu,
        shuffle=True,
        num_workers=opt.nThreads)

    heatmap_to_coord = get_func_heatmap_to_coord(cfg)

    opt.trainIters = 0

    for i in range(cfg.TRAIN.BEGIN_EPOCH, cfg.TRAIN.END_EPOCH):
        opt.epoch = i
        current_lr = optimizer.state_dict()['param_groups'][0]['lr']

        logger.info(
            f'############# Starting Epoch {opt.epoch} | LR: {current_lr} #############'
        )

        # Training
        loss, miou = train(opt, train_loader, m, criterion, optimizer, writer)
        logger.epochInfo('Train', opt.epoch, loss, miou)

        lr_scheduler.step()

        if (i + 1) % opt.snapshot == 0:
            # Save checkpoint
            torch.save(
                m.module.state_dict(),
                './exp/{}-{}/model_{}.pth'.format(opt.exp_id, cfg.FILE_NAME, opt.epoch))
            # Prediction Test
            with torch.no_grad():
                gt_AP = validate_gt(m.module, opt, cfg, heatmap_to_coord)
                rcnn_AP = validate(m.module, opt, heatmap_to_coord)
                logger.info(
                    f'##### Epoch {opt.epoch} | gt mAP: {gt_AP} | rcnn mAP: {rcnn_AP} #####'
                )

        # Time to add DPG
        if i == cfg.TRAIN.DPG_MILESTONE:
            torch.save(
                m.module.state_dict(),
                './exp/{}-{}/final.pth'.format(opt.exp_id, cfg.FILE_NAME))
            # Adjust learning rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = cfg.TRAIN.LR
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, milestones=cfg.TRAIN.DPG_STEP, gamma=0.1)
            # Reset dataset
            train_dataset = builder.build_dataset(
                cfg.DATASET.TRAIN, preset_cfg=cfg.DATA_PRESET, train=True, dpg=True)
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=cfg.TRAIN.BATCH_SIZE * num_gpu,
                shuffle=True,
                num_workers=opt.nThreads)

    torch.save(
        m.module.state_dict(),
        './exp/{}-{}/final_DPG.pth'.format(opt.exp_id, cfg.FILE_NAME))
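# --- Hypothetical sketch, not the project's actual `train()` implementation. ---
# It only illustrates the contract assumed by the call above: one full pass over
# `train_loader` that optimizes the model and returns the epoch-averaged loss and
# mIoU. The batch layout `(inps, labels, label_masks, _)` and the criterion
# signature are assumptions made for illustration only.
def train_epoch_sketch(opt, train_loader, m, criterion, optimizer, writer):
    m.train()
    loss_avg, miou_avg, n = 0.0, 0.0, 0
    for inps, labels, label_masks, _ in train_loader:
        inps = inps.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        label_masks = label_masks.cuda(non_blocking=True)

        output = m(inps)
        loss = criterion(output, labels, label_masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        opt.trainIters += 1
        writer.add_scalar('Train/loss', loss.item(), opt.trainIters)

        n += 1
        loss_avg += (loss.item() - loss_avg) / n
        # miou_avg would be updated here from a task-specific metric (omitted in this sketch).
    return loss_avg, miou_avg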
def main():
    logger.info('******************************')
    logger.info(opt)
    logger.info('******************************')
    logger.info(cfg)
    logger.info('******************************')

    # Model Initialize
    m = preset_model(cfg)
    # TODO: try replacing DataParallel with DistributedDataParallel to see if it is faster
    m = nn.DataParallel(m)
    if opt.device.type != 'cpu':
        m = m.cuda()

    criterion = builder.build_loss(cfg.LOSS)
    if opt.device.type != 'cpu':
        criterion = criterion.cuda()

    if cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(m.parameters(), lr=cfg.TRAIN.LR)
    elif cfg.TRAIN.OPTIMIZER == 'rmsprop':
        optimizer = torch.optim.RMSprop(m.parameters(), lr=cfg.TRAIN.LR)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=cfg.TRAIN.LR_STEP, gamma=cfg.TRAIN.LR_FACTOR)

    # Optionally wipe previous logs and checkpoints, then make sure the output directories exist
    if opt.clean:
        if opt.tensorboard_path.exists():
            shutil.rmtree(opt.tensorboard_path)
        if opt.experiment_path.exists():
            shutil.rmtree(opt.experiment_path)
    opt.tensorboard_path.mkdir(exist_ok=True, parents=True)
    opt.experiment_path.mkdir(exist_ok=True, parents=True)

    writer = SummaryWriter(str(opt.tensorboard_path))

    train_dataset = builder.build_dataset(cfg.DATASET.TRAIN, preset_cfg=cfg.DATA_PRESET, train=True)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE * max(1, num_gpu),
        shuffle=True,
        num_workers=opt.nThreads)

    heatmap_to_coord = get_func_heatmap_to_coord(cfg)

    opt.trainIters = 0
    # Gradient scaler for mixed-precision training; disabled on CPU, where AMP scaling is unsupported
    scaler = GradScaler(enabled=(opt.device.type != 'cpu'))

    for i in range(cfg.TRAIN.BEGIN_EPOCH, cfg.TRAIN.END_EPOCH):
        opt.epoch = i
        current_lr = optimizer.state_dict()['param_groups'][0]['lr']

        logger.info(
            f'############# Starting Epoch {opt.epoch} | LR: {current_lr} #############'
        )

        # Training
        loggers = train(opt, train_loader, m, criterion, optimizer, writer, scaler)
        logger.info(
            f'Train-{opt.epoch:d} epoch | '
            f'{" | ".join(f"{name}:{l.avg:.07f}" for name, l in loggers.items())}'
        )

        lr_scheduler.step()

        if (i + 1) % opt.snapshot == 0:
            # Save checkpoint
            torch.save(m.module.state_dict(),
                       str(opt.experiment_path / f'model_{opt.epoch}.pth'))
            # Prediction Test
            with torch.no_grad():
                metrics_on_true_box = validate_gt(m.module, opt, cfg, heatmap_to_coord)
                gt_AP = metrics_on_true_box["map"]
                gt_radius_mse = metrics_on_true_box["radius_mse"]
                rcnn_AP = validate(m.module, opt, heatmap_to_coord)
                logger.info(f'##### Epoch {opt.epoch} | '
                            f'gt mAP: {gt_AP} | '
                            f'rcnn mAP: {rcnn_AP} | '
                            f'gt radius_mse: {gt_radius_mse} #####')
                writer.add_scalar('Validation/mAP_on_gt_box', gt_AP, opt.trainIters)
                writer.add_scalar('Validation/mAP_on_pred_box', rcnn_AP, opt.trainIters)
                writer.add_scalar('Validation/radius_mse_on_gt_box', gt_radius_mse, opt.trainIters)

        # Time to add DPG
        if i == cfg.TRAIN.DPG_MILESTONE:
            torch.save(m.module.state_dict(), str(opt.experiment_path / 'final.pth'))
            # Adjust learning rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = cfg.TRAIN.LR
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, milestones=cfg.TRAIN.DPG_STEP, gamma=0.1)
            # Reset dataset with DPG augmentation enabled
            train_dataset = builder.build_dataset(
                cfg.DATASET.TRAIN, preset_cfg=cfg.DATA_PRESET, train=True, dpg=True)
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=cfg.TRAIN.BATCH_SIZE * max(1, num_gpu),
                shuffle=True,
                num_workers=opt.nThreads)

    torch.save(m.module.state_dict(), str(opt.experiment_path / 'final_DPG.pth'))
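# --- Hypothetical sketch, not the project's actual `train()` implementation. ---
# It shows how the `scaler` created in the second `main()` is typically consumed
# inside the training step with torch.cuda.amp: the forward pass runs under
# autocast, the loss is scaled before backward to avoid fp16 gradient underflow,
# and the scaler unscales gradients inside `step()` before updating. A CUDA device
# is assumed for brevity; batch layout and criterion signature are assumptions,
# as in the previous sketch. The real `train()` presumably returns a dict of
# averaging meters (name -> object with `.avg`); SimpleNamespace stands in here.
from types import SimpleNamespace

from torch.cuda.amp import autocast


def train_epoch_amp_sketch(opt, train_loader, m, criterion, optimizer, writer, scaler):
    m.train()
    loss_avg, n = 0.0, 0
    for inps, labels, label_masks, _ in train_loader:
        inps = inps.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        label_masks = label_masks.cuda(non_blocking=True)

        optimizer.zero_grad()
        # Forward pass and loss in mixed precision
        with autocast():
            output = m(inps)
            loss = criterion(output, labels, label_masks)

        # Scaled backward pass, optimizer step, and scale update
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        opt.trainIters += 1
        writer.add_scalar('Train/loss', loss.item(), opt.trainIters)

        n += 1
        loss_avg += (loss.item() - loss_avg) / n
    return {'loss': SimpleNamespace(avg=loss_avg)}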