def main():
    """Train a RetinaNet model from `./model/net.pth` for `args.train_epoch` epochs.

    Relies on module-level `args` (expects `args.model`, `args.lr`,
    `args.train_epoch`), plus `train` and `save_model` helpers defined
    elsewhere in this file — TODO confirm their signatures at the call sites.
    """
    # assert torch.cuda.is_available(), 'Error: CUDA not found!'
    best_loss = float('inf')  # best test loss  (NOTE(review): never updated in this function)
    start_epoch = 0  # start from epoch 0 or last epoch
    save_model_path = args.model  # destination passed to save_model() each epoch

    # Data: ImageNet mean/std normalization applied after ToTensor.
    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Model: load pretrained weights BEFORE wrapping in DataParallel so the
    # state-dict keys match (no 'module.' prefix yet).
    net = RetinaNet()
    net.load_state_dict(torch.load('./model/net.pth'))
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    net.cuda()

    criterion = FocalLoss()
    # optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4)
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                          weight_decay=1e-4)

    # Epochs are reported 1-based to train()/save_model().
    for epoch in range(start_epoch, start_epoch + args.train_epoch):
        train(epoch + 1, transform, net, optimizer, criterion)
        save_model(epoch + 1, save_model_path, net, optimizer)
def main():
    """Evaluate a checkpointed RetinaNet on the test set.

    Loads `./checkpoint/{args.load_model_epoch}_ckpt.pth`, restores both the
    network and optimizer state, and hands everything to `test()` (defined
    elsewhere in this file).
    """
    # Fix: original message read '==> chooseing data..' (typo); use the same
    # wording as the sibling entry points.
    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Model
    net = RetinaNet()
    criterion = FocalLoss()
    # optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4)
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9,
                          weight_decay=1e-4)

    # Checkpoint selection is driven by the epoch number on the CLI.
    load_model_epoch = args.load_model_epoch
    checkpoint = torch.load(
        './checkpoint/{}_ckpt.pth'.format(load_model_epoch))  # max_epoch

    # Wrap in DataParallel BEFORE loading the state dict: the checkpoint's
    # 'net' keys presumably carry the 'module.' prefix — TODO confirm against
    # the code that saved it.
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    net.cuda()
    net.load_state_dict(checkpoint['net'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint['epoch']

    test(start_epoch, transform, net, criterion, optimizer)
def run_train(args):
    """Fine-tune RetinaNet with Adam, checkpointing every `iter_save` batches.

    Uses module-level `batch_size`, `settings`, `running_loss`, `log`, and
    `time` — all defined elsewhere in the file.
    """
    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    best_loss = float('inf')  # best test loss  (NOTE(review): never updated here)
    start_epoch = 0  # start from epoch 0 or last epoch

    # Data
    print('==> Preparing data..')
    trainloader = get_train_loader(img_dir=settings.IMG_DIR,
                                   batch_size=batch_size)
    #trainloader = get_small_train_loader()
    print(trainloader.num)
    #testloader = get_train_loader(img_dir=settings.IMG_DIR)

    # Model: resume from the rotating checkpoint slot 0 (written below).
    net = RetinaNet()
    #net.load_state_dict(torch.load('./model/net.pth'))
    net.load_state_dict(torch.load('./ckps/best_0.pth'))
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    net.cuda()

    criterion = FocalLoss()
    #optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4)
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
    iter_save = 200  # checkpoint/log every 200 batches
    bgtime = time.time()

    # Training
    for epoch in range(start_epoch, start_epoch + 100):
        print('\nEpoch: %d' % epoch)
        net.train()
        #net.module.freeze_bn()
        train_loss = 0  # NOTE(review): accumulation is commented out below
        for batch_idx, (inputs, loc_targets, cls_targets) in enumerate(trainloader):
            inputs = Variable(inputs.cuda())
            loc_targets = Variable(loc_targets.cuda())
            cls_targets = Variable(cls_targets.cuda())

            optimizer.zero_grad()
            loc_preds, cls_preds = net(inputs)
            loss = criterion(loc_preds, loc_targets, cls_preds, cls_targets)
            loss.backward()
            optimizer.step()
            #train_loss += loss.data[0]

            # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch idiom; on
            # modern versions this raises — would be `loss.item()`.
            sample_num = (batch_idx + 1) * batch_size
            avg_loss = running_loss(loss.data[0])
            print(
                'Epoch: {}, num: {}/{} train_loss: {:.3f} | run_loss: {:.3f} min: {:.1f}'
                .format(epoch, sample_num, trainloader.num, loss.data[0],
                        avg_loss, (time.time() - bgtime) / 60),
                end='\r')

            # Rotate through 5 checkpoint slots so recent history is kept.
            if batch_idx % iter_save == 0:
                torch.save(
                    net.module.state_dict(),
                    './ckps/best_{}.pth'.format(batch_idx // iter_save % 5))
                log.info('batch: {}, loss: {:.4f}'.format(batch_idx, avg_loss))
def load_model(backbone):
    """Build a RetinaNet, restore checkpointed weights, and return it on GPU.

    The checkpoint at model/restnet101_8K.pth is expected to hold a 'net'
    entry whose keys match the DataParallel-wrapped module.
    """
    print('loading model...')
    checkpoint = torch.load(os.path.join('model', 'restnet101_8K.pth'))

    num_classes = len(cfg.classes)
    net = RetinaNet(backbone=backbone, num_classes=num_classes)

    # Wrap for multi-GPU before restoring, so state-dict keys line up.
    gpu_ids = range(torch.cuda.device_count())
    net = torch.nn.DataParallel(net, device_ids=gpu_ids)
    net.cuda()

    cudnn.benchmark = True  # autotune conv kernels for fixed input sizes
    net.load_state_dict(checkpoint['net'])
    return net
def train(total_epochs=1, interval=100, resume=False, ckpt_path=''):
    """Train RetinaNet on OpenImages, optionally resuming from a checkpoint.

    Args:
        total_epochs: number of epochs to run (inclusive upper bound of loop).
        interval: logging interval forwarded to `train_one_epoch`.
        resume: if True, restore model/optimizer/epoch from `ckpt_path`.
        ckpt_path: checkpoint file used when `resume` is set.
    """
    print("Loading training dataset...")
    train_dset = OpenImagesDataset(root='./data/train',
                                   list_file='./data/tmp/train_images_bbox.csv',
                                   transform=transform, train=True,
                                   input_size=600)
    train_loader = data.DataLoader(train_dset, batch_size=4, shuffle=True,
                                   num_workers=4,
                                   collate_fn=train_dset.collate_fn)
    print("Loading completed.")
    #val_dset = OpenImagesDataset(root='./data/train',
    #    list_file='./data/tmp/train_images_bbox.csv', train=False, transform=transform, input_size=600)
    #val_loader = torch.utils.data.DataLoader(val_dset, batch_size=1, shuffle=False, num_workers=4, collate_fn=val_dset.collate_fn)

    net = RetinaNet()
    net.load_state_dict(torch.load('./model/net.pth'))
    criterion = FocalLoss()
    net.cuda()
    criterion.cuda()
    optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9,
                          weight_decay=1e-4)

    best_val_loss = 1000
    start_epoch = 0
    if resume:
        if os.path.isfile(ckpt_path):
            print(f'Loading from the checkpoint {ckpt_path}')
            checkpoint = torch.load(ckpt_path)
            start_epoch = checkpoint['epoch']
            best_val_loss = checkpoint['best_val_loss']
            net.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print(f'Loaded checkpoint {ckpt_path}, epoch : {start_epoch}')
        else:
            print(f'No check point found at the path {ckpt_path}')

    for epoch in range(start_epoch, total_epochs):
        train_one_epoch(train_loader, net, criterion, optimizer, epoch,
                        interval)
        # NOTE(review): validation is disabled, so val_loss stays 0 and the
        # `val_loss < best_val_loss` test passes every epoch — a "best"
        # checkpoint is therefore written unconditionally.
        val_loss = 0
        #val_loss = validate(val_loader, net, criterion, interval)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'best_val_loss': best_val_loss,
                'optimizer': optimizer.state_dict()
            }, is_best=True)
def test_retinanet():
    """Smoke-test RetinaNet: time one forward pass and print output sizes.

    Fixes two defects in the original:
    - the input tensor was left on the CPU while the net was moved to CUDA,
      so `net(x)` raised a device-mismatch error;
    - `Variable(..., volatile=True)` is the removed pre-0.4 inference flag;
      `torch.no_grad()` is the supported equivalent.
    """
    net = RetinaNet(classes=80)
    net.cuda()
    # Random 500x500 RGB batch of 1, on the same device as the model.
    x = torch.rand(1, 3, 500, 500).cuda()
    now = time.time()
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        predictions = net(x)
    later = time.time()
    print(later - now)
    for prediction in predictions:
        print(prediction.size())
def evaluate_threshold(img_ids, cls_threshold, bbox_dict): dloader = get_test_loader(img_ids, img_dir=settings.IMG_DIR, batch_size=batch_size) # Model net = RetinaNet() net.load_state_dict(torch.load(CKP_FILE)) #net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count())) net.cuda() net.eval() bgtime = time.time() encoder = DataEncoder() encoder.class_threshold = cls_threshold true_objects_num = 0 pred_objects_num = 0 for batch_idx, inputs in enumerate(dloader): inputs = Variable(inputs.cuda()) loc_preds, cls_preds = net(inputs) for i in range(len(loc_preds)): boxes, labels, scores = encoder.decode( loc_preds[i].data, cls_preds[i].data, (settings.IMG_SZ, settings.IMG_SZ)) pred_objects_num += len(scores) for img_idx in range(len(inputs)): img_id = dloader.img_ids[batch_idx * batch_size + img_idx] if img_id in bbox_dict: true_objects_num += len(bbox_dict[img_id]) print('{} / {}, {} / {}, {:.4f}, {:.2f} min'.format( batch_size * (batch_idx + 1), dloader.num, pred_objects_num, true_objects_num, cls_threshold, (time.time() - bgtime) / 60), end='\r') print('\n') print('=>>> {}/{}, {}, {:.4f}\n'.format( pred_objects_num, true_objects_num, pred_objects_num - true_objects_num, cls_threshold))
def predict():
    """Run inference on the test set and write a Kaggle-style sub7.csv.

    Uses module-level `batch_size`, `settings`, `CKP_FILE`, and the
    `_get_prediction_string` helper to format each image's detections.
    """
    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    print('==> Preparing data..')
    dloader = get_test_loader(get_test_ids(), img_dir=settings.TEST_IMG_DIR,
                              batch_size=batch_size)
    print(dloader.num)

    # Model
    net = RetinaNet()
    net.load_state_dict(torch.load(CKP_FILE))
    #net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    net.cuda()
    bgtime = time.time()
    encoder = DataEncoder()
    net.eval()

    prediction_strings = []
    for batch_idx, inputs in enumerate(dloader):
        inputs = Variable(inputs.cuda())
        loc_preds, cls_preds = net(inputs)
        print('{} / {} {:.2f}'.format(batch_size * (batch_idx + 1),
                                      dloader.num,
                                      (time.time() - bgtime) / 60),
              end='\r')
        # One prediction string per image, in loader order — this order must
        # match dloader.img_ids for the DataFrame below to line up.
        for i in range(len(loc_preds)):
            boxes, labels, scores = encoder.decode(
                loc_preds[i].data, cls_preds[i].data,
                (settings.IMG_SZ, settings.IMG_SZ))
            prediction_strings.append(
                _get_prediction_string(boxes, labels, scores))

    print(len(prediction_strings))
    print(prediction_strings[:3])
    submission = pd.DataFrame({
        'ImageId': dloader.img_ids,
        'PredictionString': prediction_strings
    })
    submission.to_csv('sub7.csv', index=False)
def train_obj():
    """Set up RetinaNet + FocalLoss + SGD and delegate training to ObjDetTrainer.

    NOTE(review): depends on module-level `args`, `trainloader`, `valloader`,
    `epochs`, and `device` — confirm these are defined before this is called.
    """
    # Model
    net = RetinaNet()
    net = torch.nn.DataParallel(net, device_ids=range(
        torch.cuda.device_count()))
    net.cuda()
    criterion = FocalLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                          weight_decay=1e-4)
    # Polynomial decay (power 0.9) over the full training schedule; `x` is
    # the step count the scheduler passes to the lambda.
    scheduler_obj = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda x: (1 - x / (len(trainloader) * epochs))**0.9)
    obj_trainer = ObjDetTrainer(net, criterion, optimizer, scheduler_obj,
                                trainloader, valloader, device)
    obj_trainer.train(epochs, True)
shuffle=False, num_workers=8, collate_fn=testset.collate_fn) # Model net = RetinaNet(num_classes=15) net.load_state_dict(torch.load('./model/dota_15c_9ma.pth')) if resume: print('==> Resuming from checkpoint..') checkpoint = torch.load('./checkpoint/ckpt.pth') net.load_state_dict(checkpoint['net']) best_loss = checkpoint['loss'] start_epoch = checkpoint['epoch'] net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count())) net.cuda() if fix == 'head': for param in net.module.fpn.conv1.parameters(): param.requires_grad = False for param in net.module.fpn.bn1.parameters(): param.requires_grad = False for param in net.module.fpn.layer1.parameters(): param.requires_grad = False for param in net.module.fpn.layer2.parameters(): param.requires_grad = False for param in net.module.fpn.layer3.parameters(): param.requires_grad = False for param in net.module.fpn.layer4.parameters(): param.requires_grad = False else:
transform=transform, input_size=512) testloader = torch.utils.data.DataLoader(testset, batch_size=batch_sizes, shuffle=False, num_workers=1, collate_fn=testset.collate_fn) encoder = DataEncoder() print('Loading model..') net = RetinaNet() net.load_state_dict(torch.load('checkpoint/ckpt.pth')['net']) net.eval() net = net.cuda() the_classes = [c.strip() for c in open('data/voc.names').readlines()] w = h = 512 def test(): the_det_file = {} forward_tlist = [] _t = {'im_detect': Timer()} for batch_idx, (inputs, loc_targets, cls_targets, fname) in enumerate(testloader): _t['im_detect'].tic() inputs = inputs.cuda()
print('==> Resuming from checkpoint..') checkpoint = torch.load( '/media/Darius/shayeree/mixed_precision/training/checkpoint_list_wise/albertsons/Retina50ProdB1/ckpt_0010_17696.4766.pth' ) net.load_state_dict(checkpoint['net']) best_loss = checkpoint['loss'] start_epoch = checkpoint['epoch'] else: params_dict = torch.load('./model/{:s}.pth'.format(args.net)) net_dict = net.fpn.state_dict() params_dict = {k: v for k, v in params_dict.items() if k in net_dict} net_dict.update(params_dict) net.fpn.load_state_dict(net_dict) #net,optimizer=amp.initialize(net,optimizer,opt_level='01',loss_scale="dynamic") #net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count())) net.cuda() criterion = FocalLoss(num_classes=1) optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4) #amp_handle = amp.init(enabled=True, verbose=True) #optimizer = amp_handle.wrap_optimizer(optimizer) opt_level = 'O1' net, optimizer = amp.initialize(net.cuda(), optimizer, opt_level=opt_level, loss_scale="dynamic") #net,optimizer=amp.initialize(net.cuda(),optimizer,opt_level='O1',loss_scale="dynamic")
def train():
    """Iteration-driven text-detection training loop (SynthText/IC15 style).

    Parses CLI args, builds the dataset/model/optimizer, then trains until
    `args.max_iter`, logging to TensorBoard, saving periodic checkpoints, and
    optionally launching an asynchronous eval.py subprocess.

    NOTE(review): reconstructed from whitespace-collapsed source — block
    nesting (especially the TensorBoard/eval sections) should be confirmed
    against the original file.
    """
    args = parse_args()

    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    assert args.focal_loss, "OHEM + ce_loss is not working... :("

    if not os.path.exists(args.save_folder):
        os.mkdir(args.save_folder)
    if not os.path.exists(args.logdir):
        os.mkdir(args.logdir)

    ###########################################################################
    # Data
    ###########################################################################
    print('==> Preparing data..')
    trainset = ListDataset(root='/mnt/9C5E1A4D5E1A2116/datasets/',
                           dataset=args.dataset, train=True,
                           transform=Augmentation_traininig,
                           input_size=args.input_size,
                           multi_scale=args.multi_scale)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              collate_fn=trainset.collate_fn)

    ###########################################################################
    # Training Detail option\
    # LR-decay milestones: longer schedule for SynthText pre-training.
    stepvalues = (10000, 20000, 30000, 40000, 50000) if args.dataset in ["SynthText"] \
        else (2000, 4000, 6000, 8000, 10000)
    best_loss = float('inf')  # best test loss
    start_epoch = 0  # start from epoch 0 or last epoch
    iteration = 0
    cur_lr = args.lr
    mean = (0.485, 0.456, 0.406)  # ImageNet mean/std, used to un-normalize
    var = (0.229, 0.224, 0.225)   # images for TensorBoard visualization
    step_index = 0
    pEval = None  # handle of the async evaluation subprocess, if any

    ###########################################################################
    # Model
    ###########################################################################
    # set model (focal_loss vs OHEM_CE loss)
    if args.focal_loss:
        imagenet_pretrain = 'weights/retinanet_se50.pth'
        criterion = FocalLoss()
        num_classes = 1
    else:
        imagenet_pretrain = 'weights/retinanet_se50_OHEM.pth'
        criterion = OHEM_loss()
        num_classes = 2

    net = RetinaNet(num_classes)

    # Restore model weights
    net.load_state_dict(torch.load(imagenet_pretrain))

    if args.resume:
        print('==> Resuming from checkpoint..', args.resume)
        checkpoint = torch.load(args.resume)
        net.load_state_dict(checkpoint['net'])
        #start_epoch = checkpoint['epoch']
        #iteration = checkpoint['iteration']
        #cur_lr = checkpoint['lr']
        #step_index = checkpoint['step_index']
        # optimizer.load_state_dict(state["optimizer"])

    print("multi_scale : ", args.multi_scale)
    print("input_size : ", args.input_size)
    print("stepvalues : ", stepvalues)
    print("start_epoch : ", start_epoch)
    print("iteration : ", iteration)
    print("cur_lr : ", cur_lr)
    print("step_index : ", step_index)
    print("num_gpus : ", torch.cuda.device_count())

    # Data parellelism for multi-gpu training
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
    net.cuda()

    # Put model in training mode and freeze batch norm.
    net.train()
    net.module.freeze_bn()  # you must freeze batchnorm

    ###########################################################################
    # Optimizer
    ###########################################################################
    optimizer = optim.SGD(net.parameters(), lr=cur_lr, momentum=0.9,
                          weight_decay=1e-4)
    #optimizer = optim.Adam(net.parameters(), lr=cur_lr)

    ###########################################################################
    # Utils
    ###########################################################################
    encoder = DataEncoder()
    writer = SummaryWriter(log_dir=args.logdir)

    ###########################################################################
    # Training loop
    ###########################################################################
    t0 = time.time()
    for epoch in range(start_epoch, 10000):
        if iteration > args.max_iter:
            break
        for inputs, loc_targets, cls_targets in trainloader:
            inputs = Variable(inputs.cuda())
            loc_targets = Variable(loc_targets.cuda())
            cls_targets = Variable(cls_targets.cuda())

            optimizer.zero_grad()
            loc_preds, cls_preds = net(inputs)

            loc_loss, cls_loss = criterion(loc_preds, loc_targets, cls_preds,
                                           cls_targets)
            loss = loc_loss + cls_loss
            loss.backward()
            optimizer.step()

            # Periodic console + TensorBoard logging.
            if iteration % 20 == 0:
                t1 = time.time()
                print(
                    'iter ' + repr(iteration) + ' (epoch ' + repr(epoch) +
                    ') || loss: %.4f || l loc_loss: %.4f || l cls_loss: %.4f (Time : %.1f)'
                    % (loss.sum().item(), loc_loss.sum().item(),
                       cls_loss.sum().item(), (t1 - t0)))
                # t0 = time.time()
                writer.add_scalar('loc_loss', loc_loss.sum().item(), iteration)
                writer.add_scalar('cls_loss', cls_loss.sum().item(), iteration)
                writer.add_scalar('loss', loss.sum().item(), iteration)

                # show inference image in tensorboard
                # (un-normalize the first image of the batch back to uint8 RGB)
                infer_img = np.transpose(inputs[0].cpu().numpy(), (1, 2, 0))
                infer_img *= var
                infer_img += mean
                infer_img *= 255.
                infer_img = np.clip(infer_img, 0, 255)
                infer_img = infer_img.astype(np.uint8)
                h, w, _ = infer_img.shape
                boxes, labels, scores = encoder.decode(loc_preds[0],
                                                       cls_preds[0], (w, h))
                boxes = boxes.reshape(-1, 4, 2).astype(np.int32)
                if boxes.shape[0] != 0:
                    # infer_img = infer_img/np.float32(255)
                    # print(boxes)
                    # print(
                    # f"infer_img prior to cv2.polylines - dtype: {infer_img.dtype}, shape: {infer_img.shape}, min: {infer_img.min()}, max: {infer_img.max()}")
                    # print(
                    # f"boxes prior to cv2.polylines - dtype: {boxes.dtype}, shape: {boxes.shape}, min: {boxes.min()}, max: {boxes.max()}")
                    infer_img = cv2.polylines(infer_img.copy(), boxes, True,
                                              (0, 255, 0), 4)
                    # print(
                    # f"infer_img - dtype: {infer_img.dtype}, shape: {infer_img.shape}, min: {infer_img.min()}, max: {infer_img.max()}")

                writer.add_image('image', infer_img, iteration,
                                 dataformats="HWC")
                writer.add_scalar('input_size', h, iteration)
                writer.add_scalar('learning_rate', cur_lr, iteration)

                t0 = time.time()

            # Periodic checkpointing (skips iteration 0).
            if iteration % args.save_interval == 0 and iteration > 0:
                print('Saving state, iter : ', iteration)
                state = {
                    'net': net.module.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    'iteration': iteration,
                    'epoch': epoch,
                    'lr': cur_lr,
                    'step_index': step_index
                }
                model_file = args.save_folder + \
                    'ckpt_' + repr(iteration) + '.pth'
                torch.save(state, model_file)

            # Step the LR schedule at the configured milestones.
            if iteration in stepvalues:
                step_index += 1
                cur_lr = adjust_learning_rate(cur_lr, optimizer, args.gamma,
                                              step_index)

            if iteration > args.max_iter:
                break

            # Asynchronous evaluation: launch eval.py once, then harvest its
            # hmean score when the subprocess finishes.
            if args.evaluation and iteration % args.eval_step == 0:
                try:
                    if pEval is None:
                        print("Evaluation started at iteration {} on IC15...".
                              format(iteration))
                        eval_cmd = "CUDA_VISIBLE_DEVICES=" + str(args.eval_device) + \
                            " python eval.py" + \
                            " --tune_from=" + args.save_folder + 'ckpt_' + repr(iteration) + '.pth' + \
                            " --input_size=1024" + \
                            " --output_zip=result_temp1"
                        pEval = Popen(eval_cmd, shell=True, stdout=PIPE,
                                      stderr=PIPE)
                    elif pEval.poll() is not None:
                        (scorestring, stderrdata) = pEval.communicate()
                        # hmean is scraped out of eval.py's printed dict.
                        hmean = float(
                            str(scorestring).strip().split(":")[3].split(",")
                            [0].split("}")[0].strip())
                        writer.add_scalar('test_hmean', hmean, iteration)
                        print("test_hmean for {}-th iter : {:.4f}".format(
                            iteration, hmean))
                        if pEval is not None:
                            pEval.kill()
                            pEval = None
                except Exception as e:
                    print("exception happened in evaluation ", e)
                    if pEval is not None:
                        pEval.kill()
                        pEval = None

            iteration += 1
testset = ListDataset(root='/search/odin/liukuang/data/voc_all_images', list_file='./data/voc12_val.txt', train=False, transform=transform, input_size=600) testloader = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=False, num_workers=8, collate_fn=testset.collate_fn) # Model net = RetinaNet() net.load_state_dict(torch.load('./model/net.pth')) if args.resume: print('==> Resuming from checkpoint..') checkpoint = torch.load('./checkpoint/ckpt.pth') net.load_state_dict(checkpoint['net']) best_loss = checkpoint['loss'] start_epoch = checkpoint['epoch'] net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count())) net.cuda() criterion = FocalLoss() optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4) # Training def train(epoch): print('\nEpoch: %d' % epoch) net.train() net.module.freeze_bn() train_loss = 0 for batch_idx, (inputs, loc_targets, cls_targets) in enumerate(trainloader): inputs = Variable(inputs.cuda()) loc_targets = Variable(loc_targets.cuda()) cls_targets = Variable(cls_targets.cuda())