def run_train(args):
    """Train RetinaNet with focal loss on the images under ``settings.IMG_DIR``.

    Restores weights from the first rotating checkpoint slot, wraps the model
    in ``DataParallel`` across all visible GPUs, and trains with Adam for 100
    epochs, saving into a rotating set of 5 checkpoint files every
    ``iter_save`` batches.

    Args:
        args: parsed CLI namespace; only ``args.lr`` is read here.
    """
    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    start_epoch = 0  # start from epoch 0 or last epoch

    # Data
    print('==> Preparing data..')
    # NOTE(review): `batch_size` is not defined in this function — it must be a
    # module-level global; confirm it matches the loader's actual batch size.
    trainloader = get_train_loader(img_dir=settings.IMG_DIR,
                                   batch_size=batch_size)
    print(trainloader.num)

    # Model
    net = RetinaNet()
    net.load_state_dict(torch.load('./ckps/best_0.pth'))
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
    net.cuda()

    criterion = FocalLoss()
    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    iter_save = 200       # checkpoint every `iter_save` batches
    bgtime = time.time()  # wall-clock start, for the elapsed-minutes display

    # Training
    for epoch in range(start_epoch, start_epoch + 100):
        print('\nEpoch: %d' % epoch)
        net.train()
        for batch_idx, (inputs, loc_targets, cls_targets) in enumerate(trainloader):
            inputs = Variable(inputs.cuda())
            loc_targets = Variable(loc_targets.cuda())
            cls_targets = Variable(cls_targets.cuda())

            optimizer.zero_grad()
            loc_preds, cls_preds = net(inputs)
            loss = criterion(loc_preds, loc_targets, cls_preds, cls_targets)
            loss.backward()
            optimizer.step()

            # FIX: `loss.data[0]` was removed in PyTorch >= 0.5; use .item(),
            # which is also what the rest of this file uses.
            batch_loss = loss.item()
            sample_num = (batch_idx + 1) * batch_size
            avg_loss = running_loss(batch_loss)
            print(
                'Epoch: {}, num: {}/{} train_loss: {:.3f} | run_loss: {:.3f} min: {:.1f}'
                .format(epoch, sample_num, trainloader.num, batch_loss,
                        avg_loss, (time.time() - bgtime) / 60),
                end='\r')
            if batch_idx % iter_save == 0:
                # Rotate over 5 slots so disk usage stays bounded.
                torch.save(
                    net.module.state_dict(),
                    './ckps/best_{}.pth'.format(batch_idx // iter_save % 5))
                log.info('batch: {}, loss: {:.4f}'.format(batch_idx, avg_loss))
} # ckpt_path = os.path.join('ckpts', args.exp) ckpt_path = '.store' if not os.path.isdir(ckpt_path): os.makedirs(ckpt_path) torch.save(state, os.path.join(ckpt_path, 'ckpt.pth')) best_loss = loss for epoch in range(start_epoch + 1, start_epoch + cfg.num_epochs + 1): if epoch in cfg.lr_decay_epochs: lr *= 0.1 for param_group in optimizer.param_groups: param_group['lr'] = lr print('\nTrain Epoch: %d' % epoch) net.train() train_loss = 0 for batch_idx, (inputs, loc_targets, cls_targets) in enumerate(trainloader): # print(np.any(np.isnan(inputs.numpy()))) # print(np.any(np.isnan(loc_targets.numpy()))) # print(np.any(np.isnan(loc_targets.numpy()))) # ipdb.set_trace() pos = cls_targets > 0 # pos1=cls_targets ==0 # pos2=cls_targets ==-1 print(pos.data.long().sum()) inputs = Variable(inputs.cuda()) loc_targets = Variable(loc_targets.cuda())
def train():
    """Entry point: train a RetinaNet text detector.

    Parses CLI arguments, builds the dataset/loader, restores pretrained
    (and optionally resumed) weights, then runs an iteration-counted
    training loop with TensorBoard logging, periodic checkpointing,
    milestone LR decay, and an optional asynchronous evaluation subprocess.
    """
    args = parse_args()
    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    # Only the focal-loss path is usable; OHEM + CE is known-broken here.
    assert args.focal_loss, "OHEM + ce_loss is not working... :("

    if not os.path.exists(args.save_folder):
        # NOTE(review): os.mkdir fails when parent dirs are missing — verify
        # these are always single-level paths (os.makedirs would be safer).
        os.mkdir(args.save_folder)
    if not os.path.exists(args.logdir):
        os.mkdir(args.logdir)

    ###########################################################################
    # Data
    ###########################################################################
    print('==> Preparing data..')
    # NOTE(review): hard-coded local dataset root — machine-specific.
    trainset = ListDataset(root='/mnt/9C5E1A4D5E1A2116/datasets/',
                           dataset=args.dataset, train=True,
                           transform=Augmentation_traininig,
                           input_size=args.input_size,
                           multi_scale=args.multi_scale)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              collate_fn=trainset.collate_fn)

    ###########################################################################
    # Training detail options
    ###########################################################################
    # LR-decay milestones (in iterations); SynthText gets a longer schedule.
    stepvalues = (10000, 20000, 30000, 40000, 50000) if args.dataset in ["SynthText"] \
        else (2000, 4000, 6000, 8000, 10000)
    best_loss = float('inf')  # best test loss (currently unused below)
    start_epoch = 0  # start from epoch 0 or last epoch
    iteration = 0
    cur_lr = args.lr
    # ImageNet channel stats, used below to un-normalize images for TensorBoard.
    mean = (0.485, 0.456, 0.406)
    var = (0.229, 0.224, 0.225)
    step_index = 0
    pEval = None  # handle of the async evaluation subprocess (None = not running)

    ###########################################################################
    # Model
    ###########################################################################
    # set model (focal_loss vs OHEM_CE loss)
    if args.focal_loss:
        imagenet_pretrain = 'weights/retinanet_se50.pth'
        criterion = FocalLoss()
        num_classes = 1
    else:
        imagenet_pretrain = 'weights/retinanet_se50_OHEM.pth'
        criterion = OHEM_loss()
        num_classes = 2

    net = RetinaNet(num_classes)
    # Restore pretrained backbone weights.
    net.load_state_dict(torch.load(imagenet_pretrain))

    if args.resume:
        print('==> Resuming from checkpoint..', args.resume)
        checkpoint = torch.load(args.resume)
        net.load_state_dict(checkpoint['net'])
        # NOTE(review): resuming restores weights only — epoch/iteration/lr
        # restoration is deliberately (?) disabled below; confirm intent.
        #start_epoch = checkpoint['epoch']
        #iteration = checkpoint['iteration']
        #cur_lr = checkpoint['lr']
        #step_index = checkpoint['step_index']
        # optimizer.load_state_dict(state["optimizer"])

    print("multi_scale : ", args.multi_scale)
    print("input_size : ", args.input_size)
    print("stepvalues : ", stepvalues)
    print("start_epoch : ", start_epoch)
    print("iteration : ", iteration)
    print("cur_lr : ", cur_lr)
    print("step_index : ", step_index)
    print("num_gpus : ", torch.cuda.device_count())

    # Data parellelism for multi-gpu training
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
    net.cuda()

    # Put model in training mode and freeze batch norm.
    net.train()
    net.module.freeze_bn()  # you must freeze batchnorm

    ###########################################################################
    # Optimizer
    ###########################################################################
    optimizer = optim.SGD(net.parameters(), lr=cur_lr, momentum=0.9,
                          weight_decay=1e-4)
    #optimizer = optim.Adam(net.parameters(), lr=cur_lr)

    ###########################################################################
    # Utils
    ###########################################################################
    encoder = DataEncoder()
    writer = SummaryWriter(log_dir=args.logdir)

    ###########################################################################
    # Training loop
    ###########################################################################
    t0 = time.time()
    # The outer epoch bound is effectively infinite; `args.max_iter` is the
    # real stopping criterion (checked here and inside the batch loop).
    for epoch in range(start_epoch, 10000):
        if iteration > args.max_iter:
            break

        for inputs, loc_targets, cls_targets in trainloader:
            inputs = Variable(inputs.cuda())
            loc_targets = Variable(loc_targets.cuda())
            cls_targets = Variable(cls_targets.cuda())

            optimizer.zero_grad()
            loc_preds, cls_preds = net(inputs)

            loc_loss, cls_loss = criterion(loc_preds, loc_targets, cls_preds,
                                           cls_targets)
            loss = loc_loss + cls_loss
            loss.backward()
            optimizer.step()

            # Every 20 iterations: console log + TensorBoard scalars/images.
            if iteration % 20 == 0:
                t1 = time.time()

                print('iter ' + repr(iteration) + ' (epoch ' + repr(epoch) +
                      ') || loss: %.4f || l loc_loss: %.4f || l cls_loss: %.4f (Time : %.1f)'
                      % (loss.sum().item(), loc_loss.sum().item(),
                         cls_loss.sum().item(), (t1 - t0)))
                # t0 = time.time()

                # .sum() collapses per-GPU losses under DataParallel.
                writer.add_scalar('loc_loss', loc_loss.sum().item(), iteration)
                writer.add_scalar('cls_loss', cls_loss.sum().item(), iteration)
                writer.add_scalar('loss', loss.sum().item(), iteration)

                # show inference image in tensorboard:
                # CHW -> HWC, then undo the (x - mean) / var normalization.
                infer_img = np.transpose(inputs[0].cpu().numpy(), (1, 2, 0))
                infer_img *= var
                infer_img += mean
                infer_img *= 255.
                infer_img = np.clip(infer_img, 0, 255)
                infer_img = infer_img.astype(np.uint8)
                h, w, _ = infer_img.shape

                # labels/scores are unused here; only boxes are drawn.
                boxes, labels, scores = encoder.decode(loc_preds[0],
                                                       cls_preds[0], (w, h))
                # 4-point polygons: (N, 4, 2) int32 for cv2.polylines.
                boxes = boxes.reshape(-1, 4, 2).astype(np.int32)

                if boxes.shape[0] != 0:
                    # .copy() presumably works around cv2 rejecting
                    # non-contiguous input — TODO confirm it is still needed.
                    infer_img = cv2.polylines(infer_img.copy(), boxes, True,
                                              (0, 255, 0), 4)

                writer.add_image('image', infer_img, iteration,
                                 dataformats="HWC")
                writer.add_scalar('input_size', h, iteration)
                writer.add_scalar('learning_rate', cur_lr, iteration)

                t0 = time.time()

            # Periodic full training-state checkpoint (skip iteration 0).
            if iteration % args.save_interval == 0 and iteration > 0:
                print('Saving state, iter : ', iteration)
                state = {
                    'net': net.module.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    'iteration': iteration,
                    'epoch': epoch,
                    'lr': cur_lr,
                    'step_index': step_index
                }
                model_file = args.save_folder + \
                    'ckpt_' + repr(iteration) + '.pth'
                torch.save(state, model_file)

            # Milestone LR decay.
            if iteration in stepvalues:
                step_index += 1
                cur_lr = adjust_learning_rate(cur_lr, optimizer, args.gamma,
                                              step_index)

            if iteration > args.max_iter:
                break

            # Optional async evaluation: launch eval.py on another GPU and,
            # once it finishes, harvest the hmean from its stdout.
            if args.evaluation and iteration % args.eval_step == 0:
                try:
                    if pEval is None:
                        print("Evaluation started at iteration {} on IC15...".
                              format(iteration))
                        eval_cmd = "CUDA_VISIBLE_DEVICES=" + str(args.eval_device) + \
                            " python eval.py" + \
                            " --tune_from=" + args.save_folder + 'ckpt_' + repr(iteration) + '.pth' + \
                            " --input_size=1024" + \
                            " --output_zip=result_temp1"

                        pEval = Popen(eval_cmd, shell=True, stdout=PIPE,
                                      stderr=PIPE)

                    elif pEval.poll() is not None:
                        # Child finished: parse the hmean out of its stdout.
                        (scorestring, stderrdata) = pEval.communicate()
                        # NOTE(review): fragile positional parse — assumes
                        # eval.py prints a dict-like line where field #3 is
                        # `hmean`; verify against eval.py's output format.
                        hmean = float(
                            str(scorestring).strip().split(":")[3].split(",")
                            [0].split("}")[0].strip())

                        writer.add_scalar('test_hmean', hmean, iteration)
                        print("test_hmean for {}-th iter : {:.4f}".format(
                            iteration, hmean))

                        if pEval is not None:
                            pEval.kill()
                        pEval = None

                except Exception as e:
                    print("exception happened in evaluation ", e)
                    if pEval is not None:
                        pEval.kill()
                    pEval = None

            iteration += 1