def test(opt): """model configuration""" if "CTC" in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.sos_token_index = converter.dict["[SOS]"] opt.eos_token_index = converter.dict["[EOS]"] opt.num_class = len(converter.character) model = Model(opt) print( "model input parameters", opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction, ) model = torch.nn.DataParallel(model).to(device) # load model print("loading pretrained model from %s" % opt.saved_model) try: model.load_state_dict(torch.load(opt.saved_model, map_location=device)) except: print( "*** pretrained model not match strictly *** and thus load_state_dict with strict=False mode" ) # pretrained_state_dict = torch.load(opt.saved_model) # for name in pretrained_state_dict: # print(name) model.load_state_dict(torch.load(opt.saved_model, map_location=device), strict=False) opt.exp_name = "_".join(opt.saved_model.split("/")[1:]) # print(model) """ keep evaluation model and result logs """ os.makedirs(f"./result/{opt.exp_name}", exist_ok=True) # os.system(f'cp {opt.saved_model} ./result/{opt.exp_name}/') """ setup loss """ if "CTC" in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: # ignore [PAD] token criterion = torch.nn.CrossEntropyLoss( ignore_index=converter.dict["[PAD]"]).to(device) """ evaluation """ model.eval() with torch.no_grad(): if ( opt.eval_type ): # evaluate 6 benchmark evaluation datasets or 7 additionally collected evaluation datasets benchmark_all_eval(model, criterion, converter, opt) else: log = open(f"./result/{opt.exp_name}/log_evaluation.txt", "a") AlignCollate_eval = AlignCollate(opt, mode="test") eval_data, eval_data_log = hierarchical_dataset(root=opt.eval_data, opt=opt, mode="test") eval_loader = torch.utils.data.DataLoader( eval_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_eval, pin_memory=True, ) _, score_by_best_model, _, _, _, _, _ = validation( model, criterion, eval_loader, converter, opt) log.write(eval_data_log) print(f"{score_by_best_model:0.2f}") log.write(f"{score_by_best_model:0.2f}\n") log.close()
def demoToTxt1(image_folder, saved_model, txtFile): # sensitive parser = argparse.ArgumentParser() parser.add_argument('--image_folder', default=image_folder, help='path to image_folder which contains text images') parser.add_argument('--workers', type=int, help='number of data loading workers', default=4) parser.add_argument('--batch_size', type=int, default=100, help='input batch size') parser.add_argument('--saved_model', default=saved_model, help="path to saved_model to evaluation") """ Data processing """ parser.add_argument('--batch_max_length', type=int, default=20, help='maximum-label-length') parser.add_argument('--imgH', type=int, default=32, help='the height of the input image') parser.add_argument('--imgW', type=int, default=100, help='the width of the input image') parser.add_argument('--rgb', action='store_true', help='use rgb input') parser.add_argument('--character', type=str, default='0123456789', help='character label') parser.add_argument('--sensitive', default=True, help='for sensitive character mode') parser.add_argument('--PAD', default=False, action='store_true', help='whether to keep ratio then pad for image resize') """ Model Architecture """ parser.add_argument('--Transformation', default='TPS', type=str, help='Transformation stage. None|TPS') parser.add_argument('--FeatureExtraction', default='ResNet', type=str, help='FeatureExtraction stage. VGG|RCNN|ResNet') parser.add_argument('--SequenceModeling', default='BiLSTM', type=str, help='SequenceModeling stage. None|BiLSTM') parser.add_argument('--Prediction', default='CTC', type=str, help='Prediction stage. CTC|Attn') parser.add_argument('--num_fiducial', type=int, default=20, help='number of fiducial points of TPS-STN') parser.add_argument( '--input_channel', type=int, default=1, help='the number of input channel of Feature extractor') parser.add_argument( '--output_channel', type=int, default=512, help='the number of output channel of Feature extractor') parser.add_argument('--hidden_size', type=int, default=256, help='the size of the LSTM hidden state') opt = parser.parse_args() """ vocab / character number configuration """ if opt.sensitive: opt.character += 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' # opt.character = string.printable[:-6] # same with ASTER setting (use 94 char). cudnn.benchmark = True cudnn.deterministic = True opt.num_gpu = torch.cuda.device_count() """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model) if torch.cuda.is_available(): model = model.cuda() # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) demo_data = RawDataset(root=opt.image_folder, opt=opt) # use RawDataset demo_loader = torch.utils.data.DataLoader(demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval() saved_file = open(txtFile, 'w') for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) with torch.no_grad(): image = image_tensors.cuda() # For max length prediction length_for_pred = torch.cuda.IntTensor([opt.batch_max_length] * batch_size) text_for_pred = torch.cuda.LongTensor( batch_size, opt.batch_max_length + 1).fill_(0) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred).log_softmax(2) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.permute(1, 0, 2).max(2) preds_index = preds_index.transpose(1, 0).contiguous().view(-1) preds_str = converter.decode(preds_index.data, preds_size.data) else: preds = model(image, text_for_pred, is_train=False) # select max probabilty (greedy decoding) then decode index to character _, preds_index = preds.max(2) preds_str = converter.decode(preds_index, length_for_pred) print('-' * 80) print('image_path\tpredicted_labels') print('-' * 80) for img_name, pred in zip(image_path_list, preds_str): if 'Attn' in opt.Prediction: pred = pred[:pred.find( '[s]')] # prune after "end of sentence" token ([s]) print(f'{img_name}\t{pred}') saved_file.write(f'{img_name}\t{pred}\n')
def train(opt): plotDir = os.path.join(opt.exp_dir,opt.exp_name,'plots') if not os.path.exists(plotDir): os.makedirs(plotDir) lib.print_model_settings(locals().copy()) """ dataset preparation """ if not opt.data_filtering_off: print('Filtering the images containing characters which are not in opt.character') print('Filtering the images whose label is longer than opt.batch_max_length') # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') #considering the real images for discriminator opt.batch_size = opt.batch_size*2 train_dataset = Batch_Balanced_Dataset(opt) log = open(os.path.join(opt.exp_dir,opt.exp_name,'log_dataset.txt'), 'a') AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset, valid_dataset_log = hierarchical_dataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle=False, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + '\n') log.close() """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = AdaINGen(opt) ocrModel = Model(opt) disModel = MsImageDisV1(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # weight initialization for currModel in [model, ocrModel, disModel]: for name, param in currModel.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # data parallel for multi-GPU ocrModel = torch.nn.DataParallel(ocrModel).to(device) if not opt.ocrFixed: ocrModel.train() else: ocrModel.module.Transformation.eval() ocrModel.module.FeatureExtraction.eval() ocrModel.module.AdaptiveAvgPool.eval() # ocrModel.module.SequenceModeling.eval() ocrModel.module.Prediction.eval() model = torch.nn.DataParallel(model).to(device) model.train() disModel = torch.nn.DataParallel(disModel).to(device) disModel.train() if opt.modelFolderFlag: if len(glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_synth.pth")))>0: opt.saved_synth_model = glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_synth.pth"))[-1] if len(glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_dis.pth")))>0: opt.saved_dis_model = glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_dis.pth"))[-1] #loading pre-trained model if opt.saved_ocr_model != '' and opt.saved_ocr_model != 'None': print(f'loading pretrained ocr model from {opt.saved_ocr_model}') if opt.FT: ocrModel.load_state_dict(torch.load(opt.saved_ocr_model), strict=False) else: ocrModel.load_state_dict(torch.load(opt.saved_ocr_model)) print("OCRModel:") print(ocrModel) if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': print(f'loading pretrained synth model from {opt.saved_synth_model}') if opt.FT: model.load_state_dict(torch.load(opt.saved_synth_model), strict=False) else: model.load_state_dict(torch.load(opt.saved_synth_model)) print("SynthModel:") print(model) if opt.saved_dis_model != '' and opt.saved_dis_model != 'None': print(f'loading pretrained discriminator model from {opt.saved_dis_model}') if opt.FT: disModel.load_state_dict(torch.load(opt.saved_dis_model), strict=False) else: disModel.load_state_dict(torch.load(opt.saved_dis_model)) print("DisModel:") print(disModel) """ setup loss """ if 'CTC' in opt.Prediction: ocrCriterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: ocrCriterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # ignore [GO] token = ignore index 0 recCriterion = torch.nn.L1Loss() styleRecCriterion = torch.nn.L1Loss() # loss averager loss_avg_ocr = Averager() loss_avg = Averager() loss_avg_dis = Averager() loss_avg_ocrRecon_1 = Averager() loss_avg_ocrRecon_2 = Averager() loss_avg_gen = Averager() loss_avg_imgRecon = Averager() loss_avg_styRecon = Averager() ##---------------------------------------## # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.optim=='adam': optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, opt.beta2), weight_decay=opt.weight_decay) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps, weight_decay=opt.weight_decay) print("SynthOptimizer:") print(optimizer) #filter parameters for OCR training ocr_filtered_parameters = [] ocr_params_num = [] for p in filter(lambda p: p.requires_grad, ocrModel.parameters()): ocr_filtered_parameters.append(p) ocr_params_num.append(np.prod(p.size())) print('OCR Trainable params num : ', sum(ocr_params_num)) # setup optimizer if opt.optim=='adam': ocr_optimizer = optim.Adam(ocr_filtered_parameters, lr=opt.lr, betas=(opt.beta1, opt.beta2), weight_decay=opt.weight_decay) else: ocr_optimizer = optim.Adadelta(ocr_filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps, weight_decay=opt.weight_decay) print("OCROptimizer:") print(ocr_optimizer) #filter parameters for OCR training dis_filtered_parameters = [] dis_params_num = [] for p in filter(lambda p: p.requires_grad, disModel.parameters()): dis_filtered_parameters.append(p) dis_params_num.append(np.prod(p.size())) print('Dis Trainable params num : ', sum(dis_params_num)) # setup optimizer if opt.optim=='adam': dis_optimizer = optim.Adam(dis_filtered_parameters, lr=opt.lr, betas=(opt.beta1, opt.beta2), weight_decay=opt.weight_decay) else: dis_optimizer = optim.Adadelta(dis_filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps, weight_decay=opt.weight_decay) print("DisOptimizer:") print(dis_optimizer) ##---------------------------------------## """ final options """ with open(os.path.join(opt.exp_dir,opt.exp_name,'opt.txt'), 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': try: start_iter = int(opt.saved_synth_model.split('_')[-2].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass #get schedulers scheduler = get_scheduler(optimizer,opt) ocr_scheduler = get_scheduler(ocr_optimizer,opt) dis_scheduler = get_scheduler(dis_optimizer,opt) start_time = time.time() best_accuracy = -1 best_norm_ED = -1 best_accuracy_ocr = -1 best_norm_ED_ocr = -1 iteration = start_iter cntr=0 while(True): # train part if opt.lr_policy !="None": scheduler.step() ocr_scheduler.step() dis_scheduler.step() image_tensors_all, labels_1_all, labels_2_all = train_dataset.get_batch() # ## comment # pdb.set_trace() # for imgCntr in range(image_tensors.shape[0]): # save_image(tensor2im(image_tensors[imgCntr]),'temp/'+str(imgCntr)+'.png') # pdb.set_trace() # ### # print(cntr) cntr+=1 disCnt = int(image_tensors_all.size(0)/2) image_tensors, image_tensors_real, labels_gt, labels_2 = image_tensors_all[:disCnt], image_tensors_all[disCnt:disCnt+disCnt], labels_1_all[:disCnt], labels_2_all[:disCnt] image = image_tensors.to(device) image_real = image_tensors_real.to(device) batch_size = image.size(0) ##-----------------------------------## #generate text(labels) from ocr.forward if opt.ocrFixed: # ocrModel.eval() length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = ocrModel(image, text_for_pred) preds = preds[:, :text_for_loss.shape[1] - 1, :] preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.max(2) labels_1 = converter.decode(preds_index.data, preds_size.data) else: preds = ocrModel(image, text_for_pred, is_train=False) _, preds_index = preds.max(2) labels_1 = converter.decode(preds_index, length_for_pred) for idx, pred in enumerate(labels_1): pred_EOS = pred.find('[s]') labels_1[idx] = pred[:pred_EOS] # prune after "end of sentence" token ([s]) # ocrModel.train() else: labels_1 = labels_gt ##-----------------------------------## text_1, length_1 = converter.encode(labels_1, batch_max_length=opt.batch_max_length) text_2, length_2 = converter.encode(labels_2, batch_max_length=opt.batch_max_length) #forward pass from style and word generator images_recon_1, images_recon_2, style = model(image, text_1, text_2) if 'CTC' in opt.Prediction: if not opt.ocrFixed: #ocr training with orig image preds_ocr = ocrModel(image, text_1) preds_size_ocr = torch.IntTensor([preds_ocr.size(1)] * batch_size) preds_ocr = preds_ocr.log_softmax(2).permute(1, 0, 2) ocrCost_train = ocrCriterion(preds_ocr, text_1, preds_size_ocr, length_1) #content loss for reconstructed images preds_1 = ocrModel(images_recon_1, text_1) preds_size_1 = torch.IntTensor([preds_1.size(1)] * batch_size) preds_1 = preds_1.log_softmax(2).permute(1, 0, 2) preds_2 = ocrModel(images_recon_2, text_2) preds_size_2 = torch.IntTensor([preds_2.size(1)] * batch_size) preds_2 = preds_2.log_softmax(2).permute(1, 0, 2) ocrCost_1 = ocrCriterion(preds_1, text_1, preds_size_1, length_1) ocrCost_2 = ocrCriterion(preds_2, text_2, preds_size_2, length_2) # ocrCost = 0.5*( ocrCost_1 + ocrCost_2 ) else: if not opt.ocrFixed: #ocr training with orig image preds_ocr = ocrModel(image, text_1[:, :-1]) # align with Attention.forward target_ocr = text_1[:, 1:] # without [GO] Symbol ocrCost_train = ocrCriterion(preds_ocr.view(-1, preds_ocr.shape[-1]), target_ocr.contiguous().view(-1)) #content loss for reconstructed images preds_1 = ocrModel(images_recon_1, text_1[:, :-1], is_train=False) # align with Attention.forward target_1 = text_1[:, 1:] # without [GO] Symbol preds_2 = ocrModel(images_recon_2, text_2[:, :-1], is_train=False) # align with Attention.forward target_2 = text_2[:, 1:] # without [GO] Symbol ocrCost_1 = ocrCriterion(preds_1.view(-1, preds_1.shape[-1]), target_1.contiguous().view(-1)) ocrCost_2 = ocrCriterion(preds_2.view(-1, preds_2.shape[-1]), target_2.contiguous().view(-1)) # ocrCost = 0.5*(ocrCost_1+ocrCost_2) if not opt.ocrFixed: #training OCR ocrModel.zero_grad() ocrCost_train.backward() # torch.nn.utils.clip_grad_norm_(ocrModel.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) ocr_optimizer.step() #if ocr is fixed; ignore this loss loss_avg_ocr.add(ocrCost_train) else: loss_avg_ocr.add(torch.tensor(0.0)) #Domain discriminator: Dis update disModel.zero_grad() disCost = opt.disWeight*0.5*(disModel.module.calc_dis_loss(images_recon_1.detach(), image_real) + disModel.module.calc_dis_loss(images_recon_2.detach(), image)) disCost.backward() # torch.nn.utils.clip_grad_norm_(disModel.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) dis_optimizer.step() loss_avg_dis.add(disCost) # #[Style Encoder] + [Word Generator] update #Adversarial loss disGenCost = 0.5*(disModel.module.calc_gen_loss(images_recon_1)+disModel.module.calc_gen_loss(images_recon_2)) #Input reconstruction loss recCost = recCriterion(images_recon_1,image) #Pair style reconstruction loss if opt.styleReconWeight == 0.0: styleRecCost = torch.tensor(0.0) else: if opt.styleDetach: styleRecCost = styleRecCriterion(model(images_recon_2, None, None, styleFlag=True), style.detach()) else: styleRecCost = styleRecCriterion(model(images_recon_2, None, None, styleFlag=True), style) #OCR Content cost ocrCost = 0.5*(ocrCost_1+ocrCost_2) cost = opt.ocrWeight*ocrCost + opt.reconWeight*recCost + opt.disWeight*disGenCost + opt.styleReconWeight*styleRecCost model.zero_grad() ocrModel.zero_grad() disModel.zero_grad() cost.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) optimizer.step() loss_avg.add(cost) #Individual losses loss_avg_ocrRecon_1.add(opt.ocrWeight*0.5*ocrCost_1) loss_avg_ocrRecon_2.add(opt.ocrWeight*0.5*ocrCost_2) loss_avg_gen.add(opt.disWeight*disGenCost) loss_avg_imgRecon.add(opt.reconWeight*recCost) loss_avg_styRecon.add(opt.styleReconWeight*styleRecCost) # validation part if (iteration + 1) % opt.valInterval == 0 or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' #Save training images os.makedirs(os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration)), exist_ok=True) for trImgCntr in range(batch_size): try: save_image(tensor2im(image[trImgCntr].detach()),os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration),str(trImgCntr)+'_input_'+labels_gt[trImgCntr]+'.png')) save_image(tensor2im(images_recon_1[trImgCntr].detach()),os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration),str(trImgCntr)+'_recon_'+labels_1[trImgCntr]+'.png')) save_image(tensor2im(images_recon_2[trImgCntr].detach()),os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration),str(trImgCntr)+'_pair_'+labels_2[trImgCntr]+'.png')) except: print('Warning while saving training image') elapsed_time = time.time() - start_time # for log with open(os.path.join(opt.exp_dir,opt.exp_name,'log_train.txt'), 'a') as log: model.eval() ocrModel.module.Transformation.eval() ocrModel.module.FeatureExtraction.eval() ocrModel.module.AdaptiveAvgPool.eval() ocrModel.module.SequenceModeling.eval() ocrModel.module.Prediction.eval() disModel.eval() with torch.no_grad(): valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation_synth_lrw_res( iteration, model, ocrModel, disModel, recCriterion, styleRecCriterion, ocrCriterion, valid_loader, converter, opt) model.train() if not opt.ocrFixed: ocrModel.train() else: # ocrModel.module.Transformation.eval() # ocrModel.module.FeatureExtraction.eval() # ocrModel.module.AdaptiveAvgPool.eval() ocrModel.module.SequenceModeling.train() # ocrModel.module.Prediction.eval() disModel.train() # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] Train OCR loss: {loss_avg_ocr.val():0.5f}, Train Synth loss: {loss_avg.val():0.5f}, Train Dis loss: {loss_avg_dis.val():0.5f}, Valid OCR loss: {valid_loss[0]:0.5f}, Valid Synth loss: {valid_loss[1]:0.5f}, Valid Dis loss: {valid_loss[2]:0.5f}, Elapsed_time: {elapsed_time:0.5f}' current_model_log_ocr = f'{"Current_accuracy_OCR":17s}: {current_accuracy[0]:0.3f}, {"Current_norm_ED_OCR":17s}: {current_norm_ED[0]:0.2f}' current_model_log_1 = f'{"Current_accuracy_recon":17s}: {current_accuracy[1]:0.3f}, {"Current_norm_ED_recon":17s}: {current_norm_ED[1]:0.2f}' current_model_log_2 = f'{"Current_accuracy_pair":17s}: {current_accuracy[2]:0.3f}, {"Current_norm_ED_pair":17s}: {current_norm_ED[2]:0.2f}' #plotting lib.plot.plot(os.path.join(plotDir,'Train-OCR-Loss'), loss_avg_ocr.val().item()) lib.plot.plot(os.path.join(plotDir,'Train-Synth-Loss'), loss_avg.val().item()) lib.plot.plot(os.path.join(plotDir,'Train-Dis-Loss'), loss_avg_dis.val().item()) lib.plot.plot(os.path.join(plotDir,'Train-OCR-Recon1-Loss'), loss_avg_ocrRecon_1.val().item()) lib.plot.plot(os.path.join(plotDir,'Train-OCR-Recon2-Loss'), loss_avg_ocrRecon_2.val().item()) lib.plot.plot(os.path.join(plotDir,'Train-Gen-Loss'), loss_avg_gen.val().item()) lib.plot.plot(os.path.join(plotDir,'Train-ImgRecon1-Loss'), loss_avg_imgRecon.val().item()) lib.plot.plot(os.path.join(plotDir,'Train-StyRecon2-Loss'), loss_avg_styRecon.val().item()) lib.plot.plot(os.path.join(plotDir,'Valid-OCR-Loss'), valid_loss[0].item()) lib.plot.plot(os.path.join(plotDir,'Valid-Synth-Loss'), valid_loss[1].item()) lib.plot.plot(os.path.join(plotDir,'Valid-Dis-Loss'), valid_loss[2].item()) lib.plot.plot(os.path.join(plotDir,'Valid-OCR-Recon1-Loss'), valid_loss[3].item()) lib.plot.plot(os.path.join(plotDir,'Valid-OCR-Recon2-Loss'), valid_loss[4].item()) lib.plot.plot(os.path.join(plotDir,'Valid-Gen-Loss'), valid_loss[5].item()) lib.plot.plot(os.path.join(plotDir,'Valid-ImgRecon1-Loss'), valid_loss[6].item()) lib.plot.plot(os.path.join(plotDir,'Valid-StyRecon2-Loss'), valid_loss[7].item()) lib.plot.plot(os.path.join(plotDir,'Orig-OCR-WordAccuracy'), current_accuracy[0]) lib.plot.plot(os.path.join(plotDir,'Recon-OCR-WordAccuracy'), current_accuracy[1]) lib.plot.plot(os.path.join(plotDir,'Pair-OCR-WordAccuracy'), current_accuracy[2]) lib.plot.plot(os.path.join(plotDir,'Orig-OCR-CharAccuracy'), current_norm_ED[0]) lib.plot.plot(os.path.join(plotDir,'Recon-OCR-CharAccuracy'), current_norm_ED[1]) lib.plot.plot(os.path.join(plotDir,'Pair-OCR-CharAccuracy'), current_norm_ED[2]) # keep best accuracy model (on valid dataset) if current_accuracy[1] > best_accuracy: best_accuracy = current_accuracy[1] torch.save(model.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_accuracy.pth')) torch.save(disModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_accuracy_dis.pth')) if current_norm_ED[1] > best_norm_ED: best_norm_ED = current_norm_ED[1] torch.save(model.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_norm_ED.pth')) torch.save(disModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_norm_ED_dis.pth')) best_model_log = f'{"Best_accuracy_Recon":17s}: {best_accuracy:0.3f}, {"Best_norm_ED_Recon":17s}: {best_norm_ED:0.2f}' # keep best accuracy model (on valid dataset) if current_accuracy[0] > best_accuracy_ocr: best_accuracy_ocr = current_accuracy[0] if not opt.ocrFixed: torch.save(ocrModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_accuracy_ocr.pth')) if current_norm_ED[0] > best_norm_ED_ocr: best_norm_ED_ocr = current_norm_ED[0] if not opt.ocrFixed: torch.save(ocrModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_norm_ED_ocr.pth')) best_model_log_ocr = f'{"Best_accuracy_ocr":17s}: {best_accuracy_ocr:0.3f}, {"Best_norm_ED_ocr":17s}: {best_norm_ED_ocr:0.2f}' loss_model_log = f'{loss_log}\n{current_model_log_ocr}\n{current_model_log_1}\n{current_model_log_2}\n{best_model_log_ocr}\n{best_model_log}' print(loss_model_log) log.write(loss_model_log + '\n') # show some predicted results dashed_line = '-' * 80 head = f'{"Ground Truth":32s} | {"Prediction":25s} | Confidence Score & T/F' predicted_result_log = f'{dashed_line}\n{head}\n{dashed_line}\n' for gt_ocr, pred_ocr, confidence_ocr, gt_1, pred_1, confidence_1, gt_2, pred_2, confidence_2 in zip(labels[0][:5], preds[0][:5], confidence_score[0][:5], labels[1][:5], preds[1][:5], confidence_score[1][:5], labels[2][:5], preds[2][:5], confidence_score[2][:5]): if 'Attn' in opt.Prediction: # gt_ocr = gt_ocr[:gt_ocr.find('[s]')] pred_ocr = pred_ocr[:pred_ocr.find('[s]')] # gt_1 = gt_1[:gt_1.find('[s]')] pred_1 = pred_1[:pred_1.find('[s]')] # gt_2 = gt_2[:gt_2.find('[s]')] pred_2 = pred_2[:pred_2.find('[s]')] predicted_result_log += f'{"ocr"}: {gt_ocr:27s} | {pred_ocr:25s} | {confidence_ocr:0.4f}\t{str(pred_ocr == gt_ocr)}\n' predicted_result_log += f'{"recon"}: {gt_1:25s} | {pred_1:25s} | {confidence_1:0.4f}\t{str(pred_1 == gt_1)}\n' predicted_result_log += f'{"pair"}: {gt_2:26s} | {pred_2:25s} | {confidence_2:0.4f}\t{str(pred_2 == gt_2)}\n' predicted_result_log += f'{dashed_line}' print(predicted_result_log) log.write(predicted_result_log + '\n') loss_avg_ocr.reset() loss_avg.reset() loss_avg_dis.reset() loss_avg_ocrRecon_1.reset() loss_avg_ocrRecon_2.reset() loss_avg_gen.reset() loss_avg_imgRecon.reset() loss_avg_styRecon.reset() lib.plot.flush() lib.plot.tick() # save model per 1e+5 iter. if (iteration) % 1e+5 == 0: torch.save( model.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'iter_'+str(iteration+1)+'_synth.pth')) if not opt.ocrFixed: torch.save( ocrModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'iter_'+str(iteration+1)+'_ocr.pth')) torch.save( disModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'iter_'+str(iteration+1)+'_dis.pth')) if (iteration + 1) == opt.num_iter: print('end the training') sys.exit() iteration += 1
def train(opt): """ dataset preparation """ if not opt.data_filtering_off: print( 'Filtering the images containing characters which are not in opt.character' ) print( 'Filtering the images whose label is longer than opt.batch_max_length' ) # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') train_dataset = Batch_Balanced_Dataset(opt) log = open(f'./saved_models/{opt.exp_name}/log_dataset.txt', 'a') AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset, valid_dataset_log = hierarchical_dataset( root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + '\n') log.close() """ model configuration """ if 'CTC' in opt.Prediction: if opt.baiduCTC: converter = CTCLabelConverterForBaiduWarpctc(opt.character) else: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # data parallel for multi-GPU model = torch.nn.DataParallel(model).to(device) model.train() if opt.saved_model != '': print(f'loading pretrained model from {opt.saved_model}') if opt.FT: model.load_state_dict(torch.load(opt.saved_model), strict=False) else: model.load_state_dict(torch.load(opt.saved_model)) print("Model:") print(model) """ setup loss """ if 'CTC' in opt.Prediction: if opt.baiduCTC: # need to install warpctc. see our guideline. from warpctc_pytorch import CTCLoss criterion = CTCLoss() else: criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to( device) # ignore [GO] token = ignore index 0 # loss averager loss_avg = Averager() # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("Optimizer:") print(optimizer) """ final options """ # print(opt) with open(f'./saved_models/{opt.exp_name}/opt.txt', 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_model != '': try: start_iter = int(opt.saved_model.split('_')[-1].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass start_time = time.time() best_accuracy = -1 best_norm_ED = -1 iteration = start_iter while (True): # train part image_tensors, labels = train_dataset.get_batch() image = image_tensors.to(device) text, length = converter.encode(labels, batch_max_length=opt.batch_max_length) batch_size = image.size(0) if 'CTC' in opt.Prediction: preds = model(image, text) preds_size = torch.IntTensor([preds.size(1)] * batch_size) if opt.baiduCTC: preds = preds.permute(1, 0, 2) # to use CTCLoss format cost = criterion(preds, text, preds_size, length) / batch_size else: preds = preds.log_softmax(2).permute(1, 0, 2) cost = criterion(preds, text, preds_size, length) else: preds = model(image, text[:, :-1]) # align with Attention.forward target = text[:, 1:] # without [GO] Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) model.zero_grad() cost.backward() torch.nn.utils.clip_grad_norm_( model.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) optimizer.step() loss_avg.add(cost) # validation part if ( iteration + 1 ) % opt.valInterval == 0 or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' elapsed_time = time.time() - start_time # for log with open(f'./saved_models/{opt.exp_name}/log_train.txt', 'a') as log: model.eval() with torch.no_grad(): valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation( model, criterion, valid_loader, converter, opt) model.train() # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] Train loss: {loss_avg.val():0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}' loss_avg.reset() current_model_log = f'{"Current_accuracy":17s}: {current_accuracy:0.3f}, {"Current_norm_ED":17s}: {current_norm_ED:0.2f}' # keep best accuracy model (on valid dataset) if current_accuracy > best_accuracy: best_accuracy = current_accuracy torch.save( model.state_dict(), f'./saved_models/{opt.exp_name}/best_accuracy.pth') if current_norm_ED > best_norm_ED: best_norm_ED = current_norm_ED torch.save( model.state_dict(), f'./saved_models/{opt.exp_name}/best_norm_ED.pth') best_model_log = f'{"Best_accuracy":17s}: {best_accuracy:0.3f}, {"Best_norm_ED":17s}: {best_norm_ED:0.2f}' loss_model_log = f'{loss_log}\n{current_model_log}\n{best_model_log}' print(loss_model_log) log.write(loss_model_log + '\n') # show some predicted results dashed_line = '-' * 80 head = f'{"Ground Truth":25s} | {"Prediction":25s} | Confidence Score & T/F' predicted_result_log = f'{dashed_line}\n{head}\n{dashed_line}\n' for gt, pred, confidence in zip(labels[:5], preds[:5], confidence_score[:5]): if 'Attn' in opt.Prediction: gt = gt[:gt.find('[s]')] pred = pred[:pred.find('[s]')] predicted_result_log += f'{gt:25s} | {pred:25s} | {confidence:0.4f}\t{str(pred == gt)}\n' predicted_result_log += f'{dashed_line}' print(predicted_result_log) log.write(predicted_result_log + '\n') # save model per 1e+5 iter. if (iteration + 1) % 1e+5 == 0: torch.save( model.state_dict(), f'./saved_models/{opt.exp_name}/iter_{iteration+1}.pth') if (iteration + 1) == opt.num_iter: print('end the training') sys.exit() iteration += 1
def train(opt): os.makedirs(opt.log, exist_ok=True) writer = SummaryWriter(opt.log) """ dataset preparation """ if not opt.data_filtering_off: print( 'Filtering the images containing characters which are not in opt.character' ) print( 'Filtering the images whose label is longer than opt.batch_max_length' ) opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') train_dataset = Batch_Balanced_Dataset(opt) log = open(f'./saved_models/{opt.exp_name}/log_dataset.txt', 'a') AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset, valid_dataset_log = hierarchical_dataset( root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + '\n') log.close() """ model configuration """ ctc_converter = CTCLabelConverter(opt.character) attn_converter = AttnLabelConverter(opt.character) opt.num_class = len(attn_converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length) # weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # data parallel for multi-GPU model = torch.nn.DataParallel(model).to(device) model.train() if opt.saved_model != '': print(f'loading pretrained model from {opt.saved_model}') if opt.FT: model.load_state_dict(torch.load(opt.saved_model), strict=False) else: model.load_state_dict(torch.load(opt.saved_model)) """ setup loss """ loss_avg = Averager() ctc_loss = torch.nn.CTCLoss(zero_infinity=True).to(device) attn_loss = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # setup optimizer if opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("Optimizer:") """ final options """ # print(opt) with open(f'./saved_models/{opt.exp_name}/opt.txt', 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_model != '': try: start_iter = int(opt.saved_model.split('_')[-1].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass start_time = time.time() best_accuracy = -1 best_norm_ED = -1 iteration = start_iter pbar = tqdm(range(opt.num_iter)) for iteration in pbar: # train part image_tensors, labels = train_dataset.get_batch() image = image_tensors.to(device) ctc_text, ctc_length = ctc_converter.encode( labels, batch_max_length=opt.batch_max_length) attn_text, attn_length = attn_converter.encode( labels, batch_max_length=opt.batch_max_length) batch_size = image.size(0) preds, refiner = model(image, attn_text[:, :-1]) refiner_size = torch.IntTensor([refiner.size(1)] * batch_size) refiner = refiner.log_softmax(2).permute(1, 0, 2) refiner_loss = ctc_loss(refiner, ctc_text, refiner_size, ctc_length) total_loss = opt.lambda_ctc * refiner_loss target = attn_text[:, 1:] # without [GO] Symbol for pred in preds: total_loss += opt.lambda_attn * attn_loss( pred.view(-1, pred.shape[-1]), target.contiguous().view(-1)) model.zero_grad() total_loss.backward() torch.nn.utils.clip_grad_norm_( model.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) optimizer.step() loss_avg.add(total_loss) if loss_avg.val() <= 0.6: opt.grad_clip = 2 if loss_avg.val() <= 0.3: opt.grad_clip = 1 preds = (p.cpu() for p in preds) refiner = refiner.cpu() image = image.cpu() torch.cuda.empty_cache() writer.add_scalar('train_loss', loss_avg.val(), iteration) pbar.set_description('Iteration {0}/{1}, AvgLoss {2}'.format( iteration, opt.num_iter, loss_avg.val())) # validation part if (iteration + 1) % opt.valInterval == 0 or iteration == 0: elapsed_time = time.time() - start_time # for log with open(f'./saved_models/{opt.exp_name}/log_train.txt', 'a') as log: model.eval() with torch.no_grad(): valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation( model, attn_loss, valid_loader, attn_converter, opt) model.train() # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] Train loss: {loss_avg.val():0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}' writer.add_scalar('Val_loss', valid_loss) pbar.set_description(loss_log) loss_avg.reset() current_model_log = f'{"Current_accuracy":17s}: {current_accuracy:0.3f}, {"Current_norm_ED":17s}: {current_norm_ED:0.2f}' # keep best accuracy model (on valid dataset) if current_accuracy > best_accuracy: best_accuracy = current_accuracy torch.save( model.state_dict(), f'./saved_models/{opt.exp_name}/best_accuracy_{str(best_accuracy)}.pth' ) if current_norm_ED > best_norm_ED: best_norm_ED = current_norm_ED torch.save( model.state_dict(), f'./saved_models/{opt.exp_name}/best_norm_ED.pth') best_model_log = f'{"Best_accuracy":17s}: {best_accuracy:0.3f}, {"Best_norm_ED":17s}: {best_norm_ED:0.2f}' loss_model_log = f'{loss_log}\n{current_model_log}\n{best_model_log}' # print(loss_model_log) log.write(loss_model_log + '\n') # show some predicted results dashed_line = '-' * 80 head = f'{"Ground Truth":25s} | {"Prediction":25s} | Confidence Score & T/F' predicted_result_log = f'{dashed_line}\n{head}\n{dashed_line}\n' for gt, pred, confidence in zip(labels[:5], preds[:5], confidence_score[:5]): if 'Attn' or 'Transformer' in opt.Prediction: gt = gt[:gt.find('[s]')] pred = pred[:pred.find('[s]')] predicted_result_log += f'{gt:25s} | {pred:25s} | {confidence:0.4f}\t{str(pred == gt)}\n' predicted_result_log += f'{dashed_line}' log.write(predicted_result_log + '\n') # save model per 1e+3 iter. if (iteration + 1) % 1e+3 == 0: torch.save(model.state_dict(), f'./saved_models/{opt.exp_name}/SCATTER_STR.pth') if (iteration + 1) == opt.num_iter: print('end the training') sys.exit()
def train(opt): """ dataset preparation """ opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') train_dataset = Batch_Balanced_Dataset(opt) AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset = hierarchical_dataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) print('-' * 80) """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # data parallel for multi-GPU model = torch.nn.DataParallel(model).cuda() model.train() if opt.continue_model != '': print(f'loading pretrained model from {opt.continue_model}') model.load_state_dict(torch.load(opt.continue_model)) print("Model:") print(model) """ setup loss """ if 'CTC' in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).cuda() else: criterion = torch.nn.CrossEntropyLoss( ignore_index=0).cuda() # ignore [GO] token = ignore index 0 # loss averager loss_avg = Averager() # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("Optimizer:") print(optimizer) """ final options """ # print(opt) with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.continue_model != '': #start_iter = int(opt.continue_model.split('_')[-1].split('.')[0]) start_iter = 10000 print(f'continue to train, start_iter: {start_iter}') start_time = time.time() best_accuracy = -1 best_norm_ED = 1e+6 i = start_iter while (True): # train part for p in model.parameters(): p.requires_grad = True image_tensors, labels = train_dataset.get_batch() image = image_tensors.cuda() text, length = converter.encode(labels) batch_size = image.size(0) if 'CTC' in opt.Prediction: preds = model(image, text).log_softmax(2) preds_size = torch.IntTensor([preds.size(1)] * batch_size) preds = preds.permute(1, 0, 2) # to use CTCLoss format cost = criterion(preds, text, preds_size, length) else: preds = model(image, text) target = text[:, 1:] # without [GO] Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) model.zero_grad() cost.backward() torch.nn.utils.clip_grad_norm_( model.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) optimizer.step() loss_avg.add(cost) # validation part if i % opt.valInterval == 0: elapsed_time = time.time() - start_time print( f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}' ) # for log with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a') as log: log.write( f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n' ) loss_avg.reset() model.eval() valid_loss, current_accuracy, current_norm_ED, preds, labels, infer_time, length_of_data = validation( model, criterion, valid_loader, converter, opt) model.train() for pred, gt in zip(preds[:5], labels[:5]): if 'Attn' in opt.Prediction: pred = pred[:pred.find('[s]')] gt = gt[:gt.find('[s]')] print(f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}') log.write( f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}\n') valid_log = f'[{i}/{opt.num_iter}] valid loss: {valid_loss:0.5f}' valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}' print(valid_log) log.write(valid_log + '\n') # keep best accuracy model if current_accuracy > best_accuracy: best_accuracy = current_accuracy torch.save( model.state_dict(), f'./saved_models/{opt.experiment_name}/best_accuracy.pth' ) if current_norm_ED < best_norm_ED: best_norm_ED = current_norm_ED torch.save( model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth' ) best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}' print(best_model_log) log.write(best_model_log + '\n') # save model per 1e+5 iter. if (i + 1) % 2000 == 0: torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth') if i == opt.num_iter: print('end the training') sys.exit() i += 1
def train(opt): plotDir = os.path.join(opt.exp_dir, opt.exp_name, 'plots') if not os.path.exists(plotDir): os.makedirs(plotDir) lib.print_model_settings(locals().copy()) """ dataset preparation """ if not opt.data_filtering_off: print( 'Filtering the images containing characters which are not in opt.character' ) print( 'Filtering the images whose label is longer than opt.batch_max_length' ) # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') log = open(os.path.join(opt.exp_dir, opt.exp_name, 'log_dataset.txt'), 'a') AlignCollate_valid = AlignPairCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) train_dataset, train_dataset_log = hierarchical_dataset( root=opt.train_data, opt=opt) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=opt.batch_size, shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) log.write(train_dataset_log) print('-' * 80) valid_dataset, valid_dataset_log = hierarchical_dataset( root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle= False, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + '\n') log.close() converter = CTCLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 styleModel = StyleTensorEncoder(input_dim=opt.input_channel) genModel = AdaIN_Tensor_WordGenerator(opt) disModel = MsImageDisV2(opt) vggRecCriterion = torch.nn.L1Loss() vggModel = VGGPerceptualLossModel(models.vgg19(pretrained=True), vggRecCriterion) print('model input parameters', opt.imgH, opt.imgW, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length) # weight initialization for currModel in [styleModel, genModel, disModel]: for name, param in currModel.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue styleModel = torch.nn.DataParallel(styleModel).to(device) styleModel.train() genModel = torch.nn.DataParallel(genModel).to(device) genModel.train() disModel = torch.nn.DataParallel(disModel).to(device) disModel.train() vggModel = torch.nn.DataParallel(vggModel).to(device) vggModel.eval() if opt.modelFolderFlag: if len( glob.glob( os.path.join(opt.exp_dir, opt.exp_name, "iter_*_synth.pth"))) > 0: opt.saved_synth_model = glob.glob( os.path.join(opt.exp_dir, opt.exp_name, "iter_*_synth.pth"))[-1] if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': print(f'loading pretrained synth model from {opt.saved_synth_model}') checkpoint = torch.load(opt.saved_synth_model) styleModel.load_state_dict(checkpoint['styleModel']) genModel.load_state_dict(checkpoint['genModel']) disModel.load_state_dict(checkpoint['disModel']) if opt.imgReconLoss == 'l1': recCriterion = torch.nn.L1Loss() elif opt.imgReconLoss == 'ssim': recCriterion = ssim elif opt.imgReconLoss == 'ms-ssim': recCriterion = msssim if opt.styleLoss == 'l1': styleRecCriterion = torch.nn.L1Loss() elif opt.styleLoss == 'triplet': styleRecCriterion = torch.nn.TripletMarginLoss( margin=opt.tripletMargin, p=1) #for validation; check only positive pairs styleTestRecCriterion = torch.nn.L1Loss() # loss averager loss_avg = Averager() loss_avg_dis = Averager() loss_avg_gen = Averager() loss_avg_imgRecon = Averager() loss_avg_vgg_per = Averager() loss_avg_vgg_sty = Averager() ##---------------------------------------## # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, styleModel.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) for p in filter(lambda p: p.requires_grad, genModel.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable style and generator params num : ', sum(params_num)) # setup optimizer if opt.optim == 'adam': optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, opt.beta2), weight_decay=opt.weight_decay) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps, weight_decay=opt.weight_decay) print("SynthOptimizer:") print(optimizer) #filter parameters for Dis training dis_filtered_parameters = [] dis_params_num = [] for p in filter(lambda p: p.requires_grad, disModel.parameters()): dis_filtered_parameters.append(p) dis_params_num.append(np.prod(p.size())) print('Dis Trainable params num : ', sum(dis_params_num)) # setup optimizer if opt.optim == 'adam': dis_optimizer = optim.Adam(dis_filtered_parameters, lr=opt.lr, betas=(opt.beta1, opt.beta2), weight_decay=opt.weight_decay) else: dis_optimizer = optim.Adadelta(dis_filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps, weight_decay=opt.weight_decay) print("DisOptimizer:") print(dis_optimizer) ##---------------------------------------## """ final options """ with open(os.path.join(opt.exp_dir, opt.exp_name, 'opt.txt'), 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': try: start_iter = int( opt.saved_synth_model.split('_')[-2].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass #get schedulers scheduler = get_scheduler(optimizer, opt) dis_scheduler = get_scheduler(dis_optimizer, opt) start_time = time.time() iteration = start_iter cntr = 0 while (True): # train part if opt.lr_policy != "None": scheduler.step() dis_scheduler.step() image_input_tensors, image_gt_tensors, labels_1, labels_2 = iter( train_loader).next() cntr += 1 image_input_tensors = image_input_tensors.to(device) image_gt_tensors = image_gt_tensors.to(device) batch_size = image_input_tensors.size(0) text_2, length_2 = converter.encode( labels_2, batch_max_length=opt.batch_max_length) #forward pass from style and word generator style = styleModel(image_input_tensors) images_recon_2 = genModel(style, text_2) #Domain discriminator: Dis update disModel.zero_grad() disCost = opt.disWeight * (disModel.module.calc_dis_loss( torch.cat((images_recon_2.detach(), image_input_tensors), dim=1), torch.cat((image_gt_tensors, image_input_tensors), dim=1))) disCost.backward() dis_optimizer.step() loss_avg_dis.add(disCost) # #[Style Encoder] + [Word Generator] update #Adversarial loss disGenCost = disModel.module.calc_gen_loss( torch.cat((images_recon_2, image_input_tensors), dim=1)) #Input reconstruction loss recCost = recCriterion(images_recon_2, image_gt_tensors) #vgg loss vggPerCost, vggStyleCost = vggModel(image_gt_tensors, images_recon_2) cost = opt.reconWeight * recCost + opt.disWeight * disGenCost + opt.vggPerWeight * vggPerCost + opt.vggStyWeight * vggStyleCost styleModel.zero_grad() genModel.zero_grad() disModel.zero_grad() vggModel.zero_grad() cost.backward() optimizer.step() loss_avg.add(cost) #Individual losses loss_avg_gen.add(opt.disWeight * disGenCost) loss_avg_imgRecon.add(opt.reconWeight * recCost) loss_avg_vgg_per.add(opt.vggPerWeight * vggPerCost) loss_avg_vgg_sty.add(opt.vggStyWeight * vggStyleCost) # validation part if ( iteration + 1 ) % opt.valInterval == 0 or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' #Save training images os.makedirs(os.path.join(opt.exp_dir, opt.exp_name, 'trainImages', str(iteration)), exist_ok=True) for trImgCntr in range(batch_size): try: save_image( tensor2im(image_input_tensors[trImgCntr].detach()), os.path.join( opt.exp_dir, opt.exp_name, 'trainImages', str(iteration), str(trImgCntr) + '_sInput_' + labels_1[trImgCntr] + '.png')) save_image( tensor2im(image_gt_tensors[trImgCntr].detach()), os.path.join( opt.exp_dir, opt.exp_name, 'trainImages', str(iteration), str(trImgCntr) + '_csGT_' + labels_2[trImgCntr] + '.png')) save_image( tensor2im(images_recon_2[trImgCntr].detach()), os.path.join( opt.exp_dir, opt.exp_name, 'trainImages', str(iteration), str(trImgCntr) + '_csRecon_' + labels_2[trImgCntr] + '.png')) except: print('Warning while saving training image') elapsed_time = time.time() - start_time # for log with open(os.path.join(opt.exp_dir, opt.exp_name, 'log_train.txt'), 'a') as log: styleModel.eval() genModel.eval() disModel.eval() with torch.no_grad(): valid_loss, infer_time, length_of_data = validation_synth_v3( iteration, styleModel, genModel, vggModel, disModel, recCriterion, valid_loader, converter, opt) styleModel.train() genModel.train() disModel.train() # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] Train Synth loss: {loss_avg.val():0.5f}, \ Train Dis loss: {loss_avg_dis.val():0.5f}, Train Gen loss: {loss_avg_gen.val():0.5f},\ Train ImgRecon loss: {loss_avg_imgRecon.val():0.5f}, Train VGG-Per loss: {loss_avg_vgg_per.val():0.5f},\ Train VGG-Sty loss: {loss_avg_vgg_sty.val():0.5f}, Valid Synth loss: {valid_loss[1]:0.5f}, \ Valid Dis loss: {valid_loss[2]:0.5f}, Elapsed_time: {elapsed_time:0.5f}' #plotting lib.plot.plot(os.path.join(plotDir, 'Train-Synth-Loss'), loss_avg.val().item()) lib.plot.plot(os.path.join(plotDir, 'Train-Dis-Loss'), loss_avg_dis.val().item()) lib.plot.plot(os.path.join(plotDir, 'Train-Gen-Loss'), loss_avg_gen.val().item()) lib.plot.plot(os.path.join(plotDir, 'Train-ImgRecon1-Loss'), loss_avg_imgRecon.val().item()) lib.plot.plot(os.path.join(plotDir, 'Train-VGG-Per-Loss'), loss_avg_vgg_per.val().item()) lib.plot.plot(os.path.join(plotDir, 'Train-VGG-Sty-Loss'), loss_avg_vgg_sty.val().item()) lib.plot.plot(os.path.join(plotDir, 'Valid-Synth-Loss'), valid_loss[0].item()) lib.plot.plot(os.path.join(plotDir, 'Valid-Dis-Loss'), valid_loss[1].item()) lib.plot.plot(os.path.join(plotDir, 'Valid-Gen-Loss'), valid_loss[2].item()) lib.plot.plot(os.path.join(plotDir, 'Valid-ImgRecon1-Loss'), valid_loss[3].item()) lib.plot.plot(os.path.join(plotDir, 'Valid-VGG-Per-Loss'), valid_loss[4].item()) lib.plot.plot(os.path.join(plotDir, 'Valid-VGG-Sty-Loss'), valid_loss[5].item()) print(loss_log) loss_avg.reset() loss_avg_dis.reset() loss_avg_gen.reset() loss_avg_imgRecon.reset() loss_avg_vgg_per.reset() loss_avg_vgg_sty.reset() lib.plot.flush() lib.plot.tick() # save model per 1e+5 iter. if (iteration) % 1e+4 == 0: torch.save( { 'styleModel': styleModel.state_dict(), 'genModel': genModel.state_dict(), 'disModel': disModel.state_dict() }, os.path.join(opt.exp_dir, opt.exp_name, 'iter_' + str(iteration + 1) + '_synth.pth')) if (iteration + 1) == opt.num_iter: print('end the training') sys.exit() iteration += 1
def __init__(self, image_folder, extract_text_file, split): self.i_folder = image_folder #print(image_folder) #print("aaaaaaa test") self.extract_text_file = extract_text_file self.canvas_size = 1280 self.mag_ratio = 1.5 self.show_time = False self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') self.cuda = torch.cuda.is_available() self.net = CRAFT() #(1st model) model to detect words in images if self.cuda: self.net.load_state_dict( self.copyStateDict( torch.load('CRAFT-pytorch/craft_mlt_25k.pth'))) else: self.net.load_state_dict( self.copyStateDict( torch.load('CRAFT-pytorch/craft_mlt_25k.pth', map_location='cpu'))) if self.cuda: self.net = self.net.cuda() self.net = torch.nn.DataParallel(self.net) cudnn.benchmark = False self.net.eval() self.refine_net = None self.text_threshold = 0.7 self.link_threshold = 0.4 self.low_text = 0.4 self.poly = False self.result_folder = './' + split + '_' + 'intermediate_result/' if not os.path.isdir(self.result_folder): os.mkdir(self.result_folder) #Parameters for image to text model (2nd model) self.parser = argparse.ArgumentParser() #Data processing self.parser.add_argument('--batch_max_length', type=int, default=25, help='maximum-label-length') self.parser.add_argument('--imgH', type=int, default=32, help='the height of the input image') self.parser.add_argument('--imgW', type=int, default=100, help='the width of the input image') self.parser.add_argument('--rgb', default=False, action='store_true', help='use rgb input') self.parser.add_argument( '--character', type=str, default='0123456789abcdefghijklmnopqrstuvwxyz', help='character label') self.parser.add_argument('--sensitive', action='store_true', help='for sensitive character mode') self.parser.add_argument( '--PAD', action='store_true', help='whether to keep ratio then pad for image resize') #Model Architecture self.parser.add_argument('--Transformation', type=str, default='TPS', help='Transformation stage. None|TPS') self.parser.add_argument( '--FeatureExtraction', type=str, default='ResNet', help='FeatureExtraction stage. VGG|RCNN|ResNet') self.parser.add_argument('--SequenceModeling', type=str, default='BiLSTM', help='SequenceModeling stage. None|BiLSTM') self.parser.add_argument('--Prediction', type=str, default='Attn', help='Prediction stage. CTC|Attn') self.parser.add_argument('--num_fiducial', type=int, default=20, help='number of fiducial points of TPS-STN') self.parser.add_argument( '--input_channel', type=int, default=1, help='the number of input channel of Feature extractor') self.parser.add_argument( '--output_channel', type=int, default=512, help='the number of output channel of Feature extractor') self.parser.add_argument('--hidden_size', type=int, default=256, help='the size of the LSTM hidden state') #self.opt = self.parser.parse_args() self.opt, unknown = self.parser.parse_known_args() #self.opt, unknown = self.parser.parse_known_args() if 'CTC' in self.opt.Prediction: self.converter = CTCLabelConverter(self.opt.character) else: self.converter = AttnLabelConverter(self.opt.character) self.opt.num_class = len(self.converter.character) #print(opt.rgb) if self.opt.rgb: self.opt.input_channel = 3 self.opt.num_gpu = torch.cuda.device_count() self.opt.batch_size = 192 #self.opt.batch_size = 3 self.opt.workers = 0 self.model = Model(self.opt) #image to text model (2nd model) self.model = torch.nn.DataParallel(self.model).to(self.device) self.model.load_state_dict( torch.load( 'deep-text-recognition-benchmark/TPS-ResNet-BiLSTM-Attn.pth', map_location=self.device)) self.model.eval()
def demo(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) try: model = torch.nn.DataParallel(model).to(device) except RuntimeError: raise RuntimeError(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) demo_data = RawDataset(root=opt.image_folder, opt=opt) # use RawDataset demo_loader = torch.utils.data.DataLoader(demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval() with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.max(2) # preds_index = preds_index.view(-1) preds_str = converter.decode(preds_index, preds_size) else: preds, alphas = model(image, text_for_pred, is_train=False) alphas = alphas.detach().cpu().numpy() if opt.batch_max_length == 1: # select top_k probabilty (greedy decoding) then decode index to character k = opt.topk preds = F.softmax(preds, dim=2) topk_prob, topk_id = preds.topk(k) topk_id = topk_id.detach().cpu()[:, 0, :].unsqueeze( dim=1).numpy() # (batch_size, topk) # concat 3(['s']) to the end of ids topk_s = np.ones_like(topk_id) * 3 topk_id = np.concatenate((topk_id, topk_s), axis=1) topk_chars = converter.decode(topk_id, length_for_pred) topk_probs = topk_prob.detach().cpu( )[:, 0, :] # (batch_size, topk) else: # select max probabilty (greedy decoding) then decode index to character k = opt.topk # _, preds_index = preds.max(dim=2) # preds_str = converter.decode(preds_index, length_for_pred) preds = F.softmax(preds, dim=2) topk_prob, topk_id = preds.topk(k, dim=2) topk_id = topk_id.detach().cpu().numpy( ) # (batch_size, topk) topk_probs = topk_prob.detach().cpu() topk_strs = converter.decode(topk_id, length_for_pred) if opt.batch_max_length == 1: log = open(f'./log_demo_result.csv', 'a', encoding='utf-8') # topk_probs = F.softmax(topk_probs, dim=-1) for img_name, pred, pred_max_prob in zip( image_path_list, topk_chars, topk_probs): if 'Attn' in opt.Prediction: pred = [p[:p.find('[s]')] for p in pred ] # prune after "end of sentence" token ([s]) print(img_name, end='') log.write(img_name) for pred_char, pred_prob in zip(pred, pred_max_prob): print(',' + pred_char, end='') print(',%.4f' % pred_prob, end='') log.write(',' + pred_char) log.write(',%.4f' % pred_prob) print() log.write('\n') log.close() else: log = open(f'./log_demo_result.txt', 'a', encoding='utf-8') dashed_line = '-' * 80 head = f'{"image_path":25s}\t{"predicted_labels":25s}\tconfidence score' print(f'{dashed_line}\n{head}\n{dashed_line}') log.write(f'{dashed_line}\n{head}\n{dashed_line}\n') # preds_prob = F.softmax(preds, dim=2) # preds_max_prob, _ = preds_prob.max(dim=2) if 'Attn' in opt.Prediction: for idx, (img_name, pred, pred_max_prob) in enumerate( zip(image_path_list, topk_strs, topk_probs)): pred_EOS = pred[0].find('[s]') pred = [s[:pred_EOS] for s in pred ] # prune after "end of sentence" token ([s]) pred_max_prob = pred_max_prob[:pred_EOS, :] if opt.output_split: alpha = alphas[idx, :, :].transpose() img = Image.open(img_name).convert('RGB') width, height = img.size alpha = alpha[:pred_EOS] if len(alpha) > 0: last_alpha_line = alpha[-1] # 消除padding的影响 seq_length = last_alpha_line.shape[0] column_range = np.arange(0, seq_length) ratio = height / width # too long, compress into opt shape, don't need pad if ratio > opt.imgH / opt.imgW: want_height = opt.imgW * ratio compress_ratio = want_height / opt.imgH expect_last_column = seq_length # need pad else: compress_ratio = 1 expect_height = height / width * opt.imgW expect_last_column = expect_height / opt.imgH * seq_length column_range = column_range - seq_length / 2 column_range = column_range / 320 * ( 320 + (compress_ratio - 1) * 32) column_range = column_range + seq_length / 2 # column_range = column_range - column_range[0] # last_column = np.argmax(last_alpha_line) last_column = np.dot(last_alpha_line, column_range) expect_linein = expect_last_column - last_column split_output = os.path.join( 'output', os.path.splitext( os.path.basename(img_name))[0] + '.txt') with open(split_output, 'w', encoding='utf-8') as fp: draw = ImageDraw.Draw(img) for alpha_line in alpha: column = np.dot( alpha_line, column_range) line_height = int( (column - expect_linein / 2) / (last_column - expect_linein / 2) * height) # line_height = int(column / last_column * height) line = [ 0, line_height, width - 1, line_height ] line = list(map(str, line)) fp.write(','.join(line) + '\n') draw.line(((0, line_height), (width - 1, line_height)), fill=(255, 0, 0), width=2) img.save( os.path.join( 'output', os.path.basename(img_name))) best_pred = pred[0] best_prob = pred_max_prob[:, 0] # calculate confidence score (= multiply of pred_max_prob) try: confidence_score = best_prob.cumprod(dim=0)[-1] except IndexError: confidence_score = 0.0 # print(f'{img_name:25s}\t{pred:25s}\t can\'t predict') # raise ValueError() print( f'{img_name:25s}\t{best_pred:25s}\t{confidence_score:0.4f}' ) log.write( f'{img_name:25s}\t{best_pred:25s}\t{confidence_score:0.4f}\n' ) for i in range(k): print(f'Candidatae {i:1d}: ', end='') for j in range(pred_EOS): print( f'{pred[i][j]}, prob: {pred_max_prob[j][i]:0.4f}\t', end='') print() else: preds_prob = F.softmax(preds, dim=2) preds_max_prob, _ = preds_prob.max(dim=2) for img_name, pred, pred_max_prob, pred_idx in zip( image_path_list, preds_str, preds_max_prob, preds_index): pred_EOS = len(pred) pred_max_prob = pred_max_prob[:pred_EOS] # calculate confidence score (= multiply of pred_max_prob) try: confidence_score = pred_max_prob.cumprod(dim=0)[-1] except IndexError: confidence_score = 0.0 # print(f'{img_name:25s}\t{pred:25s}\t can\'t predict') # raise ValueError() if opt.output_split: img = Image.open(img_name).convert('RGB') width, height = img.size pred_idx = pred_idx.detach().cpu().numpy().tolist() preds_len = len(pred_idx) ratio = height / width # too long, compress into opt shape, don't need pad if ratio > opt.imgH / opt.imgW: want_height = opt.imgW * ratio compress_ratio = want_height / opt.imgH expect_last_column = preds_len # need pad else: compress_ratio = 1 expect_height = height / width * opt.imgW expect_last_column = expect_height / opt.imgH * preds_len split_output = os.path.join( 'output', os.path.splitext(os.path.basename(img_name))[0] + '.txt') # hyper-parameter, suggestion 6-0.46-0.21 for 320CTC # TODO find hyper-parameter for 480CTC CTC_start = 6 center_ratio = 0.46 zoom_ratio = 0.21 # for CTC_start in np.arange(6.0, 7.1, 0.1): # for center_ratio in np.arange(0.37, 0.46, 0.01): # for zoom_ratio in np.arange(0.18, 0.23, 0.01): img = Image.open(img_name).convert('RGB') with open(split_output, 'w', encoding='utf-8') as fp: cur_pos = 0 draw = ImageDraw.Draw(img) index_group = itertools.groupby(pred_idx) for key, group in index_group: group = list(group) if key != 0: nxt_pos = cur_pos - 1 + len(group) column = (cur_pos + nxt_pos) // 2 column = column - CTC_start column = (column - preds_len * center_ratio) * (1 + zoom_ratio * compress_ratio) \ + (preds_len * center_ratio) line_height = int(column / expect_last_column * height) line = [ 0, line_height, width - 1, line_height ] line = list(map(str, line)) fp.write(','.join(line) + '\n') draw.line(((0, line_height), (width - 1, line_height)), fill=(255, 0, 0), width=2) cur_pos += len(group) img.save( os.path.join('output', os.path.basename(img_name))) # img.save(os.path.join('output', '{}_{:02d}_{:03d}_{:03d}.jpg'.format(os.path.splitext(os.path.basename(img_name))[0], int(CTC_start*10), int(center_ratio*100), int(zoom_ratio*100)))) print( f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}' ) log.write( f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}\n' ) log.close()
def train(opt): print(opt.local_rank) opt.device = torch.device('cuda:{}'.format(opt.local_rank)) device = opt.device """ dataset preparation """ train_dataset = Batch_Balanced_Dataset(opt) valid_loader = train_dataset.getValDataloader() print('-' * 80) """ model configuration """ if 'CTC' == opt.Prediction: converter = CTCLabelConverter(opt.character, opt) elif 'Attn' == opt.Prediction: converter = AttnLabelConverter(opt.character, opt) elif 'CTC_Attn' == opt.Prediction: converter = CTCLabelConverter(opt.character, opt), AttnLabelConverter( opt.character, opt) opt.num_class = len(opt.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) model.to(opt.device) print(model) print('model input parameters', opt.rgb, opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue """ setup loss """ if 'CTC' == opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) elif 'Attn' == opt.Prediction: criterion = torch.nn.CrossEntropyLoss( ignore_index=0).to(device), torch.nn.MSELoss( reduction="sum").to(device) # ignore [GO] token = ignore index 0 elif 'CTC_Attn' == opt.Prediction: criterion = torch.nn.CTCLoss( zero_infinity=True).to(device), torch.nn.CrossEntropyLoss( ignore_index=0).to(device), torch.nn.MSELoss( reduction='sum').to(device) # loss averager loss_avg = Averager() # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) if opt.local_rank == 0: print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.sgd: optimizer = optim.SGD(filtered_parameters, lr=opt.lr, momentum=0.9, weight_decay=opt.weight_decay) elif opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) if opt.local_rank == 0: print("Optimizer:") print(optimizer) if opt.sync_bn: model = apex.parallel.convert_syncbn_model(model) if opt.amp > 1: model, optimizer = amp.initialize(model, optimizer, opt_level="O" + str(opt.amp), keep_batchnorm_fp32=True, loss_scale="dynamic") else: model, optimizer = amp.initialize(model, optimizer, opt_level="O" + str(opt.amp)) # data parallel for multi-GPU model = DDP(model) if opt.continue_model != '': print(f'loading pretrained model from {opt.continue_model}') try: model.load_state_dict( torch.load(opt.continue_model, map_location=torch.device( 'cuda', torch.cuda.current_device()))) except: traceback.print_exc() print(f'COPYING pretrained model from {opt.continue_model}') pretrained_dict = torch.load(opt.continue_model, map_location=torch.device( 'cuda', torch.cuda.current_device())) model_dict = model.state_dict() pretrained_dict2 = dict() for k, v in pretrained_dict.items(): if opt.Prediction == 'Attn': if 'module.Prediction_attn.' in k: k = k.replace('module.Prediction_attn.', 'module.Prediction.') if k in model_dict and model_dict[k].shape == v.shape: pretrained_dict2[k] = v model_dict.update(pretrained_dict2) model.load_state_dict(model_dict) model.train() """ final options """ with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' opt_log += str(model) print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 start_time = time.time() best_accuracy = -1 best_norm_ED = 1e+6 i = start_iter ct = opt.batch_mul model.zero_grad() dist.barrier() while (True): # train part start = time.time() image, labels, pos = train_dataset.sync_get_batch() end = time.time() data_t = end - start start = time.time() batch_size = image.size(0) if 'CTC' == opt.Prediction: text, length = converter.encode( labels, batch_max_length=opt.batch_max_length) preds = model(image, text).log_softmax(2) preds_size = torch.IntTensor([preds.size(1)] * batch_size).to(device) preds = preds.permute(1, 0, 2) # to use CTCLoss format # To avoid ctc_loss issue, disabled cudnn for the computation of the ctc_loss # https://github.com/jpuigcerver/PyLaia/issues/16 torch.backends.cudnn.enabled = False cost = criterion(preds, text, preds_size, length) torch.backends.cudnn.enabled = True elif 'Attn' == opt.Prediction: text, length = converter.encode( labels, batch_max_length=opt.batch_max_length) preds = model(image, text[:, :-1]) # align with Attention.forward preds_attn = preds[0] preds_alpha = preds[1] target = text[:, 1:] # without [GO] Symbol cost = criterion[0](preds_attn.view(-1, preds_attn.shape[-1]), target.contiguous().view(-1)) if opt.posreg_w > 0.001: cost_pos = alpha_loss(preds_alpha, pos, opt, criterion[1]) print('attn_cost = ', cost, 'pos_cost = ', cost_pos * opt.posreg_w) cost += opt.posreg_w * cost_pos else: print('attn_cost = ', cost_attn) elif 'CTC_Attn' == opt.Prediction: text_ctc, length_ctc = converter[0].encode( labels, batch_max_length=opt.batch_max_length) text_attn, length_attn = converter[1].encode( labels, batch_max_length=opt.batch_max_length) """ ctc prediction and loss """ #should input text_attn here preds = model(image, text_attn[:, :-1]) preds_ctc = preds[0].log_softmax(2) preds_ctc_size = torch.IntTensor([preds_ctc.size(1)] * batch_size).to(device) preds_ctc = preds_ctc.permute(1, 0, 2) # to use CTCLoss format # To avoid ctc_loss issue, disabled cudnn for the computation of the ctc_loss # https://github.com/jpuigcerver/PyLaia/issues/16 torch.backends.cudnn.enabled = False cost_ctc = criterion[0](preds_ctc, text_ctc, preds_ctc_size, length_ctc) torch.backends.cudnn.enabled = True """ attention prediction and loss """ preds_attn = preds[1][0] # align with Attention.forward preds_alpha = preds[1][1] target = text_attn[:, 1:] # without [GO] Symbol cost_attn = criterion[1](preds_attn.view(-1, preds_attn.shape[-1]), target.contiguous().view(-1)) cost = opt.ctc_attn_loss_ratio * cost_ctc + ( 1 - opt.ctc_attn_loss_ratio) * cost_attn if opt.posreg_w > 0.001: cost_pos = alpha_loss(preds_alpha, pos, opt, criterion[2]) cost += opt.posreg_w * cost_pos cost_ctc = reduce_tensor(cost_ctc) cost_attn = reduce_tensor(cost_attn) cost_pos = reduce_tensor(cost_pos) if opt.local_rank == 0: print('ctc_cost = ', cost_ctc, 'attn_cost = ', cost_attn, 'pos_cost = ', cost_pos * opt.posreg_w) else: cost_ctc = reduce_tensor(cost_ctc) cost_attn = reduce_tensor(cost_attn) if opt.local_rank == 0: print('ctc_cost = ', cost_ctc, 'attn_cost = ', cost_attn) cost /= opt.batch_mul if opt.amp > 0: with amp.scale_loss(cost, optimizer) as scaled_loss: scaled_loss.backward() else: cost.backward() """ https://github.com/davidlmorton/learning-rate-schedules/blob/master/increasing_batch_size_without_increasing_memory.ipynb """ ct -= 1 if ct == 0: if opt.amp > 0: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), opt.grad_clip) else: torch.nn.utils.clip_grad_norm_( model.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) optimizer.step() model.zero_grad() ct = opt.batch_mul else: continue train_t = time.time() - start cost = reduce_tensor(cost) loss_avg.add(cost) if opt.local_rank == 0: print('iter', i, 'loss =', cost, ', data_t=', data_t, ',train_t=', train_t, ', batchsz=', opt.batch_mul * opt.batch_size) sys.stdout.flush() # validation part if (i > 0 and i % opt.valInterval == 0) or (i == 0 and opt.continue_model != ''): elapsed_time = time.time() - start_time print( f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}' ) # for log with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a') as log: log.write( f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n' ) loss_avg.reset() model.eval() with torch.no_grad(): if 'CTC_Attn' in opt.Prediction: # we only count for attention accuracy, because ctc is used to help attention valid_loss, current_accuracy_ctc, current_accuracy, current_norm_ED_ctc, current_norm_ED, preds, labels, infer_time, length_of_data = validation( model, criterion[1], valid_loader, converter[1], opt, converter[0]) elif 'Attn' in opt.Prediction: valid_loss, current_accuracy, current_norm_ED, preds, labels, infer_time, length_of_data = validation( model, criterion[0], valid_loader, converter, opt) else: valid_loss, current_accuracy, current_norm_ED, preds, labels, infer_time, length_of_data = validation( model, criterion, valid_loader, converter, opt) model.train() for pred, gt in zip(preds[:10], labels[:10]): if 'Attn' in opt.Prediction: pred = pred[:pred.find('[s]')] gt = gt[:gt.find('[s]')] print(f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}') log.write( f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}\n') valid_log = f'[{i}/{opt.num_iter}] valid loss: {valid_loss:0.5f}' valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}' if 'CTC_Attn' in opt.Prediction: valid_log += f' ctc_accuracy: {current_accuracy_ctc:0.3f}, ctc_norm_ED: {current_norm_ED_ctc:0.2f}' current_accuracy = max(current_accuracy, current_accuracy_ctc) current_norm_ED = min(current_norm_ED, current_norm_ED_ctc) if opt.local_rank == 0: print(valid_log) log.write(valid_log + '\n') # keep best accuracy model if current_accuracy > best_accuracy: best_accuracy = current_accuracy torch.save( model.state_dict(), f'./saved_models/{opt.experiment_name}/best_accuracy.pth' ) torch.save( model, f'./saved_models/{opt.experiment_name}/best_accuracy.model' ) if current_norm_ED < best_norm_ED: best_norm_ED = current_norm_ED torch.save( model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth' ) torch.save( model, f'./saved_models/{opt.experiment_name}/best_norm_ED.model' ) best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}' print(best_model_log) log.write(best_model_log + '\n') # save model per iter. if (i + 1) % opt.save_interval == 0 and opt.local_rank == 0: torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth') if i == opt.num_iter: print('end the training') sys.exit() if opt.prof_iter > 0 and i > opt.prof_iter: sys.exit() i += 1
def demo(opt, length, db_url): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) demo_data = RawDataset(root=opt.image_folder, opt=opt) # use RawDataset demo_loader = torch.utils.data.DataLoader(demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval() with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.max(2) # preds_index = preds_index.view(-1) preds_str = converter.decode(preds_index, preds_size) else: preds = model(image, text_for_pred, is_train=False) # select max probabilty (greedy decoding) then decode index to character _, preds_index = preds.max(2) preds_str = converter.decode(preds_index, length_for_pred) log = open(f'./log_demo_result.txt', 'a') dashed_line = '-' * 80 head = f'{"image_path":25s}\t{"predicted_labels":25s}\tconfidence score' print(f'{dashed_line}\n{head}\n{dashed_line}') log.write(f'{dashed_line}\n{head}\n{dashed_line}\n') preds_prob = F.softmax(preds, dim=2) preds_max_prob, _ = preds_prob.max(dim=2) for img_name, pred, pred_max_prob in zip(image_path_list, preds_str, preds_max_prob): #we are only interested in plates themselves if img_name.find('plate_', 0, len(img_name)) == -1: continue if 'Attn' in opt.Prediction: pred_EOS = pred.find('[s]') pred = pred[: pred_EOS] # prune after "end of sentence" token ([s]) pred_max_prob = pred_max_prob[:pred_EOS] # calculate confidence score (= multiply of pred_max_prob) confidence_score = pred_max_prob.cumprod(dim=0)[-1] #getting name of the current image img_name = img_name.replace('.jpg', '') img_name = img_name.replace('res_', '') img_name = img_name.replace('plate_', '') #cutting piece of full path which equals length print(length) img_name = img_name[length:] print(img_name) #splitting into image name and db name res = re.split(r'/', img_name) base = res[0] img_name = res[1] print(base) #splitting into first number of image and second (which are frame num and id respectively) result = re.split(r'_', img_name) print(result) engine = create_engine(db_url) conn = engine.connect() #writing recognised numbers to db sql = text('UPDATE table_' + base + ' SET plate_number =' + pred + ' WHERE frame = ' + result[0] + ' AND id = ' + result[1] + ' ;') engine.execute(sql) #print(result[0]) #print(result[1]) #print(base) print(f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}') log.write( f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}\n') ##img_name =re.sub(r'\w+\/', '', img_name) ##result = re.split(r'_', img_name) log.close()
def __init__(self, args): path = os.path.abspath(__file__) dir_path = os.path.dirname(path) self.opts.workers = args.get("workers", 5) # number of data loading workers ## data processing args self.opts.batch_size = args.get('batch_size', 50) #maximum-label-length self.opts.batch_max_length = args.get('batch_max_length', 25) #maximum-label-length self.opts.imgH = args.get('imgH', 32) # the height of the input image self.opts.imgW = args.get('imgW', 100) # #the width of the input image self.opts.rgb = args.get('rgb', False) # use rgb input self.opts.character = args.get( 'character', '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ) #self.opts.character = args.get('character','0123456789abcdefghijklmnopqrstuvwxyz') #character label self.opts.sensitive = args.get('sensitive', True) #for sensitive character mode self.opts.PAD = args.get( 'PAD', True) #whether to keep ratio then pad for image resize ## Model architecture self.opts.Transformation = args.get( 'Transformation', 'TPS') #Transformation stage. None|TPS self.opts.FeatureExtraction = args.get( 'FeatureExtraction', 'ResNet') #FeatureExtraction stage. VGG|RCNN|ResNet self.opts.SequenceModeling = args.get( 'SequenceModeling', 'BiLSTM') #SequenceModeling stage. None|BiLSTM self.opts.Prediction = args.get('Prediction', 'Attn') #Prediction stage. CTC|Attn self.opts.num_fiducial = args.get( 'num_fiducial', 20) #number of fiducial points of TPS-STN self.opts.input_channel = args.get( 'input_channel', 1) #the number of input channel of Feature extractor self.opts.output_channel = args.get( 'output_channel', 512) #the number of output channel of Feature extractor self.opts.hidden_size = args.get( 'hidden_size', 256) #the size of the LSTM hidden state self.opts.num_gpu = 0 if torch.cuda.is_available( ) else torch.cuda.device_count() self.opts.saved_model = args.get( 'saved_model', dir_path + "/models/TPS-ResNet-BiLSTM-Attn-case-sensitive.pth") if self.opts.sensitive: self.opts.character = string.printable[: -6] # same with ASTER setting (use 94 char). if 'CTC' in self.opts.Prediction: self.converter = CTCLabelConverter(self.opts.character) else: self.converter = AttnLabelConverter(self.opts.character) self.opts.num_class = len(self.converter.character) if self.opts.rgb: self.opts.input_channel = 3 self.pre_load_model()
def train(opt): """ dataset preparation """ opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') train_dataset = Batch_Balanced_Dataset(opt) AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset = hierarchical_dataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, # 'True' to check training progress with validation function. shuffle=True, num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) print('-' * 80) """ model configuration """ if 'Transformer' in opt.SequenceModeling: converter = TransformerLabelConverter(opt.character) elif 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue """ setup loss """ if 'Transformer' in opt.SequenceModeling: criterion = transformer_loss elif 'CTC' in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).cuda() else: # ignore [GO] token = ignore index 0 criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda() # loss averager loss_avg = Averager() # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) elif 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim: optimizer = optim.Adam(filtered_parameters, betas=(0.9, 0.98), eps=1e-09) optimizer_schedule = ScheduledOptim(optimizer, opt.d_model, opt.n_warmup_steps) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("Optimizer:") print(optimizer) """ final options """ # print(opt) with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 start_time = time.time() best_accuracy = -1 best_norm_ED = 1e+6 pickle.load = partial(pickle.load, encoding="latin1") pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1") if opt.load_weights != '' and check_isfile(opt.load_weights): # load pretrained weights but ignore layers that don't match in size checkpoint = torch.load(opt.load_weights, pickle_module=pickle) if type(checkpoint) == dict: pretrain_dict = checkpoint['state_dict'] else: pretrain_dict = checkpoint model_dict = model.state_dict() pretrain_dict = { k: v for k, v in pretrain_dict.items() if k in model_dict and model_dict[k].size() == v.size() } model_dict.update(pretrain_dict) model.load_state_dict(model_dict) print("Loaded pretrained weights from '{}'".format(opt.load_weights)) del checkpoint torch.cuda.empty_cache() if opt.continue_model != '': print(f'loading pretrained model from {opt.continue_model}') checkpoint = torch.load(opt.continue_model) print(checkpoint.keys()) model.load_state_dict(checkpoint['state_dict']) start_iter = checkpoint['step'] + 1 print('continue to train start_iter: ', start_iter) if 'optimizer' in checkpoint.keys(): optimizer.load_state_dict(checkpoint['optimizer']) for state in optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() if 'best_accuracy' in checkpoint.keys(): best_accuracy = checkpoint['best_accuracy'] if 'best_norm_ED' in checkpoint.keys(): best_norm_ED = checkpoint['best_norm_ED'] del checkpoint torch.cuda.empty_cache() # data parallel for multi-GPU model = torch.nn.DataParallel(model).cuda() model.train() print("Model size:", count_num_param(model), 'M') if 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim: optimizer_schedule.n_current_steps = start_iter for i in tqdm(range(start_iter, opt.num_iter)): for p in model.parameters(): p.requires_grad = True cpu_images, cpu_texts = train_dataset.get_batch() image = cpu_images.cuda() if 'Transformer' in opt.SequenceModeling: text, length, text_pos = converter.encode(cpu_texts, opt.batch_max_length) elif 'CTC' in opt.Prediction: text, length = converter.encode(cpu_texts) else: text, length = converter.encode(cpu_texts, opt.batch_max_length) batch_size = image.size(0) if 'Transformer' in opt.SequenceModeling: preds = model(image, text, tgt_pos=text_pos) target = text[:, 1:] # without <s> Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) elif 'CTC' in opt.Prediction: preds = model(image, text).log_softmax(2) preds_size = torch.IntTensor([preds.size(1)] * batch_size) preds = preds.permute(1, 0, 2) # to use CTCLoss format cost = criterion(preds, text, preds_size, length) else: preds = model(image, text) target = text[:, 1:] # without [GO] Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) model.zero_grad() cost.backward() if 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim: optimizer_schedule.step_and_update_lr() elif 'Transformer' in opt.SequenceModeling: optimizer.step() else: # gradient clipping with 5 (Default) torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) optimizer.step() loss_avg.add(cost) # validation part if i > 0 and (i + 1) % opt.valInterval == 0: elapsed_time = time.time() - start_time print( f'[{i+1}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}' ) # for log with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a') as log: log.write( f'[{i+1}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n' ) loss_avg.reset() model.eval() with torch.no_grad(): valid_loss, current_accuracy, current_norm_ED, preds, gts, infer_time, length_of_data = validation( model, criterion, valid_loader, converter, opt) model.train() for pred, gt in zip(preds[:5], gts[:5]): if 'Transformer' in opt.SequenceModeling: pred = pred[:pred.find('</s>')] gt = gt[:gt.find('</s>')] elif 'Attn' in opt.Prediction: pred = pred[:pred.find('[s]')] gt = gt[:gt.find('[s]')] print(f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}') log.write( f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}\n') valid_log = f'[{i+1}/{opt.num_iter}] valid loss: {valid_loss:0.5f}' valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}' print(valid_log) log.write(valid_log + '\n') # keep best accuracy model if current_accuracy > best_accuracy: best_accuracy = current_accuracy state_dict = model.module.state_dict() save_checkpoint( { 'best_accuracy': best_accuracy, 'state_dict': state_dict, }, False, f'./saved_models/{opt.experiment_name}/best_accuracy.pth' ) if current_norm_ED < best_norm_ED: best_norm_ED = current_norm_ED state_dict = model.module.state_dict() save_checkpoint( { 'best_norm_ED': best_norm_ED, 'state_dict': state_dict, }, False, f'./saved_models/{opt.experiment_name}/best_norm_ED.pth' ) # torch.save( # model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth') best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}' print(best_model_log) log.write(best_model_log + '\n') # save model per 1000 iter. if (i + 1) % 1000 == 0: state_dict = model.module.state_dict() optimizer_state_dict = optimizer.state_dict() save_checkpoint( { 'state_dict': state_dict, 'optimizer': optimizer_state_dict, 'step': i, 'best_accuracy': best_accuracy, 'best_norm_ED': best_norm_ED, }, False, f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')
def train(opt, show_number=2, amp=False): """ dataset preparation """ if not opt.data_filtering_off: print( 'Filtering the images containing characters which are not in opt.character' ) print( 'Filtering the images whose label is longer than opt.batch_max_length' ) opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') train_dataset = Batch_Balanced_Dataset(opt) log = open(f'./saved_models/{opt.experiment_name}/log_dataset.txt', 'a', encoding="utf8") AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD, contrast_adjust=opt.contrast_adjust) valid_dataset, valid_dataset_log = hierarchical_dataset( root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=min(32, opt.batch_size), shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), prefetch_factor=512, collate_fn=AlignCollate_valid, pin_memory=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + '\n') log.close() """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) if opt.saved_model != '': pretrained_dict = torch.load(opt.saved_model) if opt.new_prediction: model.Prediction = nn.Linear( model.SequenceModeling_output, len(pretrained_dict['module.Prediction.weight'])) model = torch.nn.DataParallel(model).to(device) print(f'loading pretrained model from {opt.saved_model}') if opt.FT: model.load_state_dict(pretrained_dict, strict=False) else: model.load_state_dict(pretrained_dict) if opt.new_prediction: model.module.Prediction = nn.Linear( model.module.SequenceModeling_output, opt.num_class) for name, param in model.module.Prediction.named_parameters(): if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) model = model.to(device) else: # weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue model = torch.nn.DataParallel(model).to(device) model.train() print("Model:") print(model) count_parameters(model) """ setup loss """ if 'CTC' in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to( device) # ignore [GO] token = ignore index 0 # loss averager loss_avg = Averager() # freeze some layers try: if opt.freeze_FeatureFxtraction: for param in model.module.FeatureExtraction.parameters(): param.requires_grad = False if opt.freeze_SequenceModeling: for param in model.module.SequenceModeling.parameters(): param.requires_grad = False except: pass # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.optim == 'adam': #optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) optimizer = optim.Adam(filtered_parameters) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("Optimizer:") print(optimizer) """ final options """ # print(opt) with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a', encoding="utf8") as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_model != '': try: start_iter = int(opt.saved_model.split('_')[-1].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass start_time = time.time() best_accuracy = -1 best_norm_ED = -1 i = start_iter scaler = GradScaler() t1 = time.time() while (True): # train part optimizer.zero_grad(set_to_none=True) if amp: with autocast(): image_tensors, labels = train_dataset.get_batch() image = image_tensors.to(device) text, length = converter.encode( labels, batch_max_length=opt.batch_max_length) batch_size = image.size(0) if 'CTC' in opt.Prediction: preds = model(image, text).log_softmax(2) preds_size = torch.IntTensor([preds.size(1)] * batch_size) preds = preds.permute(1, 0, 2) torch.backends.cudnn.enabled = False cost = criterion(preds, text.to(device), preds_size.to(device), length.to(device)) torch.backends.cudnn.enabled = True else: preds = model(image, text[:, :-1]) # align with Attention.forward target = text[:, 1:] # without [GO] Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) scaler.scale(cost).backward() scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) scaler.step(optimizer) scaler.update() else: image_tensors, labels = train_dataset.get_batch() image = image_tensors.to(device) text, length = converter.encode( labels, batch_max_length=opt.batch_max_length) batch_size = image.size(0) if 'CTC' in opt.Prediction: preds = model(image, text).log_softmax(2) preds_size = torch.IntTensor([preds.size(1)] * batch_size) preds = preds.permute(1, 0, 2) torch.backends.cudnn.enabled = False cost = criterion(preds, text.to(device), preds_size.to(device), length.to(device)) torch.backends.cudnn.enabled = True else: preds = model(image, text[:, :-1]) # align with Attention.forward target = text[:, 1:] # without [GO] Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) cost.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) optimizer.step() loss_avg.add(cost) # validation part if (i % opt.valInterval == 0) and (i != 0): print('training time: ', time.time() - t1) t1 = time.time() elapsed_time = time.time() - start_time # for log with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a', encoding="utf8") as log: model.eval() with torch.no_grad(): valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels,\ infer_time, length_of_data = validation(model, criterion, valid_loader, converter, opt, device) model.train() # training loss and validation loss loss_log = f'[{i}/{opt.num_iter}] Train loss: {loss_avg.val():0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}' loss_avg.reset() current_model_log = f'{"Current_accuracy":17s}: {current_accuracy:0.3f}, {"Current_norm_ED":17s}: {current_norm_ED:0.4f}' # keep best accuracy model (on valid dataset) if current_accuracy > best_accuracy: best_accuracy = current_accuracy torch.save( model.state_dict(), f'./saved_models/{opt.experiment_name}/best_accuracy.pth' ) if current_norm_ED > best_norm_ED: best_norm_ED = current_norm_ED torch.save( model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth' ) best_model_log = f'{"Best_accuracy":17s}: {best_accuracy:0.3f}, {"Best_norm_ED":17s}: {best_norm_ED:0.4f}' loss_model_log = f'{loss_log}\n{current_model_log}\n{best_model_log}' print(loss_model_log) log.write(loss_model_log + '\n') # show some predicted results dashed_line = '-' * 80 head = f'{"Ground Truth":25s} | {"Prediction":25s} | Confidence Score & T/F' predicted_result_log = f'{dashed_line}\n{head}\n{dashed_line}\n' #show_number = min(show_number, len(labels)) start = random.randint(0, len(labels) - show_number) for gt, pred, confidence in zip( labels[start:start + show_number], preds[start:start + show_number], confidence_score[start:start + show_number]): if 'Attn' in opt.Prediction: gt = gt[:gt.find('[s]')] pred = pred[:pred.find('[s]')] predicted_result_log += f'{gt:25s} | {pred:25s} | {confidence:0.4f}\t{str(pred == gt)}\n' predicted_result_log += f'{dashed_line}' print(predicted_result_log) log.write(predicted_result_log + '\n') print('validation time: ', time.time() - t1) t1 = time.time() # save model per 1e+4 iter. if (i + 1) % 1e+4 == 0: torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth') if i == opt.num_iter: print('end the training') sys.exit() i += 1
def demo(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) demo_data = FontDataset(opt=opt) # use FontDataset demo_loader = torch.utils.data.DataLoader( demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval() total_cnt = 0 acc1_cnt = 0 acck_cnt = 0 with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.max(2) preds_index = preds_index.view(-1) preds_str = converter.decode(preds_index.data, preds_size.data) else: preds, alphas = model(image, text_for_pred, is_train=False) if opt.batch_max_length == 1: # select top_k probabilty (greedy decoding) then decode index to character k = opt.topk preds = F.softmax(preds, dim=2) topk = preds.topk(k) topk_id = topk[1] topk_prob = topk[0] topk_id = topk_id.detach().cpu()[:, 0, :].unsqueeze(dim=1).numpy() # (batch_size, topk) # concat 3(['s']) to the end of ids topk_s = np.ones_like(topk_id) * 3 topk_id = np.concatenate((topk_id, topk_s), axis=1) topk_chars = converter.decode(topk_id, length_for_pred) topk_probs = topk_prob.detach().cpu()[:, 0, :] # (batch_size, topk) else: raise ValueError if opt.batch_max_length == 1: log = open(f'./log_demo_result.csv', 'a', encoding='utf-8') # topk_probs = F.softmax(topk_probs, dim=-1) for img_name, pred, pred_max_prob in zip(image_path_list, topk_chars, topk_probs): if 'Attn' in opt.Prediction: pred = [p[:p.find('[s]')] for p in pred] # prune after "end of sentence" token ([s]) # print(img_name, end='') log.write(img_name) for pred_char, pred_prob in zip(pred, pred_max_prob): # print(','+pred_char, end='') # print(',%.4f' % pred_prob, end='') log.write(','+pred_char) log.write(',%.4f' % pred_prob) # print() log.write('\n') if img_name == pred[0]: acc1_cnt += 1 if img_name in pred: acck_cnt += 1 total_cnt += 1 log.close() else: raise ValueError return total_cnt, acc1_cnt, acck_cnt
def test(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) opt.exp_name = '_'.join(opt.saved_model.split('/')[1:]) # print(model) # return model AlignCollate_evaluation = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) eval_data, eval_data_log = hierarchical_dataset(root=opt.eval_data, opt=opt) evaluation_loader = torch.utils.data.DataLoader( eval_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_evaluation, pin_memory=True) # _, accuracy_by_best_model, _, _, _, _, _, _ = validation( # model, criterion, evaluation_loader, converter, opt) for i, (image_tensors, labels) in enumerate(evaluation_loader): # batch_size = image_tensors.size(0) # text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) target_layer = model.module.FeatureExtraction.ConvNet.layer4[-1] # model.eval() # handle =model.module.Transformation.register_forward_hook(hook) # model(image_tensors,text_for_pred) input_tensor = image_tensors # print(labels) # print(input_tensor.shape,'input_tensor.shape') # handle.remove() # print(input_tensor) # Create an input tensor image for your model.. # input_tensor=image_tensors # Note: input_tensor can be a batch tensor with several images! # print(labels) # Construct the CAM object once, and then re-use it on many images: cam = EigenCAM(model=model, target_layer=target_layer, use_cuda=opt.use_cuda) # If target_category is None, the highest scoring category # will be used for every image in the batch. # target_category can also be an integer, or a list of different integers # for every image in the batch. text_for_loss, length_for_loss = converter.encode( labels, batch_max_length=opt.batch_max_length) target_category = text_for_loss # You can also pass aug_smooth=True and eigen_smooth=True, to apply smoothing. grayscale_cam = cam(input_tensor=input_tensor, target_category=target_category) # In this example grayscale_cam has only one image in the batch: grayscale_cam = grayscale_cam[0, :] loader = transforms.ToPILImage() sourc_image = loader(image_tensors[0]) sourc_image = cv2.cvtColor(np.asarray(sourc_image), cv2.COLOR_RGB2BGR) # rgb_img=loader(input_tensor[0].cpu()) # rgb_img.save('rgb_visual.bmp') # rgb_img2=cv2.imread('rgb_visual.bmp') rgb_img2 = np.float32(sourc_image) / 255 visualization = show_cam_on_image(rgb_img2, grayscale_cam) sourc_image = cv2.resize(sourc_image, (0, 0), fx=5, fy=5, interpolation=cv2.INTER_CUBIC) visualization = cv2.resize(visualization, (0, 0), fx=5, fy=5, interpolation=cv2.INTER_CUBIC) cat_image = np.vstack((sourc_image, visualization)) cv2.imwrite('visual_image2/' + str(i) + '_2.bmp', cat_image)
def test(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) opt.exp_name = '_'.join(opt.saved_model.split('/')[1:]) # print(model) """ keep evaluation model and result logs """ os.makedirs(f'./result/{opt.exp_name}', exist_ok=True) os.system(f'cp {opt.saved_model} ./result/{opt.exp_name}/') """ setup loss """ if 'CTC' in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to( device) # ignore [GO] token = ignore index 0 """ evaluation """ model.eval() with torch.no_grad(): if opt.benchmark_all_eval: # evaluation with 10 benchmark evaluation datasets benchmark_all_eval(model, criterion, converter, opt) else: log = open(f'./result/{opt.exp_name}/log_evaluation.txt', 'a') AlignCollate_evaluation = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) eval_data, eval_data_log = hierarchical_dataset(root=opt.eval_data, opt=opt) evaluation_loader = torch.utils.data.DataLoader( eval_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_evaluation, pin_memory=True) # valid_loss_avg.val(), accuracy, norm_ED, preds_str, confidence_score_list, labels, infer_time, length_of_data if opt.by_length: _, accuracy, norm_ED, preds_str, confidence_score_list, _, _, length_of_data = validation_by_length( model, criterion, evaluation_loader, converter, opt) log.write(eval_data_log) length_key = sorted(accuracy.keys()) for length in length_key: print( f'Length: {length}, ACC: {accuracy[length]:0.3f}, norm_ED: {norm_ED[length]:0.3f}' ) log.write( f'Length: {length}, ACC: {accuracy[length]:0.3f}, norm_ED: {norm_ED[length]:0.3f}\n' ) log.close() else: _, accuracy, norm_ED, preds_str, confidence_score_list, _, _, length_of_data = validation( model, criterion, evaluation_loader, converter, opt) log.write(eval_data_log) print(f'ACC: {accuracy:0.3f}, norm_ED: {norm_ED:0.3f}') log.write(f'ACC: {accuracy:0.3f}, norm_ED: {norm_ED:0.3f}\n') log.close()
def demo(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) # model.load_state_dict(copy_state_dict(torch.load(opt.saved_model, map_location=device))) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) # demo_data = RawDataset(root=opt.image_folder, opt=opt) # use RawDataset demo_data = LmdbDataset(root=opt.image_folder, opt=opt, mode='Val') # use RawDataset demo_loader = torch.utils.data.DataLoader(demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True, drop_last=True) log = open(f'./log_demo_result.txt', 'a') # predict model.eval() fail_count, sample_count = 0, 0 record_count = 1 with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.max(2) # preds_index = preds_index.view(-1) preds_str = converter.decode(preds_index, preds_size) else: preds = model(image, text_for_pred, is_train=False) # select max probabilty (greedy decoding) then decode index to character _, preds_index = preds.max(2) preds_str = converter.decode(preds_index, length_for_pred) dashed_line = '-' * 80 head = f'{"image_path":25s}\t{"predicted_labels":25s}\tconfidence score' print(f'{dashed_line}\n{head}\n{dashed_line}') log.write(f'{dashed_line}\n{head}\n{dashed_line}\n') preds_prob = F.softmax(preds, dim=2) preds_max_prob, _ = preds_prob.max(dim=2) for image_tensor, gt, pred, pred_max_prob in zip( image_tensors, image_path_list, preds_str, preds_max_prob): if 'Attn' in opt.Prediction: pred_EOS = pred.find('[s]') pred = pred[: pred_EOS] # prune after "end of sentence" token ([s]) pred_max_prob = pred_max_prob[:pred_EOS] if pred_max_prob.shape[0] > 0: # calculate confidence score (= multiply of pred_max_prob) confidence_score = pred_max_prob.cumprod(dim=0)[-1] else: confidence_score = 0.0 # gt = img_name.split('_L_')[1] # gt = gt.split('.')[0] # pred = pred.split('.')[0] # except IndexError: # print(f'Index Error {img_name}') # raise IndexError # if img_name.find('1_225427_L_대전출입국관리사무소_L_21.png') >=0 : # print(f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}') # if gt.split('(')[0] != pred.split('(')[0]: # fail_count += 1 # log.write(f'{gt:25s}\t{pred:25s}\t{confidence_score:0.4f}\n') # # import shutil # shutil.copy(gt, os.path.join('./result', os.path.basename(img_name))) # if gt.find('#') >= 0: # continue compare_gt = "".join(x.upper() for x in gt if x.isalnum()) compare_pred = "".join(x.upper() for x in pred if x.isalnum()) if compare_gt != compare_pred: fail_count += 1 print( f'{gt:25s}\t{pred:25s}\tFail\t{confidence_score:0.4f}\t{record_count}\n' ) im = to_pil_image(image_tensor) try: im.save( os.path.join( 'result', f'{fail_count}_{compare_pred}_{compare_gt}.jpeg' )) except Exception as e: print( f'Error: {e} {fail_count}_{compare_pred}_{compare_gt}' ) exit(1) else: # print(f'{gt:25s}\t{pred:25s}\tSuccess\t{confidence_score:0.4f}') pass sample_count += 1 record_count += 1 log.close() print(f'total accuracy: {(sample_count-fail_count)/sample_count:.2f}')
def test(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) # CTCLoss converter_ctc = CTCLabelConverter(opt.character) # Attention converter_atten = AttnLabelConverter(opt.character) opt.num_class_ctc = len(converter_ctc.character) opt.num_class_atten = len(converter_atten.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = model.to(device) # if torch.cuda.device_count() > 1: # model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) # model.load_state_dict(torch.load(opt.saved_model, map_location=device)) model_ = torch.load(opt.saved_model, map_location=device) if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model).to(device) model.load_state_dict(model_) else: model2_ = OrderedDict() for k in model_.keys(): model2_[k.replace('module.', '')] = model_[k] model.load_state_dict(model2_) opt.exp_name = '_'.join(opt.saved_model.split('/')[1:]) # print(model) """ keep evaluation model and result logs """ os.makedirs(f'./result/{opt.exp_name}', exist_ok=True) os.system(f'cp {opt.saved_model} ./result/{opt.exp_name}/') """ setup loss """ if 'CTC' in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to( device) # ignore [GO] token = ignore index 0 """ evaluation """ model.eval() with torch.no_grad(): if opt.benchmark_all_eval: # evaluation with 10 benchmark evaluation datasets benchmark_all_eval(model, criterion, converter, opt) else: log = open(f'./result/{opt.exp_name}/log_evaluation.txt', 'a') AlignCollate_evaluation = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) eval_data, eval_data_log = hierarchical_dataset(root=opt.eval_data, opt=opt) evaluation_loader = torch.utils.data.DataLoader( eval_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_evaluation, pin_memory=True) _, accuracy_by_best_model, _, _, _, _, _, _ = validation( model, criterion, evaluation_loader, converter, opt) log.write(eval_data_log) print(f'{accuracy_by_best_model:0.3f}') log.write(f'{accuracy_by_best_model:0.3f}\n') log.close()
def train(opt, log): """dataset preparation""" # train dataset. for convenience if opt.select_data == "label": select_data = [ "1.SVT", "2.IIIT", "3.IC13", "4.IC15", "5.COCO", "6.RCTW17", "7.Uber", "8.ArT", "9.LSVT", "10.MLT19", "11.ReCTS", ] elif opt.select_data == "synth": select_data = ["MJ", "ST"] elif opt.select_data == "synth_SA": select_data = ["MJ", "ST", "SA"] opt.batch_ratio = "0.4-0.4-0.2" # same ratio with SCATTER paper. elif opt.select_data == "mix": select_data = [ "1.SVT", "2.IIIT", "3.IC13", "4.IC15", "5.COCO", "6.RCTW17", "7.Uber", "8.ArT", "9.LSVT", "10.MLT19", "11.ReCTS", "MJ", "ST", ] elif opt.select_data == "mix_SA": select_data = [ "1.SVT", "2.IIIT", "3.IC13", "4.IC15", "5.COCO", "6.RCTW17", "7.Uber", "8.ArT", "9.LSVT", "10.MLT19", "11.ReCTS", "MJ", "ST", "SA", ] else: select_data = opt.select_data.split("-") # set batch_ratio for each data. if opt.batch_ratio: batch_ratio = opt.batch_ratio.split("-") else: batch_ratio = [round(1 / len(select_data), 3)] * len(select_data) train_loader = Batch_Balanced_Dataset(opt, opt.train_data, select_data, batch_ratio, log) if opt.semi != "None": select_data_unlabel = ["U1.Book32", "U2.TextVQA", "U3.STVQA"] batch_ratio_unlabel = [round(1 / len(select_data_unlabel), 3) ] * len(select_data_unlabel) dataset_root_unlabel = "data_CVPR2021/training/unlabel/" train_loader_unlabel_semi = Batch_Balanced_Dataset( opt, dataset_root_unlabel, select_data_unlabel, batch_ratio_unlabel, log, learn_type="semi", ) AlignCollate_valid = AlignCollate(opt, mode="test") valid_dataset, valid_dataset_log = hierarchical_dataset( root=opt.valid_data, opt=opt, mode="test") valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=False, ) log.write(valid_dataset_log) print("-" * 80) log.write("-" * 80 + "\n") """ model configuration """ if "CTC" in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.sos_token_index = converter.dict["[SOS]"] opt.eos_token_index = converter.dict["[EOS]"] opt.num_class = len(converter.character) model = Model(opt) # weight initialization for name, param in model.named_parameters(): if "localization_fc2" in name: print(f"Skip {name} as it is already initialized") continue try: if "bias" in name: init.constant_(param, 0.0) elif "weight" in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if "weight" in name: param.data.fill_(1) continue # data parallel for multi-GPU model = torch.nn.DataParallel(model).to(device) model.train() if opt.saved_model != "": fine_tuning_log = f"### loading pretrained model from {opt.saved_model}\n" if "MoCo" in opt.saved_model or "MoCo" in opt.self_pre: pretrained_state_dict_qk = torch.load(opt.saved_model) pretrained_state_dict = {} for name in pretrained_state_dict_qk: if "encoder_q" in name: rename = name.replace("encoder_q.", "") pretrained_state_dict[rename] = pretrained_state_dict_qk[ name] else: pretrained_state_dict = torch.load(opt.saved_model) for name, param in model.named_parameters(): try: param.data.copy_(pretrained_state_dict[name].data ) # load from pretrained model if opt.FT == "freeze": param.requires_grad = False # Freeze fine_tuning_log += f"pretrained layer (freezed): {name}\n" else: fine_tuning_log += f"pretrained layer: {name}\n" except: fine_tuning_log += f"non-pretrained layer: {name}\n" print(fine_tuning_log) log.write(fine_tuning_log + "\n") # print("Model:") # print(model) log.write(repr(model) + "\n") """ setup loss """ if "CTC" in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: # ignore [PAD] token criterion = torch.nn.CrossEntropyLoss( ignore_index=converter.dict["[PAD]"]).to(device) if "Pseudo" in opt.semi: criterion_SemiSL = PseudoLabelLoss(opt, converter, criterion) elif "MeanT" in opt.semi: criterion_SemiSL = MeanTeacherLoss(opt, student_for_init_teacher=model) # loss averager train_loss_avg = Averager() semi_loss_avg = Averager() # semi supervised loss avg # filter that only require gradient descent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print(f"Trainable params num: {sum(params_num)}") log.write(f"Trainable params num: {sum(params_num)}\n") # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.optimizer == "sgd": optimizer = torch.optim.SGD( filtered_parameters, lr=opt.lr, momentum=opt.sgd_momentum, weight_decay=opt.sgd_weight_decay, ) elif opt.optimizer == "adadelta": optimizer = torch.optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) elif opt.optimizer == "adam": optimizer = torch.optim.Adam(filtered_parameters, lr=opt.lr) print("Optimizer:") print(optimizer) log.write(repr(optimizer) + "\n") if "super" in opt.schedule: if opt.optimizer == "sgd": cycle_momentum = True else: cycle_momentum = False scheduler = torch.optim.lr_scheduler.OneCycleLR( optimizer, max_lr=opt.lr, cycle_momentum=cycle_momentum, div_factor=20, final_div_factor=1000, total_steps=opt.num_iter, ) print("Scheduler:") print(scheduler) log.write(repr(scheduler) + "\n") """ final options """ # print(opt) opt_log = "------------ Options -------------\n" args = vars(opt) for k, v in args.items(): if str(k) == "character" and len(str(v)) > 500: opt_log += f"{str(k)}: So many characters to show all: number of characters: {len(str(v))}\n" else: opt_log += f"{str(k)}: {str(v)}\n" opt_log += "---------------------------------------\n" print(opt_log) log.write(opt_log) log.close() """ start training """ start_iter = 0 if opt.saved_model != "": try: start_iter = int(opt.saved_model.split("_")[-1].split(".")[0]) print(f"continue to train, start_iter: {start_iter}") except: pass start_time = time.time() best_score = -1 # training loop for iteration in tqdm( range(start_iter + 1, opt.num_iter + 1), total=opt.num_iter, position=0, leave=True, ): if "MeanT" in opt.semi: image_tensors, image_tensors_ema, labels = train_loader.get_batch_ema( ) else: image_tensors, labels = train_loader.get_batch() image = image_tensors.to(device) labels_index, labels_length = converter.encode( labels, batch_max_length=opt.batch_max_length) batch_size = image.size(0) # default recognition loss part if "CTC" in opt.Prediction: preds = model(image) preds_size = torch.IntTensor([preds.size(1)] * batch_size) preds_log_softmax = preds.log_softmax(2).permute(1, 0, 2) loss = criterion(preds_log_softmax, labels_index, preds_size, labels_length) else: preds = model(image, labels_index[:, :-1]) # align with Attention.forward target = labels_index[:, 1:] # without [SOS] Symbol loss = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) # semi supervised part (SemiSL) if "Pseudo" in opt.semi: image_unlabel, _ = train_loader_unlabel_semi.get_batch_two_images() image_unlabel = image_unlabel.to(device) loss_SemiSL = criterion_SemiSL(image_unlabel, model) loss = loss + loss_SemiSL semi_loss_avg.add(loss_SemiSL) elif "MeanT" in opt.semi: ( image_tensors_unlabel, image_tensors_unlabel_ema, ) = train_loader_unlabel_semi.get_batch_two_images() image_unlabel = image_tensors_unlabel.to(device) student_input = torch.cat([image, image_unlabel], dim=0) image_ema = image_tensors_ema.to(device) image_unlabel_ema = image_tensors_unlabel_ema.to(device) teacher_input = torch.cat([image_ema, image_unlabel_ema], dim=0) loss_SemiSL = criterion_SemiSL( student_input=student_input, student_logit=preds, student=model, teacher_input=teacher_input, iteration=iteration, ) loss = loss + loss_SemiSL semi_loss_avg.add(loss_SemiSL) model.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_( model.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) optimizer.step() train_loss_avg.add(loss) if "super" in opt.schedule: scheduler.step() else: adjust_learning_rate(optimizer, iteration, opt) # validation part. # To see training progress, we also conduct validation when 'iteration == 1' if iteration % opt.val_interval == 0 or iteration == 1: # for validation log with open(f"./saved_models/{opt.exp_name}/log_train.txt", "a") as log: model.eval() with torch.no_grad(): ( valid_loss, current_score, preds, confidence_score, labels, infer_time, length_of_data, ) = validation(model, criterion, valid_loader, converter, opt) model.train() # keep best score (accuracy or norm ED) model on valid dataset # Do not use this on test datasets. It would be an unfair comparison # (training should be done without referring test set). if current_score > best_score: best_score = current_score torch.save( model.state_dict(), f"./saved_models/{opt.exp_name}/best_score.pth", ) # validation log: loss, lr, score (accuracy or norm ED), time. lr = optimizer.param_groups[0]["lr"] elapsed_time = time.time() - start_time valid_log = f"\n[{iteration}/{opt.num_iter}] Train_loss: {train_loss_avg.val():0.5f}, Valid_loss: {valid_loss:0.5f}" valid_log += f", Semi_loss: {semi_loss_avg.val():0.5f}\n" valid_log += f'{"Current_score":17s}: {current_score:0.2f}, Current_lr: {lr:0.7f}\n' valid_log += f'{"Best_score":17s}: {best_score:0.2f}, Infer_time: {infer_time:0.1f}, Elapsed_time: {elapsed_time:0.1f}' # show some predicted results dashed_line = "-" * 80 head = f'{"Ground Truth":25s} | {"Prediction":25s} | Confidence Score & T/F' predicted_result_log = f"{dashed_line}\n{head}\n{dashed_line}\n" for gt, pred, confidence in zip(labels[:5], preds[:5], confidence_score[:5]): if "Attn" in opt.Prediction: gt = gt[:gt.find("[EOS]")] pred = pred[:pred.find("[EOS]")] predicted_result_log += f"{gt:25s} | {pred:25s} | {confidence:0.4f}\t{str(pred == gt)}\n" predicted_result_log += f"{dashed_line}" valid_log = f"{valid_log}\n{predicted_result_log}" print(valid_log) log.write(valid_log + "\n") opt.writer.add_scalar("train/train_loss", float(f"{train_loss_avg.val():0.5f}"), iteration) opt.writer.add_scalar("train/semi_loss", float(f"{semi_loss_avg.val():0.5f}"), iteration) opt.writer.add_scalar("train/lr", float(f"{lr:0.7f}"), iteration) opt.writer.add_scalar("train/elapsed_time", float(f"{elapsed_time:0.1f}"), iteration) opt.writer.add_scalar("valid/valid_loss", float(f"{valid_loss:0.5f}"), iteration) opt.writer.add_scalar("valid/current_score", float(f"{current_score:0.2f}"), iteration) opt.writer.add_scalar("valid/best_score", float(f"{best_score:0.2f}"), iteration) train_loss_avg.reset() semi_loss_avg.reset() """ Evaluation at the end of training """ print("Start evaluation on benchmark testset") """ keep evaluation model and result logs """ os.makedirs(f"./result/{opt.exp_name}", exist_ok=True) os.makedirs(f"./evaluation_log", exist_ok=True) saved_best_model = f"./saved_models/{opt.exp_name}/best_score.pth" # os.system(f'cp {saved_best_model} ./result/{opt.exp_name}/') model.load_state_dict(torch.load(f"{saved_best_model}")) opt.eval_type = "benchmark" model.eval() with torch.no_grad(): total_accuracy, eval_data_list, accuracy_list = benchmark_all_eval( model, criterion, converter, opt) opt.writer.add_scalar("test/total_accuracy", float(f"{total_accuracy:0.2f}"), iteration) for eval_data, accuracy in zip(eval_data_list, accuracy_list): accuracy = float(accuracy) opt.writer.add_scalar(f"test/{eval_data}", float(f"{accuracy:0.2f}"), iteration) print( f'finished the experiment: {opt.exp_name}, "CUDA_VISIBLE_DEVICES" was {opt.CUDA_VISIBLE_DEVICES}' )
def train(opt): lib.print_model_settings(locals().copy()) """ dataset preparation """ if not opt.data_filtering_off: print('Filtering the images containing characters which are not in opt.character') print('Filtering the images whose label is longer than opt.batch_max_length') # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 log = open(os.path.join(opt.exp_dir,opt.exp_name,'log_dataset.txt'), 'a') AlignCollate_valid = AlignPairCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) train_dataset, train_dataset_log = hierarchical_dataset(root=opt.train_data, opt=opt) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=opt.batch_size, sampler=data_sampler(train_dataset, shuffle=True, distributed=opt.distributed), num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True, drop_last=True) log.write(train_dataset_log) print('-' * 80) valid_dataset, valid_dataset_log = hierarchical_dataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, sampler=data_sampler(train_dataset, shuffle=False, distributed=opt.distributed), num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True, drop_last=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + '\n') log.close() if 'Attn' in opt.Prediction: converter = AttnLabelConverter(opt.character) else: converter = CTCLabelConverter(opt.character) opt.num_class = len(converter.character) # styleModel = StyleTensorEncoder(input_dim=opt.input_channel) # genModel = AdaIN_Tensor_WordGenerator(opt) # disModel = MsImageDisV2(opt) # styleModel = StyleLatentEncoder(input_dim=opt.input_channel, norm='none') # mixModel = Mixer(opt,nblk=3, dim=opt.latent) genModel = styleGANGen(opt.size, opt.latent, opt.n_mlp, opt.num_class, channel_multiplier=opt.channel_multiplier).to(device) disModel = styleGANDis(opt.size, channel_multiplier=opt.channel_multiplier, input_dim=opt.input_channel).to(device) g_ema = styleGANGen(opt.size, opt.latent, opt.n_mlp, opt.num_class, channel_multiplier=opt.channel_multiplier).to(device) ocrModel = ModelV1(opt).to(device) accumulate(g_ema, genModel, 0) # # weight initialization # for currModel in [styleModel, mixModel]: # for name, param in currModel.named_parameters(): # if 'localization_fc2' in name: # print(f'Skip {name} as it is already initialized') # continue # try: # if 'bias' in name: # init.constant_(param, 0.0) # elif 'weight' in name: # init.kaiming_normal_(param) # except Exception as e: # for batchnorm. # if 'weight' in name: # param.data.fill_(1) # continue if opt.contentLoss == 'vis' or opt.contentLoss == 'seq': ocrCriterion = torch.nn.L1Loss() else: if 'CTC' in opt.Prediction: ocrCriterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: ocrCriterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # ignore [GO] token = ignore index 0 # vggRecCriterion = torch.nn.L1Loss() # vggModel = VGGPerceptualLossModel(models.vgg19(pretrained=True), vggRecCriterion) print('model input parameters', opt.imgH, opt.imgW, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length) if opt.distributed: genModel = torch.nn.parallel.DistributedDataParallel( genModel, device_ids=[opt.local_rank], output_device=opt.local_rank, broadcast_buffers=False, ) disModel = torch.nn.parallel.DistributedDataParallel( disModel, device_ids=[opt.local_rank], output_device=opt.local_rank, broadcast_buffers=False, ) ocrModel = torch.nn.parallel.DistributedDataParallel( ocrModel, device_ids=[opt.local_rank], output_device=opt.local_rank, broadcast_buffers=False ) # styleModel = torch.nn.DataParallel(styleModel).to(device) # styleModel.train() # mixModel = torch.nn.DataParallel(mixModel).to(device) # mixModel.train() # genModel = torch.nn.DataParallel(genModel).to(device) # g_ema = torch.nn.DataParallel(g_ema).to(device) genModel.train() g_ema.eval() # disModel = torch.nn.DataParallel(disModel).to(device) disModel.train() # vggModel = torch.nn.DataParallel(vggModel).to(device) # vggModel.eval() # ocrModel = torch.nn.DataParallel(ocrModel).to(device) # if opt.distributed: # ocrModel.module.Transformation.eval() # ocrModel.module.FeatureExtraction.eval() # ocrModel.module.AdaptiveAvgPool.eval() # # ocrModel.module.SequenceModeling.eval() # ocrModel.module.Prediction.eval() # else: # ocrModel.Transformation.eval() # ocrModel.FeatureExtraction.eval() # ocrModel.AdaptiveAvgPool.eval() # # ocrModel.SequenceModeling.eval() # ocrModel.Prediction.eval() ocrModel.eval() if opt.distributed: g_module = genModel.module d_module = disModel.module else: g_module = genModel d_module = disModel g_reg_ratio = opt.g_reg_every / (opt.g_reg_every + 1) d_reg_ratio = opt.d_reg_every / (opt.d_reg_every + 1) optimizer = optim.Adam( genModel.parameters(), lr=opt.lr * g_reg_ratio, betas=(0 ** g_reg_ratio, 0.99 ** g_reg_ratio), ) dis_optimizer = optim.Adam( disModel.parameters(), lr=opt.lr * d_reg_ratio, betas=(0 ** d_reg_ratio, 0.99 ** d_reg_ratio), ) ## Loading pre-trained files if opt.modelFolderFlag: if len(glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_synth.pth")))>0: opt.saved_synth_model = glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_synth.pth"))[-1] if opt.saved_ocr_model !='' and opt.saved_ocr_model !='None': if not opt.distributed: ocrModel = torch.nn.DataParallel(ocrModel) print(f'loading pretrained ocr model from {opt.saved_ocr_model}') checkpoint = torch.load(opt.saved_ocr_model) ocrModel.load_state_dict(checkpoint) #temporary fix if not opt.distributed: ocrModel = ocrModel.module if opt.saved_gen_model !='' and opt.saved_gen_model !='None': print(f'loading pretrained gen model from {opt.saved_gen_model}') checkpoint = torch.load(opt.saved_gen_model, map_location=lambda storage, loc: storage) genModel.module.load_state_dict(checkpoint['g']) g_ema.module.load_state_dict(checkpoint['g_ema']) if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': print(f'loading pretrained synth model from {opt.saved_synth_model}') checkpoint = torch.load(opt.saved_synth_model) # styleModel.load_state_dict(checkpoint['styleModel']) # mixModel.load_state_dict(checkpoint['mixModel']) genModel.load_state_dict(checkpoint['genModel']) g_ema.load_state_dict(checkpoint['g_ema']) disModel.load_state_dict(checkpoint['disModel']) optimizer.load_state_dict(checkpoint["optimizer"]) dis_optimizer.load_state_dict(checkpoint["dis_optimizer"]) # if opt.imgReconLoss == 'l1': # recCriterion = torch.nn.L1Loss() # elif opt.imgReconLoss == 'ssim': # recCriterion = ssim # elif opt.imgReconLoss == 'ms-ssim': # recCriterion = msssim # loss averager loss_avg = Averager() loss_avg_dis = Averager() loss_avg_gen = Averager() loss_avg_imgRecon = Averager() loss_avg_vgg_per = Averager() loss_avg_vgg_sty = Averager() loss_avg_ocr = Averager() log_r1_val = Averager() log_avg_path_loss_val = Averager() log_avg_mean_path_length_avg = Averager() log_ada_aug_p = Averager() """ final options """ with open(os.path.join(opt.exp_dir,opt.exp_name,'opt.txt'), 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': try: start_iter = int(opt.saved_synth_model.split('_')[-2].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass #get schedulers scheduler = get_scheduler(optimizer,opt) dis_scheduler = get_scheduler(dis_optimizer,opt) start_time = time.time() iteration = start_iter cntr=0 mean_path_length = 0 d_loss_val = 0 r1_loss = torch.tensor(0.0, device=device) g_loss_val = 0 path_loss = torch.tensor(0.0, device=device) path_lengths = torch.tensor(0.0, device=device) mean_path_length_avg = 0 loss_dict = {} accum = 0.5 ** (32 / (10 * 1000)) ada_augment = torch.tensor([0.0, 0.0], device=device) ada_aug_p = opt.augment_p if opt.augment_p > 0 else 0.0 ada_aug_step = opt.ada_target / opt.ada_length r_t_stat = 0 sample_z = torch.randn(opt.n_sample, opt.latent, device=device) while(True): # print(cntr) # train part if opt.lr_policy !="None": scheduler.step() dis_scheduler.step() image_input_tensors, image_gt_tensors, labels_1, labels_2 = iter(train_loader).next() image_input_tensors = image_input_tensors.to(device) image_gt_tensors = image_gt_tensors.to(device) batch_size = image_input_tensors.size(0) requires_grad(genModel, False) # requires_grad(styleModel, False) # requires_grad(mixModel, False) requires_grad(disModel, True) text_1, length_1 = converter.encode(labels_1, batch_max_length=opt.batch_max_length) text_2, length_2 = converter.encode(labels_2, batch_max_length=opt.batch_max_length) #forward pass from style and word generator # style = styleModel(image_input_tensors).squeeze(2).squeeze(2) style = mixing_noise(opt.batch_size, opt.latent, opt.mixing, device) # scInput = mixModel(style,text_2) if 'CTC' in opt.Prediction: images_recon_2,_ = genModel(style, text_2, input_is_latent=opt.input_latent) else: images_recon_2,_ = genModel(style, text_2[:,1:-1], input_is_latent=opt.input_latent) #Domain discriminator: Dis update if opt.augment: image_gt_tensors_aug, _ = augment(image_gt_tensors, ada_aug_p) images_recon_2, _ = augment(images_recon_2, ada_aug_p) else: image_gt_tensors_aug = image_gt_tensors fake_pred = disModel(images_recon_2) real_pred = disModel(image_gt_tensors_aug) disCost = d_logistic_loss(real_pred, fake_pred) loss_dict["d"] = disCost*opt.disWeight loss_dict["real_score"] = real_pred.mean() loss_dict["fake_score"] = fake_pred.mean() loss_avg_dis.add(disCost) disModel.zero_grad() disCost.backward() dis_optimizer.step() if opt.augment and opt.augment_p == 0: ada_augment += torch.tensor( (torch.sign(real_pred).sum().item(), real_pred.shape[0]), device=device ) ada_augment = reduce_sum(ada_augment) if ada_augment[1] > 255: pred_signs, n_pred = ada_augment.tolist() r_t_stat = pred_signs / n_pred if r_t_stat > opt.ada_target: sign = 1 else: sign = -1 ada_aug_p += sign * ada_aug_step * n_pred ada_aug_p = min(1, max(0, ada_aug_p)) ada_augment.mul_(0) d_regularize = cntr % opt.d_reg_every == 0 if d_regularize: image_gt_tensors.requires_grad = True image_input_tensors.requires_grad = True cat_tensor = image_gt_tensors real_pred = disModel(cat_tensor) r1_loss = d_r1_loss(real_pred, cat_tensor) disModel.zero_grad() (opt.r1 / 2 * r1_loss * opt.d_reg_every + 0 * real_pred[0]).backward() dis_optimizer.step() loss_dict["r1"] = r1_loss # #[Style Encoder] + [Word Generator] update image_input_tensors, image_gt_tensors, labels_1, labels_2 = iter(train_loader).next() image_input_tensors = image_input_tensors.to(device) image_gt_tensors = image_gt_tensors.to(device) batch_size = image_input_tensors.size(0) requires_grad(genModel, True) # requires_grad(styleModel, True) # requires_grad(mixModel, True) requires_grad(disModel, False) text_1, length_1 = converter.encode(labels_1, batch_max_length=opt.batch_max_length) text_2, length_2 = converter.encode(labels_2, batch_max_length=opt.batch_max_length) # style = styleModel(image_input_tensors).squeeze(2).squeeze(2) # scInput = mixModel(style,text_2) # images_recon_2,_ = genModel([scInput], input_is_latent=opt.input_latent) style = mixing_noise(batch_size, opt.latent, opt.mixing, device) if 'CTC' in opt.Prediction: images_recon_2, _ = genModel(style, text_2) else: images_recon_2, _ = genModel(style, text_2[:,1:-1]) if opt.augment: images_recon_2, _ = augment(images_recon_2, ada_aug_p) fake_pred = disModel(images_recon_2) disGenCost = g_nonsaturating_loss(fake_pred) loss_dict["g"] = disGenCost # # #Adversarial loss # # disGenCost = disModel.module.calc_gen_loss(torch.cat((images_recon_2,image_input_tensors),dim=1)) # #Input reconstruction loss # recCost = recCriterion(images_recon_2,image_gt_tensors) # #vgg loss # vggPerCost, vggStyleCost = vggModel(image_gt_tensors, images_recon_2) #ocr loss text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) if opt.contentLoss == 'vis' or opt.contentLoss == 'seq': preds_recon = ocrModel(images_recon_2, text_for_pred, is_train=False, returnFeat=opt.contentLoss) preds_gt = ocrModel(image_gt_tensors, text_for_pred, is_train=False, returnFeat=opt.contentLoss) ocrCost = ocrCriterion(preds_recon, preds_gt) else: if 'CTC' in opt.Prediction: preds_recon = ocrModel(images_recon_2, text_for_pred, is_train=False) # preds_o = preds_recon[:, :text_1.shape[1], :] preds_size = torch.IntTensor([preds_recon.size(1)] * batch_size) preds_recon_softmax = preds_recon.log_softmax(2).permute(1, 0, 2) ocrCost = ocrCriterion(preds_recon_softmax, text_2, preds_size, length_2) #predict ocr recognition on generated images # preds_recon_size = torch.IntTensor([preds_recon.size(1)] * batch_size) _, preds_recon_index = preds_recon.max(2) labels_o_ocr = converter.decode(preds_recon_index.data, preds_size.data) #predict ocr recognition on gt style images preds_s = ocrModel(image_input_tensors, text_for_pred, is_train=False) # preds_s = preds_s[:, :text_1.shape[1] - 1, :] preds_s_size = torch.IntTensor([preds_s.size(1)] * batch_size) _, preds_s_index = preds_s.max(2) labels_s_ocr = converter.decode(preds_s_index.data, preds_s_size.data) #predict ocr recognition on gt stylecontent images preds_sc = ocrModel(image_gt_tensors, text_for_pred, is_train=False) # preds_sc = preds_sc[:, :text_2.shape[1] - 1, :] preds_sc_size = torch.IntTensor([preds_sc.size(1)] * batch_size) _, preds_sc_index = preds_sc.max(2) labels_sc_ocr = converter.decode(preds_sc_index.data, preds_sc_size.data) else: preds_recon = ocrModel(images_recon_2, text_for_pred[:, :-1], is_train=False) # align with Attention.forward target_2 = text_2[:, 1:] # without [GO] Symbol ocrCost = ocrCriterion(preds_recon.view(-1, preds_recon.shape[-1]), target_2.contiguous().view(-1)) #predict ocr recognition on generated images _, preds_o_index = preds_recon.max(2) labels_o_ocr = converter.decode(preds_o_index, length_for_pred) for idx, pred in enumerate(labels_o_ocr): pred_EOS = pred.find('[s]') labels_o_ocr[idx] = pred[:pred_EOS] # prune after "end of sentence" token ([s]) #predict ocr recognition on gt style images preds_s = ocrModel(image_input_tensors, text_for_pred, is_train=False) _, preds_s_index = preds_s.max(2) labels_s_ocr = converter.decode(preds_s_index, length_for_pred) for idx, pred in enumerate(labels_s_ocr): pred_EOS = pred.find('[s]') labels_s_ocr[idx] = pred[:pred_EOS] # prune after "end of sentence" token ([s]) #predict ocr recognition on gt stylecontent images preds_sc = ocrModel(image_gt_tensors, text_for_pred, is_train=False) _, preds_sc_index = preds_sc.max(2) labels_sc_ocr = converter.decode(preds_sc_index, length_for_pred) for idx, pred in enumerate(labels_sc_ocr): pred_EOS = pred.find('[s]') labels_sc_ocr[idx] = pred[:pred_EOS] # prune after "end of sentence" token ([s]) # cost = opt.reconWeight*recCost + opt.disWeight*disGenCost + opt.vggPerWeight*vggPerCost + opt.vggStyWeight*vggStyleCost + opt.ocrWeight*ocrCost cost = opt.disWeight*disGenCost + opt.ocrWeight*ocrCost # styleModel.zero_grad() genModel.zero_grad() # mixModel.zero_grad() disModel.zero_grad() # vggModel.zero_grad() ocrModel.zero_grad() cost.backward() optimizer.step() loss_avg.add(cost) g_regularize = cntr % opt.g_reg_every == 0 if g_regularize: image_input_tensors, image_gt_tensors, labels_1, labels_2 = iter(train_loader).next() image_input_tensors = image_input_tensors.to(device) image_gt_tensors = image_gt_tensors.to(device) batch_size = image_input_tensors.size(0) text_1, length_1 = converter.encode(labels_1, batch_max_length=opt.batch_max_length) text_2, length_2 = converter.encode(labels_2, batch_max_length=opt.batch_max_length) path_batch_size = max(1, batch_size // opt.path_batch_shrink) # style = styleModel(image_input_tensors).squeeze(2).squeeze(2) # scInput = mixModel(style,text_2) # images_recon_2, latents = genModel([scInput],input_is_latent=opt.input_latent, return_latents=True) style = mixing_noise(path_batch_size, opt.latent, opt.mixing, device) if 'CTC' in opt.Prediction: images_recon_2, latents = genModel(style, text_2[:path_batch_size], return_latents=True) else: images_recon_2, latents = genModel(style, text_2[:path_batch_size,1:-1], return_latents=True) path_loss, mean_path_length, path_lengths = g_path_regularize( images_recon_2, latents, mean_path_length ) genModel.zero_grad() weighted_path_loss = opt.path_regularize * opt.g_reg_every * path_loss if opt.path_batch_shrink: weighted_path_loss += 0 * images_recon_2[0, 0, 0, 0] weighted_path_loss.backward() optimizer.step() mean_path_length_avg = ( reduce_sum(mean_path_length).item() / get_world_size() ) loss_dict["path"] = path_loss loss_dict["path_length"] = path_lengths.mean() accumulate(g_ema, g_module, accum) loss_reduced = reduce_loss_dict(loss_dict) d_loss_val = loss_reduced["d"].mean().item() g_loss_val = loss_reduced["g"].mean().item() r1_val = loss_reduced["r1"].mean().item() path_loss_val = loss_reduced["path"].mean().item() real_score_val = loss_reduced["real_score"].mean().item() fake_score_val = loss_reduced["fake_score"].mean().item() path_length_val = loss_reduced["path_length"].mean().item() #Individual losses loss_avg_gen.add(opt.disWeight*disGenCost) loss_avg_imgRecon.add(torch.tensor(0.0)) loss_avg_vgg_per.add(torch.tensor(0.0)) loss_avg_vgg_sty.add(torch.tensor(0.0)) loss_avg_ocr.add(opt.ocrWeight*ocrCost) log_r1_val.add(loss_reduced["path"]) log_avg_path_loss_val.add(loss_reduced["path"]) log_avg_mean_path_length_avg.add(torch.tensor(mean_path_length_avg)) log_ada_aug_p.add(torch.tensor(ada_aug_p)) if get_rank() == 0: # pbar.set_description( # ( # f"d: {d_loss_val:.4f}; g: {g_loss_val:.4f}; r1: {r1_val:.4f}; " # f"path: {path_loss_val:.4f}; mean path: {mean_path_length_avg:.4f}; " # f"augment: {ada_aug_p:.4f}" # ) # ) if wandb and opt.wandb: wandb.log( { "Generator": g_loss_val, "Discriminator": d_loss_val, "Augment": ada_aug_p, "Rt": r_t_stat, "R1": r1_val, "Path Length Regularization": path_loss_val, "Mean Path Length": mean_path_length, "Real Score": real_score_val, "Fake Score": fake_score_val, "Path Length": path_length_val, } ) # if cntr % 100 == 0: # with torch.no_grad(): # g_ema.eval() # sample, _ = g_ema([scInput[:,:opt.latent],scInput[:,opt.latent:]]) # utils.save_image( # sample, # os.path.join(opt.trainDir, f"sample_{str(cntr).zfill(6)}.png"), # nrow=int(opt.n_sample ** 0.5), # normalize=True, # range=(-1, 1), # ) # validation part if (iteration + 1) % opt.valInterval == 0 or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' #Save training images curr_batch_size = style[0].shape[0] images_recon_2, _ = g_ema(style, text_2[:curr_batch_size], input_is_latent=opt.input_latent) os.makedirs(os.path.join(opt.trainDir,str(iteration)), exist_ok=True) for trImgCntr in range(batch_size): try: if opt.contentLoss == 'vis' or opt.contentLoss == 'seq': save_image(tensor2im(image_input_tensors[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_sInput_'+labels_1[trImgCntr]+'.png')) save_image(tensor2im(image_gt_tensors[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_csGT_'+labels_2[trImgCntr]+'.png')) save_image(tensor2im(images_recon_2[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_csRecon_'+labels_2[trImgCntr]+'.png')) else: save_image(tensor2im(image_input_tensors[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_sInput_'+labels_1[trImgCntr]+'_'+labels_s_ocr[trImgCntr]+'.png')) save_image(tensor2im(image_gt_tensors[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_csGT_'+labels_2[trImgCntr]+'_'+labels_sc_ocr[trImgCntr]+'.png')) save_image(tensor2im(images_recon_2[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_csRecon_'+labels_2[trImgCntr]+'_'+labels_o_ocr[trImgCntr]+'.png')) except: print('Warning while saving training image') elapsed_time = time.time() - start_time # for log with open(os.path.join(opt.exp_dir,opt.exp_name,'log_train.txt'), 'a') as log: # styleModel.eval() genModel.eval() g_ema.eval() # mixModel.eval() disModel.eval() with torch.no_grad(): valid_loss, infer_time, length_of_data = validation_synth_v6( iteration, g_ema, ocrModel, disModel, ocrCriterion, valid_loader, converter, opt) # styleModel.train() genModel.train() # mixModel.train() disModel.train() # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] Train Synth loss: {loss_avg.val():0.5f}, \ Train Dis loss: {loss_avg_dis.val():0.5f}, Train Gen loss: {loss_avg_gen.val():0.5f},\ Train OCR loss: {loss_avg_ocr.val():0.5f}, \ Train R1-val loss: {log_r1_val.val():0.5f}, Train avg-path-loss: {log_avg_path_loss_val.val():0.5f}, \ Train mean-path-length loss: {log_avg_mean_path_length_avg.val():0.5f}, Train ada-aug-p: {log_ada_aug_p.val():0.5f}, \ Valid Synth loss: {valid_loss[0]:0.5f}, \ Valid Dis loss: {valid_loss[1]:0.5f}, Valid Gen loss: {valid_loss[2]:0.5f}, \ Valid OCR loss: {valid_loss[6]:0.5f}, Elapsed_time: {elapsed_time:0.5f}' #plotting lib.plot.plot(os.path.join(opt.plotDir,'Train-Synth-Loss'), loss_avg.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-Dis-Loss'), loss_avg_dis.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-Gen-Loss'), loss_avg_gen.val().item()) # lib.plot.plot(os.path.join(opt.plotDir,'Train-ImgRecon1-Loss'), loss_avg_imgRecon.val().item()) # lib.plot.plot(os.path.join(opt.plotDir,'Train-VGG-Per-Loss'), loss_avg_vgg_per.val().item()) # lib.plot.plot(os.path.join(opt.plotDir,'Train-VGG-Sty-Loss'), loss_avg_vgg_sty.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-OCR-Loss'), loss_avg_ocr.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-r1_val'), log_r1_val.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-path_loss_val'), log_avg_path_loss_val.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-mean_path_length_avg'), log_avg_mean_path_length_avg.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-ada_aug_p'), log_ada_aug_p.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Valid-Synth-Loss'), valid_loss[0].item()) lib.plot.plot(os.path.join(opt.plotDir,'Valid-Dis-Loss'), valid_loss[1].item()) lib.plot.plot(os.path.join(opt.plotDir,'Valid-Gen-Loss'), valid_loss[2].item()) # lib.plot.plot(os.path.join(opt.plotDir,'Valid-ImgRecon1-Loss'), valid_loss[3].item()) # lib.plot.plot(os.path.join(opt.plotDir,'Valid-VGG-Per-Loss'), valid_loss[4].item()) # lib.plot.plot(os.path.join(opt.plotDir,'Valid-VGG-Sty-Loss'), valid_loss[5].item()) lib.plot.plot(os.path.join(opt.plotDir,'Valid-OCR-Loss'), valid_loss[6].item()) print(loss_log) loss_avg.reset() loss_avg_dis.reset() loss_avg_gen.reset() loss_avg_imgRecon.reset() loss_avg_vgg_per.reset() loss_avg_vgg_sty.reset() loss_avg_ocr.reset() log_r1_val.reset() log_avg_path_loss_val.reset() log_avg_mean_path_length_avg.reset() log_ada_aug_p.reset() lib.plot.flush() lib.plot.tick() # save model per 1e+5 iter. if (iteration) % 1e+4 == 0: torch.save({ # 'styleModel':styleModel.state_dict(), # 'mixModel':mixModel.state_dict(), 'genModel':g_module.state_dict(), 'g_ema':g_ema.state_dict(), 'disModel':d_module.state_dict(), 'optimizer':optimizer.state_dict(), 'dis_optimizer':dis_optimizer.state_dict()}, os.path.join(opt.exp_dir,opt.exp_name,'iter_'+str(iteration+1)+'_synth.pth')) if (iteration + 1) == opt.num_iter: print('end the training') sys.exit() iteration += 1 cntr+=1
def demo(args): """Open csv file wherein you are going to write the Predicted Words""" data = pd.read_csv('../data/craft_output/data.csv') """ model configuration """ if 'CTC' in args.Prediction: converter = CTCLabelConverter(args.character) else: converter = AttnLabelConverter(args.character) args.num_class = len(converter.character) if args.rgb: args.input_channel = 3 model = Model(args) print('model input parameters', args.imgH, args.imgW, args.num_fiducial, args.input_channel, args.output_channel, args.hidden_size, args.num_class, args.batch_max_length, args.Transformation, args.FeatureExtraction, args.SequenceModeling, args.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % args.saved_model) model.load_state_dict(torch.load(args.saved_model, map_location=device)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=args.imgH, imgW=args.imgW, keep_ratio_with_pad=args.PAD) demo_data = RawDataset(root=args.image_folder, args=args) # use RawDataset demo_loader = torch.utils.data.DataLoader( demo_data, batch_size=args.batch_size, shuffle=False, num_workers=int(args.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval() with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([args.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, args.batch_max_length + 1).fill_(0).to(device) if 'CTC' in args.Prediction: preds = model(image, text_for_pred) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.max(2) # preds_index = preds_index.view(-1) preds_str = converter.decode(preds_index.data, preds_size.data) else: preds = model(image, text_for_pred, is_train=False) # select max probabilty (greedy decoding) then decode index to character _, preds_index = preds.max(2) preds_str = converter.decode(preds_index, length_for_pred) dashed_line = '-' * 80 head = f'{"image_path":25s}\t {"predicted_labels":25s}\t confidence score' print(f'{dashed_line}\n{head}\n{dashed_line}') # log.write(f'{dashed_line}\n{head}\n{dashed_line}\n') preds_prob = F.softmax(preds, dim=2) preds_max_prob, _ = preds_prob.max(dim=2) for img_name, pred, pred_max_prob in zip(image_path_list, preds_str, preds_max_prob): start = '../data/crop_img/' path = os.path.relpath(img_name, start) folder = os.path.dirname(path) image_name=os.path.basename(path) file_name='_'.join(image_name.split('_')[:-8]) txt_file=os.path.join(start, folder, file_name) log = open(f'{txt_file}_log_demo_result.txt', 'a') if 'Attn' in args.Prediction: pred_EOS = pred.find('[s]') pred = pred[:pred_EOS] # prune after "end of sentence" token ([s]) pred_max_prob = pred_max_prob[:pred_EOS] # calculate confidence score (= multiply of pred_max_prob) confidence_score = pred_max_prob.cumprod(dim=0)[-1] print(f'{image_name:25s}\t {pred:25s}\t {confidence_score:0.4f}') log.write(f'{image_name:25s}\t {pred:25s}\t {confidence_score:0.4f}\n') log.close()
def train(opt): lib.print_model_settings(locals().copy()) if 'Attn' in opt.Prediction: converter = AttnLabelConverter(opt.character) text_len = opt.batch_max_length+2 else: converter = CTCLabelConverter(opt.character) text_len = opt.batch_max_length opt.classes = converter.character """ dataset preparation """ if not opt.data_filtering_off: print('Filtering the images containing characters which are not in opt.character') print('Filtering the images whose label is longer than opt.batch_max_length') # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 log = open(os.path.join(opt.exp_dir,opt.exp_name,'log_dataset.txt'), 'a') AlignCollate_valid = AlignPairCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) train_dataset = LmdbStyleDataset(root=opt.train_data, opt=opt) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=opt.batch_size*2, #*2 to sample different images from training encoder and discriminator real images shuffle=True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True, drop_last=True) print('-' * 80) valid_dataset = LmdbStyleDataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size*2, #*2 to sample different images from training encoder and discriminator real images shuffle=False, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True, drop_last=True) print('-' * 80) log.write('-' * 80 + '\n') log.close() text_dataset = text_gen(opt) text_loader = torch.utils.data.DataLoader( text_dataset, batch_size=opt.batch_size, shuffle=True, num_workers=int(opt.workers), pin_memory=True, drop_last=True) opt.num_class = len(converter.character) c_code_size = opt.latent cEncoder = GlobalContentEncoder(opt.num_class, text_len, opt.char_embed_size, c_code_size) ocrModel = ModelV1(opt) genModel = styleGANGen(opt.size, opt.latent, opt.latent, opt.n_mlp, channel_multiplier=opt.channel_multiplier) g_ema = styleGANGen(opt.size, opt.latent, opt.latent, opt.n_mlp, channel_multiplier=opt.channel_multiplier) disEncModel = styleGANDis(opt.size, channel_multiplier=opt.channel_multiplier, input_dim=opt.input_channel, code_s_dim=c_code_size) accumulate(g_ema, genModel, 0) # uCriterion = torch.nn.MSELoss() # sCriterion = torch.nn.MSELoss() # if opt.contentLoss == 'vis' or opt.contentLoss == 'seq': # ocrCriterion = torch.nn.L1Loss() # else: if 'CTC' in opt.Prediction: ocrCriterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: print('Not implemented error') sys.exit() # ocrCriterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # ignore [GO] token = ignore index 0 cEncoder= torch.nn.DataParallel(cEncoder).to(device) cEncoder.train() genModel = torch.nn.DataParallel(genModel).to(device) g_ema = torch.nn.DataParallel(g_ema).to(device) genModel.train() g_ema.eval() disEncModel = torch.nn.DataParallel(disEncModel).to(device) disEncModel.train() ocrModel = torch.nn.DataParallel(ocrModel).to(device) if opt.ocrFixed: if opt.Transformation == 'TPS': ocrModel.module.Transformation.eval() ocrModel.module.FeatureExtraction.eval() ocrModel.module.AdaptiveAvgPool.eval() # ocrModel.module.SequenceModeling.eval() ocrModel.module.Prediction.eval() else: ocrModel.train() g_reg_ratio = opt.g_reg_every / (opt.g_reg_every + 1) d_reg_ratio = opt.d_reg_every / (opt.d_reg_every + 1) optimizer = optim.Adam( list(genModel.parameters())+list(cEncoder.parameters()), lr=opt.lr * g_reg_ratio, betas=(0 ** g_reg_ratio, 0.99 ** g_reg_ratio), ) dis_optimizer = optim.Adam( disEncModel.parameters(), lr=opt.lr * d_reg_ratio, betas=(0 ** d_reg_ratio, 0.99 ** d_reg_ratio), ) ocr_optimizer = optim.Adam( ocrModel.parameters(), lr=opt.lr, betas=(0.9, 0.99), ) ## Loading pre-trained files if opt.modelFolderFlag: if len(glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_synth.pth")))>0: opt.saved_synth_model = glob.glob(os.path.join(opt.exp_dir,opt.exp_name,"iter_*_synth.pth"))[-1] if opt.saved_ocr_model !='' and opt.saved_ocr_model !='None': print(f'loading pretrained ocr model from {opt.saved_ocr_model}') checkpoint = torch.load(opt.saved_ocr_model) ocrModel.load_state_dict(checkpoint) # if opt.saved_gen_model !='' and opt.saved_gen_model !='None': # print(f'loading pretrained gen model from {opt.saved_gen_model}') # checkpoint = torch.load(opt.saved_gen_model, map_location=lambda storage, loc: storage) # genModel.module.load_state_dict(checkpoint['g']) # g_ema.module.load_state_dict(checkpoint['g_ema']) if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': print(f'loading pretrained synth model from {opt.saved_synth_model}') checkpoint = torch.load(opt.saved_synth_model) # styleModel.load_state_dict(checkpoint['styleModel']) # mixModel.load_state_dict(checkpoint['mixModel']) genModel.load_state_dict(checkpoint['genModel']) g_ema.load_state_dict(checkpoint['g_ema']) disEncModel.load_state_dict(checkpoint['disEncModel']) ocrModel.load_state_dict(checkpoint['ocrModel']) optimizer.load_state_dict(checkpoint["optimizer"]) dis_optimizer.load_state_dict(checkpoint["dis_optimizer"]) ocr_optimizer.load_state_dict(checkpoint["ocr_optimizer"]) # if opt.imgReconLoss == 'l1': # recCriterion = torch.nn.L1Loss() # elif opt.imgReconLoss == 'ssim': # recCriterion = ssim # elif opt.imgReconLoss == 'ms-ssim': # recCriterion = msssim # loss averager loss_avg_dis = Averager() loss_avg_gen = Averager() loss_avg_unsup = Averager() loss_avg_sup = Averager() log_r1_val = Averager() log_avg_path_loss_val = Averager() log_avg_mean_path_length_avg = Averager() log_ada_aug_p = Averager() loss_avg_ocr_sup = Averager() loss_avg_ocr_unsup = Averager() """ final options """ with open(os.path.join(opt.exp_dir,opt.exp_name,'opt.txt'), 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': try: start_iter = int(opt.saved_synth_model.split('_')[-2].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass #get schedulers scheduler = get_scheduler(optimizer,opt) dis_scheduler = get_scheduler(dis_optimizer,opt) ocr_scheduler = get_scheduler(ocr_optimizer,opt) start_time = time.time() iteration = start_iter cntr=0 mean_path_length = 0 d_loss_val = 0 r1_loss = torch.tensor(0.0, device=device) g_loss_val = 0 path_loss = torch.tensor(0.0, device=device) path_lengths = torch.tensor(0.0, device=device) mean_path_length_avg = 0 # loss_dict = {} accum = 0.5 ** (32 / (10 * 1000)) ada_augment = torch.tensor([0.0, 0.0], device=device) ada_aug_p = opt.augment_p if opt.augment_p > 0 else 0.0 ada_aug_step = opt.ada_target / opt.ada_length r_t_stat = 0 epsilon = 10e-50 # sample_z = torch.randn(opt.n_sample, opt.latent, device=device) while(True): # print(cntr) # train part if opt.lr_policy !="None": scheduler.step() dis_scheduler.step() ocr_scheduler.step() image_input_tensors, _, labels, _ = iter(train_loader).next() labels_z_c = iter(text_loader).next() image_input_tensors = image_input_tensors.to(device) gt_image_tensors = image_input_tensors[:opt.batch_size].detach() real_image_tensors = image_input_tensors[opt.batch_size:].detach() labels_gt = labels[:opt.batch_size] requires_grad(cEncoder, False) requires_grad(genModel, False) requires_grad(disEncModel, True) requires_grad(ocrModel, False) text_z_c, length_z_c = converter.encode(labels_z_c, batch_max_length=opt.batch_max_length) text_gt, length_gt = converter.encode(labels_gt, batch_max_length=opt.batch_max_length) z_c_code = cEncoder(text_z_c) noise_style = mixing_noise_style(opt.batch_size, opt.latent, opt.mixing, device) style=[] style.append(noise_style[0]*z_c_code) if len(noise_style)>1: style.append(noise_style[1]*z_c_code) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style[0][:,:opt.latent]) if len(style)>1: newstyle.append(style[1][:,:opt.latent]) style = newstyle fake_img,_ = genModel(style, input_is_latent=opt.input_latent) # #unsupervised code prediction on generated image # u_pred_code = disEncModel(fake_img, mode='enc') # uCost = uCriterion(u_pred_code, z_code) # #supervised code prediction on gt image # s_pred_code = disEncModel(gt_image_tensors, mode='enc') # sCost = uCriterion(s_pred_code, gt_phoc_tensors) #Domain discriminator fake_pred = disEncModel(fake_img) real_pred = disEncModel(real_image_tensors) disCost = d_logistic_loss(real_pred, fake_pred) # dis_cost = disCost + opt.gamma_e*uCost + opt.beta*sCost loss_avg_dis.add(disCost) # loss_avg_sup.add(opt.beta*sCost) # loss_avg_unsup.add(opt.gamma_e * uCost) disEncModel.zero_grad() disCost.backward() dis_optimizer.step() d_regularize = cntr % opt.d_reg_every == 0 if d_regularize: real_image_tensors.requires_grad = True real_pred = disEncModel(real_image_tensors) r1_loss = d_r1_loss(real_pred, real_image_tensors) disEncModel.zero_grad() (opt.r1 / 2 * r1_loss * opt.d_reg_every + 0 * real_pred[0]).backward() dis_optimizer.step() log_r1_val.add(r1_loss) # Recognizer update if not opt.ocrFixed and not opt.zAlone: requires_grad(disEncModel, False) requires_grad(ocrModel, True) if 'CTC' in opt.Prediction: preds_recon = ocrModel(gt_image_tensors, text_gt, is_train=True) preds_size = torch.IntTensor([preds_recon.size(1)] * opt.batch_size) preds_recon_softmax = preds_recon.log_softmax(2).permute(1, 0, 2) ocrCost = ocrCriterion(preds_recon_softmax, text_gt, preds_size, length_gt) else: print("Not implemented error") sys.exit() ocrModel.zero_grad() ocrCost.backward() # torch.nn.utils.clip_grad_norm_(ocrModel.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) ocr_optimizer.step() loss_avg_ocr_sup.add(ocrCost) else: loss_avg_ocr_sup.add(torch.tensor(0.0)) # [Word Generator] update # image_input_tensors, _, labels, _ = iter(train_loader).next() labels_z_c = iter(text_loader).next() # image_input_tensors = image_input_tensors.to(device) # gt_image_tensors = image_input_tensors[:opt.batch_size] # real_image_tensors = image_input_tensors[opt.batch_size:] # labels_gt = labels[:opt.batch_size] requires_grad(cEncoder, True) requires_grad(genModel, True) requires_grad(disEncModel, False) requires_grad(ocrModel, False) text_z_c, length_z_c = converter.encode(labels_z_c, batch_max_length=opt.batch_max_length) z_c_code = cEncoder(text_z_c) noise_style = mixing_noise_style(opt.batch_size, opt.latent, opt.mixing, device) style=[] style.append(noise_style[0]*z_c_code) if len(noise_style)>1: style.append(noise_style[1]*z_c_code) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style[0][:,:opt.latent]) if len(style)>1: newstyle.append(style[1][:,:opt.latent]) style = newstyle fake_img,_ = genModel(style, input_is_latent=opt.input_latent) fake_pred = disEncModel(fake_img) disGenCost = g_nonsaturating_loss(fake_pred) if opt.zAlone: ocrCost = torch.tensor(0.0) else: #Compute OCR prediction (Reconstruction of content) # text_for_pred = torch.LongTensor(opt.batch_size, opt.batch_max_length + 1).fill_(0).to(device) # length_for_pred = torch.IntTensor([opt.batch_max_length] * opt.batch_size).to(device) if 'CTC' in opt.Prediction: preds_recon = ocrModel(fake_img, text_z_c, is_train=False) preds_size = torch.IntTensor([preds_recon.size(1)] * opt.batch_size) preds_recon_softmax = preds_recon.log_softmax(2).permute(1, 0, 2) ocrCost = ocrCriterion(preds_recon_softmax, text_z_c, preds_size, length_z_c) else: print("Not implemented error") sys.exit() genModel.zero_grad() cEncoder.zero_grad() gen_enc_cost = disGenCost + opt.ocrWeight * ocrCost grad_fake_OCR = torch.autograd.grad(ocrCost, fake_img, retain_graph=True)[0] loss_grad_fake_OCR = 10**6*torch.mean(grad_fake_OCR**2) grad_fake_adv = torch.autograd.grad(disGenCost, fake_img, retain_graph=True)[0] loss_grad_fake_adv = 10**6*torch.mean(grad_fake_adv**2) if opt.grad_balance: gen_enc_cost.backward(retain_graph=True) grad_fake_OCR = torch.autograd.grad(ocrCost, fake_img, create_graph=True, retain_graph=True)[0] grad_fake_adv = torch.autograd.grad(disGenCost, fake_img, create_graph=True, retain_graph=True)[0] a = opt.ocrWeight * torch.div(torch.std(grad_fake_adv), epsilon+torch.std(grad_fake_OCR)) if a is None: print(ocrCost, disGenCost, torch.std(grad_fake_adv), torch.std(grad_fake_OCR)) if a>1000 or a<0.0001: print(a) ocrCost = a.detach() * ocrCost gen_enc_cost = disGenCost + ocrCost gen_enc_cost.backward(retain_graph=True) grad_fake_OCR = torch.autograd.grad(ocrCost, fake_img, create_graph=False, retain_graph=True)[0] grad_fake_adv = torch.autograd.grad(disGenCost, fake_img, create_graph=False, retain_graph=True)[0] loss_grad_fake_OCR = 10 ** 6 * torch.mean(grad_fake_OCR ** 2) loss_grad_fake_adv = 10 ** 6 * torch.mean(grad_fake_adv ** 2) with torch.no_grad(): gen_enc_cost.backward() else: gen_enc_cost.backward() loss_avg_gen.add(disGenCost) loss_avg_ocr_unsup.add(opt.ocrWeight * ocrCost) optimizer.step() g_regularize = cntr % opt.g_reg_every == 0 if g_regularize: path_batch_size = max(1, opt.batch_size // opt.path_batch_shrink) # image_input_tensors, _, labels, _ = iter(train_loader).next() labels_z_c = iter(text_loader).next() # image_input_tensors = image_input_tensors.to(device) # gt_image_tensors = image_input_tensors[:path_batch_size] # labels_gt = labels[:path_batch_size] text_z_c, length_z_c = converter.encode(labels_z_c[:path_batch_size], batch_max_length=opt.batch_max_length) # text_gt, length_gt = converter.encode(labels_gt, batch_max_length=opt.batch_max_length) z_c_code = cEncoder(text_z_c) noise_style = mixing_noise_style(path_batch_size, opt.latent, opt.mixing, device) style=[] style.append(noise_style[0]*z_c_code) if len(noise_style)>1: style.append(noise_style[1]*z_c_code) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style[0][:,:opt.latent]) if len(style)>1: newstyle.append(style[1][:,:opt.latent]) style = newstyle fake_img, grad = genModel(style, return_latents=True, g_path_regularize=True, mean_path_length=mean_path_length) decay = 0.01 path_lengths = torch.sqrt(grad.pow(2).sum(2).mean(1)) mean_path_length_orig = mean_path_length + decay * (path_lengths.mean() - mean_path_length) path_loss = (path_lengths - mean_path_length_orig).pow(2).mean() mean_path_length = mean_path_length_orig.detach().item() genModel.zero_grad() cEncoder.zero_grad() weighted_path_loss = opt.path_regularize * opt.g_reg_every * path_loss if opt.path_batch_shrink: weighted_path_loss += 0 * fake_img[0, 0, 0, 0] weighted_path_loss.backward() optimizer.step() # mean_path_length_avg = ( # reduce_sum(mean_path_length).item() / get_world_size() # ) #commented above for multi-gpu , non-distributed setting mean_path_length_avg = mean_path_length accumulate(g_ema, genModel, accum) log_avg_path_loss_val.add(path_loss) log_avg_mean_path_length_avg.add(torch.tensor(mean_path_length_avg)) log_ada_aug_p.add(torch.tensor(ada_aug_p)) if get_rank() == 0: if wandb and opt.wandb: wandb.log( { "Generator": g_loss_val, "Discriminator": d_loss_val, "Augment": ada_aug_p, "Rt": r_t_stat, "R1": r1_val, "Path Length Regularization": path_loss_val, "Mean Path Length": mean_path_length, "Real Score": real_score_val, "Fake Score": fake_score_val, "Path Length": path_length_val, } ) # validation part if (iteration + 1) % opt.valInterval == 0 or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' #generate paired content with similar style labels_z_c_1 = iter(text_loader).next() labels_z_c_2 = iter(text_loader).next() text_z_c_1, length_z_c_1 = converter.encode(labels_z_c_1, batch_max_length=opt.batch_max_length) text_z_c_2, length_z_c_2 = converter.encode(labels_z_c_2, batch_max_length=opt.batch_max_length) z_c_code_1 = cEncoder(text_z_c_1) z_c_code_2 = cEncoder(text_z_c_2) style_c1_s1 = [] style_c2_s1 = [] style_s1 = mixing_noise_style(opt.batch_size, opt.latent, opt.mixing, device) style_c1_s1.append(style_s1[0]*z_c_code_1) style_c2_s1.append(style_s1[0]*z_c_code_2) if len(style_s1)>1: style_c1_s1.append(style_s1[1]*z_c_code_1) style_c2_s1.append(style_s1[1]*z_c_code_2) noise_style = mixing_noise_style(opt.batch_size, opt.latent, opt.mixing, device) style_c1_s2 = [] style_c1_s2.append(noise_style[0]*z_c_code_1) if len(noise_style)>1: style_c1_s2.append(noise_style[1]*z_c_code_1) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style_c1_s1[0][:,:opt.latent]) if len(style_c1_s1)>1: newstyle.append(style_c1_s1[1][:,:opt.latent]) style_c1_s1 = newstyle style_c2_s1 = newstyle style_c1_s2 = newstyle fake_img_c1_s1, _ = g_ema(style_c1_s1, input_is_latent=opt.input_latent) fake_img_c2_s1, _ = g_ema(style_c2_s1, input_is_latent=opt.input_latent) fake_img_c1_s2, _ = g_ema(style_c1_s2, input_is_latent=opt.input_latent) if not opt.zAlone: #Run OCR prediction if 'CTC' in opt.Prediction: preds = ocrModel(fake_img_c1_s1, text_z_c_1, is_train=False) preds_size = torch.IntTensor([preds.size(1)] * opt.batch_size) _, preds_index = preds.max(2) preds_str_fake_img_c1_s1 = converter.decode(preds_index.data, preds_size.data) preds = ocrModel(fake_img_c2_s1, text_z_c_2, is_train=False) preds_size = torch.IntTensor([preds.size(1)] * opt.batch_size) _, preds_index = preds.max(2) preds_str_fake_img_c2_s1 = converter.decode(preds_index.data, preds_size.data) preds = ocrModel(fake_img_c1_s2, text_z_c_1, is_train=False) preds_size = torch.IntTensor([preds.size(1)] * opt.batch_size) _, preds_index = preds.max(2) preds_str_fake_img_c1_s2 = converter.decode(preds_index.data, preds_size.data) preds = ocrModel(gt_image_tensors, text_gt, is_train=False) preds_size = torch.IntTensor([preds.size(1)] * gt_image_tensors.shape[0]) _, preds_index = preds.max(2) preds_str_gt = converter.decode(preds_index.data, preds_size.data) else: print("Not implemented error") sys.exit() else: preds_str_fake_img_c1_s1 = [':None:'] * fake_img_c1_s1.shape[0] preds_str_gt = [':None:'] * fake_img_c1_s1.shape[0] os.makedirs(os.path.join(opt.trainDir,str(iteration)), exist_ok=True) for trImgCntr in range(opt.batch_size): try: save_image(tensor2im(fake_img_c1_s1[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_c1_s1_'+labels_z_c_1[trImgCntr]+'_ocr:'+preds_str_fake_img_c1_s1[trImgCntr]+'.png')) if not opt.zAlone: save_image(tensor2im(fake_img_c2_s1[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_c2_s1_'+labels_z_c_2[trImgCntr]+'_ocr:'+preds_str_fake_img_c2_s1[trImgCntr]+'.png')) save_image(tensor2im(fake_img_c1_s2[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_c1_s2_'+labels_z_c_1[trImgCntr]+'_ocr:'+preds_str_fake_img_c1_s2[trImgCntr]+'.png')) if trImgCntr<gt_image_tensors.shape[0]: save_image(tensor2im(gt_image_tensors[trImgCntr].detach()),os.path.join(opt.trainDir,str(iteration),str(trImgCntr)+'_gt_act:'+labels_gt[trImgCntr]+'_ocr:'+preds_str_gt[trImgCntr]+'.png')) except: print('Warning while saving training image') elapsed_time = time.time() - start_time # for log with open(os.path.join(opt.exp_dir,opt.exp_name,'log_train.txt'), 'a') as log: # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] \ Train Dis loss: {loss_avg_dis.val():0.5f}, Train Gen loss: {loss_avg_gen.val():0.5f},\ Train UnSup OCR loss: {loss_avg_ocr_unsup.val():0.5f}, Train Sup OCR loss: {loss_avg_ocr_sup.val():0.5f}, \ Train R1-val loss: {log_r1_val.val():0.5f}, Train avg-path-loss: {log_avg_path_loss_val.val():0.5f}, \ Train mean-path-length loss: {log_avg_mean_path_length_avg.val():0.5f}, Train ada-aug-p: {log_ada_aug_p.val():0.5f}, \ Elapsed_time: {elapsed_time:0.5f}' #plotting lib.plot.plot(os.path.join(opt.plotDir,'Train-Dis-Loss'), loss_avg_dis.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-Gen-Loss'), loss_avg_gen.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-UnSup-OCR-Loss'), loss_avg_ocr_unsup.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-Sup-OCR-Loss'), loss_avg_ocr_sup.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-r1_val'), log_r1_val.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-path_loss_val'), log_avg_path_loss_val.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-mean_path_length_avg'), log_avg_mean_path_length_avg.val().item()) lib.plot.plot(os.path.join(opt.plotDir,'Train-ada_aug_p'), log_ada_aug_p.val().item()) print(loss_log) loss_avg_dis.reset() loss_avg_gen.reset() loss_avg_ocr_unsup.reset() loss_avg_ocr_sup.reset() log_r1_val.reset() log_avg_path_loss_val.reset() log_avg_mean_path_length_avg.reset() log_ada_aug_p.reset() lib.plot.flush() lib.plot.tick() # save model per 1e+5 iter. if (iteration) % 1e+4 == 0: torch.save({ 'cEncoder':cEncoder.state_dict(), 'genModel':genModel.state_dict(), 'g_ema':g_ema.state_dict(), 'ocrModel':ocrModel.state_dict(), 'disEncModel':disEncModel.state_dict(), 'optimizer':optimizer.state_dict(), 'ocr_optimizer':ocr_optimizer.state_dict(), 'dis_optimizer':dis_optimizer.state_dict()}, os.path.join(opt.exp_dir,opt.exp_name,'iter_'+str(iteration+1)+'_synth.pth')) if (iteration + 1) == opt.num_iter: print('end the training') sys.exit() iteration += 1 cntr+=1
def train(opt): """ dataset preparation """ if not opt.data_filtering_off: print( 'Filtering the images containing characters which are not in opt.character' ) print( 'Filtering the images whose label is longer than opt.batch_max_length' ) # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') train_dataset = Batch_Balanced_Dataset(opt) log = open(f'./saved_models/{opt.exp_name}/log_dataset.txt', 'a') AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset, valid_dataset_log = hierarchical_dataset( root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + ' ') log.close() """ model configuration """ # if 'CTC' in opt.Prediction: if opt.baiduCTC: CTC_converter = CTCLabelConverterForBaiduWarpctc(opt.character) else: CTC_converter = CTCLabelConverter(opt.character) # else: Attn_converter = AttnLabelConverter(opt.character) opt.num_class_ctc = len(CTC_converter.character) opt.num_class_attn = len(Attn_converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class_ctc, opt.num_class_attn, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # weight initialization for name, param in model.named_parameters(): # print(name) if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # data parallel for multi-GPU model = torch.nn.DataParallel(model).to(device) model.train() print("Model:") print(model) # print(summary(model, (1, opt.imgH, opt.imgW,1))) """ setup loss """ if opt.baiduCTC: # need to install warpctc. see our guideline. if opt.label_smooth: criterion_major_path = SmoothCTCLoss(num_classes=opt.num_class_ctc, weight=0.05) else: criterion_major_path = CTCLoss() #criterion_major_path = CTCLoss(average_frames=False, reduction="mean", blank=0) else: criterion_major_path = torch.nn.CTCLoss(zero_infinity=True).to(device) # else: # criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # ignore [GO] token = ignore index 0 # loss averager #criterion_major_path = torch.nn.CTCLoss(zero_infinity=True).to(device) criterion_guide_path = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) loss_avg_major_path = Averager() loss_avg_guide_path = Averager() # filter that only require gradient decent guide_parameters = [] major_parameters = [] guide_model_part_names = [ "Transformation", "FeatureExtraction", "SequenceModeling_Attn", "Attention" ] major_model_part_names = ["SequenceModeling_CTC", "CTC"] for name, param in model.named_parameters(): if param.requires_grad: if name.split(".")[1] in guide_model_part_names: guide_parameters.append(param) elif name.split(".")[1] in major_model_part_names: major_parameters.append(param) # print(name) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] if opt.continue_training: guide_parameters = [] # setup optimizer if opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: optimizer_ctc = AdamW(major_parameters, lr=opt.lr) if not opt.continue_training: optimizer_attn = AdamW(guide_parameters, lr=opt.lr) scheduler_ctc = get_linear_schedule_with_warmup( optimizer_ctc, num_warmup_steps=10000, num_training_steps=opt.num_iter) scheduler_attn = get_linear_schedule_with_warmup( optimizer_attn, num_warmup_steps=10000, num_training_steps=opt.num_iter) start_iter = 0 if opt.saved_model != '' and (not opt.continue_training): print(f'loading pretrained model from {opt.saved_model}') checkpoint = torch.load(opt.saved_model) start_iter = checkpoint['start_iter'] + 1 if not opt.adam: optimizer_ctc.load_state_dict( checkpoint['optimizer_ctc_state_dict']) if not opt.continue_training: optimizer_attn.load_state_dict( checkpoint['optimizer_attn_state_dict']) scheduler_ctc.load_state_dict( checkpoint['scheduler_ctc_state_dict']) scheduler_attn.load_state_dict( checkpoint['scheduler_attn_state_dict']) print(scheduler_ctc.get_lr()) print(scheduler_attn.get_lr()) if opt.FT: model.load_state_dict(checkpoint['model_state_dict'], strict=False) else: model.load_state_dict(checkpoint['model_state_dict']) if opt.continue_training: model.load_state_dict(torch.load(opt.saved_model)) # print("Optimizer:") # print(optimizer) # scheduler_ctc = get_linear_schedule_with_warmup( optimizer_ctc, num_warmup_steps=10000, num_training_steps=opt.num_iter, last_epoch=start_iter - 1) scheduler_attn = get_linear_schedule_with_warmup( optimizer_attn, num_warmup_steps=10000, num_training_steps=opt.num_iter, last_epoch=start_iter - 1) """ final options """ # print(opt) with open(f'./saved_models/{opt.exp_name}/opt.txt', 'a') as opt_file: opt_log = '------------ Options ------------- ' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)} ' opt_log += '--------------------------------------- ' print(opt_log) opt_file.write(opt_log) """ start training """ start_time = time.time() best_accuracy = -1 best_norm_ED = -1 iteration = start_iter - 1 if opt.continue_training: start_iter = 0 while (True): # train part image_tensors, labels = train_dataset.get_batch() iteration += 1 if iteration < start_iter: continue image = image_tensors.to(device) # print(image.size()) text_attn, length_attn = Attn_converter.encode( labels, batch_max_length=opt.batch_max_length) #print("1") text_ctc, length_ctc = CTC_converter.encode( labels, batch_max_length=opt.batch_max_length) #print("2") #if iteration == start_iter : # writer.add_graph(model, (image, text_attn)) batch_size = image.size(0) preds_major, preds_guide = model(image, text_attn[:, :-1]) #print("10") preds_size = torch.IntTensor([preds_major.size(1)] * batch_size) if opt.baiduCTC: preds_major = preds_major.permute(1, 0, 2) # to use CTCLoss format if opt.label_smooth: cost_ctc = criterion_major_path(preds_major, text_ctc, preds_size, length_ctc, batch_size) else: cost_ctc = criterion_major_path( preds_major, text_ctc, preds_size, length_ctc) / batch_size else: preds_major = preds_major.log_softmax(2).permute(1, 0, 2) cost_ctc = criterion_major_path(preds_major, text_ctc, preds_size, length_ctc) #print("3") # preds = model(image, text[:, :-1]) # align with Attention.forward target = text_attn[:, 1:] # without [GO] Symbol if not opt.continue_training: cost_attn = criterion_guide_path( preds_guide.view(-1, preds_guide.shape[-1]), target.contiguous().view(-1)) optimizer_attn.zero_grad() cost_attn.backward(retain_graph=True) torch.nn.utils.clip_grad_norm_( guide_parameters, opt.grad_clip) # gradient clipping with 5 (Default) optimizer_attn.step() optimizer_ctc.zero_grad() cost_ctc.backward() torch.nn.utils.clip_grad_norm_( major_parameters, opt.grad_clip) # gradient clipping with 5 (Default) optimizer_ctc.step() scheduler_ctc.step() scheduler_attn.step() #print("4") loss_avg_major_path.add(cost_ctc) if not opt.continue_training: loss_avg_guide_path.add(cost_attn) if (iteration + 1) % 100 == 0: writer.add_scalar("Loss/train_ctc", loss_avg_major_path.val(), (iteration + 1) // 100) loss_avg_major_path.reset() if not opt.continue_training: writer.add_scalar("Loss/train_attn", loss_avg_guide_path.val(), (iteration + 1) // 100) loss_avg_guide_path.reset() # validation part if ( iteration + 1 ) % opt.valInterval == 0: #or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' elapsed_time = time.time() - start_time # for log with open(f'./saved_models/{opt.exp_name}/log_train.txt', 'a') as log: model.eval() with torch.no_grad(): valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation( model, criterion_major_path, valid_loader, CTC_converter, opt) model.train() writer.add_scalar("Loss/valid", valid_loss, (iteration + 1) // opt.valInterval) writer.add_scalar("Metrics/accuracy", current_accuracy, (iteration + 1) // opt.valInterval) writer.add_scalar("Metrics/norm_ED", current_norm_ED, (iteration + 1) // opt.valInterval) # loss_log = f'[{iteration+1}/{opt.num_iter}] Train loss: {train_loss:0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}' # loss_avg.reset() current_model_log = f'{"Current_accuracy":17s}: {current_accuracy:0.3f}, {"Current_norm_ED":17s}: {current_norm_ED:0.2f}' # training loss and validation loss if not opt.continue_training: loss_log = f'[{iteration+1}/{opt.num_iter}] Train loss ctc: {loss_avg_major_path.val():0.5f}, Train loss attn: {loss_avg_guide_path.val():0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}' else: loss_log = f'[{iteration+1}/{opt.num_iter}] Train loss ctc: {loss_avg_major_path.val():0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}' loss_avg_major_path.reset() if not opt.continue_training: loss_avg_guide_path.reset() current_model_log = f'{"Current_accuracy":17s}: {current_accuracy:0.3f}, {"Current_norm_ED":17s}: {current_norm_ED:0.2f}' # keep best accuracy model (on valid dataset) if current_accuracy > best_accuracy: best_accuracy = current_accuracy torch.save(model.state_dict(), f'{fol_ckpt}/best_accuracy.pth') if current_norm_ED > best_norm_ED: best_norm_ED = current_norm_ED torch.save(model.state_dict(), f'{fol_ckpt}/best_norm_ED.pth') best_model_log = f'{"Best_accuracy":17s}: {best_accuracy:0.3f}, {"Best_norm_ED":17s}: {best_norm_ED:0.2f}' loss_model_log = f'{loss_log} {current_model_log} {best_model_log}' print(loss_model_log) log.write(loss_model_log + ' ') # show some predicted results dashed_line = '-' * 80 head = f'{"Ground Truth":25s} | {"Prediction":25s} | Confidence Score & T/F' predicted_result_log = f'{dashed_line} {head} {dashed_line} ' for gt, pred, confidence in zip(labels[:5], preds[:5], confidence_score[:5]): # if 'Attn' in opt.Prediction: # gt = gt[:gt.find('[s]')] # pred = pred[:pred.find('[s]')] predicted_result_log += f'{gt:25s} | {pred:25s} | {confidence:0.4f} {str(pred == gt)} ' predicted_result_log += f'{dashed_line}' print(predicted_result_log) log.write(predicted_result_log + ' ') # save model per 1e+5 iter. if (iteration + 1) % 1e+3 == 0 and (not opt.continue_training): # print(scheduler_ctc.get_lr()) # print(scheduler_attn.get_lr()) torch.save( { 'model_state_dict': model.state_dict(), 'optimizer_attn_state_dict': optimizer_attn.state_dict(), 'optimizer_ctc_state_dict': optimizer_ctc.state_dict(), 'start_iter': iteration, 'scheduler_ctc_state_dict': scheduler_ctc.state_dict(), 'scheduler_attn_state_dict': scheduler_attn.state_dict(), }, f'{fol_ckpt}/current_model.pth') if (iteration + 1) == opt.num_iter: print('end the training') sys.exit()
def demo(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) demo_data = RawDataset(root=opt.image_folder, opt=opt) # use RawDataset demo_loader = torch.utils.data.DataLoader(demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval() with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred).log_softmax(2) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.permute(1, 0, 2).max(2) preds_index = preds_index.transpose(1, 0).contiguous().view(-1) preds_str = converter.decode(preds_index.data, preds_size.data) else: preds = model(image, text_for_pred, is_train=False) # select max probabilty (greedy decoding) then decode index to character _, preds_index = preds.max(2) preds_str = converter.decode(preds_index, length_for_pred) print('-' * 80) print('image_path\tpredicted_labels') print('-' * 80) for img_name, pred in zip(image_path_list, preds_str): if 'Attn' in opt.Prediction: pred = pred[:pred.find( '[s]')] # prune after "end of sentence" token ([s]) print(f'{img_name}\t{pred}')
def train(opt): lib.print_model_settings(locals().copy()) if 'Attn' in opt.Prediction: converter = AttnLabelConverter(opt.character) else: converter = CTCLabelConverter(opt.character) opt.classes = converter.character """ dataset preparation """ if not opt.data_filtering_off: print( 'Filtering the images containing characters which are not in opt.character' ) print( 'Filtering the images whose label is longer than opt.batch_max_length' ) # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 log = open(os.path.join(opt.exp_dir, opt.exp_name, 'log_dataset.txt'), 'a') AlignCollate_valid = AlignPHOCCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) train_dataset = LmdbStylePHOCDataset(root=opt.train_data, opt=opt) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=opt.batch_size * 2, #*2 to sample different images from training encoder and discriminator real images shuffle= True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True, drop_last=True) print('-' * 80) valid_dataset = LmdbStylePHOCDataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size * 2, #*2 to sample different images from training encoder and discriminator real images shuffle= False, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True, drop_last=True) print('-' * 80) log.write('-' * 80 + '\n') log.close() phoc_dataset = phoc_gen(opt) phoc_loader = torch.utils.data.DataLoader(phoc_dataset, batch_size=opt.batch_size, shuffle=True, num_workers=int(opt.workers), pin_memory=True, drop_last=True) opt.num_class = len(converter.character) if opt.zAlone: genModel = styleGANGen(opt.size, opt.latent, opt.latent, opt.n_mlp, channel_multiplier=opt.channel_multiplier) g_ema = styleGANGen(opt.size, opt.latent, opt.latent, opt.n_mlp, channel_multiplier=opt.channel_multiplier) else: genModel = styleGANGen(opt.size, opt.latent + phoc_dataset.phoc_size, opt.latent, opt.n_mlp, channel_multiplier=opt.channel_multiplier) g_ema = styleGANGen(opt.size, opt.latent + phoc_dataset.phoc_size, opt.latent, opt.n_mlp, channel_multiplier=opt.channel_multiplier) disEncModel = styleGANDis(opt.size, channel_multiplier=opt.channel_multiplier, input_dim=opt.input_channel, code_s_dim=phoc_dataset.phoc_size) accumulate(g_ema, genModel, 0) uCriterion = torch.nn.MSELoss() sCriterion = torch.nn.MSELoss() genModel = torch.nn.DataParallel(genModel).to(device) g_ema = torch.nn.DataParallel(g_ema).to(device) genModel.train() g_ema.eval() disEncModel = torch.nn.DataParallel(disEncModel).to(device) disEncModel.train() g_reg_ratio = opt.g_reg_every / (opt.g_reg_every + 1) d_reg_ratio = opt.d_reg_every / (opt.d_reg_every + 1) optimizer = optim.Adam( genModel.parameters(), lr=opt.lr * g_reg_ratio, betas=(0**g_reg_ratio, 0.99**g_reg_ratio), ) dis_optimizer = optim.Adam( disEncModel.parameters(), lr=opt.lr * d_reg_ratio, betas=(0**d_reg_ratio, 0.99**d_reg_ratio), ) ## Loading pre-trained files if opt.modelFolderFlag: if len( glob.glob( os.path.join(opt.exp_dir, opt.exp_name, "iter_*_synth.pth"))) > 0: opt.saved_synth_model = glob.glob( os.path.join(opt.exp_dir, opt.exp_name, "iter_*_synth.pth"))[-1] # if opt.saved_ocr_model !='' and opt.saved_ocr_model !='None': # print(f'loading pretrained ocr model from {opt.saved_ocr_model}') # checkpoint = torch.load(opt.saved_ocr_model) # ocrModel.load_state_dict(checkpoint) # if opt.saved_gen_model !='' and opt.saved_gen_model !='None': # print(f'loading pretrained gen model from {opt.saved_gen_model}') # checkpoint = torch.load(opt.saved_gen_model, map_location=lambda storage, loc: storage) # genModel.module.load_state_dict(checkpoint['g']) # g_ema.module.load_state_dict(checkpoint['g_ema']) if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': print(f'loading pretrained synth model from {opt.saved_synth_model}') checkpoint = torch.load(opt.saved_synth_model) # styleModel.load_state_dict(checkpoint['styleModel']) # mixModel.load_state_dict(checkpoint['mixModel']) genModel.load_state_dict(checkpoint['genModel']) g_ema.load_state_dict(checkpoint['g_ema']) disEncModel.load_state_dict(checkpoint['disEncModel']) optimizer.load_state_dict(checkpoint["optimizer"]) dis_optimizer.load_state_dict(checkpoint["dis_optimizer"]) # if opt.imgReconLoss == 'l1': # recCriterion = torch.nn.L1Loss() # elif opt.imgReconLoss == 'ssim': # recCriterion = ssim # elif opt.imgReconLoss == 'ms-ssim': # recCriterion = msssim # loss averager loss_avg_dis = Averager() loss_avg_gen = Averager() loss_avg_unsup = Averager() loss_avg_sup = Averager() log_r1_val = Averager() log_avg_path_loss_val = Averager() log_avg_mean_path_length_avg = Averager() log_ada_aug_p = Averager() """ final options """ with open(os.path.join(opt.exp_dir, opt.exp_name, 'opt.txt'), 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_synth_model != '' and opt.saved_synth_model != 'None': try: start_iter = int( opt.saved_synth_model.split('_')[-2].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass #get schedulers scheduler = get_scheduler(optimizer, opt) dis_scheduler = get_scheduler(dis_optimizer, opt) start_time = time.time() iteration = start_iter cntr = 0 mean_path_length = 0 d_loss_val = 0 r1_loss = torch.tensor(0.0, device=device) g_loss_val = 0 path_loss = torch.tensor(0.0, device=device) path_lengths = torch.tensor(0.0, device=device) mean_path_length_avg = 0 # loss_dict = {} accum = 0.5**(32 / (10 * 1000)) ada_augment = torch.tensor([0.0, 0.0], device=device) ada_aug_p = opt.augment_p if opt.augment_p > 0 else 0.0 ada_aug_step = opt.ada_target / opt.ada_length r_t_stat = 0 # sample_z = torch.randn(opt.n_sample, opt.latent, device=device) while (True): # print(cntr) # train part if opt.lr_policy != "None": scheduler.step() dis_scheduler.step() image_input_tensors, _, labels_1, _, phoc_1, _ = iter( train_loader).next() z_code, z_labels = iter(phoc_loader).next() image_input_tensors = image_input_tensors.to(device) gt_image_tensors = image_input_tensors[:opt.batch_size] real_image_tensors = image_input_tensors[opt.batch_size:] phoc_1 = phoc_1.to(device) gt_phoc_tensors = phoc_1[:opt.batch_size] labels_1 = labels_1[:opt.batch_size] z_code = z_code.to(device) requires_grad(genModel, False) # requires_grad(styleModel, False) # requires_grad(mixModel, False) requires_grad(disEncModel, True) text_1, length_1 = converter.encode( labels_1, batch_max_length=opt.batch_max_length) style = mixing_noise(z_code, opt.batch_size, opt.latent, opt.mixing, device) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style[0][:, :opt.latent]) if len(style) > 1: newstyle.append(style[1][:, :opt.latent]) style = newstyle fake_img, _ = genModel(style, input_is_latent=opt.input_latent) #unsupervised code prediction on generated image u_pred_code = disEncModel(fake_img, mode='enc') uCost = uCriterion(u_pred_code, z_code) #supervised code prediction on gt image s_pred_code = disEncModel(gt_image_tensors, mode='enc') sCost = uCriterion(s_pred_code, gt_phoc_tensors) #Domain discriminator fake_pred = disEncModel(fake_img) real_pred = disEncModel(real_image_tensors) disCost = d_logistic_loss(real_pred, fake_pred) dis_enc_cost = disCost + opt.gamma_e * uCost + opt.beta * sCost loss_avg_dis.add(disCost) loss_avg_sup.add(opt.beta * sCost) loss_avg_unsup.add(opt.gamma_e * uCost) disEncModel.zero_grad() dis_enc_cost.backward() dis_optimizer.step() d_regularize = cntr % opt.d_reg_every == 0 if d_regularize: real_image_tensors.requires_grad = True real_pred = disEncModel(real_image_tensors) r1_loss = d_r1_loss(real_pred, real_image_tensors) disEncModel.zero_grad() (opt.r1 / 2 * r1_loss * opt.d_reg_every + 0 * real_pred[0]).backward() dis_optimizer.step() # loss_dict["r1"] = r1_loss # [Word Generator] update image_input_tensors, _, labels_1, _, phoc_1, _ = iter( train_loader).next() z_code, z_labels = iter(phoc_loader).next() image_input_tensors = image_input_tensors.to(device) gt_image_tensors = image_input_tensors[:opt.batch_size] real_image_tensors = image_input_tensors[opt.batch_size:] phoc_1 = phoc_1.to(device) gt_phoc_tensors = phoc_1[:opt.batch_size] labels_1 = labels_1[:opt.batch_size] z_code = z_code.to(device) requires_grad(genModel, True) requires_grad(disEncModel, False) text_1, length_1 = converter.encode( labels_1, batch_max_length=opt.batch_max_length) style = mixing_noise(z_code, opt.batch_size, opt.latent, opt.mixing, device) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style[0][:, :opt.latent]) if len(style) > 1: newstyle.append(style[1][:, :opt.latent]) style = newstyle fake_img, _ = genModel(style, input_is_latent=opt.input_latent) #unsupervised code prediction on generated image u_pred_code = disEncModel(fake_img, mode='enc') uCost = uCriterion(u_pred_code, z_code) fake_pred = disEncModel(fake_img) disGenCost = g_nonsaturating_loss(fake_pred) gen_enc_cost = disGenCost + opt.gamma_g * uCost loss_avg_gen.add(disGenCost) loss_avg_unsup.add(opt.gamma_g * uCost) # loss_dict["g"] = disGenCost genModel.zero_grad() disEncModel.zero_grad() gen_enc_cost.backward() optimizer.step() g_regularize = cntr % opt.g_reg_every == 0 if g_regularize: image_input_tensors, _, labels_1, _, phoc_1, _ = iter( train_loader).next() z_code, z_labels = iter(phoc_loader).next() image_input_tensors = image_input_tensors.to(device) path_batch_size = max(1, opt.batch_size // opt.path_batch_shrink) gt_image_tensors = image_input_tensors[:path_batch_size] phoc_1 = phoc_1.to(device) gt_phoc_tensors = phoc_1[:path_batch_size] labels_1 = labels_1[:path_batch_size] z_code = z_code.to(device) z_code = z_code[:path_batch_size] z_labels = z_labels[:path_batch_size] text_1, length_1 = converter.encode( labels_1, batch_max_length=opt.batch_max_length) style = mixing_noise(z_code, path_batch_size, opt.latent, opt.mixing, device) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style[0][:, :opt.latent]) if len(style) > 1: newstyle.append(style[1][:, :opt.latent]) style = newstyle fake_img, grad = genModel(style, return_latents=True, g_path_regularize=True, mean_path_length=mean_path_length) decay = 0.01 path_lengths = torch.sqrt(grad.pow(2).sum(2).mean(1)) mean_path_length_orig = mean_path_length + decay * ( path_lengths.mean() - mean_path_length) path_loss = (path_lengths - mean_path_length_orig).pow(2).mean() mean_path_length = mean_path_length_orig.detach().item() # path_loss, mean_path_length, path_lengths = g_path_regularize( # images_recon_2, latents, mean_path_length # ) genModel.zero_grad() weighted_path_loss = opt.path_regularize * opt.g_reg_every * path_loss if opt.path_batch_shrink: weighted_path_loss += 0 * fake_img[0, 0, 0, 0] weighted_path_loss.backward() optimizer.step() # mean_path_length_avg = ( # reduce_sum(mean_path_length).item() / get_world_size() # ) #commented above for multi-gpu , non-distributed setting mean_path_length_avg = mean_path_length accumulate(g_ema, genModel, accum) log_r1_val.add(r1_loss) log_avg_path_loss_val.add(path_loss) log_avg_mean_path_length_avg.add(torch.tensor(mean_path_length_avg)) log_ada_aug_p.add(torch.tensor(ada_aug_p)) if get_rank() == 0: if wandb and opt.wandb: wandb.log({ "Generator": g_loss_val, "Discriminator": d_loss_val, "Augment": ada_aug_p, "Rt": r_t_stat, "R1": r1_val, "Path Length Regularization": path_loss_val, "Mean Path Length": mean_path_length, "Real Score": real_score_val, "Fake Score": fake_score_val, "Path Length": path_length_val, }) # validation part if ( iteration + 1 ) % opt.valInterval == 0 or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' #generate paired content with similar style z_code_1, z_labels_1 = iter(phoc_loader).next() z_code_2, z_labels_2 = iter(phoc_loader).next() z_code_1 = z_code_1.to(device) z_code_2 = z_code_2.to(device) style_1 = mixing_noise(z_code_1, opt.batch_size, opt.latent, opt.mixing, device) style_2 = [] style_2.append( torch.cat((style_1[0][:, :opt.latent], z_code_2), dim=1)) if len(style_1) > 1: style_2.append( torch.cat((style_1[1][:, :opt.latent], z_code_2), dim=1)) if opt.zAlone: #to validate orig style gan results newstyle = [] newstyle.append(style_1[0][:, :opt.latent]) if len(style_1) > 1: newstyle.append(style_1[1][:, :opt.latent]) style_1 = newstyle style_2 = newstyle fake_img_1, _ = g_ema(style_1, input_is_latent=opt.input_latent) fake_img_2, _ = g_ema(style_2, input_is_latent=opt.input_latent) os.makedirs(os.path.join(opt.trainDir, str(iteration)), exist_ok=True) for trImgCntr in range(opt.batch_size): try: save_image( tensor2im(fake_img_1[trImgCntr].detach()), os.path.join( opt.trainDir, str(iteration), str(trImgCntr) + '_pair1_' + z_labels_1[trImgCntr] + '.png')) save_image( tensor2im(fake_img_2[trImgCntr].detach()), os.path.join( opt.trainDir, str(iteration), str(trImgCntr) + '_pair2_' + z_labels_2[trImgCntr] + '.png')) except: print('Warning while saving training image') elapsed_time = time.time() - start_time # for log with open(os.path.join(opt.exp_dir, opt.exp_name, 'log_train.txt'), 'a') as log: # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] \ Train Dis loss: {loss_avg_dis.val():0.5f}, Train Gen loss: {loss_avg_gen.val():0.5f},\ Train UnSup loss: {loss_avg_unsup.val():0.5f}, Train Sup loss: {loss_avg_sup.val():0.5f}, \ Train R1-val loss: {log_r1_val.val():0.5f}, Train avg-path-loss: {log_avg_path_loss_val.val():0.5f}, \ Train mean-path-length loss: {log_avg_mean_path_length_avg.val():0.5f}, Train ada-aug-p: {log_ada_aug_p.val():0.5f}, \ Elapsed_time: {elapsed_time:0.5f}' #plotting lib.plot.plot(os.path.join(opt.plotDir, 'Train-Dis-Loss'), loss_avg_dis.val().item()) lib.plot.plot(os.path.join(opt.plotDir, 'Train-Gen-Loss'), loss_avg_gen.val().item()) lib.plot.plot(os.path.join(opt.plotDir, 'Train-UnSup-Loss'), loss_avg_unsup.val().item()) lib.plot.plot(os.path.join(opt.plotDir, 'Train-Sup-Loss'), loss_avg_sup.val().item()) lib.plot.plot(os.path.join(opt.plotDir, 'Train-r1_val'), log_r1_val.val().item()) lib.plot.plot(os.path.join(opt.plotDir, 'Train-path_loss_val'), log_avg_path_loss_val.val().item()) lib.plot.plot( os.path.join(opt.plotDir, 'Train-mean_path_length_avg'), log_avg_mean_path_length_avg.val().item()) lib.plot.plot(os.path.join(opt.plotDir, 'Train-ada_aug_p'), log_ada_aug_p.val().item()) print(loss_log) loss_avg_dis.reset() loss_avg_gen.reset() loss_avg_unsup.reset() loss_avg_sup.reset() log_r1_val.reset() log_avg_path_loss_val.reset() log_avg_mean_path_length_avg.reset() log_ada_aug_p.reset() lib.plot.flush() lib.plot.tick() # save model per 1e+5 iter. if (iteration) % 1e+4 == 0: torch.save( { 'genModel': genModel.state_dict(), 'g_ema': g_ema.state_dict(), 'disEncModel': disEncModel.state_dict(), 'optimizer': optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict() }, os.path.join(opt.exp_dir, opt.exp_name, 'iter_' + str(iteration + 1) + '_synth.pth')) if (iteration + 1) == opt.num_iter: print('end the training') sys.exit() iteration += 1 cntr += 1
parser.add_argument( "--input_channel", type=int, default=1, help="the number of input channel of Feature extractor", ) parser.add_argument( "--output_channel", type=int, default=512, help="the number of output channel of Feature extractor", ) parser.add_argument( "--hidden_size", type=int, default=256, help="the size of the LSTM hidden state" ) opt = parser.parse_args() if "CTC" in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) model = Model(opt) model.load_state_dict(torch.load(opt.model_path)) model.to(device) model.eval() for img_path in os.listdir("task1_2_test(361p)"): predict("task1_2_test(361p)/" + img_path, model)
def train(opt): """ dataset preparation """ if not opt.data_filtering_off: print('Filtering the images containing characters which are not in opt.character') print('Filtering the images whose label is longer than opt.batch_max_length') # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130 opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') #considering the real images for discriminator opt.batch_size = opt.batch_size*2 train_dataset = Batch_Balanced_Dataset(opt) log = open(os.path.join(opt.exp_dir,opt.exp_name,'log_dataset.txt'), 'a') AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset, valid_dataset_log = hierarchical_dataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, shuffle=True, # 'True' to check training progress with validation function. num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) log.write(valid_dataset_log) print('-' * 80) log.write('-' * 80 + '\n') log.close() """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = AdaINGen(opt) ocrModel = Model(opt) disModel = MsImageDis() print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # Synthesizer weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # Recognizer weight initialization for name, param in ocrModel.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # Discriminator weight initialization for name, param in disModel.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue # data parallel for multi-GPU model = torch.nn.DataParallel(model).to(device) model.train() ocrModel = torch.nn.DataParallel(ocrModel).to(device) ocrModel.train() disModel = torch.nn.DataParallel(disModel).to(device) disModel.train() if opt.saved_synth_model != '': print(f'loading pretrained synth model from {opt.saved_synth_model}') if opt.FT: model.load_state_dict(torch.load(opt.saved_synth_model), strict=False) else: model.load_state_dict(torch.load(opt.saved_synth_model)) print("Model:") print(model) if opt.saved_ocr_model != '': print(f'loading pretrained ocr model from {opt.saved_ocr_model}') if opt.FT: ocrModel.load_state_dict(torch.load(opt.saved_ocr_model), strict=False) else: ocrModel.load_state_dict(torch.load(opt.saved_ocr_model)) # ocrModel.eval() #as we can't call RNN.backward in eval mode print("OCRModel:") print(ocrModel) if opt.saved_dis_model != '': print(f'loading pretrained discriminator model from {opt.saved_dis_model}') if opt.FT: disModel.load_state_dict(torch.load(opt.saved_dis_model), strict=False) else: disModel.load_state_dict(torch.load(opt.saved_dis_model)) # ocrModel.eval() #as we can't call RNN.backward in eval mode print("DisModel:") print(disModel) """ setup loss """ if 'CTC' in opt.Prediction: ocrCriterion = torch.nn.CTCLoss(zero_infinity=True).to(device) else: ocrCriterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # ignore [GO] token = ignore index 0 recCriterion = torch.nn.L1Loss() # loss averager loss_avg = Averager() loss_avg_ocr = Averager() ##---------- loss_avg_dis = Averager() # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("SynthOptimizer:") print(optimizer) #filter parameters for OCR training # filter that only require gradient decent ocr_filtered_parameters = [] ocr_params_num = [] for p in filter(lambda p: p.requires_grad, ocrModel.parameters()): ocr_filtered_parameters.append(p) ocr_params_num.append(np.prod(p.size())) print('OCR Trainable params num : ', sum(ocr_params_num)) # setup optimizer if opt.adam: ocr_optimizer = optim.Adam(ocr_filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: ocr_optimizer = optim.Adadelta(ocr_filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("OCROptimizer:") print(ocr_optimizer) #filter parameters for OCR training # filter that only require gradient decent dis_filtered_parameters = [] dis_params_num = [] for p in filter(lambda p: p.requires_grad, disModel.parameters()): dis_filtered_parameters.append(p) dis_params_num.append(np.prod(p.size())) print('Dis Trainable params num : ', sum(dis_params_num)) # setup optimizer if opt.adam: dis_optimizer = optim.Adam(dis_filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) else: dis_optimizer = optim.Adadelta(dis_filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("DisOptimizer:") print(dis_optimizer) """ final options """ # print(opt) with open(os.path.join(opt.exp_dir,opt.exp_name,'opt.txt'), 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 if opt.saved_synth_model != '': try: start_iter = int(opt.saved_synth_model.split('_')[-1].split('.')[0]) print(f'continue to train, start_iter: {start_iter}') except: pass start_time = time.time() best_accuracy = -1 best_norm_ED = -1 best_accuracy_ocr = -1 best_norm_ED_ocr = -1 iteration = start_iter while(True): # train part image_tensors_all, labels_1_all, labels_2_all = train_dataset.get_batch() # ## comment # pdb.set_trace() # for imgCntr in range(image_tensors.shape[0]): # save_image(tensor2im(image_tensors[imgCntr]),'temp/'+str(imgCntr)+'.png') # pdb.set_trace() # ### disCnt = int(image_tensors_all.size(0)/2) image_tensors, image_tensors_real, labels_1, labels_2 = image_tensors_all[:disCnt], image_tensors_all[disCnt:disCnt+disCnt], labels_1_all[:disCnt], labels_2_all[:disCnt] image = image_tensors.to(device) image_real = image_tensors_real.to(device) text_1, length_1 = converter.encode(labels_1, batch_max_length=opt.batch_max_length) text_2, length_2 = converter.encode(labels_2, batch_max_length=opt.batch_max_length) batch_size = image.size(0) images_recon_1, images_recon_2, _ = model(image, text_1, text_2) if 'CTC' in opt.Prediction: #ocr training preds_ocr = ocrModel(image, text_1) preds_size_ocr = torch.IntTensor([preds_ocr.size(1)] * batch_size) preds_ocr = preds_ocr.log_softmax(2).permute(1, 0, 2) ocrCost_train = ocrCriterion(preds_ocr, text_1, preds_size_ocr, length_1) #dis training #Check: Using alternate real images disCost = opt.disWeight*0.5*(disModel.module.calc_dis_loss(images_recon_1.detach(), image_real) + disModel.module.calc_dis_loss(images_recon_2.detach(), image)) #synth training preds_1 = ocrModel(images_recon_1, text_1) preds_size_1 = torch.IntTensor([preds_1.size(1)] * batch_size) preds_1 = preds_1.log_softmax(2).permute(1, 0, 2) preds_2 = ocrModel(images_recon_2, text_2) preds_size_2 = torch.IntTensor([preds_2.size(1)] * batch_size) preds_2 = preds_2.log_softmax(2).permute(1, 0, 2) ocrCost = 0.5*(ocrCriterion(preds_1, text_1, preds_size_1, length_1) + ocrCriterion(preds_2, text_2, preds_size_2, length_2)) #gen training disGenCost = 0.5*(disModel.module.calc_gen_loss(images_recon_1)+disModel.module.calc_gen_loss(images_recon_2)) else: preds = model(image, text[:, :-1]) # align with Attention.forward target = text[:, 1:] # without [GO] Symbol ocrCost = ocrCriterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) recCost = recCriterion(images_recon_1,image) cost = opt.ocrWeight*ocrCost + opt.reconWeight*recCost + opt.disWeight*disGenCost disModel.zero_grad() disCost.backward() torch.nn.utils.clip_grad_norm_(disModel.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) dis_optimizer.step() loss_avg_dis.add(disCost) model.zero_grad() ocrModel.zero_grad() disModel.zero_grad() cost.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) optimizer.step() loss_avg.add(cost) #training OCR ocrModel.zero_grad() ocrCost_train.backward() torch.nn.utils.clip_grad_norm_(ocrModel.parameters(), opt.grad_clip) # gradient clipping with 5 (Default) ocr_optimizer.step() loss_avg_ocr.add(ocrCost_train) #START HERE # validation part if (iteration + 1) % opt.valInterval == 0 or iteration == 0: # To see training progress, we also conduct validation when 'iteration == 0' #Save training images os.makedirs(os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration)), exist_ok=True) for trImgCntr in range(batch_size): try: save_image(tensor2im(image[trImgCntr].detach()),os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration),str(trImgCntr)+'_input_'+labels_1[trImgCntr]+'.png')) save_image(tensor2im(images_recon_1[trImgCntr].detach()),os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration),str(trImgCntr)+'_recon_'+labels_1[trImgCntr]+'.png')) save_image(tensor2im(images_recon_2[trImgCntr].detach()),os.path.join(opt.exp_dir,opt.exp_name,'trainImages',str(iteration),str(trImgCntr)+'_pair_'+labels_2[trImgCntr]+'.png')) except: print('Warning while saving training image') elapsed_time = time.time() - start_time # for log with open(os.path.join(opt.exp_dir,opt.exp_name,'log_train.txt'), 'a') as log: model.eval() ocrModel.eval() disModel.eval() with torch.no_grad(): # valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation( # model, criterion, valid_loader, converter, opt) valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation_synth_adv( iteration, model, ocrModel, disModel, recCriterion, ocrCriterion, valid_loader, converter, opt) model.train() ocrModel.train() disModel.train() # training loss and validation loss loss_log = f'[{iteration+1}/{opt.num_iter}] Train OCR loss: {loss_avg_ocr.val():0.5f}, Train Synth loss: {loss_avg.val():0.5f}, Train Dis loss: {loss_avg_dis.val():0.5f}, Valid OCR loss: {valid_loss[0]:0.5f}, Valid Synth loss: {valid_loss[1]:0.5f}, Valid Dis loss: {valid_loss[2]:0.5f}, Elapsed_time: {elapsed_time:0.5f}' loss_avg_ocr.reset() loss_avg.reset() loss_avg_dis.reset() current_model_log_ocr = f'{"Current_accuracy_OCR":17s}: {current_accuracy[0]:0.3f}, {"Current_norm_ED_OCR":17s}: {current_norm_ED[0]:0.2f}' current_model_log_1 = f'{"Current_accuracy_recon":17s}: {current_accuracy[1]:0.3f}, {"Current_norm_ED_recon":17s}: {current_norm_ED[1]:0.2f}' current_model_log_2 = f'{"Current_accuracy_pair":17s}: {current_accuracy[2]:0.3f}, {"Current_norm_ED_pair":17s}: {current_norm_ED[2]:0.2f}' # keep best accuracy model (on valid dataset) if current_accuracy[1] > best_accuracy: best_accuracy = current_accuracy[1] torch.save(model.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_accuracy.pth')) torch.save(disModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_accuracy_dis.pth')) if current_norm_ED[1] > best_norm_ED: best_norm_ED = current_norm_ED[1] torch.save(model.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_norm_ED.pth')) torch.save(disModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_norm_ED_dis.pth')) best_model_log = f'{"Best_accuracy_Recon":17s}: {best_accuracy:0.3f}, {"Best_norm_ED_Recon":17s}: {best_norm_ED:0.2f}' # keep best accuracy model (on valid dataset) if current_accuracy[0] > best_accuracy_ocr: best_accuracy_ocr = current_accuracy[0] torch.save(ocrModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_accuracy_ocr.pth')) if current_norm_ED[0] > best_norm_ED_ocr: best_norm_ED_ocr = current_norm_ED[0] torch.save(ocrModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'best_norm_ED_ocr.pth')) best_model_log_ocr = f'{"Best_accuracy_ocr":17s}: {best_accuracy_ocr:0.3f}, {"Best_norm_ED_ocr":17s}: {best_norm_ED_ocr:0.2f}' loss_model_log = f'{loss_log}\n{current_model_log_ocr}\n{current_model_log_1}\n{current_model_log_2}\n{best_model_log_ocr}\n{best_model_log}' print(loss_model_log) log.write(loss_model_log + '\n') # show some predicted results dashed_line = '-' * 80 head = f'{"Ground Truth":32s} | {"Prediction":25s} | Confidence Score & T/F' predicted_result_log = f'{dashed_line}\n{head}\n{dashed_line}\n' for gt_ocr, pred_ocr, confidence_ocr, gt_1, pred_1, confidence_1, gt_2, pred_2, confidence_2 in zip(labels[0][:5], preds[0][:5], confidence_score[0][:5], labels[1][:5], preds[1][:5], confidence_score[1][:5], labels[2][:5], preds[2][:5], confidence_score[2][:5]): if 'Attn' in opt.Prediction: gt = gt[:gt.find('[s]')] pred = pred[:pred.find('[s]')] predicted_result_log += f'{"ocr"}: {gt_ocr:27s} | {pred_ocr:25s} | {confidence_ocr:0.4f}\t{str(pred_ocr == gt_ocr)}\n' predicted_result_log += f'{"recon"}: {gt_1:25s} | {pred_1:25s} | {confidence_1:0.4f}\t{str(pred_1 == gt_1)}\n' predicted_result_log += f'{"pair"}: {gt_2:26s} | {pred_2:25s} | {confidence_2:0.4f}\t{str(pred_2 == gt_2)}\n' predicted_result_log += f'{dashed_line}' print(predicted_result_log) log.write(predicted_result_log + '\n') # save model per 1e+5 iter. if (iteration + 1) % 1e+5 == 0: torch.save( model.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'iter_{iteration+1}.pth')) torch.save( ocrModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'iter_{iteration+1}_ocr.pth')) torch.save( disModel.state_dict(), os.path.join(opt.exp_dir,opt.exp_name,'iter_{iteration+1}_dis.pth')) if (iteration + 1) == opt.num_iter: print('end the training') sys.exit() iteration += 1
def demo(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) demo_data = RawDataset(root=opt.image_folder, opt=opt) # use RawDataset demo_loader = torch.utils.data.DataLoader(demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval() with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred) # (b, seq, index) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) #(seq, seq, ..., seq) _, preds_index = preds.max(2) # (b, seq) preds_index = preds_index.view(-1) preds_str = converter.decode(preds_index.data, preds_size.data) else: preds = model(image, text_for_pred, is_train=False) # select max probabilty (greedy decoding) then decode index to character _, preds_index = preds.max(2) preds_str = converter.decode(preds_index, length_for_pred) log = open(f'./log_demo_result.txt', 'a') dashed_line = '-' * 80 head = f'{"image_path":25s}\t{"predicted_labels":25s}\tconfidence score' print(f'{dashed_line}\n{head}\n{dashed_line}') log.write(f'{dashed_line}\n{head}\n{dashed_line}\n') preds_prob = F.softmax(preds, dim=2) preds_max_prob, _ = preds_prob.max(dim=2) for img_name, pred, pred_max_prob in zip(image_path_list, preds_str, preds_max_prob): if 'Attn' in opt.Prediction: pred_EOS = pred.find('[s]') pred = pred[: pred_EOS] # prune after "end of sentence" token ([s]) pred_max_prob = pred_max_prob[:pred_EOS] # calculate confidence score (= multiply of pred_max_prob) confidence_score = pred_max_prob.cumprod(dim=0)[-1] print(f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}') log.write( f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}\n') log.close()
def demo(opt): """ model configuration """ if 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) #print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, # opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, # opt.SequenceModeling, opt.Prediction) model = torch.nn.DataParallel(model).to(device) # load model #print('loading pretrained model from %s' % opt.saved_model) model.load_state_dict(torch.load(opt.saved_model, map_location=device)) # prepare data. two demo images from https://github.com/bgshih/crnn#run-demo AlignCollate_demo = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) demo_data = RawDataset(root=opt.image_folder, opt=opt) # use RawDataset demo_loader = torch.utils.data.DataLoader(demo_data, batch_size=opt.batch_size, shuffle=False, num_workers=int(opt.workers), collate_fn=AlignCollate_demo, pin_memory=True) # predict model.eval # Lista con los valores transcriptos predList = list() retList = dict() with torch.no_grad(): for image_tensors, image_path_list in demo_loader: batch_size = image_tensors.size(0) image = image_tensors.to(device) # For max length prediction length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device) text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device) if 'CTC' in opt.Prediction: preds = model(image, text_for_pred).log_softmax(2) # Select max probabilty (greedy decoding) then decode index to character preds_size = torch.IntTensor([preds.size(1)] * batch_size) _, preds_index = preds.max(2) preds_index = preds_index.view(-1) preds_str = converter.decode(preds_index.data, preds_size.data) else: preds = model(image, text_for_pred, is_train=False) # select max probabilty (greedy decoding) then decode index to character _, preds_index = preds.max(2) preds_str = converter.decode(preds_index, length_for_pred) preds_prob = F.softmax(preds, dim=2) preds_max_prob, _ = preds_prob.max(dim=2) for img_name, pred, pred_max_prob in zip(image_path_list, preds_str, preds_max_prob): if 'Attn' in opt.Prediction: pred_EOS = pred.find('[s]') pred = pred[: pred_EOS] # prune after "end of sentence" token ([s]) pred_max_prob = pred_max_prob[:pred_EOS] # calculate confidence score (= multiply of pred_max_prob) confidence_score = pred_max_prob.cumprod(dim=0)[-1] #Transcripciones restult = { "img_name": img_name, "pred": str(pred), "confidence_score": f'{confidence_score:0.4f}' } predList.append(restult) #Imagenes location /location/ file_list_1 = os.listdir("../process/location") file_list_2 = os.listdir("../process/images") retList["pred"] = predList retList["localizacion_url"] = "/location/" + file_list_1[0] retList["image_url"] = "/images/" + file_list_2[0] #for file in file_list: # restult = {"localizacion_url": "/location/"+file} # retList[] #Imagenes image /images/ #file_list = os.listdir("../process/images") #for file in file_list: # restult = {"image_url": "/images/"+file} # retList.append(restult) #json_mylist = json.dumps(retList) print(retList)