def main():
    """Train a text-recognition model with periodic evaluation and saving.

    The configuration is loaded from ``FLAGS.config`` and overridden with
    ``FLAGS.opt``. A train program and an eval program are built from the
    architecture named in the config; the training loop then runs for
    ``Global.epoch_num`` epochs, logging every ``print_step`` iterations,
    evaluating every ``eval_step`` iterations (the best model is saved to
    ``<save_dir>/best_accuracy``) and saving ``<save_dir>/iter_epoch_<n>``
    every ``save_epoch_step`` epochs (epoch 0 excluded).
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    char_ops = CharacterOps(config['Global'])
    config['Global']['char_num'] = char_ops.get_char_num()
    print(config)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    model_builder = create_module(config['Architecture']['function'])
    rec_model = model_builder(params=config)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    # Build the training graph: model outputs, optional gradient clipping,
    # optimizer, and the learning rate as an extra fetch target.
    with fluid.program_guard(train_prog, startup_prog), \
            fluid.unique_name.guard():
        train_loader, train_outputs = rec_model(mode="train")
        save_var = train_outputs[1]

        if "gradient_clip" in config['Global']:
            clip_norm = config['Global']['gradient_clip']
            fluid.clip.set_gradient_clip(
                fluid.clip.GradientClipByGlobalNorm(clip_norm),
                program=train_prog)

        train_fetch_list = [var.name for var in train_outputs]
        train_loss = train_outputs[0]
        opt_params = config['Optimizer']
        optimizer = create_module(opt_params['function'])(opt_params)
        optimizer.minimize(train_loss)
        global_lr = optimizer._global_learning_rate()
        global_lr.persistable = True
        # the learning rate is always the last fetched value
        train_fetch_list.append(global_lr.name)

    train_reader = reader.train_eval_reader(
        config=config, char_ops=char_ops, mode="train")
    train_loader.set_sample_list_generator(train_reader, places=place)

    # Build the evaluation graph on the same startup program.
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog), \
            fluid.unique_name.guard():
        eval_loader, eval_outputs = rec_model(mode="eval")
        eval_fetch_list = [var.name for var in eval_outputs]

    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)

    eval_reader = reader.train_eval_reader(
        config=config, char_ops=char_ops, mode="eval")
    eval_loader.set_sample_list_generator(eval_reader, places=place)

    # compile program for multi-devices
    train_compile_program = create_multi_devices_program(
        train_prog, train_loss.name)

    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        load_pretrain(exe, train_prog, pretrain_weights)

    train_batch_id = 0
    train_log_keys = ['loss', 'acc']
    run_cfg = config['Global']
    log_smooth_window = run_cfg['log_smooth_window']
    epoch_num = run_cfg['epoch_num']
    loss_type = run_cfg['loss_type']
    print_step = run_cfg['print_step']
    eval_step = run_cfg['eval_step']
    save_epoch_step = run_cfg['save_epoch_step']
    save_dir = run_cfg['save_dir']
    train_stats = TrainingStats(log_smooth_window, train_log_keys)

    train_log_fmt = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'
    eval_log_fmt = 'Test iter: {}, acc:{:.6f}, best_acc:{:.6f}, best_epoch:{}, best_batch_id:{}, sample_num:{}'

    best_eval_acc = -1
    best_batch_id = 0
    best_epoch = 0
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            # The loader signals the end of an epoch with EOFException.
            while True:
                step_begin = time.time()
                train_outs = exe.run(
                    program=train_compile_program,
                    fetch_list=train_fetch_list,
                    return_numpy=False)
                loss = np.mean(np.array(train_outs[0]))
                lr = np.mean(np.array(train_outs[-1]))

                pred_tensor = train_outs[1]
                label_tensor = train_outs[2]
                preds = np.array(pred_tensor)
                preds_lod = pred_tensor.lod()[0]
                labels = np.array(label_tensor)
                labels_lod = label_tensor.lod()[0]

                acc, acc_num, img_num = cal_predicts_accuracy(
                    char_ops, preds, preds_lod, labels, labels_lod)

                train_batch_elapse = time.time() - step_begin

                train_stats.update({'loss': loss, 'acc': acc})

                if train_batch_id > 0 and train_batch_id % print_step == 0:
                    logs = train_stats.log()
                    logger.info(
                        train_log_fmt.format(epoch, train_batch_id, lr, logs,
                                             train_batch_elapse))

                if train_batch_id > 0 and train_batch_id % eval_step == 0:
                    outs = eval_run(exe, eval_prog, eval_loader,
                                    eval_fetch_list, char_ops, train_batch_id,
                                    "eval")
                    eval_acc, acc_num, sample_num = outs
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_model(train_prog, save_dir + "/best_accuracy")

                    logger.info(
                        eval_log_fmt.format(train_batch_id, eval_acc,
                                            best_eval_acc, best_epoch,
                                            best_batch_id, sample_num))
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()

        if epoch > 0 and epoch % save_epoch_step == 0:
            save_model(train_prog, save_dir + "/iter_epoch_%d" % (epoch))
def main():
    """Run text recognition on every image under ``Global.infer_img``.

    The configuration is loaded from ``FLAGS.config`` and overridden with
    ``FLAGS.opt``. The test-mode program is built and initialised, then each
    image yielded by the test reader is fed through it and decoded according
    to ``Global.loss_type`` (``"ctc"``, ``"attention"`` or ``"srn"``). The
    decoded indices, text and mean confidence are logged per image; images
    with no valid prediction are skipped. Finally the program is exported as
    an inference model to ``./output/``.
    """
    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)
    char_ops = CharacterOps(config['Global'])
    loss_type = config['Global']['loss_type']
    config['Global']['char_ops'] = char_ops

    use_gpu = config['Global']['use_gpu']
    # NOTE: the check that use_gpu matches the installed paddlepaddle build
    # (check_gpu) is disabled in this script.

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    rec_model = create_module(
        config['Architecture']['function'])(params=config)
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            _, outputs = rec_model(mode="test")
            fetch_name_list = list(outputs.keys())
            fetch_varname_list = [outputs[v].name for v in fetch_name_list]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)

    init_model(config, eval_prog, exe)

    blobs = reader_main(config, 'test')()
    infer_img = config['Global']['infer_img']
    infer_list = get_image_file_list(infer_img)
    if len(infer_list) == 0:
        logger.info("Can not find img in infer_img dir.")

    for img_path in infer_list:
        logger.info("infer_img:%s" % img_path)
        # The reader is advanced once per listed file.
        img = next(blobs)
        if loss_type != "srn":
            predict = exe.run(program=eval_prog,
                              feed={"image": img},
                              fetch_list=fetch_varname_list,
                              return_numpy=False)
        else:
            # For SRN the reader yields the image followed by four auxiliary
            # inputs; each is passed through np.concatenate as a one-element
            # batch and cast to the dtype used in the feed.
            encoder_word_pos_list = np.concatenate(
                [img[1]], axis=0).astype(np.int64)
            gsrm_word_pos_list = np.concatenate(
                [img[2]], axis=0).astype(np.int64)
            gsrm_slf_attn_bias1_list = np.concatenate(
                [img[3]], axis=0).astype(np.float32)
            gsrm_slf_attn_bias2_list = np.concatenate(
                [img[4]], axis=0).astype(np.float32)

            predict = exe.run(
                program=eval_prog,
                feed={
                    'image': img[0],
                    'encoder_word_pos': encoder_word_pos_list,
                    'gsrm_word_pos': gsrm_word_pos_list,
                    'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list,
                    'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list
                },
                fetch_list=fetch_varname_list,
                return_numpy=False)

        if loss_type == "ctc":
            preds = np.array(predict[0])
            preds = preds.reshape(-1)
            preds_text = char_ops.decode(preds)
            probs = np.array(predict[1])
            ind = np.argmax(probs, axis=1)
            # Steps whose argmax is the last class are excluded from the
            # score (presumably the CTC blank - confirm against the model).
            blank = probs.shape[1]
            valid_ind = np.where(ind != (blank - 1))[0]
            if len(valid_ind) == 0:
                continue
            score = np.mean(probs[valid_ind, ind[valid_ind]])
        elif loss_type == "attention":
            preds = np.array(predict[0])
            probs = np.array(predict[1])
            # Position 0 is dropped; when id 1 occurs at least twice the
            # sequence is cut at its second occurrence.
            end_pos = np.where(preds[0, :] == 1)[0]
            if len(end_pos) <= 1:
                preds = preds[0, 1:]
                score = np.mean(probs[0, 1:])
            else:
                preds = preds[0, 1:end_pos[1]]
                score = np.mean(probs[0, 1:end_pos[1]])
            preds = preds.reshape(-1)
            preds_text = char_ops.decode(preds)
        elif loss_type == "srn":
            preds = np.array(predict[0])
            preds = preds.reshape(-1)
            probs = np.array(predict[1])
            ind = np.argmax(probs, axis=1)
            # The index skipped here is derived from the dictionary size
            # instead of the previous hard-coded 37, which only matched a
            # 38-class dictionary.
            ignored_idx = int(char_ops.get_char_num() - 1)
            valid_ind = np.where(preds != ignored_idx)[0]
            if len(valid_ind) == 0:
                continue
            score = np.mean(probs[valid_ind, ind[valid_ind]])
            preds = preds[:valid_ind[-1] + 1]
            preds_text = char_ops.decode(preds)
        logger.info("\t index: {}".format(preds))
        logger.info("\t word : {}".format(preds_text))
        logger.info("\t score: {}".format(score))

    # save for inference model
    target_var = list(outputs.values())

    fluid.io.save_inference_model(
        "./output/",
        feeded_var_names=['image'],
        target_vars=target_var,
        executor=exe,
        main_program=eval_prog,
        model_filename="model",
        params_filename="params")
def main():
    """Prune a model using stored sensitivities, then train and evaluate it.

    The configuration is loaded from ``FLAGS.config`` and overridden with
    ``FLAGS.opt``. Train and eval programs are built and initialised, pruning
    ratios are derived from ``sensitivities_0.data`` (excluding the names in
    ``skip_list`` and any parameter whose name contains ``conv1_`` ..
    ``conv4_``), both programs are pruned, and the pruned programs are handed
    to the detection or recognition train/eval runner depending on
    ``Global.algorithm``.
    """
    # Run code with static graph mode.
    try:
        paddle.enable_static()
    except Exception as err:
        # Best effort: the call may be unavailable in the installed Paddle.
        # Only Exception is caught so Ctrl-C and SystemExit still propagate.
        logger.warning(
            "paddle.enable_static() could not be called: {}".format(err))

    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    program.check_gpu(use_gpu)

    alg = config['Global']['algorithm']
    assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE']
    if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE']:
        # recognition algorithms additionally need the character codec
        config['Global']['char_ops'] = CharacterOps(config['Global'])

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    startup_program = fluid.Program()
    train_program = fluid.Program()
    train_build_outputs = program.build(
        config, train_program, startup_program, mode='train')
    train_loader = train_build_outputs[0]
    train_fetch_name_list = train_build_outputs[1]
    train_fetch_varname_list = train_build_outputs[2]
    train_opt_loss_name = train_build_outputs[3]

    eval_program = fluid.Program()
    eval_build_outputs = program.build(
        config, eval_program, startup_program, mode='eval')
    eval_fetch_name_list = eval_build_outputs[1]
    eval_fetch_varname_list = eval_build_outputs[2]
    eval_program = eval_program.clone(for_test=True)

    train_reader = reader_main(config=config, mode="train")
    train_loader.set_sample_list_generator(train_reader, places=place)

    eval_reader = reader_main(config=config, mode="eval")

    exe = fluid.Executor(place)
    exe.run(startup_program)

    init_model(config, train_program, exe)

    # Drop parameters that must not be pruned before computing the ratios.
    sen = load_sensitivities("sensitivities_0.data")
    for name in skip_list:
        sen.pop(name, None)
    backbone_markers = ['conv' + str(x) + '_' for x in range(1, 5)]
    for key in list(sen.keys()):
        if any(marker in key for marker in backbone_markers):
            sen.pop(key)
    ratios = get_ratios_by_loss(sen, 0.03)
    logger.info("FLOPs before pruning: {}".format(flops(eval_program)))
    pruner = Pruner(criterion='geometry_median')
    print("ratios: {}".format(ratios))
    # The eval program is pruned with only_graph=True; the train program is
    # pruned with the default setting.
    pruned_val_program, _, _ = pruner.prune(
        eval_program,
        fluid.global_scope(),
        params=ratios.keys(),
        ratios=ratios.values(),
        place=place,
        only_graph=True)

    pruned_program, _, _ = pruner.prune(
        train_program,
        fluid.global_scope(),
        params=ratios.keys(),
        ratios=ratios.values(),
        place=place)
    logger.info("FLOPs after pruning: {}".format(flops(pruned_val_program)))

    # compile program for multi-devices
    train_compile_program = program.create_multi_devices_program(
        pruned_program, train_opt_loss_name)

    train_info_dict = {
        'compile_program': train_compile_program,
        'train_program': pruned_program,
        'reader': train_loader,
        'fetch_name_list': train_fetch_name_list,
        'fetch_varname_list': train_fetch_varname_list
    }

    eval_info_dict = {
        'program': pruned_val_program,
        'reader': eval_reader,
        'fetch_name_list': eval_fetch_name_list,
        'fetch_varname_list': eval_fetch_varname_list
    }

    if alg in ['EAST', 'DB']:
        program.train_eval_det_run(
            config, exe, train_info_dict, eval_info_dict, is_slim="prune")
    else:
        program.train_eval_rec_run(config, exe, train_info_dict,
                                   eval_info_dict)
def main():
    """Build the train and eval programs and launch the matching runner.

    The configuration is loaded from ``FLAGS.config`` and overridden with
    ``FLAGS.opt``. ``Global.algorithm`` selects between the detection runner
    (``EAST``, ``DB``) and the recognition runner (``Rosetta``, ``CRNN``,
    ``STARNet``, ``RARE``); recognition additionally registers a
    ``CharacterOps`` instance under ``Global.char_ops``.
    """
    det_algorithms = ['EAST', 'DB']
    rec_algorithms = ['Rosetta', 'CRNN', 'STARNet', 'RARE']

    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    program.check_gpu(use_gpu)

    alg = config['Global']['algorithm']
    assert alg in det_algorithms + rec_algorithms
    if alg in rec_algorithms:
        config['Global']['char_ops'] = CharacterOps(config['Global'])

    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    startup_program = fluid.Program()

    # training graph
    train_program = fluid.Program()
    train_build_outputs = program.build(
        config, train_program, startup_program, mode='train')
    train_loader = train_build_outputs[0]
    train_fetch_name_list = train_build_outputs[1]
    train_fetch_varname_list = train_build_outputs[2]
    train_opt_loss_name = train_build_outputs[3]

    # evaluation graph, sharing the startup program
    eval_program = fluid.Program()
    eval_build_outputs = program.build(
        config, eval_program, startup_program, mode='eval')
    eval_fetch_name_list = eval_build_outputs[1]
    eval_fetch_varname_list = eval_build_outputs[2]
    eval_program = eval_program.clone(for_test=True)

    train_reader = reader_main(config=config, mode="train")
    train_loader.set_sample_list_generator(train_reader, places=place)

    eval_reader = reader_main(config=config, mode="eval")

    exe = fluid.Executor(place)
    exe.run(startup_program)

    # compile program for multi-devices
    train_compile_program = program.create_multi_devices_program(
        train_program, train_opt_loss_name)

    init_model(config, train_program, exe)

    train_info_dict = dict([
        ('compile_program', train_compile_program),
        ('train_program', train_program),
        ('reader', train_loader),
        ('fetch_name_list', train_fetch_name_list),
        ('fetch_varname_list', train_fetch_varname_list),
    ])

    eval_info_dict = dict([
        ('program', eval_program),
        ('reader', eval_reader),
        ('fetch_name_list', eval_fetch_name_list),
        ('fetch_varname_list', eval_fetch_varname_list),
    ])

    if alg not in det_algorithms:
        program.train_eval_rec_run(config, exe, train_info_dict,
                                   eval_info_dict)
    else:
        program.train_eval_det_run(config, exe, train_info_dict,
                                   eval_info_dict)
class TextRecognizer(object):
    """Batch text recognizer running on a Paddle inference predictor.

    Calling an instance with a list of cropped text images returns one
    ``[text, score]`` pair per image (in input order) together with the
    accumulated prediction time.
    """

    def __init__(self, args):
        """Create the predictor and the character codec from ``args``.

        Reads ``rec_image_shape`` (comma-separated ``C,H,W``),
        ``rec_char_type``, ``rec_batch_num``, ``rec_algorithm``,
        ``rec_char_dict_path`` and ``use_space_char`` from ``args``.
        """
        self.predictor, self.input_tensor, self.output_tensors =\
            utility.create_predictor(args, mode="rec")
        self.rec_image_shape = [
            int(v) for v in args.rec_image_shape.split(",")
        ]
        self.character_type = args.rec_char_type
        self.rec_batch_num = args.rec_batch_num
        self.rec_algorithm = args.rec_algorithm
        # RARE is decoded as attention; every other algorithm as CTC.
        if self.rec_algorithm != "RARE":
            self.loss_type = 'ctc'
        else:
            self.loss_type = 'attention'
        char_ops_params = {
            "character_type": args.rec_char_type,
            "character_dict_path": args.rec_char_dict_path,
            "use_space_char": args.use_space_char,
            "loss_type": self.loss_type
        }
        self.char_ops = CharacterOps(char_ops_params)

    def resize_norm_img(self, img, max_wh_ratio):
        """Resize ``img`` to the model height, normalise it and right-pad it.

        Args:
            img: image array indexed as (height, width, channel); its channel
                count must equal the configured one.
            max_wh_ratio: largest width/height ratio in the current batch.
                For character type ``"ch"`` it sets the padded width to
                ``int(32 * max_wh_ratio)``.

        Returns:
            float32 array of shape (C, H, W) with values scaled to [-1, 1]
            and zeros to the right of the resized image.
        """
        imgC, imgH, imgW = self.rec_image_shape
        assert imgC == img.shape[2]
        if self.character_type == "ch":
            imgW = int((32 * max_wh_ratio))
        h, w = img.shape[:2]
        ratio = w / float(h)
        # Keep the aspect ratio unless the result would exceed the target
        # width.
        resized_w = min(imgW, int(math.ceil(imgH * ratio)))
        resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype('float32')
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

    def __call__(self, img_list):
        """Recognise the text in every image of ``img_list``.

        Args:
            img_list: list of image arrays indexed as (height, width, ...).

        Returns:
            ``(rec_res, predict_time)``: ``rec_res[i]`` is ``[text, score]``
            for ``img_list[i]`` (``['', 0.0]`` when nothing valid was
            predicted) and ``predict_time`` is the summed time spent in the
            predictor.
        """
        img_num = len(img_list)
        # Calculate the aspect ratio of all text bars
        width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]
        # Sorting can speed up the recognition process
        indices = np.argsort(np.array(width_list))

        # One independent placeholder per image; ``[x] * n`` would make all
        # entries share a single list object.
        rec_res = [['', 0.0] for _ in range(img_num)]
        batch_num = self.rec_batch_num
        predict_time = 0
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)
            batch_imgs = [
                img_list[indices[ino]]
                for ino in range(beg_img_no, end_img_no)
            ]
            max_wh_ratio = 0
            for img in batch_imgs:
                h, w = img.shape[0:2]
                max_wh_ratio = max(max_wh_ratio, w * 1.0 / h)
            norm_img_batch = [
                self.resize_norm_img(img, max_wh_ratio)[np.newaxis, :]
                for img in batch_imgs
            ]
            norm_img_batch = np.concatenate(norm_img_batch)
            norm_img_batch = norm_img_batch.copy()

            starttime = time.time()
            self.input_tensor.copy_from_cpu(norm_img_batch)
            self.predictor.zero_copy_run()

            if self.loss_type == "ctc":
                rec_idx_batch = self.output_tensors[0].copy_to_cpu()
                rec_idx_lod = self.output_tensors[0].lod()[0]
                predict_batch = self.output_tensors[1].copy_to_cpu()
                predict_lod = self.output_tensors[1].lod()[0]
                elapse = time.time() - starttime
                predict_time += elapse
                # Consecutive LoD offsets delimit the rows of each sample.
                for rno in range(len(rec_idx_lod) - 1):
                    beg = rec_idx_lod[rno]
                    end = rec_idx_lod[rno + 1]
                    rec_idx_tmp = rec_idx_batch[beg:end, 0]
                    preds_text = self.char_ops.decode(rec_idx_tmp)
                    beg = predict_lod[rno]
                    end = predict_lod[rno + 1]
                    probs = predict_batch[beg:end, :]
                    ind = np.argmax(probs, axis=1)
                    # Steps whose argmax is the last class are excluded from
                    # the score (presumably the CTC blank - confirm).
                    blank = probs.shape[1]
                    valid_ind = np.where(ind != (blank - 1))[0]
                    # Check for emptiness before averaging: np.mean of an
                    # empty selection yields nan and a RuntimeWarning.
                    if len(valid_ind) == 0:
                        continue
                    score = np.mean(probs[valid_ind, ind[valid_ind]])
                    rec_res[indices[beg_img_no + rno]] = [preds_text, score]
            else:
                rec_idx_batch = self.output_tensors[0].copy_to_cpu()
                predict_batch = self.output_tensors[1].copy_to_cpu()
                elapse = time.time() - starttime
                predict_time += elapse
                for rno in range(len(rec_idx_batch)):
                    # Position 0 is dropped; when id 1 occurs at least twice
                    # the sequence is cut at its second occurrence.
                    end_pos = np.where(rec_idx_batch[rno, :] == 1)[0]
                    if len(end_pos) <= 1:
                        preds = rec_idx_batch[rno, 1:]
                        score = np.mean(predict_batch[rno, 1:])
                    else:
                        preds = rec_idx_batch[rno, 1:end_pos[1]]
                        score = np.mean(predict_batch[rno, 1:end_pos[1]])
                    preds_text = self.char_ops.decode(preds)
                    rec_res[indices[beg_img_no + rno]] = [preds_text, score]

        return rec_res, predict_time
def main():
    """Recognise the first batch of test images and export the model.

    The configuration is loaded from ``FLAGS.config`` and overridden with
    ``FLAGS.opt``. The test-mode program is built and initialised, every
    image in the first item yielded by the test reader is run through it,
    and the raw prediction and decoded text are printed. The program is then
    exported as an inference model to ``./output/``.
    """
    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)
    char_ops = CharacterOps(config['Global'])
    config['Global']['char_ops'] = char_ops

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    # check_gpu(use_gpu)

    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    model_builder = create_module(config['Architecture']['function'])
    rec_model = model_builder(params=config)

    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog), \
            fluid.unique_name.guard():
        _, outputs = rec_model(mode="test")
        fetch_name_list = list(outputs.keys())
        fetch_varname_list = [outputs[name].name for name in fetch_name_list]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)

    init_model(config, eval_prog, exe)

    blobs = reader_main(config, 'test')
    imgs = next(blobs())
    for img in imgs:
        predict = exe.run(
            program=eval_prog,
            feed={"image": img},
            fetch_list=fetch_varname_list,
            return_numpy=False)

        preds = np.array(predict[0])
        if preds.shape[1] != 1:
            # Position 0 is dropped; when id 1 occurs at least twice the
            # sequence is cut at its second occurrence.
            end_pos = np.where(preds[0, :] == 1)[0]
            if len(end_pos) > 1:
                token_ids = preds[0, 1:end_pos[1]]
            else:
                token_ids = preds[0, 1:]
            preds_text = char_ops.decode(token_ids.reshape(-1))
        else:
            # A single column of indices: flatten and decode directly.
            preds = preds.reshape(-1)
            preds_lod = predict[0].lod()[0]
            preds_text = char_ops.decode(preds)

        print(preds)
        print(preds_text)

    # save for inference model
    target_var = [values for key, values in outputs.items()]

    fluid.io.save_inference_model(
        "./output/",
        feeded_var_names=['image'],
        target_vars=target_var,
        executor=exe,
        main_program=eval_prog,
        model_filename="model",
        params_filename="params")
class TextRecognizer(object):
    """Batch text recognizer supporting CTC, attention and SRN decoding.

    Calling an instance with a list of cropped text images returns one
    ``[text, score]`` pair per image (in input order) together with the
    accumulated prediction time.
    """

    def __init__(self, args):
        """Create the predictor (unless serving is used) and the codec.

        Reads ``use_pdserving``, ``use_zero_copy_run``, ``rec_image_shape``
        (comma-separated ``C,H,W``), ``rec_char_type``, ``rec_batch_num``,
        ``rec_algorithm``, ``max_text_length``, ``rec_char_dict_path`` and
        ``use_space_char`` from ``args``. The predictor attributes are only
        set when ``args.use_pdserving is False``.
        """
        if args.use_pdserving is False:
            self.predictor, self.input_tensor, self.output_tensors = \
                utility.create_predictor(args, mode="rec")
            self.use_zero_copy_run = args.use_zero_copy_run
        self.rec_image_shape = [
            int(v) for v in args.rec_image_shape.split(",")
        ]
        self.character_type = args.rec_char_type
        self.rec_batch_num = args.rec_batch_num
        self.rec_algorithm = args.rec_algorithm
        self.text_len = args.max_text_length
        char_ops_params = {
            "character_type": args.rec_char_type,
            "character_dict_path": args.rec_char_dict_path,
            "use_space_char": args.use_space_char,
            "max_text_length": args.max_text_length
        }
        # Map the algorithm name to the decoding scheme. Unknown names leave
        # ``loss_type`` unset, exactly as before.
        if self.rec_algorithm in ["CRNN", "Rosetta", "STAR-Net"]:
            char_ops_params['loss_type'] = 'ctc'
            self.loss_type = 'ctc'
        elif self.rec_algorithm == "RARE":
            char_ops_params['loss_type'] = 'attention'
            self.loss_type = 'attention'
        elif self.rec_algorithm == "SRN":
            char_ops_params['loss_type'] = 'srn'
            self.loss_type = 'srn'
        self.char_ops = CharacterOps(char_ops_params)

    def resize_norm_img(self, img, max_wh_ratio):
        """Resize ``img`` to the model height, normalise it and right-pad it.

        Args:
            img: image array indexed as (height, width, channel); its channel
                count must equal the configured one.
            max_wh_ratio: largest width/height ratio in the current batch.
                For character type ``"ch"`` the padded width becomes
                ``int(32 * max(max_wh_ratio, W / H))``.

        Returns:
            float32 array of shape (C, H, W) with values scaled to [-1, 1]
            and zeros to the right of the resized image.
        """
        imgC, imgH, imgW = self.rec_image_shape
        assert imgC == img.shape[2]
        # Never shrink below the configured aspect ratio.
        wh_ratio = max(max_wh_ratio, imgW * 1.0 / imgH)
        if self.character_type == "ch":
            imgW = int((32 * wh_ratio))
        h, w = img.shape[:2]
        ratio = w / float(h)
        # Keep the aspect ratio unless the result would exceed the target
        # width.
        resized_w = min(imgW, int(math.ceil(imgH * ratio)))
        resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype('float32')
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

    def resize_norm_img_srn(self, img, image_shape):
        """Resize ``img`` for SRN and return it as a grayscale plane.

        The image is resized to height H and to width H, 2H or 3H according
        to its aspect ratio (full width W beyond 3:1), converted from BGR to
        gray and pasted at the left of a zero (H, W) canvas.

        Returns:
            float32 array of shape (1, H, W).
        """
        imgC, imgH, imgW = image_shape

        img_black = np.zeros((imgH, imgW))
        im_hei = img.shape[0]
        im_wid = img.shape[1]

        if im_wid <= im_hei * 1:
            img_new = cv2.resize(img, (imgH * 1, imgH))
        elif im_wid <= im_hei * 2:
            img_new = cv2.resize(img, (imgH * 2, imgH))
        elif im_wid <= im_hei * 3:
            img_new = cv2.resize(img, (imgH * 3, imgH))
        else:
            img_new = cv2.resize(img, (imgW, imgH))

        img_np = np.asarray(img_new)
        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
        img_black[:, 0:img_np.shape[1]] = img_np
        img_black = img_black[:, :, np.newaxis]

        row, col, c = img_black.shape
        c = 1

        return np.reshape(img_black, (c, row, col)).astype(np.float32)

    def srn_other_inputs(self, image_shape, num_heads, max_text_length,
                         char_num):
        """Build the auxiliary position and attention-bias inputs for SRN.

        Args:
            image_shape: ``(C, H, W)`` of the network input.
            num_heads: number of copies of each bias along axis 1.
            max_text_length: number of decoding positions.
            char_num: accepted for interface compatibility; not used here.

        Returns:
            ``[encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2]`` where the positions are int64 arrays of
            shape ``(1, (H/8)*(W/8), 1)`` and ``(1, max_text_length, 1)``,
            and the biases have shape ``(1, num_heads, max_text_length,
            max_text_length)`` with ``-1e9`` strictly above (bias1) or
            strictly below (bias2) the diagonal.
        """
        imgC, imgH, imgW = image_shape
        feature_dim = int((imgH / 8) * (imgW / 8))

        encoder_word_pos = np.arange(feature_dim).reshape(
            (feature_dim, 1)).astype('int64')
        gsrm_word_pos = np.arange(max_text_length).reshape(
            (max_text_length, 1)).astype('int64')

        bias_shape = [-1, 1, max_text_length, max_text_length]
        head_tiling = [1, num_heads, 1, 1]
        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))

        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data,
                                      1).reshape(bias_shape)
        gsrm_slf_attn_bias1 = np.tile(
            gsrm_slf_attn_bias1, head_tiling).astype('float32') * [-1e9]

        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data,
                                      -1).reshape(bias_shape)
        gsrm_slf_attn_bias2 = np.tile(
            gsrm_slf_attn_bias2, head_tiling).astype('float32') * [-1e9]

        encoder_word_pos = encoder_word_pos[np.newaxis, :]
        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]

        return [
            encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2
        ]

    def process_image_srn(self,
                          img,
                          image_shape,
                          num_heads,
                          max_text_length,
                          char_ops=None):
        """Prepare one image and its auxiliary inputs for the SRN model.

        Returns:
            ``(norm_img, encoder_word_pos, gsrm_word_pos,
            gsrm_slf_attn_bias1, gsrm_slf_attn_bias2)``; the image carries a
            leading batch axis of 1 and both biases are float32.
        """
        norm_img = self.resize_norm_img_srn(img, image_shape)
        norm_img = norm_img[np.newaxis, :]
        # ``char_ops`` defaults to None, so only query it when it is given;
        # the value is merely forwarded to srn_other_inputs.
        char_num = char_ops.get_char_num() if char_ops is not None else None

        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
         gsrm_slf_attn_bias2] = self.srn_other_inputs(
             image_shape, num_heads, max_text_length, char_num)

        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)

        return (norm_img, encoder_word_pos, gsrm_word_pos,
                gsrm_slf_attn_bias1, gsrm_slf_attn_bias2)

    def __call__(self, img_list):
        """Recognise the text in every image of ``img_list``.

        Args:
            img_list: list of image arrays indexed as (height, width, ...).

        Returns:
            ``(rec_res, predict_time)``: ``rec_res[i]`` is ``[text, score]``
            for ``img_list[i]`` (``['', 0.0]`` when nothing valid was
            predicted) and ``predict_time`` is the summed time spent in the
            predictor.
        """
        img_num = len(img_list)
        # Calculate the aspect ratio of all text bars
        width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]
        # Sorting can speed up the recognition process
        indices = np.argsort(np.array(width_list))

        # One independent placeholder per image; ``[x] * n`` would make all
        # entries share a single list object.
        rec_res = [['', 0.0] for _ in range(img_num)]
        batch_num = self.rec_batch_num
        predict_time = 0
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)
            batch_imgs = [
                img_list[indices[ino]]
                for ino in range(beg_img_no, end_img_no)
            ]
            max_wh_ratio = 0
            for img in batch_imgs:
                h, w = img.shape[0:2]
                max_wh_ratio = max(max_wh_ratio, w * 1.0 / h)

            norm_img_batch = []
            if self.loss_type != "srn":
                for img in batch_imgs:
                    norm_img = self.resize_norm_img(img, max_wh_ratio)
                    norm_img_batch.append(norm_img[np.newaxis, :])
            else:
                # Collect the auxiliary inputs of every image in the batch.
                # These lists used to be re-created per image, so only the
                # last image's inputs reached the predictor.
                encoder_word_pos_list = []
                gsrm_word_pos_list = []
                gsrm_slf_attn_bias1_list = []
                gsrm_slf_attn_bias2_list = []
                for img in batch_imgs:
                    srn_inputs = self.process_image_srn(
                        img, self.rec_image_shape, 8, 25, self.char_ops)
                    norm_img_batch.append(srn_inputs[0])
                    encoder_word_pos_list.append(srn_inputs[1])
                    gsrm_word_pos_list.append(srn_inputs[2])
                    gsrm_slf_attn_bias1_list.append(srn_inputs[3])
                    gsrm_slf_attn_bias2_list.append(srn_inputs[4])

            norm_img_batch = np.concatenate(norm_img_batch, axis=0)
            norm_img_batch = norm_img_batch.copy()

            if self.loss_type == "srn":
                encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
                gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
                gsrm_slf_attn_bias1_list = np.concatenate(
                    gsrm_slf_attn_bias1_list)
                gsrm_slf_attn_bias2_list = np.concatenate(
                    gsrm_slf_attn_bias2_list)
                starttime = time.time()

                # Note the order: both biases precede gsrm_word_pos.
                inputs = [
                    fluid.core.PaddleTensor(norm_img_batch),
                    fluid.core.PaddleTensor(encoder_word_pos_list),
                    fluid.core.PaddleTensor(gsrm_slf_attn_bias1_list),
                    fluid.core.PaddleTensor(gsrm_slf_attn_bias2_list),
                    fluid.core.PaddleTensor(gsrm_word_pos_list),
                ]

                self.predictor.run(inputs)
            else:
                starttime = time.time()
                if self.use_zero_copy_run:
                    self.input_tensor.copy_from_cpu(norm_img_batch)
                    self.predictor.zero_copy_run()
                else:
                    norm_img_batch = fluid.core.PaddleTensor(norm_img_batch)
                    self.predictor.run([norm_img_batch])

            if self.loss_type == "ctc":
                rec_idx_batch = self.output_tensors[0].copy_to_cpu()
                rec_idx_lod = self.output_tensors[0].lod()[0]
                predict_batch = self.output_tensors[1].copy_to_cpu()
                predict_lod = self.output_tensors[1].lod()[0]
                elapse = time.time() - starttime
                predict_time += elapse
                # Consecutive LoD offsets delimit the rows of each sample.
                for rno in range(len(rec_idx_lod) - 1):
                    beg = rec_idx_lod[rno]
                    end = rec_idx_lod[rno + 1]
                    rec_idx_tmp = rec_idx_batch[beg:end, 0]
                    preds_text = self.char_ops.decode(rec_idx_tmp)
                    beg = predict_lod[rno]
                    end = predict_lod[rno + 1]
                    probs = predict_batch[beg:end, :]
                    ind = np.argmax(probs, axis=1)
                    # Steps whose argmax is the last class are excluded from
                    # the score (presumably the CTC blank - confirm).
                    blank = probs.shape[1]
                    valid_ind = np.where(ind != (blank - 1))[0]
                    if len(valid_ind) == 0:
                        continue
                    score = np.mean(probs[valid_ind, ind[valid_ind]])
                    rec_res[indices[beg_img_no + rno]] = [preds_text, score]
            elif self.loss_type == 'srn':
                rec_idx_batch = self.output_tensors[0].copy_to_cpu()
                probs = self.output_tensors[1].copy_to_cpu()
                char_num = self.char_ops.get_char_num()
                total_preds = rec_idx_batch.reshape(-1)
                elapse = time.time() - starttime
                predict_time += elapse
                for ino in range(int(len(rec_idx_batch) / self.text_len)):
                    beg = ino * self.text_len
                    end = beg + self.text_len
                    preds = total_preds[beg:end]
                    # Score each image from its own rows. Assumes ``probs``
                    # has one row per decoding step in the same order as
                    # ``rec_idx_batch`` (TODO confirm); identical to the
                    # previous behaviour for single-image batches.
                    img_probs = probs[beg:end]
                    ind = np.argmax(img_probs, axis=1)
                    valid_ind = np.where(preds != int(char_num - 1))[0]
                    if len(valid_ind) == 0:
                        continue
                    score = np.mean(img_probs[valid_ind, ind[valid_ind]])
                    preds = preds[:valid_ind[-1] + 1]
                    preds_text = self.char_ops.decode(preds)

                    rec_res[indices[beg_img_no + ino]] = [preds_text, score]
            else:
                rec_idx_batch = self.output_tensors[0].copy_to_cpu()
                predict_batch = self.output_tensors[1].copy_to_cpu()
                elapse = time.time() - starttime
                predict_time += elapse
                for rno in range(len(rec_idx_batch)):
                    # Position 0 is dropped; when id 1 occurs at least twice
                    # the sequence is cut at its second occurrence.
                    end_pos = np.where(rec_idx_batch[rno, :] == 1)[0]
                    if len(end_pos) <= 1:
                        preds = rec_idx_batch[rno, 1:]
                        score = np.mean(predict_batch[rno, 1:])
                    else:
                        preds = rec_idx_batch[rno, 1:end_pos[1]]
                        score = np.mean(predict_batch[rno, 1:end_pos[1]])
                    preds_text = self.char_ops.decode(preds)
                    rec_res[indices[beg_img_no + rno]] = [preds_text, score]

        return rec_res, predict_time