class TextRecognizer:
    """Wrap an ``HTRModel`` and its ``Tokenizer`` to read text from images.

    Constructing an instance immediately calls ``ml_utils.limit_gpu_memory()``
    and then builds the model and loads its checkpoint, so construction is
    the expensive step; the ``read_*`` methods reuse the loaded model.
    """

    def __init__(self,
                 checkpoint_path="./py/checkpoint_recognizer.hdf5",
                 input_size=(1024, 128, 1),
                 max_text_length=128,
                 charset_base=string.printable[:95],
                 architecture="flor"):
        """Store the configuration and load the model.

        Args:
            checkpoint_path: Path handed to ``HTRModel.load_checkpoint``.
            input_size: Input size forwarded to both the preprocessing step
                and ``HTRModel``.
            max_text_length: Maximum text length forwarded to ``Tokenizer``.
            charset_base: Characters forwarded to ``Tokenizer``; defaults to
                the first 95 characters of ``string.printable``.
            architecture: Architecture name forwarded to ``HTRModel``.
        """
        self.tokenizer = None
        self.model = None

        self.checkpoint_path = checkpoint_path
        self.input_size = input_size
        self.max_text_length = max_text_length
        self.charset_base = charset_base
        self.architecture = architecture

        # Configure GPU memory before the model is created.
        ml_utils.limit_gpu_memory()
        self.load_model()

    def load_model(self):
        """Build the tokenizer and model, then load the checkpoint weights."""
        self.tokenizer = Tokenizer(chars=self.charset_base,
                                   max_text_length=self.max_text_length)
        self.model = HTRModel(architecture=self.architecture,
                              input_size=self.input_size,
                              vocab_size=self.tokenizer.vocab_size,
                              top_paths=10)
        self.model.compile()
        self.model.load_checkpoint(target=self.checkpoint_path)

    def read_all_text_from_images(self, images):
        """Return a list with the recognized text of each image, in order.

        Each image is processed independently through
        ``read_text_from_image``; an empty iterable yields an empty list.
        """
        return [self.read_text_from_image(img) for img in images]

    def read_text_from_image(self, img):
        """Return the first decoded candidate for ``img``.

        The image is preprocessed and normalized as a one-element batch.
        Returns ``""`` when the model produces no prediction (or no
        candidate) for the image.
        """
        img = pp.preprocess(img, input_size=self.input_size)
        x_test = pp.normalization([img])

        predicts, _ = self.model.predict(x_test, ctc_decode=True)

        # The batch holds a single image, so only its first entry matters.
        if len(predicts) == 0:
            return ""

        candidates = predicts[0]
        if len(candidates) == 0:
            # Previously this case raised IndexError; fall back to the same
            # empty string used when there is no prediction at all.
            return ""

        # Decode only the candidate that is returned instead of every path.
        # NOTE(review): assumes the first candidate is the most likely one
        # and that Tokenizer.decode has no side effects - confirm.
        return self.tokenizer.decode(candidates[0])
tokenizer = Tokenizer(chars=charset_base, max_text_length=max_text_length) img = pp.preproc(args.image, input_size=input_size) x_test = pp.normalization([img]) model = HTRModel(architecture=args.arch, input_size=input_size, vocab_size=tokenizer.vocab_size, top_paths=10) model.compile() model.load_checkpoint(target=target_path) predicts, probabilities = model.predict(x_test, ctc_decode=True) predicts = [[tokenizer.decode(x) for x in y] for y in predicts] print("\n####################################") for i, (pred, prob) in enumerate(zip(predicts, probabilities)): print("\nProb. - Predict") for (pd, pb) in zip(pred, prob): print(f"{pb:.4f} - {pd}") cv2.imshow(f"Image {i + 1}", pp.adjust_to_see(img)) print("\n####################################") cv2.waitKey(0) else: assert os.path.isfile(source_path) or os.path.isfile(target_path) os.makedirs(output_path, exist_ok=True)
mask = model.generate_square_subsequent_mask(i + 1).to( 'cuda') trg_tensor = torch.LongTensor(out_indexes).unsqueeze( 1).to(device) output = model.vocab( model.transformer.decoder(model.query_pos( model.decoder(trg_tensor)), memory, tgt_mask=mask)) out_token = output.argmax(2)[-1].item() out_indexes.append(out_token) # print(output.shape) if out_token == 3: break predicts.append(tokenizer.decode(out_indexes)) gt.append(tokenizer.decode(trg.flatten(0, 1))) predicts = list( map(lambda x: x.replace('SOS', '').replace('EOS', ''), predicts)) gt = list( map(lambda x: x.replace('SOS', '').replace('EOS', ''), gt)) evaluate = evaluation.ocr_metrics( predicts=predicts, ground_truth=gt, norm_accentuation=args.norm_accentuation, norm_punctuation=args.norm_punctuation) print( "Calculate Character Error Rate {}, Word Error Rate {} and Sequence Error Rate {}"