def predict(img_path, base_model, thresholding=160):
    """Recognise the text in one image with a CTC model.

    The image is converted to greyscale, resized to a height of 32 pixels
    while keeping its aspect ratio, scaled to the range [-0.5, 0.5] and
    passed to ``base_model`` as a batch of one. The prediction is decoded
    with ``K.ctc_decode`` (default arguments) and mapped to characters
    through ``id_to_char``.

    Args:
        img_path: Anything ``Image.open`` accepts.
        base_model: Object whose ``predict`` takes an array of shape
            ``(1, 32, width, 1)``.
        thresholding: Not used by this implementation; kept so existing
            callers that pass it keep working.

    Returns:
        A ``(text, image)`` tuple: the decoded string and the resized
        greyscale PIL image that was fed to the model.
    """
    t = Timer()

    # ``convert`` returns an independent image, so the source file handle
    # can be released as soon as the conversion is done.
    with Image.open(img_path) as img:
        im = img.convert('L')

    scale = im.size[1] * 1.0 / 32
    # Very tall, narrow images would otherwise round down to a zero width,
    # which ``resize`` rejects.
    w = max(1, int(im.size[0] / scale))
    print('w:', w)

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    im = im.resize((w, 32), Image.LANCZOS)
    img = np.array(im).astype(np.float32) / 255.0 - 0.5
    X = img.reshape((1, 32, w, 1))

    t.tic()
    y_pred = base_model.predict(X)
    t.toc()
    print("times,", t.diff)

    # Every batch entry uses all time steps produced by the model.
    input_length = np.ones(y_pred.shape[0]) * y_pred.shape[1]
    out = K.get_value(K.ctc_decode(y_pred, input_length=input_length)[0][0])
    out = u''.join([id_to_char[x] for x in out[0]])
    return out, im
def test_ctc_decode_greedy(self):
    """Test two batch entries - best path decoder."""
    # Test adapted from tensorflow
    max_time_steps = 6

    seq_len_0 = 4
    input_prob_matrix_0 = np.asarray(
        [
            [1.0, 0.0, 0.0, 0.0],  # t=0
            [0.0, 0.0, 0.4, 0.6],  # t=1
            [0.0, 0.0, 0.4, 0.6],  # t=2
            [0.0, 0.9, 0.1, 0.0],  # t=3
            [0.0, 0.0, 0.0, 0.0],  # t=4 (ignored)
            [0.0, 0.0, 0.0, 0.0],  # t=5 (ignored)
        ],
        dtype=np.float32,
    )

    seq_len_1 = 5
    # dimensions are time x depth
    input_prob_matrix_1 = np.asarray(
        [
            [0.1, 0.9, 0.0, 0.0],  # t=0
            [0.0, 0.9, 0.1, 0.0],  # t=1
            [0.0, 0.0, 0.1, 0.9],  # t=2
            [0.0, 0.9, 0.1, 0.1],  # t=3
            [0.9, 0.1, 0.0, 0.0],  # t=4
            [0.0, 0.0, 0.0, 0.0],  # t=5 (ignored)
        ],
        dtype=np.float32,
    )

    # len max_time_steps array of batch_size x depth matrices
    inputs = [
        np.vstack([input_prob_matrix_0[t, :], input_prob_matrix_1[t, :]])
        for t in range(max_time_steps)
    ]

    # change tensorflow order to keras backend order
    inputs = KTF.variable(np.asarray(inputs).transpose((1, 0, 2)))

    # batch_size length vector of sequence_lengths
    input_length = KTF.variable(np.array([seq_len_0, seq_len_1], dtype=np.int32))

    # batch_size length vector of negative log probabilities
    log_prob_truth = np.array(
        [
            np.sum(-np.log([1.0, 0.6, 0.6, 0.9])),
            np.sum(-np.log([0.9, 0.9, 0.9, 0.9, 0.9])),
        ],
        np.float32,
    )[:, np.newaxis]

    # keras output, unlike tensorflow, is a dense (not sparse) tensor
    decode_truth = np.array([[0, 1, -1], [1, 1, 0]])

    decode_pred_tf, log_prob_pred_tf = KTF.ctc_decode(inputs, input_length, greedy=True)

    assert len(decode_pred_tf) == 1

    decode_pred = KTF.eval(decode_pred_tf[0])
    log_prob_pred = KTF.eval(log_prob_pred_tf)

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(decode_truth == decode_pred)
    assert np.allclose(log_prob_truth, log_prob_pred)
def predict_model(model, input_):
    """Run ``model`` on ``input_`` and return the CTC-decoded labels.

    Args:
        model: Object whose ``predict`` returns a 3-D array (it is indexed
            with three slices below).
        input_: Batch passed straight to ``model.predict``.

    Returns:
        The value of the first decoded path from ``K.ctc_decode``, fetched
        with ``K.get_value``. The result is returned in full; no columns
        are cut off.
    """
    probabilities = model.predict(input_)[:, :, :]
    batch_size = probabilities.shape[0]
    time_steps = probabilities.shape[1]

    # Each batch entry is decoded over all of its time steps.
    lengths = np.ones(batch_size) * time_steps
    best_path = K.ctc_decode(probabilities, input_length=lengths)[0][0]
    return K.get_value(best_path)
def test_ctc_decode_greedy(self):
    """Test two batch entries - best path decoder."""
    # Test adapted from tensorflow
    max_time_steps = 6

    seq_len_0 = 4
    input_prob_matrix_0 = np.asarray(
        [[1.0, 0.0, 0.0, 0.0],  # t=0
         [0.0, 0.0, 0.4, 0.6],  # t=1
         [0.0, 0.0, 0.4, 0.6],  # t=2
         [0.0, 0.9, 0.1, 0.0],  # t=3
         [0.0, 0.0, 0.0, 0.0],  # t=4 (ignored)
         [0.0, 0.0, 0.0, 0.0]],  # t=5 (ignored)
        dtype=np.float32)

    seq_len_1 = 5
    # dimensions are time x depth
    input_prob_matrix_1 = np.asarray(
        [[0.1, 0.9, 0.0, 0.0],  # t=0
         [0.0, 0.9, 0.1, 0.0],  # t=1
         [0.0, 0.0, 0.1, 0.9],  # t=2
         [0.0, 0.9, 0.1, 0.1],  # t=3
         [0.9, 0.1, 0.0, 0.0],  # t=4
         [0.0, 0.0, 0.0, 0.0]],  # t=5 (ignored)
        dtype=np.float32)

    # len max_time_steps array of batch_size x depth matrices
    inputs = [np.vstack([input_prob_matrix_0[t, :],
                         input_prob_matrix_1[t, :]])
              for t in range(max_time_steps)]

    # change tensorflow order to keras backend order
    inputs = KTF.variable(np.asarray(inputs).transpose((1, 0, 2)))

    # batch_size length vector of sequence_lengths
    input_length = KTF.variable(np.array([seq_len_0, seq_len_1], dtype=np.int32))

    # batch_size length vector of negative log probabilities
    log_prob_truth = np.array([
        np.sum(-np.log([1.0, 0.6, 0.6, 0.9])),
        np.sum(-np.log([0.9, 0.9, 0.9, 0.9, 0.9]))
    ], np.float32)[:, np.newaxis]

    # keras output, unlike tensorflow, is a dense (not sparse) tensor
    decode_truth = np.array([[0, 1, -1], [1, 1, 0]])

    decode_pred_tf, log_prob_pred_tf = KTF.ctc_decode(inputs, input_length, greedy=True)

    assert len(decode_pred_tf) == 1

    decode_pred = KTF.eval(decode_pred_tf[0])
    log_prob_pred = KTF.eval(log_prob_pred_tf)

    # np.alltrue no longer exists in NumPy 2.0; np.all behaves the same.
    assert np.all(decode_truth == decode_pred)
    assert np.allclose(log_prob_truth, log_prob_pred)
def eval(model, sample, sample_target):
    """Compute and print the decoded output for one individual sample.

    Args:
        model: Model providing ``predict`` and a layer named ``'pred'``.
        sample: 2-D array; it is reshaped into a batch holding one entry.
        sample_target: Expected output, printed next to the prediction.
    """
    batch = sample.reshape(1, sample.shape[0], sample.shape[1])
    log_prob = model.predict(batch)

    # The decode length is the second dimension of the 'pred' layer's
    # output shape, wrapped in a one-element array for the single entry.
    steps = model.get_layer('pred').output_shape[1]
    decoded = K.ctc_decode(log_prob, input_length=np.asarray(steps).reshape(1,))

    # The session context is needed so that ``Tensor.eval`` has a default
    # session to run in.
    with tf.Session():
        print("sample target", sample_target)
        print("predicted", decoded[0][0].eval())
def get_tensorflow_decoder(output_tensor, beam_size=1024):
    """Build a backend function that runs the TensorFlow CTC beam-search decoder.

    Args:
        output_tensor: Tensor to decode; it also serves as the single input
            placeholder of the returned function.
        beam_size: Beam width handed to ``K.ctc_decode``.

    Returns:
        A ``K.function`` mapping ``[output_tensor]`` to a one-element list
        holding the first decoded path.
    """
    # Collapse axis 2, then count the entries along axis 1 by summing a
    # tensor of ones; the count is used as every entry's sequence length.
    collapsed = tf.reduce_max(output_tensor, 2)
    sequence_length = tf.cast(tf.reduce_sum(tf.ones_like(collapsed), 1), tf.int32)

    decoded_paths, _ = K.ctc_decode(
        output_tensor,
        sequence_length,
        greedy=False,
        beam_width=beam_size,
    )
    best_path = decoded_paths[0]
    return K.function([output_tensor], [best_path])
def test_ctc_decode_beam_search(self):
    """Test one batch, two beams - hibernating beam search."""

    depth = 6

    seq_len_0 = 5
    input_prob_matrix_0 = np.asarray(
        [[0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
         [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
         [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
         [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
         [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
         # Random entry added in at time=5
         [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]],
        dtype=np.float32)

    # len max_time_steps array of batch_size x depth matrices
    inputs = ([input_prob_matrix_0[t, :][np.newaxis, :]
               for t in range(seq_len_0)] +
              # Pad with two all-zero time steps (seq_len_0 + 2 = 7 in total)
              2 * [np.zeros((1, depth), dtype=np.float32)])

    # change tensorflow order to keras backend order
    inputs = KTF.variable(np.asarray(inputs).transpose((1, 0, 2)))

    # batch_size length vector of sequence_lengths
    input_length = KTF.variable(np.array([seq_len_0], dtype=np.int32))

    # batch_size length vector of negative log probabilities
    log_prob_truth = np.array([
        0.584855,  # output beam 0
        0.389139  # output beam 1
    ], np.float32)[np.newaxis, :]

    decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]

    beam_width = 2
    top_paths = 2

    decode_pred_tf, log_prob_pred_tf = KTF.ctc_decode(inputs, input_length,
                                                      greedy=False,
                                                      beam_width=beam_width,
                                                      top_paths=top_paths)

    assert len(decode_pred_tf) == top_paths

    log_prob_pred = KTF.eval(log_prob_pred_tf)

    for i in range(top_paths):
        # np.alltrue was removed in NumPy 2.0; np.all is the replacement.
        assert np.all(decode_truth[i] == KTF.eval(decode_pred_tf[i]))

    assert np.allclose(log_prob_truth, log_prob_pred)
def predict(img_path, base_model, thresholding=160):
    """Recognise the text in one image after binarising it.

    The image is converted to greyscale, resized to 160x32, binarised,
    scaled to the range [-0.5, 0.5] and passed to ``base_model`` as a batch
    of one. The prediction is decoded with ``K.ctc_decode`` (default
    arguments) and mapped to characters through ``id_to_char``.

    Args:
        img_path: Anything ``Image.open`` accepts.
        base_model: Object whose ``predict`` takes an array of shape
            ``(1, 32, 160, 1)``.
        thresholding: Binarisation threshold, clamped to 0-255; defaults
            to 160.
            0   : use automatic (adaptive Gaussian) thresholding.
            > 0 : use this manual threshold; brighter pixels become 255,
                  all others 0.

    Returns:
        A ``(text, image)`` tuple: the decoded string and the resized
        greyscale PIL image (before binarisation).
    """
    thresholding = min(max(thresholding, 0), 255)

    t = Timer()

    # ``convert`` returns an independent image, so the source file handle
    # can be released immediately.
    with Image.open(img_path) as img:
        im = img.convert('L')

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    im = im.resize((160, 32), Image.LANCZOS)
    img = np.array(im)

    if thresholding == 0:
        img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 3, 5)
    else:
        # Vectorised equivalent of testing every pixel in a Python loop.
        img = np.where(img > thresholding, 255, 0)

    img = img.astype(np.float32) / 255.0 - 0.5
    X = img.reshape((1, 32, 160, 1))

    t.tic()
    y_pred = base_model.predict(X)
    t.toc()

    # Every batch entry uses all time steps produced by the model.
    input_length = np.ones(y_pred.shape[0]) * y_pred.shape[1]
    out = K.get_value(K.ctc_decode(y_pred, input_length=input_length)[0][0])
    out = u''.join([id_to_char[x] for x in out[0]])
    return out, im
def test_model(model, X_test, Y_test):
    """Print the exact-match accuracy of ``model`` on a labelled test set.

    A sample counts as correct only when all of its first ``MAX_CAPTCHA``
    decoded labels equal the corresponding row of ``Y_test``.

    Args:
        model: Object whose ``predict`` returns a 3-D array.
        X_test: Inputs passed to ``model.predict``.
        Y_test: Label array with one row per sample (assumed to have
            ``MAX_CAPTCHA`` columns - confirm against the caller).
    """
    print("X_test:", X_test.shape)
    print("Y_test:", Y_test.shape)

    y_pred = model.predict(X_test)
    batch_size, time_steps = y_pred.shape[0], y_pred.shape[1]
    decoded = K.ctc_decode(y_pred, input_length=np.ones(batch_size) * time_steps)[0][0]
    out = K.get_value(decoded)[:, :MAX_CAPTCHA]

    # The dense decode result is only as wide as the longest decoded
    # sequence. When that is shorter than MAX_CAPTCHA, pad with -1 so the
    # comparison neither fails on mismatched shapes nor broadcasts a
    # single column across every label.
    missing = MAX_CAPTCHA - out.shape[1]
    if missing > 0:
        out = np.pad(out, ((0, 0), (0, missing)), mode='constant', constant_values=-1)

    exact = np.all(out == Y_test, axis=1)
    # An empty batch has no meaningful accuracy; report 0.0 rather than
    # dividing by zero.
    accur_score = float(np.mean(exact)) if exact.size else 0.0
    print("accur_score:", accur_score)
def predict(img_path, base_model):
    """Recognise the text in one image with a CTC model.

    The image is converted to greyscale, resized to a height of 32 pixels
    while keeping its aspect ratio, scaled to the range [-0.5, 0.5] and
    passed to ``base_model`` as a batch of one. The per-step argmax and the
    type of the decode tensor are printed for debugging.

    Args:
        img_path: Anything ``Image.open`` accepts.
        base_model: Object whose ``predict`` takes an array of shape
            ``(1, 32, width, 1)``.

    Returns:
        A ``(text, array)`` tuple: the decoded string and the normalised
        float32 image array that was fed to the model.
    """
    # ``convert`` returns an independent image, so the source file handle
    # can be released immediately.
    with Image.open(img_path) as src:
        img = src.convert('L')

    w, h = img.size
    # ``* 1.0`` keeps this a true division on Python 2 as well.
    rate = w * 1.0 / h
    # Computed once and never allowed to reach zero, which ``resize``
    # would reject.
    width = max(1, int(rate * 32))

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    img = img.resize((width, 32), Image.LANCZOS)
    img = np.array(img).astype(np.float32) / 255.0 - 0.5
    x = img.reshape(1, 32, width, 1)

    y_pred = base_model.predict(x)
    print(np.argmax(y_pred, axis=2)[0])

    # Build the decode op once and reuse it for both the debug print and
    # the actual value, instead of constructing it twice.
    input_length = np.ones(y_pred.shape[0]) * y_pred.shape[1]
    decoded = K.ctc_decode(y_pred, input_length=input_length)[0][0]
    print(type(decoded))

    out = K.get_value(decoded)
    out = u''.join([id_to_char[x] for x in out[0]])
    return out, img
def eval(self, sample, sample_target):
    """Decode a single sample and print it next to its target.

    Args:
        sample: 2-D array; it is reshaped to
            ``(1, sample.shape[0], sample.shape[1])`` before prediction.
        sample_target: Expected output, printed for comparison only.

    Returns:
        None. The target and the decoded prediction are printed.
    """
    _input = sample.reshape(1, sample.shape[0], sample.shape[1])
    log_prob = self.predict_model.predict(_input)

    # The decode length is the second dimension of the 'pred' layer's
    # output shape, wrapped in a one-element array for the single entry.
    input_length = np.asarray(self.model.get_layer('pred').output_shape[1]).reshape(1,)
    output = K.ctc_decode(log_prob, input_length=input_length)

    # ``tf.Session().as_default()`` installs the session as default but
    # never closes it, leaking one session per call. Using the session
    # itself as the context manager installs it and closes it on exit.
    with tf.Session() as sess:
        print("sample target", sample_target)
        print("predicted", sess.run(output[0][0]))
def predict(img_path, base_model, thresholding=160):
    """Recognise the text in one image using a fixed 32x512 model input.

    The image is converted to greyscale, resized to a height of 32 pixels
    while keeping its aspect ratio, scaled to the range [-0.5, 0.5] and
    copied into the left part of a zero-filled 32x512 canvas, which is
    passed to ``base_model`` as a batch of one.

    Args:
        img_path: Anything ``Image.open`` accepts.
        base_model: Object whose ``predict`` takes an array of shape
            ``(1, 32, 512, 1)``.
        thresholding: Not used by this implementation (binarisation is
            disabled); kept so existing callers that pass it keep working.

    Returns:
        A ``(text, canvas)`` tuple: the decoded string and the 32x512
        array that was fed to the model.
    """
    canvas_width = 512

    t = Timer()

    # ``convert`` returns an independent image, so the source file handle
    # can be released immediately.
    with Image.open(img_path) as src:
        img = src.convert('L')

    w, h = img.size
    # ``* 1.0`` keeps this a true division on Python 2 as well.
    rate = w * 1.0 / h
    # Clamp the width to [1, canvas_width]. An image wider than the canvas
    # used to fail on the slice assignment below; it is now squeezed
    # horizontally to fit instead.
    width = min(canvas_width, max(1, int(rate * 32)))

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    img = img.resize((width, 32), Image.LANCZOS)
    img = np.array(img).astype(np.float32) / 255.0 - 0.5

    t_img = np.zeros((32, canvas_width))
    t_img[:, :width] = img
    X = t_img.reshape((1, 32, canvas_width, 1))

    t.tic()
    y_pred = base_model.predict(X)
    t.toc()
    print("times,", t.diff)

    # Every batch entry uses all time steps produced by the model.
    input_length = np.ones(y_pred.shape[0]) * y_pred.shape[1]
    out = K.get_value(K.ctc_decode(y_pred, input_length=input_length)[0][0])
    out = u''.join([id_to_char[x] for x in out[0]])
    return out, t_img
def __text_recognition(self):
    """Recognise the text inside every box and store it in ``_rec_results``.

    For each entry of ``self._boxes`` the region with rows
    ``box[1]:box[7]`` and columns ``box[0]:box[6]`` is cut out of
    ``self._image``, converted to greyscale, resized to a height of 32
    pixels with the aspect ratio kept, scaled to [-0.5, 0.5] and run
    through ``self._ocr_model``. The CTC-decoded string is appended to
    ``self._rec_results``, which is reset at the start of the call.
    """
    self._rec_results = []
    for box in self._boxes:
        left, top, right, bottom = box[0], box[1], box[6], box[7]
        crop = self._image[top:bottom, left:right]
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)

        # Width that keeps the aspect ratio at the fixed height of 32.
        scale = gray.shape[0] * 1.0 / 32
        w = int(gray.shape[1] / scale)

        resized = cv2.resize(gray, (w, 32))
        normalised = np.array(resized).astype(np.float32) / 255.0 - 0.5
        X = np.array([normalised.reshape((32, w, 1))])

        y_pred = self._ocr_model.predict(X)[:, :, :]
        # Every batch entry uses all time steps produced by the model.
        lengths = np.ones(y_pred.shape[0]) * y_pred.shape[1]
        decoded = K.get_value(K.ctc_decode(y_pred, input_length=lengths)[0][0])[:, :]

        self._rec_results.append(u''.join([id_to_char[x] for x in decoded[0]]))