def get_sub_area_text(self, sub_im):
    """Segment a text-line image into column ranges and recognise its text.

    The image is cut using its vertical projection, every resulting piece is
    refined with ``self.split_sub_ranges``, and the collected ranges are
    passed to ``self.analyze_sub_ranges`` together with a TCP socket
    connected to the recognition service at 127.0.0.1:8009.

    Args:
        sub_im: 2-D image array indexed as ``[row, column]``.  Presumably a
            binarised image with a white (255) background -- TODO confirm
            against ``prj.get_image_projection``.

    Returns:
        Whatever ``self.analyze_sub_ranges`` returns for the image.

    Raises:
        OSError: if the recognition service cannot be reached.
    """
    # 1. Vertical projection, scaled by (row count * 255).
    prj_val = prj.get_image_projection(sub_im, 'ver')
    ratio_val = np.fromiter(prj_val, dtype=np.float32) / (sub_im.shape[0] * 255)

    # 2. Coarse column ranges derived from the projection (threshold 0.995).
    range_list = prj.get_range_list(ratio_val, 0.995)

    # 3. Refine every coarse range into finer sub-ranges.
    sub_ranges = []
    for rg in range_list:
        tmp_im = sub_im[:, rg.begin:rg.begin + rg.get_length()]
        sub_ranges.extend(self.split_sub_ranges(tmp_im, rg))

    # 4. Recognise the ranges through the network service.  The ``with``
    #    block guarantees the socket is closed even when recognition raises;
    #    the 'bye' farewell is only sent after a successful run.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.connect(("127.0.0.1", 8009))
        text = self.analyze_sub_ranges(sub_im, sub_ranges, client)
        client.send('bye'.encode())
    return text
def get_sub_area_text(self, sub_im):
    """Recognise the text of a text-line image using beam-search segmentation.

    The blank margins on the left and right of the image are trimmed, then
    ``self.beam_search_solution`` is run up to four times with a threshold
    that starts at 0.98 and drops by 0.05 whenever the candidate solution
    contains a negative probability (or is empty with a negative score).
    The first acceptable solution additionally gets a repair pass for words
    whose probability is more than one standard deviation below the mean.

    A solution is indexed as ``[split point ids, words, average
    probability]``; each word is indexed as ``[text, probability, type,
    (begin, end)]``.

    Args:
        sub_im: 2-D image array indexed as ``[row, column]``.

    Returns:
        ``(text, score)`` where ``text`` concatenates the word texts of the
        best-scoring solution and ``score`` is that solution's third field;
        ``(None, 0)`` when the projection yields no content range.

    Raises:
        OSError: if the recognition service at 127.0.0.1:8009 is unreachable.
    """
    # 1. Vertical projection, scaled by (row count * 255).
    prj_val = prj.get_image_projection(sub_im, 'ver')
    ratio_val = np.fromiter(prj_val, dtype=np.float32) / (sub_im.shape[0] * 255)

    # Trim the blank margins on the left and right.  This happens before the
    # socket is opened so that the early return cannot leak a connection.
    range_list = prj.get_range_list(ratio_val, 0.995)
    if len(range_list) == 0:
        return None, 0
    left = range_list[0].begin
    right = range_list[len(range_list) - 1].end + 1
    sub_im = sub_im[:, left:right]
    ratio_val = ratio_val[left:right]

    # Cache of OCR results shared by all attempts to reduce OCR calls.
    ocr_cache = {}
    threshold = 0.98
    best_sol = None  # best-scoring solution seen so far

    # 2. Beam search with a decreasing threshold.  The ``with`` block closes
    #    the socket on every exit path, including exceptions.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.connect(("127.0.0.1", 8009))

        for _attempt in range(4):
            print('try threshold ', str(threshold))
            sol = self.beam_search_solution(sub_im, ratio_val, client,
                                            ocr_cache, threshold)
            if best_sol is None or best_sol[2] < sol[2]:
                best_sol = sol

            # Retry with a lower threshold when a word has a negative
            # probability, or when nothing was recognised at all.
            word_set = sol[1]
            needs_retry = any(word[1] < 0 for word in word_set) or (
                len(word_set) == 0 and sol[2] < 0)
            if needs_retry:
                threshold = threshold - 0.05
                continue

            # Look for words whose probability is clearly below the average
            # and try to repair them.  Zero probabilities are left out of
            # the statistics.
            prop_list = [word[1] for word in word_set if word[1] != 0]
            # np.float64 replaces the ``np.float`` alias removed from NumPy.
            prop_ver = np.fromiter(prop_list, dtype=np.float64)
            std = np.std(prop_ver)
            mean = np.mean(prop_ver)
            for word in word_set:
                # Within the tolerated deviation: nothing to repair.
                if word[1] >= mean - std:
                    continue
                # Chinese character: try splitting it into two halves and
                # keep the split when both halves are non-Chinese and their
                # mean probability is higher.
                if word[2] == 'cn':
                    begin_pos, end_pos = word[3]
                    mid_pos = int((begin_pos + end_pos) / 2)
                    result1 = self.get_split_area_text(
                        sub_im, ratio_val, begin_pos, mid_pos, client)
                    result2 = self.get_split_area_text(
                        sub_im, ratio_val, mid_pos, end_pos, client)
                    if (word[1] < (result1[1] + result2[1]) / 2.0
                            and result1[2] != 'cn' and result2[2] != 'cn'):
                        word[0] = result1[0] + result2[0]
                # Non-Chinese character: merging with its neighbours is not
                # implemented yet; only a marker is printed.
                if word[2] == 'en' or word[2] == 'ccn':
                    print('important')

            # NOTE(review): the original indentation was lost; this break is
            # placed at retry-loop level (stop after the first acceptable
            # solution) -- confirm against the upstream file.
            break

        # Tell the service that the session is over.
        client.send('bye'.encode())

    # 3. Assemble the text of the best solution.
    text = ''.join(word[0] for word in best_sol[1])
    return text, best_sol[2]
def create(self):
    """Build a TFRecord file from the images listed in ``<base_dir>/idx.dat``.

    Every line of ``idx.dat`` is split on single spaces into
    ``<image path relative to base_dir> <integer label> <word>``.  Parsing
    stops at the first line with fewer than three fields.  The entries are
    shuffled, every image is cropped to its content columns, centred on a
    white square canvas when it is narrower than tall, passed through
    ``self.encode_img`` with a (48, 48) target and written as one
    ``tf.train.Example`` to ``self.tfrecords_filename``.

    Images that cannot be decoded, or whose vertical projection yields no
    content range, are skipped.
    """
    tfr_writer = tf.python_io.TFRecordWriter(self.tfrecords_filename)
    try:
        # Read the index file; ``with`` closes it even if parsing raises.
        img_files = []
        with open(self.base_dir + '/idx.dat', encoding='utf-8') as fr:
            for line in fr:
                fields = line.split(' ')
                if len(fields) < 3:
                    print('err data', fields)
                    break
                # NOTE(review): the line is not stripped, so the word keeps
                # a trailing newline when it is the last field -- confirm
                # that this is what the readers of the records expect.
                img_files.append(
                    (self.base_dir + fields[0], int(fields[1]), fields[2]))

        random.shuffle(img_files)

        count = 0
        for _img_path, _label, _word in img_files:
            print('process %d: %s...' % (count, _img_path))
            # np.fromfile + imdecode instead of imread; flags=0 -> grayscale.
            _img = cv2.imdecode(np.fromfile(_img_path, dtype=np.uint8), flags=0)
            if _img is None:
                # Undecodable file: skip it instead of crashing the run.
                continue

            # Vertical projection to find the columns that hold content.
            prj_val_ver = prj.get_image_projection(_img, 'ver')
            ratio_val = np.fromiter(prj_val_ver, dtype=np.float32) / (
                _img.shape[0] * 255)
            range_list_ver = prj.get_range_list(ratio_val, 1)
            if len(range_list_ver) == 0:
                continue
            left = range_list_ver[0].begin
            right = range_list_ver[len(range_list_ver) - 1].end + 1
            _img = _img[:, left:right]

            # Narrower than tall: centre it on a white square canvas.
            if _img.shape[1] * 1.0 / _img.shape[0] < 1:
                _new_img = np.zeros((_img.shape[0], _img.shape[0]),
                                    dtype=np.uint8)
                _new_img = (_new_img + 1) * 255
                x_pos = int((_new_img.shape[1] - _img.shape[1]) / 2)
                _new_img[:, x_pos:x_pos + _img.shape[1]] = _img
                _img = _new_img

            # Fit the image into the 48x48 sample format.
            _img, _shape = self.encode_img(_img, (48, 48))
            _height = _shape[0]
            _width = _shape[1]

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'height': self._int64_feature(_height),
                    'width': self._int64_feature(_width),
                    'word': self._bytes_feature(
                        _word.encode(encoding="utf-8")),
                    # tobytes() replaces tostring(), removed in NumPy 2.0.
                    'img_raw': self._bytes_feature(_img.tobytes()),
                    'label': self._int64_feature(_label)
                }))
            tfr_writer.write(example.SerializeToString())
            count += 1
    finally:
        # Always release the writer, including on errors.
        tfr_writer.close()
def inference_images_with_server(self, sess, endpoints, images):
    """Pre-process image files and run them through the recognition network.

    Each file is read as grayscale, binarised with Otsu's threshold, padded
    with white rows when its top or bottom margin is too small, cropped to
    its content columns, centred on a white square canvas when narrower than
    tall, resized to ``SAMPLE_SIZE`` x ``SAMPLE_SIZE``, scaled to [0, 1] and
    flattened into one input row.

    Args:
        sess: object whose ``run(fetches, feed_dict=...)`` evaluates the
            network (presumably a TensorFlow session -- TODO confirm).
        endpoints: mapping providing the entries ``'predict_val_top3'``,
            ``'predict_idx_top3'`` and ``'inputs.data'``.
        images: iterable of image file paths.

    Returns:
        A nested list with one entry per processed image; each entry lists
        the candidates as ``[map_id_cw[idx][0], map_id_cw[idx][1], value]``.
        ``None`` when a file is missing or undecodable, or when no image
        produced an input row.

        NOTE(review): images without any content are skipped silently, so
        the result can have fewer entries than ``images``.
    """
    rows = []
    for img_file in images:
        # 1. Read the file as grayscale.
        if not os.path.exists(img_file):
            return None
        im = cv2.imdecode(np.fromfile(img_file, dtype=np.uint8), flags=0)
        if im is None:
            return None

        # 2. Binarise.
        im = cv2.threshold(im, 0, 255,
                           cv2.THRESH_OTSU | cv2.THRESH_BINARY)[1]

        # 3. Vertical projection: columns that hold content.
        prj_val_ver = prj.get_image_projection(im, 'ver')
        ratio_val = np.fromiter(prj_val_ver, dtype=np.float32) / (
            im.shape[0] * 255)
        range_list_ver = prj.get_range_list(ratio_val, 1)
        if len(range_list_ver) == 0:
            continue

        # Horizontal projection: rows that hold content.
        prj_val_hor = prj.get_image_projection(im, 'hor')
        ratio_val = np.fromiter(prj_val_hor, dtype=np.float32) / (
            im.shape[1] * 255)
        range_list_hor = prj.get_range_list(ratio_val, 1)
        if len(range_list_hor) == 0:
            continue

        # Guarantee a minimum blank margin: at least max(3 px, 20 % of the
        # height) at the top and max(2 px, 15 % of the height) at the bottom.
        height = im.shape[0]
        top_empty = range_list_hor[0].begin
        bottom_empty = height - 1 - range_list_hor[len(range_list_hor) - 1].end
        top_empty_dif = max(max(3, int(height * 0.2)) - top_empty, 0)
        bottom_empty_dif = max(max(2, int(height * 0.15)) - bottom_empty, 0)
        if top_empty_dif > 0 or bottom_empty_dif > 0:
            # NOTE(review): this canvas is float64 while the square canvas
            # below is uint8; kept as in the original -- confirm intent.
            _new_img = np.zeros(
                (height + top_empty_dif + bottom_empty_dif, im.shape[1]))
            _new_img = (_new_img + 1) * 255
            _new_img[top_empty_dif:top_empty_dif + height, :] = im
            im = _new_img

        # Crop to the content columns (padding rows leaves columns intact).
        left = range_list_ver[0].begin
        right = range_list_ver[len(range_list_ver) - 1].end + 1
        im = im[:, left:right]

        # Narrower than tall: centre it on a white square canvas.
        if im.shape[1] * 1.0 / im.shape[0] < 1:
            _new_img = np.zeros((im.shape[0], im.shape[0]), dtype=np.uint8)
            _new_img = (_new_img + 1) * 255
            x_pos = int((_new_img.shape[1] - im.shape[1]) / 2)
            _new_img[:, x_pos:x_pos + im.shape[1]] = im
            im = _new_img

        # 4. Resize to the sample size expected by the network.
        im = cv2.resize(im, (SAMPLE_SIZE, SAMPLE_SIZE),
                        interpolation=cv2.INTER_CUBIC)

        # 5. Normalise to [0, 1].
        im = im.astype('float')
        im = im / 255.0

        # 6. Flatten into a single row.
        im = im.reshape([1, SAMPLE_SIZE * SAMPLE_SIZE])

        # 7. Collect the row.  Rows are joined once after the loop: the
        #    previous incremental concatenate mixed 2-D and 3-D operands and
        #    raised as soon as a second image arrived.
        rows.append(im)

    if not rows:
        # Nothing to feed to the network.
        return None
    img_matrix = np.concatenate(rows)

    # Run the network.
    predict_val, predict_idx = sess.run(
        [endpoints['predict_val_top3'], endpoints['predict_idx_top3']],
        feed_dict={endpoints['inputs.data']: img_matrix})

    # Translate the predicted indices through the id -> character table.
    n_images, n_candidates = predict_idx.shape[:2]
    if n_images == 0:
        return None
    result_mtx = []
    for i in range(n_images):
        rec_words = []
        for j in range(n_candidates):
            entry = map_id_cw[predict_idx[i][j]]
            rec_words.append([entry[0], entry[1], predict_val[i][j]])
        result_mtx.append(rec_words)
    return result_mtx
def split_sub_ranges(self, sub_im, org_range):
    """Split a piece of a text line into candidate character ranges.

    The vertical projection of ``sub_im`` gives coarse ranges.  With ``h``
    the image height, every coarse range is classified by its width:

    * width in [0.7 * h, 1.2 * h]: kept as is (treated as a single
      character);
    * width below 0.7 * h: kept and flagged ``mark = 1`` (pending: digit,
      letter or part of a character, to be decided later);
    * width above 1.2 * h: cut at a regular interval.  For every trial width
      ``w`` in [int(0.7 * h), int(1.2 * h)] and every start offset, the
      root mean square of the projection ratio at the cut columns is
      computed; the plan with the highest value wins.  All pieces produced
      this way are flagged ``mark = 1``.

    Args:
        sub_im: 2-D image array indexed as ``[row, column]``.
        org_range: range passed to ``Range.adjust`` for every result
            (presumably used to translate positions back into the parent
            image -- TODO confirm against ``Range.adjust``).

    Returns:
        List of the values returned by ``Range.adjust``, in left-to-right
        order.
    """
    # 1. Vertical projection, scaled by (row count * 255).
    prj_val = prj.get_image_projection(sub_im, 'ver')
    ratio_val = np.fromiter(prj_val, dtype=np.float32) / (sub_im.shape[0] * 255)

    # 2. Coarse ranges.
    range_list = prj.get_range_list(ratio_val, 0.99)

    # 3. Classify / split every coarse range.
    sub_ranges = []
    h = sub_im.shape[0]
    min_ratio = 0.7
    max_ratio = 1.2
    for cur in range_list:
        range_width = cur.get_length()

        # Roughly square: a single character.
        if h * min_ratio <= range_width <= h * max_ratio:
            sub_ranges.append(cur.adjust(org_range))
            continue

        # Narrow: pending, decided by a later stage.
        if range_width < h * min_ratio:
            cur.mark = 1
            sub_ranges.append(cur.adjust(org_range))
            continue

        # Wide: search for the best regular cutting plan.
        split_plans = []
        for w in range(int(h * min_ratio), int(h * max_ratio) + 1):
            sub_plans = []
            # Try every start offset below one period.
            for pos in range(0, min(w, cur.get_length() - w)):
                length = cur.get_length() - pos
                split_points_count = int(length * 1.0 / w) + 1
                error = 0
                for k in range(0, split_points_count):
                    global_pos = cur.begin + pos + k * w
                    if global_pos >= len(ratio_val):
                        # The last cut falls outside the image: drop it.
                        split_points_count = split_points_count - 1
                        break
                    error = error + ratio_val[global_pos] * ratio_val[global_pos]
                error = math.sqrt(error / split_points_count)
                sub_plans.append(SplitPlan(pos, w, error))
            if len(sub_plans) == 0:
                continue
            # Keep the highest-ranked plan for this width.
            sub_plans.sort()
            split_plans.append(sub_plans[len(sub_plans) - 1])

        if len(split_plans) == 0:
            # No trial width produced a plan (degenerate height): keep the
            # whole range as pending instead of failing with IndexError.
            cur.mark = 1
            sub_ranges.append(cur.adjust(org_range))
            continue

        # Pick the highest-ranked plan overall.
        split_plans = sorted(split_plans)
        max_plan = split_plans[len(split_plans) - 1]
        # When the two best plans are within 5 % of each other, prefer the
        # one that yields more pieces.
        # NOTE(review): the comparison needs only two plans, yet the
        # condition requires more than two -- confirm whether ">= 2" was
        # intended.
        if len(split_plans) > 2:
            plan1 = split_plans[len(split_plans) - 1]
            plan2 = split_plans[len(split_plans) - 2]
            if plan1.errs != 0 and (plan1.errs - plan2.errs) / plan1.errs <= 0.05:
                if plan2.get_split_count(cur) > plan1.get_split_count(cur):
                    max_plan = plan2
        print('maxplan: ', max_plan)

        # Cut according to the chosen plan.  The number of cut points must
        # come from the plan's own width; the leftover search variable ``w``
        # always holds the largest trial width and yields too few cuts.
        length = cur.get_length() - max_plan.pos
        split_points_count = int(length * 1.0 / max_plan.width) + 1

        if max_plan.pos != 0:
            # Leading piece in front of the first cut.
            rg = Range(cur.begin, cur.begin + max_plan.pos - 1)
            rg.mark = 1
            sub_ranges.append(rg.adjust(org_range))

        for k in range(0, split_points_count - 1):
            # TODO: connected-component analysis (keep pieces whose content
            # is fully connected together instead of cutting them).
            rg = Range(
                cur.begin + max_plan.pos + k * max_plan.width,
                cur.begin + max_plan.pos + (k + 1) * max_plan.width - 1)
            rg.mark = 1
            sub_ranges.append(rg.adjust(org_range))

        # Trailing piece after the last cut, if any columns remain.
        k = split_points_count
        if max_plan.pos + (k - 1) * max_plan.width < cur.get_length():
            rg = Range(
                cur.begin + max_plan.pos + (k - 1) * max_plan.width,
                cur.end)
            rg.mark = 1
            sub_ranges.append(rg.adjust(org_range))

    # Return the collected sub-ranges.
    return sub_ranges