def normalized_scale(arr):
    """Return a scale factor derived from the transition count of *arr*.

    The factor is ``35.0 / (arr.shape[1] / peak)`` where ``peak`` is the
    largest value produced by ``horizontal_transitions(arr)``.  When that
    peak is not positive the neutral factor ``1.0`` is returned instead.
    """
    peak_transitions = float(max(horizontal_transitions(arr)))

    # Nothing to normalise against: fall back to the identity scale.
    if not peak_transitions > 0:
        return 1.0

    # Array width (second axis) per transition.
    width_per_transition = arr.shape[1] / peak_transitions
    return 35.0 / width_per_transition
def combine_for_line_ind(self, line_info, lineind=None):
    """Merge the character boxes of one line into final character groups.

    Steps, in order:

    1. The line's indices are sorted left-to-right *in place* (the list
       stored in ``line_info.lines_chars[lineind]`` is reordered).
    2. A scale factor for the line is computed from the horizontal
       transition count of the cropped line image and appended to
       ``self.transitions``.
    3. Neighbouring boxes that overlap horizontally are merged into groups.
    4. If ``line_info.shapes.low_ink`` is truthy, groups are merged again
       according to the low-ink box that contains them and re-sorted by
       their left edge.

    Boxes are indexed as ``(x, y, w, h)``.

    Args:
        line_info: object providing ``lines_chars``, ``get_box(index)``,
            ``shapes`` and, optionally, ``small_cc_lines_chars``.
        lineind: key of the line inside ``line_info.lines_chars``.

    Returns:
        ``[]`` when the line has no characters, otherwise ``None``.  Results
        are reported by appending to ``self.transitions``, ``self.widths``,
        ``self.final_indices``, ``self.final_boxes`` and
        ``self.line_width_means``.
    """
    fli = []  # final line indices: one list of member indices per group
    flb = []  # final line boxes: one combined (x, y, w, h) per group
    line_widths = []

    line = line_info.lines_chars[lineind]
    line.sort(key=lambda x: line_info.get_box(x)[0])
    shapes = line_info.shapes

    if not line:
        return []

    # ------------------------------------------------------------------
    # Scale factor for this line
    # ------------------------------------------------------------------
    line_top = 1000000  # arbitrary high number
    line_bottom = 0
    for k in line:
        kb = line_info.get_box(k)
        if kb[1] < line_top:
            line_top = kb[1]
        if kb[1] + kb[3] > line_bottom:
            line_bottom = kb[1] + kb[3]

    firstbox = line_info.get_box(line[0])
    lastbox = line_info.get_box(line[-1])

    # Horizontal gap between each box and its right-hand neighbour.  The
    # last slot stays 0.  Kept as a numpy array so the scale arithmetic
    # below stays in numpy scalars.
    whitespace = np.zeros(len(line), dtype=int)
    for p, (cur, nxt) in enumerate(zip(line, line[1:])):
        ab = line_info.get_box(cur)
        nab = line_info.get_box(nxt)
        whitespace[p] = nab[0] - (ab[0] + ab[2])
    sum_whitespace = whitespace.sum()

    ln_arr = shapes.img_arr[line_top:line_bottom,
                            firstbox[0]:lastbox[0] + lastbox[2]].copy()

    # Overwrite small connected components with 1 in the crop so they do not
    # contribute to the transition count.  Best effort: a line without
    # small-component data is measured from the unmodified crop.
    try:
        for inx in line_info.small_cc_lines_chars[lineind]:
            box = line_info.get_box(inx)
            ln_arr[box[1] - line_top:box[1] + box[3] - line_top,
                   box[0] - firstbox[0]:box[0] + box[2] - firstbox[0]] = 1
    except (AttributeError, KeyError, IndexError, TypeError):
        pass

    crss_denom = float(max(horizontal_transitions(ln_arr)))
    if crss_denom > 0:
        crossings_val = (ln_arr.shape[1] - sum_whitespace) / crss_denom
        scale = 35.0 / crossings_val
    else:
        scale = 1.0
    self.transitions.append(scale)

    # ------------------------------------------------------------------
    # Merge horizontally overlapping boxes
    # ------------------------------------------------------------------
    def close_group(indices):
        """Record *indices* as a finished group and return its width."""
        gx, gy, gw, gh = combine_many_boxes(
            [line_info.get_box(j) for j in indices])
        fli.append(indices)
        flb.append((gx, gy, gw, gh))
        self.widths.append(gw * scale)
        return gw

    # cb is the (possibly combined) current box, b is the next box.
    # box_attrs is unpacked as (left, right, top, bottom, w, h).
    cur_ind = [line[0]]
    cb = line_info.get_box(cur_ind[0])
    _, red, _, _, w, _ = box_attrs(cb)

    for i in line[1:]:
        b = line_info.get_box(i)
        ledn, _, _, _, wn, _ = box_attrs(b)

        if isinstance(i, str):
            # A string index is the result of a horizontal cut and is
            # likely not an interior contour.
            is_interior = False
        else:
            # The first three hierarchy entries are all negative, i.e. the
            # contour has no peers at its place in the tree and no children
            # (presumably the OpenCV [next, prev, child, parent] layout --
            # confirm against where the hierarchy is built).
            node = line_info.shapes.hierarchy[0][i]
            is_interior = node[0] < 0 and node[1] < 0 and node[2] < 0

        if ledn > red:
            # Next box starts right of the current one: separate characters.
            merge = False
        else:
            # Boxes overlap.  Merge when the next box is an interior contour
            # or when the share of the narrower box left uncovered by the
            # overlap is below self.hangoff.
            narrowest = float(min(wn, w))
            merge = (is_interior or
                     (narrowest - abs(red - ledn)) / narrowest < self.hangoff)

        if merge:
            # cur_ind now contains i, so b is part of the combined box.
            cur_ind.append(i)
            cb = combine_many_boxes([line_info.get_box(j) for j in cur_ind])
        else:
            close_group(cur_ind)
            cur_ind = [i]
            cb = b
        _, red, _, _, w, _ = box_attrs(cb)

    # NOTE(review): only the last group's width is recorded here, so the mean
    # appended to self.line_width_means below equals that single width --
    # confirm this is intended.
    line_widths.append(close_group(cur_ind))

    # ------------------------------------------------------------------
    # Optional regrouping by low-ink boxes
    # ------------------------------------------------------------------
    if shapes.low_ink:
        lib = self.line_info.low_ink_boxes[lineind]
        lib_edges = [(l_edge, r_edge)
                     for l_edge, r_edge, _, _, _, _ in map(box_attrs, lib)]
        low_ink_segmentation = {}
        not_intr = []
        for d, box in enumerate(flb):
            led, gred, _, _, _, _ = box_attrs(box)
            for p, (ledn, ren) in enumerate(lib_edges):
                # Group lies horizontally inside low-ink box p (15 px slack).
                if led >= ledn - 15 and gred <= ren + 15:
                    low_ink_segmentation.setdefault(p, []).extend(fli[d])
                    break
            else:
                # No low-ink box contains this group: keep it unchanged.
                not_intr.append(fli[d])

        # list(...) so this also works where dict.values() is a view.
        all_li_seg = list(low_ink_segmentation.values())
        all_li_seg.extend(not_intr)

        regrouped = []
        for members in all_li_seg:
            x, y, w, h = combine_many_boxes(
                [line_info.get_box(m) for m in members])
            regrouped.append((members, [x, y, w, h]))
            self.widths.append(w * scale)

        # Restore left-to-right order after regrouping.
        regrouped.sort(key=lambda pair: pair[1][0])
        fli = [pair[0] for pair in regrouped]
        flb = [pair[1] for pair in regrouped]

    self.final_indices.append(fli)
    self.final_boxes.append(flb)
    self.line_width_means.append(mean(line_widths))
def li_combine_for_line_ind(self, line_info, lineind=None):
    """Group one line's character boxes by the low-ink box containing them.

    A scale factor for the line is computed from the horizontal transition
    count of the cropped line image and appended to ``self.transitions``.
    Every character whose left and right edges fall inside a low-ink box
    (with 5 px slack) is assigned to the first such box; characters matching
    no low-ink box form single-member groups.  Groups are then ordered by
    the left edge of their combined box.

    Boxes are indexed as ``(x, y, w, h)``.

    Args:
        line_info: object providing ``lines_chars``, ``get_box(index)`` and
            ``shapes``.
        lineind: key of the line inside ``line_info.lines_chars`` and
            ``self.line_info.low_ink_boxes``.

    Returns:
        ``[]`` when the line has no characters, otherwise ``None``.  Results
        are reported by appending to ``self.transitions``, ``self.widths``,
        ``self.final_indices`` and ``self.final_boxes``.
    """
    line = line_info.lines_chars[lineind]
    lib = self.line_info.low_ink_boxes[lineind]
    shapes = line_info.shapes

    if not line:
        return []

    # ------------------------------------------------------------------
    # Scale factor for this line
    # ------------------------------------------------------------------
    line_top = 1000000  # arbitrary high number
    line_bottom = 0
    for k in line:
        kb = line_info.get_box(k)
        if kb[1] < line_top:
            line_top = kb[1]
        if kb[1] + kb[3] > line_bottom:
            line_bottom = kb[1] + kb[3]

    firstbox = line_info.get_box(line[0])
    lastbox = line_info.get_box(line[-1])

    # Total horizontal gap between consecutive boxes of the line.
    sum_whitespace = 0
    for cur, nxt in zip(line, line[1:]):
        ab = line_info.get_box(cur)
        nab = line_info.get_box(nxt)
        sum_whitespace += nab[0] - (ab[0] + ab[2])

    ln_arr = shapes.img_arr[line_top:line_bottom,
                            firstbox[0]:lastbox[0] + lastbox[2]].copy()

    crss_denom = float(max(horizontal_transitions(ln_arr)))
    if crss_denom > 0:
        crossings_val = (ln_arr.shape[1] - sum_whitespace) / crss_denom
        scale = 35.0 / crossings_val
    else:
        scale = 1.0
    self.transitions.append(scale)

    # ------------------------------------------------------------------
    # Assign characters to low-ink boxes
    # ------------------------------------------------------------------
    # box_attrs is unpacked as (left, right, top, bottom, w, h); only the
    # horizontal edges are needed, and they do not change per character.
    lib_edges = [(l_edge, r_edge)
                 for l_edge, r_edge, _, _, _, _ in map(box_attrs, lib)]

    low_ink_segmentation = {}
    not_intr = []
    for i in line:
        led, red, _, _, _, _ = box_attrs(line_info.get_box(i))
        for p, (ledn, ren) in enumerate(lib_edges):
            if led >= ledn - 5 and red <= ren + 5:
                low_ink_segmentation.setdefault(p, []).append(i)
                break
        else:
            # No low-ink box contains this character.
            not_intr.append([i])

    # list(...) so this also works where dict.values() is a view.
    all_li_seg = list(low_ink_segmentation.values())
    all_li_seg.extend(not_intr)

    grouped = []
    for members in all_li_seg:
        x, y, w, h = combine_many_boxes(
            [line_info.get_box(m) for m in members])
        grouped.append((members, [x, y, w, h]))
        self.widths.append(w * scale)

    # Order groups left-to-right by the x of their combined box.
    grouped.sort(key=lambda pair: pair[1][0])
    self.final_indices.append([pair[0] for pair in grouped])
    self.final_boxes.append([pair[1] for pair in grouped])