def adjust_bbox_h_latex_horizontally(bbox, latex):
    """
    Expand entries 1 and 3 of a bbox using the horizontal adjustment ratios
    recorded for the given latex symbol.

    Entry 1 is decreased by ``width * lower_ratio`` and entry 3 is increased
    by ``width * upper_ratio``, where the width is measured on the bbox that
    was passed in. (Entries 1 and 3 are presumably the bottom and top
    coordinates -- confirm against the BBox layout.)

    Symbols without statistics fall back to a stand-in: '\\approx' and
    '\\simeq' use the ratios of '=', and anything starting with '\\not'
    uses the ratios of '\\backslash'.

    :param bbox: a BBox instance or any iterable of coordinates
    :param latex: latex symbol used as the key into the ratio tables
    :return: the adjusted coordinates as a list
    :raises Exception: when the symbol has no statistics and no stand-in
    """
    # Only the two horizontal tables are used; the vertical ones are ignored.
    _, _, hor_latex2ur, hor_latex2lr = get_latex2adjustment_ratio()

    has_stat = latex in hor_latex2ur and latex in hor_latex2lr
    if not has_stat:
        stand_in_for = {
            "\\approx": "=",
            "\\simeq": "=",
        }
        if latex in stand_in_for:
            latex = stand_in_for[latex]
        elif latex.startswith("\\not"):
            latex = "\\backslash"
        else:
            raise Exception("No Stat yet for {}".format(latex))

    upper_ratio = hor_latex2ur[latex]
    lower_ratio = hor_latex2lr[latex]

    # Shallow-copy first, then normalise to a plain list of coordinates.
    coords = copy.copy(bbox)
    coords = coords.to_list() if isinstance(coords, BBox) else list(coords)

    span = get_width(bbox)
    coords[1] = coords[1] - span * lower_ratio
    coords[3] = coords[3] + span * upper_ratio
    return coords
def get_lower_ratio_hor(cinfo, lower_line):
    """
    Offset of the lower line below the vertical center of the char bbox,
    expressed as a fraction of the bbox width.

    The horizontal width is the normaliser, rather than the vertical height,
    because the height might be very thin.

    :param cinfo: mapping holding the char bbox under the 'bbox' key
    :param lower_line: position of the lower line, subtracted from the center
    :return: (vertical center - lower_line) / width
    """
    char_bbox = cinfo['bbox']
    offset_below_center = get_y_center(char_bbox) - lower_line
    return offset_below_center / get_width(char_bbox)
def get_upper_ratio_hor(cinfo, upper_line):
    """
    Offset of the upper line above the vertical center of the char bbox,
    expressed as a fraction of the bbox width.

    :param cinfo: mapping holding the char bbox under the 'bbox' key
    :param upper_line: position of the upper line, the center is subtracted
        from it
    :return: (upper_line - vertical center) / width
    """
    char_bbox = cinfo['bbox']
    offset_above_center = upper_line - get_y_center(char_bbox)
    return offset_above_center / get_width(char_bbox)
def merge_line_ime(char_list_list):
    """
    Merge the lines directly adjacent to a big-operator line into that line.

    Though it is called IME processing, it only merges around the bind var
    operators ('\\sum' and '\\prod'), no matter whether the line is IME or
    EME; a better name might be "merge big op".

    Lines are visited in order of decreasing ``top()`` of their bbox. For
    every line holding a bind var operator, a horizontal window is built
    that spans each operator widened by its own width on both sides. The
    line just before and the line just after in that order are absorbed
    when their bbox lies strictly inside the window.

    NOTE(review): an absorbed line is dropped from the output as a line of
    its own, so anything it absorbed itself is not emitted -- confirm this
    is intended.

    :param char_list_list: list of lines, each line being a list of chars
    :return: list of merged lines, in order of decreasing ``top()``
    """
    # Bbox of every line, computed on the full char list (accents are kept).
    line_bbox_list = [get_char_list_bbox(char_list)
                      for char_list in char_list_list]

    # sorted() rather than range(...).sort(): a Python 3 range object has no
    # sort() method, while sorted() gives the same stable order on 2 and 3.
    cur_line_idx_list = sorted(
        range(len(char_list_list)),
        key=lambda lid: -line_bbox_list[lid].top())

    last_pos = len(cur_line_idx_list) - 1
    absorbed_line_idx = set()  # lines merged into a neighbouring line
    line_idx2line_idx_list = {}
    for i, line_idx in enumerate(cur_line_idx_list):
        group = [line_idx]
        line_idx2line_idx_list[line_idx] = group

        # Window covered by the bind var operators of this line; it stays
        # inverted (left > right) when the line has no such operator.
        left_bound, right_bound = 1000000, -1
        for char in char_list_list[line_idx]:
            if not isinstance(char, LTChar):
                continue
            if get_latex_val_of_lt_char(char) in ('\\sum', '\\prod'):
                char_width = get_width(char.bbox)
                left_bound = min(left_bound, char.bbox[0] - char_width)
                right_bound = max(right_bound, char.bbox[2] + char_width)
        if left_bound > right_bound:
            continue

        # Previous neighbour first, then the next one, when they exist.
        neighbour_positions = []
        if i != 0:
            neighbour_positions.append(i - 1)
        if i != last_pos:
            neighbour_positions.append(i + 1)
        for pos in neighbour_positions:
            neighbour_idx = cur_line_idx_list[pos]
            neighbour_bbox = line_bbox_list[neighbour_idx]
            if (neighbour_bbox.left() > left_bound
                    and neighbour_bbox.right() < right_bound):
                group.append(neighbour_idx)
                absorbed_line_idx.add(neighbour_idx)

    res_char_list_list = []
    for line_idx in cur_line_idx_list:
        if line_idx in absorbed_line_idx:
            continue
        tmp_char_list = []
        for tmp_li in line_idx2line_idx_list[line_idx]:
            tmp_char_list.extend(char_list_list[tmp_li])
        res_char_list_list.append(tmp_char_list)
    return res_char_list_list
def normalized_Ar_Bl_diff_by_width_AB(b1, b2):
    """
    Signed gap from entry 2 of b1 to entry 0 of b2, divided by the width
    of b1.

    Going by the name, these are presumably the right edge of A and the
    left edge of B -- confirm against the bbox layout. Note that only the
    width of b1 is used as the normaliser.

    :param b1: first bbox (A)
    :param b2: second bbox (B)
    :return: (b2[0] - b1[2]) / width of b1
    """
    return (b2[0] - b1[2]) / get_width(b1)
def normalized_xcenter_diff_by_merge_AB(b1, b2):
    """
    Symmetric horizontal difference: absolute distance between the x
    centers of the two bboxes, divided by the width of their merged bbox.

    :param b1: first bbox
    :param b2: second bbox
    :return: |x center of b1 - x center of b2| / width of merge_bbox(b1, b2)
    """
    merged = merge_bbox(b1, b2)
    center_gap = abs(get_x_center(b1) - get_x_center(b2))
    return center_gap / get_width(merged)