def re_group_char_list_merge_isolated_digit(char_list_in, font):
    """
    Fuse all-digit groups with a neighbouring group across an operator.

    The input is split into groups with ``get_char_list_list`` and each group
    is rendered to a string with ``char_list2str``. While
    ``MERGE_ISOLATED_DIGIT`` is truthy, the first adjacent pair (left, right)
    matching one of the rules below is fused into a single group and the scan
    starts over, until no pair matches:

    * left is all digits and the first char of right is a binary operator
      or a relation
    * right is all digits and the last char of left is a binary operator
      or a relation

    :param char_list_in: flat char list, as accepted by ``get_char_list_list``
    :param font: forwarded to ``get_latex_val_of_lt_char``
    :return: ``char_list_list2char_list`` applied to the regrouped groups
    """
    groups = get_char_list_list(char_list_in)
    words = [char_list2str(group) for group in groups]

    if MERGE_ISOLATED_DIGIT:
        merge_at = _find_isolated_digit_merge(groups, words, font)
        while merge_at is not None:
            # Grow the left group in place, then drop its right neighbour
            # from a fresh outer list.
            groups[merge_at].extend(groups[merge_at + 1])
            groups = groups[:merge_at + 1] + groups[merge_at + 2:]
            words = [char_list2str(group) for group in groups]
            merge_at = _find_isolated_digit_merge(groups, words, font)

    return char_list_list2char_list(groups)


def _find_isolated_digit_merge(groups, words, font):
    """Return the index of the first group to fuse with its right neighbour, else None."""
    for left in range(len(words) - 1):
        right = left + 1
        right_head = get_latex_val_of_lt_char(groups[right][0], font)
        left_tail = get_latex_val_of_lt_char(groups[left][-1], font)
        if words[left].isdigit() and _is_bin_op_or_rel(right_head):
            return left
        if words[right].isdigit() and _is_bin_op_or_rel(left_tail):
            return left
    return None


def _is_bin_op_or_rel(latex_val):
    """Tell whether a latex value is a binary operator or a relation."""
    return is_bin_op_latex_val(latex_val) or is_rel_latex_val(latex_val)
def could_expand_left(word):
    """
    Check whether the first char of a word is a left-expandable symbol.

    :param word: list of LTChar
    :return: False for an empty word; otherwise True when the latex value of
        the first char is listed in ``expand_left_right_list`` or in
        ``expand_left_sym_list``
    """
    if not word:
        return False

    head_val = get_latex_val_of_lt_char(word[0])
    if head_val in expand_left_right_list:
        return True
    return head_val in expand_left_sym_list
def re_group_ending_punct(char_list_in, font):
    """
    Detach a trailing punctuation char from every group.

    A trailing punctuation mark is usually not part of the word or ME, so
    each group whose last char is one gets split into two groups: everything
    before the last char, and the last char alone.

    :param char_list_in: flat char list, as accepted by ``get_char_list_list``
    :param font: forwarded to ``get_latex_val_of_lt_char``
    :return: ``char_list_list2char_list`` applied to the regrouped groups
    """
    ending_punct_vals = (',', '.', ';', ":", 'comma', 'period', 'colon')

    regrouped = []
    for group in get_char_list_list(char_list_in):
        tail_val = get_latex_val_of_lt_char(group[-1], font)
        if tail_val not in ending_punct_vals:
            regrouped.append(group)
            continue
        # Body first, then the punctuation mark as its own group.
        regrouped.append(group[:-1])
        regrouped.append([group[-1]])

    return char_list_list2char_list(regrouped)
def get_UGPs(self):
    """
    Collect one UnorganizedGroupPath per run of consecutive ME-labelled NSCS.

    Only the UGPs are built (no parsing), so the ME layout analysis can be
    evaluated on its own. The grouping mirrors what export_latex does.

    :return: list of UnorganizedGroupPath, one per maximal run of entries
        whose label in ``self.nscs_label_list`` equals 1
    """
    assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)

    total = len(self.id_list_list_for_nscs)
    collected = []
    pos = 0  # cursor over the nscs entries
    while pos < total:
        starts_run = self.nscs_label_list[pos] == 1
        if not starts_run:
            pos += 1
            continue

        # Swallow the whole run of label-1 entries into one symbol list.
        symbol_groups = []
        while pos < total and self.nscs_label_list[pos] == 1:
            for cid in self.id_list_list_for_nscs[pos]:
                latex_val = get_latex_val_of_lt_char(
                    self.chars[cid], self.get_font())
                # TODO, adjust of the tight bounding box
                bbox = BBox(self.chars[cid].bbox)
                symbol_groups.append(
                    MESymbolGroup(MESymbol(latex_val, bbox)))
            pos += 1

        # TODO, the path is not presented here
        collected.append(UnorganizedGroupPath(symbol_groups, []))

    return collected
def export_xml(page_info, out_path, pdf_path=None, pid=None):
    """
    Serialize the formulas of one page as a ``Page`` XML element.

    TODO, also export the value human could understand, rather than the hex
    value. The hex value is only for consistency with the other system.

    Every entry of ``page_info['ilist']`` becomes an ``IsolatedFormula``
    element and every entry of ``page_info['elist']`` an ``EmbeddedFormula``
    element; each holds one ``Char`` child per LTChar it contains.

    :param page_info: dict providing 'pid' (written as PageNum), 'ilist' and
        'elist' (iterables of char lists)
    :param out_path: file to write to; when falsy the XML is printed instead
    :param pdf_path: forwarded to ``get_font_from_pdf``
    :param pid: forwarded to ``get_font_from_pdf`` (the PageNum attribute is
        taken from ``page_info['pid']``, not from this argument)
    :return: None. Errors raised while serializing or writing are printed
        and swallowed (best effort).
    """
    page_n = ET.Element('Page', {'PageNum': str(page_info['pid'])})
    font = get_font_from_pdf(pdf_path, pid)

    for ime_line in page_info['ilist']:
        _append_formula_element(page_n, 'IsolatedFormula', ime_line, font)

    # the eme part
    for eme in page_info['elist']:
        _append_formula_element(page_n, 'EmbeddedFormula', eme, font)

    try:
        res = ET.tostring(page_n, encoding='utf-8')
        if out_path:
            with open(out_path, 'w') as f:
                # Same bytes as the former ``print >> f, res``.
                f.write(res)
                f.write('\n')
        else:
            print(res)
    except Exception as e:
        print(e)


def _append_formula_element(page_n, tag, char_list, font):
    """Append a formula element named ``tag`` with one Char child per LTChar."""
    bbox = get_char_list_bbox(char_list)
    formula_n = ET.SubElement(page_n, tag, {
        'BBox': icst_bbox2str(bbox),
        'readable_bbox': readable_bbox2str(bbox)
    })
    for char in char_list:
        if not isinstance(char, LTChar):
            continue
        clean_text = get_latex_val_of_lt_char(char, font)
        clean_text = invalid_xml_remove(clean_text)
        ET.SubElement(
            formula_n, 'Char', {
                'BBox': icst_bbox2str(char.bbox),
                'readable_bbox': readable_bbox2str(char.bbox),
                'FSize': str(char.size),
                'Text': clean_text
            })
    return formula_n
def merge_line_ime(char_list_list):
    """
    Merge the lines sitting directly above/below a big operator into its line.

    Though it is called IME processing, it only merges the bound-variable
    lines of big operators, no matter whether the line is IME or EME; a
    better name might be "merge big op".

    Lines are visited in order of descending ``top()`` of their bounding box.
    For a line containing ``\\sum`` or ``\\prod``, a horizontal window is
    built from the operator boxes, widened by ``get_width(char.bbox)`` on
    each side. The previous and the next line in that order are attached to
    the operator line when their box lies strictly inside the window; such
    attached lines are not emitted on their own.

    :param char_list_list: list of lines, each a list of chars
    :return: list of lines in descending-top order; an operator line is
        followed by the chars of its attached lines (previous first, then
        next)
    """
    # not removing the accent
    line_bbox_list = [get_char_list_bbox(char_list)
                      for char_list in char_list_list]

    # sorted() rather than range(...).sort(): a Python 3 range object has no
    # sort() method. Both sorts are stable, so the order is unchanged.
    cur_line_idx_list = sorted(
        range(len(char_list_list)),
        key=lambda lid: -line_bbox_list[lid].top())
    last_pos = len(cur_line_idx_list) - 1

    absorbed_line_idx = set()  # lines attached to some operator line
    line_idx2line_idx_list = {}

    for i, line_idx in enumerate(cur_line_idx_list):
        left_bound, right_bound = 1000000, -1
        for char in char_list_list[line_idx]:
            if not isinstance(char, LTChar):
                continue
            latex_val = get_latex_val_of_lt_char(char)
            if latex_val in ['\\sum', '\\prod']:
                left_bound = min(
                    left_bound, char.bbox[0] - get_width(char.bbox))
                right_bound = max(
                    right_bound, char.bbox[2] + get_width(char.bbox))

        line_idx2line_idx_list[line_idx] = [line_idx]
        if left_bound > right_bound:
            # No big operator on this line: the window was never set.
            continue

        neighbour_positions = []
        if i != 0:
            neighbour_positions.append(i - 1)
        if i != last_pos:
            neighbour_positions.append(i + 1)
        for pos in neighbour_positions:
            other_idx = cur_line_idx_list[pos]
            other_bbox = line_bbox_list[other_idx]
            if other_bbox.left() > left_bound and \
                    other_bbox.right() < right_bound:
                line_idx2line_idx_list[line_idx].append(other_idx)
                absorbed_line_idx.add(other_idx)

    res_char_list_list = []
    for line_idx in cur_line_idx_list:
        if line_idx in absorbed_line_idx:
            continue
        tmp_char_list = []
        for tmp_li in line_idx2line_idx_list[line_idx]:
            tmp_char_list.extend(char_list_list[tmp_li])
        res_char_list_list.append(tmp_char_list)
    return res_char_list_list
def export_latex(self):
    """
    Render the sentence as a LaTeX string.

    Plain NSCS entries are emitted as their text followed by a space. Each
    run of consecutive entries labelled 1 is turned into symbol groups,
    parsed through ``ugp2hgroup`` and emitted as ``$...$``; when that fails
    for any reason the raw text of the run is wrapped in ``$...$`` instead.

    :return: the assembled string with surrounding whitespace stripped
    """
    print("Start exporting LaTeX")
    assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)

    total = len(self.id_list_list_for_nscs)
    pieces = []
    pos = 0  # cursor over the nscs entries
    while pos < total:
        is_me = self.nscs_label_list[pos] == 1
        if not is_me:
            plain = "".join(
                [self.text_list[cid]
                 for cid in self.id_list_list_for_nscs[pos]])
            if isinstance(plain, unicode):
                plain = plain.encode("utf-8")
            pieces.append(plain + " ")
            pos += 1
            continue

        # Swallow the whole run of ME entries: raw text for the fallback,
        # symbol groups for the parser.
        raw_me = ""
        symbol_groups = []
        while pos < total and self.nscs_label_list[pos] == 1:
            cids = self.id_list_list_for_nscs[pos]
            chunk = "".join([self.text_list[cid] for cid in cids])
            # convert from char to latex value
            for cid in cids:
                latex_val = get_latex_val_of_lt_char(
                    self.chars[cid], self.get_font())
                # TODO, adjust of the tight bounding box
                bbox = BBox(self.chars[cid].bbox)
                symbol_groups.append(
                    MESymbolGroup(MESymbol(latex_val, bbox)))
            if isinstance(chunk, unicode):
                chunk = chunk.encode("utf-8")
            raw_me += chunk
            pos += 1

        # TODO, NOTE, remove the try catch to get all the parsing here
        try:
            print("HOWDY!!!")
            # TODO, the path is not presented here
            ugp = UnorganizedGroupPath(symbol_groups, [])
            hgroup = ugp2hgroup(ugp)
            pieces.append("${}$ ".format(hgroup.to_latex()))
        except Exception:
            print("OH NO!!!")
            pieces.append("${}$ ".format(raw_me))

    return "".join(pieces).strip()