コード例 #1
0
def re_group_char_list_merge_isolated_digit(char_list_in, font):
    """
    Glue purely numeric words to a neighbouring binary/relation operator.

    The input is split into words with ``get_char_list_list``. When
    ``MERGE_ISOLATED_DIGIT`` is enabled, two adjacent words are joined
    whenever

    * the left word is all digits and the right word starts with a binary
      or relation operator, or
    * the right word is all digits and the left word ends with a binary or
      relation operator.

    Only the first such pair is joined per pass; passes repeat until no
    pair qualifies any more.

    :param char_list_in: the char list to re-group
    :param font: font handed through to ``get_latex_val_of_lt_char``
    :return: the re-grouped words, flattened by ``char_list_list2char_list``
    """
    groups = get_char_list_list(char_list_in)
    words = [char_list2str(group) for group in groups]
    if not MERGE_ISOLATED_DIGIT:
        return char_list_list2char_list(groups)

    def is_operator(latex_val):
        return is_bin_op_latex_val(latex_val) or is_rel_latex_val(latex_val)

    while True:
        # Locate the first adjacent pair that has to be joined.
        merge_pos = None
        for idx in range(len(words) - 1):
            next_first = get_latex_val_of_lt_char(groups[idx + 1][0], font)
            prev_last = get_latex_val_of_lt_char(groups[idx][-1], font)
            digit_before_op = words[idx].isdigit() and is_operator(next_first)
            if digit_before_op or (
                    words[idx + 1].isdigit() and is_operator(prev_last)):
                merge_pos = idx
                break
        if merge_pos is None:
            break

        # Fold the right-hand word into the left-hand one (in place) and
        # rebuild both the group list and the word strings.
        merged = groups[merge_pos]
        merged.extend(groups[merge_pos + 1])
        groups = groups[:merge_pos] + [merged] + groups[merge_pos + 2:]
        words = [char_list2str(group) for group in groups]

    return char_list_list2char_list(groups)
コード例 #2
0
def could_expand_left(word):
    """
    Tell whether the first char of a word is listed as left-expandable.

    :param word: list of LTChar
    :return: False for an empty word; otherwise True when the latex value
        of the first char is in ``expand_left_right_list`` or in
        ``expand_left_sym_list``
    """
    if len(word) < 1:
        return False
    first_val = get_latex_val_of_lt_char(word[0])
    if first_val in expand_left_right_list:
        return True
    return first_val in expand_left_sym_list
コード例 #3
0
def re_group_ending_punct(char_list_in, font):
    """
    Split a trailing punctuation mark off every word.

    A trailing comma, period, semicolon or colon is usually not part of the
    word or math expression it is attached to, so it is moved into a group
    of its own. The mark is recognised both by its literal character and by
    its glyph name (``comma``, ``period``, ``semicolon``, ``colon``).

    Empty groups are never produced: a word that consists of nothing but
    the punctuation mark is kept as a single group, and an empty incoming
    group is dropped instead of being indexed.

    :param char_list_in: the char list to re-group
    :param font: font handed through to ``get_latex_val_of_lt_char``
    :return: the re-grouped words, flattened by ``char_list_list2char_list``
    """
    ending_puncts = (
        ',', '.', ';', ':', 'comma', 'period', 'semicolon', 'colon')

    new_char_list_list = []
    for char_list in get_char_list_list(char_list_in):
        if not char_list:
            # Nothing to inspect; indexing [-1] would raise IndexError.
            continue

        last_char = char_list[-1]
        latex_val = get_latex_val_of_lt_char(last_char, font)
        if latex_val not in ending_puncts:
            new_char_list_list.append(char_list)
            continue

        word_part = char_list[:-1]
        if word_part:
            # Only emit the word part when something precedes the mark.
            new_char_list_list.append(word_part)
        new_char_list_list.append([last_char])

    return char_list_list2char_list(new_char_list_list)
コード例 #4
0
    def get_UGPs(self):
        """
        Collect one UnorganizedGroupPath per run of consecutive ME NSCSs.

        Only the UGPs are built (no layout parsing), so that the ME layout
        analysis can be evaluated separately. The traversal mirrors the one
        in export_latex.

        :return: list of UnorganizedGroupPath, one for each maximal run of
            NSCSs whose label equals 1, in order of appearance
        """
        assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)

        total = len(self.id_list_list_for_nscs)
        collected = []
        pos = 0  # index of the NSCS currently inspected

        while pos < total:
            if self.nscs_label_list[pos] != 1:
                pos += 1
                continue

            # Consume the whole run of ME-labelled NSCSs starting here and
            # gather one symbol group per char.
            symbol_groups = []
            while pos < total and self.nscs_label_list[pos] == 1:
                for cid in self.id_list_list_for_nscs[pos]:
                    lt_char = self.chars[cid]
                    latex_val = get_latex_val_of_lt_char(
                        lt_char, self.get_font())
                    # TODO, adjust to the tight bounding box
                    bbox = BBox(lt_char.bbox)
                    symbol_groups.append(
                        MESymbolGroup(MESymbol(latex_val, bbox)))
                pos += 1

            # TODO, the path is not presented here
            collected.append(UnorganizedGroupPath(symbol_groups, []))

        return collected
コード例 #5
0
def _append_formula_element(page_n, tag, char_list, font):
    """Append one formula element named *tag*, with a Char child per LTChar."""
    bbox = get_char_list_bbox(char_list)
    formula_n = ET.SubElement(page_n, tag, {
        'BBox': icst_bbox2str(bbox),
        'readable_bbox': readable_bbox2str(bbox)
    })

    for char in char_list:
        if not isinstance(char, LTChar):
            continue
        clean_text = invalid_xml_remove(get_latex_val_of_lt_char(char, font))
        ET.SubElement(
            formula_n, 'Char', {
                'BBox': icst_bbox2str(char.bbox),
                'readable_bbox': readable_bbox2str(char.bbox),
                'FSize': str(char.size),
                'Text': clean_text
            })


def export_xml(page_info, out_path, pdf_path=None, pid=None):
    """
    Serialize the formulas of one page to XML.

    Every entry of ``page_info['ilist']`` becomes an ``IsolatedFormula``
    element and every entry of ``page_info['elist']`` an ``EmbeddedFormula``
    element; each LTChar inside becomes a ``Char`` child carrying its bbox
    (both encodings), font size and cleaned latex value.

    TODO, also export the value human could understand, rather than the hex
    value; the hex value is only for consistency with the other system.

    :param page_info: dict with the keys 'pid', 'ilist' and 'elist'
    :param out_path: file to write the XML to; when falsy the XML is
        printed instead
    :param pdf_path: passed to ``get_font_from_pdf``
    :param pid: passed to ``get_font_from_pdf`` (the ``PageNum`` attribute
        is taken from ``page_info['pid']``, not from this argument)
    :return: None; an error while serializing or writing is printed, not
        raised
    """
    page_n = ET.Element('Page', {'PageNum': str(page_info['pid'])})
    font = get_font_from_pdf(pdf_path, pid)

    for ime_line in page_info['ilist']:
        _append_formula_element(page_n, 'IsolatedFormula', ime_line, font)

    # the eme part
    for eme in page_info['elist']:
        _append_formula_element(page_n, 'EmbeddedFormula', eme, font)

    try:
        res = ET.tostring(page_n, encoding='utf-8')
        if out_path:
            with open(out_path, 'w') as f:
                # Same bytes as the former ``print >> f, res``.
                f.write(res)
                f.write('\n')
        else:
            print(res)
    except Exception as e:
        # Best effort: report the problem and carry on.
        print(e)
コード例 #6
0
ファイル: pdfbox_line_ime.py プロジェクト: senyalin/pdfxml
def merge_line_ime(char_list_list):
    """
    Merge the lines sitting directly above/below a big operator into the
    operator's line.

    Despite the name this is not specific to IME: it only merges based on
    the bind-var operators ``\\sum`` and ``\\prod``, whether the line is an
    IME or an EME ("merge big op" would be a better name).

    Lines are visited in descending order of their bbox ``top()``. For a
    line containing ``\\sum``/``\\prod`` a horizontal range is built from
    the operator bboxes, widened by one operator width on each side. The
    previous and the next line in that order are absorbed when their bbox
    lies strictly inside the range.

    Every input line ends up in exactly one output line: a line that has
    already been absorbed neither absorbs others nor gets absorbed again,
    and a line that has absorbed others is not absorbed itself.

    :param char_list_list: list of lines, each a list of chars
    :return: list of merged lines in the visiting order; a merged line is
        the operator line's chars followed by the chars of the absorbed
        previous and next lines
    """
    # not removing the accent
    line_bbox_list = [
        get_char_list_bbox(char_list) for char_list in char_list_list]

    # sorted() instead of range(...).sort(): also works where range() does
    # not return a list; the ordering (including ties) is the same.
    cur_line_idx_list = sorted(
        range(len(char_list_list)),
        key=lambda lid: -line_bbox_list[lid].top())
    line_num = len(cur_line_idx_list)

    absorbed_line_idx = set()  # lines folded into a neighbouring line
    line_idx2line_idx_list = {}

    for i, line_idx in enumerate(cur_line_idx_list):
        line_idx2line_idx_list[line_idx] = [line_idx]
        if line_idx in absorbed_line_idx:
            # Already part of another line; letting it absorb its
            # neighbours would drop or duplicate chars.
            continue

        left_bound, right_bound = 1000000, -1
        for char in char_list_list[line_idx]:
            if not isinstance(char, LTChar):
                continue
            if get_latex_val_of_lt_char(char) in ('\\sum', '\\prod'):
                char_width = get_width(char.bbox)
                left_bound = min(left_bound, char.bbox[0] - char_width)
                right_bound = max(right_bound, char.bbox[2] + char_width)
        if left_bound > right_bound:
            # no big operator on this line
            continue

        for neighbour_pos in (i - 1, i + 1):
            if neighbour_pos < 0 or neighbour_pos >= line_num:
                continue
            neighbour_idx = cur_line_idx_list[neighbour_pos]
            if neighbour_idx in absorbed_line_idx:
                continue
            if len(line_idx2line_idx_list.get(neighbour_idx, ())) > 1:
                # The neighbour absorbed lines itself; absorbing it would
                # lose them.
                continue
            neighbour_bbox = line_bbox_list[neighbour_idx]
            if neighbour_bbox.left() > left_bound and \
                    neighbour_bbox.right() < right_bound:
                line_idx2line_idx_list[line_idx].append(neighbour_idx)
                absorbed_line_idx.add(neighbour_idx)

    res_char_list_list = []
    for line_idx in cur_line_idx_list:
        if line_idx in absorbed_line_idx:
            continue
        tmp_char_list = []
        for tmp_li in line_idx2line_idx_list[line_idx]:
            tmp_char_list.extend(char_list_list[tmp_li])
        res_char_list_list.append(tmp_char_list)

    return res_char_list_list
コード例 #7
0
    def export_latex(self):
        """
        Export the sentence as a LaTeX string.

        Consecutive NSCSs labelled 1 form one math expression: their chars
        are turned into symbol groups, parsed with ``ugp2hgroup`` and
        emitted as ``$...$``. When that fails for any reason, the raw text
        of the run is put between the dollar signs instead. All other NSCSs
        are emitted as plain text. Every piece is followed by a single
        space.

        :return: the assembled string with surrounding whitespace stripped
        """
        print("Start exporting LaTeX")
        assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)

        def nscs_text(cid_list):
            # Text of one NSCS; anything that is not a plain ``str``
            # (i.e. unicode text on Python 2) is encoded as UTF-8.
            text = "".join([self.text_list[cid] for cid in cid_list])
            if not isinstance(text, str):
                text = text.encode("utf-8")
            return text

        pieces = []
        nscs_id = 0  # the id for nscs
        nscs_num = len(self.id_list_list_for_nscs)

        while nscs_id < nscs_num:
            if self.nscs_label_list[nscs_id] != 1:
                pieces.append(
                    nscs_text(self.id_list_list_for_nscs[nscs_id]) + " ")
                nscs_id += 1
                continue

            # Consume the whole run of ME-labelled NSCSs, keeping both the
            # raw text (fallback) and the symbol groups (for parsing).
            me_text_parts = []
            me_symbol_groups = []
            while nscs_id < nscs_num and self.nscs_label_list[nscs_id] == 1:
                cid_list = self.id_list_list_for_nscs[nscs_id]
                me_text_parts.append(nscs_text(cid_list))

                # convert from char to latex value
                for cid in cid_list:
                    lt_char = self.chars[cid]
                    latex_val = get_latex_val_of_lt_char(
                        lt_char, self.get_font())
                    # TODO, adjust to the tight bounding box
                    bbox = BBox(lt_char.bbox)
                    me_symbol_groups.append(
                        MESymbolGroup(MESymbol(latex_val, bbox)))
                nscs_id += 1

            # TODO, NOTE, remove the try catch to get all the parsing here
            try:
                # Trace output kept as is so console output is unchanged.
                print("HOWDY!!!")
                # TODO, the path is not presented here
                ugp = UnorganizedGroupPath(me_symbol_groups, [])
                latex_str = ugp2hgroup(ugp).to_latex()
                pieces.append("${}$ ".format(latex_str))
            except Exception:
                # Best effort: fall back to the raw text of the run.
                print("OH NO!!!")
                pieces.append("${}$ ".format("".join(me_text_parts)))

        return "".join(pieces).strip()