コード例 #1
0
def word_info_filter(char_list_list, word_info_list):
    """
    Drop pdfbox words whose bbox spans more than one text line.

    Builds a 2D interval tree over all line bounding boxes, then keeps
    only the words that overlap at most one line.

    :param char_list_list: all lines of the page, each a list of chars
    :param word_info_list: word-info dicts, each with a 'bbox' entry
    :return: filtered list of word-info dicts
    """
    # local import matches the module's existing lazy-import style
    from pdfxml.intervaltree_2d import IntervalTree2D

    line_interval_tree_2d = IntervalTree2D()
    for i, char_list_line in enumerate(char_list_list):
        line_bbox = get_char_list_bbox(char_list_line)
        line_interval_tree_2d.add_bbox(i, line_bbox)

    filtered_word_info_list = []
    for word_info in word_info_list:
        line_name_list = line_interval_tree_2d.get_overlap_by_bbox(
            word_info['bbox'])
        if len(line_name_list) > 1:
            # word straddles multiple lines: almost certainly a bad bbox
            msg = "one word overlap with multipe lines {}".format(word_info)
            print(msg)  # print() call form works on both Py2 and Py3
            continue
        filtered_word_info_list.append(word_info)
    return filtered_word_info_list
コード例 #2
0
def check_pdfbox_word_segmentation_fail(char_list_list, word_info_list):
    """
    Heuristic check for broken pdfbox word segmentation.

    A word is suspicious when it is wider than half of the reference
    line width (the 95th percentile of all line widths). Segmentation
    is considered failed when more than half of the words are suspicious.

    :param char_list_list: lines of the page, each a list of chars
    :param word_info_list: word-info dicts, each with a 'bbox' entry
    :return: True when segmentation looks broken
    """
    CHECK_PDFBOX_FAIL_THRES = 0.5

    # reference width: 95th percentile of the per-line bbox widths
    line_width_list = [get_char_list_bbox(char_list_line).width()
                       for char_list_line in char_list_list]
    ref_width = np.percentile(line_width_list, 95)

    failed_count = sum(
        1 for word_info in word_info_list
        if BBox(word_info['bbox']).width() > CHECK_PDFBOX_FAIL_THRES * ref_width)
    return failed_count > CHECK_PDFBOX_FAIL_THRES * len(word_info_list)
コード例 #3
0
def merging_merge_one_line(char_list_line, word_info_list):
    """
    Merge the words in a line based on overlap with pdfbox words.

    Words segmented from the raw char list are joined (via union-find)
    whenever they overlap the same pdfbox word bbox, then re-sorted
    left to right. Lines that end up with an implausibly long word are
    re-split instead, since they are very likely wrong.

    :param char_list_line: chars of one line
    :param word_info_list: pdfbox word-info dicts with a 'bbox' entry
    :return: char list of the re-grouped line
    """
    # output some debugging information here
    pdf_util_debug_log.debug(char_list2str(char_list_line))
    line_bbox = get_char_list_bbox(char_list_line)

    # pdfbox word bboxes overlapping this line
    pdfbox_word_bbox_list = [
        word_info['bbox'] for word_info in word_info_list
        if line_bbox.overlap(word_info['bbox'])]

    # words as segmented from the raw char list, plus their bboxes
    char_word_list = get_char_list_list(char_list_line)
    char_word_bbox_list = []
    for wid, char_word in enumerate(char_word_list):
        char_word_bbox_list.append(get_char_list_bbox(char_word))
        pdf_util_debug_log.debug("{} {}".format(wid, char_list2str(char_word)))

    # union-find: two char-words belong together when they overlap the
    # same word from pdfbox (connected components)
    uf = UnionFind()
    for cid in range(len(char_word_bbox_list)):
        uf.add_node(cid)

    for pdfbox_word_bbox in pdfbox_word_bbox_list:
        cid_list = [cid for cid, char_word_bbox in enumerate(char_word_bbox_list)
                    if char_word_bbox.overlap(pdfbox_word_bbox)]
        for cid in cid_list:
            uf.merge(cid_list[0], cid)

    # rebuild the merged words and their bboxes from the components
    new_char_word_list = []
    new_char_word_bbox_list = []
    for merged_cid_list in uf.get_groups():
        tmp_char_word = []
        for cid in merged_cid_list:
            tmp_char_word.extend(char_word_list[cid])
        new_char_word_list.append(tmp_char_word)
        new_char_word_bbox_list.append(get_char_list_bbox(tmp_char_word))

    # sort merged words by their left boundary.
    # sorted() replaces the Py2-only ``range(...).sort(...)`` idiom.
    sorted_idx_list = sorted(
        range(len(new_char_word_list)),
        key=lambda idx: new_char_word_bbox_list[idx].left())
    sorted_new_char_word_list = [new_char_word_list[tmp_idx]
                                 for tmp_idx in sorted_idx_list]

    # lines with a too-long word are very likely wrong: re-split them
    max_len = get_longest_length(sorted_new_char_word_list)
    if max_len > WORD_LENGTH_95_QUARTILE:
        new_char_list_line = max_word_split(char_list_line)
    else:
        new_char_list_line = char_list_list2char_list(sorted_new_char_word_list)

    return new_char_list_line
コード例 #4
0
def export_xml(page_info, out_path, pdf_path=None, pid=None):
    """
    Export the page's isolated and embedded formulas as XML.

    TODO, also export the value human could understand, rather than the
    hex value; hex value is only for consistency with the other system.

    :param page_info: dict with 'pid', 'ilist' (isolated formulas) and
        'elist' (embedded formulas); each formula is a list of chars
    :param out_path: file to write; prints to stdout when falsy
    :param pdf_path: source pdf, used to resolve the font table
    :param pid: page id within the pdf
    """
    page_n = ET.Element('Page', {'PageNum': str(page_info['pid'])})
    font = get_font_from_pdf(pdf_path, pid)

    # the ime part
    for ime_line in page_info['ilist']:
        _append_formula_node(page_n, 'IsolatedFormula', ime_line, font)

    # the eme part
    for eme in page_info['elist']:
        _append_formula_node(page_n, 'EmbeddedFormula', eme, font)

    try:
        res = ET.tostring(page_n, encoding='utf-8')
        if out_path:
            with open(out_path, 'w') as f:
                # same bytes as the original ``print >> f``: payload + newline
                f.write(res)
                f.write("\n")
        else:
            print(res)
    except Exception as e:
        # best-effort export: report the error and continue
        print(e)


def _append_formula_node(page_n, tag, char_line, font):
    """Append one formula element with its Char children to the page node."""
    bbox = get_char_list_bbox(char_line)
    i_n = ET.SubElement(page_n, tag, {
        'BBox': icst_bbox2str(bbox),
        'readable_bbox': readable_bbox2str(bbox)
    })
    for char in char_line:
        if not isinstance(char, LTChar):
            continue
        # latex/hex value kept for consistency with the other system
        clean_text = get_latex_val_of_lt_char(char, font)
        clean_text = invalid_xml_remove(clean_text)
        ET.SubElement(
            i_n, 'Char', {
                'BBox': icst_bbox2str(char.bbox),
                'readable_bbox': readable_bbox2str(char.bbox),
                'FSize': str(char.size),
                'Text': clean_text
            })
コード例 #5
0
def pdf_extract_lines(pdf_path, pid=0, force_single=False):
    """
    Extract the text lines of one pdf page; each line is a list of LTChar.

    The result is cached on disk; double-column pages are split into
    left / right / outside regions which are merged independently.

    :param pdf_path: path to the pdf file
    :param pid: page id
    :param force_single: treat the page as single-column even when the
        double-column detector fires
    :return: list of lines (each a list of LTChar)
    """
    tmp_pdf_path = get_tmp_path(pdf_path)
    cache_path = "{}.pdfbox_merge_line.{}.pkl".format(tmp_pdf_path, pid)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    char_list_list = pdf_extract_lines_raw(pdf_path, pid)

    # TODO, do another round of line merging
    # still use our column line detection model to find the region.
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    word_info_list = pdf_extract_words(pdf_path, pid)

    # before merging, drop words spanning multiple lines
    # (hoisted: both branches below performed this identical call)
    word_info_list = word_info_filter(char_list_list, word_info_list)

    if not force_single and is_double_column(pdf_path, pid):
        # split the current list into three parts:
        # outside the double column, left column, right column
        page_size = get_pdf_page_size(pdf_path, pid)
        page_width = page_size['width']

        out_char_list_list = []
        left_char_list_list = []
        right_char_list_list = []
        from pdfxml.pdf_util.layout_util import get_char_list_bbox
        for char_list in char_list_list:
            bbox = get_char_list_bbox(char_list)
            if bbox.left() < bbox.right() < page_width / 2:
                # fully within the left half
                left_char_list_list.append(char_list)
            elif bbox.right() > bbox.left() > page_width / 2:
                # fully within the right half
                right_char_list_list.append(char_list)
            else:
                # straddles the center or outside the column area
                out_char_list_list.append(char_list)

        new_out_char_list_list = merging_lines(out_char_list_list,
                                               fontname2space, word_info_list,
                                               pdf_path, pid)
        new_left_char_list_list = merging_lines(left_char_list_list,
                                                fontname2space, word_info_list,
                                                pdf_path, pid)
        new_right_char_list_list = merging_lines(right_char_list_list,
                                                 fontname2space,
                                                 word_info_list, pdf_path, pid)

        # concatenate: outside the column region, left part, right part
        res_char_list_list = []
        res_char_list_list.extend(new_out_char_list_list)
        res_char_list_list.extend(new_left_char_list_list)
        res_char_list_list.extend(new_right_char_list_list)
    else:
        # single column: just merge the lines directly
        res_char_list_list = merging_lines(char_list_list, fontname2space,
                                           word_info_list, pdf_path, pid)

    dump_serialization(res_char_list_list, cache_path)
    return res_char_list_list
コード例 #6
0
def merge_line_basic(char_list_list, fontname2space):
    """
    Merge vertically-overlapping lines into single lines.

    NOTE: known to have bugs of creating duplicate characters (behavior
    kept from the original implementation).

    Runs after accents have been merged with their lines:
    first sort the lines by top position, then merge consecutive lines
    whose height-shrunken bboxes vertically overlap.

    :param char_list_list: list of lines (lists of chars)
    :param fontname2space: per-font space width, used for re-grouping
    :return: merged list of lines
    """
    if len(char_list_list) == 0:
        return char_list_list

    line_bbox_list = []
    for char_list in char_list_list:
        # remove the accent to avoid over-merging
        line_bbox = get_char_list_bbox(char_list, remove_accent=True)

        # shrink the height by 1/3 (1/6 off top and bottom) so only
        # substantial overlaps trigger a merge
        line_bbox = BBox([
            line_bbox.left(),
            line_bbox.bottom() + line_bbox.height() / 6,
            line_bbox.right(),
            line_bbox.top() - line_bbox.height() / 6
        ])
        line_bbox_list.append(line_bbox)

    # top-to-bottom order; sorted() replaces the Py2-only range().sort()
    cur_line_idx_list = sorted(range(len(char_list_list)),
                               key=lambda lid: -line_bbox_list[lid].top())

    tmp_char_list = []
    tmp_bbox = line_bbox_list[cur_line_idx_list[0]]

    # create debug information here about the merging
    line_str = []
    try:
        for sort_lid in cur_line_idx_list:
            tmp_str = ""
            for char in char_list_list[sort_lid]:
                tmp_str += char.get_text()
            line_str.append(tmp_str)
    except Exception:
        print('create debug information error')

    return_char_list_list = []
    for sort_lid in cur_line_idx_list:
        # merge when the shrunken bboxes vertically overlap;
        # the hat/accent is not part of the calculation
        if tmp_bbox.v_overlap(line_bbox_list[sort_lid]):
            tmp_char_list.extend(char_list_list[sort_lid])
            tmp_bbox = merge_bbox_list([tmp_bbox, line_bbox_list[sort_lid]])
        else:
            # flush the accumulated line, sorted left to right
            tmp_char_list = [c for c in tmp_char_list if isinstance(c, LTChar)]
            tmp_char_list.sort(key=lambda c: c.bbox[0])
            return_char_list_list.append(
                re_group_char_list_seg(tmp_char_list, fontname2space))

            # start a new line to merge into
            tmp_char_list = list(char_list_list[sort_lid])
            tmp_bbox = line_bbox_list[sort_lid]

    if len(tmp_char_list) > 0:
        return_char_list_list.append(
            re_group_char_list_seg(tmp_char_list, fontname2space))
    return return_char_list_list
コード例 #7
0
ファイル: pdfbox_line_accent.py プロジェクト: senyalin/pdfxml
def merge_accent(char_list_list, fontname2space):
    """
    Merge accent-only lines into the text line they belong to.

    Pass 1 pairs each accent-only line with a candidate target line
    (a horizontally-overlapping line below it, or one that vertically
    contains it). Pass 2 emits each accent/target pair as one merged
    line. Finally the result is re-sorted top-to-bottom and each line
    is re-grouped into words.

    :param char_list_list: list of lines (lists of chars)
    :param fontname2space: per-font space width, used for re-grouping
    :return: list of lines with accents merged in
    """
    # not removing the accent when computing the line bboxes
    line_bbox_list = [get_char_list_bbox(char_list)
                      for char_list in char_list_list]

    # top-to-bottom order; sorted() replaces the Py2-only range().sort()
    cur_line_idx_list = sorted(range(len(char_list_list)),
                               key=lambda lid: -line_bbox_list[lid].top())

    # pass 1: for each accent-only line, find the line to merge it into.
    # the target line may occur before the accent in reading order.
    used_line_id_list = []
    accent_idx2target_idx = {}
    for i, line_idx in enumerate(cur_line_idx_list):
        if line_idx in used_line_id_list:
            continue
        if not only_accent(char_list_list[line_idx]):
            continue

        cur_bbox = line_bbox_list[line_idx]
        for j in range(len(cur_line_idx_list)):
            # check all lines as candidate targets
            cand_bbox = line_bbox_list[cur_line_idx_list[j]]
            if not cur_bbox.h_overlap(cand_bbox):
                continue
            # candidate is below the accent ...
            cond1 = cand_bbox.top() < cur_bbox.top()
            # ... or vertically contains it (but is not the accent itself)
            cond2 = cand_bbox.bottom() <= cur_bbox.bottom() <= \
                    cur_bbox.top() <= cand_bbox.top() and i != j
            if cond1 or cond2:
                accent_idx2target_idx[line_idx] = cur_line_idx_list[j]
                used_line_id_list.append(line_idx)
                used_line_id_list.append(cur_line_idx_list[j])
                break

    # pass 2: emit lines, joining each accent/target pair exactly once
    return_char_list_list = []
    used_line_id_list = []
    target_idx2accent_idx = {v: k for k, v in accent_idx2target_idx.items()}
    for line_idx in cur_line_idx_list:
        if line_idx in used_line_id_list:
            continue

        if line_idx in accent_idx2target_idx:
            partner_idx = accent_idx2target_idx[line_idx]
        elif line_idx in target_idx2accent_idx:
            partner_idx = target_idx2accent_idx[line_idx]
        else:
            return_char_list_list.append(char_list_list[line_idx])
            continue

        tmp_char_list = []
        tmp_char_list.extend(char_list_list[line_idx])
        tmp_char_list.extend(char_list_list[partner_idx])
        return_char_list_list.append(tmp_char_list)
        used_line_id_list.append(line_idx)
        used_line_id_list.append(partner_idx)

    # re-sort the merged lines based on the top position
    line_bbox_list = [get_char_list_bbox(char_list)
                      for char_list in return_char_list_list]
    order = sorted(range(len(return_char_list_list)),
                   key=lambda lid: -line_bbox_list[lid].top())

    # re-group the chars of each line into words
    return [re_group_char_list_seg(return_char_list_list[line_idx],
                                   fontname2space)
            for line_idx in order]
コード例 #8
0
ファイル: pdfbox_layout.py プロジェクト: senyalin/pdfxml
def is_double_column(pdf_path, pid, debug=False):
    """
    Detect whether a pdf page is laid out in two columns.

    The idea: if there are two clusters of line begin positions — i.e.
    the long lines sit entirely in one half of the page, or most long
    lines fit fully inside one half — the page is double-column.

    :param pdf_path: path to the pdf file
    :param pid: page id
    :param debug: print diagnostic information
    :return: True when the page looks double-column
    """
    from pdfxml.pdf_util.pdfbox_line_merging import pdf_extract_lines_raw
    from pdfxml.pdf_util.pdfbox_wrapper import get_pdf_page_size

    char_list_list = pdf_extract_lines_raw(pdf_path, pid)

    page_size = get_pdf_page_size(pdf_path, pid)
    page_width = page_size['width']

    # collect start/end positions of sufficiently long lines and take
    # robust quantiles as the main column boundary
    MIN_LINE_CHARS = 30  # skip short lines; they carry little layout signal
    quantile = 0.90
    start_pos_list = []
    end_pos_list = []
    for char_list in char_list_list:
        if len(char_list) < MIN_LINE_CHARS:
            continue
        bbox = get_char_list_bbox(char_list)
        start_pos_list.append(bbox.left())
        end_pos_list.append(bbox.right())

    if len(start_pos_list) == 0 or len(end_pos_list) == 0:
        # it's an empty page: no long line to judge from
        return False

    start_pos = np.percentile(start_pos_list, int((1 - quantile) * 100))
    end_pos = np.percentile(end_pos_list, int(quantile * 100))
    if debug:
        print("The main column boundary {} {}".format(start_pos, end_pos))

    if end_pos < page_width / 2 or start_pos > page_width / 2:
        # only one half of the page has enough lines
        return True

    # count the long lines fully contained in either half of the
    # main column region
    center_pos = (start_pos + end_pos) / 2
    good_line_count = 0
    total_count = 0.0
    for char_list in char_list_list:
        if len(char_list) < MIN_LINE_CHARS:
            continue

        bbox = get_char_list_bbox(char_list)
        if bbox.left() < bbox.right() < center_pos or \
                bbox.right() > bbox.left() > center_pos:
            good_line_count += 1
            if debug:
                print("Good Line {} {}".format(char_list2str(char_list), bbox))
        total_count += 1

    threshold = 0.6
    return float(good_line_count) / total_count > threshold
コード例 #9
0
ファイル: pdfbox_line_ime.py プロジェクト: senyalin/pdfxml
def merge_line_ime(char_list_list):
    """
    Merge the upper/under limit lines of big operators into their line.

    Though it's called IME processing, it only merges based on bind-var
    operators (\\sum, \\prod), no matter whether the formula is IME or
    EME; a better name might be merge_big_op.

    :param char_list_list: list of lines (lists of chars)
    :return: list of lines with operator-limit lines merged in
    """
    # not removing the accent when computing the line bboxes
    line_bbox_list = [get_char_list_bbox(char_list)
                      for char_list in char_list_list]
    # top-to-bottom order; sorted() replaces the Py2-only range().sort()
    cur_line_idx_list = sorted(range(len(char_list_list)),
                               key=lambda lid: -line_bbox_list[lid].top())

    upper_under_line_idx = []  # lines absorbed as operator limits
    line_idx2line_idx_list = {}

    for i, line_idx in enumerate(cur_line_idx_list):
        # every line owns at least itself
        line_idx2line_idx_list[line_idx] = [line_idx]

        # horizontal span covered by bind-var operators on this line,
        # widened by one glyph width on each side
        left_bound, right_bound = 1000000, -1
        for char in char_list_list[line_idx]:
            if not isinstance(char, LTChar):
                continue
            latex_val = get_latex_val_of_lt_char(char)
            if latex_val in ['\\sum', '\\prod']:
                left_bound = min(left_bound,
                                 char.bbox[0] - get_width(char.bbox))
                right_bound = max(right_bound,
                                  char.bbox[2] + get_width(char.bbox))
        if left_bound > right_bound:
            # no bind-var operator on this line
            continue

        # absorb the previous (upper) line when it sits within the span
        if i != 0:
            prev_line_idx = cur_line_idx_list[i - 1]
            prev_bbox = line_bbox_list[prev_line_idx]
            if prev_bbox.left() > left_bound and prev_bbox.right() < right_bound:
                line_idx2line_idx_list[line_idx].append(prev_line_idx)
                upper_under_line_idx.append(prev_line_idx)

        # absorb the next (under) line when it sits within the span
        if i != len(cur_line_idx_list) - 1:
            next_line_idx = cur_line_idx_list[i + 1]
            next_bbox = line_bbox_list[next_line_idx]
            if next_bbox.left() > left_bound and next_bbox.right() < right_bound:
                line_idx2line_idx_list[line_idx].append(next_line_idx)
                upper_under_line_idx.append(next_line_idx)

    res_char_list_list = []
    for line_idx in cur_line_idx_list:
        if line_idx in upper_under_line_idx:
            # already merged into its operator line
            continue
        tmp_char_list = []
        for tmp_li in line_idx2line_idx_list[line_idx]:
            tmp_char_list.extend(char_list_list[tmp_li])
        res_char_list_list.append(tmp_char_list)

    return res_char_list_list