Beispiel #1
0
def page_to_tables(page, extend_y=False, hints=None, atomise=False):
    """
    Get a rectangular list of list of strings from one page of a document.

    Args:
        page: the page to analyse; must be an ``LTPage`` instance.
        extend_y: when true, the row comb is passed through ``comb_extend``
            together with the lowest ``bottom`` and highest ``top`` of the
            collected boxes, and the diagnostic data carries the
            unfiltered box list instead of the position-filtered one.
        hints: forwarded to ``find_table_bounding_box`` as ``hints``.
            ``None`` (the default) is replaced by a fresh empty list.
        atomise: when true, ``LTChar`` leaves are collected as well; once
            the bounding box is known only ``LTPage``/``LTChar`` leaves are
            kept, and every cell of the result is stripped of leading and
            trailing whitespace.

    Returns:
        A ``(table_array, diagnostic_data)`` tuple. When no table bounding
        box is found, ``table_array`` is an empty list and
        ``diagnostic_data`` is a default ``TableDiagnosticData()``.

    Raises:
        TypeError: if ``page`` is not an ``LTPage``.
    """
    if not isinstance(page, LTPage):
        raise TypeError("Page must be LTPage, not {}".format(page.__class__))

    # A fresh list per call avoids sharing one default list between calls.
    if hints is None:
        hints = []

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    column_threshold = 5  # 3 works for smaller tables
    row_threshold = 3

    if atomise:
        flt = ['LTPage', 'LTTextLineHorizontal', 'LTChar']
    else:
        flt = ['LTPage', 'LTTextLineHorizontal']
    box_list = LeafList().populate(page, flt).purge_empty_text()

    (minx, maxx, miny, maxy) = find_table_bounding_box(box_list, hints=hints)

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print("found no tables")
        return [], TableDiagnosticData()

    if atomise:
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Keep only the boxes inside the bounding box: first by vertical
    # position, then by horizontal position.
    filtered_box_list = filter_box_list_by_position(
        box_list, miny, maxy, Leaf._midline)
    filtered_box_list = filter_box_list_by_position(
        filtered_box_list, minx, maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    erodelevel = int(math.floor(calculate_modal_height(filtered_box_list) / 4))
    row_projection = project_boxes(
        filtered_box_list, "row", erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, row_threshold, "row")
    y_comb.reverse()

    # NOTE: an alternative column threshold considered here was
    # max(len(y_comb) * 0.75, 5).
    x_comb = comb_from_projection(
        column_projection, column_threshold, "column")

    # Force the outermost column boundaries onto the bounding box edges.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if extend_y:
        pageminy = min([box.bottom for box in box_list])
        pagemaxy = max([box.top for box in box_list])
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true.
    # Calling the cell's own strip() works for any string type and always
    # builds real lists, regardless of Python version.
    if atomise:
        table_array = [[cell.strip() for cell in row] for row in table_array]

    diagnostic_data = TableDiagnosticData(
        filtered_box_list,
        column_projection,
        row_projection,
        x_comb,
        y_comb)

    return table_array, diagnostic_data
Beispiel #2
0
def page_to_tables(pdf_page, config=None):
    """
    Get a rectangular list of list of strings from one page of a document.

    Args:
        pdf_page: the page to analyse; must be a ``PDFPage`` instance.
        config: object providing the ``atomise``, ``extend_y``,
            ``table_top_hint`` and ``table_bottom_hint`` attributes read
            here. A falsy value (including the default ``None``) is
            replaced by a default ``ConfigParameters()``.

    Returns:
        A ``(table_array, diagnostic_data)`` tuple. When no table bounding
        box is found, ``table_array`` is an empty list and
        ``diagnostic_data`` is a default ``TableDiagnosticData()``.

    Raises:
        TypeError: if ``pdf_page`` is not a ``PDFPage``.
    """
    if not isinstance(pdf_page, PDFPage):
        raise TypeError("Page must be PDFPage, not {}".format(
            pdf_page.__class__))

    if not config:
        config = ConfigParameters()

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    column_threshold = 5  # 3 works for smaller tables
    row_threshold = 3

    if config.atomise:
        flt = ['LTPage', 'LTTextLineHorizontal', 'LTChar']
    else:
        flt = ['LTPage', 'LTTextLineHorizontal']
    box_list = LeafList().populate(pdf_page, flt).purge_empty_text()

    (minx, maxx, miny, maxy) = find_table_bounding_box(
        box_list, config.table_top_hint, config.table_bottom_hint)

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print("found no tables")
        return [], TableDiagnosticData()

    if config.atomise:
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Keep only the boxes inside the bounding box: first by vertical
    # position, then by horizontal position.
    filtered_box_list = filter_box_list_by_position(box_list, miny, maxy,
                                                    Leaf._midline)
    filtered_box_list = filter_box_list_by_position(filtered_box_list, minx,
                                                    maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    erodelevel = int(math.floor(calculate_modal_height(filtered_box_list) / 4))
    row_projection = project_boxes(filtered_box_list,
                                   "row",
                                   erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, row_threshold, "row")
    y_comb.reverse()

    # NOTE: an alternative column threshold considered here was
    # max(len(y_comb) * 0.75, 5).
    x_comb = comb_from_projection(column_projection, column_threshold,
                                  "column")

    # Force the outermost column boundaries onto the bounding box edges.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if config.extend_y:
        pageminy = min([box.bottom for box in box_list])
        pagemaxy = max([box.top for box in box_list])
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true.
    # Calling the cell's own strip() works for any string type and always
    # builds real lists, regardless of Python version.
    if config.atomise:
        table_array = [[cell.strip() for cell in row] for row in table_array]

    diagnostic_data = TableDiagnosticData(filtered_box_list, column_projection,
                                          row_projection, x_comb, y_comb)

    return table_array, diagnostic_data