Beispiel #1
0
def page_contains_tables(layout, device):
    """Heuristically decide whether a page layout holds a table.

    Leaves are collected from ``layout`` via ``LeafList().populate``; a
    histogram keyed on ``Leaf._top`` is built and passed through
    ``rounder(1)``. The result is ``True`` when the number of histogram
    entries whose count exceeds ``IS_TABLE_COLUMN_COUNT_THRESHOLD`` is itself
    greater than ``IS_TABLE_ROW_COUNT_THRESHOLD``.

    ``device`` is accepted but never read by this body.
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class. It's
    # silly that we have to care about these (see function signature!!)
    leaves = LeafList().populate(layout)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top)
    rounded = top_histogram.rounder(1)

    # NOTE(review): presumably each entry is one row position and its count
    # the number of leaves sharing it -- confirm against LeafList.histogram.
    crowded_rows = sum(
        1
        for _, count in rounded.items()
        if count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return crowded_rows > IS_TABLE_ROW_COUNT_THRESHOLD
Beispiel #2
0
def page_contains_tables(pdf_page):
    """Heuristically decide whether ``pdf_page`` holds a table.

    The leaves populated from the page are put into a histogram keyed on
    ``Leaf._top`` and passed through ``rounder(1)``. The result is ``True``
    when more than ``IS_TABLE_ROW_COUNT_THRESHOLD`` histogram entries have a
    count above ``IS_TABLE_COLUMN_COUNT_THRESHOLD``.

    Raises:
        TypeError: if ``pdf_page`` is not a ``PDFPage`` instance.
    """
    if not isinstance(pdf_page, PDFPage):
        message = "Page must be PDFPage, not {}".format(pdf_page.__class__)
        raise TypeError(message)

    leaves = LeafList().populate(pdf_page)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    rounded = leaves.histogram(Leaf._top).rounder(1)

    # Count the histogram entries that are populated densely enough.
    dense_entries = sum(
        1
        for _, count in rounded.items()
        if count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return dense_entries > IS_TABLE_ROW_COUNT_THRESHOLD
Beispiel #3
0
def page_contains_tables(pdf_page):
    """Report whether ``pdf_page`` looks like it contains a table.

    Builds a ``LeafList`` from the page, takes its histogram over
    ``Leaf._top`` (passed through ``rounder(1)``), and counts the entries
    whose value exceeds ``IS_TABLE_COLUMN_COUNT_THRESHOLD``. Returns ``True``
    when that count is greater than ``IS_TABLE_ROW_COUNT_THRESHOLD``.

    Raises:
        TypeError: if ``pdf_page`` is not a ``PDFPage`` instance.
    """
    if not isinstance(pdf_page, PDFPage):
        raise TypeError(
            "Page must be PDFPage, not {}".format(pdf_page.__class__)
        )

    page_leaves = LeafList().populate(pdf_page)
    for candidate in page_leaves:
        assert isinstance(candidate, Leaf), "NOT LEAF"

    top_histogram = page_leaves.histogram(Leaf._top)
    rounded_histogram = top_histogram.rounder(1)

    # Tally entries above the per-entry threshold, then compare the tally
    # against the second threshold.
    qualifying = 0
    for _, occurrences in rounded_histogram.items():
        if occurrences > IS_TABLE_COLUMN_COUNT_THRESHOLD:
            qualifying += 1
    return qualifying > IS_TABLE_ROW_COUNT_THRESHOLD
Beispiel #4
0
def page_contains_tables(pdf_page, interpreter, device):
    """Heuristically decide whether ``pdf_page`` holds a table.

    The page is first run through ``interpreter.process_page``; the resulting
    layout is then fetched with ``device.get_result()``. Leaves populated
    from that layout are put into a histogram keyed on ``Leaf._top`` and
    passed through ``rounder(1)``. The result is ``True`` when more than
    ``IS_TABLE_ROW_COUNT_THRESHOLD`` histogram entries have a count above
    ``IS_TABLE_COLUMN_COUNT_THRESHOLD``.
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class. It's
    # silly that we have to care about these (see function signature!!)

    # The interpreter must process the page before the device is queried:
    # get_result() then yields the LTPage object for the page.
    interpreter.process_page(pdf_page)
    page_layout = device.get_result()

    leaves = LeafList().populate(page_layout)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top)
    rounded = top_histogram.rounder(1)

    dense_entries = sum(
        1
        for _, count in rounded.items()
        if count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return dense_entries > IS_TABLE_ROW_COUNT_THRESHOLD