def page_contains_tables(layout, device):
    """Heuristically decide whether a page layout appears to hold a table.

    The leaves collected from ``layout`` are fed through
    ``histogram(Leaf._top).rounder(1)`` (presumably grouping leaves that
    share the same rounded top coordinate -- confirm against ``LeafList``).
    Every histogram entry whose value is greater than
    ``IS_TABLE_COLUMN_COUNT_THRESHOLD`` is counted, and the page is reported
    as containing tables when that count is greater than
    ``IS_TABLE_ROW_COUNT_THRESHOLD``.

    Args:
        layout: Object handed to ``LeafList().populate``.
        device: Not read by this function; it is part of the signature only.

    Returns:
        bool: True when the number of qualifying histogram entries exceeds
        the row threshold.

    Raises:
        AssertionError: If an element of the populated list is not a
            ``Leaf`` (message ``"NOT LEAF"``).
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class. It's
    # silly that we have to care about these (see function signature!!)
    leaves = LeafList().populate(layout)

    # Sanity check: everything collected must be a Leaf.
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)

    # Count the entries populated densely enough to look like a table row.
    dense_row_count = sum(
        1
        for _, leaf_count in top_histogram.items()
        if leaf_count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return dense_row_count > IS_TABLE_ROW_COUNT_THRESHOLD
def page_contains_tables(pdf_page):
    """Heuristically decide whether a PDF page appears to hold a table.

    After checking the argument type, the leaves collected from ``pdf_page``
    are fed through ``histogram(Leaf._top).rounder(1)`` (presumably grouping
    leaves that share the same rounded top coordinate -- confirm against
    ``LeafList``). Every histogram entry whose value is greater than
    ``IS_TABLE_COLUMN_COUNT_THRESHOLD`` is counted, and the page is reported
    as containing tables when that count is greater than
    ``IS_TABLE_ROW_COUNT_THRESHOLD``.

    Args:
        pdf_page: The page to inspect; must be a ``PDFPage`` instance.

    Returns:
        bool: True when the number of qualifying histogram entries exceeds
        the row threshold.

    Raises:
        TypeError: If ``pdf_page`` is not a ``PDFPage``.
        AssertionError: If an element of the populated list is not a
            ``Leaf`` (message ``"NOT LEAF"``).
    """
    # Guard clause: reject anything that is not a PDFPage up front.
    if not isinstance(pdf_page, PDFPage):
        message = "Page must be PDFPage, not {}".format(pdf_page.__class__)
        raise TypeError(message)

    leaves = LeafList().populate(pdf_page)

    # Sanity check: everything collected must be a Leaf.
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)

    # Count the entries populated densely enough to look like a table row.
    dense_row_count = sum(
        1
        for _, leaf_count in top_histogram.items()
        if leaf_count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return dense_row_count > IS_TABLE_ROW_COUNT_THRESHOLD
def page_contains_tables(pdf_page, interpreter, device):
    """Heuristically decide whether a PDF page appears to hold a table.

    The page is run through ``interpreter.process_page`` and the resulting
    layout is fetched from ``device.get_result()``. The leaves collected
    from that layout are fed through ``histogram(Leaf._top).rounder(1)``
    (presumably grouping leaves that share the same rounded top coordinate
    -- confirm against ``LeafList``). Every histogram entry whose value is
    greater than ``IS_TABLE_COLUMN_COUNT_THRESHOLD`` is counted, and the page
    is reported as containing tables when that count is greater than
    ``IS_TABLE_ROW_COUNT_THRESHOLD``.

    Args:
        pdf_page: The page passed to ``interpreter.process_page``.
        interpreter: Object exposing ``process_page(pdf_page)``.
        device: Object exposing ``get_result()``, called after the page has
            been processed.

    Returns:
        bool: True when the number of qualifying histogram entries exceeds
        the row threshold.

    Raises:
        AssertionError: If an element of the populated list is not a
            ``Leaf`` (message ``"NOT LEAF"``).
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class. It's
    # silly that we have to care about these (see function signature!!)
    interpreter.process_page(pdf_page)

    # receive the LTPage object for the page.
    page_layout = device.get_result()
    leaves = LeafList().populate(page_layout)

    # Sanity check: everything collected must be a Leaf.
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)

    # Count the entries populated densely enough to look like a table row.
    dense_row_count = sum(
        1
        for _, leaf_count in top_histogram.items()
        if leaf_count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return dense_row_count > IS_TABLE_ROW_COUNT_THRESHOLD