# Ad-hoc debugging script: extract the tables from one page of a test PDF,
# plot the diagnostic data and print the resulting table.
#
# Alternative inputs, kept for quick switching:
#SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf"
#pagenumber = 1
#SelectedPDF = "commodity-prices_en.pdf"
#pagenumber = 1
SelectedPDF = "AnimalExampleTables.pdf"
# 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
pagenumber = 2
filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)

# NOTE(review): `hints` is not assigned anywhere in this fragment; it is
# presumably defined earlier in the file -- confirm before running.
#
# The PDF is opened in a `with` block so the handle is closed even if the
# extraction raises. Everything that might still read from the file stays
# inside the block.
with open(filepath, 'rb') as fh:
    #pta.plotAllPages(fh)
    pdf_page = get_pdf_page(fh, pagenumber)
    table, diagnosticData = page_to_tables(
        pdf_page, extend_y=False, hints=hints, atomise=False)
    fig, ax1 = pta.plotpage(diagnosticData)

    # The dimension summary is written to `result` but never printed here.
    result = StringIO()
    (columns, rows) = get_dimensions(table)
    result.write(" {} columns, {} rows\n".format(columns, rows))

    # Function-call form prints the same text under Python 2 and also
    # parses under Python 3.
    print(to_string(table))
# BoxList = plotAllPages(open(filepath, 'rb'))
def calculate_modal_height(box_list):
    """Return the most common rounded height among the text boxes.

    Only boxes whose ``classname`` is ``'LTTextLineHorizontal'`` or
    ``'LTChar'`` are considered. A box's height is
    ``round(box.bbox[TOP] - box.bbox[BOTTOM])``.

    Args:
        box_list: iterable of objects exposing ``classname`` and ``bbox``.

    Returns:
        The rounded height that occurs most often.

    Raises:
        IndexError: if ``box_list`` contains no box of the two accepted
            classes, because there is then no height to report.
    """
    height_list = [
        round(box.bbox[TOP] - box.bbox[BOTTOM])
        for box in box_list
        if box.classname in ('LTTextLineHorizontal', 'LTChar')
    ]

    # most_common(1) yields [(height, count)], or [] when nothing matched.
    modal_height = Counter(height_list).most_common(1)
    return modal_height[0][0]


def file_handle_from_url(URL, timeout=None):
    """Download ``URL`` and return its body wrapped in a ``StringIO``.

    Args:
        URL: address fetched with ``requests.get``.
        timeout: passed straight through to ``requests.get``. The default
            ``None`` keeps the previous behaviour of waiting indefinitely.

    Returns:
        A ``StringIO`` holding ``response.content``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            so an error page is never handed on as if it were the
            requested file.
    """
    # TODO: move this function to a helper library
    response = requests.get(URL, timeout=timeout)
    response.raise_for_status()
    return StringIO(response.content)


if __name__ == '__main__':
    # Command-line entry point: print every table found in the PDF named
    # by the first argument, or a usage line when no argument is given.

    # Wrap stdout in a UTF-8 codec writer so printed text is encoded as
    # UTF-8 on the way out.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

    if len(sys.argv) > 1:
        from display import to_string

        with open(sys.argv[1], 'rb') as f:
            tables = get_tables(f)
            for i, table in enumerate(tables):
                print("---- TABLE {} ----".format(i + 1))
                print(to_string(table))
    else:
        print("Usage: {} <file.pdf>".format(sys.argv[0]))
# Ad-hoc debugging script (ConfigParameters variant): plot all pages, then
# extract and print the tables from one page of the selected PDF.
#
# NOTE(review): `SelectedPDF`, `pagenumber` and `fh` are used below but are
# not assigned in this fragment (the assignments here are commented out).
# They are presumably defined earlier in the file -- confirm before running.
# The same applies to `table_top_hint` and `table_bottom_hint`.
#SelectedPDF = "AnimalExampleTables.pdf"
# 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
#pagenumber = 2
filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
pta.plotAllPages(fh)

# Open the PDF in a `with` block so the handle given to PDFDocument is
# closed when extraction finishes or fails. Everything that might still
# read from the document stays inside the block.
with open(filepath, 'rb') as pdf_file:
    doc = PDFDocument(pdf_file)
    pdf_page = doc.get_page(pagenumber)
    table, diagnosticData = page_to_tables(
        pdf_page,
        ConfigParameters(
            extend_y=False,
            table_top_hint=table_top_hint,
            table_bottom_hint=table_bottom_hint,
            atomise=False))
    fig, ax1 = pta.plotpage(diagnosticData)

    # The dimension summary is written to `result` but never printed here.
    result = StringIO()
    (columns, rows) = get_dimensions(table)
    result.write(" {} columns, {} rows\n".format(columns, rows))

    # Function-call form prints the same text under Python 2 and also
    # parses under Python 3.
    print(to_string(table))
# BoxList = plotAllPages(open(filepath, 'rb'))