def fixture(filename):
    """
    Return the PDFDocument for fixtures/sample_data/{filename}.

    Results are cached in the module-level ``memoized`` mapping, keyed by
    ``filename``: the first call for a given name opens the file and wraps it
    in a PDFDocument, and every later call returns that same cached object.
    """
    if filename not in memoized:
        module_dir = abspath(dirname(__file__))
        sample_path = pjoin(
            module_dir, "..", "fixtures", "sample_data", filename)
        # The handle is not closed here; it is passed to the PDFDocument
        # that is stored in the cache.
        handle = open(sample_path, "rb")
        memoized[filename] = PDFDocument(handle)

    return memoized[filename]
def render_pdf(pdf_filename):
    """
    Render every page of ``pdf_filename`` to an annotated SVG and PNG.

    For each page the tables are located with ``page_to_tables``, converted
    to annotations with ``make_annotations`` and drawn with ``render_page``.
    Output paths are ``svgs/<name>_<NN>.svg`` and ``pngs/<name>_<NN>.png``,
    where ``<name>`` is the basename of ``pdf_filename`` and ``<NN>`` is the
    zero-based page index padded to two digits. The ``svgs`` and ``pngs``
    directories are not created by this function.

    One "Rendered <svg> <png>" line is printed per page.
    """
    with open(pdf_filename, "rb") as fd:
        doc = PDFDocument.from_fileobj(fd)

        # The basename is the same for every page, so compute it once
        # instead of twice per iteration.
        name = basename(pdf_filename)

        for page_number, page in enumerate(doc.get_pages()):
            svg_file = "svgs/{0}_{1:02d}.svg".format(name, page_number)
            png_file = "pngs/{0}_{1:02d}.png".format(name, page_number)

            table_container = page_to_tables(page)
            annotations = make_annotations(table_container)

            render_page(
                pdf_filename, page_number, annotations, svg_file, png_file)

            # A single parenthesised argument prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print(" ".join(("Rendered", svg_file, png_file)))
def render_pdf(pdf_filename):
    """
    Render every page of ``pdf_filename`` to an annotated SVG and PNG.

    For each page the tables are located with ``page_to_tables``, converted
    to annotations with ``make_annotations`` and drawn with ``render_page``.
    Output paths are ``svgs/<name>_<NN>.svg`` and ``pngs/<name>_<NN>.png``,
    where ``<name>`` is the basename of ``pdf_filename`` and ``<NN>`` is the
    zero-based page index padded to two digits. The ``svgs`` and ``pngs``
    directories are not created by this function.

    One "Rendered <svg> <png>" line is printed per page.
    """
    with open(pdf_filename, "rb") as fd:
        doc = PDFDocument.from_fileobj(fd)

        # The basename is the same for every page, so compute it once
        # instead of twice per iteration.
        name = basename(pdf_filename)

        for page_number, page in enumerate(doc.get_pages()):
            svg_file = 'svgs/{0}_{1:02d}.svg'.format(name, page_number)
            png_file = 'pngs/{0}_{1:02d}.png'.format(name, page_number)

            table_container = page_to_tables(page)
            annotations = make_annotations(table_container)

            render_page(
                pdf_filename, page_number, annotations, svg_file, png_file)

            # A single parenthesised argument prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print(" ".join(("Rendered", svg_file, png_file)))
def check(path):
    """
    Print the tables found on the first page of the PDF at ``path``.

    The file is opened in binary mode, wrapped with
    ``PDFDocument.from_fileobj``, and page 0 is passed to
    ``pdftables.page_to_tables``; the result is printed as-is.
    """
    # The context manager closes the file even if parsing raises. All work
    # that may read from the file, including the print, stays inside it.
    with open(path, "rb") as fileobj:
        doc = PDFDocument.from_fileobj(fileobj)
        tables = pdftables.page_to_tables(doc.get_page(0))
        # Parenthesised single argument: same output on Python 2 and 3.
        print(tables)
#pagenumber = 1 #SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf" #pagenumber = 1 #SelectedPDF = "commodity-prices_en.pdf" #pagenumber = 1 #SelectedPDF = "AnimalExampleTables.pdf" # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf #pagenumber = 2 filepath = os.path.join(PDF_TEST_FILES, SelectedPDF) pta.plotAllPages(fh) doc = PDFDocument(open(filepath, 'rb')) pdf_page = doc.get_page(pagenumber) table, diagnosticData = page_to_tables( pdf_page, ConfigParameters( extend_y=False, table_top_hint=table_top_hint, table_bottom_hint=table_bottom_hint, atomise=False)) fig, ax1 = pta.plotpage(diagnosticData) result = StringIO() (columns, rows) = get_dimensions(table) result.write(" {} columns, {} rows\n".format(columns, rows))
def check(path):
    """
    Print the tables found on the first page of the PDF at ``path``.

    The file is opened in binary mode, wrapped with
    ``PDFDocument.from_fileobj``, and page 0 is passed to
    ``pdftables.page_to_tables``; the result is printed as-is.
    """
    # The context manager closes the file even if parsing raises. All work
    # that may read from the file, including the print, stays inside it.
    with open(path, 'rb') as fileobj:
        doc = PDFDocument.from_fileobj(fileobj)
        tables = pdftables.page_to_tables(doc.get_page(0))
        # Parenthesised single argument: same output on Python 2 and 3.
        print(tables)
from pdftables.pdf_document import PDFDocument as pdfdoc
from pdftables.pdftables import page_to_tables
from pdftables.display import to_string

# Example: print an ASCII rendering of every table found on the first page
# of a PDF.
filepath = 'irregular-verbs-de.pdf'

# The context manager closes the file once the tables have been printed,
# even if parsing raises. Everything that may read from the file stays
# inside the block.
with open(filepath, 'rb') as fileobj:
    doc = pdfdoc.from_fileobj(fileobj)
    page = doc.get_page(0)
    tables = page_to_tables(page)

    for table in tables:
        # Parenthesised single argument: same output on Python 2 and 3.
        print(to_string(table.data))
import pdftables
from pdftables.pdf_document import PDFDocument
from pdftables.pdftables import page_to_tables
from pdftables.display import to_string

filepath = 'CBSinglePage.pdf'

# Open the PDF in binary mode. The context manager closes the file once the
# tables have been printed, even if parsing raises.
with open(filepath, 'rb') as fileobj:
    # Then we create a PDF element from the file object:
    doc = PDFDocument.from_fileobj(fileobj)

    # Then we use the get_page() method to select a single page from the
    # document:
    # NOTE(review): index 12 is requested from a file named "SinglePage" --
    # confirm the page number is intended.
    page = doc.get_page(12)
    tables = page_to_tables(page)

    # Now you have a TableContainer object, you can convert it to ASCII for
    # quick previewing:
    for table in tables:
        # Parenthesised single argument: same output on Python 2 and 3.
        print(to_string(table.data))
#fh = pdftables.filehandleFromURL("http://www.candyusa.com/files/1st%20qtr%202013%20report.pdf") #pagenumber = 1 #SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf" #pagenumber = 1 #SelectedPDF = "commodity-prices_en.pdf" #pagenumber = 1 #SelectedPDF = "AnimalExampleTables.pdf" # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf #pagenumber = 2 filepath = os.path.join(PDF_TEST_FILES, SelectedPDF) pta.plotAllPages(fh) doc = PDFDocument(open(filepath, 'rb')) pdf_page = doc.get_page(pagenumber) table, diagnosticData = page_to_tables( pdf_page, ConfigParameters(extend_y=False, table_top_hint=table_top_hint, table_bottom_hint=table_bottom_hint, atomise=False)) fig, ax1 = pta.plotpage(diagnosticData) result = StringIO() (columns, rows) = get_dimensions(table) result.write(" {} columns, {} rows\n".format(columns, rows))