def _test_sample_pdf(short_filename):
    """Check the tables extracted from one sample PDF against stored output.

    The tables are obtained via ``get_tables_from_document`` on the fixture
    for ``short_filename``. Their count is compared with
    ``get_expected_number_of_tables(short_filename)``. Each table is then
    rendered with ``to_string``, UTF-8 encoded, written to ``ACTUAL_DIR`` as
    ``<short_filename>_<index>.txt`` and handed, together with the
    same-named file in ``EXPECTED_DIR``, to ``diff_table_files``.
    """
    extracted = get_tables_from_document(fixture(short_filename))
    expected_count = get_expected_number_of_tables(short_filename)
    assert_equal(expected_count, len(extracted))

    for index, current in enumerate(extracted):
        dump_name = "{}_{}.txt".format(short_filename, index)
        expected_path = join(EXPECTED_DIR, dump_name)
        actual_path = join(ACTUAL_DIR, dump_name)

        rendered = to_string(current).encode('utf-8')
        with open(actual_path, 'w') as out:
            out.write(rendered)

        # Compare only after the output file has been closed.
        diff_table_files(expected_path, actual_path)
def _test_sample_pdf(short_filename):
    """Check the tables extracted from one sample PDF against stored output.

    The sample is opened in binary mode from ``SAMPLE_DIR`` and passed to
    ``get_tables``. The table count is compared with
    ``get_expected_number_of_tables(short_filename)``. Each table is then
    rendered with ``to_string``, UTF-8 encoded, written to ``ACTUAL_DIR`` as
    ``<short_filename>_<index>.txt`` and handed, together with the
    same-named file in ``EXPECTED_DIR``, to ``diff_table_files``.
    """
    sample_path = join(SAMPLE_DIR, short_filename)
    with open(sample_path, "rb") as pdf_file:
        # NOTE(review): the sample stays open while the tables are consumed,
        # in case get_tables reads from the handle lazily -- confirm.
        extracted = get_tables(pdf_file)
        expected_count = get_expected_number_of_tables(short_filename)
        assert_equal(expected_count, len(extracted))

        for index, current in enumerate(extracted):
            dump_name = "{}_{}.txt".format(short_filename, index)
            expected_path = join(EXPECTED_DIR, dump_name)
            actual_path = join(ACTUAL_DIR, dump_name)

            rendered = to_string(current).encode("utf-8")
            with open(actual_path, "w") as out:
                out.write(rendered)

            # Compare only after the output file has been closed.
            diff_table_files(expected_path, actual_path)
from pdftables.pdf_document import PDFDocument as pdfdoc
from pdftables.pdftables import page_to_tables
from pdftables.display import to_string

# Example script: print every table that page_to_tables finds on the page
# at index 0 of the given PDF.
filepath = 'irregular-verbs-de.pdf'

# A context manager guarantees the PDF handle is closed, even when parsing
# raises. All work stays inside the block in case the document object reads
# from the handle lazily.
with open(filepath, 'rb') as fileobj:
    doc = pdfdoc.from_fileobj(fileobj)
    page = doc.get_page(0)
    tables = page_to_tables(page)
    for table in tables:
        # Single parenthesised argument: same output on Python 2 and 3.
        print(to_string(table.data))