def test_output_is_correct(self):
    # Loads the "figure.pdf" example twice - once with the default layout
    # parameters and once with la_params={"all_texts": True} - and checks the
    # element texts returned each time.
    pdf_path = os.path.join(
        os.path.dirname(__file__),
        "../../docs/source/example_files/figure.pdf",
    )

    # Without all_texts: only the text outside of the image is expected.
    default_document = load_file(pdf_path)
    default_texts = [item.text() for item in default_document.elements]
    self.assertListEqual(
        default_texts,
        ["Here is some text outside of an image"],
    )

    # With all_texts: the text inside the image is expected as well, first.
    all_texts_document = load_file(pdf_path, la_params={"all_texts": True})
    texts_with_image = [item.text() for item in all_texts_document.elements]
    self.assertListEqual(
        texts_with_image,
        [
            "This is some text in an image",
            "Here is some text outside of an image",
        ],
    )
def main():
    """Parse the PDF named on the command line and write its statement as CSV.

    Reads a single positional argument, ``file_path``, and loads it with
    ``load_file``. When a document is returned, ``TomorrowParser`` is run over
    it and the resulting statement is handed to ``write_csv`` together with
    the input path in which ".pdf" has been replaced by ".csv".

    When ``load_file`` returns ``None``, an error message naming the path is
    printed and the process exits with status -1.
    """
    parser = ArgumentParser()
    parser.add_argument("file_path")
    args = parser.parse_args()

    document = load_file(args.file_path)
    print("Parsing PDF: " + args.file_path)

    if document is not None:
        tomorrow_parser = TomorrowParser(document)
        statement = tomorrow_parser.run()
        write_csv(statement, args.file_path.replace(".pdf", ".csv"))
    else:
        # The positional argument is stored as ``args.file_path``. Reading
        # ``args.filepath`` here raised AttributeError, so the message was
        # never printed and the intended exit status was never used.
        print("Document not found at path [" + args.file_path + "]")
        exit(-1)
def test_visualise(self):
    # Opens the "tables.pdf" example in a PDFVisualiser and passes the
    # visualiser to self.check_images for the initial view ("tables1") and
    # again after invoking the toolbar's "Next page" button ("tables2").
    font_mapping = {
        "BAAAAA+LiberationSerif-Bold,12.0": "header",
        "CAAAAA+LiberationSerif,12.0": "table_element",
    }
    pdf_path = os.path.join(
        os.path.dirname(__file__),
        "../docs/source/example_files/tables.pdf",
    )
    document = load_file(pdf_path, font_mapping=font_mapping)

    visualiser = PDFVisualiser(
        self.root,
        document,
        show_info=True,
        width=1920,
        height=1080,
    )
    self.check_images(visualiser, "tables1")

    # Move on to the following page through the toolbar button itself.
    next_page_button = visualiser.toolbar._buttons["Next page"]
    next_page_button.invoke()
    self.check_images(visualiser, "tables2")
def test_load_file(self):
    # load_file on the bundled "test.pdf" should hand back a PDFDocument.
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, "data", "pdfs", "test.pdf")

    loaded = load_file(pdf_path)

    self.assertIsInstance(loaded, PDFDocument)
def test_load_file_with_text_in_image(self):
    # Loading "image.pdf" with la_params={"all_texts": True} should give a
    # PDFDocument that contains exactly two elements.
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, "data", "pdfs", "image.pdf")
    layout_params = {"all_texts": True}

    loaded = load_file(pdf_path, la_params=layout_params)

    self.assertIsInstance(loaded, PDFDocument)
    element_count = len(loaded.elements)
    self.assertEqual(element_count, 2)
def test_output_is_correct(self):
    # Follows the same steps as the documentation example "simple_memo".

    # Step 1 - Load the document
    memo_path = os.path.join(
        os.path.dirname(__file__),
        "../../docs/source/example_files/simple_memo.pdf",
    )
    document = load_file(memo_path)

    # The document could be inspected visually at this point with
    # py_pdf_parser.visualise.visualise(document).

    def find_label(label_text):
        # The single element whose text is exactly `label_text`.
        matches = document.elements.filter_by_text_equal(label_text)
        return matches.extract_single_element()

    def single_element_right_of(label_element):
        # The single element to the right of `label_element`.
        candidates = document.elements.to_the_right_of(label_element)
        return candidates.extract_single_element()

    # Step 2 - Extract reference elements
    to_label = find_label("TO:")
    from_label = find_label("FROM:")
    date_label = find_label("DATE:")
    subject_label = find_label("SUBJECT:")

    # Step 3 - Extract the data
    to_value = single_element_right_of(to_label).text()
    from_value = single_element_right_of(from_label).text()
    date_value = single_element_right_of(date_label).text()
    subject_value_element = single_element_right_of(subject_label)
    subject_value = subject_value_element.text()

    # The content is the text of every element after the subject label,
    # joined with newlines.
    body_elements = document.elements.after(subject_label)
    body_text = "\n".join(part.text() for part in body_elements)

    actual = {
        "to": to_value,
        "from": from_value,
        "date": date_value,
        "subject": subject_value,
        "content": body_text,
    }

    expected_content = (
        "A new PDF Parsing tool\n"
        "There is a new PDF parsing tool available, called py-pdf-parser - "
        "you should all check it out!\n"
        "I think it could really help you extract that data we need from "
        "those PDFs."
    )
    expected = {
        "content": expected_content,
        "date": "1st January 2020",
        "from": "John Smith",
        "subject": "A new PDF Parsing tool",
        "to": "All Developers",
    }
    self.assertDictEqual(actual, expected)
def test_output_is_correct(self):
    # Extracts each table of the "tables.pdf" example in turn and checks the
    # text grid returned, including the cases that are expected to raise
    # TableExtractionError until extra options are supplied.
    pdf_path = os.path.join(
        os.path.dirname(__file__),
        "../../docs/source/example_files/tables.pdf",
    )

    # Step 1 - Load the file
    font_mapping = {
        "BAAAAA+LiberationSerif-Bold,12.0": "header",
        "CAAAAA+LiberationSerif,12.0": "table_element",
    }
    document = load_file(pdf_path, font_mapping=font_mapping)
    headers = document.elements.filter_by_font("header")

    def header_with_text(title):
        # The single "header" element whose text is exactly `title`.
        return headers.filter_by_text_equal(title).extract_single_element()

    # Extract reference elements
    simple_hdr = header_with_text("Simple Table")
    gaps_hdr = header_with_text("Simple Table with gaps")
    first_row_col_gaps_hdr = header_with_text(
        "Simple Table with gaps in first row/col"
    )
    non_simple_hdr = header_with_text("Non Simple Table")
    merged_cols_hdr = header_with_text("Non Simple Table with Merged Columns")
    merged_rows_cols_hdr = header_with_text(
        "Non Simple Table with Merged Rows and Columns"
    )
    over_the_page_hdr = header_with_text("Over the page")

    # Extract table elements: each table is whatever lies between its own
    # header and the next one; the last table is everything after its header.
    simple_elems = document.elements.between(simple_hdr, gaps_hdr)
    gaps_elems = document.elements.between(gaps_hdr, first_row_col_gaps_hdr)
    first_row_col_gaps_elems = document.elements.between(
        first_row_col_gaps_hdr, non_simple_hdr
    )
    non_simple_elems = document.elements.between(non_simple_hdr, merged_cols_hdr)
    merged_cols_elems = document.elements.between(
        merged_cols_hdr, merged_rows_cols_hdr
    )
    merged_rows_cols_elems = document.elements.between(
        merged_rows_cols_hdr, over_the_page_hdr
    )
    over_the_page_elems = document.elements.after(over_the_page_hdr)

    # Simple Table
    expected_simple = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "A", "1"],
        ["B", "2", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    result = tables.extract_simple_table(simple_elems, as_text=True)
    self.assertListEqual(result, expected_simple)

    # Simple Table with gaps: rejected unless allow_gaps is passed.
    with self.assertRaises(TableExtractionError):
        tables.extract_simple_table(gaps_elems, as_text=True)

    expected_gaps = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "", "1"],
        ["B", "", "", ""],
        ["C", "", "C", "3"],
    ]
    result = tables.extract_simple_table(
        gaps_elems, as_text=True, allow_gaps=True
    )
    self.assertListEqual(result, expected_gaps)

    # Simple Table with gaps in first row/col: allow_gaps alone still raises;
    # a reference element has to be supplied too.
    with self.assertRaises(TableExtractionError):
        tables.extract_simple_table(
            first_row_col_gaps_elems,
            as_text=True,
            allow_gaps=True,
        )

    anchor = first_row_col_gaps_elems[9]
    expected_first_row_col_gaps = [
        ["Heading 1", "Heading 2", "", "Heading 4"],
        ["", "1", "A", ""],
        ["B", "2", "", "2"],
        ["C", "3", "C", "3"],
    ]
    result = tables.extract_simple_table(
        first_row_col_gaps_elems,
        as_text=True,
        allow_gaps=True,
        reference_element=anchor,
    )
    self.assertListEqual(result, expected_first_row_col_gaps)

    # Non Simple Table
    expected_non_simple = [
        ["", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "", "1"],
        ["B", "", "B", "2"],
        ["C", "3", "C", ""],
    ]
    result = tables.extract_table(non_simple_elems, as_text=True)
    self.assertListEqual(result, expected_non_simple)

    # Non Simple Table with Merged Columns: rejected unless
    # fix_element_in_multiple_cols is passed.
    with self.assertRaises(TableExtractionError):
        tables.extract_table(merged_cols_elems, as_text=True)

    expected_merged_cols = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "A", "1"],
        ["This text spans across multiple columns", "", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    result = tables.extract_table(
        merged_cols_elems,
        as_text=True,
        fix_element_in_multiple_cols=True,
    )
    self.assertListEqual(result, expected_merged_cols)

    # Non Simple Table with Merged Rows and Columns
    expected_merged_rows_cols = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        [
            "This text spans across multiple rows and \nmultiple columns.",
            "",
            "A",
            "1",
        ],
        ["", "", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    result = tables.extract_table(
        merged_rows_cols_elems,
        as_text=True,
        fix_element_in_multiple_rows=True,
        fix_element_in_multiple_cols=True,
    )
    self.assertListEqual(result, expected_merged_rows_cols)

    # Over the page
    expected_over_the_page = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "A", "1"],
        ["B", "2", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    result = tables.extract_simple_table(over_the_page_elems, as_text=True)
    self.assertListEqual(result, expected_over_the_page)
def test_output_is_correct(self):
    # Follows the same steps as the documentation example "order_summary".

    # Load the document, giving each font a readable name.
    font_mapping = {
        "BAAAAA+LiberationSerif-Bold,16.0": "title",
        "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
        "CAAAAA+LiberationSerif,12.0": "text",
        "DAAAAA+FreeMonoBold,12.0": "table_header",
        "EAAAAA+FreeMono,12.0": "table_text",
    }
    pdf_path = os.path.join(
        os.path.dirname(__file__),
        "../../docs/source/example_files/order_summary.pdf",
    )
    document = load_file(pdf_path, font_mapping=font_mapping)

    def sub_title_with_text(title):
        # The single "sub_title" element whose text is exactly `title`.
        sub_titles = document.elements.filter_by_font("sub_title")
        return sub_titles.filter_by_text_equal(title).extract_single_element()

    # Add sections: "order_summary" starts at its sub title and stops before
    # the "Totals:" sub title; "totals" runs from there to the last element.
    order_summary_start = sub_title_with_text("Order Summary:")
    totals_start = sub_title_with_text("Totals:")
    last_element = document.elements[-1]

    order_summary_section = document.sectioning.create_section(
        name="order_summary",
        start_element=order_summary_start,
        end_element=totals_start,
        include_last_element=False,
    )
    totals_section = document.sectioning.create_section(
        name="totals",
        start_element=totals_start,
        end_element=last_element,
    )

    # Extract tables from the table-font elements of each section.
    order_summary_cells = order_summary_section.elements.filter_by_fonts(
        "table_header", "table_text"
    )
    order_summary_table = tables.extract_simple_table(
        order_summary_cells, as_text=True
    )
    totals_cells = totals_section.elements.filter_by_fonts(
        "table_header", "table_text"
    )
    totals_table = tables.extract_simple_table(totals_cells, as_text=True)

    order_summary_with_header = tables.add_header_to_table(order_summary_table)

    expected_order_summary = [
        ["Item", "Unit Cost", "Quantity", "Cost"],
        ["Challenger 100g\nWhole Hops", "£3.29", "1", "£3.29"],
        [
            "Maris Otter \nPale Ale Malt \n(Crushed)",
            "£1.50/1000g",
            "4000g",
            "£6.00",
        ],
        ["WLP037 \nYorkshire Ale \nYeast", "£7.08", "1", "£7.08"],
        ["Bottle Caps", "£1 per 100", "500", "£5"],
    ]
    self.assertListEqual(order_summary_table, expected_order_summary)

    expected_totals = [
        ["Subtotal:", "£26.28"],
        ["Shipping", "£6"],
        ["VAT 20%", "£6.45"],
        ["Total:", "£38.73"],
    ]
    self.assertListEqual(totals_table, expected_totals)

    # With the header applied, every data row is a dict keyed by column name.
    expected_with_header = [
        {
            "Item": "Challenger 100g\nWhole Hops",
            "Unit Cost": "£3.29",
            "Quantity": "1",
            "Cost": "£3.29",
        },
        {
            "Item": "Maris Otter \nPale Ale Malt \n(Crushed)",
            "Unit Cost": "£1.50/1000g",
            "Quantity": "4000g",
            "Cost": "£6.00",
        },
        {
            "Item": "WLP037 \nYorkshire Ale \nYeast",
            "Unit Cost": "£7.08",
            "Quantity": "1",
            "Cost": "£7.08",
        },
        {
            "Item": "Bottle Caps",
            "Unit Cost": "£1 per 100",
            "Quantity": "500",
            "Cost": "£5",
        },
    ]
    self.assertListEqual(order_summary_with_header, expected_with_header)
def test_output_is_correct(self):
    # Loads the "grid.pdf" and "columns.pdf" examples with the default, the
    # preset and two custom element orderings, checking the order in which
    # the element texts come back each time.
    examples_dir = os.path.dirname(__file__)

    def texts_of(loaded_document):
        # The text of every element, in document order.
        return [item.text() for item in loaded_document.elements]

    grid_path = os.path.join(
        examples_dir, "../../docs/source/example_files/grid.pdf"
    )

    # Default - left to right, top to bottom
    document = load_file(grid_path)
    self.assertListEqual(
        texts_of(document),
        ["Top Left", "Top Right", "Bottom Left", "Bottom Right"],
    )

    # Preset - right to left, top to bottom
    document = load_file(
        grid_path,
        element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM,
    )
    self.assertListEqual(
        texts_of(document),
        ["Top Right", "Top Left", "Bottom Right", "Bottom Left"],
    )

    # Preset - top to bottom, left to right
    document = load_file(
        grid_path,
        element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT,
    )
    self.assertListEqual(
        texts_of(document),
        ["Bottom Left", "Top Left", "Bottom Right", "Top Right"],
    )

    # Preset - top to bottom, right to left
    document = load_file(
        grid_path,
        element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT,
    )
    self.assertListEqual(
        texts_of(document),
        ["Top Right", "Bottom Right", "Top Left", "Bottom Left"],
    )

    # Custom - elements sorted by x0 first, then by y0
    def ordering_function(elements):
        def by_x_then_y(elem):
            return (elem.x0, elem.y0)

        return sorted(elements, key=by_x_then_y)

    document = load_file(grid_path, element_ordering=ordering_function)
    self.assertListEqual(
        texts_of(document),
        ["Bottom Left", "Top Left", "Bottom Right", "Top Right"],
    )

    # Custom - This PDF has columns!
    # TODO: CHANGE PATH!
    columns_path = os.path.join(
        examples_dir, "../../docs/source/example_files/columns.pdf"
    )

    # Default - left to right, top to bottom
    document = load_file(columns_path)
    self.assertListEqual(
        texts_of(document),
        [
            "Column 1 Title",
            "Column 2 Title",
            "Here is some column 1 text.",
            "Here is some column 2 text.",
            "Col 1 left",
            "Col 1 right",
            "Col 2 left",
            "Col 2 right",
        ],
    )

    # Visualising the document shows that the middle is at around x = 300, so
    # that value is used to tell the two columns apart. Within a column the
    # sort is by descending y0, then by x0.
    def column_ordering_function(elements):
        def column_then_position(elem):
            return (elem.x0 > 300, -elem.y0, elem.x0)

        return sorted(elements, key=column_then_position)

    document = load_file(columns_path, element_ordering=column_ordering_function)
    self.assertListEqual(
        texts_of(document),
        [
            "Column 1 Title",
            "Here is some column 1 text.",
            "Col 1 left",
            "Col 1 right",
            "Column 2 Title",
            "Here is some column 2 text.",
            "Col 2 left",
            "Col 2 right",
        ],
    )