def test_extract_simple_table(self):
    # Happy path: a rectangular 2x2 grid comes back as two rows of two cells,
    # holding the original elements.
    #
    #     top_left      top_right
    #     bottom_left   bottom_right
    #
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
    grid_cells = (top_left, top_right, bottom_left, bottom_right)

    grid_document = create_pdf_document(elements=[*grid_cells])
    grid_elements = grid_document.elements
    table = extract_simple_table(grid_elements)

    self.assertEqual(len(table), 2)
    for row_index in (0, 1):
        self.assertEqual(len(table[row_index]), 2)
    self.assert_original_element_list_list_equal(
        [[top_left, top_right], [bottom_left, bottom_right]], table
    )

    # Failure path: a fifth element to the right of the bottom row leaves the
    # top row one cell short, so the table is no longer rectangular and
    # extraction must raise.
    #
    #     top_left      top_right
    #     bottom_left   bottom_right   overhang
    #
    overhang = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 0, 5))
    ragged_document = create_pdf_document(elements=[*grid_cells, overhang])
    ragged_elements = ragged_document.elements

    with self.assertRaises(TableExtractionError):
        extract_simple_table(ragged_elements)
def test_extract_text_from_simple_table(self):
    # With as_text=True a 2x2 grid is returned as strings instead of elements.
    #
    #     top_left      top_right
    #     bottom_left   bottom_right
    #
    # The bottom-right text ends with a space so the effect of strip_text can
    # be observed in the two calls below.
    top_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 6, 10), text="fake_text_1"
    )
    top_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 6, 10), text="fake_text_2"
    )
    bottom_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_3"
    )
    bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_4 "
    )

    grid_document = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right]
    )
    cells = grid_document.elements

    # Without a strip_text argument the trailing space is expected to be gone.
    stripped = extract_simple_table(cells, as_text=True)
    self.assertEqual(len(stripped), 2)
    for row_index in (0, 1):
        self.assertEqual(len(stripped[row_index]), 2)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4"]],
        stripped,
    )

    # With strip_text=False the trailing space must survive.
    unstripped = extract_simple_table(cells, as_text=True, strip_text=False)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4 "]],
        unstripped,
    )
def test_extract_simple_table_with_tolerance(self):
    # A 2x2 grid in which the bottom-right element slightly overlaps the
    # top-right one (its box reaches 6.1 rather than 5):
    #
    #     top_left      top_right
    #     bottom_left   bottom_right
    #
    # Extraction is expected to fail with the default settings and to succeed
    # once a tolerance is supplied.
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 6.1))

    overlapping_document = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right]
    )
    cells = overlapping_document.elements

    with self.assertRaises(TableExtractionError):
        extract_simple_table(cells)

    table = extract_simple_table(cells, tolerance=0.2)

    self.assertEqual(len(table), 2)
    for row_index in (0, 1):
        self.assertEqual(len(table[row_index]), 2)
    self.assert_original_element_list_list_equal(
        [[top_left, top_right], [bottom_left, bottom_right]], table
    )
def test_extract_simple_table_with_gaps_and_wrong_reference(self):
    # A table whose second row is missing its last cell:
    #
    #     first   second   third
    #     fourth  fifth
    #
    # Even with allow_gaps=True, extraction is expected to raise when `third`
    # is passed as the reference element.
    first = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    second = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    third = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
    fourth = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    fifth = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    gappy_document = create_pdf_document(
        elements=[first, second, third, fourth, fifth]
    )
    cells = gappy_document.elements

    # Look up the document's counterpart of `third` to use as the reference.
    wrong_reference = self.extract_element_from_list(third, cells)

    with self.assertRaises(TableExtractionError):
        extract_simple_table(
            cells, allow_gaps=True, reference_element=wrong_reference
        )
def test_extract_simple_table_from_different_pages(self):
    # Two identical 2x2 grids, one per page, are expected to be extracted as a
    # single four-row table with page 1's rows first.
    #
    # Page 1:
    #     page_1_top_left      page_1_top_right
    #     page_1_bottom_left   page_1_bottom_right
    #
    # Page 2:
    #     page_2_top_left      page_2_top_right
    #     page_2_bottom_left   page_2_bottom_right
    #
    page_1_top_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 6, 10)
    )
    page_1_top_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 6, 10)
    )
    page_1_bottom_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 0, 5)
    )
    page_1_bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5)
    )

    page_2_top_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 6, 10)
    )
    page_2_top_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 6, 10)
    )
    page_2_bottom_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 0, 5)
    )
    page_2_bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5)
    )

    # A dict keyed by page number places each grid on its own page.
    elements_by_page = {
        1: [
            page_1_top_left,
            page_1_top_right,
            page_1_bottom_left,
            page_1_bottom_right,
        ],
        2: [
            page_2_top_left,
            page_2_top_right,
            page_2_bottom_left,
            page_2_bottom_right,
        ],
    }
    two_page_document = create_pdf_document(elements=elements_by_page)
    cells = two_page_document.elements

    table = extract_simple_table(cells)

    self.assertEqual(len(table), 4)
    for row_index in range(4):
        self.assertEqual(len(table[row_index]), 2)
    self.assert_original_element_list_list_equal(
        [
            [page_1_top_left, page_1_top_right],
            [page_1_bottom_left, page_1_bottom_right],
            [page_2_top_left, page_2_top_right],
            [page_2_bottom_left, page_2_bottom_right],
        ],
        table,
    )
def test_extract_simple_table_with_gaps(self):
    # A table whose second row is missing its last cell:
    #
    #     first   second   third
    #     fourth  fifth
    #
    # With allow_gaps=True the missing cell is expected to be reported as None.
    first = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    second = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    third = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
    fourth = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    fifth = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    gappy_document = create_pdf_document(
        elements=[first, second, third, fourth, fifth]
    )
    cells = gappy_document.elements

    table = extract_simple_table(cells, allow_gaps=True)

    self.assertEqual(len(table), 2)
    for row_index in (0, 1):
        self.assertEqual(len(table[row_index]), 3)
    self.assert_original_element_list_list_equal(
        [[first, second, third], [fourth, fifth, None]], table
    )
def test_extract_simple_table_removing_duplicate_header_different_fonts_or_text(
    self,
):
    # Three header-like rows, where rows two and three each differ from the
    # first row (by font size and by text respectively). With
    # remove_duplicate_header_rows=True none of them may be dropped.
    #
    #     first_row_left                 first_row_right
    #     second_row_left_larger_font    second_row_right
    #     third_row_left_other_text      third_row_right
    #
    def make_header(text, bounding_box, font_size=10):
        # Every cell in this test shares the same font name.
        return FakePDFMinerTextElement(
            text=text,
            font_name="header font",
            font_size=font_size,
            bounding_box=bounding_box,
        )

    first_row_left = make_header("header 1", BoundingBox(0, 5, 21, 25))
    first_row_right = make_header("header 2", BoundingBox(11, 15, 21, 25))
    second_row_left_larger_font = make_header(
        "header 1", BoundingBox(0, 5, 16, 20), font_size=12
    )
    # NOTE(review): this cell's text is "header 1" rather than "header 2", so
    # the second row differs from the first in text as well as in font size -
    # confirm that is intended.
    second_row_right = make_header("header 1", BoundingBox(11, 15, 16, 20))
    third_row_left_other_text = make_header(
        "header with a different name", BoundingBox(0, 5, 11, 15)
    )
    third_row_right = make_header("header 2", BoundingBox(11, 15, 11, 15))

    header_document = create_pdf_document(
        elements=[
            first_row_left,
            first_row_right,
            second_row_left_larger_font,
            second_row_right,
            third_row_left_other_text,
            third_row_right,
        ]
    )
    cells = header_document.elements

    table = extract_simple_table(cells, remove_duplicate_header_rows=True)

    self.assertEqual(len(table), 3)
    for row_index in range(3):
        self.assertEqual(len(table[row_index]), 2)
    self.assert_original_element_list_list_equal(
        [
            [first_row_left, first_row_right],
            [second_row_left_larger_font, second_row_right],
            [third_row_left_other_text, third_row_right],
        ],
        table,
    )
def test_extract_simple_table_removing_duplicate_header_rows(self):
    def make_header(text, bounding_box):
        # All header cells in this test share one font name and size.
        return FakePDFMinerTextElement(
            text=text,
            font_name="header font",
            font_size=10,
            bounding_box=bounding_box,
        )

    # Scenario 1: a table consisting of the header row only.
    #
    #     first_header_left   first_header_right
    #
    first_header_left = make_header("header 1", BoundingBox(0, 5, 21, 25))
    first_header_right = make_header("header 2", BoundingBox(6, 10, 21, 25))

    single_row_document = create_pdf_document(
        elements=[first_header_left, first_header_right]
    )
    single_row_cells = single_row_document.elements

    single_row_table = extract_simple_table(
        single_row_cells, remove_duplicate_header_rows=True
    )

    # Extraction here should just return the whole table, as a single-line
    # table cannot contain duplicates of its own header.
    self.assertEqual(len(single_row_table), 1)
    self.assertEqual(len(single_row_table[0]), 2)
    self.assert_original_element_list_list_equal(
        [[first_header_left, first_header_right]], single_row_table
    )

    # Scenario 2: the header row is repeated twice further down the table, and
    # both repeats are expected to be removed.
    #
    #     first_header_left    first_header_right
    #     upper_body_left      upper_body_right
    #     second_header_left   second_header_right
    #     lower_body_left      lower_body_right
    #     third_header_left    third_header_right
    #
    upper_body_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 16, 20)
    )
    upper_body_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 16, 20)
    )
    second_header_left = make_header("header 1", BoundingBox(0, 5, 11, 15))
    second_header_right = make_header("header 2", BoundingBox(6, 10, 11, 15))
    lower_body_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 6, 10)
    )
    lower_body_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 6, 10)
    )
    third_header_left = make_header("header 1", BoundingBox(0, 5, 0, 5))
    third_header_right = make_header("header 2", BoundingBox(6, 10, 0, 5))

    repeated_header_document = create_pdf_document(
        elements=[
            first_header_left,
            first_header_right,
            upper_body_left,
            upper_body_right,
            second_header_left,
            second_header_right,
            lower_body_left,
            lower_body_right,
            third_header_left,
            third_header_right,
        ]
    )
    repeated_header_cells = repeated_header_document.elements

    deduplicated_table = extract_simple_table(
        repeated_header_cells, remove_duplicate_header_rows=True
    )

    self.assertEqual(len(deduplicated_table), 3)
    for row_index in range(3):
        self.assertEqual(len(deduplicated_table[row_index]), 2)
    self.assert_original_element_list_list_equal(
        [
            [first_header_left, first_header_right],
            [upper_body_left, upper_body_right],
            [lower_body_left, lower_body_right],
        ],
        deduplicated_table,
    )
def test_output_is_correct(self):
    # End-to-end check against the example file tables.pdf: each bold header
    # in the file introduces one table, and the elements between two
    # consecutive headers are extracted and compared with the expected text.
    file_path = os.path.join(
        os.path.dirname(__file__), "../../docs/source/example_files/tables.pdf"
    )

    # Step 1 - Load the file
    font_mapping = {
        "BAAAAA+LiberationSerif-Bold,12.0": "header",
        "CAAAAA+LiberationSerif,12.0": "table_element",
    }
    document = load_file(file_path, font_mapping=font_mapping)
    headers = document.elements.filter_by_font("header")

    def header_titled(title):
        # Exactly one header element is expected per title.
        return headers.filter_by_text_equal(title).extract_single_element()

    # Extract reference elements
    simple_header = header_titled("Simple Table")
    gaps_header = header_titled("Simple Table with gaps")
    gaps_in_first_row_col_header = header_titled(
        "Simple Table with gaps in first row/col"
    )
    non_simple_header = header_titled("Non Simple Table")
    merged_cols_header = header_titled("Non Simple Table with Merged Columns")
    merged_rows_and_cols_header = header_titled(
        "Non Simple Table with Merged Rows and Columns"
    )
    over_the_page_header = header_titled("Over the page")

    # Extract table elements
    simple_elements = document.elements.between(simple_header, gaps_header)
    gaps_elements = document.elements.between(
        gaps_header, gaps_in_first_row_col_header
    )
    gaps_in_first_row_col_elements = document.elements.between(
        gaps_in_first_row_col_header, non_simple_header
    )
    non_simple_elements = document.elements.between(
        non_simple_header, merged_cols_header
    )
    merged_cols_elements = document.elements.between(
        merged_cols_header, merged_rows_and_cols_header
    )
    merged_rows_and_cols_elements = document.elements.between(
        merged_rows_and_cols_header, over_the_page_header
    )
    over_the_page_elements = document.elements.after(over_the_page_header)

    # Simple Table
    expected_simple = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "A", "1"],
        ["B", "2", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    table = tables.extract_simple_table(simple_elements, as_text=True)
    self.assertListEqual(table, expected_simple)

    # Simple Table with gaps: must fail unless gaps are explicitly allowed.
    with self.assertRaises(TableExtractionError):
        tables.extract_simple_table(gaps_elements, as_text=True)

    expected_with_gaps = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "", "1"],
        ["B", "", "", ""],
        ["C", "", "C", "3"],
    ]
    table = tables.extract_simple_table(
        gaps_elements, as_text=True, allow_gaps=True
    )
    self.assertListEqual(table, expected_with_gaps)

    # Simple Table with gaps in first row/col: allow_gaps alone is not enough;
    # an explicit reference element has to be supplied as well.
    with self.assertRaises(TableExtractionError):
        tables.extract_simple_table(
            gaps_in_first_row_col_elements,
            as_text=True,
            allow_gaps=True,
        )

    reference_element = gaps_in_first_row_col_elements[9]
    expected_with_gaps_in_first_row_col = [
        ["Heading 1", "Heading 2", "", "Heading 4"],
        ["", "1", "A", ""],
        ["B", "2", "", "2"],
        ["C", "3", "C", "3"],
    ]
    table = tables.extract_simple_table(
        gaps_in_first_row_col_elements,
        as_text=True,
        allow_gaps=True,
        reference_element=reference_element,
    )
    self.assertListEqual(table, expected_with_gaps_in_first_row_col)

    # Non Simple Table
    expected_non_simple = [
        ["", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "", "1"],
        ["B", "", "B", "2"],
        ["C", "3", "C", ""],
    ]
    table = tables.extract_table(non_simple_elements, as_text=True)
    self.assertListEqual(table, expected_non_simple)

    # Non Simple Table with Merged Columns: must fail unless elements spanning
    # several columns are explicitly fixed.
    with self.assertRaises(TableExtractionError):
        tables.extract_table(merged_cols_elements, as_text=True)

    expected_merged_cols = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "A", "1"],
        ["This text spans across multiple columns", "", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    table = tables.extract_table(
        merged_cols_elements,
        as_text=True,
        fix_element_in_multiple_cols=True,
    )
    self.assertListEqual(table, expected_merged_cols)

    # Non Simple Table with Merged Rows and Columns
    expected_merged_rows_and_cols = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        [
            "This text spans across multiple rows and \nmultiple columns.",
            "",
            "A",
            "1",
        ],
        ["", "", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    table = tables.extract_table(
        merged_rows_and_cols_elements,
        as_text=True,
        fix_element_in_multiple_rows=True,
        fix_element_in_multiple_cols=True,
    )
    self.assertListEqual(table, expected_merged_rows_and_cols)

    # Over the page
    expected_over_the_page = [
        ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
        ["A", "1", "A", "1"],
        ["B", "2", "B", "2"],
        ["C", "3", "C", "3"],
    ]
    table = tables.extract_simple_table(over_the_page_elements, as_text=True)
    self.assertListEqual(table, expected_over_the_page)
def test_output_is_correct(self):
    # The code below should match that in the documentation example
    # "order_summary"; step numbers refer to that example.

    # Step 1 - Load the document
    file_path = os.path.join(
        os.path.dirname(__file__),
        "../../docs/source/example_files/order_summary.pdf",
    )

    FONT_MAPPING = {
        "BAAAAA+LiberationSerif-Bold,16.0": "title",
        "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
        "CAAAAA+LiberationSerif,12.0": "text",
        "DAAAAA+FreeMonoBold,12.0": "table_header",
        "EAAAAA+FreeMono,12.0": "table_text",
    }
    document = load_file(file_path, font_mapping=FONT_MAPPING)

    # visualise(document)

    # Step 3 - Add sections
    def sub_title_element(text):
        # Exactly one sub-title element is expected for the given text.
        sub_titles = document.elements.filter_by_font("sub_title")
        return sub_titles.filter_by_text_equal(text).extract_single_element()

    order_summary_sub_title_element = sub_title_element("Order Summary:")
    totals_sub_title_element = sub_title_element("Totals:")
    final_element = document.elements[-1]

    # The order summary section stops just before the "Totals:" sub-title...
    order_summary_section = document.sectioning.create_section(
        name="order_summary",
        start_element=order_summary_sub_title_element,
        end_element=totals_sub_title_element,
        include_last_element=False,
    )
    # ...and the totals section runs from there to the last element.
    totals_section = document.sectioning.create_section(
        name="totals",
        start_element=totals_sub_title_element,
        end_element=final_element,
    )

    # visualise(document)

    # Step 4 - Extract tables
    table_fonts = ("table_header", "table_text")
    order_summary_table = tables.extract_simple_table(
        order_summary_section.elements.filter_by_fonts(*table_fonts),
        as_text=True,
    )
    totals_table = tables.extract_simple_table(
        totals_section.elements.filter_by_fonts(*table_fonts),
        as_text=True,
    )
    order_summary_with_header = tables.add_header_to_table(order_summary_table)

    expected_order_summary = [
        ["Item", "Unit Cost", "Quantity", "Cost"],
        ["Challenger 100g\nWhole Hops", "£3.29", "1", "£3.29"],
        [
            "Maris Otter \nPale Ale Malt \n(Crushed)",
            "£1.50/1000g",
            "4000g",
            "£6.00",
        ],
        ["WLP037 \nYorkshire Ale \nYeast", "£7.08", "1", "£7.08"],
        ["Bottle Caps", "£1 per 100", "500", "£5"],
    ]
    self.assertListEqual(order_summary_table, expected_order_summary)

    expected_totals = [
        ["Subtotal:", "£26.28"],
        ["Shipping", "£6"],
        ["VAT 20%", "£6.45"],
        ["Total:", "£38.73"],
    ]
    self.assertListEqual(totals_table, expected_totals)

    # add_header_to_table is expected to turn every non-header row into a dict
    # keyed by the first row's cells.
    expected_order_summary_with_header = [
        {
            "Item": "Challenger 100g\nWhole Hops",
            "Unit Cost": "£3.29",
            "Quantity": "1",
            "Cost": "£3.29",
        },
        {
            "Item": "Maris Otter \nPale Ale Malt \n(Crushed)",
            "Unit Cost": "£1.50/1000g",
            "Quantity": "4000g",
            "Cost": "£6.00",
        },
        {
            "Item": "WLP037 \nYorkshire Ale \nYeast",
            "Unit Cost": "£7.08",
            "Quantity": "1",
            "Cost": "£7.08",
        },
        {
            "Item": "Bottle Caps",
            "Unit Cost": "£1 per 100",
            "Quantity": "500",
            "Cost": "£5",
        },
    ]
    self.assertListEqual(
        order_summary_with_header, expected_order_summary_with_header
    )