def test_fix_element_in_multiple_cols(self):
    # Checks that the following table is correctly extracted once
    # fix_element_in_multiple_cols is enabled (element 1 spans both columns):
    #
    #     ---------
    #     |   1   |
    #     ---------
    #     | 2 | 3 |
    #     ---------
    #
    # The bounding boxes below are consistent with an (x0, x1, y0, y1) argument
    # order: elem_1 covers x 0..10 on the top row, elem_2 and elem_3 share the
    # bottom row.
    elem_1 = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 10, 6, 10), text="fake_text_1"
    )
    elem_2 = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_2"
    )
    elem_3 = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_3"
    )

    document = create_pdf_document(elements=[elem_1, elem_2, elem_3])
    elem_list = document.elements

    # Without the fix the extraction must fail. The return value is not bound
    # because the call is expected to raise.
    with self.assertRaises(TableExtractionError):
        extract_table(elem_list, as_text=True)

    result = extract_table(
        elem_list, as_text=True, fix_element_in_multiple_cols=True
    )
    self.assertEqual(len(result), 2)
    self.assertEqual(len(result[0]), 2)
    self.assertEqual(len(result[1]), 2)
    # The wide element is reported in the first column only; the second cell of
    # its row is expected to be the empty string.
    self.assertListEqual(
        [["fake_text_1", ""], ["fake_text_2", "fake_text_3"]], result
    )
def test_extract_table_with_tolerance(self):
    # A plain 2*2 grid:
    #
    #     top_left       top_right
    #     bottom_left    bottom_right
    #
    # bottom_right's box ends at 6.1 on its second axis while top_right's
    # starts at 6, so the two overlap slightly. Extraction is expected to fail
    # by default and to succeed once a tolerance of 0.2 is supplied.
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 6.1))

    grid = [top_left, top_right, bottom_left, bottom_right]
    elem_list = create_pdf_document(elements=grid).elements

    # Default settings: the overlap makes the layout ambiguous.
    with self.assertRaises(TableExtractionError):
        extract_table(elem_list)

    extracted = extract_table(elem_list, tolerance=0.2)

    self.assertEqual(len(extracted), 2)
    for row in extracted:
        self.assertEqual(len(row), 2)
    self.assert_original_element_list_list_equal(
        [[top_left, top_right], [bottom_left, bottom_right]], extracted
    )
def test_extract_table(self):
    # Bounding boxes in this test are consistent with an (x0, x1, y0, y1)
    # argument order: the first pair selects the column, the second the row.

    # Checks that simple 2*2 table is correctly extracted
    #
    #       elem_1      elem_2
    #       elem_3      elem_4
    #
    elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4])
    elem_list = document.elements

    result = extract_table(elem_list)
    self.assertEqual(len(result), 2)
    self.assertEqual(len(result[0]), 2)
    self.assertEqual(len(result[1]), 2)
    self.assert_original_element_list_list_equal(
        [[elem_1, elem_2], [elem_3, elem_4]], result
    )

    # Checks that the following table is correctly extracted. Empty cells are
    # expected to come back as None:
    #
    #       elem_1      elem_2                  elem_6
    #       elem_3      elem_4      elem_5
    #
    elem_5 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 0, 5))
    elem_6 = FakePDFMinerTextElement(bounding_box=BoundingBox(16, 20, 6, 10))
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
    )
    elem_list = document.elements

    result = extract_table(elem_list)
    self.assertEqual(len(result), 2)
    self.assertEqual(len(result[0]), 4)
    self.assertEqual(len(result[1]), 4)
    self.assert_original_element_list_list_equal(
        [[elem_1, elem_2, None, elem_6], [elem_3, elem_4, elem_5, None]], result
    )

    # Checks that it raises an error if one element is in two columns:
    # the replacement elem_2 covers 3..8 on the first axis and therefore
    # straddles the 0..5 and 6..10 columns.
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(3, 8, 6, 10))
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
    )
    elem_list = document.elements
    with self.assertRaises(TableExtractionError):
        extract_table(elem_list)

    # Checks that it raises an error if one element is in two rows:
    # the replacement elem_2 covers 3..8 on the second axis and therefore
    # straddles the 0..5 and 6..10 rows.
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 3, 8))
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
    )
    elem_list = document.elements
    with self.assertRaises(TableExtractionError):
        extract_table(elem_list)
def test_extract_table_from_different_pages(self):
    # Two pages, each holding the same simple 2*2 grid:
    #
    #   Page 1:                       Page 2:
    #       p1[0]      p1[1]              p2[0]      p2[1]
    #       p1[2]      p1[3]              p2[2]      p2[3]
    #
    # Extracting over the elements of both pages is expected to give the rows
    # of page 1 followed by the rows of page 2.

    def build_page():
        # Elements in reading order: top-left, top-right, bottom-left,
        # bottom-right.
        return [
            FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)),
            FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)),
            FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5)),
            FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5)),
        ]

    first_page = build_page()
    second_page = build_page()

    document = create_pdf_document(elements={1: first_page, 2: second_page})
    extracted = extract_table(document.elements)

    self.assertEqual(len(extracted), 4)
    for row in extracted:
        self.assertEqual(len(row), 2)

    expected_rows = [
        first_page[0:2],
        first_page[2:4],
        second_page[0:2],
        second_page[2:4],
    ]
    self.assert_original_element_list_list_equal(expected_rows, extracted)
def test_extract_text_from_table(self):
    # Covers as_text=True, with and without strip_text, on two layouts.
    # The fourth element's text carries a trailing space so the effect of
    # strip_text is visible in the output.

    def text_cell(text, x0, x1, y0, y1):
        return FakePDFMinerTextElement(
            bounding_box=BoundingBox(x0, x1, y0, y1), text=text
        )

    # Layout 1 - plain 2*2 grid:
    #
    #       cell_1      cell_2
    #       cell_3      cell_4
    #
    cell_1 = text_cell("fake_text_1", 0, 5, 6, 10)
    cell_2 = text_cell("fake_text_2", 6, 10, 6, 10)
    cell_3 = text_cell("fake_text_3", 0, 5, 0, 5)
    cell_4 = text_cell("fake_text_4 ", 6, 10, 0, 5)
    small_grid = [cell_1, cell_2, cell_3, cell_4]

    elem_list = create_pdf_document(elements=small_grid).elements

    stripped = extract_table(elem_list, as_text=True)
    self.assertEqual(len(stripped), 2)
    for row in stripped:
        self.assertEqual(len(row), 2)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4"]], stripped
    )

    unstripped = extract_table(elem_list, as_text=True, strip_text=False)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4 "]],
        unstripped,
    )

    # Layout 2 - two extra columns, each filled in one row only. Missing cells
    # are expected to be reported as empty strings:
    #
    #       cell_1      cell_2                  cell_6
    #       cell_3      cell_4      cell_5
    #
    cell_5 = text_cell("fake_text_5", 11, 15, 0, 5)
    cell_6 = text_cell("fake_text_6", 16, 20, 6, 10)

    wide_grid = small_grid + [cell_5, cell_6]
    elem_list = create_pdf_document(elements=wide_grid).elements

    stripped = extract_table(elem_list, as_text=True)
    self.assertEqual(len(stripped), 2)
    for row in stripped:
        self.assertEqual(len(row), 4)
    self.assertListEqual(
        [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4", "fake_text_5", ""],
        ],
        stripped,
    )

    unstripped = extract_table(elem_list, as_text=True, strip_text=False)
    self.assertListEqual(
        [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4 ", "fake_text_5", ""],
        ],
        unstripped,
    )
def test_extract_table_removing_duplicate_header_different_fonts_or_text(self):
    # Rows that look like the header but differ from it in font or in text
    # must be kept when remove_duplicate_header_rows is set:
    #
    #       header_elem_1                    header_elem_2
    #       header_elem_3_different_font     header_elem_4
    #       header_elem_5_different_text     header_elem_6
    #
    header_elem_1 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 21, 25),
    )
    header_elem_2 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 21, 25),
    )

    # Second row: same texts as the header, but the first cell uses a
    # different font size.
    header_elem_3_different_font = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=12,
        bounding_box=BoundingBox(0, 5, 16, 20),
    )
    # This cell sits under "header 2" and must carry the same text, so that
    # the font of the first cell is the only difference from the header row.
    # It previously read "header 1", which made the row differ by text as well
    # and left the font comparison untested.
    # NOTE(review): relies on extract_table comparing fonts when looking for
    # duplicate header rows, as the test name implies - confirm.
    header_elem_4 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 16, 20),
    )

    # Third row: same font as the header, but the first cell has another text.
    header_elem_5_different_text = FakePDFMinerTextElement(
        text="header with a different name",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 11, 15),
    )
    header_elem_6 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 11, 15),
    )

    document = create_pdf_document(
        elements=[
            header_elem_1,
            header_elem_2,
            header_elem_3_different_font,
            header_elem_4,
            header_elem_5_different_text,
            header_elem_6,
        ]
    )
    elem_list = document.elements

    result = extract_table(elem_list, remove_duplicate_header_rows=True)

    # Nothing may be dropped: all three rows are expected in the output.
    self.assertEqual(len(result), 3)
    self.assertEqual(len(result[0]), 2)
    self.assertEqual(len(result[1]), 2)
    self.assertEqual(len(result[2]), 2)
    self.assert_original_element_list_list_equal(
        [
            [header_elem_1, header_elem_2],
            [header_elem_3_different_font, header_elem_4],
            [header_elem_5_different_text, header_elem_6],
        ],
        result,
    )
def test_extract_table_removing_duplicate_header_rows(self):
    # Exercises remove_duplicate_header_rows on a single-row table and on a
    # table in which the header is repeated further down.

    def header_cell(text, x0, x1, y0, y1):
        return FakePDFMinerTextElement(
            text=text,
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(x0, x1, y0, y1),
        )

    def plain_cell(x0, x1, y0, y1):
        return FakePDFMinerTextElement(bounding_box=BoundingBox(x0, x1, y0, y1))

    # Case 1 - the table is only the header row:
    #
    #       top_header_1                top_header_2
    #
    top_header_1 = header_cell("header 1", 0, 5, 21, 25)
    top_header_2 = header_cell("header 2", 11, 15, 21, 25)

    elem_list = create_pdf_document(elements=[top_header_1, top_header_2]).elements
    extracted = extract_table(elem_list, remove_duplicate_header_rows=True)

    # A one-line table cannot contain a duplicate of itself, so the whole table
    # is expected back unchanged.
    self.assertEqual(len(extracted), 1)
    self.assertEqual(len(extracted[0]), 2)
    self.assert_original_element_list_list_equal(
        [[top_header_1, top_header_2]], extracted
    )

    # Case 2 - the elements occupy three distinct ranges on the first axis
    # (0..5, 6..10 and 11..15):
    #
    #       top_header_1                    top_header_2
    #       body_1          body_2
    #       mid_header_1                    mid_header_2
    #       body_3                          body_4
    #       last_header_1   last_header_2
    #
    body_1 = plain_cell(0, 5, 16, 20)
    body_2 = plain_cell(6, 10, 16, 20)
    mid_header_1 = header_cell("header 1", 0, 5, 11, 15)
    mid_header_2 = header_cell("header 2", 11, 15, 11, 15)
    body_3 = plain_cell(0, 5, 6, 10)
    body_4 = plain_cell(11, 15, 6, 10)
    last_header_1 = header_cell("header 1", 0, 5, 0, 5)
    last_header_2 = header_cell("header 2", 6, 10, 0, 5)

    all_cells = [
        top_header_1,
        top_header_2,
        body_1,
        body_2,
        mid_header_1,
        mid_header_2,
        body_3,
        body_4,
        last_header_1,
        last_header_2,
    ]
    elem_list = create_pdf_document(elements=all_cells).elements
    extracted = extract_table(elem_list, remove_duplicate_header_rows=True)

    # The middle header row is expected to be dropped. The last row will not be
    # removed as the gaps do not match the header row.
    self.assertEqual(len(extracted), 4)
    for row in extracted:
        self.assertEqual(len(row), 3)
    self.assert_original_element_list_list_equal(
        [
            [top_header_1, None, top_header_2],
            [body_1, body_2, None],
            [body_3, None, body_4],
            [last_header_1, last_header_2, None],
        ],
        extracted,
    )
def test_output_is_correct(self):
    # End-to-end check against the example tables.pdf: every section of the
    # document is located through its bold header, its elements are sliced out
    # and the extracted text is compared with the expected grid.
    pdf_path = os.path.join(
        os.path.dirname(__file__), "../../docs/source/example_files/tables.pdf"
    )

    # Step 1 - Load the file
    font_mapping = {
        "BAAAAA+LiberationSerif-Bold,12.0": "header",
        "CAAAAA+LiberationSerif,12.0": "table_element",
    }
    document = load_file(pdf_path, font_mapping=font_mapping)
    headers = document.elements.filter_by_font("header")

    def find_header(title):
        return headers.filter_by_text_equal(title).extract_single_element()

    # Step 2 - Locate the header of every section
    simple_header = find_header("Simple Table")
    gaps_header = find_header("Simple Table with gaps")
    edge_gaps_header = find_header("Simple Table with gaps in first row/col")
    non_simple_header = find_header("Non Simple Table")
    merged_cols_header = find_header("Non Simple Table with Merged Columns")
    merged_rows_cols_header = find_header(
        "Non Simple Table with Merged Rows and Columns"
    )
    over_the_page_header = find_header("Over the page")

    # Step 3 - Slice out the elements of each section; the last section runs to
    # the end of the document.
    simple_elements = document.elements.between(simple_header, gaps_header)
    gaps_elements = document.elements.between(gaps_header, edge_gaps_header)
    edge_gaps_elements = document.elements.between(
        edge_gaps_header, non_simple_header
    )
    non_simple_elements = document.elements.between(
        non_simple_header, merged_cols_header
    )
    merged_cols_elements = document.elements.between(
        merged_cols_header, merged_rows_cols_header
    )
    merged_rows_cols_elements = document.elements.between(
        merged_rows_cols_header, over_the_page_header
    )
    over_the_page_elements = document.elements.after(over_the_page_header)

    # Simple Table
    extracted = tables.extract_simple_table(simple_elements, as_text=True)
    self.assertListEqual(
        extracted,
        [
            ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
            ["A", "1", "A", "1"],
            ["B", "2", "B", "2"],
            ["C", "3", "C", "3"],
        ],
    )

    # Simple Table with gaps: rejected unless allow_gaps is set
    with self.assertRaises(TableExtractionError):
        tables.extract_simple_table(gaps_elements, as_text=True)
    extracted = tables.extract_simple_table(
        gaps_elements, as_text=True, allow_gaps=True
    )
    self.assertListEqual(
        extracted,
        [
            ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
            ["A", "1", "", "1"],
            ["B", "", "", ""],
            ["C", "", "C", "3"],
        ],
    )

    # Simple Table with gaps in first row/col: allow_gaps alone is rejected,
    # an explicit reference element is required as well
    with self.assertRaises(TableExtractionError):
        tables.extract_simple_table(
            edge_gaps_elements,
            as_text=True,
            allow_gaps=True,
        )
    anchor = edge_gaps_elements[9]
    extracted = tables.extract_simple_table(
        edge_gaps_elements,
        as_text=True,
        allow_gaps=True,
        reference_element=anchor,
    )
    self.assertListEqual(
        extracted,
        [
            ["Heading 1", "Heading 2", "", "Heading 4"],
            ["", "1", "A", ""],
            ["B", "2", "", "2"],
            ["C", "3", "C", "3"],
        ],
    )

    # Non Simple Table
    extracted = tables.extract_table(non_simple_elements, as_text=True)
    self.assertListEqual(
        extracted,
        [
            ["", "Heading 2", "Heading 3", "Heading 4"],
            ["A", "1", "", "1"],
            ["B", "", "B", "2"],
            ["C", "3", "C", ""],
        ],
    )

    # Non Simple Table with Merged Columns: rejected unless
    # fix_element_in_multiple_cols is set
    with self.assertRaises(TableExtractionError):
        tables.extract_table(merged_cols_elements, as_text=True)
    extracted = tables.extract_table(
        merged_cols_elements,
        as_text=True,
        fix_element_in_multiple_cols=True,
    )
    self.assertListEqual(
        extracted,
        [
            ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
            ["A", "1", "A", "1"],
            ["This text spans across multiple columns", "", "B", "2"],
            ["C", "3", "C", "3"],
        ],
    )

    # Non Simple Table with Merged Rows and Columns
    extracted = tables.extract_table(
        merged_rows_cols_elements,
        as_text=True,
        fix_element_in_multiple_rows=True,
        fix_element_in_multiple_cols=True,
    )
    self.assertListEqual(
        extracted,
        [
            ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
            [
                "This text spans across multiple rows and \nmultiple columns.",
                "",
                "A",
                "1",
            ],
            ["", "", "B", "2"],
            ["C", "3", "C", "3"],
        ],
    )

    # Over the page
    extracted = tables.extract_simple_table(over_the_page_elements, as_text=True)
    self.assertListEqual(
        extracted,
        [
            ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
            ["A", "1", "A", "1"],
            ["B", "2", "B", "2"],
            ["C", "3", "C", "3"],
        ],
    )