Example #1
0
    def test_extract_simple_table(self):
        # Scenario 1: a regular 2*2 grid is returned as two rows of two elements.
        #
        #       top_left        top_right
        #       bottom_left     bottom_right
        #
        def element_at(*box):
            # Build a fake text element from positional bounding box values.
            return FakePDFMinerTextElement(bounding_box=BoundingBox(*box))

        top_left = element_at(0, 5, 6, 10)
        top_right = element_at(6, 10, 6, 10)
        bottom_left = element_at(0, 5, 0, 5)
        bottom_right = element_at(6, 10, 0, 5)
        grid = [top_left, top_right, bottom_left, bottom_right]

        table = extract_simple_table(create_pdf_document(elements=grid).elements)
        self.assertEqual(len(table), 2)
        for row_index in (0, 1):
            self.assertEqual(len(table[row_index]), 2)
        self.assert_original_element_list_list_equal(
            [[top_left, top_right], [bottom_left, bottom_right]], table
        )

        # Scenario 2: a fifth element to the right of the bottom row makes the
        # table non-rectangular (the top row would need an empty cell), which
        # must raise a TableExtractionError.
        #
        #       top_left        top_right
        #       bottom_left     bottom_right    extra
        #
        extra = element_at(11, 15, 0, 5)
        ragged_elements = create_pdf_document(elements=grid + [extra]).elements
        with self.assertRaises(TableExtractionError):
            extract_simple_table(ragged_elements)
Example #2
0
    def test_extract_text_from_simple_table(self):
        # With as_text=True the 2*2 grid below is expected back as cell text
        # rather than as elements.
        #
        #       fake_text_1      fake_text_2
        #       fake_text_3      fake_text_4 (with a trailing space)
        #
        cells = [
            FakePDFMinerTextElement(
                bounding_box=BoundingBox(0, 5, 6, 10), text="fake_text_1"
            ),
            FakePDFMinerTextElement(
                bounding_box=BoundingBox(6, 10, 6, 10), text="fake_text_2"
            ),
            FakePDFMinerTextElement(
                bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_3"
            ),
            FakePDFMinerTextElement(
                bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_4 "
            ),
        ]
        elem_list = create_pdf_document(elements=cells).elements

        # Default call: the trailing space of the last cell is not expected in
        # the output.
        stripped_table = extract_simple_table(elem_list, as_text=True)
        self.assertEqual(len(stripped_table), 2)
        for row_index in (0, 1):
            self.assertEqual(len(stripped_table[row_index]), 2)
        self.assertListEqual(
            [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4"]],
            stripped_table,
        )

        # strip_text=False: the trailing space must be preserved.
        raw_table = extract_simple_table(elem_list, as_text=True, strip_text=False)
        self.assertListEqual(
            [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4 "]],
            raw_table,
        )
Example #3
0
    def test_extract_simple_table_with_tolerance(self):
        # A 2*2 grid in which the bottom-right element slightly overlaps the
        # top-right one (its box uses 6.1 while the top-right box uses 6).
        #
        #       top_left        top_right
        #       bottom_left     bottom_right
        #
        # Extraction is expected to fail by default and to succeed once a
        # tolerance of 0.2 is supplied.
        def element_at(*box):
            # Build a fake text element from positional bounding box values.
            return FakePDFMinerTextElement(bounding_box=BoundingBox(*box))

        top_left = element_at(0, 5, 6, 10)
        top_right = element_at(6, 10, 6, 10)
        bottom_left = element_at(0, 5, 0, 5)
        bottom_right = element_at(6, 10, 0, 6.1)

        elem_list = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right]
        ).elements

        # No tolerance: the overlap must be reported as an extraction error.
        with self.assertRaises(TableExtractionError):
            extract_simple_table(elem_list)

        # With tolerance the grid is extracted as a normal 2*2 table.
        table = extract_simple_table(elem_list, tolerance=0.2)
        self.assertEqual(len(table), 2)
        for row_index in (0, 1):
            self.assertEqual(len(table[row_index]), 2)
        self.assert_original_element_list_list_equal(
            [[top_left, top_right], [bottom_left, bottom_right]], table
        )
Example #4
0
 def test_extract_simple_table_with_gaps_and_wrong_reference(self):
     # Layout with a gap at the end of the second row:
     #
     #       first       second      third
     #       fourth      fifth
     #
     # Using `third` as the reference element together with allow_gaps=True
     # is expected to raise a TableExtractionError.
     def element_at(*box):
         # Build a fake text element from positional bounding box values.
         return FakePDFMinerTextElement(bounding_box=BoundingBox(*box))

     first = element_at(0, 5, 6, 10)
     second = element_at(6, 10, 6, 10)
     third = element_at(11, 15, 6, 10)
     fourth = element_at(0, 5, 0, 5)
     fifth = element_at(6, 10, 0, 5)

     elem_list = create_pdf_document(
         elements=[first, second, third, fourth, fifth]
     ).elements
     # Look up the document's element that corresponds to `third`.
     bad_reference = self.extract_element_from_list(third, elem_list)

     with self.assertRaises(TableExtractionError):
         extract_simple_table(
             elem_list, allow_gaps=True, reference_element=bad_reference
         )
Example #5
0
    def test_extract_simple_table_from_different_pages(self):
        # Two identical 2*2 grids, one on each page, are expected to come back
        # as a single four-row table with page 1's rows first.
        #
        # Page 1:
        #       page_1[0]      page_1[1]
        #       page_1[2]      page_1[3]
        #
        # Page 2:
        #       page_2[0]      page_2[1]
        #       page_2[2]      page_2[3]
        #
        def build_grid():
            # Fresh elements for one 2*2 grid, ordered row by row.
            return [
                FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5)),
            ]

        page_1 = build_grid()
        page_2 = build_grid()

        # A dict of page number -> elements places each grid on its own page.
        document = create_pdf_document(elements={1: page_1, 2: page_2})
        table = extract_simple_table(document.elements)

        self.assertEqual(len(table), 4)
        for row_index in range(4):
            self.assertEqual(len(table[row_index]), 2)
        self.assert_original_element_list_list_equal(
            [
                [page_1[0], page_1[1]],
                [page_1[2], page_1[3]],
                [page_2[0], page_2[1]],
                [page_2[2], page_2[3]],
            ],
            table,
        )
Example #6
0
 def test_extract_simple_table_with_gaps(self):
     # Layout with a gap at the end of the second row:
     #
     #       first       second      third
     #       fourth      fifth
     #
     # With allow_gaps=True the missing cell is expected to be reported as None.
     def element_at(*box):
         # Build a fake text element from positional bounding box values.
         return FakePDFMinerTextElement(bounding_box=BoundingBox(*box))

     first = element_at(0, 5, 6, 10)
     second = element_at(6, 10, 6, 10)
     third = element_at(11, 15, 6, 10)
     fourth = element_at(0, 5, 0, 5)
     fifth = element_at(6, 10, 0, 5)

     elem_list = create_pdf_document(
         elements=[first, second, third, fourth, fifth]
     ).elements

     table = extract_simple_table(elem_list, allow_gaps=True)
     self.assertEqual(len(table), 2)
     for row_index in (0, 1):
         self.assertEqual(len(table[row_index]), 3)
     self.assert_original_element_list_list_equal(
         [[first, second, third], [fourth, fifth, None]], table
     )
Example #7
0
    def test_extract_simple_table_removing_duplicate_header_different_fonts_or_text(
        self,
    ):
        # Rows that only partially match the header row must NOT be removed
        # when remove_duplicate_header_rows=True.
        #
        #    header_elem_1                     header_elem_2
        #    header_elem_3_different_font      header_elem_4
        #    header_elem_5_different_text      header_elem_6
        #
        # Row 2 differs from the header row only in the font size of its first
        # element; row 3 differs only in the text of its first element. The
        # second element of each row is an exact copy of header_elem_2, so each
        # row isolates a single kind of difference.
        header_elem_1 = FakePDFMinerTextElement(
            text="header 1",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(0, 5, 21, 25),
        )
        header_elem_2 = FakePDFMinerTextElement(
            text="header 2",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(11, 15, 21, 25),
        )
        # Same text as header_elem_1, but font size 12 instead of 10.
        header_elem_3_different_font = FakePDFMinerTextElement(
            text="header 1",
            font_name="header font",
            font_size=12,
            bounding_box=BoundingBox(0, 5, 16, 20),
        )
        # Must mirror header_elem_2 exactly (text "header 2"); otherwise row 2
        # would also differ from the header by text and the font difference
        # would not be exercised on its own.
        header_elem_4 = FakePDFMinerTextElement(
            text="header 2",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(11, 15, 16, 20),
        )
        # Same font as header_elem_1, but a different text.
        header_elem_5_different_text = FakePDFMinerTextElement(
            text="header with a different name",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(0, 5, 11, 15),
        )
        header_elem_6 = FakePDFMinerTextElement(
            text="header 2",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(11, 15, 11, 15),
        )

        document = create_pdf_document(
            elements=[
                header_elem_1,
                header_elem_2,
                header_elem_3_different_font,
                header_elem_4,
                header_elem_5_different_text,
                header_elem_6,
            ]
        )
        elem_list = document.elements

        # All three rows must survive: none is a full duplicate of the header.
        result = extract_simple_table(elem_list, remove_duplicate_header_rows=True)
        self.assertEqual(len(result), 3)
        self.assertEqual(len(result[0]), 2)
        self.assertEqual(len(result[1]), 2)
        self.assertEqual(len(result[2]), 2)
        self.assert_original_element_list_list_equal(
            [
                [header_elem_1, header_elem_2],
                [header_elem_3_different_font, header_elem_4],
                [header_elem_5_different_text, header_elem_6],
            ],
            result,
        )
Example #8
0
    def test_extract_simple_table_removing_duplicate_header_rows(self):
        def header_cell(text, *box):
            # Header-styled fake element ("header font", size 10).
            return FakePDFMinerTextElement(
                text=text,
                font_name="header font",
                font_size=10,
                bounding_box=BoundingBox(*box),
            )

        def body_cell(*box):
            # Plain fake element with only a bounding box set.
            return FakePDFMinerTextElement(bounding_box=BoundingBox(*box))

        # Scenario 1: a table consisting of the header row only.
        #
        #    first_header_left    first_header_right
        #
        first_header_left = header_cell("header 1", 0, 5, 21, 25)
        first_header_right = header_cell("header 2", 6, 10, 21, 25)
        single_row_elements = create_pdf_document(
            elements=[first_header_left, first_header_right]
        ).elements

        single_row_table = extract_simple_table(
            single_row_elements, remove_duplicate_header_rows=True
        )
        # A one-line table cannot contain duplicates of its own header, so the
        # whole table is expected back unchanged.
        self.assertEqual(len(single_row_table), 1)
        self.assertEqual(len(single_row_table[0]), 2)
        self.assert_original_element_list_list_equal(
            [[first_header_left, first_header_right]], single_row_table
        )

        # Scenario 2: the header row is repeated between and after data rows.
        #
        #    first_header_left     first_header_right
        #       data_1_left           data_1_right
        #    second_header_left    second_header_right
        #       data_2_left           data_2_right
        #    third_header_left     third_header_right
        #
        data_1_left = body_cell(0, 5, 16, 20)
        data_1_right = body_cell(6, 10, 16, 20)
        second_header_left = header_cell("header 1", 0, 5, 11, 15)
        second_header_right = header_cell("header 2", 6, 10, 11, 15)
        data_2_left = body_cell(0, 5, 6, 10)
        data_2_right = body_cell(6, 10, 6, 10)
        third_header_left = header_cell("header 1", 0, 5, 0, 5)
        third_header_right = header_cell("header 2", 6, 10, 0, 5)

        repeated_header_elements = create_pdf_document(
            elements=[
                first_header_left,
                first_header_right,
                data_1_left,
                data_1_right,
                second_header_left,
                second_header_right,
                data_2_left,
                data_2_right,
                third_header_left,
                third_header_right,
            ]
        ).elements

        deduplicated = extract_simple_table(
            repeated_header_elements, remove_duplicate_header_rows=True
        )
        # Only the first header row and the two data rows should remain.
        self.assertEqual(len(deduplicated), 3)
        for row_index in range(3):
            self.assertEqual(len(deduplicated[row_index]), 2)
        self.assert_original_element_list_list_equal(
            [
                [first_header_left, first_header_right],
                [data_1_left, data_1_right],
                [data_2_left, data_2_right],
            ],
            deduplicated,
        )
Example #9
0
    def test_output_is_correct(self):
        # Runs table extraction against each titled table in the tables.pdf
        # example file and compares the extracted text with the expected cells.
        pdf_path = os.path.join(
            os.path.dirname(__file__), "../../docs/source/example_files/tables.pdf"
        )

        # Step 1 - Load the file
        font_mapping = {
            "BAAAAA+LiberationSerif-Bold,12.0": "header",
            "CAAAAA+LiberationSerif,12.0": "table_element",
        }
        document = load_file(pdf_path, font_mapping=font_mapping)

        headers = document.elements.filter_by_font("header")

        def title_element(text):
            # The single header element whose text equals `text`.
            return headers.filter_by_text_equal(text).extract_single_element()

        # Extract reference elements (the title above each table)
        simple_title = title_element("Simple Table")
        gaps_title = title_element("Simple Table with gaps")
        edge_gaps_title = title_element("Simple Table with gaps in first row/col")
        non_simple_title = title_element("Non Simple Table")
        merged_cols_title = title_element("Non Simple Table with Merged Columns")
        merged_rows_cols_title = title_element(
            "Non Simple Table with Merged Rows and Columns"
        )
        over_the_page_title = title_element("Over the page")

        # Extract table elements: each table is whatever lies between its own
        # title and the next one; the last table is everything after its title.
        simple_elements = document.elements.between(simple_title, gaps_title)
        gaps_elements = document.elements.between(gaps_title, edge_gaps_title)
        edge_gaps_elements = document.elements.between(
            edge_gaps_title, non_simple_title
        )
        non_simple_elements = document.elements.between(
            non_simple_title, merged_cols_title
        )
        merged_cols_elements = document.elements.between(
            merged_cols_title, merged_rows_cols_title
        )
        merged_rows_cols_elements = document.elements.between(
            merged_rows_cols_title, over_the_page_title
        )
        over_the_page_elements = document.elements.after(over_the_page_title)

        # Expected content shared by the two fully populated simple tables.
        complete_grid = [
            ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
            ["A", "1", "A", "1"],
            ["B", "2", "B", "2"],
            ["C", "3", "C", "3"],
        ]

        # Simple Table
        extracted = tables.extract_simple_table(simple_elements, as_text=True)
        self.assertListEqual(extracted, complete_grid)

        # Simple Table with gaps: fails unless gaps are explicitly allowed.
        with self.assertRaises(TableExtractionError):
            tables.extract_simple_table(gaps_elements, as_text=True)

        extracted = tables.extract_simple_table(
            gaps_elements, as_text=True, allow_gaps=True
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "", "1"],
                ["B", "", "", ""],
                ["C", "", "C", "3"],
            ],
        )

        # Simple Table with gaps in first row/col: allow_gaps alone still
        # fails; an explicit reference element is also required.
        with self.assertRaises(TableExtractionError):
            tables.extract_simple_table(
                edge_gaps_elements, as_text=True, allow_gaps=True
            )

        extracted = tables.extract_simple_table(
            edge_gaps_elements,
            as_text=True,
            allow_gaps=True,
            reference_element=edge_gaps_elements[9],
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "", "Heading 4"],
                ["", "1", "A", ""],
                ["B", "2", "", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table
        extracted = tables.extract_table(non_simple_elements, as_text=True)
        self.assertListEqual(
            extracted,
            [
                ["", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "", "1"],
                ["B", "", "B", "2"],
                ["C", "3", "C", ""],
            ],
        )

        # Non Simple Table with Merged Columns: fails unless
        # fix_element_in_multiple_cols is enabled.
        with self.assertRaises(TableExtractionError):
            tables.extract_table(merged_cols_elements, as_text=True)

        extracted = tables.extract_table(
            merged_cols_elements, as_text=True, fix_element_in_multiple_cols=True
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "A", "1"],
                ["This text spans across multiple columns", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table with Merged Rows and Columns
        extracted = tables.extract_table(
            merged_rows_cols_elements,
            as_text=True,
            fix_element_in_multiple_rows=True,
            fix_element_in_multiple_cols=True,
        )
        merged_cell_text = (
            "This text spans across multiple rows and \nmultiple columns."
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                [merged_cell_text, "", "A", "1"],
                ["", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Over the page
        extracted = tables.extract_simple_table(over_the_page_elements, as_text=True)
        self.assertListEqual(extracted, complete_grid)
Example #10
0
    def test_output_is_correct(self):
        # This test is meant to mirror the "order_summary" documentation
        # example, so keep the two in sync when changing either.

        # Step 1 - Load the document
        pdf_path = os.path.join(
            os.path.dirname(__file__),
            "../../docs/source/example_files/order_summary.pdf",
        )
        font_mapping = {
            "BAAAAA+LiberationSerif-Bold,16.0": "title",
            "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
            "CAAAAA+LiberationSerif,12.0": "text",
            "DAAAAA+FreeMonoBold,12.0": "table_header",
            "EAAAAA+FreeMono,12.0": "table_text",
        }
        document = load_file(pdf_path, font_mapping=font_mapping)

        # Step 3 - Add sections
        def sub_title(text):
            # The single "sub_title" element whose text equals `text`.
            sub_titles = document.elements.filter_by_font("sub_title")
            return sub_titles.filter_by_text_equal(text).extract_single_element()

        order_summary_start = sub_title("Order Summary:")
        totals_start = sub_title("Totals:")
        last_element = document.elements[-1]

        # The order summary section stops just before the "Totals:" sub title.
        order_summary_section = document.sectioning.create_section(
            name="order_summary",
            start_element=order_summary_start,
            end_element=totals_start,
            include_last_element=False,
        )
        # The totals section runs from "Totals:" to the document's last element.
        totals_section = document.sectioning.create_section(
            name="totals",
            start_element=totals_start,
            end_element=last_element,
        )

        # Step 4 - Extract tables (only the table header/text fonts are kept)
        order_summary_cells = order_summary_section.elements.filter_by_fonts(
            "table_header", "table_text"
        )
        order_summary_table = tables.extract_simple_table(
            order_summary_cells, as_text=True
        )

        totals_cells = totals_section.elements.filter_by_fonts(
            "table_header", "table_text"
        )
        totals_table = tables.extract_simple_table(totals_cells, as_text=True)

        order_summary_with_header = tables.add_header_to_table(order_summary_table)

        expected_order_summary = [
            ["Item", "Unit Cost", "Quantity", "Cost"],
            ["Challenger 100g\nWhole Hops", "£3.29", "1", "£3.29"],
            [
                "Maris Otter \nPale Ale Malt \n(Crushed)",
                "£1.50/1000g",
                "4000g",
                "£6.00",
            ],
            ["WLP037 \nYorkshire Ale \nYeast", "£7.08", "1", "£7.08"],
            ["Bottle Caps", "£1 per 100", "500", "£5"],
        ]
        self.assertListEqual(order_summary_table, expected_order_summary)

        expected_totals = [
            ["Subtotal:", "£26.28"],
            ["Shipping", "£6"],
            ["VAT 20%", "£6.45"],
            ["Total:", "£38.73"],
        ]
        self.assertListEqual(totals_table, expected_totals)

        # add_header_to_table is expected to give one dict per data row, keyed
        # by the header row's cells.
        expected_with_header = [
            {
                "Item": "Challenger 100g\nWhole Hops",
                "Unit Cost": "£3.29",
                "Quantity": "1",
                "Cost": "£3.29",
            },
            {
                "Item": "Maris Otter \nPale Ale Malt \n(Crushed)",
                "Unit Cost": "£1.50/1000g",
                "Quantity": "4000g",
                "Cost": "£6.00",
            },
            {
                "Item": "WLP037 \nYorkshire Ale \nYeast",
                "Unit Cost": "£7.08",
                "Quantity": "1",
                "Cost": "£7.08",
            },
            {
                "Item": "Bottle Caps",
                "Unit Cost": "£1 per 100",
                "Quantity": "500",
                "Cost": "£5",
            },
        ]
        self.assertListEqual(order_summary_with_header, expected_with_header)