Ejemplo n.º 1
0
    def test_eq(self):
        # Two boxes built from identical coordinates must compare equal.
        reference_box = BoundingBox(0, 1, 0, 1)
        identical_box = BoundingBox(0, 1, 0, 1)
        self.assertEqual(reference_box, identical_box)

        # Changing a single coordinate is enough to make the boxes unequal.
        different_box = BoundingBox(0, 1, 0, 3)
        self.assertNotEqual(reference_box, different_box)
Ejemplo n.º 2
0
    def test_extract_simple_table(self):
        # A rectangular 2x2 grid should come back row by row:
        #
        #       top_left        top_right
        #       bottom_left     bottom_right
        #
        top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
        top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
        bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
        bottom_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

        document = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right]
        )
        grid_elements = document.elements

        table = extract_simple_table(grid_elements)
        self.assertEqual(len(table), 2)
        for row_index in range(2):
            self.assertEqual(len(table[row_index]), 2)
        self.assert_original_element_list_list_equal(
            [[top_left, top_right], [bottom_left, bottom_right]], table
        )

        # A third element on the bottom row only leaves the top row one cell
        # short, so the grid is no longer rectangular and extraction must fail:
        #
        #       top_left        top_right
        #       bottom_left     bottom_right    extra
        #
        extra = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 0, 5))

        document = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right, extra]
        )
        ragged_elements = document.elements
        with self.assertRaises(TableExtractionError):
            extract_simple_table(ragged_elements)
Ejemplo n.º 3
0
    def test_extract_text_from_simple_table(self):
        # With as_text=True a simple 2x2 table comes back as strings:
        #
        #       fake_text_1     fake_text_2
        #       fake_text_3     fake_text_4
        #
        # The bottom-right text carries a trailing space so that the effect of
        # strip_text can be observed.
        top_left = FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 5, 6, 10), text="fake_text_1"
        )
        top_right = FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 6, 10), text="fake_text_2"
        )
        bottom_left = FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_3"
        )
        bottom_right = FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_4 "
        )

        document = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right]
        )
        elements = document.elements

        # Default behaviour: cell text is stripped.
        stripped_table = extract_simple_table(elements, as_text=True)
        self.assertEqual(len(stripped_table), 2)
        for row_index in range(2):
            self.assertEqual(len(stripped_table[row_index]), 2)

        self.assertListEqual(
            [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4"]],
            stripped_table,
        )

        # strip_text=False keeps the trailing space.
        raw_table = extract_simple_table(elements, as_text=True, strip_text=False)
        self.assertListEqual(
            [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4 "]],
            raw_table,
        )
Ejemplo n.º 4
0
    def test_fix_element_in_multiple_cols(self):
        # Checks that the following table is correctly extracted:
        # ---------
        # | 1     |
        # --------|
        # | 2 | 3 |
        # ---------
        #
        # elem_1 spans x 0-10 and therefore covers both of the columns formed
        # by elem_2 (x 0-5) and elem_3 (x 6-10).

        elem_1 = FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 10, 6, 10), text="fake_text_1"
        )
        elem_2 = FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_2"
        )
        elem_3 = FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_3"
        )

        document = create_pdf_document(elements=[elem_1, elem_2, elem_3])
        elem_list = document.elements

        # Without the fix flag the spanning element makes extraction fail.
        # The return value is deliberately not bound: the call never returns.
        with self.assertRaises(TableExtractionError):
            extract_table(elem_list, as_text=True)

        # With the flag, the spanning element is placed in the first column
        # and the remaining cell of its row is left empty.
        result = extract_table(
            elem_list, as_text=True, fix_element_in_multiple_cols=True
        )
        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]), 2)
        self.assertEqual(len(result[1]), 2)
        self.assertListEqual(
            [["fake_text_1", ""], ["fake_text_2", "fake_text_3"]], result
        )
Ejemplo n.º 5
0
    def test_extract_table_with_tolerance(self):
        # 2x2 layout:
        #
        #       top_left        top_right
        #       bottom_left     bottom_right
        #
        # bottom_right reaches up to y=6.1, i.e. 0.1 past the lower edge (y=6)
        # of top_right. Extraction fails by default and succeeds once a
        # tolerance is supplied.
        top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
        top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
        bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
        bottom_right = FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 0, 6.1)
        )

        document = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right]
        )
        elements = document.elements

        with self.assertRaises(TableExtractionError):
            extract_table(elements)

        table = extract_table(elements, tolerance=0.2)
        self.assertEqual(len(table), 2)
        for row_index in range(2):
            self.assertEqual(len(table[row_index]), 2)
        self.assert_original_element_list_list_equal(
            [[top_left, top_right], [bottom_left, bottom_right]], table
        )
Ejemplo n.º 6
0
    def test_create_bounding_box(self):
        # A unit box reports a width and a height of 1.
        unit_box = BoundingBox(0, 1, 0, 1)
        self.assertEqual(unit_box.width, 1)
        self.assertEqual(unit_box.height, 1)

        # Construction must be rejected when either coordinate pair is
        # reversed: first the leading pair, then the trailing pair.
        reversed_pair_coordinates = ((1, 0, 0, 1), (0, 1, 1, 0))
        for coordinates in reversed_pair_coordinates:
            with self.assertRaises(InvalidCoordinatesError):
                BoundingBox(*coordinates)
Ejemplo n.º 7
0
    def test_element_ordering(self):
        # Layout shared by every case below:
        #
        #       top_left        top_right
        #       bottom_left     bottom_right
        top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
        top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
        bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
        bottom_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
        fakes = (top_left, top_right, bottom_left, bottom_right)

        # No ordering given: left to right, then top to bottom.
        document = create_pdf_document(elements=list(fakes))
        self.assert_original_element_list_equal(
            [top_left, top_right, bottom_left, bottom_right], document.elements
        )

        # The remaining presets, each paired with the order it should produce.
        preset_expectations = [
            (
                ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM,
                [top_right, top_left, bottom_right, bottom_left],
            ),
            (
                ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT,
                [top_left, bottom_left, top_right, bottom_right],
            ),
            (
                ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT,
                [top_right, bottom_right, top_left, bottom_left],
            ),
        ]
        for preset, expected_order in preset_expectations:
            # A fresh list is handed over each time, as separate literals would be.
            document = create_pdf_document(
                elements=list(fakes), element_ordering=preset
            )
            self.assert_original_element_list_equal(
                expected_order, document.elements
            )

        # A callable is also accepted; this one picks positions 0, 3, 1, 2 of
        # whatever sequence it is given.
        def first_last_then_middle(elements):
            return [elements[0], elements[3], elements[1], elements[2]]

        document = create_pdf_document(
            elements=list(fakes), element_ordering=first_last_then_middle
        )
        self.assert_original_element_list_equal(
            [top_left, bottom_right, top_right, bottom_left], document.elements
        )
Ejemplo n.º 8
0
 def test_extract_simple_table_with_gaps_and_wrong_reference(self):
     # Layout under test; the bottom row has no third cell:
     #
     #       top_1       top_2       top_3
     #       bottom_1    bottom_2
     #
     top_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
     top_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
     top_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
     bottom_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
     bottom_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

     document = create_pdf_document(
         elements=[top_1, top_2, top_3, bottom_1, bottom_2]
     )
     elements = document.elements

     # top_3 sits above the gap. Using it as the reference element must make
     # extraction fail even though gaps are allowed.
     gap_column_reference = self.extract_element_from_list(top_3, elements)
     with self.assertRaises(TableExtractionError):
         extract_simple_table(
             elements,
             allow_gaps=True,
             reference_element=gap_column_reference,
         )
Ejemplo n.º 9
0
    def test_extract_table(self):
        # Checks that simple 2*2 table is correctly extracted
        #
        #       elem_1      elem_2
        #       elem_3      elem_4
        #
        elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
        elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
        elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

        document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4])
        elem_list = document.elements

        result = extract_table(elem_list)
        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]), 2)
        self.assertEqual(len(result[1]), 2)
        self.assert_original_element_list_list_equal(
            [[elem_1, elem_2], [elem_3, elem_4]], result
        )

        # Checks that the following table is correctly extracted, with None
        # filling the cells that hold no element
        #
        #       elem_1      elem_2                  elem_6
        #       elem_3      elem_4      elem_5
        #
        elem_5 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 0, 5))
        elem_6 = FakePDFMinerTextElement(bounding_box=BoundingBox(16, 20, 6, 10))
        document = create_pdf_document(
            elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
        )
        elem_list = document.elements
        result = extract_table(elem_list)
        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]), 4)
        self.assertEqual(len(result[1]), 4)
        self.assert_original_element_list_list_equal(
            [[elem_1, elem_2, None, elem_6], [elem_3, elem_4, elem_5, None]], result
        )

        # Checks that it raises an error if one element straddles two columns:
        # the replacement elem_2 spans x 3-8, overlapping both the first
        # column (x 0-5) and the second column (x 6-10).
        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(3, 8, 6, 10))
        document = create_pdf_document(
            elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
        )
        elem_list = document.elements
        # The return value is not bound: the call is expected never to return.
        with self.assertRaises(TableExtractionError):
            extract_table(elem_list)

        # Checks that it raises an error if one element straddles two rows:
        # the replacement elem_2 spans y 3-8, overlapping both the bottom
        # row (y 0-5) and the top row (y 6-10).
        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 3, 8))
        document = create_pdf_document(
            elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
        )
        elem_list = document.elements
        with self.assertRaises(TableExtractionError):
            extract_table(elem_list)
Ejemplo n.º 10
0
    def test_filter_partially_within_bounding_box(self, partially_within_mock):
        # Stand-in for the real geometry check: an element counts as "within"
        # purely when its text says so. Parameter names mirror the original
        # stub so that keyword-style calls would still bind.
        def text_says_within(self, bounding_box):
            return self.text() == "within"

        partially_within_mock.side_effect = text_says_within

        # Page 1 holds three "within" elements and one that is not; page 2
        # holds one of each.
        elem1 = FakePDFMinerTextElement(text="within")
        elem2 = FakePDFMinerTextElement(text="within")
        elem3 = FakePDFMinerTextElement()
        elem4 = FakePDFMinerTextElement(text="within")
        elem5 = FakePDFMinerTextElement()
        elem6 = FakePDFMinerTextElement(text="within")

        page1 = Page(elements=[elem1, elem2, elem3, elem4], width=100, height=100)
        page2 = Page(elements=[elem5, elem6], width=100, height=100)

        doc = PDFDocument(pages={1: page1, 2: page2})
        elem_list = doc.elements

        page1_pdf_elements = [
            self.extract_element_from_list(fake, elem_list)
            for fake in (elem1, elem2, elem3, elem4)
        ]
        pdf_elem1, pdf_elem2, _, pdf_elem4 = page1_pdf_elements

        # The second argument is presumably the page number - verify against
        # the method's signature.
        result = elem_list.filter_partially_within_bounding_box(
            BoundingBox(0, 1, 0, 1), 1
        )

        # Every page-1 element must have been checked against a box equal to
        # the one passed in.
        expected_bbox = BoundingBox(0, 1, 0, 1)
        partially_within_mock.assert_has_calls(
            [call(pdf_elem, expected_bbox) for pdf_elem in page1_pdf_elements],
            any_order=True,
        )

        # Only the three "within" elements of page 1 survive the filter.
        self.assertEqual(len(result), 3)
        for kept in (pdf_elem1, pdf_elem2, pdf_elem4):
            self.assertIn(kept, result)
Ejemplo n.º 11
0
 def test_extract_simple_table_with_gaps(self):
     # Layout under test; the bottom row has no third cell:
     #
     #       top_1       top_2       top_3
     #       bottom_1    bottom_2
     #
     top_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
     top_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
     top_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
     bottom_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
     bottom_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

     document = create_pdf_document(
         elements=[top_1, top_2, top_3, bottom_1, bottom_2]
     )
     elements = document.elements

     # With allow_gaps=True the missing cell is reported as None.
     table = extract_simple_table(elements, allow_gaps=True)
     self.assertEqual(len(table), 2)
     for row_index in range(2):
         self.assertEqual(len(table[row_index]), 3)
     self.assert_original_element_list_list_equal(
         [[top_1, top_2, top_3], [bottom_1, bottom_2, None]], table
     )
Ejemplo n.º 12
0
    def test_extract_table_from_different_pages(self):
        # Both pages carry the same 2x2 layout; extracting over the whole
        # document should yield the page-1 rows followed by the page-2 rows.
        #
        # Each page:
        #       top_left        top_right
        #       bottom_left     bottom_right
        #
        def make_two_by_two():
            # Builds the four fakes in reading order.
            return (
                FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5)),
            )

        p1_top_left, p1_top_right, p1_bottom_left, p1_bottom_right = (
            make_two_by_two()
        )
        p2_top_left, p2_top_right, p2_bottom_left, p2_bottom_right = (
            make_two_by_two()
        )

        document = create_pdf_document(
            elements={
                1: [p1_top_left, p1_top_right, p1_bottom_left, p1_bottom_right],
                2: [p2_top_left, p2_top_right, p2_bottom_left, p2_bottom_right],
            }
        )
        elements = document.elements

        table = extract_table(elements)
        self.assertEqual(len(table), 4)
        for row_index in range(4):
            self.assertEqual(len(table[row_index]), 2)
        self.assert_original_element_list_list_equal(
            [
                [p1_top_left, p1_top_right],
                [p1_bottom_left, p1_bottom_right],
                [p2_top_left, p2_top_right],
                [p2_bottom_left, p2_bottom_right],
            ],
            table,
        )
Ejemplo n.º 13
0
 def __init__(
     self,
     bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1),
     text: str = "fake_text",
     font_name: str = "fake_font",
     font_size: float = 10,
 ):
     """Set up the fake element with a box, text and font details.

     Args:
         bounding_box: Box whose x0/y0/x1/y1 are forwarded to the parent.
         text: Stored as ``self.text``.
         font_name: Stored as ``self.font_name``.
         font_size: Stored as ``self.font_size``.
     """
     # The parent constructor receives the corners as [x0, y0, x1, y1].
     corner_coordinates = [
         bounding_box.x0,
         bounding_box.y0,
         bounding_box.x1,
         bounding_box.y1,
     ]
     super().__init__(bbox=corner_coordinates)

     self.text = text
     self.font_name = font_name
     self.font_size = font_size
Ejemplo n.º 14
0
def create_pdf_element(
    bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1),
    text: str = "fake_text",
    font_name: str = "fake_font",
    font_size: float = 10,
    font_mapping: Optional[Dict[str, str]] = None,
    font_mapping_is_regex: bool = False,
    regex_flags: Union[int, re.RegexFlag] = 0,
    font_size_precision: int = 1,
) -> "PDFElement":
    """Build a one-element document and return that element.

    A single ``FakePDFMinerTextElement`` is created from ``bounding_box``,
    ``text``, ``font_name`` and ``font_size``; the remaining arguments are
    forwarded unchanged to ``create_pdf_document``.

    Returns:
        The first entry of the created document's ``elements``.
    """
    fake_element = FakePDFMinerTextElement(
        bounding_box, text=text, font_name=font_name, font_size=font_size
    )
    single_element_document = create_pdf_document(
        elements=[fake_element],
        font_mapping=font_mapping,
        font_mapping_is_regex=font_mapping_is_regex,
        regex_flags=regex_flags,
        font_size_precision=font_size_precision,
    )
    return single_element_document.elements[0]
Ejemplo n.º 15
0
    def test_document(self):
        # Page 1: fakes handed to the page already in reading order.
        p1_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3))
        p1_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3))
        p1_bottom_left = FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1))
        p1_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1))
        source_page_1 = Page(
            elements=[p1_top_left, p1_top_right, p1_bottom_left, p1_bottom_right],
            width=100,
            height=100,
        )

        # Page 2: same layout, but handed over in reverse order so that the
        # reordering done by the document is observable.
        p2_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3))
        p2_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3))
        p2_bottom_left = FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1))
        p2_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1))
        source_page_2 = Page(
            elements=[p2_bottom_right, p2_bottom_left, p2_top_right, p2_top_left],
            width=100,
            height=100,
        )

        document = PDFDocument(pages={1: source_page_1, 2: source_page_2})

        # Elements come out in reading order, page by page.
        reading_order = [
            p1_top_left,
            p1_top_right,
            p1_bottom_left,
            p1_bottom_right,
            p2_top_left,
            p2_top_right,
            p2_bottom_left,
            p2_bottom_right,
        ]
        self.assertEqual(
            [elem.original_element for elem in document._element_list],
            reading_order,
        )

        # Indexes run consecutively across pages.
        self.assertEqual(
            [elem._index for elem in document._element_list],
            [0, 1, 2, 3, 4, 5, 6, 7],
        )

        # Page numbers and page count.
        self.assertEqual(document.page_numbers, [1, 2])
        self.assertEqual(document.number_of_pages, 2)

        # Each element knows which page it belongs to.
        self.assertEqual(
            [elem.page_number for elem in document._element_list],
            [1, 1, 1, 1, 2, 2, 2, 2],
        )

        # Per-page checks: dimensions, first/last element, number, contents.
        page_expectations = [
            (1, source_page_1, p1_top_left, p1_bottom_right, {0, 1, 2, 3}),
            (2, source_page_2, p2_top_left, p2_bottom_right, {4, 5, 6, 7}),
        ]
        pdf_pages = []
        for number, source_page, first_fake, last_fake, indexes in page_expectations:
            pdf_page = document.get_page(number)
            self.assertEqual(source_page.width, pdf_page.width)
            self.assertEqual(source_page.height, pdf_page.height)
            self.assertEqual(first_fake, pdf_page.start_element.original_element)
            self.assertEqual(last_fake, pdf_page.end_element.original_element)
            self.assertEqual(pdf_page.page_number, number)
            self.assertEqual(pdf_page.elements, ElementList(document, indexes))
            pdf_pages.append(pdf_page)

        self.assertEqual(document.pages, pdf_pages)

        self.assertEqual(
            document.elements,
            ElementList(document, {0, 1, 2, 3, 4, 5, 6, 7}),
        )

        # Asking for a page that does not exist is an error.
        with self.assertRaises(PageNotFoundError):
            document.get_page(3)
Ejemplo n.º 16
0
class TestPDFElement(BaseTestCase):
    # Box given to the element under test in the *_within tests below: it
    # covers 2-5 on both axes.
    element_bbox = BoundingBox(2, 5, 2, 5)

    def test_page_number(self):
        pdf_element = create_pdf_element()
        self.assertEqual(pdf_element.page_number, 1)

        # Assigning to page_number must be rejected.
        with self.assertRaises(AttributeError):
            pdf_element.page_number = 2

    def test_font_name(self):
        pdf_element = create_pdf_element(font_name="test_font")
        self.assertEqual(pdf_element.font_name, "test_font")

    def test_font_size(self):
        pdf_element = create_pdf_element(font_size=2)
        self.assertEqual(pdf_element.font_size, 2)

    def test_font_size_precision(self):
        # No precision passed: the size is reported to one decimal place.
        one_decimal = create_pdf_element(font_size=1.234)
        self.assertEqual(one_decimal.font_size, 1.2)

        whole_number = create_pdf_element(font_size=1.234, font_size_precision=0)
        self.assertEqual(whole_number.font_size, 1)

        three_decimals = create_pdf_element(font_size=1.234, font_size_precision=3)
        self.assertEqual(three_decimals.font_size, 1.234)

    def test_font(self):
        # Without a mapping the font is reported as "<name>,<size>".
        unmapped = create_pdf_element(font_name="test_font", font_size=2)
        self.assertEqual(unmapped.font, "test_font,2")

        # A mapping entry that matches replaces the reported font ...
        mapped = create_pdf_element(
            font_name="test_font",
            font_size=3,
            font_mapping={"test_font,3": "test_named_font"},
        )
        self.assertEqual(mapped.font, "test_named_font")

        # ... and one that does not match leaves it untouched.
        not_matched = create_pdf_element(
            font_name="test_font",
            font_size=2,
            font_mapping={"test_font,3": "test_named_font"},
        )
        self.assertEqual(not_matched.font, "test_font,2")

        # Test when font_mapping argument is passed to PDFDocument
        empty_mapping = {}
        pdf_element = create_pdf_element(
            font_name="fake_font_1", font_size=10, font_mapping=empty_mapping
        )
        self.assertEqual(pdf_element.font, "fake_font_1,10")

        exact_mapping = {"fake_font_1,10": "large_text"}
        pdf_element = create_pdf_element(
            font_name="fake_font_1", font_size=10, font_mapping=exact_mapping
        )
        self.assertEqual(pdf_element.font, "large_text")

        # Regex keys: a new dict is built for every call.
        font_pattern = r"^fake_font_\d,10$"

        pdf_element = create_pdf_element(
            font_name="fake_font_1",
            font_size=10,
            font_mapping={font_pattern: "large_text"},
            font_mapping_is_regex=True,
        )
        self.assertEqual(pdf_element.font, "large_text")

        # Upper-case name with no flags: the pattern does not match.
        pdf_element = create_pdf_element(
            font_name="FAKE_FONT_1",
            font_size=10,
            font_mapping={font_pattern: "large_text"},
            font_mapping_is_regex=True,
        )
        self.assertEqual(pdf_element.font, "FAKE_FONT_1,10")

        # Same input with re.IGNORECASE: the pattern matches.
        pdf_element = create_pdf_element(
            font_name="FAKE_FONT_1",
            font_size=10,
            font_mapping={font_pattern: "large_text"},
            font_mapping_is_regex=True,
            regex_flags=re.IGNORECASE,
        )
        self.assertEqual(pdf_element.font, "large_text")

    def test_text(self):
        padded = create_pdf_element(text=" test ")
        self.assertEqual(padded.text(), "test")
        self.assertEqual(padded.text(stripped=False), " test ")

    def test_add_tag(self):
        pdf_element = create_pdf_element()
        self.assertEqual(pdf_element.tags, set())

        pdf_element.add_tag("foo")
        self.assertEqual(pdf_element.tags, {"foo"})

        # Adding the same tag a second time changes nothing.
        pdf_element.add_tag("foo")
        self.assertEqual(pdf_element.tags, {"foo"})

        pdf_element.add_tag("bar")
        self.assertEqual(pdf_element.tags, {"foo", "bar"})

    def test_repr(self):
        pdf_element = create_pdf_element(font_name="test_font", font_size=2)
        self.assertEqual(
            repr(pdf_element), "<PDFElement tags: set(), font: 'test_font,2'>"
        )

        pdf_element.add_tag("foo")
        self.assertEqual(
            repr(pdf_element), "<PDFElement tags: {'foo'}, font: 'test_font,2'>"
        )

        pdf_element.ignore()
        self.assertEqual(
            repr(pdf_element),
            "<PDFElement tags: {'foo'}, font: 'test_font,2', ignored>",
        )

    @data(
        BoundingBox(1, 6, 1, 6),  # Fully encloses the element
        BoundingBox(1, 6, 0, 3),  # Intersects the bottom of the element
        BoundingBox(1, 6, 0, 2),  # Touches the bottom of the element
        BoundingBox(1, 6, 4, 6),  # Intersects the top of the element
        BoundingBox(1, 6, 5, 6),  # Touches the top of the element
        BoundingBox(1, 6, 3, 4),  # Goes through the center horizontally
        BoundingBox(1, 3, 1, 6),  # Intersects the left of the element
        BoundingBox(1, 2, 1, 6),  # Touches the left of the element
        BoundingBox(4, 6, 1, 6),  # Intersects the right of the element
        BoundingBox(5, 6, 1, 6),  # Touches the right of the element
        BoundingBox(3, 4, 1, 6),  # Goes through the center vertically
        BoundingBox(3, 4, 3, 4),  # Is enclosed inside the element
    )
    def test_partially_within_true(self, bounding_box):
        pdf_element = create_pdf_element(self.element_bbox)
        self.assertTrue(pdf_element.partially_within(bounding_box))

    @data(
        BoundingBox(1, 6, 0, 1),  # Underneath the element
        BoundingBox(1, 6, 6, 7),  # Above the element
        BoundingBox(0, 1, 1, 6),  # To the left of the element
        BoundingBox(6, 7, 1, 6),  # To the right of the element
    )
    def test_partially_within_false(self, bounding_box):
        pdf_element = create_pdf_element(self.element_bbox)
        self.assertFalse(pdf_element.partially_within(bounding_box))

    @data(BoundingBox(1, 6, 1, 6))  # Fully encloses the element
    def test_entirely_within_true(self, bounding_box):
        pdf_element = create_pdf_element(self.element_bbox)
        self.assertTrue(pdf_element.entirely_within(bounding_box))

    @data(
        BoundingBox(1, 6, 0, 3),  # Intersects the bottom of the element
        BoundingBox(1, 6, 0, 2),  # Touches the bottom of the element
        BoundingBox(1, 6, 4, 6),  # Intersects the top of the element
        BoundingBox(1, 6, 5, 6),  # Touches the top of the element
        BoundingBox(1, 6, 3, 4),  # Goes through the center horizontally
        BoundingBox(1, 3, 1, 6),  # Intersects the left of the element
        BoundingBox(1, 2, 1, 6),  # Touches the left of the element
        BoundingBox(4, 6, 1, 6),  # Intersects the right of the element
        BoundingBox(5, 6, 1, 6),  # Touches the right of the element
        BoundingBox(3, 4, 1, 6),  # Goes through the center vertically
        BoundingBox(1, 6, 0, 1),  # Underneath the element
        BoundingBox(1, 6, 6, 7),  # Above the element
        BoundingBox(0, 1, 1, 6),  # To the left of the element
        BoundingBox(6, 7, 1, 6),  # To the right of the element
        BoundingBox(3, 4, 3, 4),  # Is enclosed inside the element
    )
    def test_entirely_within_false(self, bounding_box):
        pdf_element = create_pdf_element(self.element_bbox)
        self.assertFalse(pdf_element.entirely_within(bounding_box))
Ejemplo n.º 17
0
    def test_horizontally_in_line_with(self, partially_within_mock):
        # Stand-in for the real geometry check: an element counts as "within"
        # purely when its text says so. Parameter names mirror the original
        # stub so that keyword-style calls would still bind.
        def text_says_within(self, bounding_box):
            return self.text() == "within"

        partially_within_mock.side_effect = text_says_within

        # elem1 is the reference element and the only one given an explicit box.
        elem1 = FakePDFMinerTextElement(
            text="within", bounding_box=BoundingBox(50, 51, 50, 51)
        )
        elem2 = FakePDFMinerTextElement(text="within")
        elem3 = FakePDFMinerTextElement()
        elem4 = FakePDFMinerTextElement(text="within")
        elem5 = FakePDFMinerTextElement()
        elem6 = FakePDFMinerTextElement(text="within")

        page1 = Page(elements=[elem1, elem2, elem3, elem4], width=100, height=100)
        page2 = Page(elements=[elem5, elem6], width=100, height=100)

        doc = PDFDocument(pages={1: page1, 2: page2})
        elem_list = doc.elements

        page1_pdf_elements = [
            self.extract_element_from_list(fake, elem_list)
            for fake in (elem1, elem2, elem3, elem4)
        ]
        pdf_elem1, pdf_elem2, _, pdf_elem4 = page1_pdf_elements

        def calls_with(bbox):
            # One expected mock call per page-1 element, all with the same box.
            return [call(pdf_elem, bbox) for pdf_elem in page1_pdf_elements]

        # Default call: the reference element itself is left out of the result.
        in_line = elem_list.horizontally_in_line_with(pdf_elem1)

        # The expected box covers the full page width (0-100) at elem1's
        # vertical extent (50-51).
        full_width_bbox = BoundingBox(0, 100, 50, 51)
        partially_within_mock.assert_has_calls(
            calls_with(full_width_bbox), any_order=True
        )

        self.assertEqual(len(in_line), 2)
        self.assertIn(pdf_elem2, in_line)
        self.assertIn(pdf_elem4, in_line)

        # inclusive=True: same box, but the reference element is kept as well.
        partially_within_mock.reset_mock()
        in_line_inclusive = elem_list.horizontally_in_line_with(
            pdf_elem1, inclusive=True
        )

        partially_within_mock.assert_has_calls(
            calls_with(full_width_bbox), any_order=True
        )

        self.assertEqual(len(in_line_inclusive), 3)
        for kept in (pdf_elem1, pdf_elem2, pdf_elem4):
            self.assertIn(kept, in_line_inclusive)

        # tolerance=0.1: the box is 0.1 narrower at the top and at the bottom.
        shrunk_bbox = BoundingBox(0, 100, 50.1, 50.9)

        partially_within_mock.reset_mock()
        elem_list.horizontally_in_line_with(pdf_elem1, tolerance=0.1)

        partially_within_mock.assert_has_calls(
            calls_with(shrunk_bbox), any_order=True
        )
Ejemplo n.º 18
0
    def test_extract_text_from_table(self):
        # Scenario 1: a complete 2*2 grid must come back as a 2*2 list of strings.
        #
        #       top_left        top_right
        #       bottom_left     bottom_right
        #
        # The bottom-right text ends with a space so that the two strip_text
        # settings produce distinguishable output.
        top_left = FakePDFMinerTextElement(
            text="fake_text_1", bounding_box=BoundingBox(0, 5, 6, 10)
        )
        top_right = FakePDFMinerTextElement(
            text="fake_text_2", bounding_box=BoundingBox(6, 10, 6, 10)
        )
        bottom_left = FakePDFMinerTextElement(
            text="fake_text_3", bounding_box=BoundingBox(0, 5, 0, 5)
        )
        bottom_right = FakePDFMinerTextElement(
            text="fake_text_4 ", bounding_box=BoundingBox(6, 10, 0, 5)
        )

        square_document = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right]
        )
        square_elements = square_document.elements

        # Without an explicit strip_text the trailing space is expected to be gone.
        stripped = extract_table(square_elements, as_text=True)
        self.assertEqual(len(stripped), 2)
        for row_index in (0, 1):
            self.assertEqual(len(stripped[row_index]), 2)
        expected_stripped = [
            ["fake_text_1", "fake_text_2"],
            ["fake_text_3", "fake_text_4"],
        ]
        self.assertListEqual(expected_stripped, stripped)

        # With strip_text=False the text must be returned exactly as supplied.
        unstripped = extract_table(square_elements, as_text=True, strip_text=False)
        expected_unstripped = [
            ["fake_text_1", "fake_text_2"],
            ["fake_text_3", "fake_text_4 "],
        ]
        self.assertListEqual(expected_unstripped, unstripped)

        # Scenario 2: two further elements leave gaps in the grid, and every gap
        # must be reported as an empty string.
        #
        #       top_left        top_right                       far_top
        #       bottom_left     bottom_right    extra_bottom
        #
        extra_bottom = FakePDFMinerTextElement(
            text="fake_text_5", bounding_box=BoundingBox(11, 15, 0, 5)
        )
        far_top = FakePDFMinerTextElement(
            text="fake_text_6", bounding_box=BoundingBox(16, 20, 6, 10)
        )
        sparse_document = create_pdf_document(
            elements=[
                top_left,
                top_right,
                bottom_left,
                bottom_right,
                extra_bottom,
                far_top,
            ]
        )
        sparse_elements = sparse_document.elements

        stripped = extract_table(sparse_elements, as_text=True)
        self.assertEqual(len(stripped), 2)
        for row_index in (0, 1):
            self.assertEqual(len(stripped[row_index]), 4)
        expected_stripped = [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4", "fake_text_5", ""],
        ]
        self.assertListEqual(expected_stripped, stripped)

        unstripped = extract_table(sparse_elements, as_text=True, strip_text=False)
        expected_unstripped = [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4 ", "fake_text_5", ""],
        ]
        self.assertListEqual(expected_unstripped, unstripped)
Ejemplo n.º 19
0
    def test_extract_table_removing_duplicate_header_different_fonts_or_text(self):
        # Three rows in two columns. The second and third rows resemble the first
        # but each left-hand cell differs from it in one respect, so all three
        # rows are expected to survive duplicate-header removal.
        #
        #    first_left           first_right
        #    bigger_font_left     second_right    (left cell has font size 12)
        #    other_text_left      third_right     (left cell has different text)
        #
        def build_header(text, bounding_box, font_size=10):
            # Every element in this test shares the same font name.
            return FakePDFMinerTextElement(
                text=text,
                font_name="header font",
                font_size=font_size,
                bounding_box=bounding_box,
            )

        first_left = build_header("header 1", BoundingBox(0, 5, 21, 25))
        first_right = build_header("header 2", BoundingBox(11, 15, 21, 25))
        bigger_font_left = build_header(
            "header 1", BoundingBox(0, 5, 16, 20), font_size=12
        )
        second_right = build_header("header 1", BoundingBox(11, 15, 16, 20))
        other_text_left = build_header(
            "header with a different name", BoundingBox(0, 5, 11, 15)
        )
        third_right = build_header("header 2", BoundingBox(11, 15, 11, 15))

        pdf = create_pdf_document(
            elements=[
                first_left,
                first_right,
                bigger_font_left,
                second_right,
                other_text_left,
                third_right,
            ]
        )
        elements = pdf.elements

        table = extract_table(elements, remove_duplicate_header_rows=True)

        # Nothing may have been dropped: still three rows of two cells each.
        self.assertEqual(len(table), 3)
        for row_index in range(3):
            self.assertEqual(len(table[row_index]), 2)

        expected_rows = [
            [first_left, first_right],
            [bigger_font_left, second_right],
            [other_text_left, third_right],
        ]
        self.assert_original_element_list_list_equal(expected_rows, table)
Ejemplo n.º 20
0
    def test_extract_table_removing_duplicate_header_rows(self):
        def build_header(text, bounding_box):
            # All header-like elements in this test share font name and size.
            return FakePDFMinerTextElement(
                text=text,
                font_name="header font",
                font_size=10,
                bounding_box=bounding_box,
            )

        # Scenario 1: a table consisting of the header row only.
        #
        #    header_left    header_right
        #
        header_left = build_header("header 1", BoundingBox(0, 5, 21, 25))
        header_right = build_header("header 2", BoundingBox(11, 15, 21, 25))
        single_row_document = create_pdf_document(
            elements=[header_left, header_right]
        )
        single_row_elements = single_row_document.elements

        single_row_table = extract_table(
            single_row_elements, remove_duplicate_header_rows=True
        )
        # A one-line table cannot contain a duplicate of its own header, so the
        # extraction is expected to hand back the table unchanged.
        self.assertEqual(len(single_row_table), 1)
        self.assertEqual(len(single_row_table[0]), 2)
        self.assert_original_element_list_list_equal(
            [[header_left, header_right]], single_row_table
        )

        # Scenario 2: the header is repeated twice further down, once in the same
        # columns and once shifted into the middle column.
        #
        #    header_left                       header_right
        #       body_a           body_b
        #    repeat_left                       repeat_right
        #       body_c                         body_d
        #    shifted_left     shifted_mid
        #
        body_a = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 16, 20))
        body_b = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 16, 20))
        repeat_left = build_header("header 1", BoundingBox(0, 5, 11, 15))
        repeat_right = build_header("header 2", BoundingBox(11, 15, 11, 15))
        body_c = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
        body_d = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
        shifted_left = build_header("header 1", BoundingBox(0, 5, 0, 5))
        shifted_mid = build_header("header 2", BoundingBox(6, 10, 0, 5))

        full_document = create_pdf_document(
            elements=[
                header_left,
                header_right,
                body_a,
                body_b,
                repeat_left,
                repeat_right,
                body_c,
                body_d,
                shifted_left,
                shifted_mid,
            ]
        )
        full_elements = full_document.elements

        full_table = extract_table(full_elements, remove_duplicate_header_rows=True)
        # Only the repeat_left/repeat_right row is expected to disappear. The final
        # row stays because its gaps do not match those of the header row.
        self.assertEqual(len(full_table), 4)
        for row_index in range(4):
            self.assertEqual(len(full_table[row_index]), 3)

        expected_rows = [
            [header_left, None, header_right],
            [body_a, body_b, None],
            [body_c, None, body_d],
            [shifted_left, shifted_mid, None],
        ]
        self.assert_original_element_list_list_equal(expected_rows, full_table)
Ejemplo n.º 21
0
 def test_repr(self):
     # repr() of a bounding box is expected to list all four coordinates, by
     # name, inside angle brackets.
     unit_box = BoundingBox(0, 1, 0, 1)
     expected_repr = "<BoundingBox x0=0, x1=1, y0=0, y1=1>"
     self.assertEqual(repr(unit_box), expected_repr)