Ejemplo n.º 1
0
    def test_image_ocr(self):
        """Check the OCR output of the first page against its digital content.

        Two things are verified:
            * the set of OCR-ed words largely coincides with the set of words
              read from the digital content of the pdf, and
            * the word 'irure' is located by OCR at approximately the same
              position as in the digital content.
        """
        found_words_from_ocr = {item["word"] for item in self.ocr_data}
        digital_first_page = self.example_pdf.get_pages()[0]
        digital_words_first_page = {
            w.text.strip()
            for w in digital_first_page if w.text.strip()
        }

        # overlap of detected and digital words should be large (intersection over union of the two sets)
        inter = len(found_words_from_ocr & digital_words_first_page)
        union = len(found_words_from_ocr) + len(digital_words_first_page) - inter
        iou = inter / union
        # assertGreater (unlike assertTrue) reports the measured value when the check fails
        self.assertGreater(iou, 0.9)

        # further, let's find the word 'irure' in both digital content and scanned content
        irure_el = self.example_pdf.get_page_as_html(0).xpath(
            './/word[text()="irure"]')[0]
        # query the page size once and reuse it for both dimensions
        page_size = self.example_pdf.get_width_height(0)
        irure_digital_bb = Pdf.get_bounding_box_of_elem(
            irure_el).relative_to_size(width=page_size[0],
                                       height=page_size[1])
        irure_ocred_bb = [
            item["bb"] for item in self.ocr_data if item["word"] == "irure"
        ][0]

        # Bounding box of 'irure' should be similar in both cases.
        # However, we need to be tolerant here, as bounding boxes of scanned text are typically smaller
        # than the digital ones. So let's require intersection over union at least 0.4.
        self.assertGreater(irure_ocred_bb.get_iou(irure_digital_bb), 0.4)
Ejemplo n.º 2
0
class TestScanner(unittest.TestCase):
    """Tests of `Scanner`: OCR of a page image and creation of an OCR-ed pdf."""

    # The example pdf and the 300-dpi image of its first page are created once,
    # when the class body is executed, and shared by all tests.
    example_pdf = Pdf(PDF_PATH)
    first_page_large = example_pdf.page_image(page_idx=0, dpi=300)
    ocr_data = None

    def setUp(self) -> None:
        """Run OCR on the first-page image before each test."""
        self.ocr_data = Scanner.ocr_one_image(self.first_page_large)

    def test_image_ocr(self):
        """Check the OCR output of the first page against its digital content.

        Two things are verified:
            * the set of OCR-ed words largely coincides with the set of words
              read from the digital content of the pdf, and
            * the word 'irure' is located by OCR at approximately the same
              position as in the digital content.
        """
        found_words_from_ocr = {item["word"] for item in self.ocr_data}
        digital_first_page = self.example_pdf.get_pages()[0]
        digital_words_first_page = {
            w.text.strip()
            for w in digital_first_page if w.text.strip()
        }

        # overlap of detected and digital words should be large (intersection over union of the two sets)
        inter = len(found_words_from_ocr & digital_words_first_page)
        union = len(found_words_from_ocr) + len(digital_words_first_page) - inter
        iou = inter / union
        # assertGreater (unlike assertTrue) reports the measured value when the check fails
        self.assertGreater(iou, 0.9)

        # further, let's find the word 'irure' in both digital content and scanned content
        irure_el = self.example_pdf.get_page_as_html(0).xpath(
            './/word[text()="irure"]')[0]
        # query the page size once and reuse it for both dimensions
        page_size = self.example_pdf.get_width_height(0)
        irure_digital_bb = Pdf.get_bounding_box_of_elem(
            irure_el).relative_to_size(width=page_size[0],
                                       height=page_size[1])
        irure_ocred_bb = [
            item["bb"] for item in self.ocr_data if item["word"] == "irure"
        ][0]

        # Bounding box of 'irure' should be similar in both cases.
        # However, we need to be tolerant here, as bounding boxes of scanned text are typically smaller
        # than the digital ones. So let's require intersection over union at least 0.4.
        self.assertGreater(irure_ocred_bb.get_iou(irure_digital_bb), 0.4)

    def test_ocred_pdf(self):
        """Convert the example_pdf into an image and the image back into a one-page pdf: test consistency."""
        # mkstemp returns an already opened OS-level descriptor together with the path;
        # only the path is needed, so close the descriptor instead of leaking it.
        fd, pdf_path = mkstemp()
        os.close(fd)
        # registered cleanup removes the file even when an assertion below fails
        self.addCleanup(os.remove, pdf_path)

        page_size = self.example_pdf.get_width_height()
        Scanner.image_to_one_page_ocred_pdf(
            self.first_page_large,
            pdf_path,
            pdf_width=page_size[0],
            pdf_height=page_size[1],
            ocr_text=self.ocr_data)
        scanned_pdf = Pdf(pdf_path)
        # get digital content of the scanned pdf
        scanned_text = scanned_pdf.layout_text
        # assertRegex performs a regex search and shows the searched text on failure
        self.assertRegex(
            scanned_text,
            r"of\s+all\s+factories\s+10\s+bil\.\s+Euro\s+4\%")
Ejemplo n.º 3
0
 def test_ocred_pdf(self):
     """Convert the example_pdf into an image and the image back into a one-page pdf: test consistency."""
     # mkstemp returns an already opened OS-level descriptor together with the path;
     # only the path is needed, so close the descriptor instead of leaking it.
     fd, pdf_path = mkstemp()
     os.close(fd)
     # registered cleanup removes the file even when an assertion below fails
     self.addCleanup(os.remove, pdf_path)

     page_size = self.example_pdf.get_width_height()
     Scanner.image_to_one_page_ocred_pdf(
         self.first_page_large,
         pdf_path,
         pdf_width=page_size[0],
         pdf_height=page_size[1],
         ocr_text=self.ocr_data)
     scanned_pdf = Pdf(pdf_path)
     # get digital content of the scanned pdf
     scanned_text = scanned_pdf.layout_text
     # assertRegex performs a regex search and shows the searched text on failure
     self.assertRegex(
         scanned_text,
         r"of\s+all\s+factories\s+10\s+bil\.\s+Euro\s+4\%")
Ejemplo n.º 4
0
    def _get_scored_words(words_in_page: List[html.HtmlElement],
                          one_annotation: Annotation,
                          threshold: float) -> List[Dict]:
        """For one page and a given annotation, find all words that have high overlap with the annotation's box.

        The score of a word is the area of the intersection of the word's bounding box with the
        annotation's box, divided by the area of the word's bounding box. It is 0 when there is no
        intersection or when the word's bounding box has zero area.

        :param words_in_page: html elements representing the words of one page
        :param one_annotation: annotation whose `box` is compared with the word boxes
        :param threshold: only words with score strictly greater than this value are returned
        :return: a list of word-candidates of the form
            [{
                "word": html element representing the word,
                "score": proportion of the word box intersecting the annotation box
            }, ...]
        """
        scored_words = []
        for word in words_in_page:
            word_box = Pdf.get_bounding_box_of_elem(word)
            overlap = word_box.intersection(one_annotation.box)
            word_area = word_box.area
            if overlap is None or not word_area:
                # no intersection, or a degenerate (zero-area) word box that would
                # otherwise cause a division by zero
                score = 0
            else:
                score = overlap.area / word_area
            if score > threshold:
                scored_words.append({"word": word, "score": score})
        return scored_words
Ejemplo n.º 5
0
    def test_enriched_annotations(self):
        """Check that enriched annotations for our example pdf coincide with expected results."""
        enriched = self.annotated_pdf.enriched_annotations

        # check that the enriched annotations correspond, in order, to the extracted rectangle-type annotations
        rectangle_annots = [annot for annot in self.extracted_annots if annot.type == "rectangle"]
        for i, annot in enumerate(rectangle_annots):
            self.assertTrue(annotations_are_similar(annot, enriched[i]["annotation"]))

        # check that the textual content of first annotation is correct
        self.assertEqual(
            ' '.join(w["word"].text for w in enriched[0]["words"]),
            "Being killed at train station")

        # check that the 'scores' of words in first rectangle annotation are reasonable
        first_rectangle_box = rectangle_annots[0].box
        for w in enriched[0]["words"]:
            word_bb = Pdf.get_bounding_box_of_elem(w["word"])
            self.assertGreater(w["score"], 0.9)

            # recompute the score and compare in absolute value, so that a stored score
            # which is too high is detected as well as one which is too low
            recomputed_score = word_bb.intersection(first_rectangle_box).area / word_bb.area
            self.assertLess(abs(recomputed_score - w["score"]), 0.01)
Ejemplo n.º 6
0
class TestPdf(unittest.TestCase):
    """Tests of the `Pdf` class on an example pdf and on a second pdf loaded from PDF_ROTATED_PATH."""

    # Both documents are loaded once, when the class body is executed, and shared by all tests.
    pdf = Pdf(PDF_PATH)
    pdf_rotated = Pdf(PDF_ROTATED_PATH)

    def test_basic_attributes(self):
        """Check correctness of path, name and number of pages."""
        self.assertEqual(self.pdf.pdf_path, PDF_PATH)
        self.assertEqual(self.pdf.name, "example.pdf")
        self.assertEqual(self.pdf.number_of_pages, 2)

    def test_width_height_rotation(self):
        """Check width, height, and extracted page rotation."""
        w, h = self.pdf.get_width_height(0)
        wr, hr = self.pdf_rotated.get_width_height(0)

        # width and height of the original pdf
        self.assertLessEqual(abs(w - 595), 1)
        self.assertLessEqual(abs(h - 842), 1)

        # rotated pdf should have width and height swapped
        self.assertLessEqual(abs(hr - 595), 1)
        self.assertLessEqual(abs(wr - 842), 1)

        # check page rotation
        self.assertEqual(self.pdf.page_rotation(0), 0)
        self.assertEqual(self.pdf.page_rotation(1), 0)
        self.assertEqual(self.pdf_rotated.page_rotation(0), 90)

    def test_page_image(self):
        """Check consistency of first-page image, reference image, and recovered image from rotated pdf."""
        im_1 = self.pdf.page_image(0)
        im_rot_1 = self.pdf_rotated.page_image(0)

        im_1_reconstructed = im_rot_1.rotate(90, expand=True)

        # sizes should coincide
        self.assertEqual(im_1.size, im_1_reconstructed.size)
        # first pdf page should be similar to the rotated first page of rotated pdf
        self.assertGreater(
            naive_image_similarity(np.array(im_1),
                                   np.array(im_1_reconstructed)), 0.98)
        # first page should be similar to the precomputed image from disc
        self.assertGreater(
            naive_image_similarity(
                np.array(im_1),
                np.array(Image.open(str(FIRST_PDF_PAGE_PATH)))), 0.98)

        images = list(self.pdf.images)
        images_rotated = list(self.pdf_rotated.images)

        # the 'images' method should return the precomputed images from a buffer, so here we require exact match
        self.assertEqual(im_1, images[0])
        self.assertEqual(im_rot_1, images_rotated[0])

    def test_text_extraction_from_pdf(self):
        """This is essentially testing pdftotext (probably coming from Poppler, or Xpdf)."""
        simple_text = self.pdf.simple_text
        layout_text = self.pdf.layout_text
        # xml with bounding boxes of words
        root = self.pdf.get_page_as_html(0)

        # list of strings (one per page)
        simple_pages = [page for page in simple_text.split("\f") if page]
        layout_pages = [page for page in layout_text.split("\f") if page]

        # We have two pages in the pdf
        self.assertEqual(len(simple_pages), 2)
        self.assertEqual(len(layout_pages), 2)

        # Test that first page contains expected words
        # NOTE(review): the literal "*****@*****.**" occurs both in the required set and in the
        # forbidden set below, so the two assertions cannot pass together. It looks like a
        # redacted e-mail address; restore the two original values.
        words_in_first_page = set(simple_pages[0].split())
        self.assertTrue({"Lorem", "ipsum", "Aron", "killed",
                         "*****@*****.**"}.issubset(words_in_first_page))
        self.assertFalse({
            "Autobahn", "Das", "The", "name", "hungry", "*****@*****.**"
        } & words_in_first_page)

        # this regex should be matched in a reasonably extracted layout-first-page-text
        # (assertRegex performs a regex search and shows the searched text on failure)
        self.assertRegex(layout_pages[0], r"Stolen\s+bike\s+500\s+Euro\s+3%")
        # NOTE(review): "[email protected]" looks like a redaction placeholder; as written it
        # parses as a single character class. Confirm the intended pattern.
        self.assertRegex(layout_pages[1], r"[email protected]\s*\n")

        # Find bounding box of 'extreme' word on first page
        extreme_element = root.xpath(".//word[text()='extreme']")[0]
        extreme_bb = Rectangle(x_min=extreme_element.attrib["xmin"],
                               y_min=extreme_element.attrib["ymin"],
                               x_max=extreme_element.attrib["xmax"],
                               y_max=extreme_element.attrib["ymax"])

        # Check that the bounding box is reasonable
        self.assertIn(
            extreme_bb,
            Rectangle(x_min=220, y_min=530, x_max=290, y_max=590))

    def test_text_extraction_from_rotated_pdf(self):
        """Check that bounding box of a word in pdf is where it should be."""
        pages = self.pdf.get_pages()
        pages_rotated = self.pdf_rotated.get_pages()
        pages_txt = self.pdf.get_pages_as_text()

        self.assertEqual(len(pages), 2)
        self.assertEqual(len(pages_txt), 2)

        first_el_pdf = pages[0][0]
        first_el_pdf_rotated = pages_rotated[0][0]

        # both pdf should start with "Insurance" on the first page
        self.assertEqual(first_el_pdf.text, "Insurance")
        self.assertEqual(first_el_pdf_rotated.text, "Insurance")

        # enforce approximate bounding box of this first word
        self.assertIn(
            self.pdf.get_bounding_box_of_elem(first_el_pdf),
            Rectangle(x_min=72, y_min=98, x_max=165, y_max=128))

        self.assertIn(
            self.pdf_rotated.get_bounding_box_of_elem(first_el_pdf_rotated),
            Rectangle(x_min=712, y_min=70, x_max=750, y_max=162))

    def test_pdf_recreation(self):
        """Test the method `recreate_digital_content`.

        We convert the example pdf to a new pdf created from images and ocr.
        Then we test that
            * image-content of first page is similar to image of the reconstructed pdf, and
            * textual content of first page is the same as ocr-result from first page-image.
        (Ocr itself is tested in the test_ocr module.)
        """
        # mkstemp returns an already opened OS-level descriptor together with the path;
        # only the path is needed, so close the descriptor instead of leaking it.
        fd, tmp_pdf_file = mkstemp()
        os.close(fd)
        # registered cleanup removes the file even when an assertion below fails
        self.addCleanup(os.remove, tmp_pdf_file)

        self.pdf.recreate_digital_content(tmp_pdf_file,
                                          tesseract_lang='eng',
                                          tesseract_conf="")
        recreated = Pdf(tmp_pdf_file)

        # pdf should have two pages
        self.assertEqual(recreated.number_of_pages, 2)

        # size of first page should be unchanged
        pdf_width, pdf_height = self.pdf.get_width_height(0)
        self.assertEqual(recreated.get_width_height(0),
                         (pdf_width, pdf_height))

        im_width, im_height = self.pdf.page_image(0, dpi=150).size
        im_recreated = recreated.page_image(0, dpi=150)

        # first page image original and reconstructed (with equal dpi) should have approximately the same size
        self.assertLess(abs(im_width - im_recreated.size[0]) / im_width, 0.05)
        self.assertLess(
            abs(im_height - im_recreated.size[1]) / im_height, 0.05)

        # first page should be similar to the first reconstructed page (after resizing)
        im_recreated = im_recreated.resize((im_width, im_height))
        self.assertGreater(
            naive_image_similarity(np.array(self.pdf.page_image(0)),
                                   np.array(im_recreated)), 0.98)

        # check digital content
        # we will compare dictionaries {word: bounding_box} in reconstructed pdf and in ocr scan of the original one
        # these two dictionaries should have equal keys, and similar values for all keys which represent unique words
        recreated_boxes = {
            item.text: Pdf.get_bounding_box_of_elem(item).relative_to_size(
                width=pdf_width, height=pdf_height)
            for item in recreated.get_pages()[0]
        }

        scanned_boxes = {
            item["word"]: item["bb"]
            for item in Scanner.ocr_one_image(
                self.pdf.page_image(0), lang="eng", config="")
        }

        # both dictionaries should have the same keys
        self.assertEqual(set(scanned_boxes), set(recreated_boxes))

        # bounding boxes of the word "left" should be approximately equal (iou at least 0.4)
        self.assertGreater(
            scanned_boxes["left"].get_iou(recreated_boxes["left"]), 0.4)
Ejemplo n.º 7
0
    def test_pdf_recreation(self):
        """Test the method `recreate_digital_content`.

        We convert the example pdf to a new pdf created from images and ocr.
        Then we test that
            * image-content of first page is similar to image of the reconstructed pdf, and
            * textual content of first page is the same as ocr-result from first page-image.
        (Ocr itself is tested in the test_ocr module.)
        """
        # mkstemp returns an already opened OS-level descriptor together with the path;
        # only the path is needed, so close the descriptor instead of leaking it.
        fd, tmp_pdf_file = mkstemp()
        os.close(fd)
        # registered cleanup removes the file even when an assertion below fails
        self.addCleanup(os.remove, tmp_pdf_file)

        self.pdf.recreate_digital_content(tmp_pdf_file,
                                          tesseract_lang='eng',
                                          tesseract_conf="")
        recreated = Pdf(tmp_pdf_file)

        # pdf should have two pages
        self.assertEqual(recreated.number_of_pages, 2)

        # size of first page should be unchanged
        pdf_width, pdf_height = self.pdf.get_width_height(0)
        self.assertEqual(recreated.get_width_height(0),
                         (pdf_width, pdf_height))

        im_width, im_height = self.pdf.page_image(0, dpi=150).size
        im_recreated = recreated.page_image(0, dpi=150)

        # first page image original and reconstructed (with equal dpi) should have approximately the same size
        self.assertLess(abs(im_width - im_recreated.size[0]) / im_width, 0.05)
        self.assertLess(
            abs(im_height - im_recreated.size[1]) / im_height, 0.05)

        # first page should be similar to the first reconstructed page (after resizing)
        im_recreated = im_recreated.resize((im_width, im_height))
        self.assertGreater(
            naive_image_similarity(np.array(self.pdf.page_image(0)),
                                   np.array(im_recreated)), 0.98)

        # check digital content
        # we will compare dictionaries {word: bounding_box} in reconstructed pdf and in ocr scan of the original one
        # these two dictionaries should have equal keys, and similar values for all keys which represent unique words
        recreated_boxes = {
            item.text: Pdf.get_bounding_box_of_elem(item).relative_to_size(
                width=pdf_width, height=pdf_height)
            for item in recreated.get_pages()[0]
        }

        scanned_boxes = {
            item["word"]: item["bb"]
            for item in Scanner.ocr_one_image(
                self.pdf.page_image(0), lang="eng", config="")
        }

        # both dictionaries should have the same keys
        self.assertEqual(set(scanned_boxes), set(recreated_boxes))

        # bounding boxes of the word "left" should be approximately equal (iou at least 0.4)
        self.assertGreater(
            scanned_boxes["left"].get_iou(recreated_boxes["left"]), 0.4)
Ejemplo n.º 8
0
class TestAnnotation(unittest.TestCase):
    """Tests of `Annotation` objects and of their extraction from an annotated pdf."""

    # The annotated pdf and the extractor are created once, when the class body is executed.
    annotated_pdf = Pdf(ANNOTATED_PDF_PATH)
    extractor = AnnotationExtractor()

    # Annotations that `test_annotation_extraction` expects to find in the annotated pdf.
    expected_annotations = [
        Annotation(
            page=0,
            type="note",
            box=Rectangle(x_min=87.58,
                          y_min=45.574,
                          x_max=107.58,
                          y_max=65.574),
            text_content=
            "Daniel, include also the remaining 133 pages in the pdf!!!!",
            who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=83.46,
                                 y_min=504.12,
                                 x_max=221.12,
                                 y_max=518),
                   text_content="risk",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=321.27,
                                 y_min=503,
                                 x_max=374.16,
                                 y_max=517.63),
                   text_content="coverage_total",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=373.4,
                                 y_min=504.5,
                                 x_max=399.66,
                                 y_max=518),
                   text_content="currency",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=465.68,
                                 y_min=504.87,
                                 x_max=486.31,
                                 y_max=517.25),
                   text_content="deductible in %",
                   who_annotated="peter"),
        Annotation(page=1,
                   type="oval",
                   box=Rectangle(x_min=55.7,
                                 y_min=133.72,
                                 x_max=338.9,
                                 y_max=177.23),
                   text_content="add Honza",
                   who_annotated="peter"),
    ]

    def test_assertion_in_annotation_type(self):
        """If type is not in ADMISSIBLE_ANNOTATIONS, an error should be raised."""
        with self.assertRaises(AssertionError):
            Annotation(page=0, type="invisible", box=Rectangle(0, 0, 0, 0))

    def test_annotation_creation(self):
        """Test the creation of one Annotation object."""
        ann = Annotation(page=12,
                         type="rectangle",
                         box=Rectangle(10, 10, 13, 13),
                         text_content="FPP3",
                         who_annotated="terminator II",
                         label=3)

        expected_annotation_as_dict = {
            "page": 12,
            "type": "rectangle",
            "box": {
                "x_min": 10,
                "y_min": 10,
                "x_max": 13,
                "y_max": 13
            },
            "text_content": "FPP3",
            "who_annotated": "terminator II",
            "label": 3
        }

        self.assertEqual(ann.as_dict, expected_annotation_as_dict)

    def test_annotation_extraction(self):
        """Extract annotations from file and check that they correspond to expected annotations."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

        # each annotation is found in expected
        for annot in annotations:
            with self.subTest(annotation=annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(annot, other)
                        for other in self.expected_annotations))
        # each expected annotation is found in annotations
        for exp_annot in self.expected_annotations:
            with self.subTest(expected_annotation=exp_annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(exp_annot, other)
                        for other in annotations))

    def test_dump_annotations_to_file(self):
        """Dump annotations to file, load them from file, and compare that all is consistent."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

        # mkstemp returns an already opened OS-level descriptor together with the path;
        # only the path is needed, so close the descriptor instead of leaking it.
        fd, temp_json_file = mkstemp()
        os.close(fd)
        # registered cleanup removes the file even when an assertion below fails
        self.addCleanup(os.remove, temp_json_file)

        self.extractor.dump_annotations_to_file(annotations, temp_json_file)
        with open(temp_json_file) as f:
            annots_from_file = json.load(f)

        for i, annot in enumerate(annots_from_file):
            # check that the i'th annotation loaded from file is similar to the i'th extracted annotation
            with self.subTest(annotation=annot):
                loaded = Annotation(page=annot["page"],
                                    type=annot["type"],
                                    box=Rectangle.from_dict(annot["box"]),
                                    text_content=annot["text_content"],
                                    who_annotated=annot["who_annotated"],
                                    label=annot["label"])
                self.assertTrue(annotations_are_similar(loaded, annotations[i]))