def test_image_ocr(self):
    """Test that correct words are ocred and that the word 'irure' is on approximately proper position.

    Reads ``self.ocr_data`` (items with the keys "word" and "bb") and
    ``self.example_pdf`` and compares the OCR result with the digital content
    of the first page.
    """
    found_words_from_ocr = {item["word"] for item in self.ocr_data}
    digital_first_page = self.example_pdf.get_pages()[0]
    digital_words_first_page = {
        w.text.strip() for w in digital_first_page if w.text.strip()
    }

    # overlap of detected and digital words should be large
    inter = len(found_words_from_ocr & digital_words_first_page)
    iou = inter / (len(found_words_from_ocr) +
                   len(digital_words_first_page) - inter)
    # assertGreater reports the actual value on failure, assertTrue would not
    self.assertGreater(iou, 0.9)

    # further, let's find the word 'irure' in both digital content and scanned content
    irure_elements = self.example_pdf.get_page_as_html(0).xpath(
        './/word[text()="irure"]')
    self.assertTrue(irure_elements,
                    "word 'irure' not found in the digital content")
    page_width, page_height = self.example_pdf.get_width_height(0)
    irure_digital_bb = Pdf.get_bounding_box_of_elem(
        irure_elements[0]).relative_to_size(width=page_width,
                                            height=page_height)

    irure_ocr_items = [
        item for item in self.ocr_data if item["word"] == "irure"
    ]
    self.assertTrue(irure_ocr_items,
                    "word 'irure' not found in the OCR result")
    irure_ocred_bb = irure_ocr_items[0]["bb"]

    # Bounding box of 'irure' should be similar in both cases.
    # However, we need to be tolerant here, as bounding boxes of scanned text are typically smaller
    # than the digital ones. So let's require intersection over union at least 0.4.
    self.assertGreater(irure_ocred_bb.get_iou(irure_digital_bb), 0.4)
class TestScanner(unittest.TestCase):
    """OCR tests for ``Scanner``, run against the first page of the example pdf."""

    # Shared fixtures, created once when the class body is executed.
    example_pdf = Pdf(PDF_PATH)
    first_page_large = example_pdf.page_image(page_idx=0, dpi=300)
    # Filled in by setUp with the result of Scanner.ocr_one_image.
    ocr_data = None

    def setUp(self) -> None:
        """Run OCR on the rendered first page before every test."""
        self.ocr_data = Scanner.ocr_one_image(self.first_page_large)

    def test_image_ocr(self):
        """Test that correct words are ocred and that the word 'irure' is on approximately proper position."""
        found_words_from_ocr = {item["word"] for item in self.ocr_data}
        digital_first_page = self.example_pdf.get_pages()[0]
        digital_words_first_page = {
            w.text.strip() for w in digital_first_page if w.text.strip()
        }

        # overlap of detected and digital words should be large
        inter = len(found_words_from_ocr & digital_words_first_page)
        iou = inter / (len(found_words_from_ocr) +
                       len(digital_words_first_page) - inter)
        # assertGreater reports the actual value on failure
        self.assertGreater(iou, 0.9)

        # further, let's find the word 'irure' in both digital content and scanned content
        irure_elements = self.example_pdf.get_page_as_html(0).xpath(
            './/word[text()="irure"]')
        self.assertTrue(irure_elements,
                        "word 'irure' not found in the digital content")
        page_width, page_height = self.example_pdf.get_width_height(0)
        irure_digital_bb = Pdf.get_bounding_box_of_elem(
            irure_elements[0]).relative_to_size(width=page_width,
                                                height=page_height)

        irure_ocr_items = [
            item for item in self.ocr_data if item["word"] == "irure"
        ]
        self.assertTrue(irure_ocr_items,
                        "word 'irure' not found in the OCR result")
        irure_ocred_bb = irure_ocr_items[0]["bb"]

        # Bounding box of 'irure' should be similar in both cases.
        # However, we need to be tolerant here, as bounding boxes of scanned text are typically smaller
        # than the digital ones. So let's require intersection over union at least 0.4.
        self.assertGreater(irure_ocred_bb.get_iou(irure_digital_bb), 0.4)

    def test_ocred_pdf(self):
        """Convert the example_pdf into an image and the image back into a one-page pdf: test consistency."""
        # mkstemp returns an already-open descriptor together with the path;
        # only the path is needed, so close the descriptor instead of leaking it.
        fd, pdf_path = mkstemp()
        os.close(fd)
        try:
            pdf_width, pdf_height = self.example_pdf.get_width_height()
            Scanner.image_to_one_page_ocred_pdf(self.first_page_large,
                                                pdf_path,
                                                pdf_width=pdf_width,
                                                pdf_height=pdf_height,
                                                ocr_text=self.ocr_data)
            scanned_pdf = Pdf(pdf_path)
            # get digital content of the scanned pdf
            scanned_text = scanned_pdf.layout_text
            self.assertRegex(
                scanned_text,
                r"of\s+all\s+factories\s+10\s+bil\.\s+Euro\s+4\%")
        finally:
            # cleanup, also when an assertion above fails
            os.remove(pdf_path)
def test_ocred_pdf(self):
    """Convert the example_pdf into an image and the image back into a one-page pdf: test consistency.

    The one-page pdf is written to a temporary file which is removed again
    even when the test fails.
    """
    # mkstemp returns an already-open descriptor together with the path;
    # only the path is needed, so close the descriptor instead of leaking it.
    fd, pdf_path = mkstemp()
    os.close(fd)
    try:
        pdf_width, pdf_height = self.example_pdf.get_width_height()
        Scanner.image_to_one_page_ocred_pdf(self.first_page_large,
                                            pdf_path,
                                            pdf_width=pdf_width,
                                            pdf_height=pdf_height,
                                            ocr_text=self.ocr_data)
        scanned_pdf = Pdf(pdf_path)
        # get digital content of the scanned pdf
        scanned_text = scanned_pdf.layout_text
        self.assertRegex(scanned_text,
                         r"of\s+all\s+factories\s+10\s+bil\.\s+Euro\s+4\%")
    finally:
        # cleanup, also when an assertion above fails
        os.remove(pdf_path)
def _get_scored_words(words_in_page: List[html.HtmlElement],
                      one_annotation: Annotation,
                      threshold: float) -> List[Dict]:
    """Pick the words of one page that overlap strongly with an annotation's box.

    The score of a word is the area of the intersection of its bounding box
    with ``one_annotation.box`` divided by the area of the word's own box; it
    is 0 when ``intersection`` returns None. Only words whose score is strictly
    greater than ``threshold`` are kept.

    Returns a list, in page order, of dictionaries of the form
        {"word": html element representing the word,
         "score": proportion of the word box intersecting the annotation box}
    """
    candidates = []
    for element in words_in_page:
        element_box = Pdf.get_bounding_box_of_elem(element)
        overlap = element_box.intersection(one_annotation.box)
        if overlap is None:
            # no common area with the annotation at all
            ratio = 0
        else:
            ratio = overlap.area / element_box.area
        if ratio > threshold:
            candidates.append({"word": element, "score": ratio})
    return candidates
def test_enriched_annotations(self):
    """Check that enriched annotation for our example pdf coincide with expected results.

    Reads ``self.annotated_pdf.enriched_annotations`` (items with the keys
    "annotation" and "words") and ``self.extracted_annots``.
    """
    enriched = self.annotated_pdf.enriched_annotations

    # check that only rectangle-types are here
    rectangle_annots = [
        annot for annot in self.extracted_annots if annot.type == "rectangle"
    ]
    for i, annot in enumerate(rectangle_annots):
        self.assertTrue(
            annotations_are_similar(annot, enriched[i]["annotation"]))

    # check that the textual content of first annotation is correct
    first_words = enriched[0]["words"]
    self.assertEqual(' '.join(w["word"].text for w in first_words),
                     "Being killed at train station")

    # check that the 'scores' of words in first rectangle annotation are reasonable
    for w in first_words:
        word_bb = Pdf.get_bounding_box_of_elem(w["word"])
        self.assertGreater(w["score"], 0.9)
        recomputed = word_bb.intersection(
            rectangle_annots[0].box).area / word_bb.area
        # Compare the absolute difference: without abs() a recomputed value far
        # below the stored score would give a negative number and pass unnoticed.
        self.assertLess(abs(recomputed - w["score"]), 0.01)
class TestPdf(unittest.TestCase):
    """Tests of ``Pdf`` using the example pdf and its rotated variant."""

    # Shared fixtures, created once when the class body is executed.
    pdf = Pdf(PDF_PATH)
    pdf_rotated = Pdf(PDF_ROTATED_PATH)

    def test_basic_attributes(self):
        """Check correctness of path, name and number of pages."""
        self.assertEqual(self.pdf.pdf_path, PDF_PATH)
        self.assertEqual(self.pdf.name, "example.pdf")
        self.assertEqual(self.pdf.number_of_pages, 2)

    def test_width_height_rotation(self):
        """Check width, height, and extracted page rotation."""
        w, h = self.pdf.get_width_height(0)
        wr, hr = self.pdf_rotated.get_width_height(0)

        # width and height of the original pdf (tolerance of one unit)
        self.assertAlmostEqual(w, 595, delta=1)
        self.assertAlmostEqual(h, 842, delta=1)

        # rotated pdf should have width and height swapped
        self.assertAlmostEqual(hr, 595, delta=1)
        self.assertAlmostEqual(wr, 842, delta=1)

        # check page rotation
        self.assertEqual(self.pdf.page_rotation(0), 0)
        self.assertEqual(self.pdf.page_rotation(1), 0)
        self.assertEqual(self.pdf_rotated.page_rotation(0), 90)

    def test_page_image(self):
        """Check consistency of first-page image, reference image, and recovered image from rotated pdf."""
        im_1 = self.pdf.page_image(0)
        im_rot_1 = self.pdf_rotated.page_image(0)
        im_1_reconstructed = im_rot_1.rotate(90, expand=True)

        # sizes should coincide
        self.assertEqual(im_1.size, im_1_reconstructed.size)

        # first pdf page should be similar to the rotated first page of rotated pdf
        self.assertGreater(
            naive_image_similarity(np.array(im_1),
                                   np.array(im_1_reconstructed)), 0.98)

        # first page should be similar to the precomputed image from disc
        self.assertGreater(
            naive_image_similarity(
                np.array(im_1),
                np.array(Image.open(str(FIRST_PDF_PAGE_PATH)))), 0.98)

        images = list(self.pdf.images)
        images_rotated = list(self.pdf_rotated.images)

        # the 'images' method should return the precomputed images from a buffer, so here we require exact match
        self.assertEqual(im_1, images[0])
        self.assertEqual(im_rot_1, images_rotated[0])

    def test_text_extraction_from_pdf(self):
        """This is essentially testing pdftotext (probably coming from Poppler, of Xpdf)."""
        simple_text = self.pdf.simple_text
        layout_text = self.pdf.layout_text
        # xml with bounding boxes of words
        root = self.pdf.get_page_as_html(0)

        # list of strings (one per page)
        simple_pages = [page for page in simple_text.split("\f") if page]
        layout_pages = [page for page in layout_text.split("\f") if page]

        # We have two pages in the pdf
        self.assertEqual(len(simple_pages), 2)
        self.assertEqual(len(layout_pages), 2)

        # Test that first page contain expected words
        # NOTE(review): the literal "*****@*****.**" occurs both in the set that must be
        # present and in the set that must be absent, so the two assertions below cannot
        # both hold. It looks like a redacted e-mail address (as does "[email protected]"
        # further down) - confirm the intended literals.
        words_in_first_page = set(simple_pages[0].split())
        self.assertTrue({"Lorem", "ipsum", "Aron", "killed",
                         "*****@*****.**"}.issubset(words_in_first_page))
        self.assertFalse({
            "Autobahn", "Das", "The", "name", "hungry", "*****@*****.**"
        } & words_in_first_page)

        # this regex should be matched in a reasonably extracted layout-first-page-text
        self.assertRegex(layout_pages[0],
                         r"Stolen\s+bike\s+500\s+Euro\s+3%")
        self.assertRegex(layout_pages[1], r"[email protected]\s*\n")

        # Find bounding box of 'extreme' word on first page
        extreme_element = root.xpath(".//word[text()='extreme']")[0]
        extreme_bb = Rectangle(x_min=extreme_element.attrib["xmin"],
                               y_min=extreme_element.attrib["ymin"],
                               x_max=extreme_element.attrib["xmax"],
                               y_max=extreme_element.attrib["ymax"])
        # Check that the bounding box is reasonable
        self.assertIn(extreme_bb,
                      Rectangle(x_min=220, y_min=530, x_max=290, y_max=590))

    def test_text_extraction_from_rotated_pdf(self):
        """Check that bounding box of a word in pdf is where it should be."""
        pages = self.pdf.get_pages()
        pages_rotated = self.pdf_rotated.get_pages()
        pages_txt = self.pdf.get_pages_as_text()
        self.assertEqual(len(pages), 2)
        self.assertEqual(len(pages_txt), 2)

        first_el_pdf = pages[0][0]
        first_el_pdf_rotated = pages_rotated[0][0]

        # both pdf should start with "Insurance" on the first page
        self.assertEqual(first_el_pdf.text, "Insurance")
        self.assertEqual(first_el_pdf_rotated.text, "Insurance")

        # enforce approximate bounding box of this first word
        self.assertIn(self.pdf.get_bounding_box_of_elem(first_el_pdf),
                      Rectangle(x_min=72, y_min=98, x_max=165, y_max=128))
        self.assertIn(
            self.pdf_rotated.get_bounding_box_of_elem(first_el_pdf_rotated),
            Rectangle(x_min=712, y_min=70, x_max=750, y_max=162))

    def test_pdf_recreation(self):
        """Test the method `recreate_digital_content`.

        We convert the example pdf to a new pdf created from images and ocr. Then we test that
          * image-content of first page is similar to image of the reconstructed pdf, and
          * textual content of first page is the same as ocr-result from first page-image.
        (Ocr itself is tested in the test_ocr module.)
        """
        # mkstemp returns an already-open descriptor together with the path;
        # only the path is needed, so close the descriptor instead of leaking it.
        fd, tmp_pdf_file = mkstemp()
        os.close(fd)
        try:
            self.pdf.recreate_digital_content(tmp_pdf_file,
                                              tesseract_lang='eng',
                                              tesseract_conf="")
            recreated = Pdf(tmp_pdf_file)

            # pdf should have two pages
            self.assertEqual(recreated.number_of_pages, 2)

            # size of first page should be unchanged
            pdf_width, pdf_height = self.pdf.get_width_height(0)
            self.assertEqual(recreated.get_width_height(0),
                             (pdf_width, pdf_height))

            im_width, im_height = self.pdf.page_image(0, dpi=150).size
            im_recreated = recreated.page_image(0, dpi=150)

            # first page image original and reconstructed (with equal dpi) should have approximately the same size
            self.assertLess(
                abs(im_width - im_recreated.size[0]) / im_width, 0.05)
            self.assertLess(
                abs(im_height - im_recreated.size[1]) / im_height, 0.05)

            # first page should be similar to the first reconstructed page (after resizing)
            im_recreated = im_recreated.resize((im_width, im_height))
            self.assertGreater(
                naive_image_similarity(np.array(self.pdf.page_image(0)),
                                       np.array(im_recreated)), 0.98)

            # check digital content
            # we will compare dictionaries {word: bounding_box} in reconstructed pdf and in ocr scan of the original one
            # these two dictionaries should have equal keys, and similar values for all keys which represent unique words
            words_and_bounding_boxes = {
                item.text:
                Pdf.get_bounding_box_of_elem(item).relative_to_size(
                    width=pdf_width, height=pdf_height)
                for item in recreated.get_pages()[0]
            }
            scanned_words_and_bounding_boxes = {
                item["word"]: item["bb"]
                for item in Scanner.ocr_one_image(
                    self.pdf.page_image(0), lang="eng", config="")
            }

            # both dictionaries should have the same keys
            self.assertEqual(set(scanned_words_and_bounding_boxes),
                             set(words_and_bounding_boxes))

            # bounding boxes of the word "left" should be approximately equal (iou at least 0.4)
            self.assertGreater(
                scanned_words_and_bounding_boxes["left"].get_iou(
                    words_and_bounding_boxes["left"]), 0.4)
        finally:
            # cleanup, also when an assertion above fails
            os.remove(tmp_pdf_file)
def test_pdf_recreation(self):
    """Test the method `recreate_digital_content`.

    We convert the example pdf to a new pdf created from images and ocr. Then we test that
      * image-content of first page is similar to image of the reconstructed pdf, and
      * textual content of first page is the same as ocr-result from first page-image.
    (Ocr itself is tested in the test_ocr module.)

    The recreated pdf is written to a temporary file which is removed again
    even when the test fails.
    """
    # mkstemp returns an already-open descriptor together with the path;
    # only the path is needed, so close the descriptor instead of leaking it.
    fd, tmp_pdf_file = mkstemp()
    os.close(fd)
    try:
        self.pdf.recreate_digital_content(tmp_pdf_file,
                                          tesseract_lang='eng',
                                          tesseract_conf="")
        recreated = Pdf(tmp_pdf_file)

        # pdf should have two pages
        self.assertEqual(recreated.number_of_pages, 2)

        # size of first page should be unchanged
        pdf_width, pdf_height = self.pdf.get_width_height(0)
        self.assertEqual(recreated.get_width_height(0),
                         (pdf_width, pdf_height))

        im_width, im_height = self.pdf.page_image(0, dpi=150).size
        im_recreated = recreated.page_image(0, dpi=150)

        # first page image original and reconstructed (with equal dpi) should have approximately the same size
        self.assertLess(abs(im_width - im_recreated.size[0]) / im_width, 0.05)
        self.assertLess(
            abs(im_height - im_recreated.size[1]) / im_height, 0.05)

        # first page should be similar to the first reconstructed page (after resizing)
        im_recreated = im_recreated.resize((im_width, im_height))
        self.assertGreater(
            naive_image_similarity(np.array(self.pdf.page_image(0)),
                                   np.array(im_recreated)), 0.98)

        # check digital content
        # we will compare dictionaries {word: bounding_box} in reconstructed pdf and in ocr scan of the original one
        # these two dictionaries should have equal keys, and similar values for all keys which represent unique words
        words_and_bounding_boxes = {
            item.text: Pdf.get_bounding_box_of_elem(item).relative_to_size(
                width=pdf_width, height=pdf_height)
            for item in recreated.get_pages()[0]
        }
        scanned_words_and_bounding_boxes = {
            item["word"]: item["bb"]
            for item in Scanner.ocr_one_image(
                self.pdf.page_image(0), lang="eng", config="")
        }

        # both dictionaries should have the same keys
        self.assertEqual(set(scanned_words_and_bounding_boxes),
                         set(words_and_bounding_boxes))

        # bounding boxes of the word "left" should be approximately equal (iou at least 0.4)
        self.assertGreater(
            scanned_words_and_bounding_boxes["left"].get_iou(
                words_and_bounding_boxes["left"]), 0.4)
    finally:
        # cleanup, also when an assertion above fails
        os.remove(tmp_pdf_file)
class TestAnnotation(unittest.TestCase):
    """Tests of ``Annotation`` and ``AnnotationExtractor`` using the annotated example pdf."""

    # Shared fixtures, created once when the class body is executed.
    annotated_pdf = Pdf(ANNOTATED_PDF_PATH)
    extractor = AnnotationExtractor()
    # Annotations the extractor is expected to find in the annotated pdf.
    expected_annotations = [
        Annotation(
            page=0,
            type="note",
            box=Rectangle(x_min=87.58,
                          y_min=45.574,
                          x_max=107.58,
                          y_max=65.574),
            text_content=
            "Daniel, include also the remaining 133 pages in the pdf!!!!",
            who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=83.46,
                                 y_min=504.12,
                                 x_max=221.12,
                                 y_max=518),
                   text_content="risk",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=321.27,
                                 y_min=503,
                                 x_max=374.16,
                                 y_max=517.63),
                   text_content="coverage_total",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=373.4,
                                 y_min=504.5,
                                 x_max=399.66,
                                 y_max=518),
                   text_content="currency",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=465.68,
                                 y_min=504.87,
                                 x_max=486.31,
                                 y_max=517.25),
                   text_content="deductible in %",
                   who_annotated="peter"),
        Annotation(page=1,
                   type="oval",
                   box=Rectangle(x_min=55.7,
                                 y_min=133.72,
                                 x_max=338.9,
                                 y_max=177.23),
                   text_content="add Honza",
                   who_annotated="peter"),
    ]

    def test_assertion_in_annotation_type(self):
        """If type is not in ADMISSIBLE_ANNOTATIONS, an error should be raised."""
        with self.assertRaises(AssertionError):
            Annotation(page=0, type="invisible", box=Rectangle(0, 0, 0, 0))

    def test_annotation_creation(self):
        """Test the creation of one Annotation object."""
        ann = Annotation(page=12,
                         type="rectangle",
                         box=Rectangle(10, 10, 13, 13),
                         text_content="FPP3",
                         who_annotated="terminator II",
                         label=3)
        expected_annotation_as_dict = {
            "page": 12,
            "type": "rectangle",
            "box": {
                "x_min": 10,
                "y_min": 10,
                "x_max": 13,
                "y_max": 13
            },
            "text_content": "FPP3",
            "who_annotated": "terminator II",
            "label": 3
        }
        self.assertEqual(ann.as_dict, expected_annotation_as_dict)

    def test_annotation_extraction(self):
        """Extract annotation from file and check that they correspond to expected annotations."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

        # each annotation is found in expected
        for annot in annotations:
            with self.subTest(annotation=annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(annot, other)
                        for other in self.expected_annotations))

        # each expected annotation is found in annotations
        for exp_annot in self.expected_annotations:
            with self.subTest(expected_annotation=exp_annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(exp_annot, other)
                        for other in annotations))

    def test_dump_annotations_to_file(self):
        """Dump annotations to file, load them from file, and compare that all is consistent."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

        # mkstemp returns an already-open descriptor together with the path;
        # only the path is needed, so close the descriptor instead of leaking it.
        fd, temp_json_file = mkstemp()
        os.close(fd)
        try:
            self.extractor.dump_annotations_to_file(annotations,
                                                    temp_json_file)
            with open(temp_json_file) as f:
                annots_from_file = json.load(f)
        finally:
            # the file is no longer needed once loaded; remove it even on failure
            os.remove(temp_json_file)

        for i, annot in enumerate(annots_from_file):
            # check that i'th annotation is the same in annotations and in annots_from_file
            with self.subTest(annotation=annot):
                self.assertTrue(
                    annotations_are_similar(
                        Annotation(page=annot["page"],
                                   type=annot["type"],
                                   box=Rectangle.from_dict(annot["box"]),
                                   text_content=annot["text_content"],
                                   who_annotated=annot["who_annotated"],
                                   label=annot["label"]), annotations[i]))