def test_with_form(self):
    """Parse ``with-form.pdf`` and check the archive file and extracted text.

    The sample is parsed as ``application/pdf``. Afterwards the parser's
    ``archive_path`` must be an existing file, and the extracted text is
    checked against the two form-related sample strings.
    """
    parser = RasterisedDocumentParser(None)
    sample = os.path.join(self.SAMPLE_FILES, "with-form.pdf")

    parser.parse(sample, "application/pdf")

    self.assertTrue(os.path.isfile(parser.archive_path))

    expected_strings = [
        "Please enter your name in here:",
        "This is a PDF document with a form.",
    ]
    self.assertContainsStrings(parser.get_text(), expected_strings)
def test_with_form_error_notext(self):
    """Expect ``ParseError`` when parsing ``with-form.pdf``.

    NOTE(review): any settings override that makes this parse fail is not
    visible in this block -- presumably applied by a decorator; confirm.
    """
    parser = RasterisedDocumentParser(None)

    # Context-manager form of assertRaises; equivalent to passing a callable.
    with self.assertRaises(ParseError):
        parser.parse(
            os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
            "application/pdf",
        )
def test_is_ocred(self, m2, m):
    """Check the call counts of two mocks after ``get_text()``.

    ``m`` is configured to return a long run of text. After calling
    ``parser.get_text()`` the test expects ``m`` to have been called exactly
    twice and ``m2`` not at all.

    NOTE(review): ``m`` and ``m2`` are presumably injected by ``mock.patch``
    decorators outside this view -- confirm which callables they replace.
    """
    parser = RasterisedDocumentParser("", uuid.uuid4())

    # Three adjacent literals are concatenated at compile time into one string.
    long_text = (
        "lots of text lots of text lots of text lots of text lots of text lots of text "
        "lots of text lots of text lots of text lots of text lots of text lots of text "
        "lots of text lots of text lots of text lots of text lots of text lots of text "
    )
    m.return_value = long_text

    parser.get_text()

    self.assertEqual(m.call_count, 2)
    self.assertEqual(m2.call_count, 0)
def test_get_dpi(self):
    """Check ``get_dpi`` on an image without DPI info and one with it.

    ``simple-no-dpi.png`` is expected to yield ``None``; ``simple.png`` is
    expected to yield ``72``.
    """
    parser = RasterisedDocumentParser(None)

    missing_dpi = parser.get_dpi(
        os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
    # assertIsNone is the dedicated unittest check for None: it tests
    # identity and reports a clearer failure than assertEqual(x, None).
    self.assertIsNone(missing_dpi)

    present_dpi = parser.get_dpi(
        os.path.join(self.SAMPLE_FILES, "simple.png"))
    self.assertEqual(present_dpi, 72)
def test_parse_empty_doc(self):
    """Expect ``get_text()`` to raise ``ParseError`` with a fixed message.

    The parser is built from ``self.get_input_file(0)`` and a fresh UUID.
    The test fails if no ``ParseError`` is raised, and otherwise compares
    the exception text with "Empty document, nothing to do.".
    """
    parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())

    caught = None
    try:
        parser.get_text()
    except ParseError as err:
        # Keep the exception so the message can be checked outside the handler.
        caught = err

    if caught is None:
        self.fail("Should raise exception")

    self.assertEqual("Empty document, nothing to do.", str(caught))
def test_image_no_dpi_default(self):
    """Parse ``simple-no-dpi.png`` and check the archive file and text.

    The image is parsed as ``image/png``. The parser's ``archive_path`` must
    be an existing file, and the lower-cased text is checked for the sample
    sentence.

    NOTE(review): the test name suggests a default DPI setting is in effect;
    no such override is visible in this block -- confirm.
    """
    parser = RasterisedDocumentParser(None)
    image_path = os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")

    parser.parse(image_path, "image/png")

    self.assertTrue(os.path.isfile(parser.archive_path))

    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, ["this is a test document."])
def test_image_dpi_fail(self, m):
    """Expect ``ParseError`` for ``simple-no-dpi.png`` when ``m`` returns None.

    NOTE(review): ``m`` is presumably a mock injected by a ``mock.patch``
    decorator outside this view -- confirm which callable it replaces.
    """
    m.return_value = None
    parser = RasterisedDocumentParser(None)

    # Context-manager form of assertRaises; equivalent to passing a callable.
    with self.assertRaises(ParseError):
        parser.parse(
            os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"),
            "image/png",
        )
def test_simple_digital(self):
    """Parse ``simple-digital.pdf`` and check the archive file and text.

    The sample is parsed as ``application/pdf``. The parser's
    ``archive_path`` must be an existing file, and the extracted text is
    checked for the sample sentence.
    """
    parser = RasterisedDocumentParser(None)
    pdf_path = os.path.join(self.SAMPLE_FILES, "simple-digital.pdf")

    parser.parse(pdf_path, "application/pdf")

    archive_exists = os.path.isfile(parser.archive_path)
    self.assertTrue(archive_exists)

    self.assertContainsStrings(
        parser.get_text(),
        ["This is a test document."],
    )
def test_rotate(self):
    """Parse ``rotated.pdf`` and check the extracted text for four sentences.

    The sample is parsed as ``application/pdf``; the resulting text is then
    checked against the expected sentences with ``assertContainsStrings``.
    """
    parser = RasterisedDocumentParser(None)
    rotated_pdf = os.path.join(self.SAMPLE_FILES, "rotated.pdf")

    parser.parse(rotated_pdf, "application/pdf")

    # The curly apostrophes below are part of the expected text.
    expected_sentences = [
        "This is the text that appears on the first page. It’s a lot of text.",
        "Even if the pages are rotated, OCRmyPDF still gets the job done.",
        "This is a really weird file with lots of nonsense text.",
        "If you read this, it’s your own fault. Also check your screen orientation.",
    ]
    self.assertContainsStrings(parser.get_text(), expected_sentences)
def test_image_calc_a4_dpi(self, m):
    """Check that ``m`` is called once with ``image_dpi=62``.

    ``simple-no-dpi.png`` is parsed as ``image/png``; the mock ``m`` must
    have been called exactly once, with keyword argument ``image_dpi`` 62.

    NOTE(review): ``m`` is presumably a mock injected by a ``mock.patch``
    decorator outside this view -- confirm which callable it replaces.
    NOTE(review): another ``test_image_calc_a4_dpi`` is defined further down
    this source; if both live in the same class, the later definition
    replaces this one and this test never runs.
    """
    parser = RasterisedDocumentParser(None)
    image_path = os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")

    parser.parse(image_path, "image/png")

    m.assert_called_once()
    # call_args unpacks to (positional args, keyword args); only the
    # keyword arguments are inspected.
    _positional, call_kwargs = m.call_args
    self.assertEqual(call_kwargs['image_dpi'], 62)
def test_multi_page_mixed(self):
    """Parse ``multi-page-mixed.pdf`` and check archive, text and sidecar.

    After parsing as ``application/pdf``:

    * the parser's ``archive_path`` must be an existing file,
    * the lower-cased text is checked for "page 1" through "page 6",
    * ``sidecar.txt`` in the parser's ``tempdir`` must contain the
      "[OCR skipped on page(s) 4-6]" marker.
    """
    parser = RasterisedDocumentParser(None)
    pdf_path = os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf")

    parser.parse(pdf_path, "application/pdf")

    self.assertTrue(os.path.isfile(parser.archive_path))

    page_markers = ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"]
    self.assertContainsStrings(parser.get_text().lower(), page_markers)

    sidecar_path = os.path.join(parser.tempdir, "sidecar.txt")
    with open(sidecar_path) as sidecar_file:
        sidecar_contents = sidecar_file.read()

    self.assertIn("[OCR skipped on page(s) 4-6]", sidecar_contents)
def test_signed(self):
    """Parse ``signed.pdf``; expect no archive and the known text.

    After parsing as ``application/pdf`` the parser's ``archive_path`` must
    be ``None``, and the extracted text is checked for two fragments of the
    sample sentence.
    """
    parser = RasterisedDocumentParser(None)
    signed_pdf = os.path.join(self.SAMPLE_FILES, "signed.pdf")

    parser.parse(signed_pdf, "application/pdf")

    self.assertIsNone(parser.archive_path)

    expected_fragments = [
        "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
        "automated testing of signed/encrypted PDFs",
    ]
    self.assertContainsStrings(parser.get_text(), expected_fragments)
def test_thumbnail_fallback(self, m):
    """Request a thumbnail while ``m`` fails for PDF inputs.

    ``m`` gets a side effect that raises ``ParseError`` whenever the input
    file name contains ".pdf" and otherwise forwards to ``run_convert``.
    The test then calls ``get_thumbnail`` on ``simple-digital.pdf`` and
    passes as long as that call does not raise.

    NOTE(review): ``m`` is presumably a mock injected by a ``mock.patch``
    decorator outside this view -- confirm which callable it replaces.
    """

    def fail_on_pdf(input_file, output_file, **kwargs):
        # Parameter names are kept because the mock may be called with
        # keyword arguments.
        if ".pdf" in input_file:
            raise ParseError("Does not compute.")
        run_convert(input_file=input_file, output_file=output_file, **kwargs)

    m.side_effect = fail_on_pdf

    parser = RasterisedDocumentParser(uuid.uuid4())
    pdf_path = os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')
    parser.get_thumbnail(pdf_path, "application/pdf")
def test_ocrmypdf_parameters(self):
    """Check ``construct_ocrmypdf_parameters`` under several settings.

    The first call passes explicit file names and checks that they appear
    under the ``input_file``, ``output_file`` and ``sidecar`` keys. The
    remaining cases call the method with four empty strings inside
    ``override_settings`` blocks and check for the presence or absence of
    the ``clean``, ``clean_final`` and ``deskew`` keys.
    """
    parser = RasterisedDocumentParser(None)

    def blank_params():
        # Evaluated at call time, so it sees whichever settings override
        # is active in the enclosing ``with`` block.
        return parser.construct_ocrmypdf_parameters("", "", "", "")

    explicit = parser.construct_ocrmypdf_parameters(
        input_file="input.pdf",
        output_file="output.pdf",
        sidecar_file="sidecar.txt",
        mime_type="application/pdf",
        safe_fallback=False,
    )
    self.assertEqual(explicit['input_file'], "input.pdf")
    self.assertEqual(explicit['output_file'], "output.pdf")
    self.assertEqual(explicit['sidecar'], "sidecar.txt")

    # --- OCR_CLEAN variations ---
    with override_settings(OCR_CLEAN="none"):
        result = blank_params()
        self.assertNotIn("clean", result)
        self.assertNotIn("clean_final", result)

    with override_settings(OCR_CLEAN="clean"):
        result = blank_params()
        self.assertTrue(result['clean'])
        self.assertNotIn("clean_final", result)

    with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
        result = blank_params()
        self.assertTrue(result['clean_final'])
        self.assertNotIn("clean", result)

    with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
        result = blank_params()
        self.assertTrue(result['clean'])
        self.assertNotIn("clean_final", result)

    # --- OCR_DESKEW variations ---
    with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
        result = blank_params()
        self.assertTrue(result['deskew'])

    with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
        result = blank_params()
        self.assertNotIn('deskew', result)

    with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
        result = blank_params()
        self.assertNotIn('deskew', result)
def test_thumbnail(self):
    """Request a thumbnail for ``simple-digital.pdf``.

    The test passes as long as ``get_thumbnail`` does not raise; the return
    value is not inspected.

    NOTE(review): another ``test_thumbnail`` is defined further down this
    source; if both live in the same class, the later definition replaces
    this one and this test never runs.
    """
    parser = RasterisedDocumentParser(uuid.uuid4())
    pdf_path = os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')
    parser.get_thumbnail(pdf_path, "application/pdf")
def test_tiff(self):
    """Parse ``simple.tif`` and check the archive file and extracted text.

    The image is parsed as ``image/tiff``. The parser's ``archive_path``
    must be an existing file, and the lower-cased text must contain the
    sample sentence.
    """
    parser = RasterisedDocumentParser(None)
    tiff_path = os.path.join(self.SAMPLE_FILES, "simple.tif")

    parser.parse(tiff_path, "image/tiff")

    self.assertTrue(os.path.isfile(parser.archive_path))
    # assertIn shows both operands on failure, whereas assertTrue(a in b)
    # only reports "False is not true".
    self.assertIn("this is a test document", parser.get_text().lower())
def test_get_text_from_pdf(self):
    """Call ``extract_text`` on ``simple-digital.pdf`` and check the result.

    ``extract_text`` is called with ``None`` as its first argument and the
    sample path as its second; the stripped result is checked for the
    sample sentence.
    """
    parser = RasterisedDocumentParser(uuid.uuid4())
    pdf_path = os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')

    extracted = parser.extract_text(None, pdf_path)

    stripped = extracted.strip()
    self.assertContainsStrings(stripped, ["This is a test document."])
def test_thumbnail(self):
    """Request a thumbnail from a parser built for ``simple.pdf``.

    The parser is constructed with the sample path and a fresh UUID, and
    ``get_thumbnail`` is called without arguments. The test passes as long
    as that call does not raise.

    NOTE(review): another ``test_thumbnail`` is defined earlier in this
    source, using a different constructor signature; if both live in the
    same class, this later definition replaces the earlier one.
    """
    document_path = os.path.join(self.SAMPLE_FILES, 'simple.pdf')
    parser = RasterisedDocumentParser(document_path, uuid.uuid4())
    parser.get_thumbnail()
def test_multi_page_mixed_no_archive(self):
    """Parse ``multi-page-mixed.pdf``; expect no archive and pages 4-6 text.

    After parsing as ``application/pdf`` the parser's ``archive_path`` must
    be ``None``, and the lower-cased text is checked for "page 4", "page 5"
    and "page 6".

    NOTE(review): the settings that suppress the archive are not visible in
    this block -- presumably applied by a decorator; confirm.
    """
    parser = RasterisedDocumentParser(None)
    pdf_path = os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf")

    parser.parse(pdf_path, "application/pdf")

    self.assertIsNone(parser.archive_path)

    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, ["page 4", "page 5", "page 6"])
def test_thumbnail_encrypted(self):
    """Request a thumbnail for ``encrypted.pdf`` and check that it exists.

    The value returned by ``get_thumbnail`` must be the path of an existing
    file.
    """
    parser = RasterisedDocumentParser(uuid.uuid4())
    encrypted_pdf = os.path.join(self.SAMPLE_FILES, 'encrypted.pdf')

    thumbnail_path = parser.get_thumbnail(encrypted_pdf, "application/pdf")

    self.assertTrue(os.path.isfile(thumbnail_path))
def test_skip_noarchive_notext(self):
    """Parse ``multi-page-images.pdf`` and check the archive file and text.

    After parsing as ``application/pdf`` the parser's ``archive_path`` must
    be an existing file, and the lower-cased text is checked for "page 1",
    "page 2" and "page 3".

    NOTE(review): the test name suggests a specific OCR mode is active; no
    such override is visible in this block -- confirm.
    """
    parser = RasterisedDocumentParser(None)
    pdf_path = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")

    parser.parse(pdf_path, "application/pdf")

    self.assertTrue(os.path.isfile(parser.archive_path))

    page_markers = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(parser.get_text().lower(), page_markers)
def test_image_calc_a4_dpi(self):
    """Check that ``calculate_a4_dpi`` returns 62 for ``simple-no-dpi.png``.

    NOTE(review): another ``test_image_calc_a4_dpi`` (taking a mock
    argument) is defined earlier in this source; if both live in the same
    class, this later definition replaces the earlier one.
    """
    parser = RasterisedDocumentParser(None)
    image_path = os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")

    calculated_dpi = parser.calculate_a4_dpi(image_path)

    self.assertEqual(calculated_dpi, 62)