def test_get_pdf_info(self):
    """Check get_pdf_info(pdf_content: bytes) on a real PDF and on empty input."""
    fixtures_dir: Path = Path(__file__).parent.parent
    pdf_bytes: bytes = _u.read_binary_file(fixtures_dir / 'test_eng.pdf')
    info: _u.PdfInfo = _u.get_pdf_info(pdf_bytes)

    # metadata expected from the 'test_eng.pdf' fixture
    self.assertEqual(info.author, '')
    self.assertEqual(info.creation_date, '2019-03-10 07:57:51+0000')
    self.assertEqual(info.creator, '')
    self.assertEqual(info.mod_date, '')
    self.assertEqual(info.producer, 'Tesseract 4.0.0-beta.1')
    self.assertEqual(info.title, '')
    self.assertEqual(info.num_pages, 1)

    # empty input: the failure is reported on stdout and a blank info comes back
    with patch('sys.stdout', new_callable=StringIO) as fake_stdout:
        empty_info: _u.PdfInfo = _u.get_pdf_info(bytes())
        printed: str = fake_stdout.getvalue()
    self.assertEqual(
        printed,
        "PyPDF2.PdfFileReader exception: Cannot read an empty file\n")
    for field_name in ('author', 'creation_date', 'creator', 'mod_date',
                       'producer', 'title'):
        self.assertEqual(getattr(empty_info, field_name), '')
    self.assertEqual(empty_info.num_pages, 0)
def create_pdf(self, admin_obj=None, request=None):
    """
    Create the OCRed PDF file of this instance, if it is possible, and persist the model.

    Nothing is done when ``self.can_create_pdf`` is falsy. For an image the
    PDF produced by ``ocr_img2pdf`` is written to disk; for a PDF the file is
    produced by ``ocr_pdf``. In both cases ``ocred_pdf.name`` and
    ``ocred_pdf_md5`` are updated, the ``num_created_pdf`` counter is
    incremented and, when both ``admin_obj`` and ``request`` are given,
    the admin user is notified.

    :admin_obj: An admin instance of the model (optional)
    :request: A request instance of the current http request (optional)
    :return: None
    """
    # checking that instance of OCRedFile is saved
    # (per the original comment, is_saved() raises DoesNotSaved otherwise)
    self.is_saved()
    if not self.can_create_pdf:
        return

    content = self.file.file.read()
    # rewind so the uploaded file can be read again by later consumers
    self.file.file.seek(0)

    pdf_created = False
    if 'image' in self.file_type:
        pdf_content = ocr_img2pdf(content)
        filename = set_pdffile_name(self, True)
        # Write the generated PDF, not the source image bytes: the stored
        # md5 below is computed from pdf_content and must match the file.
        with open(filename, 'wb') as pdf:
            pdf.write(pdf_content)
        self.ocred_pdf.name = filename
        self.ocred_pdf_md5 = md5(pdf_content)
        pdf_created = True
    elif 'pdf' in self.file_type:
        filename = set_pdffile_name(self, True)
        # ocr_pdf writes its result to 'filename' itself
        ocr_pdf(content, filename)
        self.ocred_pdf.name = filename
        self.ocred_pdf_md5 = md5(read_binary_file(filename))
        pdf_created = True

    if pdf_created:
        OCRedFile.Counters.num_created_pdf += 1
        if admin_obj and request:
            admin_obj.message_user(request, 'PDF created')

    # call the parent save directly, bypassing this model's own save()
    super(OCRedFile, self).save()
def test_pdf2text(self):
    """Check pdf2text(pdf_content: bytes) on a text-less PDF, a PDF with text and a non-PDF."""
    fixtures_dir: Path = Path(__file__).parent.parent

    # a PDF without a text layer decodes to an empty string
    notext_bytes: bytes = _u.read_binary_file(
        fixtures_dir / 'test_eng_notext.pdf')
    self.assertEqual(_u.pdf2text(notext_bytes), '')

    # a PDF with a text layer decodes to that text
    withtext_bytes: bytes = _u.read_binary_file(
        fixtures_dir / 'the_pdf_withtext.pdf')
    self.assertEqual(_u.pdf2text(withtext_bytes),
                     'The test if pdf with text')

    # a PNG is not a PDF: poppler must refuse to create a document
    png_bytes: bytes = _u.read_binary_file(fixtures_dir / 'test_eng.png')
    with self.assertRaisesMessage(pdftotext.Error,
                                  'poppler error creating document'):
        _u.pdf2text(png_bytes)
def test_ocr_img2str(self):
    """Check that ocr_img2str(stdin: bytes) recognises the text of the sample image."""
    tests_dir: Path = Path(os.path.dirname(os.path.dirname(__file__)))
    test_eng_png: Path = Path(os.path.join(tests_dir, 'test_eng.png'))
    test_eng_png_content: bytes = _u.read_binary_file(test_eng_png)
    test_eng_ocred_text: str = _u.ocr_img2str(test_eng_png_content)
    # assertTrue(value, msg) only checked that the OCR output was non-empty:
    # the expected text was being passed as the failure message. Compare the
    # content instead; assertIn tolerates surrounding whitespace/newlines
    # that the OCR output may carry (NOTE(review): tighten to assertEqual if
    # ocr_img2str is known to return the bare text).
    self.assertIn('A some english text to test Tesseract',
                  test_eng_ocred_text)
def test_ocr_img2pdf(self):
    """Check that ocr_img2pdf(stdin: bytes) returns a PDF whose text matches the sample image."""
    fixtures_dir: Path = Path(__file__).parent.parent
    png_bytes: bytes = _u.read_binary_file(fixtures_dir / 'test_eng.png')

    ocred_pdf: bytes = _u.ocr_img2pdf(png_bytes)
    self.assertIsNotNone(ocred_pdf)

    # the text layer of the produced PDF must be the text shown on the image
    self.assertEqual(_u.pdf2text(ocred_pdf),
                     'A some english text to test Tesseract')
def test_read_binary_file(self):
    """Check read_binary_file(path: str) on an empty file, a directory, a regular file and a missing path."""
    fixtures_dir: Path = Path(__file__).parent.parent

    # empty file -> empty bytes
    self.assertEqual(
        _u.read_binary_file(fixtures_dir / 'empty_file.txt'), b'')

    # a directory cannot be read (path passed as a plain string here)
    folder: str = os.path.join(fixtures_dir, 'some_dir')
    with self.assertRaisesMessage(
            IsADirectoryError, f"[Errno 21] Is a directory: '{folder}'"):
        _u.read_binary_file(folder)

    # regular file -> its content, as bytes
    payload: bytes = _u.read_binary_file(fixtures_dir / 'not_empty_file.txt')
    self.assertEqual(type(payload), bytes)
    self.assertEqual(payload, b'content\n')

    # missing file -> FileNotFoundError naming the path
    missing: Path = fixtures_dir / 'no_file.txt'
    with self.assertRaisesMessage(
            FileNotFoundError,
            f"[Errno 2] No such file or directory: '{missing}'"):
        _u.read_binary_file(missing)
def save(self, force_insert=False, force_update=False, using=None, update_fields=None):
    """
    Validate, OCR (when needed) and persist a new instance of the model.

    The method returns immediately when ``is_saved(raise_exception=False)``
    is truthy. Otherwise it fills in ``file_type`` and ``md5`` when they are
    missing, validates both, extracts ``text`` from the uploaded file when
    it is not set yet (running OCR for images and for PDFs that need it),
    optionally removes the uploaded file, updates the instance counter and
    finally delegates to the parent ``save``.

    :param force_insert: forwarded to the parent ``save``
    :param force_update: forwarded to the parent ``save``
    :param using: forwarded to the parent ``save``
    :param update_fields: forwarded to the parent ``save``
    :return: None
    """
    if self.is_saved(raise_exception=False):
        return
    if not self.file_type:
        self.file_type = self.file.file.content_type
    # called with raise_exception=True: presumably raises on an unsupported type
    OCRedFile.is_valid_file_type(file_type=self.file_type, raise_exception=True)
    # read content of the 'file' field
    content = self.file.file.read()
    # return the reading pointer of the 'file' file to start position
    self.file.file.seek(0)
    # calculate md5 of the 'file' field if it does not exist
    if not self.md5:
        self.md5 = md5(content)
    OCRedFile.is_valid_ocr_md5(md5_value=self.md5, raise_exception=True)
    # extract or OCR the content of the 'file' field if 'text' does not exist
    if not self.text:
        print(f'OCRedFile->save start OCR {self.md5}')
        ocr_started_datetime = timezone.now()
        if 'image' in self.file_type:
            pdf_content = ocr_img2pdf(content)
            self.text = pdf2text(pdf_content)
            if self.text:
                # create ocred_pdf only for an image that contains a text
                self.ocred_pdf_md5 = md5(pdf_content)
                if ocr_settings.OCR_STORE_PDF:
                    self.ocred_pdf.save(set_pdffile_name(self), BytesIO(pdf_content), False)
                else:
                    # remember the name only; the PDF itself is not stored
                    self.ocred_pdf.name = set_pdffile_name(self)
            self.ocred = timezone.now()
        elif 'pdf' in self.file_type:
            pdf_info: PdfInfo = get_pdf_info(content)
            self.pdf_num_pages = pdf_info.num_pages
            self.pdf_author = pdf_info.author
            # dates are copied only when present, leaving the field untouched otherwise
            if pdf_info.creation_date:
                self.pdf_creation_date = pdf_info.creation_date
            self.pdf_creator = pdf_info.creator
            if pdf_info.mod_date:
                self.pdf_mod_date = pdf_info.mod_date
            self.pdf_producer = pdf_info.producer
            self.pdf_title = pdf_info.title
            pdf_text = pdf2text(content)
            # check that loaded PDF file contains text
            if pdf_need_ocr(pdf_text):
                print(f'OCRedFile PDF OCR processing via OCRmyPDF {self.md5}')
                pdf_filename = set_pdffile_name(self)
                self.text = ocr_pdf(content, pdf_filename)
                # save datetime when uploaded PDF was ocred
                self.ocred = timezone.now()
                if self.text:
                    # create ocred_pdf only for a pdf file that contains images with text
                    self.ocred_pdf.name = pdf_filename
                    self.ocred_pdf_md5 = md5(read_binary_file(pdf_filename))
                # The file written by ocr_pdf(content, pdf_filename) is kept
                # only when text was recognised AND PDFs are stored; it is
                # removed after its md5 has been taken above.
                keep_pdf = bool(self.text) and ocr_settings.OCR_STORE_PDF
                if not keep_pdf and os.path.isfile(pdf_filename):
                    os.remove(pdf_filename)
            else:
                print(f'OCRedFile->save use text from loaded pdf {self.md5}')
                self.text = pdf_text
        ocr_finished_datetime = timezone.now()
        ocr_duration: timedelta = ocr_finished_datetime - ocr_started_datetime
        # Microseconds are zero-padded to six digits so that e.g. 1 s 5000 us
        # prints as '1.005000' rather than the misleading '1.5000'.
        # NOTE(review): the value is in seconds although the label says 'ms'.
        print(
            f"OCRedFile->save finished OCR '{ocr_duration.seconds}.{ocr_duration.microseconds:06d}' ms {self.md5}"
        )
    if not ocr_settings.OCR_STORE_FILES:
        os.remove(self.file.path)
    # update counters
    OCRedFile.Counters.num_created_instances += 1
    # checking database connection; reconnection is best-effort
    if not connection.is_usable():
        try:
            connection.connect()
        except Exception:
            print(f"database reconnection exception {self.md5}")
    # parent method: forward the caller's arguments instead of discarding them
    super(OCRedFile, self).save(force_insert=force_insert,
                                force_update=force_update,
                                using=using,
                                update_fields=update_fields)