def test_document_url_with_another_version(self):
    """An explicit ``version`` argument selects that version's URL."""
    path = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")
    # (requested version, URL expected for it)
    expectations = (
        (3, "docs/user_1/document_15/v3/x.pdf"),
        (2, "docs/user_1/document_15/v2/x.pdf"),
    )
    for version, expected_url in expectations:
        self.assertEqual(path.url(version=version), expected_url)
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """Run OCR on one page of a stored document, dispatching on MIME type.

    The document is located through a ``DocumentPath`` built from
    ``user_id``, ``document_id`` and ``file_name``. PDF files go to
    ``ocr_page_pdf``, images go to ``ocr_page_image``, and TIFF files are
    first converted with ``convert_tiff2pdf`` and then processed as PDF.

    Args:
        user_id: Owner of the document; used to build the document path.
        document_id: Identifier of the document; used to build the path.
        file_name: Name of the document file.
        page_num: Page to process; passed through to the OCR helpers.
        lang: OCR language; lower-cased before being passed on.

    Returns:
        Always ``True``, including when the file type is not recognised
        (that case is only reported through ``logger.error``).
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()
    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # page_type is only used for the timing log line at the end; it is set
    # in every handled branch so that line never reports an empty type.
    page_type = ''
    if mime_type.is_pdf():
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'pdf'
    elif mime_type.is_image():  # jpeg or png
        ocr_page_image(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'image'
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # The conversion yields the name of a new .pdf file.
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # Point the path at the converted file and continue as for a PDF.
        doc_path.file_name = new_filename
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'tiff'
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")
    return True
def test_document_url_none_vs_0(self):
    """``version=None`` yields the latest version, ``version=0`` the original."""
    path = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")
    # Bump the version three times; the current version is then 3.
    for _ in range(3):
        path.inc_version()

    # With version == None the latest version of the document is
    # returned, which is 3.
    latest_url = path.url(version=None)
    self.assertEqual(latest_url, "docs/user_1/document_15/v3/x.pdf")

    # With version == 0 the original document is returned, i.e. the
    # path carries no version component.
    original_url = path.url(version=0)
    self.assertEqual(original_url, "docs/user_1/document_15/x.pdf")
def test_inc_version(self):
    """
    Document endpoints are versioned, starting at version 0.

    For version 0 the "old" endpoint path applies, i.e. the version
    is not part of the path. Each modification of the document
    (deleting a blank page, for example) increments the version,
    and any version > 0 is included in the path.
    """
    path = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
    # URL expected after the first and the second increment.
    urls_after_each_increment = (
        "docs/user_1/document_3/v1/x.pdf",
        "docs/user_1/document_3/v2/x.pdf",
    )
    for expected_url in urls_after_each_increment:
        path.inc_version()
        self.assertEqual(path.url(), expected_url)
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """Run OCR on one page of a stored document; only PDF is handled.

    Args:
        user_id: Owner of the document; used to build the document path.
        document_id: Identifier of the document; used to build the path.
        file_name: Name of the document file.
        page_num: Page to process; passed through to ``ocr_page_pdf``.
        lang: OCR language; lower-cased before being passed on.

    Returns:
        Always ``True``; a non-PDF file is only reported through
        ``logger.error``.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()

    path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    absolute_path = default_storage.abspath(path.url())
    mime_type = mime.Mime(absolute_path)

    # Guard clause: anything that is not a PDF is logged and skipped.
    if not mime_type.is_pdf():
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    ocr_page_pdf(doc_path=path, page_num=page_num, lang=lang)
    page_type = 'pdf'

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")
    return True
def test_document_url(self):
    """A newly built document path has no version segment in its URL."""
    path = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
    actual_url = path.url()
    self.assertEqual(actual_url, "docs/user_1/document_3/x.pdf")
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
    namespace=None,
):
    """Run OCR on one page of a stored document, dispatching on MIME type.

    The document is located through a ``DocumentPath`` built from
    ``user_id``, ``document_id`` and ``file_name``. If it is not present
    in ``default_storage`` it is downloaded first. PDF files go to
    ``ocr_page_pdf``, images go to ``ocr_page_image``, and TIFF files are
    first converted with ``convert_tiff2pdf`` and then processed as PDF.

    Args:
        user_id: Owner of the document; used to build the document path
            and forwarded to the OCR helpers.
        document_id: Identifier of the document; used to build the path
            and forwarded to the OCR helpers.
        file_name: Name of the document file.
        page_num: Page to process; passed through to the OCR helpers.
        lang: OCR language; lower-cased before being passed on.
        namespace: Forwarded to ``default_storage.download`` and to the
            OCR helpers. Defaults to ``None``.

    Returns:
        Always ``True``, including when the file type is not recognised
        (that case is only reported through ``logger.error``).
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()
    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    if not default_storage.exists(doc_path.url()):
        # In case of distributed deployment, a document uploaded by the
        # webapp is not directly available to the worker (which runs on
        # a separate computer). Thus, if the document is not locally
        # available, the worker downloads it from the remote location.
        default_storage.download(doc_path_url=doc_path.url(),
                                 namespace=namespace)

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # page_type is only used for the timing log line at the end; it is set
    # in every handled branch so that line never reports an empty type.
    page_type = ''
    if mime_type.is_pdf():
        ocr_page_pdf(doc_path=doc_path,
                     page_num=page_num,
                     lang=lang,
                     user_id=user_id,
                     document_id=document_id,
                     namespace=namespace)
        page_type = 'pdf'
    elif mime_type.is_image():  # jpeg or png
        ocr_page_image(doc_path=doc_path,
                       page_num=page_num,
                       lang=lang,
                       user_id=user_id,
                       document_id=document_id,
                       namespace=namespace)
        page_type = 'image'
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # The conversion yields the name of a new .pdf file.
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # Remember the tiff name before the path is pointed at the pdf.
        orig_file_name = doc_path.file_name
        doc_path.file_name = new_filename
        # and continue as usual
        ocr_page_pdf(
            doc_path=doc_path,
            page_num=page_num,
            lang=lang,
            user_id=user_id,
            document_id=document_id,
            # Pass original file_name i.e. tiff file name as well.
            file_name=orig_file_name,
            namespace=namespace)
        page_type = 'tiff'
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")
    return True