def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    OCR a single page of a document, choosing the routine by mime type.

    PDF files are handled by ``ocr_page_pdf`` and images by
    ``ocr_page_image``. A TIFF is first converted with
    ``convert_tiff2pdf``; ``doc_path`` is then pointed at the returned
    file name and processed like a PDF. Any other type is logged as an
    error.

    Always returns True, also when the file type was not recognised.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started_at = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # Arguments shared by every OCR routine. ``doc_path`` is held by
    # reference, so the file name change made in the TIFF branch is
    # visible through this dict as well.
    ocr_args = {'doc_path': doc_path, 'page_num': page_num, 'lang': lang}

    # Only the PDF branch records a page type; the others log it as ''.
    page_type = ''
    if mime_type.is_pdf():
        ocr_page_pdf(**ocr_args)
        page_type = 'pdf'
    elif mime_type.is_image():
        ocr_page_image(**ocr_args)
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # the converter hands back the file name of the new .pdf
        doc_path.file_name = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url())
        )
        # from here on the page is processed as a regular PDF
        ocr_page_pdf(**ocr_args)
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    elapsed = time.time() - started_at
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={elapsed:.2f}")
    return True
def document_download(request, id):
    """
    Send a document's file to its owner as an attachment.

    Raises ``Http404`` when no document with the given ``id`` exists.
    When the requesting user is not the document's owner (usernames are
    compared), or when the file cannot be opened, the user is redirected
    to ``browse``.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    # Only the owner may download; everybody else goes back to browse.
    if doc.user.username != request.user.username:
        return redirect('browse')

    try:
        file_handle = open(default_storage.abspath(doc.path.url()), "rb")
    except OSError:
        logger.error("Cannot open local version of %s" % doc.path.url())
        return redirect('browse')

    # The context manager closes the handle even if reading fails.
    with file_handle:
        content = file_handle.read()

    resp = HttpResponse(content, content_type="application/pdf")
    # NOTE(review): the title is placed in the header unquoted; titles
    # with spaces or special characters may need quoting/escaping.
    resp['Content-Disposition'] = "attachment; filename=%s" % doc.title
    return resp
def preview(request, id, step=None, page="1"):
    """
    Serve the preview image of one document page as ``image/jpeg``.

    Raises ``Http404`` when the document does not exist. Users without
    read permission on the document are redirected to ``core:index``.
    If the image file is not on disk yet, it is generated with
    ``extract_img`` before being read. Errors raised while opening or
    reading the image propagate to the caller.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    page_path = doc.get_page_path(page_num=page, step=Step(step))
    img_abs_path = default_storage.abspath(page_path.img_url())

    image_missing = not os.path.exists(img_abs_path)
    if image_missing:
        logger.debug(
            f"Preview image {img_abs_path} does not exists. Generating..."
        )
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    with open(img_abs_path, "rb") as image_file:
        image_bytes = image_file.read()
    return HttpResponse(image_bytes, content_type="image/jpeg")
def document_download(request, id):
    """
    Any user with read permission on the document
    must be able to download the document.

    Raises ``Http404`` when no document with the given ``id`` exists.
    Returns ``HttpResponseForbidden`` when the user lacks read
    permission, and redirects to ``admin:browse`` when the file cannot
    be opened.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    try:
        file_handle = open(default_storage.abspath(doc.path.url()), "rb")
    except OSError:
        logger.error("Cannot open local version of %s" % doc.path.url())
        return redirect('admin:browse')

    # The context manager closes the handle even if reading fails.
    with file_handle:
        content = file_handle.read()

    resp = HttpResponse(content, content_type="application/pdf")
    # NOTE(review): the title is placed in the header unquoted; titles
    # with spaces or special characters may need quoting/escaping.
    resp['Content-Disposition'] = "attachment; filename=%s" % doc.title
    return resp
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Generate preview images, text and hOCR data for one page of a PDF.

    doc_path is an mglib.path.DocumentPath instance

    Returns the ``PagePath`` used for the page, or None when
    ``page_num`` is greater than the document's page count.
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    if page_num > page_count:
        # Nothing to process. Returning here keeps the final ``return``
        # from being reached with ``page_url`` never assigned.
        return None

    # first quickly generate preview images
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # then start over from a fresh PagePath at Step(1) for text and hOCR
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def test_preview(self):
    """
    POSTing to the preview URL answers 200 and leaves the page image
    on disk.
    """
    source_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")

    doc = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )
    default_storage.copy_doc(src=source_pdf, dst=doc.path.url())

    preview_url = reverse('core:preview', args=(doc.id, 1, 1))
    ret = self.client.post(preview_url)
    self.assertEqual(ret.status_code, 200)

    # the image of page 1 / step 1 must exist after the request
    page_path = PagePath(
        document_path=doc.path,
        page_num=1,
        step=Step(1),
        page_count=3
    )
    img_abs_path = default_storage.abspath(page_path.img_url())
    self.assertTrue(os.path.exists(img_abs_path))
def node_download(request, id):
    """
    Any user with read permission on the node must be
    able to download it.

    Node is either a document or a folder. A document is sent as its
    file (the optional ``version`` GET parameter is passed on to the
    path's ``url``). A folder is sent as a tar archive built from its
    direct children.

    Raises ``Http404`` when the node does not exist. Returns
    ``HttpResponseForbidden`` when the user lacks read permission, and
    redirects to ``admin:browse`` when a document's file cannot be
    opened.
    """
    version = request.GET.get('version', None)

    try:
        node = BaseTreeNode.objects.get(id=id)
    except BaseTreeNode.DoesNotExist:
        raise Http404("Node does not exists")

    if not request.user.has_perm(Access.PERM_READ, node):
        return HttpResponseForbidden()

    if node.is_document():
        # ``path`` is a method on the node. Resolve the URL once so that
        # the error message below reports the file that was actually
        # requested (the handler used to access ``node.path.url``, which
        # does not exist on a bound method).
        doc_url = node.path().url(version=version)
        try:
            file_handle = open(default_storage.abspath(doc_url), "rb")
        except OSError:
            logger.error("Cannot open local version of %s" % doc_url)
            return redirect('admin:browse')

        # The context manager closes the handle even if reading fails.
        with file_handle:
            content = file_handle.read()

        resp = HttpResponse(content, content_type="application/pdf")
        resp['Content-Disposition'] = (
            "attachment; filename=%s" % node.title
        )
        return resp

    # node is a folder
    with NamedTemporaryFile(prefix="download_") as fileobj:
        # collect into an archive all direct children of
        # the selected folder
        node_ids = [_node.id for _node in node.get_children()]
        build_tar_archive(fileobj=fileobj, node_ids=node_ids)
        # reset fileobj to initial position
        fileobj.seek(0)
        data = fileobj.read()

    resp = HttpResponse(data, content_type="application/x-tar")
    resp['Content-Disposition'] = f"attachment; filename={node.title}.tar"
    return resp
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    Generate preview images, text and hOCR data for one page of a PDF,
    sending a notification after each stage.

    doc_path is an mglib.path.DocumentPath instance

    ``file_name`` may be given in ``kwargs``; when missing or empty,
    ``doc_path.file_name`` is used instead. The remaining ``kwargs``
    are forwarded unchanged to ``notify_pre_page_ocr``,
    ``notify_txt_ready`` and ``notify_hocr_ready``.

    On success returns ``mglib.path.PagePath`` instance. Returns None
    when ``page_num`` is greater than the document's page count.
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)
    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    if page_num > page_count:
        # Nothing to process. Returning here keeps the final ``return``
        # from being reached with ``page_path`` never assigned.
        return None

    # first quickly generate preview images
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    # NOTE(review): this notification passes ``doc_path.file_name``
    # while the two below pass ``file_name`` - confirm that is intended.
    notify_pre_page_ocr(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=doc_path.file_name,
        **kwargs
    )

    # then start over from a fresh PagePath at Step(1) for text and hOCR
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    for step in Steps():
        page_path.step = step
        if not step.is_thumbnail:
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs
            )

    return page_path
def notify_txt_ready(page_path, **kwargs):
    """
    Notifies interested parties that .txt file is available.

    Notification is done via django signals; the content of the .txt
    file is sent along.

    Input arguments:

        ``page_path``: mglib.PagePath instance of current page

    Keys read from ``kwargs`` (missing keys default to None unless
    stated otherwise):

        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num`` (defaults to 1)
        * ``namespace``

    Always returns None.

    Sent signals: ``post_page_txt``, with the arguments:

        * ``sender`` = ``signals.WORKER``
        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num``
        * ``namespace`` = may be empty. Used to distinguish among
            different tenants in multi-tenant deployments.
        * ``text`` = content read from the page's .txt file

    When ``page_path`` is empty or the .txt file is not on disk, only a
    warning is logged and no signal is sent.
    """
    if not page_path:
        logger.warning("OCR method returned empty page path. "
                       "Page indexing was skipped.")
        return

    abs_path_txt = default_storage.abspath(page_path.txt_url())
    if not os.path.exists(abs_path_txt):
        logger.warning(f"Page txt path {abs_path_txt} does not exist. "
                       f"Page indexing was skipped.")
        return

    with open(abs_path_txt) as txt_file:
        page_text = txt_file.read()

    signals.post_page_txt.send(
        sender=signals.WORKER,
        user_id=kwargs.get('user_id'),
        document_id=kwargs.get('document_id'),
        file_name=kwargs.get('file_name'),
        page_num=kwargs.get('page_num', 1),
        namespace=kwargs.get('namespace'),
        text=page_text,
    )
def hocr(request, id, step=None, page="1"):
    """
    Return the hOCR words and metadata of one document page as JSON.

    Raises ``Http404`` when the document does not exist, when the
    document file or the page's hOCR file is not on disk, or when
    ``page`` is not a number within the accepted range. Returns
    ``HttpResponseForbidden`` when the user lacks read permission.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    # document absolute path
    doc_abs_path = default_storage.abspath(doc.path.url())
    if not os.path.exists(doc_abs_path):
        raise Http404("HOCR data not yet ready.")

    # ``page`` defaults to the string "1"; comparing a str with the
    # integer page count would raise TypeError, so normalise it first.
    try:
        page = int(page)
    except (TypeError, ValueError):
        raise Http404("Page does not exists")

    page_count = get_pagecount(doc_abs_path)
    # NOTE(review): page 0 passes this check; confirm whether
    # ``doc.page_paths`` is 1-based and the bound should be ``page < 1``.
    if page > page_count or page < 0:
        raise Http404("Page does not exists")

    page_path = doc.page_paths[page]
    hocr_abs_path = default_storage.abspath(page_path.hocr_url())

    logger.debug(f"Extract words from {hocr_abs_path}")

    if not os.path.exists(hocr_abs_path):
        raise Http404("HOCR data not yet ready.")

    # At this point local HOCR data should be available.
    # (named ``hocr_data`` so it does not shadow this view function)
    hocr_data = Hocr(hocr_file_path=hocr_abs_path)

    return HttpResponse(
        json.dumps({
            'hocr': hocr_data.good_json_words(),
            'hocr_meta': hocr_data.get_meta()
        }),
        content_type="application/json",
    )
def recreate_pages(self):
    """
    Recreate page models.

    Deletes all existing pages, refreshes ``page_count`` from the
    document file, saves the document and calls ``create_pages``.
    """
    self.pages.all().delete()

    # count the pages of the file currently in storage
    document_abs_path = default_storage.abspath(self.path.url())
    self.page_count = get_pagecount(document_abs_path)
    self.save()

    self.create_pages()
def test_download_hocr(self):
    """
    GET on the hocr URL answers 200 once the document and the page's
    .hocr file are in storage.
    """
    doc = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )

    default_storage.copy_doc(
        src=os.path.join(BASE_DIR, "data", "berlin.pdf"),
        dst=default_storage.abspath(doc.path.url())
    )
    # build page url
    page_path = doc.page_paths[1]
    hocr_abs_path = default_storage.abspath(page_path.hocr_url())

    # The file copied here must be deleted at the end of the test.
    default_storage.copy_doc(
        src=os.path.join(BASE_DIR, "data", "page-1.hocr"),
        dst=hocr_abs_path
    )
    try:
        ret = self.client.get(
            reverse('core:hocr', args=(doc.id, 1, 1))
        )
        self.assertEqual(ret.status_code, 200)
    finally:
        # Remove the copied .hocr file even when the assertion fails,
        # so a failing run does not leave it behind.
        os.remove(hocr_abs_path)
def update_text_field(self):
    """Update text field from associated .txt file.

    Reads the .txt file, stores its content in ``self.text`` and saves
    the model.

    Returns the text that was read when the .txt file was found.
    If the file was not found, nothing is saved and an empty string is
    returned.
    """
    url = default_storage.abspath(self.txt_url)

    if not os.path.exists(url):
        logger.debug(f"Missing page txt {url}.")
        # Return an empty string, as documented, rather than None.
        return ''

    with open(url) as file_handle:
        self.text = file_handle.read()

    self.save()
    logger.debug(f"text saved. len(page.text)=={len(self.text)}")

    return self.text
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Generate preview images, text and hOCR data for one page of a PDF.

    doc_path is an mglib.path.DocumentPath instance

    Returns the ``PagePath`` used for the page, or None when
    ``page_num`` is greater than the document's page count.
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    if page_num > page_count:
        # Nothing to process. Returning here keeps the final ``return``
        # from being reached with ``page_url`` never assigned.
        return None

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    OCR a single page of a document; only PDF files are supported.

    A PDF is handed to ``ocr_page_pdf``. For any other mime type an
    error is logged and nothing else is done.

    Always returns True, also when the file type was not recognised.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started_at = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))

    if not mime_type.is_pdf():
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
    page_type = 'pdf'

    elapsed = time.time() - started_at
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={elapsed:.2f}")
    return True
def absfilepath(self):
    """
    Return what ``default_storage.abspath`` yields for the URL of
    this object's path.
    """
    path_url = self.path.url()
    return default_storage.abspath(path_url)
def paste_pages(
    user,
    parent_id,
    doc_pages,
    dst_document=None,
    after=False,
    before=False,
):
    """
    Move pages from one or more source documents into a destination
    document.

    ``doc_pages`` is a dictionary of format::

        {
            doc_id_1: [page_num_1a, page_num_2a, ...],
            doc_id_2: [page_num_1b, page_num_2b, ...],
            doc_id_3: [page_num_1c, page_num_2c, ...]
        }

    Steps:

    1. When ``dst_document`` is not given, a new document "pasted.pdf"
       is created under ``parent_id`` with as many pages as are being
       pasted.
    2. ``default_storage.paste_pages`` copies the page data into the
       destination and returns the new document version.
    3. The destination's version is stored and its page models are
       recreated.
    4. The pasted pages are deleted from their source documents.

    Returns the destination document. Returns None when there are no
    pages to paste or when one of the source documents is not found.
    Raises ``Exception`` when the storage does not report a new version.
    """
    new_page_count = sum(len(page_nums) for page_nums in doc_pages.values())
    if new_page_count == 0:
        logger.warning("No pages to paste. Exiting.")
        return

    # Create the destination document unless the caller supplied one.
    dst_doc_is_new = not dst_document
    if dst_doc_is_new:
        dst_document = Document.objects.create_document(
            user=user,
            parent_id=parent_id,
            lang=user.preferences['ocr__OCR_Language'],
            title="pasted.pdf",
            size=0,  # updated later, after pdftk will create new doc
            file_name="pasted.pdf",
            page_count=new_page_count
        )

    # Collect, per source document, what has to be copied and what has
    # to be deleted afterwards.
    doc_list = []
    data_list = []
    for doc_id, page_nums in doc_pages.items():
        try:
            doc = Document.objects.get(id=doc_id)
        except Document.DoesNotExist:
            logger.warning(f"While pasting, doc_id={doc_id} was not found")
            return

        doc_list.append({'doc': doc, 'page_nums': page_nums})
        data_list.append({
            # NOTE(review): the path object itself (not its url()) is
            # handed to abspath here - confirm the storage accepts that.
            'src': default_storage.abspath(doc.path),
            'doc_path': doc.path,
            'page_nums': page_nums
        })

    # returns new document version
    new_version = default_storage.paste_pages(
        dest_doc_path=dst_document.path,
        data_list=data_list,
        dest_doc_is_new=dst_doc_is_new,
        after_page_number=after,
        before_page_number=before
    )
    if new_version == dst_document.version:
        raise Exception("Expecting version to be incremented")

    dst_document.version = new_version
    dst_document.save()
    # update pages model
    dst_document.recreate_pages()

    # delete from the source documents the pages that were cut and
    # pasted into the destination
    for entry in doc_list:
        entry['doc'].delete_pages(page_numbers=entry['page_nums'])

    return dst_document
def notify_hocr_ready(page_path, **kwargs):
    """
    Notifies interested parties that .hocr file is available.

    Notification is done via django signals; the content of the .hocr
    file is sent along.

    Input arguments:

        ``page_path``: mglib.PagePath instance of current page

    Keys read from ``kwargs`` (missing keys default to None unless
    stated otherwise):

        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num`` (defaults to 1)
        * ``namespace``
        * ``step`` (defaults to 1)

    Always returns None.

    Sent signals: ``post_page_hocr``, with the arguments:

        * ``sender`` = ``signals.WORKER``
        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num``
        * ``step`` = integer number corresponding to step;
            learn more about steps in ``mglib.step.Step``
        * ``namespace`` = may be empty. Used to distinguish among
            different tenants in multi-tenant deployments.
        * ``hocr`` = content read from the page's .hocr file

    When ``page_path`` is empty or the .hocr file is not on disk, only
    a warning is logged and no signal is sent.
    """
    step = kwargs.get('step', 1)

    if not page_path:
        logger.warning(f"hOCR/step={step} method returned empty page path.")
        return

    abs_path_hocr = default_storage.abspath(page_path.hocr_url())
    if not os.path.exists(abs_path_hocr):
        logger.warning(
            f"Page hocr/step={step} path {abs_path_hocr} does not exist."
        )
        return

    with open(abs_path_hocr) as hocr_file:
        hocr_content = hocr_file.read()

    signals.post_page_hocr.send(
        sender=signals.WORKER,
        user_id=kwargs.get('user_id'),
        document_id=kwargs.get('document_id'),
        file_name=kwargs.get('file_name'),
        page_num=kwargs.get('page_num', 1),
        step=step,
        namespace=kwargs.get('namespace'),
        hocr=hocr_content,
    )
def test_documents_retains_per_page_metadata_after_page_delete(self):
    """
    DocM is a document with 3 pages. DocM has two metadata fields
    associated, X and Y, with the values X=10 and Y=20.

    The second page of DocM is deleted.
    Expected: the values of metadata fields X and Y are preserved:
    DocM.X is still 10 and DocM.Y is still 20.

    Important!

    In document browser and document viewer, if the user does not
    explicitly select a page, by default the metadata associated with
    the first page of the respective document is returned.
    """
    document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
    docm = Document.objects.create_document(
        user=self.user,
        title='berlin.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        file_name='berlin.pdf',
        parent_id=None,
        page_count=3
    )

    default_storage.copy_doc(
        src=document_path,
        dst=docm.path.url(),
    )

    for number in range(1, 4):
        page = docm.pages.get(number=number)
        # filesystem absolute path /home/eugen/x/y/
        fs_abs_path = default_storage.abspath(page.path.url())
        # filesystem absolute dir
        fs_abs_dir = os.path.dirname(fs_abs_path)
        Path(fs_abs_dir).mkdir(parents=True, exist_ok=True)
        # Create an empty file; the context manager closes the handle
        # right away instead of leaving it open for the rest of the run.
        with open(fs_abs_path, "w+"):
            pass

    # indeed, docm has 3 pages
    self.assertEqual(docm.pages.count(), 3)
    docm.kv.update([
        {
            'key': 'X',
            'kv_type': TEXT,
        },
        {
            'key': 'Y',
            'kv_type': TEXT,
        },
    ])

    # In document browser and document viewer, if the user does not
    # explicitly select a page, by default the metadata associated with
    # the first page of the respective document is returned.
    page = docm.pages.get(number=1)
    page.kv['X'] = 10
    page.kv['Y'] = 20

    page.refresh_from_db()

    self.assertEqual(page.kv['X'], '10')
    self.assertEqual(page.kv['Y'], '20')

    # Even if the user deletes the second page, all data (incl.
    # metadata) associated with the remaining pages (first and last)
    # MUST be preserved!
    docm.delete_pages([2])

    page = docm.pages.get(number=1)

    self.assertEqual(page.kv['X'], '10')
    self.assertEqual(page.kv['Y'], '20')
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
    namespace=None,
):
    """
    OCR a single page of a document, choosing the routine by mime type.

    When the document is not present in the local storage it is first
    downloaded with ``default_storage.download``.

    PDF files are handled by ``ocr_page_pdf`` and images by
    ``ocr_page_image``. A TIFF is first converted with
    ``convert_tiff2pdf``; ``doc_path`` is then pointed at the returned
    file name and processed like a PDF, with the original TIFF file
    name passed along as ``file_name``. Any other type is logged as an
    error.

    Always returns True, also when the file type was not recognised.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started_at = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    document_is_local = default_storage.exists(doc_path.url())
    if not document_is_local:
        # In a distributed deployment the document uploaded by the
        # webapp is not directly available to the worker (which runs on
        # a separate computer). If it is not available locally, the
        # worker downloads it from whatever the remote location is.
        default_storage.download(
            doc_path_url=doc_path.url(),
            namespace=namespace
        )

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # Arguments shared by every OCR routine. ``doc_path`` is held by
    # reference, so the file name change made in the TIFF branch is
    # visible through this dict as well.
    ocr_args = {
        'doc_path': doc_path,
        'page_num': page_num,
        'lang': lang,
        'user_id': user_id,
        'document_id': document_id,
        'namespace': namespace,
    }

    # Only the PDF branch records a page type; the others log it as ''.
    page_type = ''
    if mime_type.is_pdf():
        ocr_page_pdf(**ocr_args)
        page_type = 'pdf'
    elif mime_type.is_image():
        ocr_page_image(**ocr_args)
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # the converter hands back the file name of the new .pdf
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url())
        )
        orig_file_name = doc_path.file_name
        doc_path.file_name = new_filename
        # Continue as for a PDF, passing the original (tiff) file name
        # along as well.
        ocr_page_pdf(file_name=orig_file_name, **ocr_args)
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    elapsed = time.time() - started_at
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={elapsed:.2f}")
    return True