def preview(request, id, step=None, page="1"):
    """
    Return the preview image of a single document page.

    The document is looked up by ``id``; a missing document results in
    ``Http404``. A user without ``Access.PERM_READ`` on the document is
    redirected to ``core:index``.

    For permitted users the page image for ``page``/``step`` is generated
    via ``extract_img`` when its file is not found on disk, then returned
    as ``image/jpeg``. If the image file cannot be opened or read, a
    generic static PNG is returned instead (a thumbnail variant when
    ``Step(step).is_thumbnail`` is true).
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    # Guard clause: no read permission -> back to the index page.
    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    page_path = doc.get_page_path(page_num=page, step=Step(step))
    img_abs_path = default_storage.abspath(page_path.img_url())

    image_missing = not os.path.exists(img_abs_path)
    if image_missing:
        logger.debug(
            f"Preview image {img_abs_path} does not exists. Generating..."
        )
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    try:
        with open(img_abs_path, "rb") as image_file:
            return HttpResponse(
                image_file.read(),
                content_type="image/jpeg"
            )
    except IOError:
        # The image could not be read; fall back to a static placeholder.
        if Step(step).is_thumbnail:
            placeholder = "admin/img/document_thumbnail.png"
        else:
            placeholder = "admin/img/document.png"

        placeholder_path = finders.find(placeholder)
        with open(placeholder_path, "rb") as placeholder_file:
            return HttpResponse(
                placeholder_file.read(),
                content_type="image/png"
            )
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Generate preview images, text and hOCR for one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance

    Returns the ``PagePath`` used for the text/hOCR extraction, or ``None``
    when ``page_num`` is greater than the document's page count (in which
    case nothing is extracted).
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Without this guard the function fell through to
        # ``return page_url`` with ``page_url`` never assigned, which
        # raised UnboundLocalError.
        logger.warning(
            "page_num=%s exceeds page_count=%s; nothing to OCR",
            page_num,
            page_count
        )
        return None

    # first quickly generate preview images
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # Start again from a fresh PagePath at Step(1) for text extraction,
    # exactly as the original second pass did.
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        # hOCR is not produced for thumbnail steps
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def ocr_page_image(doc_path, page_num, lang):
    """
    Run the OCR pipeline for an image document (jpg, jpeg, png).

    Builds a single-page ``PagePath`` for ``doc_path``, resizes the image,
    extracts text, resizes once per step in ``Steps()`` and finally runs
    hOCR extraction once per non-thumbnail step.

    Returns the ``PagePath`` instance that was used.
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    # jpeg, jpg, png are 1 page documents
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=1
    )

    # resize and eventually convert (png -> jpg)
    resize_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # First quickly generate preview images
    for preview_step in Steps():
        page_url.step = preview_step
        resize_img(page_url, media_root=settings.MEDIA_ROOT)

    # reset page's step
    page_url.step = Step(1)

    # Now OCR each image
    # NOTE(review): page_url.step is not updated inside this loop, so every
    # call below receives the same Step(1) path — confirm this is intended.
    for ocr_step in Steps():
        if ocr_step.is_thumbnail:
            continue
        extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    Generate preview images, text and hOCR for one page of a PDF document,
    sending notifications along the way.

    doc_path is an mglib.path.DocumentPath instance

    ``kwargs`` may carry ``file_name``; when absent or empty,
    ``doc_path.file_name`` is used for the text/hOCR notifications. All
    remaining keyword arguments are forwarded unchanged to the ``notify_*``
    calls.

    On success returns ``mglib.path.PagePath`` instance. Returns ``None``
    when ``page_num`` is greater than the document's page count (nothing is
    extracted and no notification is sent in that case).
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)
    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Without this guard ``page_path`` was never assigned and the
        # function crashed with UnboundLocalError instead of reporting
        # the failure.
        logger.warning(
            "page_num=%s exceeds page_count=%s; nothing to OCR",
            page_num,
            page_count
        )
        return None

    # first quickly generate preview images
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    # NOTE(review): this notification reports doc_path.file_name rather
    # than the ``file_name`` override used below — confirm this is intended.
    notify_pre_page_ocr(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=doc_path.file_name,
        **kwargs
    )

    # Start again from a fresh PagePath at Step(1) for text extraction.
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    for step in Steps():
        page_path.step = step
        # hOCR is not produced for thumbnail steps
        if not step.is_thumbnail:
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs
            )

    return page_path
def ocr_page_image(doc_path, page_num, lang, **kwargs):
    """
    Run the OCR pipeline for an image document (jpg, jpeg, png), sending
    notifications along the way.

    ``kwargs`` may carry ``file_name``; when absent or empty,
    ``doc_path.file_name`` is reported in the notifications. All remaining
    keyword arguments are forwarded unchanged to the ``notify_*`` calls.

    On success returns ``mglib.path.PagePath`` instance.
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    # Pop ``file_name`` so it is not passed twice (explicitly and through
    # **kwargs), which would raise TypeError. This mirrors what the PDF
    # variant of this function does with the same keyword.
    file_name = kwargs.pop('file_name', None)
    if not file_name:
        file_name = doc_path.file_name

    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1
    )
    notify_pre_page_ocr(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    # resize and eventually convert (png -> jpg)
    resize_img(page_path, media_root=settings.MEDIA_ROOT)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    # First quickly generate preview images
    for step in Steps():
        page_path.step = step
        resize_img(page_path, media_root=settings.MEDIA_ROOT)

    # reset page's step
    page_path.step = Step(1)

    # Now OCR each image
    # NOTE(review): page_path.step is not updated inside this loop, so
    # extract_hocr always receives the Step(1) path while the notification
    # reports step.current — confirm this is intended.
    for step in Steps():
        if not step.is_thumbnail:
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs
            )

    return page_path
def test_preview(self):
    """
    Requesting the preview of page 1 / step 1 answers with HTTP 200 and
    leaves the corresponding page image on disk.
    """
    doc = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )

    # Put the sample PDF where the document expects its file to be.
    sample_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")
    default_storage.copy_doc(src=sample_pdf, dst=doc.path.url())

    preview_url = reverse('core:preview', args=(doc.id, 1, 1))
    ret = self.client.post(preview_url)
    self.assertEqual(ret.status_code, 200)

    page_path = PagePath(
        document_path=doc.path,
        page_num=1,
        step=Step(1),
        page_count=3
    )
    img_abs_path = default_storage.abspath(page_path.img_url())
    self.assertTrue(os.path.exists(img_abs_path))
def test_ppmroot(self):
    """
    ``PagePath.ppmroot`` for user 1, document 3, page 1 at ``Step(1)``
    equals the expected results path.
    """
    expected = "results/user_1/document_3/pages/page_1/100/page"

    doc_ep = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )
    page_url = PagePath(
        document_ep=doc_ep,
        page_num=1,
        step=Step(1),
        page_count=3
    )

    self.assertEqual(page_url.ppmroot, expected)
def test_txt_url(self):
    """
    Called without arguments, ``PagePath.url()`` yields the same value
    as ``PagePath.txt_url()``.
    """
    doc_ep = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )
    page_ep = PagePath(
        document_ep=doc_ep,
        page_num=1,
        step=Step(1),
        page_count=3
    )

    default_url = page_ep.url()
    txt_url = page_ep.txt_url()

    self.assertEqual(default_url, txt_url)
def apply_metadata_plugins(document_id, page_num):
    """
    Apply the metadata plugins to the hOCR file of one document page.

    Logs an error and returns ``None`` when no document with
    ``document_id`` exists; otherwise returns whatever
    ``MetadataPlugins().apply`` returns for the page's absolute hOCR path.
    """
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return None

    page_path = document.get_page_path(page_num=page_num, step=Step())
    hocr_url = page_path.hocr_url()
    hocr_path = default_storage.abspath(hocr_url)

    return MetadataPlugins().apply(hocr_path)
def apply_automates(document_id, page_num):
    """
    Run the document owner's automates against the hOCR of one page.

    Logs an error and returns when the document does not exist. Otherwise
    the page's hOCR file is read, and every ``Automate`` belonging to the
    document's user whose ``is_a_match(hocr)`` is true is applied with an
    instance of the plugin class resolved from ``automate.plugin_name``.
    Returns ``None`` in all cases.
    """
    logger.debug("apply_automates: Begin.")

    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return None

    page_path = document.get_page_path(page_num=page_num, step=Step())
    user = document.user
    hocr_path = default_storage.abspath(page_path.hocr_url())

    with open(hocr_path, "r") as hocr_file:
        hocr = hocr_file.read()

    automates = Automate.objects.filter(user=user)

    # are there automates for the user?
    if not automates.count():
        logger.debug(f"No automates for user {user}. Quit.")
        return None

    # check all automates for given user (the owner of the document)
    for automate in automates:
        if not automate.is_a_match(hocr):
            logger.debug(f"No match for automate={automate}"
                         f" doc_id={document_id} page_num={page_num}")
            continue

        logger.debug(f"Automate {automate} matched document={document}")
        plugin_klass = get_plugin_by_module_name(automate.plugin_name)
        logger.debug(f"Found plugin module={plugin_klass.__module__}")
        logger.debug(f"len(hocr)=={len(hocr)}")
        automate.apply(
            document=document,
            page_num=page_num,
            hocr=hocr,
            # Notice () - plugin passed is instance of the class
            plugin=plugin_klass()
        )
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Extract image, text and hOCR for one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance

    Returns the ``PagePath`` used for the extraction, or ``None`` when
    ``page_num`` is greater than the document's page count (nothing is
    extracted in that case).
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Explicit result for an out-of-range page: ``page_url`` is only
        # assigned for valid pages, so returning it here would raise
        # UnboundLocalError.
        return None

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def test_step(self):
    """``Step(1)`` must not report itself as a thumbnail."""
    step = Step(1)

    is_thumbnail = step.is_thumbnail
    failure_message = f"{step} is is_thumbnail, but it should not be!"

    self.assertFalse(is_thumbnail, failure_message)
def apply_automates(document_id, page_num):
    """
    Run the document owner's automates against the text of one page and
    report the outcome through the ``automates_matching`` signal.

    Logs an error and returns when the document does not exist. The text
    is read from the page's text file of version 0 of the document. When
    the user has no automates the function returns without sending the
    signal. Otherwise every matching automate is applied (with a plugin
    instance, or ``None`` when no plugin class is found) and a summary
    message is sent. Returns ``None`` in all cases.
    """
    logger.debug("apply_automates: Begin.")

    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return None

    # use text files from the original version of the document
    doc_path = DocumentPath.copy_from(document.path, version=0)
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        page_count=page_count,
        step=Step(),
    )
    user = document.user
    text_path = default_storage.abspath(page_path.txt_url())

    with open(text_path, "r") as text_file:
        text = text_file.read()

    automates = Automate.objects.filter(user=user)

    # are there automates for the user?
    if automates.count() == 0:
        logger.debug(
            f"No automates for user {user}. Quit."
        )
        return None

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if not automate.is_a_match(text):
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )
            continue

        logger.debug(f"Automate {automate} matched document={document}")
        plugin_klass = get_plugin_by_module_name(automate.plugin_name)
        if plugin_klass:
            # Notice () - plugin passed is instance of the class
            plugin = plugin_klass()
        else:
            plugin = None

        automate.apply(
            document=document,
            page_num=page_num,
            hocr=text,
            plugin=plugin
        )
        matched.append(automate)

    summary_values = {
        'count': len(matched),
        'total': automates.count()
    }
    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % summary_values

    if matched:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=document.user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )