def txt_url(self):
    """
    Return the txt URL of this page.

    A ``PagePath`` is built from the owning document's path, this
    page's number and its page count; the result of its ``txt_url()``
    is returned unchanged.
    """
    return PagePath(
        document_ep=self.document.path,
        page_num=self.number,
        page_count=self.page_count,
    ).txt_url()
def txt_exists(self):
    """
    Tell whether the txt resource of this page exists.

    Delegates to ``PagePath.txt_exists()`` of a ``PagePath`` built from
    the document's ``doc_ep``, this page's number and its page count.
    """
    return PagePath(
        document_ep=self.document.doc_ep,
        page_num=self.number,
        page_count=self.page_count,
    ).txt_exists()
def ocr_page_image(doc_path, page_num, lang):
    """
    OCR a single-page image document (jpg, jpeg, png).

    Steps performed, in order:

    1. resize (and eventually convert png -> jpg) the image for Step(1)
       and extract its text;
    2. generate a resized image for every step returned by ``Steps()``;
    3. extract hOCR for every non-thumbnail step.

    ``doc_path`` is the document path handed to ``PagePath``, ``page_num``
    the page number and ``lang`` the language passed to the extractors.

    Returns the ``PagePath`` used for processing, with its step set
    back to ``Step(1)``.
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1
    )
    # resize and eventually convert (png -> jpg)
    resize_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # First quickly generate preview images
    for step in Steps():
        page_url.step = step
        resize_img(page_url, media_root=settings.MEDIA_ROOT)

    # Now OCR each image. The step must follow the loop variable,
    # otherwise the very same Step(1) image is OCRed once per
    # non-thumbnail step and the other steps never get their hOCR.
    for step in Steps():
        if not step.is_thumbnail:
            page_url.step = step
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # reset page's step, so that callers receive a Step(1) path
    page_url.step = Step(1)

    return page_url
def test_preview(self):
    """
    Posting to ``core:preview`` for page 1 / step 1 of a freshly
    uploaded 3-page document answers 200 and leaves the page image
    on the storage.
    """
    pdf_name = "berlin.pdf"
    doc = Document.create_document(
        title=pdf_name,
        user=self.testcase_user,
        lang="ENG",
        file_name=pdf_name,
        size=1222,
        page_count=3
    )
    # put the fixture file where the storage expects the document
    fixture = os.path.join(BASE_DIR, "data", pdf_name)
    default_storage.copy_doc(src=fixture, dst=doc.path.url())

    preview_url = reverse('core:preview', args=(doc.id, 1, 1))
    response = self.client.post(preview_url)
    self.assertEqual(response.status_code, 200)

    page_path = PagePath(
        document_path=doc.path,
        page_num=1,
        step=Step(1),
        page_count=3
    )
    image_abspath = default_storage.abspath(page_path.img_url())
    self.assertTrue(os.path.exists(image_abspath))
def test_txt_url(self):
    """
    Called without any arguments, page_ep.url() gives the same
    result as page_ep.txt_url().
    """
    doc_ep = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )
    page_ep = PagePath(
        document_ep=doc_ep,
        page_num=1,
        step=Step(1),
        page_count=3
    )
    default_url = page_ep.url()
    txt_url = page_ep.txt_url()

    self.assertEqual(default_url, txt_url)
def paste_pages(self,
                dest_doc_path,
                data_list,
                dest_doc_is_new=False,
                after_page_number=False,
                before_page_number=False):
    """
    Pastes pages into the document pointed by dest_doc_path.

    dest_doc_path is an instance of mglib.path.DocumentPath.
    data_list is a list of dictionaries, each with two keys:

        * 'doc_path'  - path of the source document
        * 'page_nums' - page numbers to take from that document

    The pasted result is written as the next version of the
    destination document; per-page files of every step are then copied
    to the new version. ``data_list`` itself is left untouched.

    Returns document's new version.
    """
    next_ver_dp = DocumentPath.copy_from(
        dest_doc_path,
        version=dest_doc_path.version + 1
    )
    self.make_sure_path_exists(self.abspath(next_ver_dp))

    pdftk.paste_pages(
        src=self.abspath(dest_doc_path),
        dst=self.abspath(next_ver_dp),
        data_list=data_list,
        dst_doc_is_new=dest_doc_is_new,
        after_page_number=after_page_number,
        before_page_number=before_page_number
    )

    # Work on a shallow copy: the caller's list must not grow an extra
    # entry as a side effect of this call.
    sources = list(data_list)

    if not dest_doc_is_new:
        # migrate document's own pages from previous
        # version (this differs from pasting into newly
        # created docs)
        pcount = self.get_pagecount(dest_doc_path)
        sources.insert(0, {
            'doc_path': dest_doc_path,
            'page_nums': list(range(1, pcount + 1))
        })

    dest_page_num = 1
    dest_page_count = sum(len(item['page_nums']) for item in sources)

    for item in sources:
        src_path = item['doc_path']
        page_nums = item['page_nums']
        if not page_nums:
            # nothing to copy from this source
            continue
        # page count of a source does not change while its pages are
        # copied - look it up once per source, not per page and step.
        src_page_count = self.get_pagecount(src_path)

        for page_num in page_nums:
            for step in Steps():
                src_page_path = PagePath(
                    document_path=src_path,
                    page_num=int(page_num),
                    step=step,
                    page_count=src_page_count
                )
                dst_page_path = PagePath(
                    document_path=next_ver_dp,
                    page_num=dest_page_num,
                    step=step,
                    page_count=dest_page_count
                )
                logger.debug(f"src={src_page_path} dst={dst_page_path}")
                self.copy_page(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )
            dest_page_num += 1

    return dest_doc_path.version + 1
def reorder_pages(self, doc_path, new_order):
    """
    Reorders pages in the document pointed by doc_path.
    doc_path is an instance of mglib.path.DocumentPath

    In case of success returns document's new version. If new_order
    has more entries than the document has pages, an error is logged,
    nothing is written and None is returned.

    new_order is a list of following format:

        [
            {'page_num': 2, page_order: 1},
            {'page_num': 1, page_order: 2},
            {'page_num': 3, page_order: 3},
            {'page_num': 4, page_order: 4},
        ]

    Example above means that in current document of 4 pages,
    first page was swapped with second one.
    page_num    = older page order
    page_order  = current page order
    So in human language, each hash is read:
        <page_num> now should be <page_order>
    """
    # Validate before touching the storage, so that a rejected request
    # does not leave a half-made new version behind.
    page_count = self.get_pagecount(doc_path)
    if len(new_order) > page_count:
        logger.error(
            f"new_order({new_order}) > page_count({page_count})")
        return

    src_doc_path = doc_path
    dst_doc_path = DocumentPath.copy_from(
        src_doc_path,
        version=doc_path.version + 1
    )
    self.make_sure_path_exists(self.abspath(dst_doc_path))

    stapler.reorder_pages(
        src=self.abspath(src_doc_path),
        dst=self.abspath(dst_doc_path),
        new_order=new_order
    )

    order_len = len(new_order)
    for item in new_order:
        for step in Steps():
            src_page_path = PagePath(
                document_path=src_doc_path,
                page_num=int(item['page_num']),
                step=step,
                page_count=order_len
            )
            dst_page_path = PagePath(
                document_path=dst_doc_path,
                page_num=int(item['page_order']),
                step=step,
                page_count=order_len
            )
            self.copy_page(
                src_page_path=src_page_path,
                dst_page_path=dst_page_path
            )

    return doc_path.version + 1
def ocr_page_image(doc_path, page_num, lang, **kwargs):
    """
    OCR a single-page image document (jpg, jpeg, png).

    image = jpg, jpeg, png

    A notification is emitted before OCR starts, once the text is
    extracted and after hOCR of each non-thumbnail step. Extra keyword
    arguments are forwarded to the notification helpers; an optional
    ``file_name`` among them overrides ``doc_path.file_name`` in the
    notifications.

    On success returns ``mglib.path.PagePath`` instance (with its step
    set to ``Step(1)``).
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    # ``file_name`` is passed explicitly to the notify_* helpers below;
    # take it out of kwargs so it cannot be supplied twice.
    file_name = kwargs.pop('file_name', None)
    if not file_name:
        file_name = doc_path.file_name

    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1
    )
    notify_pre_page_ocr(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )
    # resize and eventually convert (png -> jpg)
    resize_img(page_path, media_root=settings.MEDIA_ROOT)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    # First quickly generate preview images
    for step in Steps():
        page_path.step = step
        resize_img(page_path, media_root=settings.MEDIA_ROOT)

    # Now OCR each image. The page's step has to follow the loop
    # variable: otherwise only the Step(1) image is OCRed (over and over)
    # while the notification claims hOCR of another step is ready.
    for step in Steps():
        if not step.is_thumbnail:
            page_path.step = step
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs
            )

    # reset page's step, so that callers receive a Step(1) path
    page_path.step = Step(1)

    return page_path
def page_paths(self):
    """
    Quick access to the page paths of the document:

        page_path = doc.page_paths[2]
        page_path.url() # local url to second page of the doc.

    Shortcut for the most used case, i.e. when Step(1) is required.
    The returned list is 1-based: index 0 holds None.

    Side effect: if the stored page_count differs from the page count
    of the file, the stored value is corrected and the instance saved.
    """
    # doc.page_count might be wrong because per
    # page logic was added just recently. So, let's use
    # this opportunity and correct it!
    actual_count = get_pagecount(self.absfilepath)
    if self.page_count != actual_count:
        self.page_count = actual_count
        self.save()

    paths = [
        PagePath(
            document_path=self.path,
            page_num=number,
            step=step.Step(1),
            page_count=self.page_count
        )
        for number in range(1, actual_count + 1)
    ]

    # indexing starts from 1
    return [None] + paths
def get_page_path(self, page_num, step):
    """
    Build the PagePath of page ``page_num`` for the given ``step``.

    For Step(1) shortcut, use doc_instance.page_eps property.
    """
    page_path = PagePath(
        document_path=self.path,
        page_num=page_num,
        step=step,
        page_count=self.page_count
    )
    return page_path
def test_ppmroot(self):
    """
    ppmroot of page 1 at Step(1) points inside the page's
    "100" folder of the results tree.
    """
    expected = "results/user_1/document_3/pages/page_1/100/page"

    doc_ep = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )
    page_url = PagePath(
        document_ep=doc_ep,
        page_num=1,
        step=Step(1),
        page_count=3
    )

    self.assertEqual(page_url.ppmroot, expected)
def test_versioned_page_ep(self):
    """
    After the document's version is incremented, the page path
    contains the version folder (v1).
    """
    expected = "results/user_1/document_3/v1/pages/page_1.txt"

    doc_ep = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )
    # document's version incremented
    doc_ep.inc_version()

    page_ep = PagePath(
        document_ep=doc_ep,
        page_num=1,
        page_count=3
    )

    self.assertEqual(page_ep.path, expected)
def delete_pages(self, doc_path, page_numbers, skip_migration=False):
    """
    Deletes pages in the document pointed by doc_path.
    doc_path is an instance of mglib.path.DocumentPath

    page_numbers is the list of page numbers to delete. With
    skip_migration=True only the document file of the new version is
    produced; per-page files are not copied to the new version.

    In case of success returns document's new version.
    Returns False if page_numbers is not a list, and None if (with
    migration enabled) more pages are to be deleted than the document
    has; in the latter case nothing is written.
    """
    if not isinstance(page_numbers, list):
        logger.error("Expecting list argument")
        return False

    page_count = None
    if not skip_migration:
        # Validate before touching the storage, so that a rejected
        # request does not leave a half-made new version behind.
        page_count = self.get_pagecount(doc_path)
        if len(page_numbers) > page_count:
            logger.error(
                f"deleted_pages({page_numbers}) > page_count({page_count})")
            return

    src_doc_path = doc_path
    dst_doc_path = DocumentPath.copy_from(
        src_doc_path,
        version=doc_path.version + 1
    )
    self.make_sure_path_exists(self.abspath(dst_doc_path))

    stapler.delete_pages(
        self.abspath(src_doc_path),
        self.abspath(dst_doc_path),
        page_numbers
    )

    if skip_migration:
        return doc_path.version + 1

    assigns = get_assigns_after_delete(
        total_pages=page_count,
        deleted_pages=page_numbers
    )
    dst_page_count = page_count - len(page_numbers)

    # each assign is read here as (page number in new version,
    # page number in old version)
    for a in assigns:
        for step in Steps():
            src_page_path = PagePath(
                document_path=src_doc_path,
                page_num=a[1],
                step=step,
                page_count=page_count
            )
            dst_page_path = PagePath(
                document_path=dst_doc_path,
                page_num=a[0],
                step=step,
                page_count=dst_page_count
            )
            self.copy_page(
                src_page_path=src_page_path,
                dst_page_path=dst_page_path
            )

    return doc_path.version + 1
def ocr_page_pdf(doc_path, page_num, lang):
    """
    OCR one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance

    Images are extracted for every step first, then the text of the
    page, then hOCR of every non-thumbnail step.

    Returns the ``PagePath`` used for processing, or None (after
    logging an error) when page_num is greater than the number of
    pages of the document.
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process; bail out explicitly instead of failing
        # later on an unbound local variable.
        logger.error(
            f"page_num({page_num}) > page_count({page_count})")
        return None

    # first quickly generate preview images
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # start over from a fresh Step(1) path for text extraction
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    OCR one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance

    A notification is emitted before OCR starts, once the text is
    extracted and after hOCR of each non-thumbnail step. Extra keyword
    arguments are forwarded to the notification helpers; an optional
    ``file_name`` among them overrides ``doc_path.file_name`` in the
    notifications.

    On success returns ``mglib.path.PagePath`` instance. Returns None
    (after logging an error) when page_num is greater than the number
    of pages of the document.
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)
    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process; bail out explicitly instead of failing
        # later on an unbound local variable.
        logger.error(
            f"page_num({page_num}) > page_count({page_count})")
        return None

    # first quickly generate preview images
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    # same (possibly overridden) file name as in the other notifications
    notify_pre_page_ocr(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    # start over from a fresh Step(1) path for text extraction
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    for step in Steps():
        page_path.step = step
        if not step.is_thumbnail:
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs
            )

    return page_path
def ocr_page_pdf(doc_path, page_num, lang):
    """
    OCR one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance

    The Step(1) image and the text of the page are extracted first;
    then, for every step, the image is extracted and - unless the step
    is a thumbnail - its hOCR as well.

    Returns the ``PagePath`` used for processing, or None when
    page_num is greater than the number of pages of the document.
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process; return explicitly instead of failing
        # on an unbound local variable at the final return.
        return None

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def page_paths(self, version=None):
    """
    Enables document instance to get quickly page paths:

        page_path = doc.page_paths(version)[2]
        page_path.url() # local url to second page of the doc.

    This is shortcut method when most used Step(1) is required.

    ``version`` is handed over to ``self.get_pagecount`` and
    ``self.path``. The returned list is 1-based: index 0 holds None.
    """
    results = [None]  # indexing starts from 1

    # The page count is the same for all pages: ask for it once instead
    # of once per page.
    page_count = self.get_pagecount(version=version)

    for page_num in range(1, page_count + 1):
        page_path = PagePath(
            document_path=self.path(version=version),
            page_num=page_num,
            step=step.Step(1),
            page_count=page_count
        )
        results.append(page_path)

    return results
def path(self):
    """
    Return the ``PagePath`` of this page, built from the owning
    document's path, the page number and the page count.
    """
    page_path = PagePath(
        document_path=self.document.path,
        page_num=self.number,
        page_count=self.page_count
    )
    return page_path
def apply_automates(document_id, page_num):
    """
    Run the automates of the document's owner against the text of
    one page.

    The text is read from the txt file of page ``page_num`` of the
    original version (version 0) of the document. Every automate of the
    user that matches the text is applied; finally a summary is sent via
    the ``automates_matching`` signal.

    Returns None. Quits early (without sending the signal) if the
    document does not exist or the user has no automates.
    """
    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return

    # use text files from the original version of the document
    doc_path = DocumentPath.copy_from(
        document.path,
        version=0
    )
    page_count = get_pagecount(
        default_storage.abspath(doc_path.url())
    )
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        page_count=page_count,
        step=Step(),
    )
    user = document.user

    text_path = default_storage.abspath(page_path.txt_url())
    # Decode explicitly rather than with the locale's default encoding.
    # NOTE(review): assumes the txt file is UTF-8 (what tesseract
    # writes) - confirm against extract_txt.
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    automates = Automate.objects.filter(user=user)
    # are there automates for the user?
    if automates.count() == 0:
        logger.debug(
            f"No automates for user {user}. Quit."
        )
        return

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if automate.is_a_match(text):
            logger.debug(f"Automate {automate} matched document={document}")

            plugin_klass = get_plugin_by_module_name(
                automate.plugin_name
            )
            plugin = plugin_klass() if plugin_klass else None

            automate.apply(
                document=document,
                page_num=page_num,
                hocr=text,
                # Notice () - plugin passed is instance of the class
                plugin=plugin
            )
            matched.append(automate)
        else:
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )

    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % {
        'count': len(matched),
        'total': automates.count()
    }

    if len(matched) > 0:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=document.user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )
def path(self, version=None):
    """
    Return the ``PagePath`` of this page.

    ``version`` is handed over to the owning document's ``path()``;
    page number and page count are taken from this page.
    """
    document_path = self.document.path(version=version)
    return PagePath(
        document_path=document_path,
        page_num=self.number,
        page_count=self.page_count
    )