def test_pages_dirname(self):
    """pages_dirname() is rooted at the given aux_dir and ends in ``pages/``."""
    doc_path = DocumentPath(
        user_id=1,
        document_id=3,
        aux_dir="results",
        file_name="x.pdf",
    )
    expected_dirname = "results/user_1/document_3/pages/"

    self.assertEqual(doc_path.pages_dirname(), expected_dirname)
def test_document_url_with_another_version(self):
    """url(version=N) puts the file under the matching ``vN`` folder."""
    document_path = DocumentPath(
        user_id=1,
        document_id=15,
        file_name="x.pdf",
    )

    url_v3 = document_path.url(version=3)
    self.assertEqual(url_v3, "docs/user_1/document_15/v3/x.pdf")

    url_v2 = document_path.url(version=2)
    self.assertEqual(url_v2, "docs/user_1/document_15/v2/x.pdf")
def test_versioned_page_ep(self):
    """A page path built on an incremented document carries ``v1``."""
    document_path = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf",
    )
    # Increment the document's version once, so the expected
    # page path below contains the v1 component.
    document_path.inc_version()

    page_path = PagePath(
        document_ep=document_path,
        page_num=1,
        page_count=3,
    )

    self.assertEqual(
        page_path.path,
        "results/user_1/document_3/v1/pages/page_1.txt",
    )
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    OCR a single page of a document, dispatching on the detected mime type.

    * PDF   -> ocr_page_pdf
    * image -> ocr_page_image
    * TIFF  -> converted with convert_tiff2pdf, then ocr_page_pdf

    Any other type is logged as an error. The function returns True
    in every case, including the unknown-type one.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started_at = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # Arguments shared by every OCR helper. doc_path is held by reference,
    # so the file_name change made in the TIFF branch is seen by the helper.
    ocr_kwargs = dict(doc_path=doc_path, page_num=page_num, lang=lang)

    # NOTE: page_type is only ever set for PDFs; for images and TIFFs the
    # final debug line reports an empty page_type.
    page_type = ''
    if mime_type.is_pdf():
        ocr_page_pdf(**ocr_kwargs)
        page_type = 'pdf'
    elif mime_type.is_image():
        # jpg, jpeg or png
        ocr_page_image(**ocr_kwargs)
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # The conversion yields the name of a new .pdf file ...
        pdf_file_name = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url())
        )
        # ... which replaces the TIFF name; from here on it is a regular PDF.
        doc_path.file_name = pdf_file_name
        ocr_page_pdf(**ocr_kwargs)
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    elapsed = time.time() - started_at
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={elapsed:.2f}")

    return True
def test_ppmroot(self):
    """ppmroot for page 1 at Step(1) points into ``page_1/100/page``."""
    document_path = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf",
    )
    page_path = PagePath(
        document_ep=document_path,
        page_num=1,
        step=Step(1),
        page_count=3,
    )
    expected_root = "results/user_1/document_3/pages/page_1/100/page"

    self.assertEqual(page_path.ppmroot, expected_root)
def vpath(self, version=0):
    """
    Return a DocumentPath for this document at the given version.

    The path is built from self.user.id, self.id and self.file_name;
    version defaults to 0.
    """
    return DocumentPath(
        user_id=self.user.id,
        document_id=self.id,
        version=version,
        file_name=self.file_name,
    )
def test_txt_url(self):
    """
    Called without any arguments, page_ep.url() gives the same
    result as page_ep.txt_url().
    """
    document_path = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf",
    )
    page_path = PagePath(
        document_ep=document_path,
        page_num=1,
        step=Step(1),
        page_count=3,
    )

    default_url = page_path.url()
    txt_url = page_path.txt_url()

    self.assertEqual(default_url, txt_url)
def paste_pages(self,
                dest_doc_path,
                data_list,
                dest_doc_is_new=False,
                after_page_number=False,
                before_page_number=False):
    """
    Paste pages into the document pointed to by dest_doc_path.

    dest_doc_path and every item['doc_path'] of data_list are instances
    of mglib.path.DocumentPath.

    data_list is a list of dicts of the form
    {'doc_path': <DocumentPath>, 'page_nums': [<page number>, ...]}
    describing which pages of which source documents are pasted.

    The pasted document is written as the next version of dest_doc_path
    (via pdftk.paste_pages), then the per-page files of all involved
    pages are copied, for every step, into the new version.

    data_list is NOT modified: the destination document's own pages are
    prepended to an internal copy only.

    Returns the new version number (dest_doc_path.version + 1).
    """
    new_version = dest_doc_path.version + 1
    next_ver_dp = DocumentPath.copy_from(dest_doc_path, version=new_version)
    self.make_sure_path_exists(self.abspath(next_ver_dp))

    pdftk.paste_pages(src=self.abspath(dest_doc_path),
                      dst=self.abspath(next_ver_dp),
                      data_list=data_list,
                      dst_doc_is_new=dest_doc_is_new,
                      after_page_number=after_page_number,
                      before_page_number=before_page_number)

    if dest_doc_is_new:
        migration_list = list(data_list)
    else:
        # migrate document's own pages from previous
        # version (this differs from pasting into newly
        # created docs). Build a new list instead of inserting into the
        # caller's data_list, so the argument is left untouched.
        # NOTE(review): own pages always come first here, regardless of
        # after_page_number/before_page_number - confirm this matches the
        # page order produced by pdftk.paste_pages.
        pcount = self.get_pagecount(dest_doc_path)
        own_pages = {
            'doc_path': dest_doc_path,
            'page_nums': list(range(1, pcount + 1)),
        }
        migration_list = [own_pages, *data_list]

    dest_page_count = sum(len(item['page_nums']) for item in migration_list)
    dest_page_num = 1

    for item in migration_list:
        page_nums = item['page_nums']
        if not page_nums:
            # Nothing to copy from this source; skip it without touching
            # the storage (get_pagecount lists a directory).
            continue

        src_path = item['doc_path']
        # Invariant for the whole item: look it up once instead of once
        # per page and per step.
        src_page_count = self.get_pagecount(src_path)

        for page_num in page_nums:
            for step in Steps():
                src_page_path = PagePath(document_path=src_path,
                                         page_num=int(page_num),
                                         step=step,
                                         page_count=src_page_count)
                dst_page_path = PagePath(document_path=next_ver_dp,
                                         page_num=dest_page_num,
                                         step=step,
                                         page_count=dest_page_count)
                logger.debug(f"src={src_page_path} dst={dst_page_path}")
                self.copy_page(src_page_path=src_page_path,
                               dst_page_path=dst_page_path)
            # One destination slot per pasted page (not per step).
            dest_page_num += 1

    return new_version
def reorder_pages(self, doc_path, new_order):
    """
    Reorder pages in the document pointed to by doc_path.

    doc_path is an instance of mglib.path.DocumentPath.

    new_order is a list of the following format:

        [
            {'page_num': 2, page_order: 1},
            {'page_num': 1, page_order: 2},
            {'page_num': 3, page_order: 3},
            {'page_num': 4, page_order: 4},
        ]

    The example above means that in the current document of 4 pages,
    the first page was swapped with the second one.

        page_num   = older page order
        page_order = current page order

    So in human language, each hash reads:
        <page_num> now should be <page_order>

    In case of success returns the document's new version
    (doc_path.version + 1). Returns None, without writing anything, when
    new_order has more entries than the document has pages.
    """
    # Validate before producing any output, so that a rejected request
    # does not leave a half-made next version behind.
    page_count = self.get_pagecount(doc_path)
    order_len = len(new_order)
    if order_len > page_count:
        logger.error(
            f"new_order({new_order}) > page_count({page_count})"
        )
        return None

    new_version = doc_path.version + 1
    src_doc_path = doc_path
    dst_doc_path = DocumentPath.copy_from(src_doc_path, version=new_version)
    self.make_sure_path_exists(self.abspath(dst_doc_path))

    stapler.reorder_pages(src=self.abspath(src_doc_path),
                          dst=self.abspath(dst_doc_path),
                          new_order=new_order)

    # Copy the per-page files of every step from the old position
    # (page_num) to the new position (page_order).
    for item in new_order:
        for step in Steps():
            src_page_path = PagePath(document_path=src_doc_path,
                                     page_num=int(item['page_num']),
                                     step=step,
                                     page_count=order_len)
            dst_page_path = PagePath(document_path=dst_doc_path,
                                     page_num=int(item['page_order']),
                                     step=step,
                                     page_count=order_len)
            self.copy_page(src_page_path=src_page_path,
                           dst_page_path=dst_page_path)

    return new_version
def path(self):
    """
    Return the DocumentPath of this document at its current version.

    When self.version is not an int, version 0 is used instead.
    """
    current = self.version
    version = current if isinstance(current, int) else 0

    return DocumentPath(
        user_id=self.user.id,
        document_id=self.id,
        version=version,
        file_name=self.file_name,
    )
def get_pagecount(self, doc_path):
    """
    Return the total number of pages for this doc_path.

    The total is the number of sub-directories found in the document's
    pages_dirname folder under the "results" aux dir.
    """
    results_doc_path = DocumentPath.copy_from(doc_path, aux_dir="results")
    pages_dir = self.abspath(results_doc_path.pages_dirname())

    return sum(
        1
        for entry in listdir(pages_dir)
        if isdir(join(pages_dir, entry))
    )
def path(self, version=None):
    """
    Return the DocumentPath of this document.

    version selects which version the path points to; when it is None
    the document's own self.version is used. The chosen value is
    converted with int() before being passed on.
    """
    chosen = self.version if version is None else version
    version_number = int(chosen)

    return DocumentPath(
        user_id=self.user.id,
        document_id=self.id,
        version=version_number,
        file_name=self.file_name,
    )
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    OCR a single page of a PDF document.

    The file's mime type is detected first; only PDFs are processed
    (through ocr_page_pdf). Any other type is logged as an error.
    The function returns True in both cases.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started_at = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))

    # Guard clause: anything that is not a PDF is reported and skipped.
    if not mime_type.is_pdf():
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
    page_type = 'pdf'

    elapsed = time.time() - started_at
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={elapsed:.2f}")

    return True
def test_document_url_none_vs_0(self):
    """version=None means "current version"; version=0 means "original"."""
    document_path = DocumentPath(
        user_id=1,
        document_id=15,
        file_name="x.pdf",
    )
    # Three increments: current version becomes 1, then 2, then 3.
    for _ in range(3):
        document_path.inc_version()

    # With version == None the latest version of the document
    # is returned, which is 3.
    latest_url = document_path.url(version=None)
    self.assertEqual(latest_url, "docs/user_1/document_15/v3/x.pdf")

    # With version == 0, version 0 is provided,
    # i.e. version=0 returns the original doc.
    original_url = document_path.url(version=0)
    self.assertEqual(original_url, "docs/user_1/document_15/x.pdf")
def test_get_versions_2(self):
    """get_versions reports only [0] when no version folders exist."""
    storage = FileSystemStorage(location=MEDIA_ROOT)

    with TemporaryNode(MEDIA_ROOT) as media_root:
        docs_folder = media_root.add_folder("docs")
        document_folder = docs_folder.add_folder("user_1/document_2")
        document_folder.add_file("doku.pdf")

        doc_path = DocumentPath(
            user_id=1,
            document_id=2,
            file_name='doku.pdf',
            version=2,
        )

        found_versions = storage.get_versions(doc_path)

        # The document folder holds only doku.pdf and no v1/v2 folders,
        # so a single version is expected although doc_path asks for 2.
        self.assertEqual(found_versions, [0])
def test_inc_version(self):
    """
    Document endpoints are versioned, starting at version 0.

    At version 0 the "old" endpoint path applies, i.e. the version is
    not part of the path. Once the document is modified (a blank page
    is deleted, for example) its version is incremented, and for any
    version > 0 the version is included in the path.
    """
    document_path = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf",
    )
    urls_after_each_increment = (
        "docs/user_1/document_3/v1/x.pdf",
        "docs/user_1/document_3/v2/x.pdf",
    )

    for expected_url in urls_after_each_increment:
        document_path.inc_version()
        self.assertEqual(document_path.url(), expected_url)
def delete_pages(self, doc_path, page_numbers, skip_migration=False):
    """
    Delete pages in the document pointed to by doc_path.

    doc_path is an instance of mglib.path.DocumentPath and
    page_numbers must be a list of the page numbers to remove.

    The result is written (via stapler.delete_pages) as the next version
    of the document. Unless skip_migration is set, the per-page files of
    the remaining pages are then copied, for every step, into the new
    version.

    Returns:
        * the document's new version (doc_path.version + 1) on success;
        * False when page_numbers is not a list;
        * None when more pages are to be deleted than the document has.
    """
    if not isinstance(page_numbers, list):
        logger.error("Expecting list argument")
        return False

    new_version = doc_path.version + 1
    new_doc_path = DocumentPath.copy_from(doc_path, version=new_version)
    self.make_sure_path_exists(self.abspath(new_doc_path))

    stapler.delete_pages(
        self.abspath(doc_path),
        self.abspath(new_doc_path),
        page_numbers,
    )

    if skip_migration:
        # Caller only wants the new document file; the page count is
        # deliberately not looked up on this path.
        return new_version

    total_pages = self.get_pagecount(doc_path)
    deleted_count = len(page_numbers)

    if deleted_count > total_pages:
        logger.error(
            f"deleted_pages({page_numbers}) > page_count({total_pages})"
        )
        return

    remaining_pages = total_pages - deleted_count
    assignments = get_assigns_after_delete(total_pages=total_pages,
                                           deleted_pages=page_numbers)

    for assignment in assignments:
        # assignment[0] is used as the page number in the new version,
        # assignment[1] as the page number in the old one.
        new_page_num = assignment[0]
        old_page_num = assignment[1]
        for step in Steps():
            source_page = PagePath(document_path=doc_path,
                                   page_num=old_page_num,
                                   step=step,
                                   page_count=total_pages)
            target_page = PagePath(document_path=new_doc_path,
                                   page_num=new_page_num,
                                   step=step,
                                   page_count=remaining_pages)
            self.copy_page(src_page_path=source_page,
                           dst_page_path=target_page)

    return new_version
def test_delete(self):
    """delete_doc removes the document's folder from the docs tree."""
    storage = FileSystemStorage(location=MEDIA_ROOT)

    with TemporaryNode(MEDIA_ROOT) as media_root:
        docs_folder = media_root.add_folder("docs")
        results_folder = media_root.add_folder("results")

        document_folder = docs_folder.add_folder("user_1/document_2")
        document_folder.add_file("doku.pdf")
        results_folder.add_folder("user_1/document_2/pages")

        doc_path = DocumentPath(
            user_id=1,
            document_id=2,
            file_name='doku.pdf',
        )

        # Present before the deletion, gone after it.
        self.assertTrue(document_folder.exists())
        storage.delete_doc(doc_path)
        self.assertFalse(document_folder.exists())
def test_get_versions_1(self):
    """get_versions lists version 0 plus every ``vN`` folder present."""
    storage = FileSystemStorage(location=MEDIA_ROOT)

    with TemporaryNode(MEDIA_ROOT) as media_root:
        docs_folder = media_root.add_folder("docs")
        results_folder = media_root.add_folder("results")

        document_folder = docs_folder.add_folder("user_1/document_2")
        document_folder.add_file("doku.pdf")
        # Simulate 2 versions of the document.
        document_folder.add_folder("v1")
        document_folder.add_folder("v2")
        results_folder.add_folder("user_1/document_2/pages")

        doc_path = DocumentPath(
            user_id=1,
            document_id=2,
            file_name='doku.pdf',
            version=2,
        )

        found_versions = storage.get_versions(doc_path)

        self.assertEqual(found_versions, [0, 1, 2])
def apply_automates(document_id, page_num):
    """
    Run the document owner's automates against the text of one page.

    The page text is read from the original version (version 0) of the
    document. Every automate of the document's user whose is_a_match(text)
    is true gets applied to the document; finally an automates_matching
    signal is sent with a summary message.

    Returns None in all cases. Nothing is read from storage and no signal
    is sent when the document does not exist or its user has no automates.
    """
    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return

    user = document.user
    automates = Automate.objects.filter(user=user)
    # are there automates for the user? Checked first, so that the page
    # count lookup and the text file read are skipped when there is
    # nothing to apply.
    if not automates.exists():
        logger.debug(
            f"No automates for user {user}. Quit."
        )
        return

    # use text files from the original version of the document
    doc_path = DocumentPath.copy_from(
        document.path,
        version=0
    )
    page_count = get_pagecount(
        default_storage.abspath(doc_path.url())
    )
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        page_count=page_count,
        step=Step(),
    )
    text_path = default_storage.abspath(page_path.txt_url())
    with open(text_path, "r") as f:
        text = f.read()

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if automate.is_a_match(text):
            logger.debug(f"Automate {automate} matched document={document}")

            plugin_klass = get_plugin_by_module_name(
                automate.plugin_name
            )
            # Notice () - plugin passed is instance of the class
            plugin = plugin_klass() if plugin_klass else None

            automate.apply(
                document=document,
                page_num=page_num,
                hocr=text,
                plugin=plugin
            )
            matched.append(automate)
        else:
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )

    # The loop above evaluated the queryset, so len() is served from its
    # result cache instead of issuing another COUNT query.
    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % {
        'count': len(matched),
        'total': len(automates)
    }

    if matched:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=document.user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
    namespace=None,
):
    """
    OCR a single page of a document, dispatching on the detected mime type.

    If the document is not present in default_storage it is downloaded
    first. Then:

    * PDF   -> ocr_page_pdf
    * image -> ocr_page_image
    * TIFF  -> converted with convert_tiff2pdf, then ocr_page_pdf
               (the original TIFF file name is passed along as file_name)

    Any other type is logged as an error. The function returns True
    in every case, including the unknown-type one.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started_at = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    if not default_storage.exists(doc_path.url()):
        # In a distributed deployment the document uploaded by the webapp
        # is not directly available to the worker (which runs on a
        # separate computer). So, when the document is not available
        # locally, the worker downloads it from whatever remote location.
        default_storage.download(
            doc_path_url=doc_path.url(),
            namespace=namespace,
        )

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # Arguments shared by every OCR helper. doc_path is held by reference,
    # so the file_name change made in the TIFF branch is seen by the helper.
    ocr_kwargs = dict(
        doc_path=doc_path,
        page_num=page_num,
        lang=lang,
        user_id=user_id,
        document_id=document_id,
        namespace=namespace,
    )

    # NOTE: page_type is only ever set for PDFs; for images and TIFFs the
    # final debug line reports an empty page_type.
    page_type = ''
    if mime_type.is_pdf():
        ocr_page_pdf(**ocr_kwargs)
        page_type = 'pdf'
    elif mime_type.is_image():
        # jpg, jpeg or png
        ocr_page_image(**ocr_kwargs)
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # The conversion yields the name of a new .pdf file ...
        pdf_file_name = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url())
        )
        # ... which replaces the TIFF name on doc_path; from here on the
        # document is handled as a regular PDF.
        orig_file_name = doc_path.file_name
        doc_path.file_name = pdf_file_name
        # Pass original file_name i.e. tiff file name as well.
        ocr_page_pdf(file_name=orig_file_name, **ocr_kwargs)
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    elapsed = time.time() - started_at
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={elapsed:.2f}")

    return True
def test_document_url(self):
    """url() of a freshly created DocumentPath has no version component."""
    document_path = DocumentPath(
        user_id=1,
        document_id=3,
        file_name="x.pdf",
    )
    expected_url = "docs/user_1/document_3/x.pdf"

    self.assertEqual(document_path.url(), expected_url)
def paste(self, doc_pages, after=False, before=False):
    """
    Paste pages in current document.

    doc_pages maps a source document id to the list of its page numbers
    to paste. The pages are pasted with pdftk.paste_pages, this document's
    version is updated and saved, the per-page data is migrated with
    ocrmigrate.migrate_cutted_pages, the pasted pages are deleted from
    their source documents and finally this document's pages are
    recreated.

    Returns None. Nothing is changed when there are no pages to paste
    or when one of the source documents cannot be found.

    Raises Exception when pdftk.paste_pages reports an unchanged version.
    """
    pages_to_paste = sum(len(pages) for pages in doc_pages.values())
    if pages_to_paste == 0:
        logger.warning("No pages to paste. Exiting.")
        return

    version_before_paste = self.version

    # Resolve every source document up front; give up before touching
    # anything if one of them is missing.
    source_docs = []
    source_endpoints = []
    for doc_id, page_nums in doc_pages.items():
        try:
            source_doc = Document.objects.get(id=doc_id)
        except Document.DoesNotExist:
            logger.warning(f"While pasting, doc_id={doc_id} was not found")
            return

        source_docs.append({'doc': source_doc, 'page_nums': page_nums})
        source_endpoints.append({
            'doc_ep': source_doc.doc_ep,
            'page_nums': page_nums,
        })

    # pdftk.paste_pages returns the new document version
    new_version = pdftk.paste_pages(dest_doc_ep=self.doc_ep,
                                    src_doc_ep_list=source_endpoints,
                                    dest_doc_is_new=False,
                                    after_page_number=after,
                                    before_page_number=before)

    if new_version == self.version:
        raise Exception("Expecting version to be incremented")

    self.version = new_version
    self.save()

    # Migrate the document's own pages from the previous version as well
    # (this differs from pasting into newly created docs): they go first,
    # ahead of the pasted sources.
    own_pages = {
        'doc_ep': DocumentPath(user_id=self.user.id,
                               document_id=self.id,
                               version=version_before_paste,
                               file_name=self.file_name),
        'page_nums': list(range(1, self.page_count + 1)),
    }
    migration_sources = [own_pages] + source_endpoints

    ocrmigrate.migrate_cutted_pages(dest_ep=self.doc_ep,
                                    src_doc_ep_list=migration_sources)

    # Delete the pages of the source documents (the ones that were
    # cut and pasted into this doc).
    for source in source_docs:
        source['doc'].delete_pages(page_numbers=source['page_nums'])

    # must be at the end
    self.recreate_pages()