def test_download_hocr_which_does_not_exists(self):
    """
    HOCR might not be available. It is a normal case (page OCR task
    is still in the queue/progress). A missing HOCR file must result
    in an HTTP 404 return code.
    """
    document = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )
    # The document itself is available (server side runs
    # get_pagecount on it) ...
    source_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")
    default_storage.copy_doc(src=source_pdf, dst=document.path.url())

    # ... but no HOCR file was ever produced for it.
    url = reverse('core:hocr', args=(document.id, 1, 1))
    response = self.client.get(url)

    self.assertEqual(response.status_code, 404)
def test_preview(self):
    """
    POSTing to the preview endpoint must return HTTP 200 and create
    the corresponding page image file in storage.
    """
    document = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )
    source_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")
    default_storage.copy_doc(src=source_pdf, dst=document.path.url())

    url = reverse('core:preview', args=(document.id, 1, 1))
    response = self.client.post(url)
    self.assertEqual(response.status_code, 200)

    # The preview endpoint is expected to have rendered the image
    # for page 1 at step 1.
    page_path = PagePath(
        document_path=document.path,
        page_num=1,
        step=Step(1),
        page_count=3
    )
    image_abspath = default_storage.abspath(page_path.img_url())
    self.assertTrue(os.path.exists(image_abspath))
def import_file(self,
                file_title=None,
                inbox_title="Inbox",
                delete_after_import=True,
                skip_ocr=False):
    """
    Gets as input a path to a file on a local file system and:

        1. Creates a document instance
        2. Copies file to doc_instance.url()
        3. OCRs the doc

    Used with
        ./manage.py local_importer
        ./manage.py imap_importer
    command

    Returns the created document on success, or ``False`` when the
    page count of the input file cannot be determined (e.g. the file
    type is not supported).
    """
    logger.debug(f"Importing file {self.filepath}")

    # Default the document title to the file's base name.
    if file_title is None:
        file_title = os.path.basename(self.filepath)

    try:
        page_count = get_pagecount(self.filepath)
    except Exception:
        # Unsupported/corrupt file => bail out early; caller checks
        # the falsy return value.
        logger.error(f"Error while getting page count of {self.filepath}.")
        return False

    # Imported documents always land in the user's inbox folder
    # (created on demand).
    inbox, _ = Folder.objects.get_or_create(
        title=inbox_title,
        parent=None,
        user=self.user
    )
    doc = Document.create_document(
        user=self.user,
        title=file_title,
        size=os.path.getsize(self.filepath),
        lang=self.user_ocr_language,
        file_name=file_title,
        parent_id=inbox.id,
        page_count=page_count
    )
    logger.debug(f"Uploading file {self.filepath} to {doc.path.url()}")

    default_storage.copy_doc(
        src=self.filepath,
        dst=doc.path.url(),
    )

    if not skip_ocr:
        DocumentImporter.ocr_document(
            document=doc,
            page_count=page_count,
            lang=self.user_ocr_language,
        )

    if delete_after_import:
        # Usually we want to delete files when importing
        # them from local directory
        # When importing from Email attachment - deleting
        # files does not apply
        os.remove(self.filepath)

    logger.debug("Import complete.")

    return doc
def test_download(self):
    """Downloading an existing document must return HTTP 200."""
    document = Document.objects.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )
    pdf_source = os.path.join(BASE_DIR, "data", "berlin.pdf")
    default_storage.copy_doc(src=pdf_source, dst=document.path.url())

    url = reverse('core:node_download', args=(document.id,))
    response = self.client.post(url)

    self.assertEqual(response.status_code, 200)
def test_backup_single_document(self):
    """
    Backing up a user who owns exactly one document must yield a
    valid tar archive with two members: backup.json plus the
    document file itself.
    """
    pdf_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
    document = Document.create_document(
        user=self.testcase_user,
        title='berlin.pdf',
        size=os.path.getsize(pdf_path),
        lang='deu',
        file_name='berlin.pdf',
        parent_id=None,
        page_count=3
    )
    default_storage.copy_doc(src=pdf_path, dst=document.path.url())

    with io.BytesIO() as memoryfile:
        backup_documents(memoryfile, self.testcase_user)
        memoryfile.seek(0)

        self.assertTrue(
            _can_restore(memoryfile),
            'generated backup.tar is not valid'
        )
        memoryfile.seek(0)

        backup_file = tarfile.open(fileobj=memoryfile, mode='r')
        backup_json = backup_file.extractfile('backup.json')
        backup_info = json.loads(backup_json.read())

        self.assertIsNotNone(
            backup_info.get('documents'),
            'backup.json did not have a key "documents"'
        )
        self.assertIs(
            len(backup_info.get('documents')),
            1,
            'backup.json key documents had more or less than one entry'
        )
        self.assertIs(
            len(backup_file.getnames()),
            2,
            'backup.tar had more or less than 2 entries'
        )
        self.assertTrue(
            'berlin.pdf' in backup_file.getnames(),
            'berlin.pdf was not in the backup.tar'
        )
def test_download_hocr(self):
    """
    When the HOCR file for a page exists in storage, the hocr
    endpoint must return it with HTTP 200.
    """
    doc = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )
    # NOTE(review): unlike sibling tests, dst here is an absolute
    # filesystem path (abspath of the url) — confirm copy_doc
    # accepts both forms.
    default_storage.copy_doc(
        src=os.path.join(
            BASE_DIR, "data", "berlin.pdf"
        ),
        dst=default_storage.abspath(doc.path.url())
    )
    # build page url
    page_path = doc.page_paths[1]

    # Provide a pre-baked HOCR fixture for page 1.
    # just remember that at the end of test
    # copied file must be deleted. (1)
    default_storage.copy_doc(
        src=os.path.join(
            BASE_DIR, "data", "page-1.hocr"
        ),
        dst=default_storage.abspath(page_path.hocr_url())
    )
    ret = self.client.get(
        reverse('core:hocr', args=(doc.id, 1, 1))
    )

    self.assertEqual(
        ret.status_code,
        200
    )

    # Deleting file created at (1)
    os.remove(
        default_storage.abspath(page_path.hocr_url())
    )
def upload(request):
    """
    To understand returned value, have a look at
    papermerge.core.views.decorators.json_reponse decorator

    Accepts exactly one uploaded file per request, creates a
    Document for it, copies the file into storage and schedules one
    asynchronous OCR task per page.

    Returns either ``(message, status)`` on error or a dict with
    title/doc_id/action_url/preview_url on success (the decorator
    turns both into a JSON response).
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return "Missing input file", 400

    if len(files) > 1:
        msg = "More then one files per ajax? how come?"
        logger.warning(msg)

        return msg, 400

    f = files[0]

    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    parent_id = request.POST.get('parent', "-1")
    # "-1" means "no parent folder" (upload into root).
    # NOTE(review): this is a substring test, not equality — any
    # parent value containing "-1" is treated as root; confirm this
    # is intentional.
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')

    try:
        page_count = get_pagecount(f.temporary_file_path())
    except exceptions.FileTypeNotSupported:
        status = 400
        msg = _("File type not supported."
                " Only pdf, tiff, png, jpeg files are supported")
        return msg, status

    logger.debug("creating document {}".format(f.name))

    doc = Document.create_document(
        user=user,
        title=f.name,
        size=size,
        lang=lang,
        file_name=f.name,
        parent_id=parent_id,
        notes=notes,
        page_count=page_count
    )

    logger.debug("uploading to {}".format(doc.path.url()))

    default_storage.copy_doc(
        src=f.temporary_file_path(),
        dst=doc.path.url()
    )

    # One OCR task per page, executed asynchronously by the worker.
    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang
        })

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }

    return result
def test_user_download_document(self):
    """
    If user has read access to the document (even if he/she is not
    the owner of the document), then he/she must be able to
    download it.

    Scenario: admin user creates a document and assigns read only
    access for margaret (thus, root is the owner of the document).

    Expected: Margaret and root user must be able to download the
    document. Elizabet on the other hand - must not have access to
    the document (she was not assigned permissions for that)
    """
    pdf_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
    document = Document.create_document(
        user=self.root_user,
        title='berlin.pdf',
        size=os.path.getsize(pdf_path),
        lang='deu',
        file_name='berlin.pdf',
        page_count=3
    )
    # Place the file into media storage, as if it had been uploaded.
    default_storage.copy_doc(src=pdf_path, dst=document.path.url())

    # Grant margaret read-only access; root stays the owner.
    create_access(
        node=document,
        name=self.margaret_user.username,
        model_type=Access.MODEL_USER,
        access_type=Access.ALLOW,
        access_inherited=False,
        permissions={READ: True}
    )

    url = reverse('core:document_download', args=(document.id,))

    # Margaret has read access => download allowed.
    self.client.login(testcase_user=self.margaret_user)
    self.assertEqual(self.client.get(url).status_code, 200)
    self.client.logout()

    # Root is the owner => download allowed.
    self.client.login(testcase_user=self.root_user)
    self.assertEqual(self.client.get(url).status_code, 200)
    self.client.logout()

    # Elizabet was granted nothing => forbidden.
    self.client.login(testcase_user=self.elizabet_user)
    self.assertEqual(self.client.get(url).status_code, 403)
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (and their folder structure) from a backup
    tar archive into the database and storage.

    When ``user`` is None the archive is assumed to be a multi-user
    backup: users are recreated from backup.json and each archive
    path carries a leading ``<username>/`` component.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                user = User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser']
                )
                # in case --include-user-password switch was used
                # update user (raw digest of) password field
                password = backup_user.get('password')
                if password:
                    user.password = password
                    user.save()

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue

            logger.debug(f"Restoring file {restore_file}...")

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1])
            )
            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                # Per-user document list; subsequent paths are also
                # expected to carry a leading username component.
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # Look up the metadata entry matching this archive path.
            # NOTE(review): if no entry matches, document_info keeps
            # the last list element — confirm this is acceptable.
            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None

            # variables used only to shorten debug message
            _sp = splitted_path
            _rf = restore_file
            logger.debug(
                f"{_rf}: splitted_path={_sp} len(splitted_path)={len(_sp)}"
            )

            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user
                    ).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=_user
                        )
                        parent = new_folder
                    else:
                        parent = folder_object

            # Extract the document into a temp file, then copy it
            # into storage under the freshly created Document.
            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                logger.debug(f"Extracting {restore_file}...")

                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.objects.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                tag_attributes = document_info.get('tags', [])
                for attrs in tag_attributes:
                    attrs['user'] = _user
                    tag, created = Tag.objects.get_or_create(**attrs)
                    new_doc.tags.add(tag)

                default_storage.copy_doc(
                    src=temp_output.name,
                    dst=new_doc.path.url()
                )

            if not skip_ocr:
                # One asynchronous OCR task per restored page.
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(kwargs={
                        'user_id': _user.id,
                        'document_id': new_doc.id,
                        'file_name': new_doc.file_name,
                        'page_num': page_num,
                        'lang': document_info['lang']
                    })
def post(self, request):
    """
    Handle a single-file AJAX upload.

    Creates a Document for the uploaded file, copies the file into
    storage, schedules one asynchronous OCR task per page and
    returns a JSON payload (title, doc_id, action_url, preview_url)
    used by the client to render a new thumbnail.

    Returns HTTP 400 when no file or more than one file was posted.
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return HttpResponseBadRequest("Missing input file")

    if len(files) > 1:
        logger.warning("More then one files per ajax? how come?")
        # BUGFIX: HttpResponse's keyword is ``status``; the former
        # ``status_code=400`` raised TypeError instead of returning
        # a 400 response.
        return HttpResponse(
            json.dumps({}),
            content_type="application/json",
            status=400
        )

    f = files[0]

    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    parent_id = request.POST.get('parent', "-1")
    # "-1" means "no parent folder" (upload into root).
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    page_count = get_pagecount(f.temporary_file_path())

    logger.info("creating document {}".format(f.name))

    doc = Document.create_document(
        user=user,
        title=f.name,
        size=size,
        lang=lang,
        file_name=f.name,
        parent_id=parent_id,
        notes=notes,
        page_count=page_count
    )

    logger.debug("uploading to {}".format(doc.path.url()))

    default_storage.copy_doc(
        src=f.temporary_file_path(),
        dst=doc.path.url()
    )

    # One OCR task per page, executed asynchronously by the worker.
    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang
        })

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    action_url = reverse('boss:core_basetreenode_change', args=(doc.id,))
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': action_url,
        'preview_url': preview_url
    }
    logger.info("and response is!")

    return HttpResponse(
        json.dumps(result),
        content_type="application/json"
    )
def test_documents_retains_per_page_metadata_after_page_delete(self):
    """
    DocM is a document with 3 pages. DocM has two metadata fields
    associated X and Y. Field has a value x=10 and y=20.
    Second page of the document DocM is deleted.

    Expected: document values of metadata fields X and Y should be
    preserved: DocM.X is still 10 and DocM.Y is still 20.

    Important! In document browser and document viewer if user
    does not explicitly select a page, by default metadata
    associated with first page of respective document is returned.
    """
    document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
    docm = Document.objects.create_document(
        user=self.user,
        title='berlin.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        file_name='berlin.pdf',
        parent_id=None,
        page_count=3
    )
    default_storage.copy_doc(
        src=document_path,
        dst=docm.path.url(),
    )
    # Create an empty placeholder file for every page of the doc.
    for number in range(1, 4):
        page = docm.pages.get(number=number)
        # filesystem absolute path /home/eugen/x/y/
        fs_abs_path = default_storage.abspath(page.path.url())
        # filesystem absolute dir
        fs_abs_dir = os.path.dirname(fs_abs_path)
        Path(fs_abs_dir).mkdir(parents=True, exist_ok=True)
        # BUGFIX: close the file handle immediately instead of
        # leaking it (`open(...)` without close).
        with open(fs_abs_path, "w+"):
            pass

    # indeed, docm has 3 pages
    self.assertEqual(docm.pages.count(), 3)

    docm.kv.update([{
        'key': 'X',
        'kv_type': TEXT,
    }, {
        'key': 'Y',
        'kv_type': TEXT,
    }])

    # In document browser and document viewer
    # if user does not explicitly select a document, by default
    # metadata associated with first page of respective document
    # is returned
    page = docm.pages.get(number=1)
    page.kv['X'] = 10
    page.kv['Y'] = 20

    page.refresh_from_db()
    self.assertEqual(page.kv['X'], '10')
    self.assertEqual(page.kv['Y'], '20')

    # Even if user deletes second page, all data (incl. metadata)
    # associated remaining pages (first and last)
    # MUST be preserved!
    docm.delete_pages([2])

    page = docm.pages.get(number=1)
    self.assertEqual(page.kv['X'], '10')
    self.assertEqual(page.kv['Y'], '20')
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (and their folder structure) from a backup
    tar archive into the database and storage.

    When ``user`` is None the archive is assumed to be a multi-user
    backup: users are recreated from backup.json and each archive
    path carries a leading ``<username>/`` component.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser']
                )

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1])
            )
            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                # Per-user document list; subsequent paths are also
                # expected to carry a leading username component.
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # Look up the metadata entry matching this archive path.
            # NOTE(review): if no entry matches, document_info keeps
            # the last list element — confirm this is acceptable.
            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user
                    ).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=_user
                        )
                        parent = new_folder
                    else:
                        parent = folder_object

            # Extract the document into a temp file, then copy it
            # into storage under the freshly created Document.
            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                default_storage.copy_doc(
                    src=temp_output.name,
                    dst=new_doc.path.url()
                )

            if not skip_ocr:
                # One asynchronous OCR task per restored page.
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(kwargs={
                        'user_id': _user.id,
                        'document_id': new_doc.id,
                        'file_name': new_doc.file_name,
                        'page_num': page_num,
                        'lang': document_info['lang']
                    })
def move_tempfile(self, doc):
    """
    Copy this temporary file (``self.path``) into the storage
    location of *doc*. Always returns ``None``.
    """
    destination = doc.path().url()
    default_storage.copy_doc(src=self.path, dst=destination)
    return None
def test_backup_document_hierachy(self):
    """
    Backup must preserve the folder hierarchy: documents stored in
    nested folders appear under their folder paths inside the tar
    archive, while empty folders are not included.
    """
    folder_1 = Folder.objects.create(
        title='1', parent=None, user=self.testcase_user)
    folder_2 = Folder.objects.create(
        title='2', parent=folder_1, user=self.testcase_user)
    folder_3 = Folder.objects.create(
        title='3', parent=folder_1, user=self.testcase_user)
    # Empty folder: must NOT appear in the archive.
    Folder.objects.create(title='4', parent=None, user=self.testcase_user)

    document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")

    doc_1 = Document.create_document(
        user=self.testcase_user,
        title='berlin.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        file_name='berlin.pdf',
        parent_id=folder_2.id,
        page_count=3)
    default_storage.copy_doc(
        src=document_path,
        dst=doc_1.path.url(),
    )

    doc_2 = Document.create_document(
        user=self.testcase_user,
        title='berlin.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        file_name='berlin.pdf',
        parent_id=folder_3.id,
        page_count=3)
    default_storage.copy_doc(
        src=document_path,
        dst=doc_2.path.url(),
    )

    with io.BytesIO() as memoryfile:
        backup_documents(memoryfile, self.testcase_user)
        memoryfile.seek(0)

        self.assertTrue(
            _can_restore(memoryfile),
            'generated backup.tar is not valid')
        memoryfile.seek(0)

        backup_file = tarfile.open(fileobj=memoryfile, mode='r')
        backup_json = backup_file.extractfile('backup.json')
        backup_info = json.loads(backup_json.read())

        self.assertIsNotNone(
            backup_info.get('documents'),
            'backup.json did not have a key "documents"')
        self.assertIs(
            len(backup_info.get('documents')), 2,
            'backup.json key documents had more or less than two entry')
        # backup.json + the two documents = 3 archive members.
        # BUGFIX: the failure message previously said "2 entries"
        # although the assertion checks for 3.
        self.assertIs(
            len(backup_file.getnames()), 3,
            'backup.tar had more or less than 3 entries')
        self.assertTrue(
            f"1/2/berlin.pdf__{doc_1.id}" in backup_file.getnames(),
            'berlin.pdf was not in the backup.tar at folder 1/2/')
        self.assertTrue(
            f"1/3/berlin.pdf__{doc_2.id}" in backup_file.getnames(),
            'berlin.pdf was not in the backup.tar at folder 1/3/')
        self.assertFalse(
            '4' in backup_file.getnames(),
            'Folder 4 was in backup.tar but should have been ignored')
def test_basic_two_folders(self):
    """
    Creates following hierarchy:

        + Folder_1
        +   berlin_f_1.pdf
        + Folder_2
        +   berlin_f_2.pdf
        + berlin_root_1.pdf
        + berlin_root_2.pdf
    """
    f1 = Folder.objects.create(
        title='Folder_1',
        parent=None,
        user=self.testcase_user
    )
    f2 = Folder.objects.create(
        title='Folder_2',
        parent=None,
        user=self.testcase_user
    )
    document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")

    doc_in_root_1 = Document.objects.create_document(
        user=self.testcase_user,
        title='berlin_root_1.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        file_name='berlin_root_1.pdf',
        page_count=3
    )
    default_storage.copy_doc(
        src=document_path,
        dst=doc_in_root_1.path.url(),
    )
    doc_in_root_2 = Document.objects.create_document(
        user=self.testcase_user,
        title='berlin_root_2.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        file_name='berlin_root_2.pdf',
        page_count=3
    )
    default_storage.copy_doc(
        src=document_path,
        dst=doc_in_root_2.path.url(),
    )
    doc_in_f_1 = Document.objects.create_document(
        user=self.testcase_user,
        title='berlin_f_1.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        parent_id=f1.id,
        file_name='berlin_f_1.pdf',
        page_count=3
    )
    default_storage.copy_doc(
        src=document_path,
        dst=doc_in_f_1.path.url(),
    )
    doc_in_f_2 = Document.objects.create_document(
        user=self.testcase_user,
        title='berlin_f_2.pdf',
        size=os.path.getsize(document_path),
        lang='deu',
        parent_id=f2.id,
        file_name='berlin_f_2.pdf',
        page_count=3
    )
    default_storage.copy_doc(
        src=document_path,
        dst=doc_in_f_2.path.url(),
    )
    """
    User selected two documents in the root dir berlin_root_1.pdf,
    and berlin_root_1.pdf plus Folder_1 and Folder_2.
    Selection is marked with square brackets [...]

        + [Folder_1]
        +   berlin_f_1.pdf
        + [Folder_2]
        +   berlin_f_2.pdf
        + [berlin_root_1.pdf]
        + [berlin_root_2.pdf]
    """
    selected_ids = [
        doc_in_root_1.id,
        doc_in_root_2.id,
        f1.id,
        f2.id
    ]
    with io.BytesIO() as memoryfile:
        build_tar_archive(  # <-- THIS IS WHAT WE ARE TESTING
            fileobj=memoryfile,
            node_ids=selected_ids
        )
        memoryfile.seek(0)

        archive_file = tarfile.open(fileobj=memoryfile, mode='r')

        # Root-level selections are stored at the archive root ...
        berlin_root_1_handle = archive_file.extractfile(
            'berlin_root_1.pdf'
        )
        data = berlin_root_1_handle.read()
        self.assertTrue(len(data) > 0)

        # ... while selected folders keep their internal structure.
        berlin_f_1_handle = archive_file.extractfile(
            'Folder_1/berlin_f_1.pdf'
        )
        data = berlin_f_1_handle.read()
        self.assertTrue(len(data) > 0)

        berlin_f_2_handle = archive_file.extractfile(
            'Folder_2/berlin_f_2.pdf'
        )
        data = berlin_f_2_handle.read()
        self.assertTrue(len(data) > 0)

        with self.assertRaises(KeyError):
            # there is no file Accounting/Expenses/Paris.pdf
            # in archive, thus, KeyError exception is expected
            archive_file.extractfile('Accounting/Expenses/Paris.pdf')
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):
    """
    Restore documents from a backup tar archive for the given
    username, recreating the folder structure found in the archive
    paths. Documents whose title already exists under the same
    parent are skipped.
    """
    restore_file.seek(0)
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue

            # Look up the metadata entry matching this archive path.
            # NOTE(review): if no entry matches, document_info keeps
            # the last list element — confirm this is acceptable.
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == restore_file:
                    break

            splitted_path = PurePath(restore_file).parts
            parent = None
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(
                        title=folder
                    ).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=user
                        )
                        parent = new_folder
                    else:
                        parent = folder_object

            document_object = Document.objects.filter(
                title=splitted_path[-1]
            ).filter(parent=parent).first()

            if document_object is not None:
                logger.error(
                    "Document %s already exists, skipping",
                    restore_file
                )
            else:
                # Extract the document into a temp file, then copy
                # it into storage under a freshly created Document.
                with NamedTemporaryFile("w+b") as temp_output:
                    temp_output.write(
                        restore_archive.extractfile(restore_file).read()
                    )
                    temp_output.seek(0)
                    size = os.path.getsize(temp_output.name)

                    page_count = get_pagecount(temp_output.name)

                    if parent:
                        parent_id = parent.id
                    else:
                        parent_id = None

                    new_doc = Document.create_document(
                        user=user,
                        title=splitted_path[-1],
                        size=size,
                        lang=document_info['lang'],
                        file_name=splitted_path[-1],
                        parent_id=parent_id,
                        notes="",
                        page_count=page_count
                    )
                    default_storage.copy_doc(
                        src=temp_output.name,
                        dst=new_doc.path.url()
                    )

                    # One asynchronous OCR task per restored page.
                    for page_num in range(1, page_count + 1):
                        if not skip_ocr:
                            ocr_page.apply_async(kwargs={
                                'user_id': user.id,
                                'document_id': new_doc.id,
                                'file_name': splitted_path[-1],
                                'page_num': page_num,
                                'lang': document_info['lang']
                            })