Ejemplo n.º 1
0
 def test_preview(self):
     """
     POST to the 'core:preview' endpoint for a freshly created document
     and verify that the response is HTTP 200 and that the page image
     file exists in the default storage afterwards.
     """
     doc = Document.create_document(
         title="berlin.pdf", user=self.testcase_user, lang="ENG",
         file_name="berlin.pdf", size=1222, page_count=3
     )
     # Put the sample PDF where the document expects its file to be.
     sample_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")
     copy2doc_url(src_file_path=sample_pdf, doc_url=doc.path.url())

     preview_url = reverse('core:preview', args=(doc.id, 1, 1))
     response = self.client.post(preview_url)
     self.assertEqual(response.status_code, 200)

     # Build the path of the page image which the preview
     # request is expected to have produced.
     page_path = PagePath(
         document_path=doc.path, page_num=1, step=Step(1), page_count=3
     )
     img_abspath = default_storage.abspath(page_path.img_url())
     self.assertTrue(os.path.exists(img_abspath))
Ejemplo n.º 2
0
    def test_download_hocr_which_does_not_exists(self):
        """
        Requesting HOCR of a page for which no HOCR file exists.

        A missing HOCR file is a normal situation (the page OCR task
        may still be queued or in progress), so the view is expected
        to answer with HTTP 404.
        """
        doc = Document.create_document(
            title="berlin.pdf", user=self.testcase_user, lang="ENG",
            file_name="berlin.pdf", size=1222, page_count=3
        )
        # The document file itself is present
        # (for get_pagecount on server side) ...
        sample_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")
        copy2doc_url(src_file_path=sample_pdf, doc_url=doc.path.url())

        # ... but no HOCR file was ever copied next to it.
        hocr_url = reverse('core:hocr', args=(doc.id, 1, 1))
        response = self.client.get(hocr_url)
        self.assertEqual(response.status_code, 404)
Ejemplo n.º 3
0
 def test_preview(self):
     """
     POST to the 'core:preview' endpoint for a freshly created document
     and verify that the response is HTTP 200 and that the page image
     is reported as existing afterwards.
     """
     doc = Document.create_document(
         title="andromeda.pdf",
         user=self.testcase_user,
         lang="ENG",
         file_name="andromeda.pdf",
         size=1222,
         page_count=3
     )
     # Put the sample PDF at the document's endpoint url.
     copy2doc_url(
         src_file_path=os.path.join(
             BASE_DIR, "data", "andromeda.pdf"
         ),
         doc_url=doc.doc_ep.url()
     )
     ret = self.client.post(
         reverse('core:preview', args=(doc.id, 1, 1))
     )
     self.assertEqual(
         ret.status_code,
         200
     )
     page_ep = PageEp(
         document_ep=doc.doc_ep,
         page_num=1,
         step=Step(1),
         page_count=3
     )
     # NOTE(review): img_exists() presumably already returns a boolean
     # (confirm against PageEp). Feeding that boolean to os.path.exists()
     # would treat True/False as file descriptors 1/0 and make the
     # assertion pass regardless of whether the image was created,
     # so the result is asserted directly.
     self.assertTrue(page_ep.img_exists())
Ejemplo n.º 4
0
 def test_download(self):
     """
     POST to the 'core:document_download' endpoint of a document whose
     file is in place and expect an HTTP 200 response.
     """
     doc = Document.create_document(
         title="andromeda.pdf",
         user=self.testcase_user,
         lang="ENG",
         file_name="andromeda.pdf",
         size=1222,
         page_count=3
     )
     # Make the document file available at the document's endpoint url.
     sample_pdf = os.path.join(BASE_DIR, "data", "andromeda.pdf")
     copy2doc_url(src_file_path=sample_pdf, doc_url=doc.doc_ep.url())

     download_url = reverse('core:document_download', args=(doc.id, ))
     response = self.client.post(download_url)
     self.assertEqual(response.status_code, 200)
Ejemplo n.º 5
0
    def test_download_hocr(self):
        """
        Requesting HOCR of a page whose HOCR file is available.

        Both the document file and a HOCR file are copied into the
        storage; the 'core:hocr' endpoint is then expected to answer
        with HTTP 200. The copied HOCR file is removed at the end of
        the test, even when the request or the assertion fails.
        """
        doc = Document.create_document(
            title="berlin.pdf",
            user=self.testcase_user,
            lang="ENG",
            file_name="berlin.pdf",
            size=1222,
            page_count=3
        )

        copy2doc_url(
            src_file_path=os.path.join(
                BASE_DIR, "data", "berlin.pdf"
            ),
            doc_url=default_storage.abspath(doc.path.url())
        )
        # build page url
        page_path = doc.page_paths[1]
        hocr_abspath = default_storage.abspath(page_path.hocr_url())

        # The file copied here must be deleted at the end of
        # the test; see the ``finally`` clause below.
        copy2doc_url(
            src_file_path=os.path.join(
                BASE_DIR, "data", "page-1.hocr"
            ),
            doc_url=hocr_abspath
        )
        try:
            ret = self.client.get(
                reverse('core:hocr', args=(doc.id, 1, 1))
            )
            self.assertEqual(
                ret.status_code,
                200
            )
        finally:
            # Always delete the copied HOCR file, so that a failing
            # assertion does not leave it behind for other tests.
            os.remove(hocr_abspath)
Ejemplo n.º 6
0
    def import_file(filepath,
                    username=None,
                    file_title=None,
                    inbox_title="Inbox",
                    delete_after_import=False,
                    start_ocr_async=True,
                    upload=True):
        """
        Gets as input a path to a file on a local file system and:
            1. creates a document instance (if there is a available space).
            2. Copies file to doc_instance.url()
            3. (optionally) uploads the document to S3 storage.
            4. (optionally) starts ocr_async task.

        Is used on customers instance by:
            * import_file command - to import files from SFTP directory
            * import_attachment command - to import attachments from mailbox

        Arguments:
            filepath: path of the local file to import.
            username: username of the document owner; when None the
                user returned by get_root_user() is used.
            file_title: title (and file name) of the new document; when
                None it is derived with get_file_title(filepath).
            inbox_title: title of the top level folder (fetched or
                created) in which the document is placed.
            delete_after_import: if True, filepath is removed from the
                local file system after the import.
            start_ocr_async: if True and settings.OCR is set, the OCR
                task is started for the new document.
            upload: if True and settings.S3 is set, the document is
                uploaded to S3.

        Returns True if the file was imported; False if the user has no
        storage left or if the page count of the file could not be read.
        """
        logger.debug(f"Importing file {filepath}")

        if username is None:
            user = get_root_user()
        else:
            user = User.objects.get(username=username)

        if file_title is None:
            file_title = get_file_title(filepath)

        if not is_storage_left(filepath, user=user):
            # The username is interpolated here; without the braces the
            # literal text "user.username" would end up in the log.
            logger.error(f"{user.username} reached his disk quota")
            return False

        lang = Document.get_default_language()
        # get_pagecount() might raise an exception in case
        # file is either wrong (not a PDF) or not yet
        # completed to upload
        try:
            page_count = get_pagecount(filepath)
        except Exception:
            # which means that document is not yet fully
            # uploaded by SFTP client.
            logger.error(f"File {filepath} not yet ready for importing.")
            return False

        inbox, _ = Folder.objects.get_or_create(title=inbox_title,
                                                parent=None,
                                                user=user)
        doc = Document.create_document(user=user,
                                       title=file_title,
                                       size=get_file_size(filepath),
                                       lang=lang,
                                       file_name=file_title,
                                       parent_id=inbox.id,
                                       page_count=page_count)
        logger.debug(f"Uploading file {filepath} to {doc.doc_ep.url()}")
        # Import file is executed as root (import-file.service)
        # (because import-file need to access/delete sftp files, folder
        # as of another system user)
        # Thus, after copying file into (newly created) folders,
        # it need to change permissions (of newly created files and folders)
        # to the app_user/app_group.
        copy2doc_url(src_file_path=filepath,
                     doc_url=doc.doc_ep.url(),
                     user=settings.APP_USER,
                     group=settings.APP_GROUP)

        if upload and settings.S3:
            upload_document_to_s3(doc.doc_ep)

        if start_ocr_async and settings.OCR:
            Document.ocr_async(document=doc,
                               page_count=page_count,
                               lang=lang,
                               s3_enabled=settings.S3)

        if delete_after_import:
            os.remove(filepath)

        return True
Ejemplo n.º 7
0
    def post(self, request):
        """
        Handle the upload of exactly one file (form field 'file').

        Creates a document for the uploaded file, copies the file to
        the document's endpoint url and, depending on settings.S3 and
        settings.OCR, uploads it to S3 and starts the OCR task.

        Returns a JSON HttpResponse:
            * status 400 if no file or more than one file was sent,
              or if there is no storage left;
            * otherwise an object with the keys 'title', 'doc_id',
              'action_url' and 'preview_url'.
        """
        files = request.FILES.getlist('file')
        if not files:
            logger.warning("POST request.FILES is empty. Forgot adding file?")
            # Without an uploaded file there is nothing to process;
            # answer with a client error instead of failing on files[0].
            return HttpResponse(
                json.dumps({'error': "No file was uploaded."}),
                status=400,
                content_type="application/json")

        if len(files) > 1:
            logger.warning("More then one files per ajax? how come?")
            # HttpResponse expects the keyword ``status``
            # (``status_code`` is not an accepted argument).
            return HttpResponse(json.dumps({}),
                                content_type="application/json",
                                status=400)

        f = files[0]

        logger.debug("upload for f=%s user=%s", f, request.user)

        if not is_storage_left(f.temporary_file_path()):
            logger.warning("Storage is full for user=%s.", request.user)
            msg = "Cannot upload file {}. Storage is full.".format(f.name)

            return HttpResponse(json.dumps({'error': msg}),
                                status=400,
                                content_type="application/json")

        user = request.user
        size = os.path.getsize(f.temporary_file_path())
        # A parent value containing "-1" (also the default) means that
        # the document is created without a parent.
        parent_id = request.POST.get('parent', "-1")
        if parent_id and "-1" in parent_id:
            parent_id = None

        lang = request.POST.get('language')
        notes = request.POST.get('notes')
        page_count = get_pagecount(f.temporary_file_path())
        logger.info("creating document {}".format(f.name))

        doc = Document.create_document(user=user,
                                       title=f.name,
                                       size=size,
                                       lang=lang,
                                       file_name=f.name,
                                       parent_id=parent_id,
                                       notes=notes,
                                       page_count=page_count)
        logger.debug("uploading to {}".format(doc.doc_ep.url()))

        copy2doc_url(src_file_path=f.temporary_file_path(),
                     doc_url=doc.doc_ep.url())

        if settings.S3:
            upload_document_to_s3(doc.doc_ep)

        if settings.OCR:
            Document.ocr_async(document=doc, page_count=page_count, lang=lang)

        # upload only one file at time.
        # after each upload return a json object with
        # following fields:
        #
        # - title
        # - preview_url
        # - doc_id
        # - action_url  -> needed for renaming/deleting selected item
        #
        # with that info a new thumbnail will be created.

        action_url = reverse('boss:core_basetreenode_change', args=(doc.id, ))

        preview_url = reverse('core:preview', args=(doc.id, 200, 1))

        result = {
            'title': doc.title,
            'doc_id': doc.id,
            'action_url': action_url,
            'preview_url': preview_url
        }
        logger.info("and response is!")
        return HttpResponse(json.dumps(result),
                            content_type="application/json")