Example #1
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    Run OCR on one page of a stored document.

    The document is addressed through a ``DocumentPath`` built from
    ``user_id``, ``document_id`` and ``file_name``; the work is then
    dispatched on the detected mime type:

        * PDF   -> ``ocr_page_pdf``
        * image -> ``ocr_page_image``
        * TIFF  -> converted with ``convert_tiff2pdf``, then ``ocr_page_pdf``

    ``lang`` is lower-cased before it is handed to the OCR helpers.

    Always returns True; an unrecognised file type is only logged
    as an error.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))

    logger.debug(f"Mime Type = {mime_type}")

    # page_type is reported in the timing log below, so every handled
    # branch records which kind of input it processed.
    if mime_type.is_pdf():
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'pdf'
    elif mime_type.is_image():  # jpeg, jpeg or png
        ocr_page_image(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'image'
    elif mime_type.is_tiff():
        # new filename is a pdf file
        logger.debug("TIFF type detected")
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # now .pdf
        doc_path.file_name = new_filename
        # and continue as usual
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'tiff'
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")

    return True
Example #2
0
def document_download(request, id):
    """
    Send a document's file to its owner as an attachment.

    Raises ``Http404`` when no document with the given ``id`` exists.
    When the requesting user is not the document's owner (compared by
    username), or when the stored file cannot be opened, the user is
    redirected to ``'browse'``.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if doc.user.username != request.user.username:
        return redirect('browse')

    try:
        file_handle = open(default_storage.abspath(doc.path.url()), "rb")
    except OSError:
        logger.error("Cannot open local version of %s" % doc.path.url())
        return redirect('browse')

    # The context manager guarantees the handle is closed even if reading
    # the file or building the response raises.
    with file_handle:
        resp = HttpResponse(file_handle.read(), content_type="application/pdf")

    disposition = "attachment; filename=%s" % doc.title
    resp['Content-Disposition'] = disposition
    return resp
Example #3
0
def preview(request, id, step=None, page="1"):
    """
    Return the preview image of one document page as an ``image/jpeg``
    response.

    Raises ``Http404`` when the document does not exist. Users without
    read permission on the document are redirected to ``'core:index'``.
    If the image file is not on disk yet, ``extract_img`` is called
    before the file is read. Errors raised while opening or reading the
    image propagate to the caller.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    page_path = doc.get_page_path(page_num=page, step=Step(step))
    image_file = default_storage.abspath(page_path.img_url())

    image_missing = not os.path.exists(image_file)
    if image_missing:
        logger.debug(
            f"Preview image {image_file} does not exists. Generating...")
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    with open(image_file, "rb") as image:
        payload = image.read()

    return HttpResponse(payload, content_type="image/jpeg")
Example #4
0
def document_download(request, id):
    """
    Any user with read permission on the document must be
    able to download the document.

    Raises ``Http404`` when no document with the given ``id`` exists.
    Returns ``HttpResponseForbidden`` when the user lacks read permission
    and redirects to ``'admin:browse'`` when the stored file cannot be
    opened.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    try:
        file_handle = open(default_storage.abspath(
            doc.path.url()
        ), "rb")
    except OSError:
        logger.error(
            "Cannot open local version of %s" % doc.path.url()
        )
        return redirect('admin:browse')

    # The context manager guarantees the handle is closed even if reading
    # the file or building the response raises.
    with file_handle:
        resp = HttpResponse(
            file_handle.read(),
            content_type="application/pdf"
        )

    disposition = "attachment; filename=%s" % doc.title
    resp['Content-Disposition'] = disposition
    return resp
Example #5
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Generate preview images, text and hOCR data for one page of a PDF.

    doc_path is an mglib.path.DocumentPath instance

    Returns the ``PagePath`` of the processed page, or None when
    ``page_num`` is greater than the document's page count (previously
    that case failed with ``UnboundLocalError`` on the return statement).
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        logger.warning(
            f"page_num={page_num} exceeds page_count={page_count}."
            f" Nothing to OCR.")
        return None

    # first quickly generate preview images
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # A fresh PagePath so that text extraction runs with Step(1) again,
    # not with whatever step the loop above left behind.
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
Example #6
0
 def test_preview(self):
     """
     Posting to ``core:preview`` answers 200 and leaves the page image
     of the first page (step 1) on disk.
     """
     sample_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")

     document = Document.create_document(
         title="berlin.pdf",
         user=self.testcase_user,
         lang="ENG",
         file_name="berlin.pdf",
         size=1222,
         page_count=3
     )
     default_storage.copy_doc(src=sample_pdf, dst=document.path.url())

     preview_url = reverse('core:preview', args=(document.id, 1, 1))
     response = self.client.post(preview_url)
     self.assertEqual(response.status_code, 200)

     first_page = PagePath(
         document_path=document.path,
         page_num=1,
         step=Step(1),
         page_count=3
     )
     image_file = default_storage.abspath(first_page.img_url())
     self.assertTrue(os.path.exists(image_file))
Example #7
0
def node_download(request, id):
    """
    Any user with read permission on the node must be
    able to download it.

    Node is either a document or a folder.

    A document is sent as an ``application/pdf`` attachment; the
    optional ``version`` query parameter is passed to the document
    path's ``url()``. For a folder, its direct children are packed with
    ``build_tar_archive`` and sent as an ``application/x-tar``
    attachment.

    Raises ``Http404`` when the node does not exist. Returns
    ``HttpResponseForbidden`` without read permission and redirects to
    ``'admin:browse'`` when a document's file cannot be opened.
    """
    version = request.GET.get('version', None)

    try:
        node = BaseTreeNode.objects.get(id=id)
    except BaseTreeNode.DoesNotExist:
        raise Http404("Node does not exists")

    if not request.user.has_perm(Access.PERM_READ, node):
        return HttpResponseForbidden()

    if node.is_document():
        # ``path`` is called as a method on nodes; the error handler used
        # to access it as an attribute (``node.path.url()``) and would have
        # failed itself instead of logging and redirecting.
        doc_url = node.path().url(version=version)
        try:
            file_handle = open(default_storage.abspath(doc_url), "rb")
        except OSError:
            logger.error(
                "Cannot open local version of %s" % doc_url
            )
            return redirect('admin:browse')

        # The context manager closes the handle even if reading fails.
        with file_handle:
            resp = HttpResponse(
                file_handle.read(),
                content_type="application/pdf"
            )
        disposition = "attachment; filename=%s" % node.title
        resp['Content-Disposition'] = disposition

        return resp

    # node is a folder
    with NamedTemporaryFile(prefix="download_") as fileobj:
        # collect into an archive all direct children of
        # selected folder
        node_ids = [_node.id for _node in node.get_children()]
        build_tar_archive(
            fileobj=fileobj,
            node_ids=node_ids
        )
        # reset fileobj to initial position
        fileobj.seek(0)
        data = fileobj.read()

    resp = HttpResponse(
        data,
        content_type="application/x-tar"
    )
    disposition = f"attachment; filename={node.title}.tar"
    resp['Content-Disposition'] = disposition

    return resp
Example #8
0
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    Generate preview images, text and hOCR data for one page of a PDF
    and announce each stage through the ``notify_*`` helpers.

    doc_path is an mglib.path.DocumentPath instance

    ``kwargs`` may carry ``file_name``; when it is absent or empty,
    ``doc_path.file_name`` is used. The remaining keyword arguments are
    forwarded unchanged to the ``notify_*`` helpers.

    On success returns ``mglib.path.PagePath`` instance.
    Returns None when ``page_num`` is greater than the document's page
    count; in that case nothing is extracted and no notification is sent
    (previously this failed with ``UnboundLocalError``).
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)

    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        logger.warning(
            f"page_num={page_num} exceeds page_count={page_count}."
            f" Nothing to OCR.")
        return None

    # first quickly generate preview images
    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    # NOTE(review): this call passes doc_path.file_name while the other
    # notifications pass the (possibly overridden) file_name - confirm
    # whether that difference is intended.
    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        file_name=doc_path.file_name,
                        **kwargs)

    # A fresh PagePath so that text extraction runs with Step(1) again,
    # not with whatever step the preview loop left behind.
    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(page_path,
                     page_num=page_num,
                     lang=lang,
                     file_name=file_name,
                     **kwargs)

    for step in Steps():
        page_path.step = step
        if not step.is_thumbnail:
            extract_hocr(page_path,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs)

    return page_path
Example #9
0
def notify_txt_ready(page_path, **kwargs):
    """
    Notifies interested parties that .txt file is available.

    Notifies via django signals. Among others will send
    .txt content itself. Input arguments:

    ``page_path``: mglib.PagePath instance of current page
    Following keys are expected to be available in kwargs dictionary
    (a missing key falls back to None, ``page_num`` falls back to 1):

        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num``
        * ``lang``
        * ``namespace``

    Always returns None.

    Sent signals: ``post_page_txt``. Nothing is sent (only a warning is
    logged) when ``page_path`` is empty or the .txt file does not exist.

    Following arguments are passed to the signal:
        * ``sender`` = ``signals.WORKER``
        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num``
        * ``lang``
        * ``namespace`` = may be empty. Used to distinguish among
            different tenants in multi-tenant deployments.
        * ``text`` = extracted .txt data (text format)
    """

    user_id = kwargs.get('user_id', None)
    document_id = kwargs.get('document_id', None)
    page_num = kwargs.get('page_num', 1)
    file_name = kwargs.get('file_name', None)
    # ``lang`` is documented as a signal argument but used to be dropped.
    lang = kwargs.get('lang', None)
    namespace = kwargs.get('namespace', None)

    if not page_path:
        logger.warning("OCR method returned empty page path. "
                       "Page indexing was skipped.")
        return

    abs_path_txt = default_storage.abspath(page_path.txt_url())

    if not os.path.exists(abs_path_txt):
        logger.warning(f"Page txt path {abs_path_txt} does not exist. "
                       f"Page indexing was skipped.")
        return

    with open(abs_path_txt) as f:
        text = f.read()

    # The file is closed before receivers run, so the handle is not held
    # open for the duration of the signal dispatch.
    signals.post_page_txt.send(sender=signals.WORKER,
                               user_id=user_id,
                               document_id=document_id,
                               file_name=file_name,
                               page_num=page_num,
                               lang=lang,
                               namespace=namespace,
                               text=text)
0
def hocr(request, id, step=None, page="1"):
    """
    Return the hOCR words and metadata of one document page as JSON.

    The response body is ``{"hocr": ..., "hocr_meta": ...}`` built from
    ``Hocr.good_json_words()`` and ``Hocr.get_meta()``.

    Raises ``Http404`` when the document does not exist, when the
    document file or the page's hOCR file is not on disk, or when
    ``page`` is not a valid page number. Returns
    ``HttpResponseForbidden`` when the user lacks read permission.

    ``step`` is only used in the debug log message.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    # document absolute path
    doc_abs_path = default_storage.abspath(doc.path.url())
    if not os.path.exists(doc_abs_path):
        raise Http404("HOCR data not yet ready.")

    # ``page`` defaults to the string "1"; comparing a str with the integer
    # page count raises TypeError, so normalise it to int first.
    try:
        page_num = int(page)
    except (TypeError, ValueError):
        raise Http404("Page does not exists")

    page_count = get_pagecount(doc_abs_path)
    # NOTE(review): pages are assumed to be numbered from 1 (page 1 is
    # looked up as doc.page_paths[1]), hence 0 is rejected as well -
    # confirm against Document.page_paths.
    if page_num < 1 or page_num > page_count:
        raise Http404("Page does not exists")

    page_path = doc.page_paths[page_num]
    hocr_abs_path = default_storage.abspath(page_path.hocr_url())

    logger.debug(f"Extract words from {hocr_abs_path}")

    if not os.path.exists(hocr_abs_path):
        raise Http404("HOCR data not yet ready.")

    # At this point local HOCR data should be available.
    hocr_data = Hocr(
        hocr_file_path=hocr_abs_path
    )

    return HttpResponse(
        json.dumps({
            'hocr': hocr_data.good_json_words(),
            'hocr_meta': hocr_data.get_meta()
        }),
        content_type="application/json",
    )
Example #11
0
 def recreate_pages(self):
     """
     Drop all page models of this document and build them again.

     ``page_count`` is refreshed from the stored file via
     ``get_pagecount`` and saved before ``create_pages`` is called.
     """
     self.pages.all().delete()

     document_file = default_storage.abspath(self.path.url())
     self.page_count = get_pagecount(document_file)
     self.save()

     self.create_pages()
Example #12
0
    def test_download_hocr(self):
        """
        ``core:hocr`` answers 200 once both the document file and the
        page's .hocr file are present in storage.
        """
        doc = Document.create_document(
            title="berlin.pdf",
            user=self.testcase_user,
            lang="ENG",
            file_name="berlin.pdf",
            size=1222,
            page_count=3
        )

        default_storage.copy_doc(
            src=os.path.join(
                BASE_DIR, "data", "berlin.pdf"
            ),
            dst=default_storage.abspath(doc.path.url())
        )
        # build page url
        page_path = doc.page_paths[1]
        hocr_abs_path = default_storage.abspath(page_path.hocr_url())

        default_storage.copy_doc(
            src=os.path.join(
                BASE_DIR, "data", "page-1.hocr"
            ),
            dst=hocr_abs_path
        )
        try:
            ret = self.client.get(
                reverse('core:hocr', args=(doc.id, 1, 1))
            )
            self.assertEqual(
                ret.status_code,
                200
            )
        finally:
            # Remove the copied .hocr file even when the request or the
            # assertion fails, so a failing run leaves nothing behind.
            os.remove(hocr_abs_path)
Example #13
0
    def update_text_field(self):
        """Update text field from associated .txt file.

        When the .txt file exists, its content is stored in ``self.text``,
        the instance is saved and the content is returned.
        If file was not found - will return an empty string and leave
        the instance untouched.
        """
        url = default_storage.abspath(self.txt_url)

        if not os.path.exists(url):
            logger.debug(f"Missing page txt {url}.")
            # An empty string, as documented; a bare ``return`` gave None.
            return ''

        with open(url) as file_handle:
            self.text = file_handle.read()

        self.save()
        logger.debug(f"text saved. len(page.text)=={len(self.text)}")

        return self.text
Example #14
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Extract the image, text and hOCR data for one page of a PDF.

    doc_path is an mglib.path.DocumentPath instance

    Returns the ``PagePath`` of the processed page, or None when
    ``page_num`` is greater than the document's page count (previously
    that case failed with ``UnboundLocalError`` on the return statement).
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    if page_num > page_count:
        return None

    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
Example #15
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    Run OCR on one page of a stored document; only PDF is handled.

    ``lang`` is lower-cased before use. Any other mime type is logged
    as an error. Always returns True.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    document_file = default_storage.abspath(doc_path.url())
    mime_type = mime.Mime(document_file)

    if not mime_type.is_pdf():
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
    page_type = 'pdf'

    finished = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={finished-started:.2f}")

    return True
Example #16
0
 def absfilepath(self):
     """
     Return ``default_storage.abspath`` of this object's ``path.url()``.
     """
     relative_url = self.path.url()
     return default_storage.abspath(relative_url)
Example #17
0
    def paste_pages(
        user,
        parent_id,
        doc_pages,
        dst_document=None,
        after=False,
        before=False,
    ):
        """
        Move pages from one or more source documents into a destination
        document.

        ``doc_pages`` is a dictionary of format::

            {
               doc_id_1: [page_num_1a, page_num_2a, ...],
               doc_id_2: [page_num_1b, page_num_2b, ...],
               doc_id_3: [page_num_1c, page_num_2c, ...]
            }

        Steps:

        1. Look up every source document named in ``doc_pages``.
        2. If ``dst_document`` is not given, create a new document
           "pasted.pdf" under ``parent_id`` with as many pages as are
           being pasted.
        3. Let ``default_storage.paste_pages`` build the new version of
           the destination; ``after`` and ``before`` are passed to it as
           ``after_page_number`` / ``before_page_number``.
        4. Store the new version, recreate the destination's page models
           and delete the pasted pages from the source documents.

        Returns the destination document. Returns None, after logging a
        warning, when there are no pages to paste or when a source
        document does not exist.

        Raises ``Exception`` when the storage does not report a new
        version for the destination document.
        """
        new_page_count = sum(len(pages) for pages in doc_pages.values())

        if new_page_count == 0:
            logger.warning("No pages to paste. Exiting.")
            return

        # Resolve all source documents BEFORE creating the destination.
        # Otherwise a missing source would make this method bail out
        # after an empty "pasted.pdf" document was already created.
        doc_list = []
        data_list = []
        for doc_id, page_nums in doc_pages.items():
            try:
                doc = Document.objects.get(id=doc_id)
            except Document.DoesNotExist:
                logger.warning(f"While pasting, doc_id={doc_id} was not found")
                return

            doc_path = doc.path
            # abspath receives the url of the path here, consistent with
            # how it is called for document paths elsewhere.
            src = default_storage.abspath(doc_path.url())

            doc_list.append({'doc': doc, 'page_nums': page_nums})
            data_list.append({
                'src': src,
                'doc_path': doc_path,
                'page_nums': page_nums
            })

        # Create new document (with new pages) unless a destination
        # was supplied by the caller.
        dst_doc_is_new = False
        if not dst_document:
            dst_document = Document.objects.create_document(
                user=user,
                parent_id=parent_id,
                lang=user.preferences['ocr__OCR_Language'],
                title="pasted.pdf",
                size=0,  # updated later, after pdftk will create new doc
                file_name="pasted.pdf",
                page_count=new_page_count)
            dst_doc_is_new = True

        # returns new document version
        new_version = default_storage.paste_pages(
            dest_doc_path=dst_document.path,
            data_list=data_list,
            dest_doc_is_new=dst_doc_is_new,
            after_page_number=after,
            before_page_number=before)

        if new_version == dst_document.version:
            raise Exception("Expecting version to be incremented")

        dst_document.version = new_version
        dst_document.save()
        # update pages model
        dst_document.recreate_pages()

        # delete pages of source documents (which were
        # cut and pasted into the destination doc)
        for item in doc_list:
            item['doc'].delete_pages(page_numbers=item['page_nums'])

        return dst_document
Example #18
0
def notify_hocr_ready(page_path, **kwargs):
    """
    Notifies interested parties that .hocr file is available.

    Notifies via django signals. Among others will send
    hocr content itself. Input arguments:

    ``page_path``: mglib.PagePath instance of current page
    Following keys are expected to be available in kwargs dictionary
    (a missing key falls back to None, ``page_num`` and ``step`` fall
    back to 1):

        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num``
        * ``lang``
        * ``namespace``
        * ``step``

    Always returns None.

    Sent signals: ``post_page_hocr``. Nothing is sent (only a warning is
    logged) when ``page_path`` is empty or the .hocr file does not exist.

    Following arguments are passed to the signal:
        * ``sender`` = ``signals.WORKER``
        * ``user_id``
        * ``document_id``
        * ``file_name``
        * ``page_num``
        * ``lang``
        * ``namespace`` = may be empty. Used to distinguish among
            different tenants in multi-tenant deployments.
        * ``step`` = integer number corresponding to step
            learn more about steps in ``mglib.step.Step``
        * ``hocr`` = extracted hocr data (text format)
    """

    user_id = kwargs.get('user_id', None)
    document_id = kwargs.get('document_id', None)
    file_name = kwargs.get('file_name', None)
    page_num = kwargs.get('page_num', 1)
    # ``lang`` is documented as a signal argument but used to be dropped.
    lang = kwargs.get('lang', None)
    namespace = kwargs.get('namespace', None)
    step = kwargs.get('step', 1)

    if not page_path:
        logger.warning(f"hOCR/step={step} method returned empty page path.")
        return

    abs_path_hocr = default_storage.abspath(page_path.hocr_url())

    if not os.path.exists(abs_path_hocr):
        logger.warning(
            f"Page hocr/step={step} path {abs_path_hocr} does not exist.")
        return

    with open(abs_path_hocr) as f:
        hocr = f.read()

    # The file is closed before receivers run, so the handle is not held
    # open for the duration of the signal dispatch.
    signals.post_page_hocr.send(sender=signals.WORKER,
                                user_id=user_id,
                                document_id=document_id,
                                file_name=file_name,
                                page_num=page_num,
                                lang=lang,
                                step=step,
                                namespace=namespace,
                                hocr=hocr)
0
    def test_documents_retains_per_page_metadata_after_page_delete(self):
        """
        DocM is a document with 3 pages. DocM has two metadata fields
        associated X and Y. Fields have the values x=10 and y=20.

        Second page of the document DocM is deleted.
        Expected:
            document values of metadata fields X and Y should be preserved:
            DocM.X is still 10 and DocM.Y is still 20.

        Important!

        In document browser and document viewer
        if user does not explicitly select a page, by default
        metadata associated with first page of respective document
        is returned.
        """
        document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
        docm = Document.objects.create_document(
            user=self.user,
            title='berlin.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            file_name='berlin.pdf',
            parent_id=None,
            page_count=3)

        default_storage.copy_doc(
            src=document_path,
            dst=docm.path.url(),
        )

        for number in range(1, 4):
            page = docm.pages.get(number=number)
            # filesystem absolute path /home/eugen/x/y/
            fs_abs_path = default_storage.abspath(page.path.url())
            # filesystem absolute dir
            fs_abs_dir = os.path.dirname(fs_abs_path)
            Path(fs_abs_dir).mkdir(parents=True, exist_ok=True)
            # create an empty file; the context manager closes the handle
            # right away instead of leaving it to the garbage collector
            with open(fs_abs_path, "w+"):
                pass

        # indeed, docm has 3 pages
        self.assertEqual(docm.pages.count(), 3)
        docm.kv.update([{
            'key': 'X',
            'kv_type': TEXT,
        }, {
            'key': 'Y',
            'kv_type': TEXT,
        }])
        # In document browser and document viewer
        # if user does not explicitly select a page, by default
        # metadata associated with first page of respective document
        # is returned
        page = docm.pages.get(number=1)
        page.kv['X'] = 10
        page.kv['Y'] = 20

        page.refresh_from_db()

        self.assertEqual(page.kv['X'], '10')

        self.assertEqual(page.kv['Y'], '20')

        # Even if user deletes second page, all data (incl. metadata)
        # associated with the remaining pages (first and last)
        # MUST be preserved!
        docm.delete_pages([2])

        page = docm.pages.get(number=1)

        self.assertEqual(page.kv['X'], '10')
        self.assertEqual(page.kv['Y'], '20')
Example #20
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
    namespace=None,
):
    """
    Run OCR on one page of a stored document.

    The document is addressed through a ``DocumentPath`` built from
    ``user_id``, ``document_id`` and ``file_name``. If the storage does
    not have the document locally, it is downloaded first (``namespace``
    is passed to the download). The work is then dispatched on the
    detected mime type:

        * PDF   -> ``ocr_page_pdf``
        * image -> ``ocr_page_image``
        * TIFF  -> converted with ``convert_tiff2pdf``, then
          ``ocr_page_pdf``; the original (tiff) file name is passed
          along as ``file_name``

    ``lang`` is lower-cased before use; ``user_id``, ``document_id`` and
    ``namespace`` are forwarded to the OCR helpers.

    Always returns True; an unrecognised file type is only logged
    as an error.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()
    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    if not default_storage.exists(doc_path.url()):
        # In case of distributed deployment, document uploaded
        # by webapp is not directly available to the worker (which runs on
        # separate computer). Thus, if document is not locally available,
        # worker will download the document from whatever remote location.
        default_storage.download(doc_path_url=doc_path.url(),
                                 namespace=namespace)

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # page_type is reported in the timing log below, so every handled
    # branch records which kind of input it processed.
    if mime_type.is_pdf():
        ocr_page_pdf(doc_path=doc_path,
                     page_num=page_num,
                     lang=lang,
                     user_id=user_id,
                     document_id=document_id,
                     namespace=namespace)
        page_type = 'pdf'
    elif mime_type.is_image():  # jpeg, jpeg or png
        ocr_page_image(doc_path=doc_path,
                       page_num=page_num,
                       lang=lang,
                       user_id=user_id,
                       document_id=document_id,
                       namespace=namespace)
        page_type = 'image'
    elif mime_type.is_tiff():
        # new filename is a pdf file
        logger.debug("TIFF type detected")
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # now .pdf
        orig_file_name = doc_path.file_name
        doc_path.file_name = new_filename
        # and continue as usual
        ocr_page_pdf(
            doc_path=doc_path,
            page_num=page_num,
            lang=lang,
            user_id=user_id,
            document_id=document_id,
            # Pass original file_name i.e. tiff file name as well.
            file_name=orig_file_name,
            namespace=namespace)
        page_type = 'tiff'
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")

    return True