Example #1
0
    def txt_url(self):
        """
        Return the text URL of this page.

        A ``PagePath`` is built from the owning document's path, this
        page's number and its page count; the returned value is whatever
        ``PagePath.txt_url()`` yields for it.
        """
        page_path = PagePath(
            document_ep=self.document.path,
            page_num=self.number,
            page_count=self.page_count,
        )

        return page_path.txt_url()
Example #2
0
    def txt_exists(self):
        """
        Report whether the text result of this page exists.

        A ``PagePath`` is built from the owning document's ``doc_ep``,
        this page's number and its page count; the answer is delegated
        to ``PagePath.txt_exists()``.
        """
        page_path = PagePath(
            document_ep=self.document.doc_ep,
            page_num=self.number,
            page_count=self.page_count,
        )

        return page_path.txt_exists()
Example #3
0
def ocr_page_image(doc_path, page_num, lang):
    """
    OCR a single-image document (jpg, jpeg, png).

    The work is done in this order:

        1. resize the image and extract its text at Step(1)
        2. resize the image for every step in ``Steps()`` (previews)
        3. extract hOCR for every non-thumbnail step in ``Steps()``

    doc_path is the document path handed to ``PagePath``.
    page_num is the number of the page to process.
    lang is forwarded to ``extract_txt`` and ``extract_hocr``.

    Returns the ``PagePath`` instance used, with its step set to Step(1).
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1)
    # resize and eventually convert (png -> jpg)
    resize_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # First quickly generate preview images
    for step in Steps():
        page_url.step = step
        resize_img(page_url, media_root=settings.MEDIA_ROOT)

    # Now OCR each image. The step must be applied to the page path,
    # otherwise every iteration would redo the hOCR of the same step.
    for step in Steps():
        if step.is_thumbnail:
            continue
        page_url.step = step
        extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # reset page's step, so that callers get the path at Step(1)
    page_url.step = Step(1)

    return page_url
Example #4
0
 def test_preview(self):
     """
     Posting to ``core:preview`` for the first page of a 3 page
     document answers with 200 and leaves the page image of Step(1)
     in the storage.
     """
     doc = Document.create_document(
         title="berlin.pdf",
         user=self.testcase_user,
         lang="ENG",
         file_name="berlin.pdf",
         size=1222,
         page_count=3
     )
     # put the fixture file where the document is expected to live
     fixture_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
     default_storage.copy_doc(src=fixture_path, dst=doc.path.url())

     preview_url = reverse('core:preview', args=(doc.id, 1, 1))
     response = self.client.post(preview_url)
     self.assertEqual(response.status_code, 200)

     first_page = PagePath(
         document_path=doc.path,
         page_num=1,
         step=Step(1),
         page_count=3
     )
     image_abspath = default_storage.abspath(first_page.img_url())
     self.assertTrue(os.path.exists(image_abspath))
Example #5
0
 def test_txt_url(self):
     """
     Called without any arguments, ``url()`` of a page path
     returns the same value as its ``txt_url()``.
     """
     document_path = DocumentPath(
         user_id=1,
         document_id=3,
         file_name="x.pdf"
     )
     page_path = PagePath(
         document_ep=document_path,
         page_num=1,
         step=Step(1),
         page_count=3
     )

     actual = page_path.url()
     expected = page_path.txt_url()

     self.assertEqual(actual, expected)
Example #6
0
    def paste_pages(self,
                    dest_doc_path,
                    data_list,
                    dest_doc_is_new=False,
                    after_page_number=False,
                    before_page_number=False):
        """
        Pastes pages into the document pointed by dest_doc_path.

        The result is written as the next version of the destination
        document. Afterwards the per-page files of every pasted page
        are copied (``self.copy_page``) for each step in ``Steps()``.

        dest_doc_path is an instance of mglib.path.DocumentPath.

        data_list is a list of dictionaries, one per source document:

            {
                'doc_path': <source DocumentPath>,
                'page_nums': [<page number>, ...]
            }

        This method does not insert anything into the data_list passed
        in by the caller; it works on its own copy.

        dest_doc_is_new, after_page_number and before_page_number are
        forwarded unchanged to ``pdftk.paste_pages``. When
        dest_doc_is_new is false, the destination's own pages are
        migrated as well (they are copied first).

        Returns the new version number of the destination document.
        """
        next_version = dest_doc_path.version + 1
        next_ver_dp = DocumentPath.copy_from(dest_doc_path,
                                             version=next_version)
        self.make_sure_path_exists(self.abspath(next_ver_dp))

        pdftk.paste_pages(src=self.abspath(dest_doc_path),
                          dst=self.abspath(next_ver_dp),
                          data_list=data_list,
                          dst_doc_is_new=dest_doc_is_new,
                          after_page_number=after_page_number,
                          before_page_number=before_page_number)

        # Shallow copy: the caller's list must not grow as a side
        # effect of the migration below.
        migration_list = list(data_list)

        if not dest_doc_is_new:
            # migrate document's own pages from previous
            # version (this differs from pasting into newly
            # created docs)
            pcount = self.get_pagecount(dest_doc_path)
            migration_list.insert(0, {
                'doc_path': dest_doc_path,
                'page_nums': list(range(1, pcount + 1))
            })

        dest_page_num = 1
        dest_page_count = sum(
            len(item['page_nums']) for item in migration_list
        )
        for item in migration_list:
            src_path = item['doc_path']
            page_nums = item['page_nums']
            if not page_nums:
                # nothing to copy from this source document
                continue
            # Page count of a source document does not change while its
            # pages are copied: ask for it once per document instead of
            # once per page and step.
            src_page_count = self.get_pagecount(src_path)
            for page_num in page_nums:
                for step in Steps():
                    src_page_path = PagePath(
                        document_path=src_path,
                        page_num=int(page_num),
                        step=step,
                        page_count=src_page_count)
                    dst_page_path = PagePath(document_path=next_ver_dp,
                                             page_num=dest_page_num,
                                             step=step,
                                             page_count=dest_page_count)
                    logger.debug(f"src={src_page_path}  dst={dst_page_path}")
                    self.copy_page(src_page_path=src_page_path,
                                   dst_page_path=dst_page_path)
                dest_page_num += 1

        return next_version
Example #7
0
    def reorder_pages(self, doc_path, new_order):
        """
        Reorders pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version.
        Returns None (after logging an error, and without writing a
        new version) when new_order has more entries than the document
        has pages.

        new_order is a list of following format:

            [
                {'page_num': 2, page_order: 1},
                {'page_num': 1, page_order: 2},
                {'page_num': 3, page_order: 3},
                {'page_num': 4, page_order: 4},
            ]
        Example above means that in current document of 4 pages,
        first page was swapped with second one.
        page_num    = older page order
        page_order  = current page order
        So in human language, each hash is read:
            <page_num> now should be <page_order>
        """
        src_doc_path = doc_path
        new_version = doc_path.version + 1
        page_count = self.get_pagecount(src_doc_path)
        order_len = len(new_order)

        # Validate before anything is written, so that an invalid
        # request does not leave an orphan document version behind.
        if order_len > page_count:
            logger.error(
                f"new_order({new_order}) > page_count({page_count})")
            return None

        dst_doc_path = DocumentPath.copy_from(src_doc_path,
                                              version=new_version)
        self.make_sure_path_exists(self.abspath(dst_doc_path))

        stapler.reorder_pages(src=self.abspath(src_doc_path),
                              dst=self.abspath(dst_doc_path),
                              new_order=new_order)

        for item in new_order:
            old_page_num = int(item['page_num'])
            new_page_num = int(item['page_order'])
            for step in Steps():
                # source pages belong to the source document, hence
                # they are described with the source's page count
                src_page_path = PagePath(document_path=src_doc_path,
                                         page_num=old_page_num,
                                         step=step,
                                         page_count=page_count)
                dst_page_path = PagePath(document_path=dst_doc_path,
                                         page_num=new_page_num,
                                         step=step,
                                         page_count=order_len)
                self.copy_page(src_page_path=src_page_path,
                               dst_page_path=dst_page_path)

        return new_version
Example #8
0
def ocr_page_image(doc_path, page_num, lang, **kwargs):
    """
    OCR a single-image document (jpg, jpeg, png).

    The work is done in this order:

        1. ``notify_pre_page_ocr``
        2. resize the image and extract its text at Step(1),
           then ``notify_txt_ready``
        3. resize the image for every step in ``Steps()`` (previews)
        4. extract hOCR for every non-thumbnail step in ``Steps()``,
           each followed by ``notify_hocr_ready``

    doc_path is the document path handed to ``PagePath``.
    page_num is the number of the page to process.
    lang is forwarded to the extract and notify helpers.

    An optional ``file_name`` keyword overrides ``doc_path.file_name``
    in the notifications; all other keyword arguments are forwarded
    to the notify helpers unchanged.

    On success returns ``mglib.path.PagePath`` instance (its step is
    set to Step(1)).
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    # Take file_name out of kwargs: passing it both explicitly and via
    # **kwargs would raise TypeError (duplicate keyword argument).
    file_name = kwargs.pop('file_name', None)

    if not file_name:
        file_name = doc_path.file_name

    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1)
    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        file_name=file_name,
                        **kwargs)
    # resize and eventually convert (png -> jpg)
    resize_img(page_path, media_root=settings.MEDIA_ROOT)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(page_path,
                     page_num=page_num,
                     lang=lang,
                     file_name=file_name,
                     **kwargs)

    # First quickly generate preview images
    for step in Steps():
        page_path.step = step
        resize_img(page_path, media_root=settings.MEDIA_ROOT)

    # Now OCR each image. The step must be applied to the page path,
    # otherwise every iteration would redo the hOCR of the same step
    # while announcing a different one.
    for step in Steps():
        if step.is_thumbnail:
            continue
        page_path.step = step
        extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
        notify_hocr_ready(
            page_path,
            page_num=page_num,
            lang=lang,
            # step as integer number
            step=step.current,
            file_name=file_name,
            **kwargs)

    # reset page's step, so that callers get the path at Step(1)
    page_path.step = Step(1)

    return page_path
Example #9
0
    def page_paths(self):
        """
        Gives quick access to the page paths of the document.

        The result is a list in which index equals page number, i.e.
        element ``[2]`` is the page path of the second page of the
        document; element ``[0]`` is always None.

        This is shortcut method when most used Step(1) is required.
        """
        # doc.page_count might be wrong because per
        # page logic was added just recently. So, let's use
        # this opportunity and correct it!
        actual_count = get_pagecount(self.absfilepath)

        if actual_count != self.page_count:
            self.page_count = actual_count
            self.save()

        # slot 0 is a placeholder: page numbers start from 1
        paths = [None]
        paths.extend(
            PagePath(document_path=self.path,
                     page_num=number,
                     step=step.Step(1),
                     page_count=self.page_count)
            for number in range(1, actual_count + 1)
        )

        return paths
Example #10
0
 def get_page_path(self, page_num, step):
     """
     Build the ``PagePath`` of page ``page_num`` at the given ``step``,
     using this instance's ``path`` and ``page_count``.

     For Step(1) shortcut, use doc_instance.page_eps property.
     """
     page_path = PagePath(
         document_path=self.path,
         page_num=page_num,
         step=step,
         page_count=self.page_count,
     )

     return page_path
Example #11
0
 def test_ppmroot(self):
     """
     ``ppmroot`` of the first page (Step(1)) of a 3 page document.
     """
     expected = "results/user_1/document_3/pages/page_1/100/page"

     document_path = DocumentPath(
         user_id=1,
         document_id=3,
         file_name="x.pdf"
     )
     page_path = PagePath(
         document_ep=document_path,
         page_num=1,
         step=Step(1),
         page_count=3
     )

     self.assertEqual(page_path.ppmroot, expected)
Example #12
0
    def test_versioned_page_ep(self):
        """
        After the document's version is incremented, the page path
        contains the version directory (``v1``).
        """
        expected = "results/user_1/document_3/v1/pages/page_1.txt"

        document_path = DocumentPath(
            user_id=1,
            document_id=3,
            file_name="x.pdf"
        )
        # document's version incremented
        document_path.inc_version()

        page_path = PagePath(
            document_ep=document_path,
            page_num=1,
            page_count=3
        )

        self.assertEqual(page_path.path, expected)
Example #13
0
    def delete_pages(self, doc_path, page_numbers, skip_migration=False):
        """
        Deletes pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        page_numbers is the list of page numbers to delete.

        With skip_migration=True only the new document version is
        produced; per-page files of the remaining pages are not copied
        and the number of pages to delete is not checked.

        In case of success returns document's new version.
        Returns False if page_numbers is not a list. Returns None
        (without writing a new version) if more pages are to be deleted
        than the document has; both cases are logged as errors.
        """

        if not isinstance(page_numbers, list):
            logger.error("Expecting list argument")
            return False

        src_doc_path = doc_path
        new_version = doc_path.version + 1

        page_count = None
        if not skip_migration:
            # Validate before anything is written, so that an invalid
            # request does not leave an orphan document version behind.
            page_count = self.get_pagecount(src_doc_path)

            if len(page_numbers) > page_count:
                logger.error(
                    f"deleted_pages({page_numbers}) > page_count({page_count})")
                return None

        dst_doc_path = DocumentPath.copy_from(src_doc_path,
                                              version=new_version)
        self.make_sure_path_exists(self.abspath(dst_doc_path))
        stapler.delete_pages(self.abspath(src_doc_path),
                             self.abspath(dst_doc_path), page_numbers)

        if skip_migration:
            return new_version

        # page count of the new version, same for every copied page
        remaining_count = page_count - len(page_numbers)

        assigns = get_assigns_after_delete(total_pages=page_count,
                                           deleted_pages=page_numbers)
        for a in assigns:
            # a[0] is used as page number in the new version,
            # a[1] as page number in the source version
            for step in Steps():
                src_page_path = PagePath(document_path=src_doc_path,
                                         page_num=a[1],
                                         step=step,
                                         page_count=page_count)
                dst_page_path = PagePath(document_path=dst_doc_path,
                                         page_num=a[0],
                                         step=step,
                                         page_count=remaining_count)
                self.copy_page(src_page_path=src_page_path,
                               dst_page_path=dst_page_path)

        return new_version
Example #14
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    OCR one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance
    page_num is the number of the page to process.
    lang is forwarded to ``extract_txt`` and ``extract_hocr``.

    Preview images are extracted first (for every step in ``Steps()``),
    then the text, then hOCR for every non-thumbnail step.

    Returns the ``PagePath`` used for text/hOCR extraction, or None
    when page_num is greater than the page count of the document.
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Without this guard the function would fail with
        # UnboundLocalError on its return statement.
        logger.warning(
            f"page_num={page_num} exceeds page_count={page_count};"
            " nothing to OCR"
        )
        return None

    # first quickly generate preview images
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # start over with a fresh page path at Step(1)
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
Example #15
0
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    OCR one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance
    page_num is the number of the page to process.
    lang is forwarded to the extract and notify helpers.

    An optional ``file_name`` keyword overrides ``doc_path.file_name``
    in the notifications; all other keyword arguments are forwarded
    to the notify helpers unchanged.

    The work is done in this order:

        1. extract preview images for every step in ``Steps()``
        2. ``notify_pre_page_ocr``
        3. extract text, then ``notify_txt_ready``
        4. extract hOCR for every non-thumbnail step, each followed
           by ``notify_hocr_ready``

    On success returns ``mglib.path.PagePath`` instance. Returns None
    when page_num is greater than the page count of the document.
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)

    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Without this guard the function would fail with
        # UnboundLocalError as soon as page_path is used.
        logger.warning(
            f"page_num={page_num} exceeds page_count={page_count};"
            " nothing to OCR"
        )
        return None

    # first quickly generate preview images
    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        # same (possibly overridden) name as used by
                        # the other notifications below
                        file_name=file_name,
                        **kwargs)

    # Start over with a fresh page path at Step(1); the previous
    # instance was handed to notify_pre_page_ocr and is left untouched.
    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(page_path,
                     page_num=page_num,
                     lang=lang,
                     file_name=file_name,
                     **kwargs)

    for step in Steps():
        page_path.step = step
        if not step.is_thumbnail:
            extract_hocr(page_path,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs)

    return page_path
Example #16
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    OCR one page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance
    page_num is the number of the page to process.
    lang is forwarded to ``extract_txt`` and ``extract_hocr``.

    Image and text are extracted at Step(1) first; then, for every
    step in ``Steps()``, the image is extracted and - unless the step
    is a thumbnail - hOCR as well.

    Returns the ``PagePath`` used, or None when page_num is greater
    than the page count of the document.
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Without this guard the function would fail with
        # UnboundLocalError on its return statement.
        return None

    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
Example #17
0
    def page_paths(self, version=None):
        """
        Gives quick access to the page paths of the document.

        The result is a list in which index equals page number, i.e.
        element ``[2]`` is the page path of the second page of the
        document; element ``[0]`` is always None.

        version is forwarded to ``self.get_pagecount`` and
        ``self.path``.

        This is shortcut method when most used Step(1) is required.
        """

        results = [None]  # indexing starts from 1

        # Asked for once: the value is the same for every page, there
        # is no need to compute it again on each iteration.
        page_count = self.get_pagecount(version=version)

        for page_num in range(1, page_count + 1):
            page_path = PagePath(
                # every page gets its own document path object
                document_path=self.path(version=version),
                page_num=page_num,
                step=step.Step(1),
                page_count=page_count)
            results.append(page_path)

        return results
Example #18
0
 def path(self):
     """
     Build the ``PagePath`` of this page from the owning document's
     path, the page's number and its page count.
     """
     path_args = {
         'document_path': self.document.path,
         'page_num': self.number,
         'page_count': self.page_count,
     }

     return PagePath(**path_args)
Example #19
0
def apply_automates(document_id, page_num):
    """
    Apply the automates of the document's owner to one page.

    The text of page ``page_num`` is read from version 0 of the
    document. Every ``Automate`` of the document's user is checked
    against that text and applied when it matches. Finally the
    ``automates_matching`` signal is sent with a summary message.

    Returns None. Nothing is read and no signal is sent when the
    document does not exist or when its user has no automates.
    """

    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return

    user = document.user

    automates = Automate.objects.filter(user=user)
    total = automates.count()
    # are there automates for the user?
    # Checked before touching the storage: no automates, no need to
    # read the page's text.
    if total == 0:
        logger.debug(
            f"No automates for user {user}. Quit."
        )
        return

    # use text files from the original version of the document
    doc_path = DocumentPath.copy_from(
        document.path,
        version=0
    )
    page_count = get_pagecount(
        default_storage.abspath(doc_path.url())
    )
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        page_count=page_count,
        step=Step(),
    )

    text_path = default_storage.abspath(page_path.txt_url())
    # NOTE(review): explicit encoding assumes the OCR text file is
    # written as UTF-8 - confirm against the code producing it.
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if not automate.is_a_match(text):
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )
            continue

        logger.debug(f"Automate {automate} matched document={document}")

        plugin_klass = get_plugin_by_module_name(
            automate.plugin_name
        )
        plugin = plugin_klass() if plugin_klass else None

        automate.apply(
            document=document,
            page_num=page_num,
            hocr=text,
            # Notice () - plugin passed is instance of the class
            plugin=plugin
        )
        matched.append(automate)

    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % {
        'count': len(matched),
        'total': total
    }

    if matched:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )
Example #20
0
    def path(self, version=None):
        """
        Build the ``PagePath`` of this page.

        version is forwarded to the owning document's ``path()``; the
        page's number and page count are taken from this instance.
        """
        path_args = {
            'document_path': self.document.path(version=version),
            'page_num': self.number,
            'page_count': self.page_count,
        }

        return PagePath(**path_args)