Example #1
0
 def test_pages_dirname(self):
     """pages_dirname() is rooted at aux_dir and ends with 'pages/'."""
     doc_path = DocumentPath(
         user_id=1,
         document_id=3,
         aux_dir="results",
         file_name="x.pdf",
     )
     expected = "results/user_1/document_3/pages/"
     self.assertEqual(doc_path.pages_dirname(), expected)
Example #2
0
    def test_document_url_with_another_version(self):
        """url(version=N) places the file under a ``vN`` sub-folder."""
        doc_path = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")

        url_v3 = doc_path.url(version=3)
        self.assertEqual(url_v3, "docs/user_1/document_15/v3/x.pdf")

        url_v2 = doc_path.url(version=2)
        self.assertEqual(url_v2, "docs/user_1/document_15/v2/x.pdf")
Example #3
0
    def test_versioned_page_ep(self):
        """A page built on a version-1 document carries ``v1`` in its path."""
        document = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
        # bump the document to its next version before building the page
        document.inc_version()

        page = PagePath(document_ep=document, page_num=1, page_count=3)
        expected = "results/user_1/document_3/v1/pages/page_1.txt"
        self.assertEqual(page.path, expected)
Example #4
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    Run OCR on one page of a stored document.

    The document is addressed with
    DocumentPath(user_id, document_id, file_name) and dispatched on the
    mime type detected for its file:

    * PDF   -> ocr_page_pdf
    * image -> ocr_page_image
    * TIFF  -> converted with convert_tiff2pdf, then ocr_page_pdf

    Any other type is logged as an error and no OCR is performed.

    ``lang`` is lower-cased before being handed to the OCR helpers.

    Always returns True, including for unsupported file types.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))

    logger.debug(f"Mime Type = {mime_type}")

    # page_type is only used for the timing log line at the end; it is set
    # in every supported branch so that the log never reports an empty type.
    if mime_type.is_pdf():
        page_type = 'pdf'
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
    elif mime_type.is_image():  # jpg, jpeg or png
        page_type = 'image'
        ocr_page_image(doc_path=doc_path, page_num=page_num, lang=lang)
    elif mime_type.is_tiff():
        page_type = 'tiff'
        # new filename is a pdf file
        logger.debug("TIFF type detected")
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # now .pdf
        doc_path.file_name = new_filename
        # and continue as usual
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")

    return True
Example #5
0
 def test_ppmroot(self):
     """ppmroot of page 1 at Step(1) ends with 'page_1/100/page'."""
     document = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
     page = PagePath(
         document_ep=document,
         page_num=1,
         step=Step(1),
         page_count=3,
     )
     expected = "results/user_1/document_3/pages/page_1/100/page"
     self.assertEqual(page.ppmroot, expected)
Example #6
0
    def vpath(self, version=0):
        """
        Build the DocumentPath of this document for the given ``version``.

        ``version`` defaults to 0.
        """
        return DocumentPath(
            user_id=self.user.id,
            document_id=self.id,
            version=version,
            file_name=self.file_name,
        )
Example #7
0
 def test_txt_url(self):
     """
     Called without arguments, page_ep.url() returns the same
     value as page_ep.txt_url().
     """
     document = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
     page = PagePath(
         document_ep=document,
         page_num=1,
         step=Step(1),
         page_count=3,
     )
     self.assertEqual(page.url(), page.txt_url())
Example #8
0
    def paste_pages(self,
                    dest_doc_path,
                    data_list,
                    dest_doc_is_new=False,
                    after_page_number=False,
                    before_page_number=False):
        """
        Pastes pages into the document pointed by dest_doc_path.

        dest_doc_path is an instance of mglib.path.DocumentPath.
        data_list is a list of dicts, one per source document:

            {'doc_path': <DocumentPath>, 'page_nums': [<page number>, ...]}

        A new version (dest_doc_path.version + 1) of the destination
        document is written by pdftk.paste_pages. Afterwards, for every
        step in Steps(), the per-page files of each pasted page are copied
        to the new version. Unless dest_doc_is_new is set, the destination
        document's own pages from the previous version are migrated first.

        data_list is left unmodified.

        Returns the new version of the destination document.
        """
        new_version = dest_doc_path.version + 1
        next_ver_dp = DocumentPath.copy_from(dest_doc_path,
                                             version=new_version)
        self.make_sure_path_exists(self.abspath(next_ver_dp))

        pdftk.paste_pages(src=self.abspath(dest_doc_path),
                          dst=self.abspath(next_ver_dp),
                          data_list=data_list,
                          dst_doc_is_new=dest_doc_is_new,
                          after_page_number=after_page_number,
                          before_page_number=before_page_number)

        # Work on a shallow copy so that the caller's list does not
        # silently grow by one entry as a side effect of this call.
        migration_list = list(data_list)
        if not dest_doc_is_new:
            # migrate document's own pages from previous
            # version (this differs from pasting into newly
            # created docs)
            pcount = self.get_pagecount(dest_doc_path)
            migration_list.insert(0, {
                'doc_path': dest_doc_path,
                'page_nums': list(range(1, pcount + 1))
            })

        dest_page_num = 1
        dest_page_count = sum(
            len(item['page_nums']) for item in migration_list
        )
        for item in migration_list:
            page_nums = item['page_nums']
            if not page_nums:
                # nothing to copy from this source document
                continue
            src_path = item['doc_path']
            # The page count of a source document is the same for all of
            # its pages and steps: look it up once per document instead
            # of once per (page, step) pair.
            src_page_count = self.get_pagecount(src_path)
            for page_num in page_nums:
                for step in Steps():
                    src_page_path = PagePath(
                        document_path=src_path,
                        page_num=int(page_num),
                        step=step,
                        page_count=src_page_count)
                    dst_page_path = PagePath(document_path=next_ver_dp,
                                             page_num=dest_page_num,
                                             step=step,
                                             page_count=dest_page_count)
                    logger.debug(f"src={src_page_path}  dst={dst_page_path}")
                    self.copy_page(src_page_path=src_page_path,
                                   dst_page_path=dst_page_path)
                dest_page_num += 1

        return new_version
Example #9
0
    def reorder_pages(self, doc_path, new_order):
        """
        Reorders pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version
        (doc_path.version + 1). Returns None, after logging an error,
        when new_order has more entries than the document has pages.

        new_order is a list of following format:

            [
                {'page_num': 2, 'page_order': 1},
                {'page_num': 1, 'page_order': 2},
                {'page_num': 3, 'page_order': 3},
                {'page_num': 4, 'page_order': 4},
            ]
        Example above means that in current document of 4 pages,
        first page was swapped with second one.
        page_num    = older page order
        page_order  = current page order
        So in human language, each hash is read:
            <page_num> now should be <page_order>
        """
        src_doc_path = doc_path
        new_version = doc_path.version + 1
        dst_doc_path = DocumentPath.copy_from(src_doc_path,
                                              version=new_version)
        self.make_sure_path_exists(self.abspath(dst_doc_path))

        stapler.reorder_pages(src=self.abspath(src_doc_path),
                              dst=self.abspath(dst_doc_path),
                              new_order=new_order)

        page_count = self.get_pagecount(doc_path)
        order_count = len(new_order)

        # NOTE(review): at this point the reordered file of the new version
        # has already been written by stapler; only the migration of the
        # per-page files is skipped when the check below fails.
        if order_count > page_count:
            logger.error(
                f"new_order has {order_count} entries"
                f" > page_count({page_count})")
            return

        for item in new_order:
            for step in Steps():
                src_page_path = PagePath(document_path=src_doc_path,
                                         page_num=int(item['page_num']),
                                         step=step,
                                         page_count=order_count)
                dst_page_path = PagePath(document_path=dst_doc_path,
                                         page_num=int(item['page_order']),
                                         step=step,
                                         page_count=order_count)
                self.copy_page(src_page_path=src_page_path,
                               dst_page_path=dst_page_path)

        return new_version
Example #10
0
    def path(self):
        """
        Build the DocumentPath of this document at its current version.

        A ``self.version`` that is not an int is treated as version 0.
        """
        current = self.version
        effective_version = current if isinstance(current, int) else 0

        return DocumentPath(
            user_id=self.user.id,
            document_id=self.id,
            version=effective_version,
            file_name=self.file_name,
        )
Example #11
0
    def get_pagecount(self, doc_path):
        """
        Returns total number of pages for this doc_path.

        The count is the number of sub-directories found in the
        pages_dirname folder of a copy of doc_path whose aux_dir
        is "results".
        """
        results_doc_path = DocumentPath.copy_from(doc_path, aux_dir="results")
        pages_dir = self.abspath(results_doc_path.pages_dirname())

        dir_count = 0
        for entry in listdir(pages_dir):
            # plain files are ignored, only directories are counted
            if isdir(join(pages_dir, entry)):
                dir_count += 1
        return dir_count
Example #12
0
    def path(self, version=None):
        """
        Build the DocumentPath of this document for ``version``.

        When ``version`` is None, ``self.version`` is used instead.
        The chosen value is converted with int() before use.
        """
        chosen = self.version if version is None else version
        version_number = int(chosen)

        return DocumentPath(
            user_id=self.user.id,
            document_id=self.id,
            version=version_number,
            file_name=self.file_name,
        )
Example #13
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """
    Run OCR on one page of a stored PDF document.

    The document is addressed with
    DocumentPath(user_id, document_id, file_name). If its detected mime
    type is not PDF, an error is logged and no OCR is performed.

    Always returns True, including for unsupported file types.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    started_at = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    abs_doc_path = default_storage.abspath(doc_path.url())
    mime_type = mime.Mime(abs_doc_path)

    # guard clause: anything but PDF is rejected up front
    if not mime_type.is_pdf():
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
    page_type = 'pdf'

    finished_at = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={finished_at-started_at:.2f}")

    return True
Example #14
0
    def test_document_url_none_vs_0(self):
        """version=None means the current version, version=0 the original."""
        doc_path = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")
        # three increments: current version goes from 0 to 3
        for _ in range(3):
            doc_path.inc_version()

        # with version == None, latest version of the document
        # will be returned, which is 3
        latest_url = doc_path.url(version=None)
        self.assertEqual(latest_url, "docs/user_1/document_15/v3/x.pdf")

        # with version == 0, version 0 will be provided
        # i.e. version=0 returns original doc.
        original_url = doc_path.url(version=0)
        self.assertEqual(original_url, "docs/user_1/document_15/x.pdf")
Example #15
0
    def test_get_versions_2(self):
        """get_versions() yields [0] when the doc folder has no vN folders."""
        storage = FileSystemStorage(location=MEDIA_ROOT)

        with TemporaryNode(MEDIA_ROOT) as root:
            docs_root = root.add_folder("docs")
            doc_folder = docs_root.add_folder("user_1/document_2")
            doc_folder.add_file("doku.pdf")

            requested = DocumentPath(
                user_id=1,
                document_id=2,
                file_name='doku.pdf',
                version=2,
            )
            found_versions = storage.get_versions(requested)

            # document has only one version - the latest
            self.assertEqual(found_versions, [0])
Example #16
0
    def test_inc_version(self):
        """
        Document endpoints are versioned, starting at version 0.

        At version 0 the "old" endpoint path applies, i.e. the
        version is not part of the path. Each modification of the
        document (a deleted blank page, for example) increments the
        version, and any version > 0 is included in the path.
        """
        doc_path = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")

        doc_path.inc_version()
        url_after_first = doc_path.url()
        self.assertEqual(url_after_first, "docs/user_1/document_3/v1/x.pdf")

        doc_path.inc_version()
        url_after_second = doc_path.url()
        self.assertEqual(url_after_second, "docs/user_1/document_3/v2/x.pdf")
Example #17
0
    def delete_pages(self, doc_path, page_numbers, skip_migration=False):
        """
        Deletes pages from the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        The result is written as a new version (doc_path.version + 1).
        Unless skip_migration is set, the per-page files of the pages
        that remain are copied over to the new version as well.

        Returns the document's new version on success, False when
        page_numbers is not a list, and None when more pages are to be
        deleted than get_pagecount reports for the document.
        """
        if not isinstance(page_numbers, list):
            logger.error("Expecting list argument")
            return False

        new_version = doc_path.version + 1
        new_doc_path = DocumentPath.copy_from(doc_path, version=new_version)
        self.make_sure_path_exists(self.abspath(new_doc_path))

        old_abs_path = self.abspath(doc_path)
        new_abs_path = self.abspath(new_doc_path)
        stapler.delete_pages(old_abs_path, new_abs_path, page_numbers)

        if skip_migration:
            return new_version

        page_count = self.get_pagecount(doc_path)
        if page_count < len(page_numbers):
            logger.error(
                f"deleted_pages({page_numbers}) > page_count({page_count})")
            return None

        assigns = get_assigns_after_delete(total_pages=page_count,
                                           deleted_pages=page_numbers)
        remaining_count = page_count - len(page_numbers)
        for assign in assigns:
            for step in Steps():
                # assign[1] is used as the page number in the old version,
                # assign[0] as the page number in the new version
                old_page = PagePath(document_path=doc_path,
                                    page_num=assign[1],
                                    step=step,
                                    page_count=page_count)
                new_page = PagePath(document_path=new_doc_path,
                                    page_num=assign[0],
                                    step=step,
                                    page_count=remaining_count)
                self.copy_page(src_page_path=old_page,
                               dst_page_path=new_page)

        return new_version
Example #18
0
    def test_delete(self):
        """delete_doc() removes the document's folder under docs/."""
        storage = FileSystemStorage(location=MEDIA_ROOT)

        with TemporaryNode(MEDIA_ROOT) as root:
            docs_root = root.add_folder("docs")
            results_root = root.add_folder("results")
            doc_folder = docs_root.add_folder("user_1/document_2")
            doc_folder.add_file("doku.pdf")
            results_root.add_folder("user_1/document_2/pages")

            target = DocumentPath(
                user_id=1,
                document_id=2,
                file_name='doku.pdf',
            )

            # folder is present before the deletion ...
            self.assertTrue(doc_folder.exists())

            storage.delete_doc(target)

            # ... and gone afterwards
            self.assertFalse(doc_folder.exists())
Example #19
0
    def test_get_versions_1(self):
        """get_versions() lists version 0 plus one entry per vN folder."""
        storage = FileSystemStorage(location=MEDIA_ROOT)

        with TemporaryNode(MEDIA_ROOT) as root:
            docs_root = root.add_folder("docs")
            results_root = root.add_folder("results")
            doc_folder = docs_root.add_folder("user_1/document_2")
            doc_folder.add_file("doku.pdf")
            # simulate 2 versions of the document.
            doc_folder.add_folder("v1")
            doc_folder.add_folder("v2")
            results_root.add_folder("user_1/document_2/pages")

            requested = DocumentPath(
                user_id=1,
                document_id=2,
                file_name='doku.pdf',
                version=2,
            )
            found_versions = storage.get_versions(requested)

            self.assertEqual(found_versions, [0, 1, 2])
Example #20
0
def apply_automates(document_id, page_num):
    """
    Run the document owner's automates against the text of one page.

    The text is read from the page's text file of the original
    (version 0) document. Every automate of the document's user for
    which ``is_a_match(text)`` is true is applied. Finally the
    ``automates_matching`` signal is sent with a summary message.

    Nothing is read, applied or sent when the document does not exist
    or when its user has no automates.

    Returns None.
    """
    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return

    user = document.user
    automates = Automate.objects.filter(user=user)
    # Are there automates for the user? Checked before touching the
    # storage, so that no page count or text file is read for nothing.
    # The count is taken once and reused for the summary message.
    total_automates = automates.count()
    if total_automates == 0:
        logger.debug(
            f"No automates for user {user}. Quit."
        )
        return

    # use text files from the original version of the document
    doc_path = DocumentPath.copy_from(
        document.path,
        version=0
    )
    page_count = get_pagecount(
        default_storage.abspath(doc_path.url())
    )
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        page_count=page_count,
        step=Step(),
    )

    text_path = default_storage.abspath(page_path.txt_url())
    with open(text_path, "r") as f:
        text = f.read()

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if automate.is_a_match(text):
            logger.debug(f"Automate {automate} matched document={document}")

            plugin_klass = get_plugin_by_module_name(
                automate.plugin_name
            )
            plugin = plugin_klass() if plugin_klass else None

            automate.apply(
                document=document,
                page_num=page_num,
                hocr=text,
                # Notice () - plugin passed is instance of the class
                plugin=plugin
            )
            matched.append(automate)
        else:
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )

    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % {
        'count': len(matched),
        'total': total_automates
    }

    if len(matched) > 0:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=document.user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )
Example #21
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
    namespace=None,
):
    """
    Run OCR on one page of a stored document.

    The document is addressed with
    DocumentPath(user_id, document_id, file_name). If the storage does
    not have it locally, it is fetched first with
    default_storage.download. The work is then dispatched on the mime
    type detected for the file:

    * PDF   -> ocr_page_pdf
    * image -> ocr_page_image
    * TIFF  -> converted with convert_tiff2pdf, then ocr_page_pdf (the
      original TIFF file name is passed along as ``file_name``)

    Any other type is logged as an error and no OCR is performed.

    ``lang`` is lower-cased before being handed to the OCR helpers;
    ``namespace`` is forwarded unchanged to the download call and to
    the OCR helpers.

    Always returns True, including for unsupported file types.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()
    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    if not default_storage.exists(doc_path.url()):
        # In case of distributed deployment, document uploaded
        # by webapp is not directly available to the worker (which runs on
        # separate computer). Thus, if document is not locally available,
        # worker will download the document from whatever remote location.
        default_storage.download(doc_path_url=doc_path.url(),
                                 namespace=namespace)

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # page_type is only used for the timing log line at the end; it is set
    # in every supported branch so that the log never reports an empty type.
    if mime_type.is_pdf():
        page_type = 'pdf'
        ocr_page_pdf(doc_path=doc_path,
                     page_num=page_num,
                     lang=lang,
                     user_id=user_id,
                     document_id=document_id,
                     namespace=namespace)
    elif mime_type.is_image():  # jpg, jpeg or png
        page_type = 'image'
        ocr_page_image(doc_path=doc_path,
                       page_num=page_num,
                       lang=lang,
                       user_id=user_id,
                       document_id=document_id,
                       namespace=namespace)
    elif mime_type.is_tiff():
        page_type = 'tiff'
        # new filename is a pdf file
        logger.debug("TIFF type detected")
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # now .pdf
        orig_file_name = doc_path.file_name
        doc_path.file_name = new_filename
        # and continue as usual
        ocr_page_pdf(
            doc_path=doc_path,
            page_num=page_num,
            lang=lang,
            user_id=user_id,
            document_id=document_id,
            # Pass original file_name i.e. tiff file name as well.
            file_name=orig_file_name,
            namespace=namespace)
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")

    return True
Example #22
0
 def test_document_url(self):
     """url() of a fresh document has no version component."""
     doc_path = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
     expected = "docs/user_1/document_3/x.pdf"
     self.assertEqual(doc_path.url(), expected)
Example #23
0
    def paste(self, doc_pages, after=False, before=False):
        """
        Paste pages in current document.

        doc_pages maps the id of each source document to the page
        numbers taken from it. ``after`` / ``before`` are handed to
        pdftk.paste_pages as after_page_number / before_page_number.

        Returns None. Nothing is done when there are no pages to paste
        or when one of the source documents cannot be found.

        Raises Exception when pdftk.paste_pages returns the document's
        current version instead of a new one.
        """
        total_to_paste = sum(len(nums) for nums in doc_pages.values())
        if total_to_paste == 0:
            logger.warning("No pages to paste. Exiting.")
            return

        previous_version = self.version

        # one entry per source document holding pages to paste
        source_docs = []
        source_eps = []
        for doc_id, page_nums in doc_pages.items():
            try:
                source = Document.objects.get(id=doc_id)
            except Document.DoesNotExist:
                logger.warning(f"While pasting, doc_id={doc_id} was not found")
                return
            source_docs.append({'doc': source, 'page_nums': page_nums})
            source_eps.append({
                'doc_ep': source.doc_ep,
                'page_nums': page_nums
            })

        # returns new document version
        pasted_version = pdftk.paste_pages(dest_doc_ep=self.doc_ep,
                                           src_doc_ep_list=source_eps,
                                           dest_doc_is_new=False,
                                           after_page_number=after,
                                           before_page_number=before)

        if pasted_version == self.version:
            raise Exception("Expecting version to be incremented")

        self.version = pasted_version
        self.save()

        # migrate document's own pages from previous
        # version (this differs from pasting into newly
        # created docs)
        own_pages = {
            'doc_ep':
            DocumentPath(user_id=self.user.id,
                         document_id=self.id,
                         version=previous_version,
                         file_name=self.file_name),
            'page_nums':
            list(range(1, self.page_count + 1))
        }
        source_eps.insert(0, own_pages)

        ocrmigrate.migrate_cutted_pages(dest_ep=self.doc_ep,
                                        src_doc_ep_list=source_eps)

        # delete pages of source document (which where
        # cutted and pasted into new doc)
        for entry in source_docs:
            entry['doc'].delete_pages(page_numbers=entry['page_nums'])

        # must be at the end
        self.recreate_pages()