Esempio n. 1
0
def preview(request, id, step=None, page="1"):
    """
    Serve the preview image of one page of a document.

    ``id`` is the primary key of the ``Document``; ``page`` and ``step``
    are passed to ``Document.get_page_path`` to locate the image.

    If the image file does not exist yet it is generated with
    ``extract_img``. If it still cannot be read, a generic placeholder
    from the static files is returned instead (a dedicated one is used
    when the step is a thumbnail).

    Raises ``Http404`` when the document does not exist or when the
    placeholder image cannot be found. Users without read permission on
    the document are redirected to ``core:index``.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    # Built once: used for the page path and, on the fallback path,
    # to pick the placeholder image.
    current_step = Step(step)
    page_path = doc.get_page_path(
        page_num=page,
        step=current_step,
    )
    img_abs_path = default_storage.abspath(page_path.img_url())

    if not os.path.exists(img_abs_path):
        logger.debug(
            f"Preview image {img_abs_path} does not exists. Generating...")
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    try:
        with open(img_abs_path, "rb") as f:
            return HttpResponse(f.read(), content_type="image/jpeg")
    except IOError:
        generic_file = "admin/img/document.png"
        if current_step.is_thumbnail:
            generic_file = "admin/img/document_thumbnail.png"

        file_path = finders.find(generic_file)
        if file_path is None:
            # finders.find() returns None when nothing matches; passing
            # that to open() would fail with a TypeError (HTTP 500).
            raise Http404(f"Preview image {generic_file} not found")

        with open(file_path, "rb") as f:
            return HttpResponse(f.read(), content_type="image/png")
Esempio n. 2
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    doc_path is an mglib.path.DocumentPath instance

    Generates the images of page ``page_num`` for every step, then
    extracts the page text and, for every non-thumbnail step, its hOCR.

    Returns the ``PagePath`` used for the OCR pass, or ``None`` when
    ``page_num`` is greater than the page count of the document.
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process. Without this guard the function reached
        # ``return page_url`` with the name never bound and failed with
        # UnboundLocalError.
        logger.warning(
            f"page_num={page_num} exceeds page_count={page_count}."
            " Nothing to OCR."
        )
        return None

    # first quickly generate preview images
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # the OCR pass starts from a fresh path positioned at Step(1)
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
Esempio n. 3
0
def ocr_page_image(doc_path, page_num, lang):
    """
    image = jpg, jpeg, png

    Resizes the image, extracts its text, generates the image of every
    step and finally extracts the hOCR of every non-thumbnail step.

    Returns the ``PagePath`` instance, with its step set to ``Step(1)``.
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1)
    # resize and eventually convert (png -> jpg)
    resize_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # First quickly generate preview images
    for step in Steps():
        page_url.step = step
        resize_img(page_url, media_root=settings.MEDIA_ROOT)

    # Now OCR each image. The path has to follow the step, as in the PDF
    # variant; otherwise extract_hocr is called with identical arguments
    # on every iteration.
    for step in Steps():
        if not step.is_thumbnail:
            page_url.step = step
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    # reset page's step, so the returned path is positioned as before
    page_url.step = Step(1)

    return page_url
Esempio n. 4
0
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    doc_path is an mglib.path.DocumentPath instance

    An optional ``file_name`` keyword overrides ``doc_path.file_name`` in
    the notifications; all remaining keywords are forwarded to the
    ``notify_*`` functions unchanged.

    On success returns ``mglib.path.PagePath`` instance.
    Returns ``None`` when ``page_num`` is greater than the page count of
    the document.
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)

    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process. Without this guard ``page_path`` was never
        # bound and the notification below failed with UnboundLocalError.
        logger.warning(
            f"page_num={page_num} exceeds page_count={page_count}."
            " Nothing to OCR."
        )
        return None

    # first quickly generate preview images
    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        # same resolved name as the other notifications
                        file_name=file_name,
                        **kwargs)

    # the OCR pass starts from a fresh path positioned at Step(1)
    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(page_path,
                     page_num=page_num,
                     lang=lang,
                     file_name=file_name,
                     **kwargs)

    for step in Steps():
        page_path.step = step
        if not step.is_thumbnail:
            extract_hocr(page_path,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs)

    return page_path
Esempio n. 5
0
def ocr_page_image(doc_path, page_num, lang, **kwargs):
    """
    image = jpg, jpeg, png

    An optional ``file_name`` keyword overrides ``doc_path.file_name`` in
    the notifications; all remaining keywords are forwarded to the
    ``notify_*`` functions unchanged.

    On success returns ``mglib.path.PagePath`` instance, with its step
    set to ``Step(1)``.
    """
    logger.debug("OCR image (jpeg, jpg, png) document")

    # Pop ``file_name`` (as the PDF variant does): leaving it in kwargs
    # while also passing ``file_name=`` explicitly raises TypeError for
    # a duplicated keyword argument.
    file_name = kwargs.pop('file_name', None)

    if not file_name:
        file_name = doc_path.file_name

    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        # jpeg, jpg, png are 1 page documents
        page_count=1)
    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        file_name=file_name,
                        **kwargs)
    # resize and eventually convert (png -> jpg)
    resize_img(page_path, media_root=settings.MEDIA_ROOT)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(page_path,
                     page_num=page_num,
                     lang=lang,
                     file_name=file_name,
                     **kwargs)

    # First quickly generate preview images
    for step in Steps():
        page_path.step = step
        resize_img(page_path, media_root=settings.MEDIA_ROOT)

    # Now OCR each image. The path has to follow the step, as in the PDF
    # variant; otherwise extract_hocr is called with identical arguments
    # on every iteration while the notification reports a different step.
    for step in Steps():
        if not step.is_thumbnail:
            page_path.step = step
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs)

    # reset page's step, so the returned path is positioned as before
    page_path.step = Step(1)

    return page_path
Esempio n. 6
0
 def test_preview(self):
     """
     Requesting the preview of page 1 / step 1 answers with HTTP 200
     and leaves the page image in the storage.
     """
     source_pdf = os.path.join(BASE_DIR, "data", "berlin.pdf")

     doc = Document.create_document(
         title="berlin.pdf",
         user=self.testcase_user,
         lang="ENG",
         file_name="berlin.pdf",
         size=1222,
         page_count=3
     )
     default_storage.copy_doc(src=source_pdf, dst=doc.path.url())

     preview_url = reverse('core:preview', args=(doc.id, 1, 1))
     ret = self.client.post(preview_url)
     self.assertEqual(ret.status_code, 200)

     # the image of page 1 must be present in the storage now
     first_page = PagePath(
         document_path=doc.path,
         page_num=1,
         step=Step(1),
         page_count=3
     )
     img_abs_path = default_storage.abspath(first_page.img_url())
     self.assertTrue(os.path.exists(img_abs_path))
Esempio n. 7
0
 def test_ppmroot(self):
     """
     ``ppmroot`` of page 1 at Step(1) for document 3 of user 1.
     """
     expected = "results/user_1/document_3/pages/page_1/100/page"
     document = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
     page = PagePath(
         document_ep=document,
         page_num=1,
         step=Step(1),
         page_count=3,
     )
     self.assertEqual(page.ppmroot, expected)
Esempio n. 8
0
 def test_txt_url(self):
     """
     Called without arguments, ``url()`` of a page path gives the same
     result as ``txt_url()``.
     """
     document = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
     page = PagePath(
         document_ep=document,
         page_num=1,
         step=Step(1),
         page_count=3,
     )
     default_url = page.url()
     self.assertEqual(default_url, page.txt_url())
Esempio n. 9
0
def apply_metadata_plugins(document_id, page_num):
    """
    Apply the metadata plugins to the hOCR file of one document page.

    Returns the result of ``MetadataPlugins.apply``, or ``None`` (after
    logging an error) when no document with ``document_id`` exists.
    """
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return None

    page = document.get_page_path(page_num=page_num, step=Step())
    hocr_abs_path = default_storage.abspath(page.hocr_url())

    plugins = MetadataPlugins()
    return plugins.apply(hocr_abs_path)
Esempio n. 10
0
def apply_automates(document_id, page_num):
    """
    Apply the automates of the document's owner to one page.

    The hOCR of the page is read from the storage and every automate of
    the user that matches it is applied, together with an instance of
    the plugin named by the automate (``None`` when no plugin class is
    found for that name).

    Returns ``None``. Nothing happens, beyond logging, when the document
    does not exist or its user has no automates.
    """
    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return

    user = document.user

    automates = Automate.objects.filter(user=user)
    # are there automates for the user? Checked before touching the
    # storage: without automates the hOCR content is never used.
    if not automates.exists():
        logger.debug(f"No automates for user {user}. Quit.")
        return

    page_path = document.get_page_path(
        page_num=page_num,
        step=Step(),
    )
    hocr_path = default_storage.abspath(page_path.hocr_url())
    with open(hocr_path, "r") as f:
        hocr = f.read()

    # check all automates for given user (the owner of the document)
    for automate in automates:

        if not automate.is_a_match(hocr):
            logger.debug(f"No match for automate={automate}"
                         f" doc_id={document_id} page_num={page_num}")
            continue

        logger.debug(f"Automate {automate} matched document={document}")
        plugin_klass = get_plugin_by_module_name(automate.plugin_name)
        if plugin_klass:
            logger.debug(f"Found plugin module={plugin_klass.__module__}")
            # Notice () - plugin passed is instance of the class
            plugin = plugin_klass()
        else:
            # No plugin class for this name: do not dereference or call
            # it, hand None to the automate instead.
            logger.debug(f"No plugin found for {automate.plugin_name}")
            plugin = None
        logger.debug(f"len(hocr)=={len(hocr)}")
        automate.apply(
            document=document,
            page_num=page_num,
            hocr=hocr,
            plugin=plugin)
Esempio n. 11
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    doc_path is an mglib.path.DocumentPath instance

    Extracts the image and the text of page ``page_num`` at ``Step(1)``,
    then the image of every step and the hOCR of every non-thumbnail
    step.

    Returns the ``PagePath`` that was used, or ``None`` when ``page_num``
    is greater than the page count of the document.
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    if page_num > page_count:
        # Nothing to process. Without this guard the function reached
        # ``return page_url`` with the name never bound and failed with
        # UnboundLocalError.
        return None

    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
Esempio n. 12
0
 def test_step(self):
     """``Step(1)`` must not be reported as a thumbnail."""
     first_step = Step(1)
     self.assertFalse(
         first_step.is_thumbnail,
         f"{first_step} is is_thumbnail, but it should not be!"
     )
Esempio n. 13
0
def apply_automates(document_id, page_num):
    """
    Apply the automates of the document's owner to one page.

    The page text is read from version 0 of the document. Every automate
    of the user that matches the text is applied, and a summary of the
    matches is sent through the ``automates_matching`` signal.

    Returns ``None``. The function stops early (after logging) when the
    document does not exist or when its user has no automates; no signal
    is sent in those cases.
    """
    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return None

    # use text files from the original version of the document
    original_path = DocumentPath.copy_from(document.path, version=0)
    original_abs_path = default_storage.abspath(original_path.url())
    page_path = PagePath(
        document_path=original_path,
        page_num=page_num,
        page_count=get_pagecount(original_abs_path),
        step=Step(),
    )
    user = document.user

    with open(default_storage.abspath(page_path.txt_url()), "r") as f:
        text = f.read()

    automates = Automate.objects.filter(user=user)
    # are there automates for the user?
    if automates.count() == 0:
        logger.debug(
            f"No automates for user {user}. Quit."
        )
        return None

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if not automate.is_a_match(text):
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )
            continue

        logger.debug(f"Automate {automate} matched document={document}")

        plugin_klass = get_plugin_by_module_name(automate.plugin_name)
        # Notice () - plugin passed is instance of the class
        plugin = plugin_klass() if plugin_klass else None

        automate.apply(
            document=document,
            page_num=page_num,
            hocr=text,
            plugin=plugin
        )
        matched.append(automate)

    # summary of the matching, sent along with the signal
    summary_values = {
        'count': len(matched),
        'total': automates.count()
    }
    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % summary_values

    if matched:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=document.user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )