def preview(request, id, step=None, page="1"):
    """Serve the image of one document page as a JPEG response.

    Looks up the ``Document`` with the given ``id``. If the user has
    ``Access.PERM_READ`` on it, ``download`` is called when the
    document endpoint does not exist, and ``extract_img`` is called
    when the page image does not exist. The image file is then read
    and returned with content type ``image/jpeg``.

    Returns:
        An ``HttpResponse`` with the image bytes, or a redirect to
        ``core:index`` when the user lacks read permission.

    Raises:
        Http404: no document with the given ``id`` exists.
        IOError: the image file could not be opened or read.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    # Users without read access are sent back to the index page.
    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    doc_ep = doc.doc_ep
    if not doc_ep.exists():
        download(doc_ep)

    page_ep = doc.get_page_ep(page_num=page, step=Step(step))
    if not page_ep.img_exists():
        extract_img(page_ep)

    # Any IOError raised here propagates unchanged to the caller.
    with open(page_ep.img_url(), "rb") as image_file:
        image_bytes = image_file.read()

    return HttpResponse(image_bytes, content_type="image/jpeg")
Exemple #2
0
def hocr(request, id, step=None, page="1"):
    """Return the HOCR words and metadata of one document page as JSON.

    Args:
        request: the incoming HTTP request; ``request.user`` is checked
            for ``Access.PERM_READ`` on the document.
        id: primary key of the ``Document`` to look up.
        step: only used in the debug log message of this view.
        page: page number; may be an int or a numeric string (the
            default is the string ``"1"``).

    Returns:
        An ``HttpResponse`` with content type ``application/json``
        holding the keys ``hocr`` and ``hocr_meta``, or an
        ``HttpResponseForbidden`` when the user lacks read permission.

    Raises:
        Http404: the document does not exist, ``page`` is not a number
            or is out of range, or the HOCR data is not available yet.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_ep = doc.doc_ep

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    # ``page`` may arrive as a string (the default is "1"). Comparing a
    # str with the integer page count raises TypeError in Python 3, so
    # normalise it first and treat non-numeric values as a missing page.
    try:
        page = int(page)
    except (TypeError, ValueError):
        raise Http404("Page does not exists")

    if not doc_ep.exists():
        download(doc_ep)

    page_count = get_pagecount(doc_ep.url())
    # NOTE(review): page 0 passes this check; confirm whether
    # ``doc.page_eps`` is 0- or 1-indexed.
    if page > page_count or page < 0:
        raise Http404("Page does not exists")

    page_ep = doc.page_eps[page]

    logger.debug(f"Extract words from {page_ep.hocr_url()}")

    if not page_ep.hocr_exists():
        # check if HOCR data exists on S3
        if settings.S3 and page_ep.hocr_exists(ep=Endpoint.S3):
            # ok, it should be able to download it.
            download_hocr(page_ep)
        else:
            # normal scenario, HOCR is not yet ready
            raise Http404("HOCR data not yet ready.")

    # At this point local HOCR data should be available.
    hocr = Hocr(
        hocr_file_path=page_ep.hocr_url()
    )

    return HttpResponse(
        json.dumps({
            'hocr': hocr.good_json_words(),
            'hocr_meta': hocr.get_meta()
        }),
        content_type="application/json",
    )
    def update_text_field(self):
        """Refresh ``self.text`` from the page's .txt file and save it.

        Returns the text that was read from the file. An empty string
        is returned, and nothing is saved, when ``settings.OCR`` is
        off, or when the .txt file is missing and either
        ``settings.S3`` is off or ``storage.download`` reports failure.
        """
        if not settings.OCR:
            return ''

        content = ''
        logger.debug(f"Checking {self.txt_url}")

        if self.txt_exists:
            logger.debug(f"Page txt {self.txt_url} exists.")
        else:
            logger.debug(
                f"Missing page txt {self.txt_url}."
            )
            # No download to local media storage is attempted
            # when S3 is disabled.
            if not settings.S3:
                logger.info(f"S3 disabled")
                return ''

            downloaded = storage.download(self.page_ep)
            if not downloaded:
                # Nothing has been read yet, so text_len is logged as 0.
                logger.info(
                    f"document_log "
                    f" username={self.user.username}"
                    f" doc_id={self.document.id}"
                    f" page_num={self.number}"
                    f" text_len={len(content.strip())}"
                )
                return ''

        with open(self.txt_url) as txt_file:
            self.text = txt_file.read()

        self.save()
        logger.debug(
            f"text saved. len(page.text)=={len(self.text)}"
        )
        # Read the attribute back after saving, as the returned value.
        content = self.text
        logger.info(
            f"document_log "
            f" username={self.user.username}"
            f" doc_id={self.document.id}"
            f" page_num={self.number}"
            f" text_len={len(self.text.strip())}"
        )

        return content
Exemple #4
0
def ocr_page(self,
             user_id,
             document_id,
             file_name,
             page_num,
             lang,
             s3_upload=True,
             s3_download=True,
             test_local_alternative=None):
    """Run OCR on one page of a document and optionally upload the result.

    Builds a ``DocumentEp`` from ``user_id``, ``document_id`` and
    ``file_name`` and calls ``download`` when it does not exist
    locally. Only PDF files are processed, via ``ocr_page_pdf``; any
    other mime type is logged and skipped. When ``s3_upload`` is true
    and a page endpoint was produced, it is passed to ``upload_page``.

    Always returns ``True``.
    """
    # A bound task (bind=True) always receives the task instance
    # as its first argument (self).
    # https://celery.readthedocs.io/en/latest/userguide/tasks.html#bound-tasks
    log_ctx = (
        f"task_id={self.request.id}"
        f" user_id={user_id} doc_id={document_id}"
        f" page_num={page_num}"
    )
    logger.info(f"worker_log {log_ctx}")
    started_at = time.time()
    lang = lang.lower()

    doc_ep = DocumentEp(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    logger.debug(f"Received document_url={doc_ep.url(Endpoint.S3)}")

    if doc_ep.exists():
        logger.debug(f"Local copy {doc_ep.url()} exists.")
    else:
        logger.debug((f"doc_ep={doc_ep.url()} does not exists."
                      f"Processing with download."))
        download(
            doc_ep,
            s3_download=s3_download,
            test_local_alternative=test_local_alternative,
        )

    mime_type = mime.Mime(doc_ep.url())

    # Anything that is not a PDF is logged and skipped.
    if not mime_type.is_pdf():
        logger.info(f"worker_log {log_ctx} error=Unkown file type")
        return True

    ocr_started_at = time.time()
    page_ep = ocr_page_pdf(doc_ep=doc_ep, page_num=page_num, lang=lang)
    page_type = 'pdf'
    ocr_elapsed = time.time() - ocr_started_at
    logger.info(
        f"worker_log {log_ctx}"
        f" page_type=pdf page_ocr_time={ocr_elapsed:.2f}"
    )

    if page_ep and s3_upload:
        upload_page(page_ep)
        logger.info(
            f"worker_log {log_ctx}"
            f" uploaded={page_ep.url(Endpoint.S3)}"
        )

    total_elapsed = time.time() - started_at
    logger.info(
        f"worker_log success {log_ctx}"
        f" page_type={page_type}"
        f" total_exec_time={total_elapsed:.2f}"
    )

    return True