Beispiel #1
0
    def test_extract_words_from(self):
        """
        The words and metadata extracted from the page-1.hocr fixture
        must be serializable with json.dumps().
        """
        hocr_file = os.path.join(BASE_DIR, "data", "page-1.hocr")
        hocr = Hocr(hocr_file_path=hocr_file)

        # Build the payload outside of the try block, so that a TypeError
        # raised by Hocr itself is not mistaken for a serialization problem.
        payload = {
            'hocr': hocr.good_json_words(),
            'hocr_meta': hocr.get_meta()
        }

        try:
            json.dumps(payload)
        except TypeError:
            self.fail("Unserializable result")
Beispiel #2
0
    def test_extract_img_size(self):
        """
        Hocr built from the page-1.hocr fixture reports the expected
        width and height.
        """
        fixture_path = os.path.join(BASE_DIR, "data", "page-1.hocr")
        parsed = Hocr(hocr_file_path=fixture_path)

        # Expected values for the page-1.hocr fixture.
        self.assertEqual(parsed.width, 1240)
        self.assertEqual(parsed.height, 1754)
Beispiel #3
0
def hocr(request, id, step=None, page="1"):
    """
    Return the HOCR words and metadata of one document page as JSON.

    ``step`` is only used in the debug log message.

    ``page`` is converted to an integer before use: its default value is
    the string "1", and comparing a string with the integer page count
    raises TypeError on Python 3.

    Raises Http404 when the document does not exist, when ``page`` is not
    a number or fails the range check, or when no HOCR data is available
    for the page. Returns HttpResponseForbidden when the requesting user
    has no read permission on the document.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_ep = doc.doc_ep

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    # Accept both integers and numeric strings (e.g. the "1" default);
    # anything else cannot designate a page.
    try:
        page = int(page)
    except (TypeError, ValueError):
        raise Http404("Page does not exists")

    if not doc_ep.exists():
        download(doc_ep)

    page_count = get_pagecount(doc_ep.url())
    # NOTE(review): page 0 passes this check - confirm whether
    # doc.page_eps is 0- or 1-indexed.
    if page > page_count or page < 0:
        raise Http404("Page does not exists")

    page_ep = doc.page_eps[page]

    logger.debug(f"Extract words from {page_ep.hocr_url()}")

    if not page_ep.hocr_exists():
        # check if HOCR data exists on S3
        if settings.S3 and page_ep.hocr_exists(ep=Endpoint.S3):
            # ok, it should be able to download it.
            download_hocr(page_ep)
        else:
            # normal scenario, HOCR is not yet ready
            raise Http404("HOCR data not yet ready.")

    # At this point local HOCR data should be available.
    hocr = Hocr(
        hocr_file_path=page_ep.hocr_url()
    )

    return HttpResponse(
        json.dumps({
            'hocr': hocr.good_json_words(),
            'hocr_meta': hocr.get_meta()
        }),
        content_type="application/json",
    )
Beispiel #4
0
def hocr(request, id, step=None, page="1"):
    """
    Return the HOCR words and metadata of one document page as JSON.

    ``step`` is only used in the debug log message.

    ``page`` is converted to an integer before use: its default value is
    the string "1", and comparing a string with the integer page count
    raises TypeError on Python 3.

    Raises Http404 when the document does not exist, when the document
    file or the page's HOCR file is missing from local storage, or when
    ``page`` is not a number or fails the range check. Returns
    HttpResponseForbidden when the requesting user has no read permission
    on the document.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_path = doc.path

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    # Accept both integers and numeric strings (e.g. the "1" default);
    # anything else cannot designate a page.
    try:
        page = int(page)
    except (TypeError, ValueError):
        raise Http404("Page does not exists")

    # document absolute path
    doc_abs_path = default_storage.abspath(doc_path.url())
    if not os.path.exists(doc_abs_path):
        raise Http404("HOCR data not yet ready.")

    page_count = get_pagecount(doc_abs_path)
    # NOTE(review): page 0 passes this check - confirm whether
    # doc.page_paths is 0- or 1-indexed.
    if page > page_count or page < 0:
        raise Http404("Page does not exists")

    page_path = doc.page_paths[page]
    hocr_abs_path = default_storage.abspath(page_path.hocr_url())

    logger.debug(f"Extract words from {hocr_abs_path}")

    if not os.path.exists(hocr_abs_path):
        raise Http404("HOCR data not yet ready.")

    # At this point local HOCR data should be available.
    hocr = Hocr(
        hocr_file_path=hocr_abs_path
    )

    return HttpResponse(
        json.dumps({
            'hocr': hocr.good_json_words(),
            'hocr_meta': hocr.get_meta()
        }),
        content_type="application/json",
    )
Beispiel #5
0
    def test_empty_file_hocr(self):
        """
        For an empty file good_json_words() must return an empty list
        and get_meta() must return zeroed counters and dimensions.
        """
        # The context manager closes (and thereby removes) the temporary
        # file even when one of the assertions below fails.
        with tempfile.NamedTemporaryFile(mode="r+t") as file:
            hocr = Hocr(hocr_file_path=file.name)

            self.assertEqual(hocr.good_json_words(), [])
            meta = hocr.get_meta()
            self.assertEqual(
                meta, {
                    'count_all': 0,
                    'bad_words': [],
                    'count_good': 0,
                    'count_bad': 0,
                    'count_non_empty': 0,
                    'count_low_wconf': 0,
                    'width': 0,
                    'height': 0,
                    'min_wconf': 30
                })