def test_extract_words_from(self):
    """Check that data extracted from page-1.hocr is JSON-serializable.

    Loads ``data/page-1.hocr`` (relative to ``BASE_DIR``), collects the
    output of ``good_json_words()`` and ``get_meta()`` and fails the test
    if ``json.dumps`` raises ``TypeError`` on that payload.
    """
    hocr_file = os.path.join(BASE_DIR, "data", "page-1.hocr")
    hocr = Hocr(hocr_file_path=hocr_file)

    # Build the payload outside the ``try`` so that only a serialization
    # TypeError is reported as "Unserializable result"; a TypeError raised
    # by the extraction itself surfaces as a regular test error.
    payload = {
        'hocr': hocr.good_json_words(),
        'hocr_meta': hocr.get_meta(),
    }

    try:
        json.dumps(payload)
    except TypeError:
        self.fail("Unserializable result")
def hocr(request, id, step=None, page="1"):
    """Return the HOCR words and metadata of one document page as JSON.

    Args:
        request: incoming HTTP request; ``request.user`` must hold
            ``Access.PERM_READ`` on the document.
        id: primary key used to look up the ``Document``.
        step: only logged here; not otherwise used by this view.
        page: page number, given either as an ``int`` or a numeric string.

    Returns:
        An ``HttpResponse`` with content type ``application/json`` whose
        body holds the keys ``hocr`` and ``hocr_meta``, or an
        ``HttpResponseForbidden`` when the user lacks read permission.

    Raises:
        Http404: if the document does not exist, the page number is not
            numeric or fails the range check, or HOCR data is not
            available yet.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_ep = doc.doc_ep

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    # The default value of ``page`` is a string. Ordering a str against the
    # page count (assumed to be an int) raises TypeError on Python 3, so
    # normalize to int before the range check.
    try:
        page = int(page)
    except (TypeError, ValueError):
        raise Http404("Page does not exists")

    if not doc_ep.exists():
        download(doc_ep)

    page_count = get_pagecount(doc_ep.url())
    # NOTE(review): page 0 passes this check; confirm whether
    # ``doc.page_eps`` is 0- or 1-based.
    if page > page_count or page < 0:
        raise Http404("Page does not exists")

    page_ep = doc.page_eps[page]
    logger.debug(f"Extract words from {page_ep.hocr_url()}")

    if not page_ep.hocr_exists():
        # check if HOCR data exists on S3
        if settings.S3 and page_ep.hocr_exists(ep=Endpoint.S3):
            # ok, it should be able to download it.
            download_hocr(page_ep)
        else:
            # normal scenario, HOCR is not yet ready
            raise Http404("HOCR data not yet ready.")

    # At this point local HOCR data should be available.
    page_hocr = Hocr(hocr_file_path=page_ep.hocr_url())

    return HttpResponse(
        json.dumps({
            'hocr': page_hocr.good_json_words(),
            'hocr_meta': page_hocr.get_meta(),
        }),
        content_type="application/json",
    )
def hocr(request, id, step=None, page="1"):
    """Return the HOCR words and metadata of one document page as JSON.

    Args:
        request: incoming HTTP request; ``request.user`` must hold
            ``Access.PERM_READ`` on the document.
        id: primary key used to look up the ``Document``.
        step: only logged here; not otherwise used by this view.
        page: page number, given either as an ``int`` or a numeric string.

    Returns:
        An ``HttpResponse`` with content type ``application/json`` whose
        body holds the keys ``hocr`` and ``hocr_meta``, or an
        ``HttpResponseForbidden`` when the user lacks read permission.

    Raises:
        Http404: if the document does not exist, the document file or its
            HOCR file is missing from local storage, or the page number
            is not numeric or fails the range check.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_path = doc.path

    if not request.user.has_perm(Access.PERM_READ, doc):
        return HttpResponseForbidden()

    # The default value of ``page`` is a string. Ordering a str against the
    # page count (assumed to be an int) raises TypeError on Python 3, so
    # normalize to int before the range check.
    try:
        page = int(page)
    except (TypeError, ValueError):
        raise Http404("Page does not exists")

    # document absolute path
    doc_abs_path = default_storage.abspath(doc_path.url())
    if not os.path.exists(doc_abs_path):
        raise Http404("HOCR data not yet ready.")

    page_count = get_pagecount(doc_abs_path)
    # NOTE(review): page 0 passes this check; confirm whether
    # ``doc.page_paths`` is 0- or 1-based.
    if page > page_count or page < 0:
        raise Http404("Page does not exists")

    page_path = doc.page_paths[page]
    hocr_abs_path = default_storage.abspath(page_path.hocr_url())
    logger.debug(f"Extract words from {hocr_abs_path}")

    if not os.path.exists(hocr_abs_path):
        raise Http404("HOCR data not yet ready.")

    # At this point local HOCR data should be available.
    page_hocr = Hocr(hocr_file_path=hocr_abs_path)

    return HttpResponse(
        json.dumps({
            'hocr': page_hocr.good_json_words(),
            'hocr_meta': page_hocr.get_meta(),
        }),
        content_type="application/json",
    )
def test_empty_file_hocr(self):
    """
    For an empty HOCR file, good_json_words() returns an empty list and
    get_meta() reports zero counts and dimensions, no bad words and a
    min_wconf of 30.
    """
    # The context manager closes the temporary file even when one of the
    # assertions below fails; the original only closed it on success.
    with tempfile.NamedTemporaryFile(mode="r+t") as tmp_file:
        hocr = Hocr(hocr_file_path=tmp_file.name)

        self.assertEqual(hocr.good_json_words(), [])
        self.assertEqual(
            hocr.get_meta(),
            {
                'count_all': 0,
                'bad_words': [],
                'count_good': 0,
                'count_bad': 0,
                'count_non_empty': 0,
                'count_low_wconf': 0,
                'width': 0,
                'height': 0,
                'min_wconf': 30,
            },
        )