def preview(request, id, step=None, page="1"):
    """
    Return the preview image of a single document page as a JPEG response.

    The document is looked up by ``id``; a missing document results in
    ``Http404``. Users lacking ``Access.PERM_READ`` on the document are
    redirected to ``core:index``. If the preview image is not yet present
    on disk it is generated via ``extract_img`` before being served.

    Errors raised while opening or reading the image file propagate to
    the caller.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    # Guard clause: no read permission means no preview.
    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    page_path = doc.get_page_path(page_num=page, step=Step(step))
    img_abs_path = default_storage.abspath(page_path.img_url())

    image_is_missing = not os.path.exists(img_abs_path)
    if image_is_missing:
        logger.debug(
            f"Preview image {img_abs_path} does not exists. Generating..."
        )
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    with open(img_abs_path, "rb") as image_file:
        return HttpResponse(image_file.read(), content_type="image/jpeg")
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Extract preview images, text and hocr output for one page of a PDF.

    doc_path is an mglib.path.DocumentPath instance

    Processing order:

    1. ``extract_img`` for every step yielded by ``Steps()``, so that
       preview images are available quickly;
    2. ``extract_txt`` for the page (with ``Step(1)``);
    3. ``extract_hocr`` for every step that is not a thumbnail.

    Returns the ``PagePath`` used for the extraction, or ``None`` when
    ``page_num`` is greater than the document's page count.
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process. Returning None explicitly avoids the
        # UnboundLocalError the unguarded ``return page_url`` used to
        # raise in this case.
        return None

    # first quickly generate preview images
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # A fresh PagePath puts the step back to Step(1) for text extraction.
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    Extract preview images, text and hocr output for one page of a PDF,
    sending a notification after each stage.

    doc_path is an mglib.path.DocumentPath instance

    An optional ``file_name`` keyword argument overrides the file name
    reported in notifications; when absent or empty,
    ``doc_path.file_name`` is used. All remaining keyword arguments are
    forwarded unchanged to the ``notify_*`` calls.

    Processing order:

    1. ``extract_img`` for every step yielded by ``Steps()``, followed by
       ``notify_pre_page_ocr``;
    2. ``extract_txt`` (with ``Step(1)``), followed by ``notify_txt_ready``;
    3. ``extract_hocr`` for every non-thumbnail step, each followed by
       ``notify_hocr_ready``.

    On success returns ``mglib.path.PagePath`` instance. Returns ``None``
    when ``page_num`` is greater than the document's page count.
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)
    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process. Returning None explicitly avoids the
        # UnboundLocalError the unguarded ``return page_path`` used to
        # raise in this case.
        return None

    # first quickly generate preview images
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    # Use the resolved file_name (not doc_path.file_name) so that this
    # notification agrees with the txt/hocr notifications below.
    notify_pre_page_ocr(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    # A fresh PagePath puts the step back to Step(1) for text extraction.
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    for step in Steps():
        page_path.step = step
        if not step.is_thumbnail:
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs
            )

    return page_path
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Extract preview images, text and hocr output for one page of a PDF.

    doc_path is an mglib.path.DocumentPath instance

    The page image and text are extracted first with ``Step(1)``; then,
    for every step yielded by ``Steps()``, the image is extracted and,
    for non-thumbnail steps, ``extract_hocr`` is run.

    Returns the ``PagePath`` used for the extraction, or ``None`` when
    ``page_num`` is greater than the document's page count.
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    if page_num > page_count:
        # Nothing to process. Returning None explicitly guards against
        # reaching ``return page_url`` with the name never assigned.
        return None

    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count,
    )
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url