def read_document(path, lang='eng', spellcheck=False, event="ocr_progress_bar"):
    """Call Tesseract OCR to extract the text from a document.

    Args:
        path: Location of the document. Paths starting with ``/assets/``,
            ``/files/``, ``/private/files/`` or ``/`` are resolved on disk;
            anything else is treated as a URL and downloaded. A path ending
            in ``.pdf`` (case-insensitive) is rasterised page by page before
            recognition; everything else is opened as a single image.
        lang: Tesseract language code handed to ``PyTessBaseAPI``.
        spellcheck: When true, the recognised text is passed through
            ``get_spellchecked_text`` before being returned.
        event: Name of the realtime event used to publish progress updates
            to the current session user.

    Returns:
        The recognised text, or ``None`` when ``path`` is ``None``.

    Raises:
        Whatever ``frappe.msgprint(..., raise_exception=True)`` raises when
        ``lang`` is not available.
    """
    from PIL import Image
    import requests
    import tesserocr

    if path is None:
        return None

    if not lang_available(lang):
        frappe.msgprint(frappe._(
            "The selected language is not available. Please contact your administrator."
        ), raise_exception=True)

    def publish(progress):
        # Single place for the realtime progress notification.
        frappe.publish_realtime(event, {"progress": progress}, user=frappe.session.user)

    publish("0")

    remote_content = None
    if path.startswith('/assets/'):
        # from public folder
        # NOTE(review): abspath() leaves an already-absolute path untouched
        # (apart from normalisation) - confirm this is the intended location.
        fullpath = os.path.abspath(path)
    elif path.startswith('/files/'):
        # public file
        fullpath = frappe.get_site_path() + '/public' + path
    elif path.startswith('/private/files/'):
        # private file
        fullpath = frappe.get_site_path() + path
    elif path.startswith('/'):
        # local file (mostly for tests)
        fullpath = os.path.abspath(path)
    else:
        # external link: download the whole body up front so the connection
        # is released and both PIL and wand get plain, seekable bytes.
        # NOTE(review): the URL is fetched as given; if it can come from an
        # untrusted user this is an SSRF vector - confirm callers validate it.
        remote_content = requests.get(path).content
        fullpath = io.BytesIO(remote_content)

    ocr = frappe.get_doc("OCR Settings")

    with tesserocr.PyTessBaseAPI(lang=lang) as api:
        if path.lower().endswith('.pdf'):
            from wand.image import Image as wi

            # wand's ``filename`` must be a path; downloaded PDFs go in as a blob.
            if remote_content is None:
                pdf_source = {"filename": fullpath}
            else:
                pdf_source = {"blob": remote_content}

            page_texts = []
            # https://stackoverflow.com/questions/43072050/pyocr-with-tesseract-runs-out-of-memory
            with wi(resolution=ocr.pdf_resolution, **pdf_source) as pdf:
                # convert() returns a new image that must be closed as well.
                with pdf.convert('jpeg') as pdf_image:
                    # Three progress ticks per page: rendered, loaded, recognised.
                    size = len(pdf_image.sequence) * 3
                    for page_no, img in enumerate(pdf_image.sequence):
                        step = page_no * 3
                        with wi(image=img) as img_page:
                            image_blob = img_page.make_blob('jpeg')
                        publish([step, size])

                        with Image.open(io.BytesIO(image_blob)) as image:
                            api.SetImage(image)
                            publish([step + 1, size])
                            page_texts.append(api.GetUTF8Text())
                        publish([step + 2, size])

            # Keep the historical leading space in front of the page texts.
            text = " " + "".join(page_texts)
        else:
            with Image.open(fullpath) as image:
                api.SetImage(image)
                publish([33, 100])
                text = api.GetUTF8Text()
            publish([66, 100])

    if spellcheck:
        text = get_spellchecked_text(text, lang)

    publish([100, 100])

    return text
def test_666_lang_available(self):
    """lang_available must report the code "666" as not available."""
    is_available = lang_available("666")
    self.assertFalse(is_available)
def test_osd_lang_available(self):
    """lang_available must report the code "osd" as available."""
    is_available = lang_available("osd")
    self.assertTrue(is_available)
def test_equ_lang_available(self):
    """lang_available must report the code "equ" as available."""
    is_available = lang_available("equ")
    self.assertTrue(is_available)
def test_eng_lang_available(self):
    """lang_available must report the code "eng" as available."""
    is_available = lang_available("eng")
    self.assertTrue(is_available)