コード例 #1
0
def read_document(path,
                  lang='eng',
                  spellcheck=False,
                  event="ocr_progress_bar"):
    """Call Tesseract OCR to extract the text from a document.

    Args:
        path: Location of the document. Paths starting with ``/files/`` or
            ``/private/files/`` are resolved against the Frappe site path,
            any other path starting with ``/`` is treated as a local file,
            and everything else is downloaded as an external link. A path
            ending in ``.pdf`` (any letter case) is rasterised and
            recognised page by page; anything else is opened directly as an
            image.
        lang: Tesseract language code used for recognition.
        spellcheck: When true, the recognised text is passed through
            ``get_spellchecked_text`` before it is returned.
        event: Name of the realtime event on which progress updates are
            published to the current session user.

    Returns:
        The recognised text, or ``None`` when ``path`` is ``None``. Text
        extracted from a PDF is prefixed with a single space.

    If ``lang`` is not available, the user is notified through
    ``frappe.msgprint(..., raise_exception=True)``.
    """
    if path is None:
        # Nothing to read: bail out before loading the heavy OCR libraries.
        return None

    from PIL import Image
    import requests
    import tesserocr

    if not lang_available(lang):
        frappe.msgprint(frappe._(
            "The selected language is not available. Please contact your administrator."
        ),
                        raise_exception=True)

    def publish_progress(progress):
        # Single place for the realtime notification sent to the current user.
        frappe.publish_realtime(event, {"progress": progress},
                                user=frappe.session.user)

    publish_progress("0")

    is_remote = False
    if path.startswith('/assets/'):
        # from public folder
        fullpath = os.path.abspath(path)
    elif path.startswith('/files/'):
        # public file
        fullpath = frappe.get_site_path() + '/public' + path
    elif path.startswith('/private/files/'):
        # private file
        fullpath = frappe.get_site_path() + path
    elif path.startswith('/'):
        # local file (mostly for tests)
        fullpath = os.path.abspath(path)
    else:
        # external link: buffer the whole body so that both Pillow and Wand
        # get a seekable file object and the HTTP connection is released.
        fullpath = io.BytesIO(requests.get(path).content)
        is_remote = True

    ocr = frappe.get_doc("OCR Settings")

    with tesserocr.PyTessBaseAPI(lang=lang) as api:

        if path.lower().endswith('.pdf'):
            from wand.image import Image as wi

            # Wand's ``filename`` expects a path; a downloaded document has
            # to be handed over as a file object instead.
            if is_remote:
                pdf_source = {'file': fullpath}
            else:
                pdf_source = {'filename': fullpath}

            page_texts = []

            # https://stackoverflow.com/questions/43072050/pyocr-with-tesseract-runs-out-of-memory
            # ``convert`` returns a new image, which must be closed as well.
            with wi(resolution=ocr.pdf_resolution, **pdf_source) as pdf, \
                    pdf.convert('jpeg') as pdf_image:
                i = 0
                # Three progress steps are reported per page.
                size = len(pdf_image.sequence) * 3

                for img in pdf_image.sequence:
                    with wi(image=img) as img_page:
                        image_blob = img_page.make_blob('jpeg')
                        publish_progress([i, size])
                        i += 1

                        with Image.open(io.BytesIO(image_blob)) as image:
                            api.SetImage(image)
                            publish_progress([i, size])
                            i += 1

                            page_texts.append(api.GetUTF8Text())

                        publish_progress([i, size])
                        i += 1

            # Keep the historical leading space of the PDF result.
            text = " " + "".join(page_texts)

        else:
            with Image.open(fullpath) as image:
                api.SetImage(image)
                publish_progress([33, 100])

                text = api.GetUTF8Text()

            publish_progress([66, 100])

    if spellcheck:
        text = get_spellchecked_text(text, lang)

    publish_progress([100, 100])

    return text
コード例 #2
0
 def test_666_lang_available(self):
     """The language code "666" must be reported as not available."""
     is_available = lang_available("666")
     self.assertFalse(is_available)
コード例 #3
0
 def test_osd_lang_available(self):
     """The language code "osd" must be reported as available."""
     is_available = lang_available("osd")
     self.assertTrue(is_available)
コード例 #4
0
 def test_equ_lang_available(self):
     """The language code "equ" must be reported as available."""
     is_available = lang_available("equ")
     self.assertTrue(is_available)
コード例 #5
0
 def test_eng_lang_available(self):
     """The language code "eng" must be reported as available."""
     is_available = lang_available("eng")
     self.assertTrue(is_available)