def test_langs_error(self, popen):
    """Check that a failing ``--list-langs`` run raises TesseractError.

    ``popen`` is presumably a patched ``subprocess.Popen`` injected by a
    decorator outside this view (TODO confirm); ``self.stdout`` is the
    stand-in process object it is configured to return.
    """
    # Simulated process: emits an error line and exits with status 1.
    fake_process = self.stdout
    fake_process.stdout.read.return_value = b"No languages\n"
    fake_process.wait.return_value = 1
    popen.return_value = fake_process

    with self.assertRaises(tesseract.TesseractError) as raised:
        tesseract.get_available_languages()

    error = raised.exception
    self.assertEqual(error.status, 1)
    self.assertEqual("unable to get languages", error.message)

    # Exactly one process launch is expected, with these arguments.
    expected_command = ["tesseract", "--list-langs"]
    expected_options = {
        "startupinfo": None,
        "creationflags": 0,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
    }
    popen.assert_called_once_with(expected_command, **expected_options)
Example #2
0
 def test_langs(self):
     """Check that the language data required by the tests is available.

     Calls ``tesseract.get_available_languages()`` (no mock is configured
     inside this method) and requires the English, French and Japanese
     codes to be present in the result.
     """
     langs = tesseract.get_available_languages()
     required = (
         ("eng", "English"),
         ("fra", "French"),
         ("jpn", "Japanese"),
     )
     for code, name in required:
         # assertIn reports the missing code together with the contents
         # of ``langs`` on failure, whereas assertTrue(code in langs)
         # could only say "False is not true".
         self.assertIn(
             code, langs,
             "%s training does not appear to be installed."
             " (required for the tests)" % name)
Example #3
0
 def test_langs(self):
     """Verify that the eng, fra and jpn language codes are reported.

     The list comes from ``tesseract.get_available_languages()``; each
     missing code fails the test with a message naming the language.
     """
     langs = tesseract.get_available_languages()
     not_installed = ("%s training does not appear to be installed."
                      " (required for the tests)")
     # assertIn (rather than assertTrue on an ``in`` expression) makes a
     # failure show which code was looked up and what ``langs`` held.
     self.assertIn("eng", langs, not_installed % "English")
     self.assertIn("fra", langs, not_installed % "French")
     self.assertIn("jpn", langs, not_installed % "Japanese")
Example #4
0
 def test_langs(self, popen):
     """Check that language codes are extracted from ``--list-langs`` output.

     ``popen`` is presumably a patched ``subprocess.Popen`` injected by a
     decorator outside this view (TODO confirm). It is set up to return
     ``self.stdout``, whose output is a canned four-language listing.
     """
     expected_langs = ("eng", "fra", "jpn", "osd")
     listing = "".join((
         "List of available languages (4):\n",
         "eng\n",
         "fra\n",
         "jpn\n",
         "osd\n",
     ))
     fake_process = self.stdout
     fake_process.stdout.read.return_value = listing.encode()
     popen.return_value = fake_process

     found = tesseract.get_available_languages()

     for code in expected_langs:
         self.assertIn(code, found)
     # Exactly one process launch is expected, with these arguments.
     popen.assert_called_once_with(
         ["tesseract", "--list-langs"],
         startupinfo=None,
         creationflags=0,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
     )
Example #5
0
#!/usr/bin/env python
# coding=utf-8
"""Minimal pyocr demo: list tesseract languages, then OCR ``test.png``."""

from PIL import Image
import sys
from pyocr import builders
from pyocr import pyocr
from pyocr import tesseract

if __name__ == '__main__':
    # Call form of print: with a single argument it behaves the same on
    # Python 2 and Python 3, unlike the Python 2-only print statement.
    print(tesseract.get_available_languages())
    tools = pyocr.get_available_tools()[:]
    if not tools:
        print("No OCR tool found")
        sys.exit(1)
    print("Using '%s'" % (tools[0].get_name()))
    # TextBuilder lives in pyocr.builders; the bare name was never imported
    # and raised NameError. The returned text is discarded, as before.
    tools[0].image_to_string(Image.open('test.png'),
                             lang='fra',
                             builder=builders.TextBuilder())
# # # # # #
# # # # # #
# # # # # #
# Module 5: Convert PDFs to text data
# # # # # #
# # # # # #
# # # # # #

# Two different Image classes are needed: wand's keeps the plain name,
# PIL's is aliased to PI so the two do not collide.
from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io

# Use pyocr's tesseract wrapper directly as the OCR tool; the trailing
# comment records the generic alternative of taking the first tool found.
from pyocr import tesseract as tool  # tool = pyocr.get_available_tools()[0]
# Take the first language the tool reports. Runs at import time, before the
# remaining imports; presumably raises IndexError if none is reported.
lang = tool.get_available_languages()[0]
import time
import datetime

import pandas as pd
# None lifts pandas' limit on how many columns are displayed.
pd.set_option('display.max_columns', None)
import numpy as np
import shutil
import os
import sys
# Project-local module, not visible here; presumably holds filesystem
# paths used later in the script. TODO confirm.
import local_filepaths as fp

# # # # # #
#  Ascertain which PDFs are still to be processed
# # # # # #