def test_langs_error(self, popen):
    """Check that a failed language listing raises ``TesseractError``.

    The fake process returns ``b"No languages\\n"`` from its stdout and
    reports ``1`` from ``wait()``. The raised exception must expose that
    status and the message ``"unable to get languages"``.

    ``popen`` is presumably the patched ``subprocess.Popen`` mock supplied
    by a decorator outside this view -- confirm against the class.
    """
    # Configure the fake child process handed back by the patched Popen.
    fake_process = self.stdout
    fake_process.stdout.read.return_value = b"No languages\n"
    fake_process.wait.return_value = 1
    popen.return_value = fake_process

    with self.assertRaises(tesseract.TesseractError) as raised:
        tesseract.get_available_languages()

    error = raised.exception
    self.assertEqual(error.status, 1)
    self.assertEqual("unable to get languages", error.message)

    # Exactly one process must have been spawned, with stderr redirected
    # into stdout.
    expected_command = ["tesseract", "--list-langs"]
    popen.assert_called_once_with(
        expected_command,
        startupinfo=None,
        creationflags=0,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
def test_langs(self):
    """Check that the language data required by the tests is installed.

    Queries ``tesseract.get_available_languages()`` and requires the
    English, French and Japanese codes to be present in the result.
    """
    langs = tesseract.get_available_languages()

    required = (
        ("eng", "English"),
        ("fra", "French"),
        ("jpn", "Japanese"),
    )
    for code, name in required:
        # assertIn (rather than assertTrue(x in y)) makes a failure report
        # show what was actually found in ``langs``.
        self.assertIn(
            code,
            langs,
            "{} training does not appear to be installed."
            " (required for the tests)".format(name),
        )
def test_langs(self, popen):
    """Check that the language listing printed by tesseract is parsed.

    The fake process output is a header line followed by four language
    codes; each code must be present in the returned value, and the
    process must have been spawned exactly once with the expected
    arguments.

    ``popen`` is presumably the patched ``subprocess.Popen`` mock supplied
    by a decorator outside this view -- confirm against the class.
    """
    expected_langs = ("eng", "fra", "jpn", "osd")
    listing = (
        "List of available languages (4):\n"
        "eng\n"
        "fra\n"
        "jpn\n"
        "osd\n"
    )

    # The code under test reads bytes from the process, so encode the text.
    fake_process = self.stdout
    fake_process.stdout.read.return_value = listing.encode()
    popen.return_value = fake_process

    found = tesseract.get_available_languages()

    for code in expected_langs:
        self.assertIn(code, found)

    popen.assert_called_once_with(
        ["tesseract", "--list-langs"],
        startupinfo=None,
        creationflags=0,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
#!/usr/bin/env python
# coding=utf-8
"""Manual check script for pyocr.

Prints the languages reported by the tesseract backend, selects the first
available OCR tool, and runs it on ``test.png`` using the French language
data. Exits with status 1 when no OCR tool is available.
"""
import sys

from PIL import Image

from pyocr import pyocr
from pyocr import tesseract
from pyocr.builders import TextBuilder

if __name__ == '__main__':
    # print() with a single argument behaves the same on Python 2 and 3.
    print(tesseract.get_available_languages())

    tools = pyocr.get_available_tools()
    if not tools:
        print("No OCR tool found")
        sys.exit(1)

    print("Using '%s'" % (tools[0].get_name()))

    # NOTE(review): the recognised text is discarded here; print or inspect
    # the return value if the output itself matters.
    tools[0].image_to_string(
        Image.open('test.png'),
        lang='fra',
        builder=TextBuilder(),
    )
# ---------------------------------------------------------------------------
# Module 5: Convert PDFs to text data
# ---------------------------------------------------------------------------

from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io

# The tesseract backend is used directly as the OCR tool. The alternative of
# taking the first tool pyocr detects would be:
#     tool = pyocr.get_available_tools()[0]
from pyocr import tesseract as tool

# First language reported by the backend (presumably the OCR language used
# further down -- confirm against the rest of the module).
lang = tool.get_available_languages()[0]

import time
import datetime
import pandas as pd

# Remove the cap on the number of DataFrame columns shown when printing.
pd.set_option('display.max_columns', None)

import numpy as np
import shutil
import os
import sys
import local_filepaths as fp

# ---------------------------------------------------------------------------
# Ascertain which PDFs are still to be processed
# ---------------------------------------------------------------------------