def check_options(options):
    """Validate the installed Ghostscript against the requested options.

    Checks that Ghostscript is present and at least version 9.15, rejects
    the releases listed as having regressions, warns when an old Ghostscript
    is combined with non-Latin OCR languages, and normalizes
    ``options.output_type`` from ``'pdfa'`` to ``'pdfa-2'``.

    Args:
        options: The options namespace. Reads ``languages`` and
            ``output_type``; may rewrite ``output_type``.

    Raises:
        MissingDependencyError: If Ghostscript is an unsupported release, or
            if ``pdfa-3`` output is requested with Ghostscript older than
            9.19. ``check_external_program`` may also raise it.
    """

    def version_key(version):
        # Compare versions numerically. Plain string comparison orders
        # '10.0' before '9.20', which misreports Ghostscript 10.x as old.
        import re

        return tuple(int(part) for part in re.findall(r'\d+', version))

    gs_version = ghostscript.version()
    check_external_program(
        program='gs',
        package='ghostscript',
        version_checker=gs_version,
        need_version='9.15',  # limited by Travis CI / Ubuntu 14.04 backports
    )
    if gs_version in ('9.24', '9.51'):
        raise MissingDependencyError(
            f"Ghostscript {gs_version} contains serious regressions and is not "
            "supported. Please upgrade to a newer version, or downgrade to the "
            "previous version.")

    gs_key = version_key(gs_version)

    # We have these constraints to check for.
    # 1. Ghostscript < 9.20 mangles multibyte Unicode
    # 2. hocr doesn't work on non-Latin languages (so don't select it)
    is_latin = options.languages.issubset(HOCR_OK_LANGS)
    if gs_key < (9, 20) and options.output_type != 'pdf' and not is_latin:
        # https://bugs.ghostscript.com/show_bug.cgi?id=696874
        # Ghostscript < 9.20 fails to encode multibyte characters properly
        log.warning(
            f"The installed version of Ghostscript ({gs_version}) does not work "
            "correctly with the OCR languages you specified. Use --output-type pdf or "
            "upgrade to Ghostscript 9.20 or later to avoid this issue.")

    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'

    # Reuse the version already fetched instead of querying Ghostscript again.
    if options.output_type == 'pdfa-3' and gs_key < (9, 19):
        raise MissingDependencyError(
            "--output-type pdfa-3 requires Ghostscript 9.19 or later")
def get_languages():
    """Return the set of languages reported by ``tesseract --list-langs``.

    The first line of the output is treated as a header and discarded; each
    remaining non-blank line is stripped and collected as a language name.

    Returns:
        A set of language name strings.

    Raises:
        MissingDependencyError: If the command exits with an error, prints
            nothing, or prints any line starting with ``Error``.
    """

    def lang_error(output):
        return (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        ) + (output or '')

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e
    output = proc.stdout

    lines = output.splitlines()
    # Empty output would otherwise fail the header unpacking below with a
    # bare ValueError; report it as the same dependency problem instead.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))

    _header, *rest = lines
    # Skip blank lines so the result never contains an empty language name.
    return {lang.strip() for lang in rest if lang.strip()}
def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
    """Choose the input and output paths for an unpaper run.

    Images whose mode is not ``1``, ``L`` or ``RGB`` are converted: two-color
    palette images to ``1``, everything else to ``RGB``. If the image was
    converted, or the input file is not a ``.png``, it is saved to
    ``tmpdir / 'input.png'``; otherwise the original file is used directly.

    Args:
        tmpdir: Directory in which the working files are placed.
        input_file: The image to be processed.

    Returns:
        A tuple ``(input_png, output_pnm)``. The output path's suffix
        (``.pbm``, ``.pgm`` or ``.ppm``) is chosen from the image mode.

    Raises:
        MissingDependencyError: If the image cannot be converted, or its
            mode is still unsupported after conversion.
    """
    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
    with Image.open(input_file) as im:
        im_modified = False
        if im.mode not in SUFFIXES:
            log.info("Converting image to other colorspace")
            try:
                if im.mode == 'P' and len(im.getcolors()) == 2:
                    im = im.convert(mode='1')
                else:
                    im = im.convert(mode='RGB')
            except IOError as e:
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode) from e
            else:
                im_modified = True
        try:
            suffix = SUFFIXES[im.mode]
        except KeyError as e:
            # Bind the KeyError here: the name from the IOError handler above
            # is not available at this point.
            raise MissingDependencyError(
                "Failed to convert image to a supported format.") from e

        if im_modified or input_file.suffix != '.png':
            input_png = tmpdir / 'input.png'
            im.save(input_png, format='PNG', compress_level=1)
        else:
            # No changes, PNG input, just use the file we already have
            input_png = input_file
        output_pnm = tmpdir / f'output{suffix}'
    return input_png, output_pnm
def run(input_file, output_file, dpi, mode_args):
    """Run unpaper on an image and save the result with the requested DPI.

    The input is converted to mode ``1`` or ``RGB`` if needed, written into
    a temporary directory, processed by unpaper, and the output is re-saved
    to ``output_file`` with its DPI set to ``dpi``.

    Args:
        input_file: The image to process.
        output_file: Where the processed image is saved.
        dpi: Resolution passed to unpaper and stamped on the output.
        mode_args: Extra unpaper command-line arguments (a list).

    Raises:
        MissingDependencyError: If the image cannot be converted to a
            supported mode.
        CalledProcessError: If unpaper exits with an error.
        SubprocessOutputError: If unpaper's output file cannot be opened
            or saved.
    """
    args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + mode_args

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    with TemporaryDirectory() as tmpdir, Image.open(input_file) as im:
        if im.mode not in SUFFIXES:
            log.info("Converting image to other colorspace")
            try:
                if im.mode == 'P' and len(im.getcolors()) == 2:
                    im = im.convert(mode='1')
                else:
                    im = im.convert(mode='RGB')
            except IOError as e:
                # The enclosing ``with`` closes the image; no explicit close.
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode) from e

        try:
            suffix = SUFFIXES[im.mode]
        except KeyError as e:
            # Bind the KeyError here: the name from the IOError handler above
            # is not available at this point.
            raise MissingDependencyError(
                "Failed to convert image to a supported format.") from e

        input_pnm = Path(tmpdir) / f'input{suffix}'
        output_pnm = Path(tmpdir) / f'output{suffix}'
        im.save(input_pnm, format='PPM')

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.extend([os.fspath(input_pnm), os.fspath(output_pnm)])
        try:
            proc = external_run(
                args_unpaper,
                check=True,
                close_fds=True,
                universal_newlines=True,
                stderr=STDOUT,
                cwd=tmpdir,
                stdout=PIPE,
            )
        except CalledProcessError as e:
            log.debug(e.output)
            # Bare re-raise keeps the traceback and avoids making the
            # exception its own cause.
            raise
        else:
            log.debug(proc.stdout)
            # unpaper sets dpi to 72; fix this
            try:
                with Image.open(output_pnm) as imout:
                    imout.save(output_file, dpi=(dpi, dpi))
            except OSError:  # FileNotFoundError is a subclass of OSError
                raise SubprocessOutputError(
                    "unpaper: failed to produce the expected output file. "
                    + " Called with: "
                    + str(args_unpaper)) from None
def check_external_program(
    *,
    program: str,
    package: str,
    version_checker: Union[str, Callable],
    need_version: str,
    required_for: Optional[str] = None,
    recommended=False,
    version_parser: Type[Version] = LooseVersion,
):
    """Verify that an external program is installed and recent enough.

    Args:
        program: The name of the program to test.
        package: The name of a software package that typically supplies this
            program. Usually the same as program.
        version_checker: Either the installed version as a string, or a
            callable without arguments that retrieves it.
        need_version: The minimum required version.
        required_for: The name of an argument or feature that requires this
            program.
        recommended: If true, a missing or outdated program is reported but
            no exception is raised.
        version_parser: A class used to parse and compare version numbers,
            for programs whose versions do not follow standard conventions.

    Raises:
        MissingDependencyError: If the program is missing or too old and
            ``recommended`` is false.
    """

    def strip_v(text):
        # A single leading 'v' (as in 'v1.2') is dropped before comparing.
        return text[1:] if text.startswith('v') else text

    try:
        found_version = (
            version_checker() if callable(version_checker) else version_checker
        )
    except (CalledProcessError, FileNotFoundError, MissingDependencyError):
        _error_missing_program(program, package, required_for, recommended)
        if recommended:
            return
        raise MissingDependencyError(program)

    found_version = strip_v(found_version)
    need_version = strip_v(need_version)

    if found_version:
        if version_parser(found_version) < version_parser(need_version):
            _error_old_version(
                program, package, need_version, found_version, required_for
            )
            if not recommended:
                raise MissingDependencyError(program)

    log.debug('Found %s %s', program, found_version)
def check_options(options):
    """Validate Tesseract and the Tesseract-related options.

    Requires Tesseract 4.0.0 or later, resolves an ``'auto'`` PDF renderer
    to ``'sandwich'``, and checks that the sandwich renderer is usable. Logs
    warnings for user word/pattern files that would be ignored and for page
    segmentation modes 0 and 2.

    Args:
        options: The options namespace. Reads ``pdf_renderer``,
            ``languages``, ``user_words``, ``user_patterns`` and
            ``tesseract_pagesegmode``; may rewrite ``pdf_renderer``.

    Raises:
        MissingDependencyError: If the sandwich renderer is selected but
            Tesseract lacks ``textonly_pdf`` support.
            ``check_external_program`` may also raise it.
    """
    check_external_program(
        program='tesseract',
        package={'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
        need_version='4.0.0',  # using backport for Travis CI
    )

    # Decide on what renderer to use
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    if options.pdf_renderer == 'sandwich':
        requested_langs = set(options.languages)
        if not tesseract.has_textonly_pdf(requested_langs):
            raise MissingDependencyError(
                "You are using an alpha version of Tesseract 4.0 that does not support "
                "the textonly_pdf parameter. We don't support versions this old.")

    if not tesseract.has_user_words():
        if options.user_words or options.user_patterns:
            log.warning(
                "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
                "arguments have no effect.")

    disables_ocr = options.tesseract_pagesegmode in (0, 2)
    if disables_ocr:
        log.warning(
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail.")
def get_version(program: str, *,
                version_arg: str = '--version',
                regex=r'(\d+(\.\d+)*)',
                env=None):
    """Run a program and extract its version string from the output.

    Arguments:
        program: The program to version check.
        version_arg: The argument needed to ask for its version, e.g.
            ``--version``.
        regex: A regular expression matched against the start of the
            stripped output; its first group is the version.
        env: Custom ``os.environ`` in which to run program.

    Returns:
        The text captured by the first group of ``regex``.

    Raises:
        MissingDependencyError: If the program cannot be found, exits with
            an error, or prints output that ``regex`` does not match.
    """
    not_on_path = f"Could not find program '{program}' on the PATH"
    command = [program, version_arg]
    try:
        completed = run(
            command,
            close_fds=True,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
            env=env,
        )
    except FileNotFoundError as e:
        raise MissingDependencyError(not_on_path) from e
    except CalledProcessError as e:
        message = (
            f"Ran program '{program}' but it exited with an error:\n{e.output}"
            if e.returncode != 0
            else not_on_path
        )
        raise MissingDependencyError(message) from e

    output = completed.stdout
    found = re.match(regex, output.strip())
    if found is None:
        raise MissingDependencyError(
            f"The program '{program}' did not report its version. "
            f"Message was:\n{output}")
    return found.group(1)
def check_options_languages(options, ocr_engine_languages):
    """Check that every requested OCR language is available.

    If no languages were requested, ``options.languages`` is set to the
    default language. Nothing is checked when ``ocr_engine_languages`` is
    empty.

    Args:
        options: The options namespace; ``languages`` is read and may be
            replaced with the default.
        ocr_engine_languages: The languages the OCR engine reports.

    Raises:
        MissingDependencyError: If a requested language is not among
            ``ocr_engine_languages``.
    """
    if not options.languages:
        options.languages = {DEFAULT_LANGUAGE}
        system_lang = locale.getlocale()[0]
        if system_lang and not system_lang.startswith('en'):
            log.debug("No language specified; assuming --language %s",
                      DEFAULT_LANGUAGE)

    # Guard clauses: nothing to compare against, or nothing missing.
    if not ocr_engine_languages:
        return
    if options.languages.issubset(ocr_engine_languages):
        return

    unavailable = options.languages - ocr_engine_languages
    header = ("OCR engine does not have language data for the following "
              "requested languages: \n")
    raise MissingDependencyError(
        header + ''.join(lang + '\n' for lang in unavailable))
def check_external_program(
    *,
    program,
    package,
    version_checker,
    need_version,
    required_for=None,
    recommended=False,
):
    """Verify that an external program is installed and recent enough.

    Args:
        program: The name of the program to test.
        package: The name of a software package that supplies the program.
        version_checker: Either the installed version as a string, or a
            callable without arguments that retrieves it.
        need_version: The minimum required version.
        required_for: The name of an argument or feature that requires this
            program.
        recommended: If true, a missing or outdated program is reported but
            no exception is raised.

    Raises:
        MissingDependencyError: If the program is missing or too old and
            ``recommended`` is false. The exception carries the program name.
    """
    try:
        if callable(version_checker):
            found_version = version_checker()
        else:
            found_version = version_checker
    except (CalledProcessError, FileNotFoundError, MissingDependencyError):
        _error_missing_program(program, package, required_for, recommended)
        if not recommended:
            # Name the program so the exception is self-describing.
            raise MissingDependencyError(program)
        return

    def remove_leading_v(s):
        # A single leading 'v' (as in 'v1.2') is dropped before comparing.
        if s.startswith('v'):
            return s[1:]
        return s

    found_version = remove_leading_v(found_version)
    need_version = remove_leading_v(need_version)

    if found_version and LooseVersion(found_version) < LooseVersion(need_version):
        _error_old_version(program, package, need_version, found_version,
                           required_for)
        if not recommended:
            raise MissingDependencyError(program)

    log.debug('Found %s %s', program, found_version)
def get_version(program, *, version_arg='--version', regex=r'(\d+(\.\d+)*)',
                env=None):
    """Get the version of the specified program.

    Arguments:
        program: The program to version check.
        version_arg: The argument needed to ask for its version.
        regex: A regular expression matched against the start of the
            stripped output; its first group is the version.
        env: Custom environment in which to run the program.

    Returns:
        The text captured by the first group of ``regex``.

    Raises:
        MissingDependencyError: If the program cannot be found, exits with
            an error, or prints output that ``regex`` does not match.
    """
    args_prog = [program, version_arg]
    try:
        proc = run(
            args_prog,
            close_fds=True,
            universal_newlines=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
            env=env,
        )
        output = proc.stdout
    except FileNotFoundError as e:
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH") from e
    except CalledProcessError as e:
        if e.returncode != 0:
            raise MissingDependencyError(
                f"Ran program '{program}' but it exited with an error:\n{e.output}"
            ) from e
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH") from e

    # Test the match result directly. Catching AttributeError would also
    # hide unrelated attribute errors and attach a misleading context.
    match = re.match(regex, output.strip())
    if match is None:
        raise MissingDependencyError(
            f"The program '{program}' did not report its version. "
            f"Message was:\n{output}")
    return match.group(1)
def check_options_languages(options, ocr_engine_languages):
    """Check that every requested OCR language is available.

    If no languages were requested, ``options.languages`` is set to the
    default language. Nothing is checked when ``ocr_engine_languages`` is
    empty.

    Args:
        options: The options namespace; ``languages`` is read and may be
            replaced with the default.
        ocr_engine_languages: The languages the OCR engine reports. Any
            iterable of language names is accepted.

    Raises:
        MissingDependencyError: If a requested language is not among
            ``ocr_engine_languages``. The message lists the missing
            languages in sorted order.
    """
    if not options.languages:
        options.languages = {DEFAULT_LANGUAGE}
        system_lang = locale.getlocale()[0]
        if system_lang and not system_lang.startswith('en'):
            log.debug("No language specified; assuming --language %s",
                      DEFAULT_LANGUAGE)
    if not ocr_engine_languages:
        return

    # difference() accepts any iterable, unlike the ``-`` operator.
    missing_languages = set(options.languages).difference(ocr_engine_languages)
    if missing_languages:
        msg = ("OCR engine does not have language data for the following "
               "requested languages: \n")
        # Sorted so the message is stable from run to run.
        msg += '\n'.join(sorted(missing_languages))
        # ISO 639-2 codes are three letters, not digits.
        msg += '\nNote: most languages are identified by a 3-letter ISO 639-2 Code'
        raise MissingDependencyError(msg)
def has_textonly_pdf(langs=None):
    """Report whether Tesseract lists the ``textonly_pdf`` parameter.

    Runs Tesseract with ``--print-parameters pdf`` and looks for
    ``textonly_pdf`` in the combined stdout/stderr output.

    Args:
        langs: Languages passed through to ``tess_base_args``.

    Returns:
        True if ``textonly_pdf`` appears in the output, False otherwise.

    Raises:
        MissingDependencyError: If the Tesseract command exits with an error.
    """
    command = tess_base_args(langs, engine_mode=None) + ['--print-parameters', 'pdf']
    try:
        completed = run(command, check=True, stdout=PIPE, stderr=STDOUT)
    except CalledProcessError as e:
        raise MissingDependencyError(
            "Could not --print-parameters from tesseract. This can happen if the "
            "TESSDATA_PREFIX environment is not set to a valid tessdata folder. "
        ) from e
    # Output is captured as bytes (no text mode), hence the bytes literal.
    return b'textonly_pdf' in completed.stdout
from ocrmypdf.subprocess._windows import shim_env_path libname = 'liblept-5' os.environ['PATH'] = shim_env_path() else: libname = 'lept' _libpath = find_library(libname) if not _libpath: raise MissingDependencyError(""" --------------------------------------------------------------------- This error normally occurs when ocrmypdf can't find the Leptonica library, which is usually installed with Tesseract OCR. It could be that Tesseract is not installed properly, we can't find the installation on your system PATH environment variable. The library we are looking for is usually called: liblept-5.dll (Windows) liblept*.dylib (macOS) liblept*.so (Linux/BSD) Please review our installation procedures to find a solution: https://ocrmypdf.readthedocs.io/en/latest/installation.html --------------------------------------------------------------------- """) if os.name == 'nt': # On Windows, recent versions of libpng require zlib. We have to make sure # the zlib version being loaded is the same one that libpng was built with. # This tries to import zlib from Tesseract's installation folder, falling back # to find_library() if liblept is being loaded from somewhere else. # Loading zlib from other places could cause a version mismatch _zlib_path = os.path.join(os.path.dirname(_libpath), 'zlib1.dll') if not os.path.exists(_zlib_path):
from ocrmypdf.helpers import Resolution from ocrmypdf.subprocess import get_version, run log = logging.getLogger(__name__) _gswin = None if os.name == 'nt': _gswin = which('gswin64c') if not _gswin: _gswin = which('gswin32c') if not _gswin: raise MissingDependencyError(""" --------------------------------------------------------------------- This error normally occurs when ocrmypdf can't Ghostscript. Please ensure Ghostscript is installed and its location is added to the system PATH environment variable. For details see: https://ocrmypdf.readthedocs.io/en/latest/installation.html --------------------------------------------------------------------- """) _gswin = Path(_gswin).stem GS = _gswin if _gswin else 'gs' del _gswin def version(): return get_version(GS) def jpeg_passthrough_available() -> bool:
This error normally occurs when ocrmypdf find can't Ghostscript. Please ensure Ghostscript is installed and its location is added to the system PATH environment variable. For details see: https://ocrmypdf.readthedocs.io/en/latest/installation.html --------------------------------------------------------------------- """ _gswin = None if os.name == 'nt': _gswin = which('gswin64c') if not _gswin: _gswin = which('gswin32c') if not _gswin: raise MissingDependencyError(missing_gs_error) _gswin = Path(_gswin).stem GS = _gswin if _gswin else 'gs' del _gswin def version(): return get_version(GS) def jpeg_passthrough_available() -> bool: """Returns True if the installed version of Ghostscript supports JPEG passthru Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23 it gained the ability to keep JPEGs unmodified. However, the 9.23