def quantize(input_file, output_file, quality_min, quality_max):
    """Quantize an image with pngquant, writing the result to output_file.

    pngquant is only ever handed a PNG here: when input_file carries a JPEG
    extension (``.jpg`` or ``.jpeg``, in any letter case) the image is first
    re-encoded into a temporary PNG and that temporary file is quantized.

    input_file -- image to quantize (str or path-like)
    output_file -- path given to pngquant's ``--output`` (str or path-like)
    quality_min -- lower bound of the ``--quality`` range
    quality_max -- upper bound of the ``--quality`` range
    """
    input_file = fspath(input_file)
    output_file = fspath(output_file)

    def run_pngquant(png_path):
        # Single place that builds the command so both branches stay in sync.
        run(
            [
                'pngquant',
                '--force',
                '--skip-if-larger',
                '--output',
                output_file,
                '--quality',
                f'{quality_min}-{quality_max}',
                '--',
                png_path,
            ]
        )

    if input_file.lower().endswith(('.jpg', '.jpeg')):
        with Image.open(input_file) as im, NamedTemporaryFile(suffix='.png') as tmp:
            im.save(tmp, format='PNG')
            # pngquant reopens the file by name, so buffered bytes must be
            # on disk before it starts.
            tmp.flush()
            run_pngquant(tmp.name)
    else:
        run_pngquant(input_file)
def get_orientation(input_file: Path, engine_mode: int, timeout: float):
    """Run Tesseract in OSD mode (``--psm 0``) and report the page orientation.

    Returns an OrientationConfidence built from the "Orientation in degrees"
    and "Orientation confidence" lines of Tesseract's output. A zero
    angle/confidence result is returned when Tesseract times out, or when it
    fails with a "Too few characters" or "Image too large" message. Any other
    Tesseract failure raises SubprocessOutputError.
    """
    command = [
        *tess_base_args(['osd'], engine_mode),
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        tolerated = (
            b'Too few characters. Skipping this page',
            b'Image too large',
        )
        if any(marker in e.output for marker in tolerated):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    # Collect "key: value" lines; lines without exactly one colon are ignored.
    fields = {}
    for raw in completed.stdout.decode().splitlines():
        text = raw.strip()
        if text.count(':') != 1:
            continue
        key, _, value = text.partition(':')
        fields[key.strip()] = value.strip()

    degrees = int(fields.get('Orientation in degrees', 0))
    certainty = float(fields.get('Orientation confidence', 0))
    return OrientationConfidence(angle=degrees, confidence=certainty)
def get_languages():
    """Return the set of languages listed by ``tesseract --list-langs``.

    The first output line is treated as a header and discarded; every other
    non-blank line, stripped of surrounding whitespace, is one language.

    Raises MissingDependencyError when Tesseract exits with an error, prints
    a line starting with "Error", or prints nothing at all.
    """

    def lang_error(output):
        msg = (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        )
        # Guard against a missing capture so the real error is not masked
        # by a TypeError while building the message.
        msg += output or ''
        return msg

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = (output or '').splitlines()
    # No output at all means there is not even a header line to discard;
    # report it as a failure instead of crashing on the unpacking below.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))

    _header, *rest = lines
    # Blank lines are not language names.
    return {lang.strip() for lang in rest if lang.strip()}
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Have Tesseract produce a text-only PDF plus a sidecar text file.

    input_file -- image to analyze
    output_pdf -- PDF to generate; its name minus extension is the output base
    output_text -- destination for the OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- extra tesseract configuration arguments, appended last
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy

    On timeout, or when Tesseract fails with "Image too large", the outputs
    are produced by use_skip_page instead. Any other Tesseract failure raises
    SubprocessOutputError.
    """
    cmd = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        cmd += ['--psm', str(pagesegmode)]
    cmd += ['-c', 'textonly_pdf=1']
    if user_words:
        cmd += ['--user-words', user_words]
    if user_patterns:
        cmd += ['--user-patterns', user_patterns]

    # Tesseract appends suffixes to the output base itself
    prefix, _extension = os.path.splitext(output_pdf)
    sidecar = prefix + '.txt'

    # Reminder: test suite tesseract test plugins might break after any changes
    # to the number of order parameters here
    cmd += [os.fspath(input_file), os.fspath(prefix), 'pdf', 'txt']
    cmd += tessconfig

    try:
        completed = run(
            cmd, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
        return
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            use_skip_page(output_pdf, output_text)
            return
        raise SubprocessOutputError() from e

    # Move the sidecar text to where the caller wants it, then log.
    if os.path.exists(sidecar):
        shutil.move(sidecar, output_text)
    tesseract_log_output(completed.stdout)
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    input_file -- PDF to rasterize
    output_file -- image file to write
    raster_device -- Ghostscript output device (``-sDEVICE``)
    raster_dpi -- resolution passed to Ghostscript, rounded to 6 places
    pageno -- page passed as both ``-dFirstPage`` and ``-dLastPage``
    page_dpi -- DPI recorded in the saved image; defaults to raster_dpi
    rotation -- clockwise angle (90, 180 or 270) to undo in the output
    filter_vector -- add ``-dFILTERVECTOR`` to the Ghostscript command

    Raises SubprocessOutputError when Ghostscript exits with an error.
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = (
        [
            GS,
            '-dQUIET',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            f'-sDEVICE={raster_device}',
            f'-dFirstPage={pageno}',
            f'-dLastPage={pageno}',
            f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
        ]
        + (['-dFILTERVECTOR'] if filter_vector else [])
        + [
            '-o',
            '-',
            '-sstdout=%stderr',
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]
    )

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Chain explicitly, matching the Tesseract wrappers, so the original
        # CalledProcessError is recorded as the direct cause.
        raise SubprocessOutputError('Ghostscript rasterizing failed') from e

    stderr = p.stderr.decode(errors='replace')
    if _gs_error_reported(stderr):
        log.error(stderr)

    # The image arrives on stdout because of "-o -".
    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            # A quarter turn swaps the axes, so the DPI pair swaps too.
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
def run_rig_args(args, **kwargs):
    """Run a command after stripping its PDF/A-related arguments.

    Every argument whose text starts with ``-dPDFA`` or ends with ``.ps`` is
    dropped; the remaining arguments are passed to ``run`` unchanged, together
    with ``kwargs``. The completed process returned by ``run`` is returned.

    Arguments are compared by their ``str()`` form, so path-like objects are
    filtered the same way as plain strings instead of raising AttributeError.
    """

    def is_pdfa_related(arg):
        text = str(arg)
        return text.startswith('-dPDFA') or text.endswith('.ps')

    new_args = [arg for arg in args if not is_pdfa_related(arg)]
    return run(new_args, **kwargs)
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to produce an hOCR file and a sidecar text file.

    input_file -- image to analyze
    output_hocr -- hOCR file to generate; its name minus suffix is the
                   output base handed to Tesseract
    output_text -- destination for the OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode argument passed to tess_base_args
    tessconfig -- extra tesseract configuration arguments, appended last
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy

    On timeout, or when Tesseract fails with "Image too large", a null hOCR
    is generated instead. Any other Tesseract failure raises
    SubprocessOutputError.
    """
    prefix = output_hocr.with_suffix('')
    # Tesseract appends ".txt" to the output base. Build that name by
    # appending rather than with_suffix(), which would replace the last
    # dotted component of a base such as "page.ocr".
    sidecar = prefix.with_name(prefix.name + '.txt')

    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number of order parameters here
    args_tesseract.extend([fspath(input_file), fspath(prefix), 'hocr', 'txt'])
    args_tesseract.extend(tessconfig)
    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_text, input_file)
            return

        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if sidecar.exists():
            sidecar.replace(output_text)
def convert_group(*, cwd, infiles, out_prefix):
    """Run jbig2 over a group of input files with a shared output prefix.

    cwd -- working directory for the jbig2 process
    infiles -- input file arguments, appended after the options
    out_prefix -- value for jbig2's ``-b`` option

    Returns the completed process; raises CalledProcessError (through
    ``check_returncode``) when jbig2 exits with a non-zero status.
    """
    # '-s' selects symbol mode (lossy). Refinement mode ('-r', lossless
    # symbol mode) is deliberately left out: currently disabled in jbig2.
    command = ['jbig2', '-b', out_prefix, '-s', '-p', *infiles]
    completed = run(command, cwd=cwd, stdout=PIPE, stderr=PIPE)
    completed.check_returncode()
    return completed
def has_textonly_pdf(langs=None):
    """Report whether Tesseract knows the ``textonly_pdf`` parameter.

    Available in v4.00.00alpha since January 2017. The check parses the
    output of ``--print-parameters pdf`` and looks for the parameter name.

    Raises MissingDependencyError if Tesseract cannot print its parameters.
    """
    command = tess_base_args(langs, engine_mode=None) + ['--print-parameters', 'pdf']
    try:
        completed = run(command, check=True, stdout=PIPE, stderr=STDOUT)
    except CalledProcessError as e:
        raise MissingDependencyError(
            "Could not --print-parameters from tesseract. This can happen if the "
            "TESSDATA_PREFIX environment is not set to a valid tessdata folder. "
        ) from e
    return b'textonly_pdf' in completed.stdout
def quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int):
    """Stream input_file through pngquant and store the result in output_file.

    The input is supplied to pngquant on stdin via ``input_as_png`` and the
    quantized image is read back from stdout. output_file is written only when
    pngquant exits with status 0; otherwise nothing is written.
    """
    command = [
        'pngquant',
        '--force',
        '--skip-if-larger',
        '--quality',
        f'{quality_min}-{quality_max}',
        '--',  # pngquant: stop processing arguments
        '-',  # pngquant: stream input and output
    ]
    with input_as_png(input_file) as png_stream:
        completed = run(
            command, stdin=png_stream, stdout=PIPE, stderr=PIPE, check=False
        )

    if completed.returncode != 0:
        return
    # input_file could be the same as output_file, so the write is deferred
    # until the input stream has been closed
    output_file.write_bytes(completed.stdout)
def cached_run(options, run_args, **run_kwargs):
    """Run a Tesseract command line through an on-disk result cache.

    options -- object whose ``input_file`` names the source document
    run_args -- full command line; element 0 is the program, the rest is
                parsed with the module-level ``parser``
    run_kwargs -- keyword arguments for ``run``; ``check`` must be truthy on
                  a cache miss, and ``stdout``/``stderr`` are overridden with
                  PIPE so the streams can be stored

    Commands reading from stdin are never cached. On a cache hit the stored
    output files are copied next to ``outputbase`` and a CompletedProcess with
    the stored stdout/stderr is returned without running Tesseract. On a miss
    Tesseract is run, its streams and ``hocr``/``pdf``/``txt`` outputs are
    stored, and a line describing the run is appended to ``manifest.jsonl``.
    """
    # Only these output kinds are ever stored in the cache. Other trailing
    # arguments are presumably Tesseract config names with no output file,
    # so both the store and the restore path must skip them.
    cached_kinds = ('hocr', 'pdf', 'txt')

    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])

    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)

    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)

    log.debug(f"Using Tesseract cache {cache_folder}")

    # stderr.bin doubles as the marker that this entry has been populated.
    if (cache_folder / 'stderr.bin').exists():
        log.debug("Cache HIT")

        # Replicate stdout/err
        if args.outputbase != 'stdout':
            if not args.configfiles:
                args.configfiles.append('txt')
            for configfile in args.configfiles:
                if configfile not in cached_kinds:
                    # Never stored on a miss, so there is nothing to restore.
                    continue
                # cp cache -> output
                tessfile = args.outputbase + '.' + configfile
                shutil.copy(str(cache_folder / configfile) + '.bin', tessfile)
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=(cache_folder / 'stdout.bin').read_bytes(),
            stderr=(cache_folder / 'stderr.bin').read_bytes(),
        )

    log.debug("Cache MISS")

    cache_kwargs = {
        k: v for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    assert cache_kwargs['check']
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward

    # Update cache
    (cache_folder / 'stdout.bin').write_bytes(p.stdout)
    (cache_folder / 'stderr.bin').write_bytes(p.stderr)

    if args.outputbase != 'stdout':
        if not args.configfiles:
            args.configfiles.append('txt')

        for configfile in args.configfiles:
            if configfile not in cached_kinds:
                continue
            # cp pwd/{outputbase}.{configfile} -> {cache}/{configfile}
            tessfile = args.outputbase + '.' + configfile
            shutil.copy(tessfile, str(cache_folder / configfile) + '.bin')

    manifest = {}
    manifest['tesseract_version'] = TesseractOcrEngine.version().replace(
        '\n', ' '
    )
    manifest['platform'] = platform.platform()
    manifest['python'] = platform.python_version()
    manifest['argv_slug'] = cache_folder.name
    manifest['sourcefile'] = str(Path(source_file).relative_to(TESTS_ROOT))

    def clean_sys_argv():
        # Replace the per-run temporary directory prefix with a placeholder
        # so manifest entries do not contain run-specific paths.
        for arg in run_args[1:]:
            yield re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1', arg)

    manifest['args'] = list(clean_sys_argv())
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
        f.flush()
    return p
def _gs_version_tuple(version_string):
    """Turn a dotted version string such as '9.26' into a tuple of ints."""
    import re

    return tuple(int(number) for number in re.findall(r'\d+', str(version_string)))


def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
):
    """Have Ghostscript's pdfwrite device render pdf_pages as one PDF/A.

    pdf_pages -- input files, appended to the Ghostscript command in order
    output_file -- file that receives Ghostscript's output
    compression -- 'jpeg' forces DCTEncode, 'lossless' forces FlateEncode,
                   anything else lets Ghostscript auto-select image filters
    pdf_version -- value for ``-dCompatibilityLevel``
    pdfa_part -- value for ``-dPDFA``

    Raises SubprocessOutputError when Ghostscript exits with an error.
    Problems Ghostscript merely reports on stderr are logged.
    """
    compression_args = []
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    gs_version = version()

    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    # Compare numerically: as plain strings '10.0' sorts before '9.19' and
    # would wrongly select the legacy spelling.
    strategy = 'RGB' if _gs_version_tuple(gs_version) >= (9, 19) else '/RGB'

    if gs_version == '9.23':
        # 9.23: new feature JPEG passthrough is broken in some cases, best to
        # disable it always
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = (
        [
            GS,
            "-dQUIET",
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
            "-dCompatibilityLevel=" + str(pdf_version),
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
            "-sColorConversionStrategy=" + strategy,
        ]
        + compression_args
        + [
            "-dJPEGQ=95",
            "-dPDFA=" + pdfa_part,
            "-dPDFACompatibilityPolicy=1",
            "-o",
            "-",
            "-sstdout=%stderr",
        ]
    )
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs

    try:
        with Path(output_file).open('wb') as output:
            p = run(args_gs, stdout=output, stderr=PIPE, check=True)
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr.decode(errors='replace'))
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed') from e
    else:
        stderr = p.stderr.decode('utf-8', errors='replace')
        if _gs_error_reported(stderr):
            # Log each distinct message once and collapse consecutive
            # repetitions into a single summary line.
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part == last_part:
                    repcount += 1
                    continue
                if repcount > 1:
                    log.error(
                        f"(previous error message repeated {repcount} times)"
                    )
                # Reset for every new message so an earlier single repeat
                # does not inflate the count of a later message.
                repcount = 0
                log.error(part)
                last_part = part
            # Report a run of repeats that lasted until the end of the output.
            if repcount > 1:
                log.error(f"(previous error message repeated {repcount} times)")
        elif 'overprint mode not set' in stderr:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug("Ghostscript had to remove PDF 'overprinting' from the "
                      "input file to complete PDF/A conversion. ")
def run_append_stderr(*args, **kwargs):
    """Call ``run`` and append the elision warning to its captured stderr.

    All arguments are forwarded to ``run``. The returned process has its
    ``stderr`` replaced by the original bytes, a newline, and the UTF-8
    encoded ``elision_warning``.
    """
    completed = run(*args, **kwargs)
    warning_bytes = elision_warning.encode('utf-8')
    completed.stderr = b'\n'.join((completed.stderr, warning_bytes))
    return completed
def convert_single(*, cwd, infile, outfile):
    """Run ``jbig2 -p`` on one file, capturing its stdout into outfile.

    cwd -- working directory for the jbig2 process
    infile -- input file argument
    outfile -- path opened for binary writing and used as jbig2's stdout

    Returns the completed process; raises CalledProcessError (through
    ``check_returncode``) when jbig2 exits with a non-zero status.
    """
    command = ['jbig2', '-p', infile]
    with open(outfile, 'wb') as sink:
        completed = run(command, cwd=cwd, stdout=sink, stderr=PIPE)
    # Checked only after the output file has been closed.
    completed.check_returncode()
    return completed