def run_ocrmypdf_api(input_file, output_file, *args):
    """Invoke the OCRmyPDF pipeline through the API and return its result.

    The input and output paths go first on the argument list, followed by
    every extra argument that is not ``None``; all are converted with ``str``.
    The caller is responsible for interpreting the returned value.

    Does not currently have a way to manipulate the PATH except for Tesseract.
    """
    cli_args = [str(input_file), str(output_file)]
    cli_args.extend(str(extra) for extra in args if extra is not None)

    _parser, options, plugin_manager = get_parser_options_plugins(args=cli_args)
    api.check_options(options, plugin_manager)

    # The plugin manager is used for the option check only; the pipeline
    # itself is called with plugin_manager=None.
    outcome = api.run_pipeline(options, plugin_manager=None, api=False)
    return outcome
def check_ocrmypdf(input_file, output_file, *args):
    """Run OCRmyPDF through the API and confirm that a valid file was created.

    The input and output paths go first on the argument list, followed by
    every extra argument that is not ``None``; all are converted with ``str``.

    Asserts that the pipeline returned 0, that ``output_file`` exists and that
    it is larger than 100 bytes. Returns ``output_file``.
    """
    cli_args = [str(input_file), str(output_file)]
    cli_args += [str(extra) for extra in args if extra is not None]

    _parser, options, plugin_manager = get_parser_options_plugins(args=cli_args)
    api.check_options(options, plugin_manager)

    exit_code = api.run_pipeline(
        options, plugin_manager=plugin_manager, api=True
    )
    assert exit_code == 0

    # output_file is used through its own exists()/stat() methods.
    assert output_file.exists(), "Output file not created"
    assert output_file.stat().st_size > 100, "PDF too small or empty"
    return output_file
def run_ocrmypdf_api(input_file, output_file, *args, env=None):
    """Run OCRmyPDF through the API and return the pipeline result as-is.

    The input and output paths go first on the argument list, followed by
    every extra argument that is not ``None``; all are converted with ``str``.

    When ``env`` is given and non-empty, a copy of it (with
    ``_OCRMYPDF_TEST_INFILE`` set to the input path) is stored on the parsed
    options as ``tesseract_env``.
    """
    argv = [str(input_file), str(output_file)]
    argv += [str(extra) for extra in args if extra is not None]
    options = cli.parser.parse_args(argv)
    api.check_options(options)

    if env:
        # Copy first so the caller's mapping is left untouched.
        tesseract_env = env.copy()
        options.tesseract_env = tesseract_env
        tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)

    if options.tesseract_env:
        # Every environment value must be str or bytes.
        for value in options.tesseract_env.values():
            assert isinstance(value, (str, bytes))

    return api.run_pipeline(options, api=False)
def check_ocrmypdf(input_file, output_file, *args, env=None):
    """Run ocrmypdf through the API and confirm that a valid file was created.

    Args:
        input_file: Input path; placed first on the parsed argument list.
        output_file: Output path; placed second on the parsed argument list.
        *args: Extra command line arguments. ``None`` entries are dropped and
            the remaining ones are converted with ``str``.
        env: Optional mapping of environment variables. When non-empty, a copy
            of it is stored on the options as ``tesseract_env`` with
            ``_OCRMYPDF_TEST_INFILE`` set to the input path.

    Returns:
        ``output_file``, exactly as passed in.

    Raises:
        AssertionError: If the pipeline result is not 0, the output file does
            not exist, or the output file is 100 bytes or smaller.
    """
    options = cli.parser.parse_args(
        [str(input_file), str(output_file)]
        + [str(arg) for arg in args if arg is not None]
    )
    api.check_options(options)

    if env:
        # Work on a copy so the caller's mapping is not modified by the
        # assignment below.
        options.tesseract_env = env.copy()
        # Environment values must be strings; input_file may be a path object.
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)

    result = api.run_pipeline(options, api=True)

    assert result == 0
    assert os.path.exists(str(output_file)), "Output file not created"
    assert os.stat(str(output_file)).st_size > 100, "PDF too small or empty"
    return output_file
def run_ocrmypdf_api(input_file, output_file, *args, env=None):
    """Run OCRmyPDF through the API and return the pipeline result as-is.

    The input and output paths go first on the argument list, followed by
    every extra argument that is not ``None``; all are converted with ``str``.

    When ``env`` is given and non-empty, a copy of it (with
    ``_OCRMYPDF_TEST_INFILE`` set to the input path) is stored on the parsed
    options as ``tesseract_env``, and the first entry of
    ``_OCRMYPDF_TEST_PATH`` is sanity-checked.

    Does not currently have a way to manipulate the PATH except for Tesseract.
    """
    argv = [str(input_file), str(output_file)]
    argv.extend(str(extra) for extra in args if extra is not None)
    options = cli.parser.parse_args(argv)
    api.check_options(options)

    if env:
        # Copy first so the caller's mapping is left untouched.
        tesseract_env = env.copy()
        options.tesseract_env = tesseract_env
        tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)

        # Only the leading entry of the test path is inspected.
        search_path = env.get('_OCRMYPDF_TEST_PATH', '')
        first_path = search_path.split(os.pathsep)[0]
        if 'spoof' in first_path:
            assert 'gs' not in first_path, "use run_ocrmypdf() for gs"
            assert 'tesseract' in first_path

    if options.tesseract_env:
        # Every environment value must be str or bytes.
        for value in options.tesseract_env.values():
            assert isinstance(value, (str, bytes))

    return api.run_pipeline(options, api=False)