def check_options(options):
    """Validate the Tesseract-related command line options.

    Verifies that a suitable ``tesseract`` executable is available, resolves
    the ``'auto'`` PDF renderer to ``'sandwich'`` (mutating ``options`` in
    place), and logs warnings for option combinations that are ignored or
    that disable OCR.

    Raises:
        MissingDependencyError: if the sandwich renderer is selected and
            ``tesseract.has_textonly_pdf`` reports no support for the
            requested languages.
    """
    tesseract_requirement = {
        'program': 'tesseract',
        'package': {'linux': 'tesseract-ocr'},
        'version_checker': tesseract.version,
        'need_version': '4.0.0',  # using backport for Travis CI
    }
    check_external_program(**tesseract_requirement)

    # Decide on what renderer to use
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    if options.pdf_renderer == 'sandwich':
        requested_languages = set(options.languages)
        if not tesseract.has_textonly_pdf(requested_languages):
            too_old_message = (
                "You are using an alpha version of Tesseract 4.0 that does not support "
                "the textonly_pdf parameter. We don't support versions this old."
            )
            raise MissingDependencyError(too_old_message)

    # Only look at the user word/pattern options when Tesseract lacks support
    if not tesseract.has_user_words():
        if options.user_words or options.user_patterns:
            ignored_message = (
                "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
                "arguments have no effect."
            )
            log.warning(ignored_message)

    ocr_disabling_modes = (0, 2)
    if options.tesseract_pagesegmode in ocr_disabling_modes:
        no_ocr_message = (
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail."
        )
        log.warning(no_ocr_message)
def check_options(options):
    """Validate the Tesseract-related command line options.

    Verifies that a suitable ``tesseract`` executable is available, resolves
    the ``'auto'`` PDF renderer to ``'sandwich'`` (mutating ``options`` in
    place), and logs warnings for option combinations that are ignored or
    that disable OCR.
    """
    tesseract_requirement = {
        'program': 'tesseract',
        'package': {'linux': 'tesseract-ocr'},
        'version_checker': tesseract.version,
        'need_version': '4.0.0-beta.1',  # using backport for Travis CI
        'version_parser': tesseract.TesseractVersion,
    }
    check_external_program(**tesseract_requirement)

    # Decide on what renderer to use
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    # Only look at the user word/pattern options when Tesseract lacks support
    if not tesseract.has_user_words():
        if options.user_words or options.user_patterns:
            ignored_message = (
                "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
                "arguments have no effect."
            )
            log.warning(ignored_message)

    ocr_disabling_modes = (0, 2)
    if options.tesseract_pagesegmode in ocr_disabling_modes:
        no_ocr_message = (
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail."
        )
        log.warning(no_ocr_message)
p, _out, err = run_ocrmypdf( resources / 'ccitt.pdf', outdir / 'out.pdf', '--pdf-renderer', renderer, '--tesseract-config', cfg_file, ) assert ( "parameter not found" in err.lower() or "error occurred while parsing" in err.lower() ), "No error message" assert p.returncode == ExitCode.invalid_config @pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0') def test_user_words_ocr(resources, outdir): # Does not actually test if --user-words causes output to differ word_list = outdir / 'wordlist.txt' sidecar_after = outdir / 'sidecar.txt' with word_list.open('w') as f: f.write('cromulent\n') # a perfectly cromulent word check_ocrmypdf( resources / 'crom.png', outdir / 'out.pdf', '--image-dpi', 150, '--sidecar', sidecar_after,
''') p, _out, err = run_ocrmypdf( resources / 'ccitt.pdf', outdir / 'out.pdf', '--pdf-renderer', renderer, '--tesseract-config', cfg_file, ) assert ("parameter not found" in err.lower() or "error occurred while parsing" in err.lower()), "No error message" assert p.returncode == ExitCode.invalid_config @pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0') def test_user_words_ocr(resources, outdir): # Does not actually test if --user-words causes output to differ word_list = outdir / 'wordlist.txt' sidecar_after = outdir / 'sidecar.txt' with word_list.open('w') as f: f.write('cromulent\n') # a perfectly cromulent word check_ocrmypdf( resources / 'crom.png', outdir / 'out.pdf', '--image-dpi', 150, '--sidecar',