Exemple #1
0
def test_override_metadata(spoof_tesseract_noop, output_type):
    input_file = _infile('c02-22.pdf')
    output_file = _outfile('test_override_metadata.pdf')

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    p, out, err = run_ocrmypdf(
        input_file, output_file,
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok

    pdf = output_file

    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
    lines_pdfinfo = out_pdfinfo.splitlines()
    pdfinfo = {}
    for line in lines_pdfinfo:
        k, v = line.strip().split(':', maxsplit=1)
        pdfinfo[k.strip()] = v.strip()

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''

    pdfa_info = file_claims_pdfa(output_file)
    assert pdfa_info['output'] == output_type
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, out, err = run_ocrmypdf(input_file,
                               outpdf,
                               '--title',
                               german,
                               '--author',
                               chinese,
                               '--output-type',
                               output_type,
                               env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    if ghostscript.version() >= '9.24':
        pytest.xfail('Ghostscript 9.24+ does not support Unicode DOCINFO')

    assert after.metadata.Title == german, after.metadata
    assert after.metadata.Author == chinese, after.metadata
    assert after.metadata.get('/Keywords', '') == ''

    before_date = decode_pdf_date(str(before.metadata.CreationDate))
    after_date = decode_pdf_date(str(after.metadata.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Exemple #3
0
def test_override_metadata(spoof_tesseract_noop, output_type):
    input_file = _infile('c02-22.pdf')
    output_file = _outfile('test_override_metadata.pdf')

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    p, out, err = run_ocrmypdf(
        input_file, output_file,
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok

    pdf = output_file

    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
    lines_pdfinfo = out_pdfinfo.splitlines()
    pdfinfo = {}
    for line in lines_pdfinfo:
        k, v = line.strip().split(':', maxsplit=1)
        pdfinfo[k.strip()] = v.strip()

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''

    pdfa_info = file_claims_pdfa(output_file)
    assert pdfa_info['output'] == output_type
Exemple #4
0
def test_override_metadata(output_type, resources, outpdf):
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, _out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    assert after.docinfo.Title == german, after.docinfo
    assert after.docinfo.Author == chinese, after.docinfo
    assert after.docinfo.get('/Keywords', '') == ''

    before_date = decode_pdf_date(str(before.docinfo.CreationDate))
    after_date = decode_pdf_date(str(after.docinfo.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Exemple #5
0
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    assert after.docinfo.Title == german, after.docinfo
    assert after.docinfo.Author == chinese, after.docinfo
    assert after.docinfo.get('/Keywords', '') == ''

    before_date = decode_pdf_date(str(before.docinfo.CreationDate))
    after_date = decode_pdf_date(str(after.docinfo.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Exemple #6
0
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    before = pypdf.PdfFileReader(str(input_file))
    after = pypdf.PdfFileReader(outpdf)

    assert after.documentInfo['/Title'] == german
    assert after.documentInfo['/Author'] == chinese
    assert after.documentInfo.get('/Keywords', '') == ''

    before_date = decode_pdf_date(before.documentInfo['/CreationDate'])
    after_date = decode_pdf_date(after.documentInfo['/CreationDate'])
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Exemple #7
0
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
    check_ocrmypdf(resources / 'ccitt.pdf',
                   outpdf,
                   '--output-type',
                   'pdfa-1',
                   env=spoof_tesseract_cache)

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['conformance'] == 'PDF/A-1B'
Exemple #8
0
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
    check_ocrmypdf(
        resources / 'ccitt.pdf', outpdf,
        '--output-type', 'pdfa-1',
        env=spoof_tesseract_cache
    )

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['conformance'] == 'PDF/A-1B'
def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
    if pdfa_level == '3' and ghostscript.version() < '9.19':
        pytest.xfail(reason='Ghostscript >= 9.19 required')

    check_ocrmypdf(
        resources / 'ccitt.pdf', outpdf,
        '--output-type', 'pdfa-' + pdfa_level,
        env=spoof_tesseract_cache
    )

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)
Exemple #10
0
def test_preserve_metadata(spoof_tesseract_noop, output_type):
    pdf_before = pypdf.PdfFileReader(_infile('graph.pdf'))

    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf',
                            '--output-type', output_type,
                            env=spoof_tesseract_noop)

    pdf_after = pypdf.PdfFileReader(output)

    for key in ('/Title', '/Author'):
        assert pdf_before.documentInfo[key] == pdf_after.documentInfo[key]

    pdfa_info = file_claims_pdfa(output)
    assert pdfa_info['output'] == output_type
Exemple #11
0
def test_preserve_metadata(spoof_tesseract_noop, output_type):
    pdf_before = pypdf.PdfFileReader(_infile('graph.pdf'))

    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf',
                            '--output-type', output_type,
                            env=spoof_tesseract_noop)

    pdf_after = pypdf.PdfFileReader(output)

    for key in ('/Title', '/Author'):
        assert pdf_before.documentInfo[key] == pdf_after.documentInfo[key]

    pdfa_info = file_claims_pdfa(output)
    assert pdfa_info['output'] == output_type
Exemple #12
0
def test_pdfa_n(pdfa_level, resources, outpdf):
    if pdfa_level == '3' and ghostscript.version() < '9.19':
        pytest.xfail(reason='Ghostscript >= 9.19 required')

    check_ocrmypdf(
        resources / 'ccitt.pdf',
        outpdf,
        '--output-type',
        'pdfa-' + pdfa_level,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}B'
Exemple #13
0
def test_preserve_metadata(spoof_tesseract_noop, output_type,
                           resources, outpdf):
    pdf_before = pikepdf.open(resources / 'graph.pdf')

    output = check_ocrmypdf(
            resources / 'graph.pdf', outpdf,
            '--output-type', output_type,
            env=spoof_tesseract_noop)

    pdf_after = pikepdf.open(output)

    for key in ('/Title', '/Author'):
        assert pdf_before.docinfo[key] == pdf_after.docinfo[key]

    pdfa_info = file_claims_pdfa(str(output))
    assert pdfa_info['output'] == output_type
Exemple #14
0
def test_preserve_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    pdf_before = pikepdf.open(resources / 'graph.pdf')

    output = check_ocrmypdf(
        resources / 'graph.pdf',
        outpdf,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )

    pdf_after = pikepdf.open(output)

    for key in ('/Title', '/Author'):
        assert pdf_before.docinfo[key] == pdf_after.docinfo[key]

    pdfa_info = file_claims_pdfa(str(output))
    assert pdfa_info['output'] == output_type
Exemple #15
0
def test_preserve_docinfo(output_type, resources, outpdf):
    pdf_before = pikepdf.open(resources / 'graph.pdf')

    output = check_ocrmypdf(
        resources / 'graph.pdf',
        outpdf,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    pdf_after = pikepdf.open(output)

    for key in ('/Title', '/Author'):
        assert pdf_before.docinfo[key] == pdf_after.docinfo[key]

    pdfa_info = file_claims_pdfa(str(output))
    assert pdfa_info['output'] == output_type
Exemple #16
0
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    reader = pypdf.PdfFileReader(outpdf)

    assert reader.documentInfo['/Title'] == german
    assert reader.documentInfo['/Author'] == chinese
    assert reader.documentInfo.get('/Keywords', '') == ''

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
Exemple #17
0
def run_pipeline(options, *, plugin_manager, api=False):
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    except (KeyboardInterrupt if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise) as e:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok