def test_report_file_size(tmp_path, caplog):
    """Check what ``report_output_file_size`` logs for small and grown outputs.

    First, a 3-byte input with an empty output must log nothing.  Then the
    input is resized to 25001 bytes and the output to 50000 bytes, and the
    logged text must contain ``'No reason'``.

    Args:
        tmp_path: pytest fixture providing a per-test temporary directory.
        caplog: pytest fixture capturing log records emitted during the test.
    """
    in_ = tmp_path / 'a.pdf'
    out = tmp_path / 'b.pdf'
    in_.write_bytes(b'123')
    out.write_bytes(b'')
    opts = make_opts()
    vd.report_output_file_size(opts, in_, out)
    assert caplog.text == ''
    caplog.clear()

    os.truncate(in_, 25001)
    os.truncate(out, 50000)
    # Pin both optional optimizers as available so the assertion does not
    # depend on what is installed on the host.
    # NOTE(review): presumably the logged explanation changes when jbig2enc or
    # pngquant is missing -- the other variants of this test patch the same
    # two targets; confirm against report_output_file_size.
    with patch('ocrmypdf._validation.jbig2enc.available', return_value=True), patch(
        'ocrmypdf._validation.pngquant.available', return_value=True
    ):
        vd.report_output_file_size(opts, in_, out)
    assert 'No reason' in caplog.text
def test_report_file_size(tmp_path, caplog):
    """Exercise the size-report messages using PDFs written by pikepdf.

    Two saves of the same empty document must log nothing.  The output is
    then saved with more padding than the input, and the logged text is
    checked in three situations: both optimizers patched as available,
    jbig2enc patched as unavailable, and options rebuilt with ``optimize=0``.

    Args:
        tmp_path: pytest fixture providing a per-test temporary directory.
        caplog: pytest fixture capturing log records emitted during the test.
    """
    jbig2_target = 'ocrmypdf._validation.jbig2enc.available'
    pngquant_target = 'ocrmypdf._validation.pngquant.available'

    source_pdf = tmp_path / 'a.pdf'
    result_pdf = tmp_path / 'b.pdf'
    document = pikepdf.new()
    for destination in (source_pdf, result_pdf):
        document.save(destination)
    options = make_opts(output_type='pdf')
    vd.report_output_file_size(options, source_pdf, result_pdf)
    assert caplog.text == ''
    caplog.clear()

    # Grow both files; the output also keeps the first entry, so it ends up
    # carrying more padding than the input.
    padding = b'Dummy' * 5000
    document.Root.Dummy = padding
    document.save(source_pdf)
    document.Root.Dummy2 = padding * 2
    document.save(result_pdf)

    # Scenario 1: both optimizers report themselves as available.
    with patch(jbig2_target, return_value=True):
        with patch(pngquant_target, return_value=True):
            vd.report_output_file_size(options, source_pdf, result_pdf)
            assert 'No reason' in caplog.text
    caplog.clear()

    # Scenario 2: jbig2enc reports itself as unavailable.
    with patch(jbig2_target, return_value=False):
        with patch(pngquant_target, return_value=True):
            vd.report_output_file_size(options, source_pdf, result_pdf)
            assert 'optional dependency' in caplog.text
    caplog.clear()

    # Scenario 3: options rebuilt with optimize=0, no patching involved.
    options = make_opts(source_pdf, result_pdf, optimize=0, output_type='pdf')
    vd.report_output_file_size(options, source_pdf, result_pdf)
    assert 'disabled' in caplog.text
    caplog.clear()
def test_report_file_size(tmp_path, caplog):
    """Exercise the size-report messages using raw files resized on disk.

    A 3-byte input with an empty output must log nothing.  The input is then
    resized to 25001 bytes and the output to 50000 bytes, and the logged text
    is checked with jbig2enc patched as available, then as unavailable, and
    finally with options rebuilt with ``optimize=0``.

    Args:
        tmp_path: pytest fixture providing a per-test temporary directory.
        caplog: pytest fixture capturing log records emitted during the test.
    """
    jbig2_target = 'ocrmypdf._validation.jbig2enc.available'
    pngquant_target = 'ocrmypdf._validation.pngquant.available'

    source = tmp_path / 'a.pdf'
    result = tmp_path / 'b.pdf'
    source.write_bytes(b'123')
    result.write_bytes(b'')
    options = make_opts()
    vd.report_output_file_size(options, source, result)
    assert caplog.text == ''
    caplog.clear()

    os.truncate(source, 25001)
    os.truncate(result, 50000)

    # pngquant is always patched as available; only jbig2enc varies.
    scenarios = (
        (True, 'No reason'),
        (False, 'optional dependency'),
    )
    for jbig2_present, expected_fragment in scenarios:
        with patch(jbig2_target, return_value=jbig2_present), patch(
            pngquant_target, return_value=True
        ):
            vd.report_output_file_size(options, source, result)
            assert expected_fragment in caplog.text
        caplog.clear()

    # Final scenario: options rebuilt with optimize=0, no patching involved.
    options = make_opts(source, result, optimize=0)
    vd.report_output_file_size(options, source, result)
    assert 'disabled' in caplog.text
    caplog.clear()
# Example #4
# 0
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the whole pipeline for ``options`` and return an ``ExitCode``.

    Args:
        options: Options object.  ``options.jobs`` is filled in from
            ``available_cpu_count()`` when it is falsy.
        plugin_manager: Plugin manager to use; when falsy, one is obtained
            from ``get_plugin_manager(options.plugins)``.
        api: When true, the debug log file is never configured and the three
            ``except`` clauses match ``NeverRaise`` instead of
            ``KeyboardInterrupt`` / ``ExitCodeException`` / ``Exception``
            (presumably so that errors propagate to the API caller -- confirm
            against the definition of ``NeverRaise``).

    Returns:
        ``ExitCode.ok`` when nothing went wrong; otherwise
        ``ExitCode.pdfa_conversion_failed``, ``ExitCode.invalid_output_pdf``,
        ``ExitCode.ctrl_c``, the ``exit_code`` of a caught
        ``ExitCodeException``, or ``ExitCode.other_error``.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    plugin_manager = plugin_manager or get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))

    # Debug log for command line interface only with verbose output
    # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
    # when pytest is running
    wants_debug_log = (
        options.keep_temporary_files or options.verbose >= 1
    ) and not (os.environ.get('PYTEST_CURRENT_TEST', '') or api)
    debug_log_handler = None
    if wants_debug_log:
        debug_log_handler = configure_debug_logging(
            work_folder / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Decide whether the input is an image or a PDF
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Collect pdfinfo, then build the context the pipeline stages share
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=1 if options.use_threads else options.jobs,  # To help debug
            check_pages=options.pages,
        )
        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Make sure the options make sense for this particular PDF
        validate_pdfinfo_options(context)

        # Run the pipeline itself
        exec_concurrent(context, executor)

        destination = options.output_file
        if destination == '-':
            log.info("Output sent to stdout")
        elif hasattr(destination, 'writable') and destination.writable():
            log.info("Output written to stream")
        elif not samefile(destination, os.devnull):
            # Nothing is reported at all when the output went to dev null
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(destination)
                if not pdfa_info['pass']:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
                log.info("Output file is a %s (as expected)", pdfa_info['conformance'])
            if not check_pdf(destination):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, destination)

    except (NeverRaise if api else KeyboardInterrupt):
        # Include the traceback only when running verbosely
        emit = log.exception if options.verbose >= 1 else log.error
        emit("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (NeverRaise if api else ExitCodeException) as exc:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        else:
            message = str(exc)
            if message:
                log.error("%s: %s", type(exc).__name__, message)
            else:
                log.error(type(exc).__name__)
        return exc.exit_code
    except (NeverRaise if api else Exception):  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except OSError as err:
                # Failing to tear down the debug log must not block cleanup
                print(err, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok