Example #1
def test_no_cpu_count(monkeypatch):
    def cpu_count_raises():
        raise NotImplementedError()

    monkeypatch.setattr(multiprocessing, 'cpu_count', cpu_count_raises)
    with pytest.warns(expected_warning=UserWarning):
        assert helpers.available_cpu_count() == 1
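Example #1 pins down the fallback behavior of helpers.available_cpu_count(): when multiprocessing.cpu_count() raises NotImplementedError, the helper should emit a UserWarning and report a single CPU. A minimal sketch of an implementation that satisfies this contract (hypothetical; not necessarily ocrmypdf's actual code):

import multiprocessing
import warnings

def available_cpu_count() -> int:
    """Return the usable CPU count, falling back to 1 with a warning."""
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        # Some platforms cannot report a CPU count; warn and assume one.
        warnings.warn("Could not get CPU count. Assuming one (1) available CPU.")
        return 1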
Example #2
def _pdf_pageinfo_concurrent(
    pdf,
    executor: Executor,
    infile,
    progbar,
    max_workers,
    check_pages,
    detailed_analysis=False,
):
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        if not page:
            raise InputFileError("Could read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)

    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    # If we use threads, we can pass the already-open Pdf for the workers to
    # share. If we use processes, we pass None, which tells the init function
    # to open its own copy.
    initial_pdf = pdf if use_threads else None

    contexts = ((n, initial_pdf, infile, check_pages, detailed_analysis)
                for n in range(total))
    assert (n_workers == 1) if use_threads else (n_workers >= 1), "Threaded mode requires exactly one worker"
    executor(
        use_threads=use_threads,
        max_workers=n_workers,
        tqdm_kwargs=dict(total=total,
                         desc="Scanning contents",
                         unit='page',
                         disable=not progbar),
        worker_initializer=partial(
            _pdf_pageinfo_sync_init,
            initial_pdf,
            infile,
            logging.getLogger('pdfminer').level,
        ),
        task=_pdf_pageinfo_sync,
        task_arguments=contexts,
        task_finished=update_pageinfo,
    )
    return pages
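The key decision in Example #2 is the worker heuristic: n_workers = min(1 + len(pages) // 4, max_workers) grants one worker per four pages, plus one, capped at max_workers, and any job small enough to get a single worker runs in a thread instead of paying process startup costs. A quick illustration of the arithmetic (the helper name is ours, for demonstration only):

def n_workers_for(total_pages: int, max_workers: int) -> int:
    # Same heuristic as _pdf_pageinfo_concurrent above.
    return min(1 + total_pages // 4, max_workers)

assert n_workers_for(3, 8) == 1    # small job: one worker, so threads are used
assert n_workers_for(4, 8) == 2    # one extra worker per four pages
assert n_workers_for(100, 8) == 8  # capped at max_workers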
Example #3
def test_no_cpu_count(monkeypatch):
    invoked = False

    def cpu_count_raises():
        nonlocal invoked
        invoked = True
        raise NotImplementedError()

    monkeypatch.setattr(multiprocessing, 'cpu_count', cpu_count_raises)
    with pytest.warns(expected_warning=UserWarning):
        assert helpers.available_cpu_count() == 1
    assert invoked, "Patched cpu_count was never called during the test"
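Example #3 hardens the same test with a nonlocal flag so it cannot pass vacuously, for instance if available_cpu_count() stopped consulting multiprocessing.cpu_count() altogether. The standard library offers the same guarantee without a hand-rolled flag; a sketch of an equivalent test using unittest.mock (assuming the same helpers module as above):

import multiprocessing
from unittest import mock

import pytest

from ocrmypdf import helpers

def test_no_cpu_count_mock(monkeypatch):
    spy = mock.Mock(side_effect=NotImplementedError)
    monkeypatch.setattr(multiprocessing, 'cpu_count', spy)
    with pytest.warns(UserWarning):
        assert helpers.available_cpu_count() == 1
    spy.assert_called()  # same guarantee as the `invoked` flag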
Example #4
def _pdf_pageinfo_concurrent(pdf,
                             infile,
                             progbar,
                             max_workers,
                             check_pages,
                             detailed_analysis=False):
    global worker_pdf  # pylint: disable=global-statement
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        if not page:
            raise InputFileError("Could read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)
    contexts = ((n, infile, check_pages, detailed_analysis)
                for n in range(total))

    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    try:
        exec_progress_pool(
            use_threads=use_threads,
            max_workers=n_workers,
            tqdm_kwargs=dict(total=total,
                             desc="Scanning contents",
                             unit='page',
                             disable=not progbar),
            task_initializer=partial(_pdf_pageinfo_sync_init, infile,
                                     logging.getLogger('pdfminer').level),
            task=_pdf_pageinfo_sync,
            task_arguments=contexts,
            task_finished=update_pageinfo,
        )
    finally:
        if worker_pdf and use_threads:
            assert n_workers == 1, "Should have only one worker when threaded"
            # This is messy, but if we ran in a thread, worker_pdf is ours to close
            worker_pdf.close()
    return pages
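Example #4 depends on a module-global worker_pdf that the task initializer populates, so each worker opens the input file once rather than once per page. The real _pdf_pageinfo_sync_init is not shown here; a hypothetical sketch of the per-worker global pattern it implies, using pikepdf:

import logging

import pikepdf

worker_pdf = None  # opened once per worker by the initializer

def _pdf_pageinfo_sync_init(infile, pdfminer_loglevel):
    # Hypothetical sketch: match the parent's pdfminer log level, then
    # cache an open Pdf in a module global for this worker's tasks.
    global worker_pdf
    logging.getLogger('pdfminer').setLevel(pdfminer_loglevel)
    worker_pdf = pikepdf.open(infile)

This also explains the finally block above: in threaded mode the "worker" shares the caller's module globals, so the caller must close worker_pdf itself, while in process mode each child owns its own copy.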
Example #5
def run_pipeline(options, *, plugin_manager, api=False):
    # Any changes made to options here will not affect options that are
    # already bound to pipeline function parameters (for example,
    # options.input_file and options.pdf_renderer are already bound).
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
        max_workers=options.jobs if not options.use_threads else 1,  # single worker when threading, to ease debugging
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    except (KeyboardInterrupt if not api else NeverRaise):
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise):  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
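A detail worth noting in Example #5 is the except (KeyboardInterrupt if not api else NeverRaise) construction: the exception class is chosen at runtime, so in CLI mode errors are converted to exit codes, while with api=True they fall through to the caller unhandled. NeverRaise only needs to be an exception type that nothing ever raises; a minimal sketch of the idiom:

class NeverRaise(Exception):
    """Never raised, so `except NeverRaise` never matches anything."""

def run(api: bool) -> int:
    try:
        raise ValueError("boom")  # stand-in for a pipeline failure
    except (ValueError if not api else NeverRaise):
        return 1  # CLI mode: the error becomes an exit code
    return 0

assert run(api=False) == 1  # handled and mapped to an exit code
# run(api=True) would propagate the ValueError to the caller instead.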