Ejemplo n.º 1
0
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError:
            msg = f"File not found - {options.input_file}"
            if Path('/.dockerenv').exists():  # pragma: no cover
                msg += (
                    "\nDocker cannot your working directory unless you "
                    "explicitly share it with the Docker container and set up"
                    "permissions correctly.\n"
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
                )
            raise InputFileError(msg)
Ejemplo n.º 2
0
 def test_safe_symlink_relink(self, tmp_path):
     (tmp_path / 'regular_file_a').touch()
     (tmp_path / 'regular_file_b').write_bytes(b'ABC')
     (tmp_path / 'link').symlink_to(tmp_path / 'regular_file_a')
     helpers.safe_symlink(tmp_path / 'regular_file_b', tmp_path / 'link')
     assert (tmp_path / 'link').samefile(
         tmp_path / 'regular_file_b') or (tmp_path /
                                          'link').read_bytes() == b'ABC'
Ejemplo n.º 3
0
def optimize(
        input_file: Path,
        output_file: Path,
        context,
        save_settings,
        executor: Executor = SerialExecutor(),
) -> None:
    options = context.options
    if options.optimize == 0:
        safe_symlink(input_file, output_file)
        return

    if options.jpeg_quality == 0:
        options.jpeg_quality = DEFAULT_JPEG_QUALITY if options.optimize < 3 else 40
    if options.png_quality == 0:
        options.png_quality = DEFAULT_PNG_QUALITY if options.optimize < 3 else 30
    if options.jbig2_page_group_size == 0:
        options.jbig2_page_group_size = 10 if options.jbig2_lossy else 1

    with pikepdf.Pdf.open(input_file) as pike:
        root = output_file.parent / 'images'
        root.mkdir(exist_ok=True)

        jpegs, pngs = extract_images_generic(pike, root, options)
        transcode_jpegs(pike, jpegs, root, options, executor)
        # if options.optimize >= 2:
        # Try pngifying the jpegs
        #    transcode_pngs(pike, jpegs, jpg_name, root, options)
        transcode_pngs(pike, pngs, png_name, root, options, executor)

        jbig2_groups = extract_images_jbig2(pike, root, options)
        convert_to_jbig2(pike, jbig2_groups, root, options, executor)

        target_file = output_file.with_suffix('.opt.pdf')
        pike.remove_unreferenced_resources()
        pike.save(target_file, **save_settings)

    input_size = input_file.stat().st_size
    output_size = target_file.stat().st_size
    if output_size == 0:
        raise OutputFileAccessError(
            f"Output file not created after optimizing. We probably ran "
            f"out of disk space in the temporary folder: {tempfile.gettempdir()}."
        )
    ratio = input_size / output_size
    savings = 1 - output_size / input_size
    log.info(f"Optimize ratio: {ratio:.2f} savings: {(savings):.1%}")

    if savings < 0:
        log.info("Image optimization did not improve the file - discarded")
        # We still need to save the file
        with pikepdf.open(input_file) as pike:
            pike.remove_unreferenced_resources()
            pike.save(output_file, **save_settings)
    else:
        safe_symlink(target_file, output_file)
Ejemplo n.º 4
0
def create_input_file(options, work_folder: Path) -> (Path, str):
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "<stdin>"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError:
            raise InputFileError(f"File not found - {options.input_file}")
Ejemplo n.º 5
0
def triage(original_filename, input_file, output_file, options):
    try:
        if _pdf_guess_version(input_file):
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi is being ignored because the "
                    "input file is a PDF, not an image.")
            # Origin file is a pdf create a symlink with pdf extension
            safe_symlink(input_file, output_file)
            return output_file
    except EnvironmentError as e:
        log.debug(f"Temporary file was at: {input_file}")
        msg = str(e).replace(str(input_file), original_filename)
        raise InputFileError(msg) from e

    triage_image_file(input_file, output_file, options)
    return output_file
Ejemplo n.º 6
0
def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext):
    options = context.options
    input_pdfinfo = context.pdfinfo
    fix_docinfo_file = context.get_path('fix_docinfo.pdf')
    output_file = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    modified = False
    with pikepdf.open(input_pdf) as pdf_file:
        try:
            len(pdf_file.docinfo)
        except TypeError:
            log.error(
                "File contains a malformed DocumentInfo block - continuing anyway"
            )
        else:
            if pdf_file.docinfo:
                for k, v in pdf_file.docinfo.items():
                    if b'\x00' in bytes(v):
                        pdf_file.docinfo[k] = bytes(v).replace(b'\x00', b'')
                        modified = True
        if modified:
            pdf_file.save(fix_docinfo_file)
        else:
            safe_symlink(input_pdf, fix_docinfo_file)

    context.plugin_manager.hook.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[fix_docinfo_file],
        pdfmark=input_ps_stub,
        output_file=output_file,
        compression=options.pdfa_image_compression,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
        progressbar_class=(context.plugin_manager.hook.get_progressbar_class()
                           if options.progress_bar else None),
    )

    return output_file
Ejemplo n.º 7
0
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError:
            raise InputFileError(f"File not found - {options.input_file}")
Ejemplo n.º 8
0
 def test_safe_symlink_overwrite(self, tmp_path):
     (tmp_path / 'regular_file').touch()
     with pytest.raises(FileExistsError):
         helpers.safe_symlink(tmp_path / 'input', tmp_path / 'regular_file')
Ejemplo n.º 9
0
 def test_safe_symlink_link_self(self, tmp_path, caplog):
     helpers.safe_symlink(tmp_path / 'self', tmp_path / 'self')
     assert caplog.record_tuples[0][1] == logging.WARNING