def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Stage the input inside ``work_folder`` and report where it came from.

    ``options.input_file`` is handled in one of three ways:

    * ``'-'``: standard input is copied to ``work_folder / 'stdin'``.
    * an object with a ``readable`` attribute: the stream is copied to
      ``work_folder / 'stream'``.
    * anything else: passed to ``safe_symlink`` together with
      ``work_folder / 'origin'``.

    Returns:
        A ``(staged_path, origin)`` tuple, where ``origin`` is ``"stdin"``,
        ``"stream"``, or ``os.fspath(options.input_file)``.

    Raises:
        InputFileError: If the stream reports that it is not readable, or if
            linking the input raises ``FileNotFoundError``.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            msg = f"File not found - {options.input_file}"
            # The presence of /.dockerenv is used as the signal that we are
            # running inside a Docker container; add a hint in that case.
            if Path('/.dockerenv').exists():  # pragma: no cover
                msg += (
                    "\nDocker cannot access your working directory unless you "
                    "explicitly share it with the Docker container and set up "
                    "permissions correctly.\n"
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
                )
            # Chain the original error so the traceback keeps the root cause.
            raise InputFileError(msg) from e
def test_safe_symlink_relink(self, tmp_path):
    """An existing symlink can be re-pointed at a different file."""
    file_a = tmp_path / 'regular_file_a'
    file_b = tmp_path / 'regular_file_b'
    link = tmp_path / 'link'

    file_a.touch()
    file_b.write_bytes(b'ABC')
    link.symlink_to(file_a)

    helpers.safe_symlink(file_b, link)

    # Accept either a real link to file_b or something whose content matches.
    assert link.samefile(file_b) or link.read_bytes() == b'ABC'
def optimize(
    input_file: Path,
    output_file: Path,
    context,
    save_settings,
    executor: Executor = SerialExecutor(),
) -> None:
    """Produce an image-optimized version of ``input_file`` at ``output_file``.

    When ``context.options.optimize`` is 0 the input is linked straight to the
    output. Otherwise quality and JBIG2 group-size options that are still 0
    are filled in (on ``context.options`` itself), images are extracted and
    transcoded in an ``images`` folder beside ``output_file``, and the result
    is saved as ``output_file`` with its suffix replaced by ``.opt.pdf``. If
    that file turns out larger than the input it is discarded and the input is
    re-saved to ``output_file``; otherwise ``output_file`` is linked to it.

    Raises:
        OutputFileAccessError: If the optimized file has a size of zero.
    """
    opts = context.options
    if opts.optimize == 0:
        safe_symlink(input_file, output_file)
        return

    # Options still at 0 get a value chosen from the optimization level.
    if opts.jpeg_quality == 0:
        if opts.optimize < 3:
            opts.jpeg_quality = DEFAULT_JPEG_QUALITY
        else:
            opts.jpeg_quality = 40
    if opts.png_quality == 0:
        if opts.optimize < 3:
            opts.png_quality = DEFAULT_PNG_QUALITY
        else:
            opts.png_quality = 30
    if opts.jbig2_page_group_size == 0:
        if opts.jbig2_lossy:
            opts.jbig2_page_group_size = 10
        else:
            opts.jbig2_page_group_size = 1

    with pikepdf.Pdf.open(input_file) as pdf:
        image_dir = output_file.parent / 'images'
        image_dir.mkdir(exist_ok=True)

        jpeg_images, png_images = extract_images_generic(pdf, image_dir, opts)
        transcode_jpegs(pdf, jpeg_images, image_dir, opts, executor)
        # Disabled (kept as commented-out code):
        # if options.optimize >= 2:
        #     Try pngifying the jpegs
        #     transcode_pngs(pike, jpegs, jpg_name, root, options)
        transcode_pngs(pdf, png_images, png_name, image_dir, opts, executor)

        groups = extract_images_jbig2(pdf, image_dir, opts)
        convert_to_jbig2(pdf, groups, image_dir, opts, executor)

        optimized_pdf = output_file.with_suffix('.opt.pdf')
        pdf.remove_unreferenced_resources()
        pdf.save(optimized_pdf, **save_settings)

    input_size = input_file.stat().st_size
    output_size = optimized_pdf.stat().st_size
    if output_size == 0:
        raise OutputFileAccessError(
            f"Output file not created after optimizing. We probably ran "
            f"out of disk space in the temporary folder: {tempfile.gettempdir()}."
        )
    ratio = input_size / output_size
    savings = 1 - output_size / input_size
    log.info(f"Optimize ratio: {ratio:.2f} savings: {(savings):.1%}")

    if savings >= 0:
        safe_symlink(optimized_pdf, output_file)
        return

    log.info("Image optimization did not improve the file - discarded")
    # An output file is still required, so re-save the original input.
    with pikepdf.open(input_file) as original_pdf:
        original_pdf.remove_unreferenced_resources()
        original_pdf.save(output_file, **save_settings)
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Stage the input inside ``work_folder`` and report where it came from.

    If ``options.input_file`` is ``'-'``, standard input is copied to
    ``work_folder / 'stdin'``. Otherwise the input is passed to
    ``safe_symlink`` together with ``work_folder / 'origin'``.

    Returns:
        A ``(staged_path, origin)`` tuple, where ``origin`` is ``"<stdin>"``
        or ``os.fspath(options.input_file)``.

    Raises:
        InputFileError: If linking the input raises ``FileNotFoundError``.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "<stdin>"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            # Chain the original error so the traceback keeps the root cause.
            raise InputFileError(f"File not found - {options.input_file}") from e
def triage(original_filename, input_file, output_file, options):
    """Route the input to ``output_file`` as a PDF link or a converted image.

    If ``_pdf_guess_version`` returns a truthy value for ``input_file``, the
    file is linked to ``output_file``; otherwise it is handed to
    ``triage_image_file``. Returns ``output_file`` either way.

    Raises:
        InputFileError: If the PDF check or the linking raises ``OSError``.
            The message has the temporary path replaced by
            ``original_filename``.
    """
    try:
        is_pdf = bool(_pdf_guess_version(input_file))
        if is_pdf:
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi is being ignored because the "
                    "input file is a PDF, not an image."
                )
            # The input is already a PDF; link it under the output name.
            safe_symlink(input_file, output_file)
    except OSError as err:
        log.debug(f"Temporary file was at: {input_file}")
        # Report the name the user supplied rather than the temporary path.
        user_msg = str(err).replace(str(input_file), original_filename)
        raise InputFileError(user_msg) from err

    if not is_pdf:
        triage_image_file(input_file, output_file, options)
    return output_file
def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext):
    """Run the ``generate_pdfa`` plugin hook on a NUL-free copy of ``input_pdf``.

    NUL bytes are first stripped from the DocumentInfo values. The cleaned
    file is saved to ``context.get_path('fix_docinfo.pdf')``, or the input is
    linked there when nothing had to change. The hook then writes to
    ``context.get_path('pdfa.pdf')``, which is returned.
    """
    options = context.options
    pdfinfo = context.pdfinfo
    docinfo_fixed_pdf = context.get_path('fix_docinfo.pdf')
    pdfa_pdf = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (NUL character
    # references). NULs in DocumentInfo seem to be common since older Acrobats
    # included them. pikepdf can deal with this, but we make the world a
    # better place by stamping them out as soon as possible.
    nul_removed = False
    with pikepdf.open(input_pdf) as pdf:
        try:
            len(pdf.docinfo)
        except TypeError:
            log.error(
                "File contains a malformed DocumentInfo block - continuing anyway"
            )
        else:
            if pdf.docinfo:
                for key, value in pdf.docinfo.items():
                    raw = bytes(value)
                    if b'\x00' not in raw:
                        continue
                    pdf.docinfo[key] = raw.replace(b'\x00', b'')
                    nul_removed = True

        # Saving and linking both happen while the PDF is still open.
        if nul_removed:
            pdf.save(docinfo_fixed_pdf)
        else:
            safe_symlink(input_pdf, docinfo_fixed_pdf)

    context.plugin_manager.hook.generate_pdfa(
        pdf_version=pdfinfo.min_version,
        pdf_pages=[docinfo_fixed_pdf],
        pdfmark=input_ps_stub,
        output_file=pdfa_pdf,
        compression=options.pdfa_image_compression,
        # output_type is pdfa-1, pdfa-2, or pdfa-3; the last character is the part
        pdfa_part=options.output_type[-1],
        progressbar_class=(
            context.plugin_manager.hook.get_progressbar_class()
            if options.progress_bar
            else None
        ),
    )
    return pdfa_pdf
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Stage the input inside ``work_folder`` and report where it came from.

    ``options.input_file`` is handled in one of three ways:

    * ``'-'``: standard input is copied to ``work_folder / 'stdin'``.
    * an object with a ``readable`` attribute: the stream is copied to
      ``work_folder / 'stream'``.
    * anything else: passed to ``safe_symlink`` together with
      ``work_folder / 'origin'``.

    Returns:
        A ``(staged_path, origin)`` tuple, where ``origin`` is ``"stdin"``,
        ``"stream"``, or ``os.fspath(options.input_file)``.

    Raises:
        InputFileError: If the stream reports that it is not readable, or if
            linking the input raises ``FileNotFoundError``.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            # Chain the original error so the traceback keeps the root cause.
            raise InputFileError(f"File not found - {options.input_file}") from e
def test_safe_symlink_overwrite(self, tmp_path):
    """Linking onto an existing regular file must raise FileExistsError."""
    existing = tmp_path / 'regular_file'
    existing.touch()

    source = tmp_path / 'input'
    with pytest.raises(FileExistsError):
        helpers.safe_symlink(source, existing)
def test_safe_symlink_link_self(self, tmp_path, caplog):
    """Linking a path to itself should log a warning as the first record."""
    same_path = tmp_path / 'self'
    helpers.safe_symlink(same_path, same_path)

    first_record = caplog.record_tuples[0]
    # Index 1 of the record tuple is compared against the WARNING level.
    assert first_record[1] == logging.WARNING