Beispiel #1
0
def extract_images(
    pike: Pdf,
    root: Path,
    options,
    extract_fn: Callable[..., Optional[XrefExt]],
) -> Iterator[Tuple[int, XrefExt]]:
    """Extract image using extract_fn

    Enumerate images on each page, lookup their xref/ID number in the PDF.
    Exclude images that are soft masks (i.e. alpha transparency related).
    Record the page number on which an image is first used, since images may be
    used on multiple pages (or multiple times on the same page).

    Current we do not check Form XObjects or other objects that may contain
    images, and we don't evaluate alternate images or thumbnails.

    extract_fn must decide if wants to extract the image in this context. If
    it does a tuple should be returned: (xref, ext) where .ext is the file
    extension. extract_fn must also extract the file it finds interesting.
    """

    include_xrefs: MutableSet[Xref] = set()
    exclude_xrefs: MutableSet[Xref] = set()
    pageno_for_xref = {}
    errors = 0
    for pageno, page in enumerate(pike.pages):
        try:
            xobjs = page.Resources.XObject
        except AttributeError:
            continue
        for _imname, image in dict(xobjs).items():
            if image.objgen[1] != 0:
                continue  # Ignore images in an incremental PDF
            xref = Xref(image.objgen[0])
            if hasattr(image, 'SMask'):
                # Ignore soft masks
                smask_xref = Xref(image.SMask.objgen[0])
                exclude_xrefs.add(smask_xref)
                log.debug(f"Skipping image {smask_xref} because it is an SMask")
            include_xrefs.add(xref)
            log.debug(f"Treating {xref} as an optimization candidate")
            if xref not in pageno_for_xref:
                pageno_for_xref[xref] = pageno

    working_xrefs = include_xrefs - exclude_xrefs
    for xref in working_xrefs:
        image = pike.get_object((xref, 0))
        try:
            result = extract_fn(
                pike=pike, root=root, image=image, xref=xref, options=options
            )
        except Exception:  # pylint: disable=broad-except
            log.exception(f"While extracting image xref {xref}, an error occurred")
            errors += 1
        else:
            if result:
                _, ext = result
                yield pageno_for_xref[xref], XrefExt(xref, ext)
Beispiel #2
0
def convert_to_jbig2(
    pike: Pdf,
    jbig2_groups: Dict[int, List[XrefExt]],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Convert images to JBIG2 and insert into PDF.

    When the JBIG2 page group size is > 1 we do several JBIG2 images at once
    and build a symbol dictionary that will span several pages. Each JBIG2
    image must reference to its symbol dictionary. If too many pages shared the
    same dictionary JBIG2 encoding becomes more expensive and less efficient.
    The default value of 10 was determined through testing. Currently this
    must be lossy encoding since jbig2enc does not support refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this must be lossless JBIG2.
    """
    jbig2_globals_dict: Optional[Dictionary]

    _produce_jbig2_images(jbig2_groups, root, options, executor)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        jbig2_symfile = root / (prefix + '.sym')
        if jbig2_symfile.exists():
            jbig2_globals_data = jbig2_symfile.read_bytes()
            jbig2_globals = Stream(pike, jbig2_globals_data)
            jbig2_globals_dict = Dictionary(JBIG2Globals=jbig2_globals)
        elif options.jbig2_page_group_size == 1:
            jbig2_globals_dict = None
        else:
            raise FileNotFoundError(jbig2_symfile)

        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            jbig2_im_file = root / (prefix + f'.{n:04d}')
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(jbig2_im_data,
                         filter=Name.JBIG2Decode,
                         decode_parms=jbig2_globals_dict)
Beispiel #3
0
def transcode_jpegs(pike: Pdf, jpegs: Sequence[Xref], root: Path, options) -> None:
    for xref in tqdm(
        jpegs, desc="JPEGs", unit='image', disable=not options.progress_bar
    ):
        in_jpg = jpg_name(root, xref)
        opt_jpg = in_jpg.with_suffix('.opt.jpg')

        # This produces a debug warning from PIL
        # DEBUG:PIL.Image:Error closing: 'NoneType' object has no attribute
        # 'close'.  Seems to be mostly harmless
        # https://github.com/python-pillow/Pillow/issues/1144
        with Image.open(in_jpg) as im:
            im.save(opt_jpg, optimize=True, quality=options.jpeg_quality)

        if opt_jpg.stat().st_size > in_jpg.stat().st_size:
            log.debug("xref %s, jpeg, made larger - skip", xref)
            continue

        compdata = leptonica.CompressedData.open(opt_jpg)
        im_obj = pike.get_object(xref, 0)
        im_obj.write(compdata.read(), filter=Name.DCTDecode)
Beispiel #4
0
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        foreign_image = next(pdf_image.pages[0].images.values())
        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Don't copy keys from the new image...
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        # ...except for the keep_fields, which are essential to displaying
        # the image correctly and preserving its metadata. (/Decode arrays
        # and /SMaskInData are implicitly discarded prior to this point.)
        keep_fields = {
            '/ID',
            '/Intent',
            '/Interpolate',
            '/Mask',
            '/Metadata',
            '/OC',
            '/OPI',
            '/SMask',
            '/StructParent',
        }
        del_keys -= keep_fields
        for key in local_image.keys():
            if key != Name.Length and str(key) not in keep_fields:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
    return True
Beispiel #5
0
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        foreign_image = next(pdf_image.pages[0].images.values())
        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        del_keys = set(im_obj.keys()) - set(local_image.keys())
        for key in local_image.keys():
            if key != Name.Length:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
Beispiel #6
0
def transcode_pngs(
    pike: Pdf,
    images: Sequence[Xref],
    image_name_fn: Callable[[Path, Xref], Path],
    root: Path,
    options,
) -> None:
    modified: MutableSet[Xref] = set()
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )

        def pngquant_args():
            for xref in images:
                log.debug(image_name_fn(root, xref))
                yield (
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )
                modified.add(xref)

        def pngquant_fn(args):
            pngquant.quantize(*args)

        exec_progress_pool(
            use_threads=True,
            max_workers=options.jobs,
            tqdm_kwargs=dict(
                desc="PNGs",
                total=len(images),
                unit='image',
                disable=not options.progress_bar,
            ),
            task=pngquant_fn,
            task_arguments=pngquant_args(),
        )

    for xref in modified:
        im_obj = pike.get_object(xref, 0)
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.mode == '1':
                compdata = pix.generate_pdf_ci_data(leptonica.lept.L_G4_ENCODE,
                                                    0)
            else:
                compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If re-coded image is larger don't use it - we test here because
        # pngquant knows the size of the temporary output file but not the actual
        # object in the PDF
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}")
            continue
        if compdata.type == leptonica.lept.L_FLATE_ENCODE:
            rewrite_png(pike, im_obj, compdata)
        elif compdata.type == leptonica.lept.L_G4_ENCODE:
            rewrite_png_as_g4(pike, im_obj, compdata)