Esempio n. 1
0
def set_layer_visibility(pdf, layers_to_show):
    """Show only the named optional content groups (layers) of *pdf*.

    Args:
        pdf: an open pikepdf.Pdf whose Root has an /OCProperties entry.
        layers_to_show: collection of layer names to keep visible.

    Exits the process with status 1 if the PDF has no layer information.
    """
    try:
        ocgs = pdf.Root.OCProperties.OCGs
    except (AttributeError, KeyError):
        logger.error("Unable to locate layers in PDF.")
        sys.exit(1)

    ocgs_on = []
    for ocg in ocgs:
        if ocg.Name in layers_to_show:
            logger.info("Layer %s will be visible.", ocg.Name)
            ocgs_on.append(ocg)
        else:
            logger.info("Layer %s will be hidden.", ocg.Name)

    # BaseState /OFF hides every group except those listed in /ON.
    ocgs_config = pikepdf.Dictionary(
        BaseState=pikepdf.Name('/OFF'),
        ON=ocgs_on,
        Order=ocgs,
    )

    pdf.Root.OCProperties = pikepdf.Dictionary(
        D=ocgs_config,
        OCGs=ocgs,
    )

    # Needed for google-chrome (at least): per-OCG /Usage hints can override
    # the default configuration, so strip /View and /Print.  Guard against
    # OCGs that carry no /Usage dictionary at all — the original code raised
    # an exception on those.
    for ocg in ocgs:
        if '/Usage' not in ocg:
            continue
        if '/View' in ocg.Usage:
            del ocg.Usage.View
        if '/Print' in ocg.Usage:
            del ocg.Usage.Print
Esempio n. 2
0
def transcode_pngs(pike, pngs, root, log, options):
    """Re-encode PNG intermediates as Flate streams and replace them in the PDF.

    Args:
        pike: open pikepdf.Pdf.
        pngs: iterable of xrefs of image objects with PNG versions on disk.
        root: directory holding the intermediate PNG files.
        log: logger used for per-image errors.
        options: parsed options; uses .optimize, .png_quality, .jobs.
    """
    if options.optimize >= 2:
        # Quantize in parallel; pngquant rewrites each file in place
        # (input name == output name).
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=options.jobs) as executor:
            for xref in pngs:
                executor.submit(
                    pngquant.quantize,
                    png_name(root, xref), png_name(root, xref),
                    png_quality[0], png_quality[1])

    for xref in pngs:
        im_obj = pike.get_object(xref, 0)

        # Open, transcode (!), package for PDF
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.depth == 1:
                pix = pix.invert()  # PDF assumes 1 is black for monochrome
            compdata = pix.generate_pdf_ci_data(
                leptonica.lept.L_FLATE_ENCODE, 0
            )
        except leptonica.LeptonicaError as e:
            log.error(e)
            continue

        # This is what we should be doing: open the compressed data without
        # transcoding. However this shifts each pixel row by one for some
        # reason.
        #compdata = leptonica.CompressedData.open(png_name(root, xref))
        if len(compdata) > int(im_obj.stream_dict.Length):
            continue  # If we produced a larger image, don't use

        predictor = Null()
        if compdata.predictor > 0:
            predictor = pikepdf.Dictionary({'/Predictor': compdata.predictor})

        # Decide the colorspace BEFORE touching the image object so that a
        # skip leaves the object untouched.
        if compdata.ncolors > 0:
            # Indexed/palette image: /Indexed base hival lookup-stream.
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [pikepdf.Name('/Indexed'), pikepdf.Name('/DeviceRGB'),
                  compdata.ncolors - 1, palette_stream]
        elif compdata.spp == 1:
            cs = pikepdf.Name('/DeviceGray')
        elif compdata.spp == 3:
            cs = pikepdf.Name('/DeviceRGB')
        elif compdata.spp == 4:
            cs = pikepdf.Name('/DeviceCMYK')
        else:
            # The original code left `cs` unbound here and crashed with
            # UnboundLocalError after already rewriting Width/Height.
            log.error("xref %s: unexpected samples per pixel %s - skip",
                      xref, compdata.spp)
            continue

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), pikepdf.Name('/FlateDecode'), predictor)
Esempio n. 3
0
def merge(pdf_streams, names, outpath, first_page):
    """Concatenate PDF streams into one document and save it to *outpath*.

    The leading run of table-of-contents files (per is_contents) gets
    lowercase roman page labels; the rest is numbered decimally starting
    at *first_page*.
    """
    output = pikepdf.new()

    page_counts = []
    progress = tqdm.tqdm(zip(pdf_streams, names), total=len(names),
                         desc="Merging PDFs")
    for stream, name in progress:
        page_counts.append(append_pdf(output, stream, name))

    # add page numbering

    # Count pages only while the inputs are contents files; stop at the
    # first non-contents document.
    contents_pages = 0
    for count, name in zip(page_counts, names):
        if not is_contents(name):
            break
        contents_pages += count

    if contents_pages != 0:
        output.Root.PageLabels = {
            "/Nums": [
                0, {"/S": pikepdf.Name("/r")},
                contents_pages, {"/S": pikepdf.Name("/D"),
                                 "/St": first_page},
            ]
        }

    with tqdm.tqdm(total=100, desc="Writing PDF") as pbar:
        previous = 0

        # pikepdf reports absolute percent; the bar wants increments.
        def report(percent):
            nonlocal previous
            delta = percent - previous
            previous = percent
            pbar.update(delta)

        output.save(outpath, progress=report)
Esempio n. 4
0
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """
    Convert a group of JBIG2 images and insert into PDF.

    We use a group because JBIG2 works best with a symbol dictionary that spans
    multiple pages. When inserted back into the PDF, each JBIG2 must reference
    the symbol dictionary it is associated with. So convert a group at a time,
    and replace their streams with a parameter set that points to the
    appropriate dictionary.

    If too many pages shared the same dictionary JBIG2 encoding becomes more
    expensive and less efficient.

    """
    # Encode all groups concurrently.  jbig2enc writes '<prefix>.sym'
    # (the shared symbol dictionary) plus one numbered file per page
    # into `root`.
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs) as executor:
        futures = []
        for group, xref_exts in jbig2_groups.items():
            prefix = 'group{:08d}'.format(group)
            future = executor.submit(
                jbig2enc.convert_group,
                cwd=fspath(root),
                # NOTE(review): a generator is handed to the worker thread —
                # presumably convert_group consumes it fully; confirm it does
                # not expect a sequence.
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=prefix
            )
            futures.append(future)
        for future in concurrent.futures.as_completed(futures):
            # .result() re-raises any exception from the worker.
            proc = future.result()
            log.debug(proc.stderr.decode())

    # Insert the encoded images back into the PDF: each stream is filtered
    # with /JBIG2Decode and points at its group's symbol dictionary via
    # /JBIG2Globals.
    for group, xref_exts in jbig2_groups.items():
        prefix = 'group{:08d}'.format(group)
        jbig2_globals_data = (root / (prefix + '.sym')).read_bytes()
        jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)

        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            # Per-page outputs are named '<prefix>.0000', '<prefix>.0001', ...
            # in the same order as xref_exts.
            jbig2_im_file = root / (prefix + '.{:04d}'.format(n))
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                jbig2_im_data, pikepdf.Name('/JBIG2Decode'),
                pikepdf.Dictionary({
                    '/JBIG2Globals': jbig2_globals
                })
            )
Esempio n. 5
0
def transcode_jpegs(pike, jpegs, root, log, options):
    """Recompress JPEG intermediates at options.jpeg_quality and, when that
    shrinks them, write the result back into the PDF image objects."""
    for xref in jpegs:
        src = Path(jpg_name(root, xref))
        candidate = src.with_suffix('.opt.jpg')

        # This produces a debug warning from PIL
        # DEBUG:PIL.Image:Error closing: 'NoneType' object has no attribute
        # 'close'.  Seems to be mostly harmless
        # https://github.com/python-pillow/Pillow/issues/1144
        with Image.open(fspath(src)) as image:
            image.save(fspath(candidate),
                       optimize=True,
                       quality=options.jpeg_quality)

        # pylint: disable=no-member
        if candidate.stat().st_size > src.stat().st_size:
            log.debug("xref %s, jpeg, made larger - skip", xref)
            continue

        compdata = leptonica.CompressedData.open(candidate)
        image_obj = pike.get_object(xref, 0)
        image_obj.write(compdata.read(), filter=pikepdf.Name('/DCTDecode'))
def update_dest(zoom_factor, current):
    """Rebuild a PDF destination as /XYZ with the given zoom.

    Args:
        zoom_factor: zoom value for the new destination.
        current: existing destination array [page, type, params...].

    Returns:
        pikepdf.Array [page, /XYZ, left, top, zoom].  Coordinates are
        carried over from the original destination type where it provides
        them, and left at 0 otherwise.
    """
    dest = pikepdf.Array()
    dest.append(current[0])            # target page reference
    dest.append(pikepdf.Name("/XYZ"))
    dest.append(0)                     # left
    dest.append(0)                     # top
    dest.append(zoom_factor)
    dest_type = current[1]

    if dest_type == "/XYZ":
        # /XYZ left top zoom
        dest[2] = current[2]
        dest[3] = current[3]
    elif dest_type in ("/FitH", "/FitBH"):
        # /FitH top — only a vertical coordinate.
        dest[3] = current[2]
    elif dest_type in ("/FitV", "/FitBV"):
        # /FitV left — only a horizontal coordinate.
        dest[2] = current[2]
    elif dest_type == "/FitR":
        # /FitR left bottom right top — keep left and top.
        # BUG FIX: the original copied current[4] (the *right* edge) into
        # the top slot; the top coordinate is current[5].
        dest[2] = current[2]
        dest[3] = current[5]
    # /Fit and /FitB carry no coordinates; left/top stay 0.

    return dest
Esempio n. 7
0
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """Convert images to JBIG2 and insert into PDF.

    When the JBIG2 page group size is > 1 we do several JBIG2 images at once
    and build a symbol dictionary that will span several pages. Each JBIG2
    image must reference to its symbol dictionary. If too many pages shared the
    same dictionary JBIG2 encoding becomes more expensive and less efficient.
    The default value of 10 was determined through testing. Currently this
    must be lossy encoding since jbig2enc does not support refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this is must be lossless JBIG2.
    """

    # Runs the external encoder for every group; it is expected to leave
    # '<prefix>.sym' and '<prefix>.NNNN' files in `root`.
    _produce_jbig2_images(jbig2_groups, root, log, options)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        jbig2_symfile = root / (prefix + '.sym')
        if jbig2_symfile.exists():
            # Symbolic mode: embed the shared dictionary once per group.
            jbig2_globals_data = jbig2_symfile.read_bytes()
            jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)
            jbig2_globals_dict = pikepdf.Dictionary(
                {'/JBIG2Globals': jbig2_globals})
        elif options.jbig2_page_group_size == 1:
            # Generic (non-symbolic) mode: each image is self-contained,
            # so no /JBIG2Globals entry is needed.
            jbig2_globals_dict = None
        else:
            # A multi-page group without its symbol file means the encoder
            # step failed; surface that rather than emit broken streams.
            raise FileNotFoundError(jbig2_symfile)

        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            # Page outputs are numbered in xref_exts order.
            jbig2_im_file = root / (prefix + f'.{n:04d}')
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                jbig2_im_data,
                filter=pikepdf.Name('/JBIG2Decode'),
                decode_parms=jbig2_globals_dict,
            )
Esempio n. 8
0
def main(tmpdirname, pdf_name):
    """Losslessly shrink the JPEG (/DCTDecode) images inside *pdf_name*.

    Each qualifying JPEG stream is dumped to *tmpdirname*, run through the
    external `jpgcrush` tool, and written back into the PDF.  The result is
    saved as '<name>.jpg.pdf' next to the input.

    Args:
        tmpdirname: directory for temporary JPEG files.
        pdf_name: path of the PDF to process.
    """
    total_savings = 0

    logging.info('Processing %s.', pdf_name)

    my_pdf = pikepdf.open(pdf_name)

    img_num = 0
    # total_objs = num_image_objects(mypdf)
    # for image_obj in tqdm(image_objects(my_pdf), total=total_objs):
    for image_obj in image_objects(my_pdf):

        if '/Filter' not in image_obj:
            continue

        # FIXME: to improve *a lot*
        # Accept /DCTDecode either directly or as a one-element filter array.
        if (image_obj.Filter != '/DCTDecode'
                and not (isinstance(image_obj.Filter, pikepdf.Array)
                         and len(image_obj.Filter) == 1
                         and image_obj.Filter[0] == '/DCTDecode')):
            continue

        if not (image_obj.ColorSpace in ('/DeviceRGB', '/DeviceGray') or
                (isinstance(image_obj.ColorSpace, pikepdf.Array)
                 and image_obj.ColorSpace[0] == '/DeviceN' and
                 image_obj.ColorSpace[2] in ('/DeviceRGB', '/DeviceGray'))):
            continue

        # FIXME: Enable this code to process more images
        # if not (image_obj.ColorSpace in ('/DeviceRGB', '/DeviceGray') or
        #         (isinstance(image_obj.ColorSpace, pikepdf.Array) and
        #          image_obj.ColorSpace[0] == '/ICCBased' and str(image_obj.ColorSpace[1].Alternate) in
        #          ('/DeviceRGB', '/DeviceGray'))):
        #     continue

        img_num += 1
        logging.debug('Found a JPEG as %s', image_obj.ColorSpace)

        tempname = os.path.join(tmpdirname, f'img-{img_num:05d}.jpg')
        # `with` guarantees the handle is closed even if the write fails
        # (the original closed it manually, and only on success).
        with open(tempname, 'wb') as source:
            size_before = source.write(image_obj.read_raw_bytes())
        logging.debug('Wrote %d bytes to the tempfile %s.', size_before,
                      tempname)

        # print('Calling jpgcrush...')
        subprocess.check_call(['jpgcrush', tempname])
        # print('Return code was: %d.' % ret)

        # # Unfortunately, the -purejpg of jhead is too aggressive and may
        # # strip way too much to the point of modifying the image, in some
        # # cases.
        # logging.debug('Calling jhead...')
        # subprocess.check_call(['jhead', '-dt', '-dc', '-de', source.name])
        # # print('Return code was: %d.' % ret)

        # BUG FIX: the original opened this handle and never closed it,
        # leaking one file descriptor per image.
        with open(tempname, 'rb') as targetfn:
            target = targetfn.read()

        size_after = len(target)
        logging.debug('Read back %d bytes from the tempfile %s.', size_after,
                      tempname)
        image_obj.write(target, filter=pikepdf.Name('/DCTDecode'))
        logging.debug('The image is back on the PDF file.')

        total_savings += size_before - size_after

    final_filename = os.path.splitext(pdf_name)[0] + '.jpg.pdf'
    logging.info('Saved %d bytes to create %s.', total_savings, final_filename)
    my_pdf.save(final_filename)

    my_pdf.close()
Esempio n. 9
0
import pikepdf
from pikepdf import Pdf, OutlineItem

# Add a bookmark whose action runs a JavaScript alert when clicked.
with Pdf.open('TampaFD_TemporalPDF-4.pdf') as pdf:
    with pdf.open_outline() as outline:
        alert_action = pikepdf.Dictionary()
        alert_action['/S'] = pikepdf.Name('/JavaScript')
        alert_action['/JS'] = "app.alert(\"Hello from Robin\");"
        outline.root.append(
            OutlineItem('Test Alert Robin', action=alert_action))
    pdf.save('output1.pdf')
Esempio n. 10
0
    def run(self, progress_dlg):
        """Filter the optional-content layers of self.pdf into a new Pdf.

        Args:
            progress_dlg: wx progress dialog; updated once per processed
                page and polled for cancellation.

        Returns:
            The filtered pikepdf.Pdf, or None when no layers are selected
            or the user cancelled.
        """
        # open a new copy of the input
        output = pikepdf.Pdf.open(self.pdf.filename)
        self.colour_type = None

        # Keeping everything with no line-property edits: nothing to do.
        if self.keep_ocs == 'all' and len(self.line_props) == 0:
            return output

        if self.keep_ocs is None and self.keep_non_oc == False:
            print(_('No layers selected, generated PDF would be blank.'))
            return None

        if len(self.page_range) == 0:
            # human input page range is 1-indexed
            page_range = range(1, len(output.pages) + 1)
        else:
            # get rid of duplicates and zeros in the page range
            page_range = list(set([p for p in self.page_range if p > 0]))

        n_page = len(page_range)
        progress_dlg.SetRange(n_page)
        Yield()

        # change the decimal precision because it's really high
        for p in page_range:
            # apply the filter and reassign the page contents
            newstream = self.filter_content(output.pages[p - 1])
            output.pages[p - 1].Contents = output.make_stream(newstream)

            # check if there are form xobjects, and if so, filter them as well
            if '/XObject' in output.pages[p - 1].Resources.keys():
                for k in output.pages[p - 1].Resources.XObject.keys():
                    xobj = output.pages[p - 1].Resources.XObject[k]
                    if '/OC' in xobj.keys():
                        # Resolve the layer name either directly or through
                        # an OCMD's /OCGs dictionary.
                        oc = None
                        if '/Name' in xobj.OC.keys():
                            oc = str(xobj.OC.Name)

                        elif '/OCGs' in xobj.OC.keys(
                        ) and '/Name' in xobj.OC.OCGs.keys():
                            # BUG FIX: the original stored
                            # str(xobj.OC.OCGs.keys()) — the stringified set
                            # of dictionary keys — so this branch could never
                            # match a real layer name.  Read the /Name value.
                            oc = str(xobj.OC.OCGs.Name)

                        if oc in self.keep_ocs:
                            if oc in self.line_props.keys():
                                newstream = self.filter_content(xobj, layer=oc)
                                xobj.write(newstream)
                        else:
                            # if we don't want to keep it, just blank it out
                            newstream = b''
                            xobj.write(newstream)
                    else:
                        if xobj.Subtype == pikepdf.Name('/Form'):
                            newstream = self.filter_content(xobj)
                            xobj.write(newstream)

            progress_dlg.Update(page_range.index(p))
            Yield()

            if progress_dlg.WasCancelled():
                return None

        # edit the OCG listing in the root
        OCGs = [
            oc for oc in output.Root.OCProperties.OCGs
            if str(oc.Name) in self.keep_ocs
        ]
        output.Root.OCProperties.OCGs = OCGs

        # by default, unlock all layers and show all OCGs
        output.Root.OCProperties.D.Locked = []
        output.Root.OCProperties.D.Order = self.filter_ocg_order(
            output.Root.OCProperties.D.Order)

        output.remove_unreferenced_resources()

        return output