def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose ``pages`` two per output page for booklet printing.

    Pages are paired from the outside in: pair ``i`` combines ``pages[i]``
    with ``pages[-i - 1]``. For even ``i`` the page taken from the end is
    drawn on the left, for odd ``i`` the page taken from the start is.
    Every pair becomes one output page twice as wide as the wider of its
    two pages and as tall as the taller one. When ``len(pages)`` is odd the
    middle page is not placed.

    Args:
        pdfqueue: sequence whose entries expose ``copyname``, the path of a
            source PDF; entry ``n - 1`` is used for pages with ``nfile == n``.
        tmp_dir: forwarded to ``make_tmp_file`` to create the output document.
        pages: page descriptors providing ``nfile``, ``npage`` (1-based) and
            ``size_in_points()``.

    Returns:
        The file name of the saved booklet PDF.
    """
    # Local imports keep this function self-contained.
    import re
    from contextlib import ExitStack

    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}

    # pikepdf.__version__ is a string. Comparing it with '2.7.0' directly is
    # lexicographic and misorders e.g. '2.10.0' and '10.0.0', so compare the
    # numeric components instead.
    version = tuple(
        int(part) for part in re.findall(r'\d+', pikepdf.__version__)[:3])
    version += (0,) * (3 - len(version))
    needs_indirect = version < (2, 7, 0)

    # The sources must stay open until the output is saved (objects copied
    # with copy_foreign still refer to them); the ExitStack closes every one
    # of them afterwards, also when an error occurs.
    with ExitStack() as stack:
        source_files = {
            n: stack.enter_context(pikepdf.open(pdfqueue[n - 1].copyname))
            for n in file_indexes
        }
        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = _apply_geom_transform(
                file, file.copy_foreign(first_original), first)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = _apply_geom_transform(
                file, file.copy_foreign(second_original), second)

            # NOTE: content_dict is shared by all output pages, so each page's
            # resources list every XObject registered so far.
            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()

            # See PDF reference section 4.2.3 Transformation Matrices
            # Shift each page so its MediaBox origin lands at (0, 0); the
            # second page is additionally moved right by the first's width.
            tx1 = -first_foreign.MediaBox[0]
            ty1 = -first_foreign.MediaBox[1]
            tx2 = first_page_size[0] - float(second_foreign.MediaBox[0])
            ty2 = -second_foreign.MediaBox[1]

            content_txt = (f"q 1 0 0 1 {tx1} {ty1} cm /Page{i*2} Do Q "
                           f"q 1 0 0 1 {tx2} {ty2} cm /Page{i*2 + 1} Do Q ")

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
            if needs_indirect:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    return filename
def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose ``pages`` two per output page for booklet printing.

    Pages are paired from the outside in: pair ``i`` combines ``pages[i]``
    with ``pages[-i - 1]``. For even ``i`` the page taken from the end is
    drawn on the left, for odd ``i`` the page taken from the start is.
    Every pair becomes one output page twice as wide as the wider of its
    two pages and as tall as the taller one. When ``len(pages)`` is odd the
    middle page is not placed.

    Args:
        pdfqueue: sequence whose entries expose ``copyname``, the path of a
            source PDF; entry ``n - 1`` is used for pages with ``nfile == n``.
        tmp_dir: forwarded to ``make_tmp_file`` to create the output document.
        pages: page descriptors providing ``nfile``, ``npage`` (1-based) and
            ``size_in_points()``.

    Returns:
        The file name of the saved booklet PDF.
    """
    # Local imports keep this function self-contained.
    import re
    from contextlib import ExitStack

    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}

    # pikepdf.__version__ is a string. Comparing it with '2.7.0' directly is
    # lexicographic and misorders e.g. '2.10.0' and '10.0.0', so compare the
    # numeric components instead.
    version = tuple(
        int(part) for part in re.findall(r'\d+', pikepdf.__version__)[:3])
    version += (0,) * (3 - len(version))
    needs_indirect = version < (2, 7, 0)

    # The sources must stay open until the output is saved (objects copied
    # with copy_foreign still refer to them); the ExitStack closes every one
    # of them afterwards, also when an error occurs.
    with ExitStack() as stack:
        source_files = {
            n: stack.enter_context(pikepdf.open(pdfqueue[n - 1].copyname))
            for n in file_indexes
        }
        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = file.copy_foreign(first_original)
            _update_angle(first, first_original, first_foreign)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = file.copy_foreign(second_original)
            _update_angle(second, second_original, second_foreign)

            # NOTE: content_dict is shared by all output pages, so each page's
            # resources list every XObject registered so far.
            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()

            # First page is drawn at the origin, the second one is moved
            # right by the width of the first.
            content_txt = (
                f'q 1 0 0 1 0 0 cm /Page{i*2} Do Q'
                f' q 1 0 0 1 {first_page_size[0]} 0 cm /Page{i*2 + 1} Do Q ')

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
            if needs_indirect:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    return filename
def _scale(doc, page, factor):
    """Return a page that draws ``page`` scaled by ``factor``.

    With ``factor == 1`` the input page is returned untouched. Otherwise the
    input is made indirect in ``doc``, turned into a form XObject and drawn,
    scaled, by a new page dictionary whose MediaBox is the input MediaBox
    multiplied by ``factor``.
    """
    if factor == 1:
        return page

    # The rotation is re-applied on the wrapper page below, so it has to be
    # cleared on the wrapped page first.
    if "/Rotate" in page:
        rotation = page.Rotate
        page.Rotate = 0
    else:
        rotation = 0

    page = doc.make_indirect(page)
    index = len(doc.pages)
    scaled_box = [factor * float(coord) for coord in page.MediaBox]
    draw_cmd = f"q {factor} 0 0 {factor} 0 0 cm /p{index} Do Q"
    form = pikepdf.Page(page).as_form_xobject()
    return pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=scaled_box,
        Contents=doc.make_stream(draw_cmd.encode()),
        Resources={'/XObject': {f'/p{index}': form}},
        Rotate=rotation,
    )
def test_draw_page(pal, monkeypatch):
    """Check PNG rendering of a page with the external renderer stubbed out.

    ``pikepdf._methods.run`` is replaced first by a stub that raises
    ``FileNotFoundError`` and then by one that returns a tiny PNG, so the
    test does not depend on whether mudraw is installed.
    """
    first_page = pikepdf.Page(pal.pages[0])

    def missing_executable(prog_args, *args, **kwargs):
        raise FileNotFoundError(prog_args[0])

    monkeypatch.setattr(pikepdf._methods, 'run', missing_executable)
    failed_bundle = first_page._repr_mimebundle_(
        include=['image/png'], exclude=['application/pdf']
    )
    assert (
        'image/png' not in failed_bundle
    ), "Generated image/png when mudraw() was rigged to fail"

    def one_pixel_png(prog_args, *args, **kwargs):
        buffer = BytesIO()
        Image.new('1', (1, 1)).save(buffer, format='PNG')
        return subprocess.CompletedProcess(
            prog_args, 0, stdout=buffer.getvalue(), stderr=b''
        )

    monkeypatch.setattr(pikepdf._methods, 'run', one_pixel_png)
    rendered_bundle = first_page._repr_mimebundle_(
        include=['image/png'], exclude=['application/pdf']
    )
    assert (
        'image/png' in rendered_bundle
    ), "Did not generate image/png when mudraw() was rigged to succeed"
def _scale(doc, page, factor):
    """Return a page that draws ``page`` scaled by ``factor``.

    With ``factor == 1`` the input page is returned untouched. Otherwise the
    input is turned into a form XObject and drawn, scaled, by a new indirect
    page dictionary whose MediaBox is the input MediaBox multiplied by
    ``factor``.
    """
    if factor == 1:
        return page

    # The rotation is re-applied on the wrapper page below, so it has to be
    # cleared on the wrapped page first.
    if "/Rotate" in page:
        rotation = page.Rotate
        page.Rotate = 0
    else:
        rotation = 0

    index = len(doc.pages)
    scaled_box = [factor * float(coord) for coord in page.MediaBox]
    draw_cmd = f"q {factor} 0 0 {factor} 0 0 cm /p{index} Do Q"
    form = pikepdf.Page(page).as_form_xobject()
    wrapper = pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=scaled_box,
        Contents=doc.make_stream(draw_cmd.encode()),
        Resources={'/XObject': {f'/p{index}': form}},
        Rotate=rotation,
    )
    # Making the result indirect was needed for pikepdf <= 2.6.0, see
    # https://github.com/pikepdf/pikepdf/issues/174
    # It is also needed with pikepdf 4.2, otherwise as_form_xobject in
    # generate_booklet fails with:
    # RuntimeError: QPDFPageObjectHelper::getFormXObjectForPage called with a direct object
    return doc.make_indirect(wrapper)
def test_filter_names(pal):
    """A name-collecting filter sees '/Im0' and emits no filtered output."""
    first_page = pikepdf.Page(pal.pages[0])
    name_collector = FilterCollectNames()

    filtered = first_page.get_filtered_contents(name_collector)
    assert filtered == b''
    assert name_collector.names == ['/Im0']

    # The page's own content stream must still hold data afterwards.
    remaining = first_page.obj.Contents.read_bytes()
    assert remaining != b''
def test_image_scale0(resources, outpdf):
    """An XObject drawn with an all-zero matrix yields a non-finite image DPI."""
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        form = pikepdf.Page(cmyk.pages[0]).as_form_xobject()

        target = pikepdf.Pdf.new()
        target.add_blank_page(page_size=(72, 72))
        target_page = pikepdf.Page(target.pages[0])
        objname = target_page.add_resource(
            target.copy_foreign(form), pikepdf.Name.XObject, pikepdf.Name.Im0)
        print(objname)

        # Draw the XObject through a degenerate (all zero) matrix.
        draw_cmd = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        target.pages[0].Contents = pikepdf.Stream(target, draw_cmd)
        target.save(outpdf)

    info = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1)
    page_info = info.pages[0]
    assert not page_info._images[0].dpi.is_finite
    assert page_info.dpi == Resolution(0, 0)
def mergePage(layerPage, mainPage, pdf, name) -> None:
    """Draw ``layerPage`` underneath the existing content of ``mainPage``.

    ``layerPage`` is converted to a form XObject, copied into ``pdf`` and
    registered in ``mainPage``'s ``/Resources /XObject`` dictionary under
    ``name``. A content stream invoking it is prepended to ``mainPage`` and
    ``mainPage`` takes over the MediaBox of ``layerPage``.

    Args:
        layerPage: page to be merged in (may belong to another PDF).
        mainPage: page of ``pdf`` that receives the layer; it is modified
            in place and must already have a ``/Resources`` entry.
        pdf: the PDF that owns ``mainPage``.
        name: resource name for the XObject as a string, e.g. ``"/Layer"``.
    """
    contentsForName = pdf.copy_foreign(
        pikepdf.Page(layerPage).as_form_xobject())
    newContents = b'q\n %s Do\nQ\n' % (name.encode())

    if not mainPage.Resources.get("/XObject"):
        mainPage.Resources["/XObject"] = pikepdf.Dictionary({})
    mainPage.Resources["/XObject"][name] = contentsForName

    # Use the MediaBox from the merged page
    mainPage.MediaBox = layerPage.MediaBox

    draw_stream = pikepdf.Stream(pdf, newContents)
    if hasattr(pikepdf.Page, 'contents_add'):
        # pikepdf >= 2.14 adds this method and deprecates the one below
        pikepdf.Page(mainPage).contents_add(draw_stream, prepend=True)
    else:
        # pikepdf < 2.14
        mainPage.page_contents_add(contents=draw_stream, prepend=True)
def strip_invisible_text(pdf, page):
    """Rewrite ``page``'s contents without text objects in render mode 3.

    The content stream is coalesced and parsed. Everything between ``BT``
    and ``ET`` is buffered; when ``ET`` is reached the buffered text object
    is kept unless the last ``Tr`` operand seen inside it was 3. All other
    instructions are kept unchanged. The result replaces ``page.Contents``
    as a new stream owned by ``pdf``.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    pikepdf.Page(page).contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if inside_text:
            if operator == set_render_mode:
                mode = operands[0]
            pending.append((operands, operator))
            if operator == end_text:
                inside_text = False
                if mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            mode = 0
            pending.append((operands, operator))
        else:
            kept.append((operands, operator))

    def as_bytes(operand):
        # Operands without unparse() are serialised via their str() form.
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    def serialise(operands, operator):
        if operator == inline_image:
            return operands[0].unparse()
        joined = b' '.join(as_bytes(operand) for operand in operands)
        return joined + b' ' + operator.unparse()

    rebuilt = b'\n'.join(
        serialise(operands, operator) for operands, operator in kept)
    page.Contents = pikepdf.Stream(pdf, rebuilt)
def _scale(doc, page, factor):
    """Return a page that draws ``page`` scaled by ``factor``.

    With ``factor == 1`` the input page is returned untouched. Otherwise the
    input is made indirect in ``doc``, turned into a form XObject and drawn,
    scaled, by a new page dictionary whose MediaBox is the input MediaBox
    multiplied by ``factor``.
    """
    if factor == 1:
        return page

    page = doc.make_indirect(page)
    index = len(doc.pages)
    scaled_box = [factor * float(coord) for coord in page.MediaBox]
    draw_cmd = f"q {factor} 0 0 {factor} 0 0 cm /p{index} Do Q"
    form = pikepdf.Page(page).as_form_xobject()
    return pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=scaled_box,
        Contents=doc.make_stream(draw_cmd.encode()),
        Resources={'/XObject': {f'/p{index}': form}},
    )
def main() -> None:
    """Strip all but the largest XObject(s) from every page of each input PDF.

    For every file named on the command line, each page keeps only the
    entries of its ``/Resources /XObject`` dictionary whose ``Length`` equals
    the largest ``Length`` found on that page. The result is written next to
    the input as ``<name without extension>XObjectsRemoved.pdf``.
    """
    # Local import: only needed to split the extension off the input path.
    import os

    # Initialize parser
    parser = init_argparse()
    args = parser.parse_args()

    for file in args.files:
        # The context manager closes the PDF even if processing fails.
        with pikepdf.Pdf.open(file) as current_file:
            for page in current_file.pages:
                pagehelper = pikepdf.Page(page)

                # Navigate to Resources/XObject (error if not found)
                cf_xobjects = pagehelper.resources.XObject

                # Snapshot name -> length once, so the dictionary is not
                # mutated while it is being iterated.
                # Other possible conditionals are whatever is /DCTDecode
                lengths = {
                    key: cf_xobjects.get(key).Length for key in cf_xobjects
                }
                maxlength = max(lengths.values(), default=-1)

                # For deletion, need to use dictionary addressing as
                # otherwise we're just deleting the object reference
                for key, length in lengths.items():
                    if length != maxlength:
                        del cf_xobjects[key]

            # Remove unreferenced resources
            # Probably unnecessary as we're doing this backward
            current_file.remove_unreferenced_resources()

            # Save the pdf with compression. splitext handles extensions of
            # any length, unlike slicing off the last four characters.
            save_filename = os.path.splitext(file)[0] + "XObjectsRemoved.pdf"
            current_file.save(
                filename_or_stream=save_filename,
                object_stream_mode=pikepdf.ObjectStreamMode.generate,
                compress_streams=True,
                recompress_flate=True,
                encryption=False)
def test_display_rich_page(pal):
    """The page's mime bundle offers 'application/pdf' when it is requested."""
    first_page = pikepdf.Page(pal.pages[0])
    bundle = first_page._repr_mimebundle_(
        include=['application/pdf'], exclude=['application/malware']
    )
    assert 'application/pdf' in bundle
def test_invalid_handle_token(pal):
    """Filtering with FilterInvalid raises TypeError or pikepdf.PdfError."""
    first_page = pikepdf.Page(pal.pages[0])
    acceptable_errors = (TypeError, pikepdf.PdfError)
    with pytest.raises(acceptable_errors):
        first_page.get_filtered_contents(FilterInvalid())
def test_tokenfilter_is_abstract(pal):
    """If the base TokenFilter raises PdfError, a pure-virtual warning is logged.

    The test passes without further checks when no PdfError is raised.
    """
    first_page = pikepdf.Page(pal.pages[0])
    try:
        first_page.get_filtered_contents(pikepdf.TokenFilter())
    except pikepdf.PdfError:
        first_warning = pal.get_warnings()[0]
        assert 'Tried to call pure virtual' in first_warning
def graph_page(graph):
    """Wrap the first page of ``graph`` in a ``pikepdf.Page``."""
    first_page = graph.pages[0]
    return pikepdf.Page(first_page)
def _graft_text_layer(
    self,
    *,
    page_num: int,
    textpdf: Path,
    font: pikepdf.Object,
    font_key: pikepdf.Object,
    procset: pikepdf.Object,
    text_rotation: int,
    strip_old_text: bool,
) -> None:
    """Insert the text layer from text page 0 on to pdf_base at page_num

    The content stream of the first page of ``textpdf`` is wrapped in a form
    XObject, registered under a random UUID name in the base page's
    ``/Resources /XObject`` dictionary, and drawn by a new content stream
    that is prepended to the base page. Nothing is done when ``textpdf`` is
    an empty (zero byte) file.

    Args:
        page_num: page selector passed to ``self.pdf_base.pages.p()``.
        textpdf: path of the PDF whose first page holds the text layer.
        font: forwarded to ``_update_resources`` for both the XObject and
            the base page.
        font_key: forwarded to ``_update_resources`` alongside ``font``.
        procset: forwarded to ``_update_resources`` for the base page only;
            the XObject always gets ``[Name.PDF]``.
        text_rotation: clockwise angle of the page content, in degrees.
        strip_old_text: if true, ``strip_invisible_text`` is run on the base
            page before the new layer is added.
    """
    log.debug("Grafting")
    if Path(textpdf).stat().st_size == 0:
        return

    with pikepdf.open(textpdf) as pdf_text:
        pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

        # This is a pointer indicating a specific page in the base file
        base_page = self.pdf_base.pages.p(page_num)

        # The text page always will be oriented up by this stage but the original
        # content may have a rotation applied. Wrap the text stream with a rotation
        # so it will be oriented the same way as the rest of the page content.
        # (Previous versions of OCRmyPDF rotated the content layer to match the text.)
        # wt/ht: width and height of the text page.
        mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
        wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        # wp/hp: width and height of the base page. From here on ``mediabox``
        # refers to the base page; it is reused for ``corner`` and the BBox.
        mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
        wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
        untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
        corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])

        # -rotation because the input is a clockwise angle and this formula
        # uses CCW
        # (unary minus binds tighter than %, so this is (-text_rotation) % 360)
        text_rotation = -text_rotation % 360
        rotate = pikepdf.PdfMatrix().rotated(text_rotation)

        # Because of rounding of DPI, we might get a text layer that is not
        # identically sized to the target page. Scale to adjust. Normally this
        # is within 0.998.
        if text_rotation in (90, 270):
            # After a quarter turn the text page's width and height swap roles.
            wt, ht = ht, wt
        scale_x = wp / wt
        scale_y = hp / ht

        # log.debug('%r', scale_x, scale_y)
        scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

        # Translate the text so it is centered at (0, 0), rotate it there, adjust
        # for a size different between initial and text PDF, then untranslate, and
        # finally move the lower left corner to match the mediabox
        ctm = translate @ rotate @ scale @ untranslate @ corner

        base_resources = _ensure_dictionary(base_page, Name.Resources)
        base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
        # A random UUID is used as the XObject's resource name.
        text_xobj_name = Name('/' + str(uuid.uuid4()))
        xobj = self.pdf_base.make_stream(pdf_text_contents)
        base_xobjs[text_xobj_name] = xobj
        xobj.Type = Name.XObject
        xobj.Subtype = Name.Form
        xobj.FormType = 1
        xobj.BBox = mediabox
        _update_resources(obj=xobj, font=font, font_key=font_key, procset=[Name.PDF])

        # Content stream that applies ctm and invokes the text XObject.
        pdf_draw_xobj = ((b'q %s cm\n' % ctm.encode()) +
                         (b'%s Do\n' % text_xobj_name) + b'\nQ\n')
        new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)

        if strip_old_text:
            strip_invisible_text(self.pdf_base, base_page)

        if hasattr(pikepdf.Page, 'contents_add'):
            # pikepdf >= 2.14 adds this method and deprecates the one below
            pikepdf.Page(base_page).contents_add(new_text_layer, prepend=True)
        else:
            # pikepdf < 2.14
            base_page.page_contents_add(new_text_layer, prepend=True)  # pragma: no cover

        _update_resources(obj=base_page, font=font, font_key=font_key, procset=procset)
def run(self, rows=0, cols=0, actually_trim=0):
    """Tile the selected pages of ``self.in_doc`` onto one large page.

    Every page in ``self.page_range`` is copied into a new document as a
    form XObject and placed on a ``rows`` x ``cols`` grid. Entries of the
    page range that are not positive leave an empty grid cell; entries
    beyond the end of the document are skipped with a message. The grid
    cell size is taken from the first page that is copied.

    Args:
        rows: number of grid rows; 0 means derive it.
        cols: number of grid columns; 0 means derive it. If both are 0 a
            square grid is used. If ``cols`` is given, ``rows`` is
            recomputed from the number of tiles and a warning is printed
            when it differs from a non-zero requested value.
        actually_trim: when 1, a TrimBox shrunk by ``self.trim`` is set on
            each copied page (created from the MediaBox if missing).

    Returns:
        The new ``pikepdf.Pdf`` holding the single tiled page, or ``None``
        if no input document is loaded.
    """
    if self.in_doc is None:
        print(_('Input document not loaded'))
        return

    if len(self.page_range) == 0:
        self.set_page_range()

    # initialize a new document and copy over the layer info (OCGs) if it exists
    new_doc = pikepdf.Pdf.new()

    if '/OCProperties' in self.in_doc.Root.keys():
        localRoot = new_doc.copy_foreign(self.in_doc.Root)
        new_doc.Root.OCProperties = localRoot.OCProperties

    content_dict = pikepdf.Dictionary({})
    page_names = []
    pw = None
    ph = None
    page_size_ref = 0

    page_count = len(self.in_doc.pages)
    # trim is used below as [left, right, top, bottom]
    trim = [self.units_to_px(t) for t in self.trim]

    for p in self.page_range:
        if p > page_count:
            print(
                _('Only {} pages in document, skipping {}').format(
                    page_count, p))
            continue

        if p > 0:
            pagekey = f'/Page{p}'

            if pagekey not in content_dict.keys():
                # copy the page over as an xobject
                # pikepdf.pages is zero indexed, so subtract one
                localpage = new_doc.copy_foreign(self.in_doc.pages[p - 1])

                # set the trim box to cut off content if requested
                if actually_trim == 1:
                    if '/TrimBox' not in localpage.keys():
                        localpage.TrimBox = copy.copy(localpage.MediaBox)

                    localpage.TrimBox[0] = float(
                        localpage.TrimBox[0]) + trim[0]
                    localpage.TrimBox[1] = float(
                        localpage.TrimBox[1]) + trim[3]
                    localpage.TrimBox[2] = float(
                        localpage.TrimBox[2]) - trim[1]
                    localpage.TrimBox[3] = float(
                        localpage.TrimBox[3]) - trim[2]

                content_dict[pagekey] = pikepdf.Page(
                    localpage).as_form_xobject()

                # only get the width/height for the first page
                if pw is None:
                    pw = float(localpage.MediaBox[2])
                    ph = float(localpage.MediaBox[3])
                    page_size_ref = p
                elif abs(pw - float(localpage.MediaBox[2])) > 1 or abs(
                        ph - float(localpage.MediaBox[3])) > 1:
                    # Look the message up in the catalogue first and format
                    # afterwards; formatting inside _() defeats translation.
                    print(
                        _('Warning: page {} is a different size from {}, output may be unpredictable'
                          ).format(p, page_size_ref))

            page_names.append(pagekey)
        else:
            page_names.append(None)

    # the width/height of the first copied page is used for every tile
    # create a new document with a page big enough to contain all the tiled pages, plus requested margin
    # figure out how big it needs to be based on requested columns/rows
    n_tiles = len(page_names)
    if cols == 0 and rows == 0:
        # try for square
        cols = math.ceil(math.sqrt(n_tiles))
        rows = cols

    # columns take priority if both are specified
    if cols > 0:
        rrows = rows
        rows = math.ceil(n_tiles / cols)
        if rrows != rows and rrows != 0:
            print(
                _('Warning: requested {} columns and {} rows, but {} rows are needed with {} pages'
                  ).format(cols, rrows, rows, n_tiles))
    else:
        cols = math.ceil(n_tiles / rows)

    # convert the margin and trim options into pixels
    unitstr = 'cm' if self.units else 'in'
    margin = self.units_to_px(self.margin)

    rotstr = _('None')
    if self.rotation == 1:
        rotstr = _('Clockwise')
    if self.rotation == 2:
        rotstr = _('Counterclockwise')

    orderstr = _('Rows then columns')
    if self.col_major:
        orderstr = _('Columns then rows')

    lrstr = _('Left to right')
    if self.right_to_left:
        lrstr = _('Right to left')

    btstr = _('Top to bottom')
    if self.bottom_to_top:
        btstr = _('Bottom to top')

    print(_('Tiling with {} rows and {} columns').format(rows, cols))
    print(_('Options') + ':')
    print(' ' + _('Margins') + ': {} {}'.format(self.margin, unitstr))
    print(' ' + _('Trim') + ': {} {}'.format(self.trim, unitstr))
    print(' ' + _('Rotation') + ': {}'.format(rotstr))
    print(' ' + _('Page order') +
          ': {}, {}, {}'.format(orderstr, lrstr, btstr))

    # define the media box with the final grid + margins
    # run through the width/height combos to find the maximum required

    # R is the rotation matrix (default to identity)
    R = [1, 0, 0, 1]

    # We need to account for the shift in origin if page rotation is applied
    o_shift = [0, 0]

    if self.rotation != 0:
        # define the rotation transform and
        # swap the trim order
        if self.rotation == 1:
            R = [0, -1, 1, 0]
            o_shift = [0, pw]
            order = [3, 2, 0, 1]
        if self.rotation == 2:
            R = [0, 1, -1, 0]
            o_shift = [ph, 0]
            order = [2, 3, 1, 0]

        # swap width and height of pages
        pw, ph = ph, pw
        trim = [trim[o] for o in order]

    # define the output page media box
    width = (pw - trim[0] - trim[1]) * cols
    height = (ph - trim[2] - trim[3]) * rows
    media_box = [0, 0, width + 2 * margin, height + 2 * margin]

    content_txt = ''

    for i in range(n_tiles):
        if not page_names[i]:
            continue

        # grid position of tile i
        if self.col_major:
            c = math.floor(i / rows)
            r = i % rows
        else:
            r = math.floor(i / cols)
            c = i % cols

        if self.right_to_left:
            c = cols - c - 1
        # PDF y grows upwards, so top-to-bottom order needs the row flipped
        if not self.bottom_to_top:
            r = rows - r - 1

        x0 = margin - trim[0] + c * (pw - trim[0] - trim[1])
        y0 = margin - trim[3] + r * (ph - trim[2] - trim[3])

        # don't scale, just shift and rotate
        # first shift to origin, then rotate, then shift to final destination
        content_txt += f'q {R[0]} {R[1]} {R[2]} {R[3]} {x0+o_shift[0]} {y0+o_shift[1]} cm '
        content_txt += f'{page_names[i]} Do Q '

    newpage = pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=media_box,
        Resources=pikepdf.Dictionary(XObject=content_dict),
        Contents=pikepdf.Stream(new_doc, content_txt.encode()))

    new_doc.pages.append(newpage)

    return new_doc
def test_filter_thru(pal, filter, expected):
    """Attaching ``filter()`` to the page makes its contents equal ``expected``."""
    first_page = pikepdf.Page(pal.pages[0])
    token_filter = filter()
    first_page.add_content_token_filter(token_filter)
    filtered_bytes = first_page.obj.Contents.read_bytes()
    assert filtered_bytes == expected
def test_tokenfilter_is_abstract(pal):
    """Filtering with the base TokenFilter raises RuntimeError or PdfError."""
    first_page = pikepdf.Page(pal.pages[0])
    acceptable_errors = (RuntimeError, pikepdf.PdfError)
    with pytest.raises(acceptable_errors):
        first_page.get_filtered_contents(pikepdf.TokenFilter())
def test_invalid_tokenfilter(pal):
    """Passing a plain list instead of a token filter raises TypeError."""
    first_page = pikepdf.Page(pal.pages[0])
    not_a_filter = []
    with pytest.raises(TypeError):
        first_page.get_filtered_contents(not_a_filter)