def test_page_labels():
    """Check that ``Page.label`` follows the rules stored in /PageLabels."""
    pdf = Pdf.new()
    template = Dictionary(
        Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary()
    )
    for index in range(5):
        pdf.pages.append(template)
        page_text = b"BT (Page %s) Tj ET" % str(index).encode()
        pdf.pages[index].Contents = Stream(pdf, page_text)

    # Rule starting at page index 0: lowercase roman numerals.
    roman_rule = Dictionary(S=Name.r)
    # Rule starting at page index 2: 'Prefix-42', 'Prefix-43', ...
    prefixed_rule = Dictionary(S=Name.D, St=42, P='Prefix-')
    pdf.Root.PageLabels = pdf.make_indirect(
        Dictionary(Nums=Array([0, roman_rule, 2, prefixed_rule]))
    )

    expected_labels = ['i', 'ii', 'Prefix-42', 'Prefix-43', 'Prefix-44']
    for index, expected in enumerate(expected_labels):
        assert Page(pdf.pages[index]).label == expected
def test_filter_decodeparms_mismatch(self, stream_object):
    """One filter paired with two decode_parms entries must raise ValueError."""
    payload = compress(b'foo')
    one_filter = [Name.FlateDecode]
    two_parms = [Dictionary(), Dictionary()]
    with pytest.raises(ValueError, match=r"filter.*and decode_parms"):
        stream_object.write(payload, filter=one_filter, decode_parms=two_parms)
def test_oddwidth_grayscale(bits, check_pixels):
    """Build a 3x2 DeviceGray image at ``bits`` per component and extract it.

    ``check_pixels`` is an iterable of ``(x, y, value)`` triples that the
    extracted image must match.
    """
    pdf = pikepdf.new()
    pdf.add_blank_page(page_size=(108, 72))

    raw_samples = bytes([0b00011011, 0b11011000, 0b00000001])
    image_stream = Stream(
        pdf,
        raw_samples,
        BitsPerComponent=bits,
        ColorSpace=Name.DeviceGray,
        Width=3,
        Height=2,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    pdf.pages[0].Contents = Stream(pdf, b'108 0 0 72 0 0 cm /Im0 Do')
    pdf.pages[0].Resources = Dictionary(XObject=Dictionary(Im0=image_stream))

    pim = PdfImage(pdf.pages[0].Resources.XObject.Im0)
    assert pim.mode == 'L'
    assert pim.bits_per_component == bits

    buffer = BytesIO()
    pim.extract_to(stream=buffer)
    buffer.seek(0)
    extracted = Image.open(buffer)
    assert extracted.mode == 'L'
    assert extracted.size == (3, 2)

    # Debugging aid: pdf.save(f'oddbit_{bits}.pdf')
    for check_x, check_y, val in check_pixels:
        assert extracted.getpixel((check_x, check_y)) == val
def test_repr_dict(self):
    """Compare repr() of a Dictionary with the expected text and eval() it back."""

    def squeeze(text):
        # The comparison ignores all whitespace.
        return ''.join(text.split())

    sample = Dictionary({
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Operator': Operator('q'),
        '/Dictionary': Dictionary({'/Color': 'Red'}),
        '/None': None,
    })

    # The expected text for the real 3.14 depends on the libqpdf version:
    # '3.14' from 10.2.0 onwards, '3.140000' before that.
    qpdf_version = LooseVersion(pikepdf.__libqpdf_version__)
    short_pi = '3.14' if qpdf_version >= LooseVersion('10.2.0') else '3.140000'

    template = """\
        pikepdf.Dictionary({
            "/Array": [ 1, 2, Decimal('%s') ],
            "/Boolean": True,
            "/Dictionary": {
                "/Color": "Red"
            },
            "/Integer": 42,
            "/None": None,
            "/Operator": pikepdf.Operator("q"),
            "/Real": Decimal('42.42'),
            "/String": "hi"
        })
    """
    expected = template % short_pi

    actual = repr(sample)
    assert squeeze(actual) == squeeze(expected)
    # The repr is only ever produced by this test, so eval() is safe here.
    assert eval(repr(sample)) == sample
def test_repr_dict(self):
    """Compare repr() of a Dictionary with the expected text and eval() it back."""

    def squeeze(text):
        # The comparison ignores all whitespace.
        return ''.join(text.split())

    sample = Dictionary({
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Operator': Operator('q'),
        '/Dictionary': Dictionary({'/Color': 'Red'})
    })

    expected = """\
        pikepdf.Dictionary({
            "/Array": [ 1, 2, Decimal('3.140000') ],
            "/Boolean": True,
            "/Dictionary": {
                "/Color": "Red"
            },
            "/Integer": 42,
            "/Operator": pikepdf.Operator("q"),
            "/Real": Decimal('42.42'),
            "/String": "hi"
        })
    """

    actual = repr(sample)
    assert squeeze(actual) == squeeze(expected)
    # The repr is only ever produced by this test, so eval() is safe here.
    assert eval(repr(sample)) == sample
def test_invalid_stream_object():
    """parse_content_stream must reject objects that are not pages or streams.

    A plain integer and a dictionary that is not a page raise ``TypeError``;
    a page dictionary whose /Contents is an integer raises ``PdfError``.
    """
    with pytest.raises(TypeError):
        parse_content_stream(42)

    with pytest.raises(TypeError):
        parse_content_stream(Dictionary({"/Hi": 3}))

    # Build the bogus page outside the raises block so that only the parser
    # call itself can satisfy the expectation.
    false_page = Dictionary(Type=Name.Page, Contents=42)
    with pytest.raises(PdfError):
        parse_content_stream(false_page)
def test_unattached_page():
    """Reading ``index`` or ``label`` on a page outside any Pdf raises ValueError."""
    detached = Page(
        Dictionary(Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary())
    )
    for attribute in ('index', 'label'):
        with pytest.raises(ValueError, match='not attached'):
            getattr(detached, attribute)
def rewrite_png(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Write the PNG data in ``compdata`` into ``im_obj`` as a Flate stream.

    The compressed image data is written as-is; the remaining PNG header
    information (dimensions, bit depth, palette, colour space) is transferred
    to the PDF image dictionary.

    Arguments:
        pike: PDF that owns ``im_obj``; a palette stream is created in it
            when the image is indexed.
        im_obj: image object to overwrite.
        compdata: compressed image data exposing ``predictor``, ``bps``,
            ``spp``, ``w``, ``h``, ``ncolors``, ``get_palette_pdf_string()``
            and ``read()``.
    """
    # The PDF reader must be told whether a predictor was applied before
    # Flate encoding (typically one is). According to the Leptonica source,
    # readers do not need the exact predictor, only one of:
    #   1     - no predictor
    #   10-14 - there is a predictor
    # Leptonica's compdata->predictor is only a TRUE/FALSE flag. For any value
    # >= 10 the reader uses whatever predictor the PNG data itself specifies.
    # Leptonica should in practice use Paeth (14), but 15 appears to be the
    # designated "optimal" value, so 15 is used.
    # See:
    # - PDF RM 7.4.4.4 Table 10
    # - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
    uses_predictor = compdata.predictor > 0
    dparms = Dictionary(Predictor=15 if uses_predictor else 1)
    if uses_predictor:
        dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
        dparms.Colors = compdata.spp
        dparms.Columns = compdata.w

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
    )
    if compdata.ncolors > 0:
        # .ncolors counts palette entries, not the colours used in a true
        # colour image. The palette string is always given as RGB tuples even
        # when the image is grayscale; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        palette_data = pikepdf.Object.parse(compdata.get_palette_pdf_string())
        palette_stream = pikepdf.Stream(pike, bytes(palette_data))
        colorspace = [
            Name.Indexed,
            Name.DeviceRGB,
            compdata.ncolors - 1,
            palette_stream,
        ]
    else:
        # ncolors == 0: a colour space without a palette, chosen by samples
        # per pixel. Anything other than 1 or 4 is treated as RGB.
        by_samples = {1: Name.DeviceGray, 4: Name.DeviceCMYK}
        colorspace = by_samples.get(compdata.spp, Name.DeviceRGB)

    im_obj.ColorSpace = colorspace
    im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
def from_dictionary_object(cls, obj: Dictionary):
    """Build an ``OutlineItem`` from a single outline node of a PDF document.

    Nested items are not processed. The node's /Dest and /A entries are
    passed on as destination and action when present.

    Arguments:
        obj: ``Dictionary`` object representing a single outline node.
    """
    return cls(
        str(obj.Title),
        destination=obj.get(Name.Dest),
        action=obj.get(Name.A),
        obj=obj,
    )
def _save_level_outline(
    self,
    parent: Dictionary,
    outline_items: Iterable[OutlineItem],
    level: int,
    visited_objs: Set[Tuple[int, int]],
):
    """Write one level of outline items beneath ``parent``, then recurse.

    Each item is converted to its dictionary object and linked to its
    siblings through /Prev and /Next and to ``parent`` through /Parent.
    ``parent`` receives /First, /Last and /Count. Children are written
    recursively while ``level`` is below ``self._max_depth``.

    Arguments:
        parent: dictionary that owns this level.
        outline_items: items to write at this level.
        level: depth of this level; the initial call passes 0.
        visited_objs: objgen pairs already written; shared across the
            recursion and updated in place.

    Raises:
        OutlineStructureError: if an object occurs a second time and
            ``self._strict`` is set.
    """
    count = 0
    prev: Optional[Dictionary] = None
    first: Optional[Dictionary] = None
    for item in outline_items:
        out_obj = item.to_dictionary_object(self._pdf)
        objgen = out_obj.objgen
        if objgen in visited_objs:
            # The same object was already written elsewhere in the tree.
            if self._strict:
                raise OutlineStructureError(
                    f"Outline object {objgen} reoccurred in structure")
            # Non-strict mode: give this occurrence a fresh object instead.
            out_obj = item.to_dictionary_object(self._pdf, create_new=True)
        else:
            visited_objs.add(objgen)

        out_obj.Parent = parent
        count += 1
        if prev is not None:
            prev.Next = out_obj
            out_obj.Prev = prev
        else:
            # First item at this level: it must not keep a stale /Prev.
            first = out_obj
            if Name.Prev in out_obj:
                del out_obj.Prev
        prev = out_obj
        # Beyond the maximum depth, children are written as an empty level.
        if level < self._max_depth:
            sub_items = item.children
        else:
            sub_items = ()
        # The recursive call sets out_obj.Count, which is read just below.
        self._save_level_outline(out_obj, sub_items, level + 1, visited_objs)
        if item.is_closed:
            # A closed item stores its count negated and adds nothing to
            # the parent's total beyond itself.
            out_obj.Count = -cast(int, out_obj.Count)
        else:
            count += cast(int, out_obj.Count)
    if count:
        assert prev is not None and first is not None
        # The last item must not keep a stale /Next.
        if Name.Next in prev:
            del prev.Next
        parent.First = first
        parent.Last = prev
    else:
        # Empty level: remove any leftover /First and /Last.
        if Name.First in parent:
            del parent.First
        if Name.Last in parent:
            del parent.Last
    parent.Count = count
def test_invalid_stream_object():
    """parse_content_stream must reject objects that are not pages or streams.

    Each case raises ``TypeError`` with a message specific to what was
    passed: a non-pikepdf object, a dictionary that is not a page, and a
    page dictionary whose /Contents is not a stream.
    """
    with pytest.raises(TypeError, match="must be a pikepdf.Object"):
        parse_content_stream(42)

    with pytest.raises(TypeError, match="called on page or stream"):
        parse_content_stream(Dictionary({"/Hi": 3}))

    # Build the bogus page outside the raises block so that only the parser
    # call itself can satisfy the expectation.
    false_page = Dictionary(Type=Name.Page, Contents=42)
    with pytest.raises(
        TypeError, match="parse_content_stream called on non-stream Object"
    ):
        parse_content_stream(false_page)
def test_copy():
    """copy() yields an equal but distinct Dictionary."""
    original = Dictionary({
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Dictionary': Dictionary({'/Color': 'Red'}),
    })

    duplicate = copy(original)

    assert duplicate == original
    assert duplicate is not original
    assert duplicate['/Dictionary'] == original['/Dictionary']
def test_update_info(graph, outdir):
    """A changed /Title survives save and reopen; docinfo must be indirect."""
    new_title = '我敢打赌,你只是想看看这意味着什么'
    saved_path = outdir / 'out.pdf'

    graph.docinfo['/Title'] = new_title
    graph.save(saved_path)

    reopened = Pdf.open(saved_path)
    assert reopened.docinfo['/Title'] == new_title
    assert graph.docinfo['/Author'] == reopened.docinfo['/Author']

    # Assigning a direct dictionary as docinfo is rejected...
    with pytest.raises(ValueError):
        reopened.docinfo = Dictionary({'/Keywords': 'bob'})

    # ...while an indirect one is accepted.
    reopened.docinfo = graph.make_indirect(Dictionary({'/Keywords': 'bob'}))
    assert reopened.docinfo.is_indirect, "/Info must be an indirect object"
def test_ccitt(self, stream_object):
    """write() accepts a CCITTFaxDecode filter with a decode_parms dictionary."""
    # Not valid CCITT data; only the decode_parms handling is exercised.
    fake_data = b'\x00'
    parms = Dictionary(K=-1, Columns=8, Length=1)
    stream_object.write(fake_data, filter=Name.CCITTFaxDecode, decode_parms=parms)
def to_dictionary_object(self, pdf: Pdf, create_new: bool = False) -> Dictionary:
    """Write this outline node's data to a ``Dictionary`` object.

    An existing object is updated in place unless ``create_new`` is set or
    no object exists yet. An integer destination is resolved to a page
    destination on ``pdf`` and stored back on this item. A destination takes
    precedence over an action; whichever of /Dest and /A is not written is
    removed from the object.

    Arguments:
        pdf: PDF document object.
        create_new: If set to ``True``, creates a new object instead of
            modifying an existing one in-place.
    """
    if self.obj is not None and not create_new:
        obj = self.obj
    else:
        obj = pdf.make_indirect(Dictionary())
        self.obj = obj

    obj.Title = self.title

    if self.destination is None:
        if self.action is not None:
            obj.A = self.action
            if Name.Dest in obj:
                del obj.Dest
        return obj

    if isinstance(self.destination, int):
        # A page number: turn it into a page reference.
        self.destination = make_page_destination(
            pdf,
            self.destination,
            self.page_location,
            **self.page_location_kwargs,
        )
    obj.Dest = self.destination
    if Name.A in obj:
        del obj.A
    return obj
def set_pagelabels(doc, page_labels):
    """Store ``page_labels`` as the document's /PageLabels number tree.

    Each label is a mapping with a 1-based ``'start'`` page and optional
    ``'style'``, ``'prefix'`` and ``'initial_count'`` entries. A style of
    ``'none'`` is treated the same as no style.
    """
    optional_entries = (('prefix', '/P'), ('initial_count', '/St'))

    nums = []
    for label in page_labels:
        # page index 1-based -> 0-based
        page_index = label['start'] - 1

        rule = {}
        style = label['style'] if 'style' in label else 'none'
        if style != 'none':
            rule['/S'] = Name('/' + style)
        for source_key, pdf_key in optional_entries:
            if source_key in label:
                rule[pdf_key] = label[source_key]

        nums.extend((page_index, Dictionary(rule)))

    doc.root[Name.PageLabels] = Dictionary({'/Nums': Array(nums)})
def test_docinfo_delete_missing(sandwich):
    """load_from_docinfo(delete_missing=True) drops XMP keys absent from docinfo."""
    docinfo = Dictionary(Creator="test creator")
    with sandwich.open_metadata() as meta:
        # Starting state: a creation date exists and the creator differs.
        assert 'xmp:CreateDate' in meta
        assert meta['xmp:CreatorTool'] != 'test creator'

        meta.load_from_docinfo(docinfo, delete_missing=True)

        assert meta['xmp:CreatorTool'] == 'test creator'
        assert 'xmp:CreateDate' not in meta
def _save(self):
    """Write the outline back to the document, creating /Outlines if needed.

    Does nothing when ``self._root`` is ``None``.
    """
    if self._root is None:
        return

    catalog = self._pdf.Root
    if Name.Outlines not in catalog:
        outlines = self._pdf.make_indirect(Dictionary(Type=Name.Outlines))
        catalog.Outlines = outlines
    else:
        outlines = catalog.Outlines

    self._save_level_outline(outlines, self._root, 0, set())
def test_with_same_owner_as(vera, outlines, outpdf):
    """Exercise with_same_owner_as() for direct, foreign and invalid owners."""
    root = vera.Root
    assert root.is_owned_by(vera)

    # return reference to self
    root.IndirectDict = vera.make_indirect(Dictionary(Foo=42))
    vera.save(outpdf)

    # copy direct object case
    direct = Dictionary(Foo=42)
    root.CopiedDirectNames = direct.with_same_owner_as(root)
    vera.save(outpdf)

    # copy foreign case
    foreign_names = outlines.Root.Names
    root.ForeignNames = foreign_names.with_same_owner_as(root)
    vera.save(outpdf)

    # invalid other owner case
    with pytest.raises(ValueError):
        outlines.Root.Names.with_same_owner_as(Dictionary(Foo=42))
def test_failed_add_page_cleanup():
    """A rejected pages.append() must not leave a usable stray object behind."""
    expected_error = "only pages can be inserted"
    pdf = Pdf.new()

    not_a_page = Dictionary(Type=Name.NotAPage)
    objects_before = len(pdf.objects)
    with pytest.raises(TypeError, match=expected_error):
        pdf.pages.append(not_a_page)
    assert len(pdf.pages) == 0

    # After a failed insertion we expect exactly one new null object handle,
    # since QPDF does not remove the object outright.
    assert len(pdf.objects) == objects_before + 1, "QPDF semantics changed"
    assert pdf.objects[-1] is None, "Left a stale object behind without deleting"

    # An object that already belonged to the Pdf must not be deleted.
    existing = pdf.make_indirect(Dictionary(Type=Name.StillNotAPage))
    with pytest.raises(TypeError, match=expected_error):
        pdf.pages.append(existing)
    assert len(pdf.pages) == 0

    assert existing.same_owner_as(pdf.Root)
def test_json():
    """to_json() output parses back to the equivalent plain Python structure."""
    source = Dictionary({
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Dictionary': Dictionary({'/Color': 'Red'}),
    })
    expected = {
        "/Array": [1, 2, 3.14],
        "/Boolean": True,
        "/Dictionary": {"/Color": "Red"},
        "/Integer": 42,
        "/Real": 42.42,
        "/String": "hi",
    }

    parsed = json.loads(source.to_json(False))

    assert parsed == expected
class TestAddResource:
    """Tests for ``add_resource`` on the ``graph_page`` fixture."""

    # Minimal image XObject dictionary shared by all tests.
    d = Dictionary(Type=Name.XObject, Subtype=Name.Image, Width=1, Height=1)

    def test_basic(self, graph_page):
        """Name clashes need replace_existing; a prefix generates a new name."""
        image = self.d

        with pytest.raises(ValueError, match="already exists"):
            graph_page.add_resource(
                image, Name.XObject, Name.Im0, replace_existing=False
            )

        replaced = graph_page.add_resource(
            image, Name.XObject, Name.Im0, replace_existing=True
        )
        assert graph_page.resources.XObject[replaced].Width == 1

        generated = graph_page.add_resource(image, Name.XObject, prefix='Im')
        assert str(generated).startswith("/Im")
        assert graph_page.resources.XObject[generated].Height == 1

    def test_resources_exists_but_wrong_type(self, graph_page):
        """A /Resources entry that is not a dictionary raises TypeError."""
        page_obj = graph_page.obj
        del page_obj.Resources
        page_obj.Resources = Name.Dummy
        with pytest.raises(TypeError, match='exists but is not a dictionary'):
            graph_page.add_resource(
                self.d, Name.XObject, Name.Im0, replace_existing=False
            )

    def test_create_resource_dict_if_not_exists(self, graph_page):
        """A missing /Resources dictionary is created on demand."""
        del graph_page.obj.Resources
        graph_page.add_resource(
            self.d, Name.XObject, Name.Im0, replace_existing=False
        )
        assert Name.Resources in graph_page.obj

    def test_name_and_prefix(self, graph_page):
        """Passing both name and prefix raises ValueError."""
        with pytest.raises(ValueError, match="one of"):
            graph_page.add_resource(
                self.d, Name.XObject, name=Name.X, prefix='y'
            )

    def test_unrecognized_object_not_disturbed(self, graph_page):
        """Unrelated entries in /Resources are left in place."""
        graph_page.obj.Resources.InvalidItem = Array([42])
        graph_page.add_resource(self.d, Name.Pattern)
        assert Name.InvalidItem in graph_page.obj.Resources
def test_json():
    """to_json() output parses back to the equivalent plain Python structure."""
    source = Dictionary({
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Dictionary': Dictionary({'/Color': 'Red'}),
    })
    expected = {
        "/Array": [1, 2, 3.140000],
        "/Boolean": True,
        "/Dictionary": {"/Color": "Red"},
        "/Integer": 42,
        "/Real": 42.42,
        "/String": "hi",
    }

    raw = source.to_json(False)
    try:
        parsed = json.loads(raw)
    except TypeError:
        # Py3.5 shim: decode the bytes first and parse the text.
        parsed = json.loads(raw.decode('utf-8'))

    assert parsed == expected
def test_extract_direct_fails_nondefault_colortransform(congress):
    """Direct extraction is refused when /ColorTransform has an unusual value."""
    xobj, _pdf = congress
    sink = BytesIO()

    def expect_rejection():
        # A fresh PdfImage is built each time to pick up changes to xobj.
        image = PdfImage(xobj)
        with pytest.raises(UnsupportedImageTypeError):
            image._extract_direct(stream=sink)

    # 42 is neither a standard value nor one the spec allows.
    xobj.DecodeParms = Dictionary(ColorTransform=42)
    expect_rejection()

    # The same must hold once the image claims to be CMYK.
    xobj.ColorSpace = Name.DeviceCMYK
    expect_rejection()
def test_nametree_crud(outline):
    """Exercise lookup, deletion and insertion on a NameTree."""
    tree = NameTree(outline.Root.Names.Dests)
    assert tree.obj == outline.Root.Names.Dests

    # Read
    assert '0' in tree
    assert isinstance(tree['0'], Object)
    assert 'foo' not in tree

    # Delete
    assert '3' in tree
    del tree['3']
    assert '3' not in tree

    # Create with pikepdf objects
    tree['3'] = Dictionary(Entry=3)
    assert tree['3'].Entry == 3

    replacement = Array([42])
    tree['newentry'] = replacement
    assert tree['newentry'] == Array([42])

    # Create with a plain Python value
    tree['py_newentry'] = 42
def rewrite_png_as_g4(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Overwrite ``im_obj`` with the data in ``compdata`` as a CCITT image.

    The image becomes 1 bit per component with a CCITTFaxDecode filter and
    ``K=-1`` decode parameters. Any existing /Predictor and /DecodeParms
    entries are removed first.

    Arguments:
        pike: unused by this function.
        im_obj: image object to overwrite.
        compdata: compressed image data exposing ``w``, ``h``,
            ``minisblack`` and ``read()``.
    """
    width = compdata.w

    im_obj.BitsPerComponent = 1
    im_obj.Width = width
    im_obj.Height = compdata.h
    im_obj.write(compdata.read())

    log.debug(f"PNG to G4 {im_obj.objgen}")

    # Drop parameters left over from the previous encoding.
    if Name.Predictor in im_obj:
        del im_obj.Predictor
    if Name.DecodeParms in im_obj:
        del im_obj.DecodeParms

    g4_parms = Dictionary(
        K=-1, BlackIs1=bool(compdata.minisblack), Columns=compdata.w
    )
    im_obj.DecodeParms = g4_parms
    im_obj.Filter = Name.CCITTFaxDecode
def convert_to_jbig2(
    pike: Pdf,
    jbig2_groups: Dict[int, List[XrefExt]],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Encode images as JBIG2 and write them into the PDF.

    With a JBIG2 page group size above 1, several images are encoded together
    and share one symbol dictionary spanning several pages; every image in
    the group must reference that dictionary. Sharing a dictionary between
    too many pages makes encoding more expensive and less efficient; the
    default of 10 was found by testing. This mode is currently lossy because
    jbig2enc does not support refinement coding.

    Without the symbolic coder, each JBIG2 image stands on its own and needs
    no dictionary. This mode is currently lossless.

    Raises:
        FileNotFoundError: if a group has no symbol file although the page
            group size is not 1.
    """
    globals_dict: Optional[Dictionary]

    _produce_jbig2_images(jbig2_groups, root, options, executor)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        symfile = root / (prefix + '.sym')

        if not symfile.exists():
            # No symbol file is only acceptable for stand-alone images.
            if options.jbig2_page_group_size != 1:
                raise FileNotFoundError(symfile)
            globals_dict = None
        else:
            globals_stream = Stream(pike, symfile.read_bytes())
            globals_dict = Dictionary(JBIG2Globals=globals_stream)

        for n, (xref, _) in enumerate(xref_exts):
            image_path = root / (prefix + f'.{n:04d}')
            image_data = image_path.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                image_data, filter=Name.JBIG2Decode, decode_parms=globals_dict
            )
def test_dict_or_array_dict():
    """DecodeParms given as an array of one dictionary is exposed as a list."""
    pdf = pikepdf.new()
    ccitt_parms = Dictionary(
        BlackIs1=False,
        Columns=16,
        K=-1,
    )
    image_stream = Stream(
        pdf,
        b'dummy',
        BitsPerComponent=1,
        ColorSpace=Name.DeviceGray,
        DecodeParms=Array([ccitt_parms]),
        Filter=Array([Name.CCITTFaxDecode]),
        Height=16,
        Width=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    pim = pikepdf.PdfImage(image_stream)

    # Check that array of dict is unpacked properly
    first_parms = pim.decode_parms[0]
    assert first_parms.K == -1
def test_fourpages_to_4up(fourpages, graph, outpdf):
    """Lay four pages out on one 1000x1000 page using overlays and underlays."""
    top_left = Rectangle(0, 500, 500, 1000)
    top_right = Rectangle(500, 500, 1000, 1000)
    bottom_left = Rectangle(0, 0, 500, 500)
    bottom_right = Rectangle(500, 0, 1000, 500)

    pdf = Pdf.new()
    pdf.add_blank_page(page_size=(1000, 1000))
    sheet = Page(pdf.pages[0])
    pdf.pages.extend(fourpages.pages)

    # Sources are passed as a raw page, a Page and a form XObject in turn.
    sheet.add_overlay(pdf.pages[1], top_left)
    sheet.add_overlay(Page(pdf.pages[2]), top_right)
    sheet.add_overlay(Page(pdf.pages[3]).as_form_xobject(), bottom_left)
    sheet.add_underlay(pdf.pages[4], bottom_right)

    # A page from another Pdf, with no target rectangle.
    sheet.add_underlay(graph.pages[0])

    # A plain dictionary is not an acceptable source.
    with pytest.raises(TypeError):
        sheet.add_overlay(Dictionary(Key=123))

    del pdf.pages[1:]
    pdf.save(outpdf)
def test_dest_or_action(outlines_doc):
    """Switching an outline item between a destination and an action.

    Setting a destination on an item that had an action must leave only
    /Dest in the saved object; setting an action and clearing the
    destination must leave only /A.
    """
    first_obj = outlines_doc.Root.Outlines.First
    first_page = outlines_doc.pages[0]
    # Starting state: the first item carries an action and no destination.
    assert '/A' in first_obj
    assert '/Dest' not in first_obj
    with outlines_doc.open_outline() as outline:
        first = outline.root[0]
        # Set to first page.
        first.destination = 0
    # Reference should be replaced at this point.
    assert first.destination == [first_page, Name.Fit]
    assert first_obj.Dest == first.destination
    # Original action should be gone
    assert '/A' not in first_obj
    # Now save with a new action instead
    with outlines_doc.open_outline() as outline:
        first = outline.root[0]
        first.action = Dictionary(D=first.destination, S=Name.GoTo)
        first.destination = None
    # The action now holds the destination, and /Dest has been removed.
    assert first_obj.A.D == [first_page, Name.Fit]
    assert '/Dest' not in first_obj
def transcode_pngs(pike, images, image_name_fn, root, log, options):
    """Quantize PNG images and write the results back into the PDF.

    When ``options.optimize >= 2`` every image is first passed to
    ``pngquant.quantize`` on a thread pool. Each resulting PNG is then
    opened with Leptonica and, unless it is larger than the stream it would
    replace, written into the matching image object as Flate data.

    Images are left untouched when the PNG cannot be opened, when it is
    larger than the original, or when its samples-per-pixel value has no
    matching PDF colour space.

    Arguments:
        pike: PDF whose image objects are rewritten in place.
        images: xrefs of the images to process; iterated more than once.
        image_name_fn: callable ``(root, xref)`` returning the input image
            path.
        root: working directory handed to the naming helpers.
        log: logger used for debug and error messages.
        options: provides ``optimize``, ``png_quality`` and ``jobs``.
    """
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs
        ) as executor:
            for xref in images:
                log.debug(image_name_fn(root, xref))
                executor.submit(
                    pngquant.quantize,
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )

    for xref in images:
        im_obj = pike.get_object(xref, 0)
        try:
            compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If the re-coded image is larger, don't use it. The check happens
        # here because pngquant knows the size of the temporary output file
        # but not of the actual object in the PDF.
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}"
            )
            continue

        # Resolve the colour space BEFORE modifying im_obj, so that an
        # unsupported image can be skipped while still fully intact.
        if compdata.ncolors > 0:
            # .ncolors is the number of colors in the palette, not the number
            # of colors used in a true color image
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
        elif compdata.spp == 1:
            # PDF interprets binary-1 as black in 1bpp, but PNG sets black to
            # 0 for 1bpp. A palette such as
            # [Name.Indexed, Name.DeviceGray, 1, b"\xff\x00"] would express
            # that mapping more cleanly, but pikepdf needs to be patched to
            # support it; the Decode array below is used instead.
            cs = Name.DeviceGray
        elif compdata.spp == 3:
            cs = Name.DeviceRGB
        elif compdata.spp == 4:
            cs = Name.DeviceCMYK
        else:
            # Previously this fell through with ``cs`` unbound and crashed
            # after im_obj had been partly rewritten.
            log.error(
                f"pngquant: unsupported samples per pixel {compdata.spp} "
                f"for image {xref}; keeping original image"
            )
            continue

        # When a PNG is inserted into a PDF, the IDAT section of the PNG is
        # more or less copied as-is and the rest of the PNG headers become
        # PDF image metadata. The PDF reader must be told whether a predictor
        # was used on the image before Flate encoding. (Typically one is.)
        # According to Leptonica source, PDF readers don't actually need the
        # correct predictor, they just need a value of either:
        #   1 - no predictor
        #   10-14 - there is a predictor
        # Leptonica's compdata->predictor only tells TRUE or FALSE.
        # From there the PNG decoder can infer the rest from the file.
        # In practice the predictor should be Paeth, 14, so that is used.
        # See:
        # - PDF RM 7.4.4.4 Table 10
        # - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
        predictor = 14 if compdata.predictor > 0 else 1
        dparms = Dictionary(Predictor=predictor)
        if predictor > 1:
            dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
            dparms.Colors = compdata.spp
            dparms.Columns = compdata.w

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h

        # NOTE(review): reconstructed as applying to every 1 bpp image,
        # palette or not - confirm against the original indentation.
        if compdata.bps == 1:
            im_obj.Decode = [1, 0]  # Bit of a kludge but this inverts photometric too

        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)