def _addPage(self, page, action):
    """Attach ``page`` to the writer's root page tree node.

    :param page: page dictionary; its ``/Type`` entry must equal ``/Page``.
    :param action: callable invoked as ``action(kids, pageRef)`` to place the
        new reference inside the ``/Kids`` array (the caller decides where).
    """
    assert page["/Type"] == "/Page"
    # Point the page at the root node before registering it as an object.
    page[NameObject("/Parent")] = self._pages
    pageRef = self._addObject(page)
    tree = self.getObject(self._pages)
    action(tree["/Kids"], pageRef)
    # Keep the node's page counter in step with the new kid.
    newCount = tree["/Count"] + 1
    tree[NameObject("/Count")] = NumberObject(newCount)
def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """Walk the page tree and collect every leaf page in document order.

    Called without arguments, it resets ``self.flattenedPages`` and starts
    from the ``/Pages`` entry of the document catalog. It then recurses
    through ``/Kids``.

    :param pages: tree node (``/Pages``) or leaf (``/Page``) to process;
        ``None`` means "start at the catalog's page tree root".
    :param inherit: inheritable attributes collected from the ancestors of
        ``pages``.
    :param indirectRef: the indirect reference through which ``pages`` was
        reached, if any; handed to the ``PageObject`` constructor.
    """
    inheritablePageAttributes = (
        NameObject("/Resources"), NameObject("/MediaBox"),
        NameObject("/CropBox"), NameObject("/Rotate"),
    )
    if inherit is None:
        inherit = dict()
    if pages is None:
        self.flattenedPages = []
        catalog = self.trailer["/Root"].getObject()
        pages = catalog["/Pages"].getObject()

    t = pages["/Type"]
    if t == "/Pages":
        # Work on a private copy: attributes set by this node must reach
        # only its own descendants. Mutating the caller's dict would leak
        # them into sibling subtrees visited later.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages["/Kids"]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        pageObj = PageObject(self, indirectRef)
        pageObj.update(pages)
        self.flattenedPages.append(pageObj)
def __init__(self):
    """Set up an empty document: header, page tree root, info and catalog."""
    self._header = b_("%PDF-1.3")
    # Indirect objects, stored in allocation order.
    self._objects = []

    # Root node of the page tree; starts with zero pages and no kids.
    pageTreeEntries = {
        NameObject("/Type"): NameObject("/Pages"),
        NameObject("/Count"): NumberObject(0),
        NameObject("/Kids"): ArrayObject()
    }
    pageTree = DictionaryObject()
    pageTree.update(pageTreeEntries)
    self._pages = self._addObject(pageTree)

    # Document information dictionary (producer string only).
    infoEntries = {
        NameObject("/Producer"): createStringObject(
            u"Python PDF Library - http://pybrary.net/pyPdf/")
    }
    infoDict = DictionaryObject()
    infoDict.update(infoEntries)
    self._info = self._addObject(infoDict)

    # Catalog: ties the document to its page tree.
    catalogEntries = {
        NameObject("/Type"): NameObject("/Catalog"),
        NameObject("/Pages"): self._pages
    }
    catalog = DictionaryObject()
    catalog.update(catalogEntries)
    self._root = self._addObject(catalog)
def addBookmark(self, title, pagenum, parent=None):
    """
    Add a bookmark to the pdf, using the specified title and pointing at
    the specified page number.

    :param title: bookmark text.
    :param pagenum: page number the bookmark points at.
    :param parent: optional parent bookmark. Either a list of indices
        locating the parent inside ``self.bookmarks``, or any value that
        ``self.findBookmark`` can resolve into such a list. When omitted the
        bookmark is appended at the top level.
    """
    # Resolve the index path of the parent first (same order as lookups
    # were always performed, so failures surface before the bookmark exists).
    if parent is None:
        path = [len(self.bookmarks) - 1]
    elif type(parent) == list:
        path = parent
    else:
        path = self.findBookmark(parent)

    dest = Bookmark(TextStringObject(title), NumberObject(pagenum),
                    NameObject('/FitH'), NumberObject(826))

    if parent is None:
        self.bookmarks.append(dest)
        return

    # Descend to the list that holds the parent entry.
    siblings = self.bookmarks
    for index in path[:-1]:
        siblings = siblings[index]

    # Children live in a list stored right after their parent.
    slot = path[-1] + 1
    if slot < len(siblings) and type(siblings[slot]) == list:
        siblings[slot].append(dest)
    else:
        siblings.insert(slot, [dest])
def addTransformation(self, ctm):
    """Apply the transformation matrix ``ctm`` to this page's content.

    Does nothing when the page has no content. Otherwise the content is
    passed through ``PageObject._addTransformationMatrix`` and then
    ``PageObject._pushPopGS``, and stored back under ``/Contents``.

    :param ctm: transformation matrix forwarded to
        ``PageObject._addTransformationMatrix``.
    """
    content = self.getContents()
    if content is None:
        return
    transformed = PageObject._addTransformationMatrix(
        content, self.pdf, ctm)
    wrapped = PageObject._pushPopGS(transformed, self.pdf)
    self[NameObject('/Contents')] = wrapped
def addNamedDestination(self, title, pagenum):
    """Create a named ``/GoTo`` destination for page ``pagenum``.

    The destination is registered as a new object and the pair
    ``title, reference`` is appended to the array returned by
    ``getNamedDestRoot``.

    :param title: name under which the destination is stored.
    :param pagenum: index into the root page tree's ``/Kids`` array.
    :return: the reference returned by ``_addObject`` for the destination.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]

    goTo = DictionaryObject()
    goTo.update({
        NameObject('/D'): ArrayObject(
            [pageRef, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S'): NameObject('/GoTo')
    })
    goToRef = self._addObject(goTo)

    namedDests = self.getNamedDestRoot()
    namedDests.extend([title, goToRef])
    return goToRef
def addNamedDestination(self, title, pagenum):
    """
    Add a destination to the pdf, using the specified title and pointing
    at the specified page number.

    :param title: destination title (wrapped in ``TextStringObject``).
    :param pagenum: page number (wrapped in ``NumberObject``).
    """
    # '/FitH' destination with the hard-coded extra argument 826.
    newDest = Destination(TextStringObject(title), NumberObject(pagenum),
                          NameObject('/FitH'), NumberObject(826))
    self.named_dests.append(newDest)
def addBookmarkDict(self, bookmark, parent=None):
    """Add an outline entry built from an existing bookmark dictionary.

    :param bookmark: mapping describing the bookmark; an optional ``/A``
        entry holds the action dictionary, which is copied into its own
        indirect object.
    :param parent: outline node to attach to; defaults to the outline root
        returned by ``getOutlineRoot``.
    :return: the reference returned by ``_addObject`` for the new bookmark.
    """
    bookmarkObj = TreeObject()
    for k, v in bookmark.items():
        bookmarkObj[NameObject(str(k))] = v
    # NOTE(review): this looks redundant after the loop above; kept because
    # TreeObject.update's semantics are not visible from this block.
    bookmarkObj.update(bookmark)

    if '/A' in bookmark:
        action = DictionaryObject()
        for k, v in bookmark['/A'].items():
            action[NameObject(str(k))] = v
        actionRef = self._addObject(action)
        # Use a NameObject key like every other dictionary write here,
        # instead of a bare str.
        bookmarkObj[NameObject('/A')] = actionRef

    bookmarkRef = self._addObject(bookmarkObj)

    outlineRef = self.getOutlineRoot()
    if parent is None:
        parent = outlineRef

    parent = parent.getObject()
    parent.addChild(bookmarkRef, self)
    return bookmarkRef
def createBlankPage(pdf=None, width=None, height=None):
    """Return a new blank page.

    :param pdf: document the page is associated with; also the fallback
        source for the page size.
    :param width: page width; if ``None`` it is taken from the last page
        of ``pdf``.
    :param height: page height; if ``None`` it is taken from the last page
        of ``pdf``.
    :raises utils.PageSizeNotDefinedError: if a dimension is missing and
        ``pdf`` is ``None`` or has no pages.
    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference 7.7.3.3)
    page[NameObject('/Type')] = NameObject('/Page')
    page[NameObject('/Parent')] = NullObject()
    page[NameObject('/Resources')] = DictionaryObject()

    sizeMissing = width is None or height is None
    if sizeMissing:
        if pdf is None or not pdf.getNumPages() > 0:
            raise utils.PageSizeNotDefinedError()
        # Borrow both dimensions from the document's last page.
        reference = pdf.getPage(pdf.getNumPages() - 1)
        width = reference.mediaBox.getWidth()
        height = reference.mediaBox.getHeight()

    page[NameObject('/MediaBox')] = RectangleObject([0, 0, width, height])
    return page
def _write_dests(self):
    """Emit every named destination whose page is part of the output.

    For each entry of ``self.named_dests`` carrying a ``/Page`` value, the
    value is compared with the ``id`` of each entry in ``self.pages``; on a
    match it is replaced by that page's ``out_pagedata``. Entries that
    matched are passed to ``self.output.addNamedDestinationObject``.
    """
    for dest in self.named_dests:
        matched = None
        if '/Page' in dest:
            # No early exit: the scan always covers every page.
            for position, merged in enumerate(self.pages):
                if merged.id == dest['/Page']:
                    dest[NameObject('/Page')] = merged.out_pagedata
                    matched = position
        if matched is None:
            continue
        self.output.addNamedDestinationObject(dest)
def getNamedDestRoot(self):
    """Return the ``/Names`` array of the catalog's ``/Names`` -> ``/Dests``.

    Any missing level (``/Names`` dictionary, ``/Dests`` dictionary, or the
    inner ``/Names`` array) is created and linked in on the way.

    :return: the array that holds the named destinations.
    """
    catalog = self.getObject(self._root)

    def attachFreshDests(names):
        # Create a /Dests dictionary with an empty /Names array under names.
        dests = DictionaryObject()
        destsRef = self._addObject(dests)
        names[NameObject('/Dests')] = destsRef
        nd = ArrayObject()
        dests[NameObject('/Names')] = nd
        return nd

    hasNames = ('/Names' in catalog and
                isinstance(catalog['/Names'], DictionaryObject))
    if not hasNames:
        names = DictionaryObject()
        namesRef = self._addObject(names)
        catalog[NameObject('/Names')] = namesRef
        return attachFreshDests(names)

    names = catalog['/Names']
    # Sanity check: the dictionary must be one of this writer's objects.
    namesRef = IndirectObject(self._objects.index(names) + 1, 0, self)
    assert namesRef.getObject() == names

    hasDests = ('/Dests' in names and
                isinstance(names['/Dests'], DictionaryObject))
    if not hasDests:
        return attachFreshDests(names)

    dests = names['/Dests']
    destsRef = IndirectObject(self._objects.index(dests) + 1, 0, self)
    assert destsRef.getObject() == dests

    if '/Names' in dests:
        return dests['/Names']
    nd = ArrayObject()
    dests[NameObject('/Names')] = nd
    return nd
def addBookmark(self, title, pagenum, parent=None):
    """
    Add a bookmark to the pdf, using the specified title and pointing at
    the specified page number.

    :param title: bookmark text.
    :param pagenum: index into the root page tree's ``/Kids`` array.
    :param parent: outline node to nest the bookmark under; defaults to the
        outline root.
    :return: the reference returned by ``_addObject`` for the bookmark.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]

    # The /GoTo action the bookmark triggers.
    goTo = DictionaryObject()
    goTo.update({
        NameObject('/D'): ArrayObject(
            [pageRef, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S'): NameObject('/GoTo')
    })
    goToRef = self._addObject(goTo)

    # Always fetched, so the outline root exists even with an explicit parent.
    outlineRoot = self.getOutlineRoot()
    if parent is None:
        parent = outlineRoot

    entry = TreeObject()
    entry.update({
        NameObject('/A'): goToRef,
        NameObject('/Title'): createStringObject(title)
    })
    entryRef = self._addObject(entry)

    parentNode = parent.getObject()
    parentNode.addChild(entryRef, self)
    return entryRef
def getOutlineRoot(self):
    """Return the catalog's ``/Outlines`` object, creating it if missing.

    :return: the outline object itself (not a reference to it).
    """
    catalog = self.getObject(self._root)

    if '/Outlines' not in catalog:
        outline = TreeObject()
        outline.update({})
        catalog[NameObject('/Outlines')] = self._addObject(outline)
        return outline

    outline = catalog['/Outlines']
    # Sanity check: the outline must be one of this writer's objects.
    ref = IndirectObject(self._objects.index(outline) + 1, 0, self)
    assert ref.getObject() == outline
    return outline
def _mergeResources(res1, res2, resource):
    """Merge one resource category of two resource dictionaries.

    :param res1: resource dictionary whose entries take precedence.
    :param res2: resource dictionary being merged in.
    :param resource: category key, e.g. ``"/Font"``.
    :return: ``(merged, renames)``. ``merged`` is the combined dictionary;
        ``renames`` maps each ``res2`` key that clashed with a different
        value in ``res1`` to the new name it was stored under.
    """
    merged = DictionaryObject()
    merged.update(res1.get(resource, DictionaryObject()).getObject())
    incoming = res2.get(resource, DictionaryObject()).getObject()
    renames = {}
    for name in incoming.keys():
        if name not in merged:
            # New name: copy the raw (unresolved) entry across.
            merged[name] = incoming.raw_get(name)
        elif merged[name] != incoming[name]:
            # Same name, different value: keep both under distinct names.
            replacement = NameObject(name + "renamed")
            renames[name] = replacement
            merged[replacement] = incoming[name]
    return merged, renames
def _trim_dests(self, pdf, dests, pages):
    """
    Removes any named destinations that are not a part of the specified
    page set.

    :param pdf: reader providing ``getPage``.
    :param dests: mapping of destination name to destination dictionary.
    :param pages: arguments for ``range`` selecting the page indices to keep.
    :return: list of destinations whose ``/Page`` is one of the selected
        pages; each kept entry has ``/Page`` replaced by the resolved object.
    """
    kept = []
    for name, dest in dests.items():
        for pageIndex in range(*pages):
            samePage = (pdf.getPage(pageIndex).getObject() ==
                        dest['/Page'].getObject())
            if samePage:
                dest[NameObject('/Page')] = dest['/Page'].getObject()
                assert str(name) == str(dest['/Title'])
                kept.append(dest)
                # The first matching page is enough.
                break
    return kept
def _associate_dests_to_pages(self, pages):
    """Replace each named destination's ``/Page`` with a merged-page id.

    Destinations whose ``/Page`` is already a ``NumberObject`` are skipped.

    :param pages: candidate pages exposing ``pagedata`` and ``id``.
    :raises ValueError: if a destination matches none of ``pages``.
    """
    for nd in self.named_dests:
        target = nd['/Page']
        if type(target) == NumberObject:
            continue

        pageno = None
        # No early exit: when several pages match, the last one wins.
        for candidate in pages:
            if target.getObject() == candidate.pagedata.getObject():
                pageno = candidate.id

        if pageno is None:
            raise ValueError(
                "Unresolved named destination '%s'" % nd['/Title'])
        nd[NameObject('/Page')] = NumberObject(pageno)
def _write_bookmarks(self, bookmarks=None, parent=None):
    """Write bookmarks (and nested bookmark lists) to the output.

    :param bookmarks: list to process; defaults to ``self.bookmarks``. A
        nested list holds the children of the bookmark most recently
        written at this level.
    :param parent: value forwarded to ``self.output.addBookmarkDestination``
        as the parent of every bookmark written at this level.
    """
    entries = self.bookmarks if bookmarks is None else bookmarks
    previous = None

    for entry in entries:
        if type(entry) == list:
            # Children hang off the last bookmark actually written.
            self._write_bookmarks(entry, previous)
            continue

        pageno = None
        if '/Page' in entry:
            for position, merged in enumerate(self.pages):
                if merged.id == entry['/Page']:
                    entry[NameObject('/Page')] = merged.out_pagedata
                    pageno = position
        if pageno is not None:
            previous = self.output.addBookmarkDestination(entry, parent)
def add(self, title, pagenum):
    """Add a bookmark titled ``title`` that jumps to page ``pagenum``.

    :param title: bookmark text.
    :param pagenum: index into the ``/Kids`` array of ``self.pdf``'s root
        page tree node.
    """
    pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]

    # The /GoTo action the bookmark triggers.
    goTo = DictionaryObject()
    goTo.update({
        NameObject('/D'): ArrayObject(
            [pageRef, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S'): NameObject('/GoTo')
    })
    goToRef = self.pdf._addObject(goTo)

    entry = TreeObject()
    entry.update({
        NameObject('/A'): goToRef,
        NameObject('/Title'): createStringObject(title)
    })
    self.pdf._addObject(entry)

    # The tree receives the bookmark object itself, not a reference.
    self.tree.addChild(entry)
def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
    """Replace each bookmark's ``/Page`` with a merged-page id.

    Nested lists are processed recursively; bookmarks whose ``/Page`` is
    already a ``NumberObject`` are skipped.

    :param pages: candidate pages exposing ``pagedata`` and ``id``.
    :param bookmarks: list to process; defaults to ``self.bookmarks``.
    :raises ValueError: if a bookmark matches none of ``pages``.
    """
    entries = self.bookmarks if bookmarks is None else bookmarks

    for entry in entries:
        if type(entry) == list:
            self._associate_bookmarks_to_pages(pages, entry)
            continue

        target = entry['/Page']
        if type(target) == NumberObject:
            continue

        pageno = None
        # No early exit: when several pages match, the last one wins.
        for candidate in pages:
            if target.getObject() == candidate.pagedata.getObject():
                pageno = candidate.id

        if pageno is None:
            raise ValueError("Unresolved bookmark '%s'" % entry['/Title'])
        entry[NameObject('/Page')] = NumberObject(pageno)
def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
    """Prepare this document to be written encrypted.

    Builds the encryption dictionary and the two-element file identifier,
    and stores them (plus the derived key) on the instance as ``_encrypt``,
    ``_ID`` and ``_encrypt_key``.

    :param user_pwd: the user password.
    :param owner_pwd: the owner password; defaults to ``user_pwd``.
    :param use_128bit: ``True`` selects a 128-bit key (``/V`` 2, ``/R`` 3);
        ``False`` selects a 40-bit key (``/V`` 1, ``/R`` 2).
    """
    import time
    import random
    if owner_pwd is None:
        owner_pwd = user_pwd
    # Floor division keeps keylen an int on Python 3 as well; true division
    # would yield 16.0 / 5.0 and leak a float into /Length.
    if use_128bit:
        V = 2
        rev = 3
        keylen = 128 // 8
    else:
        V = 1
        rev = 2
        keylen = 40 // 8
    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
    # md5() needs bytes on Python 3; repr() of a float is pure ASCII, so
    # the encode is a no-op on Python 2.
    ID_1 = md5(repr(time.time()).encode("ascii")).digest()
    ID_2 = md5(repr(random.random()).encode("ascii")).digest()
    self._ID = ArrayObject(
        (ByteStringObject(ID_1), ByteStringObject(ID_2)))
    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
    encrypt = DictionaryObject()
    encrypt[NameObject("/Filter")] = NameObject("/Standard")
    encrypt[NameObject("/V")] = NumberObject(V)
    if V == 2:
        # /Length is expressed in bits.
        encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt[NameObject("/R")] = NumberObject(rev)
    encrypt[NameObject("/O")] = ByteStringObject(O)
    encrypt[NameObject("/U")] = ByteStringObject(U)
    encrypt[NameObject("/P")] = NumberObject(P)
    self._encrypt = self._addObject(encrypt)
    self._encrypt_key = key
def _buildOutline(self, node):
    """Build the outline entry described by ``node``.

    :param node: outline item dictionary.
    :return: the outline entry, or ``None`` when ``node`` carries no usable
        destination (no ``/Title``, no ``/GoTo`` action or ``/Dest``, or an
        empty destination).
    :raises utils.PdfReadError: if the destination is neither an array nor
        the name of a known named destination.
    """
    title = None
    dest = None
    hasTitle = "/Title" in node

    if "/A" in node and hasTitle:
        # Action, section 8.5 (only type GoTo supported)
        title = node["/Title"]
        action = node["/A"]
        if action["/S"] == "/GoTo":
            dest = action["/D"]
    elif "/Dest" in node and hasTitle:
        # Destination, section 8.2.1
        title = node["/Title"]
        dest = node["/Dest"]

    # Nothing to build without a destination.
    if not dest:
        return None

    if isinstance(dest, ArrayObject):
        return self._buildDestination(title, dest)
    if isinstance(dest, unicode) and dest in self._namedDests:
        outline = self._namedDests[dest]
        outline[NameObject("/Title")] = title
        return outline
    raise utils.PdfReadError("Unexpected destination %r" % dest)
def _trim_outline(self, pdf, outline, pages):
    """
    Removes any outline/bookmark entries that are not a part of the
    specified page set.

    :param pdf: reader providing ``getPage``.
    :param outline: list of bookmark dictionaries; a nested list holds the
        children of the entry just before it.
    :param pages: arguments for ``range`` selecting the page indices to keep.
    :return: a new, trimmed outline list.
    """
    kept = []
    headerKept = True
    for position, entry in enumerate(outline):
        if type(entry) == list:
            children = self._trim_outline(pdf, entry, pages)
            if children:
                # Surviving children need their header, even when the
                # header's own page was trimmed away.
                if not headerKept:
                    kept.append(outline[position - 1])
                kept.append(children)
            continue

        headerKept = False
        for pageIndex in range(*pages):
            if pdf.getPage(pageIndex).getObject() == entry['/Page'].getObject():
                entry[NameObject('/Page')] = entry['/Page'].getObject()
                kept.append(entry)
                headerKept = True
                break
    return kept
def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
    """Merge the content and resources of ``page2`` into this page.

    :param page2: the page to merge in.
    :param page2transformation: optional callable applied to ``page2``'s
        content before it is appended.
    :param ctm: optional six-element transformation matrix, used only to
        compute the bounding box of ``page2`` when ``expand`` is true.
    :param expand: if true, grow this page's media box so that it also
        covers ``page2``'s (transformed) media box.
    """
    # First we work on merging the resource dictionaries.  This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    newResources = DictionaryObject()
    rename = {}
    originalResources = self["/Resources"].getObject()
    page2Resources = page2["/Resources"].getObject()

    for res in ["/ExtGState", "/Font", "/XObject", "/ColorSpace",
                "/Pattern", "/Shading", "/Properties"]:
        new, newrename = PageObject._mergeResources(
            originalResources, page2Resources, res)
        if new:
            newResources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets.
    newResources[NameObject("/ProcSet")] = ArrayObject(
        frozenset(
            originalResources.get("/ProcSet", ArrayObject()).getObject()
        ).union(
            frozenset(
                page2Resources.get("/ProcSet", ArrayObject()).getObject())
        )
    )

    newContentArray = ArrayObject()

    originalContent = self.getContents()
    if originalContent is not None:
        newContentArray.append(
            PageObject._pushPopGS(originalContent, self.pdf))

    page2Content = page2.getContents()
    if page2Content is not None:
        if page2transformation is not None:
            page2Content = page2transformation(page2Content)
        page2Content = PageObject._contentStreamRename(
            page2Content, rename, self.pdf)
        page2Content = PageObject._pushPopGS(page2Content, self.pdf)
        newContentArray.append(page2Content)

    # if expanding the page to fit a new page,
    # calculate the new media box size
    if expand:
        corners1 = [
            self.mediaBox.getLowerLeft_x().as_numeric(),
            self.mediaBox.getLowerLeft_y().as_numeric(),
            self.mediaBox.getUpperRight_x().as_numeric(),
            self.mediaBox.getUpperRight_y().as_numeric()
        ]
        corners2 = [
            page2.mediaBox.getLowerLeft_x().as_numeric(),
            page2.mediaBox.getLowerLeft_y().as_numeric(),
            page2.mediaBox.getUpperLeft_x().as_numeric(),
            page2.mediaBox.getUpperLeft_y().as_numeric(),
            page2.mediaBox.getUpperRight_x().as_numeric(),
            page2.mediaBox.getUpperRight_y().as_numeric(),
            page2.mediaBox.getLowerRight_x().as_numeric(),
            page2.mediaBox.getLowerRight_y().as_numeric()
        ]
        if ctm is not None:
            # Real lists, not map(): each sequence is consumed twice below
            # (min and max), which a Python 3 map iterator cannot survive.
            new_x = [
                ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]
                for i in range(0, 8, 2)
            ]
            new_y = [
                ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]
                for i in range(0, 8, 2)
            ]
        else:
            new_x = corners2[0:8:2]
            new_y = corners2[1:8:2]

        lowerleft = [min(new_x), min(new_y)]
        upperright = [max(new_x), max(new_y)]
        lowerleft = [
            min(corners1[0], lowerleft[0]),
            min(corners1[1], lowerleft[1])
        ]
        upperright = [
            max(corners1[2], upperright[0]),
            max(corners1[3], upperright[1])
        ]

        self.mediaBox.setLowerLeft(lowerleft)
        self.mediaBox.setUpperRight(upperright)

    self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
    self[NameObject('/Resources')] = newResources
def __init__(self, title, page, typ, *args):
    """Build a destination dictionary.

    :param title: stored under ``/Title``.
    :param page: stored under ``/Page``.
    :param typ: destination type name; determines how ``args`` is read.
    :param args: type-specific coordinates:
        ``/XYZ`` -> left, top, zoom;
        ``/FitR`` -> left, bottom, right, top;
        ``/FitH`` and ``/FitBH`` -> top;
        ``/FitV`` and ``/FitBV`` -> left;
        ``/Fit`` and ``/FitB`` -> none.
    :raises utils.PdfReadError: if ``typ`` is not a known destination type.
    :raises ValueError: if ``args`` has the wrong length for ``typ``
        (raised by the tuple unpacking).
    """
    DictionaryObject.__init__(self)
    self[NameObject("/Title")] = title
    self[NameObject("/Page")] = page
    self[NameObject("/Type")] = typ

    # from table 8.2 of the PDF 1.6 reference.
    # The "/FitBH", "/FitBV" and "/FitB" names were previously spelled
    # without the leading slash, so the real types were rejected. The
    # slash-less spellings stay accepted for backward compatibility.
    if typ == "/XYZ":
        (self[NameObject("/Left")], self[NameObject("/Top")],
            self[NameObject("/Zoom")]) = args
    elif typ == "/FitR":
        (self[NameObject("/Left")], self[NameObject("/Bottom")],
            self[NameObject("/Right")], self[NameObject("/Top")]) = args
    elif typ in ("/FitH", "/FitBH", "FitBH"):
        self[NameObject("/Top")], = args
    elif typ in ("/FitV", "/FitBV", "FitBV"):
        self[NameObject("/Left")], = args
    elif typ in ("/Fit", "/FitB", "FitB"):
        pass
    else:
        raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
def _rotate(self, angle):
    """Add ``angle`` to this page's ``/Rotate`` entry.

    A missing ``/Rotate`` entry counts as 0.

    :param angle: amount added to the current rotation.
    """
    previous = self.get("/Rotate", 0)
    total = previous + angle
    self[NameObject("/Rotate")] = NumberObject(total)
def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True): """ >>> merge(position, file, bookmark=None, pages=None, import_bookmarks=True) Merges the pages from the source document specified by "file" into the output file at the page number specified by "position". Optionally, you may specify a bookmark to be applied at the beginning of the included file by supplying the text of the bookmark in the "bookmark" parameter. You may prevent the source document's bookmarks from being imported by specifying "import_bookmarks" as False. You may also use the "pages" parameter to merge only the specified range of pages from the source document into the output document. """ # This parameter is passed to self.inputs.append and means # that the stream used was created in this method. my_file = False # If the fileobj parameter is a string, assume it is a path # and create a file object at that location. If it is a file, # copy the file's contents into a StringIO stream object; if # it is a PdfFileReader, copy that reader's stream into a # StringIO stream. 
# If fileobj is none of the above types, it is not modified if type(fileobj) in (str, unicode): fileobj = file(fileobj, 'rb') my_file = True elif type(fileobj) == file: fileobj.seek(0) filecontent = fileobj.read() fileobj = StringIO(filecontent) my_file = True elif type(fileobj) == PdfFileReader: orig_tell = fileobj.stream.tell() fileobj.stream.seek(0) filecontent = StringIO(fileobj.stream.read()) # reset the stream to its original location fileobj.stream.seek(orig_tell) fileobj = filecontent my_file = True # Create a new PdfFileReader instance using the stream # (either file or StringIO) created above pdfr = PdfFileReader(fileobj, strict=self.strict) # Find the range of pages to merge if pages is None: pages = (0, pdfr.getNumPages()) elif type(pages) in (int, float, str, unicode): raise TypeError('"pages" must be a tuple of (start, end)') srcpages = [] if bookmark: bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit')) outline = [] if import_bookmarks: outline = pdfr.getOutlines() outline = self._trim_outline(pdfr, outline, pages) if bookmark: self.bookmarks += [bookmark, outline] else: self.bookmarks += outline dests = pdfr.namedDestinations dests = self._trim_dests(pdfr, dests, pages) self.named_dests += dests # Gather all the pages that are going to be merged for i in range(*pages): pg = pdfr.getPage(i) id = self.id_count self.id_count += 1 mp = _MergedPage(pg, pdfr, id) srcpages.append(mp) self._associate_dests_to_pages(srcpages) self._associate_bookmarks_to_pages(srcpages) # Slice to insert the pages at the specified position self.pages[position:position] = srcpages # Keep track of our input files so we can close them later self.inputs.append((fileobj, pdfr, my_file))
def getNumPages(self):
    """Return the ``/Count`` of the root page tree node as an ``int``."""
    tree = self.getObject(self._pages)
    count = tree[NameObject("/Count")]
    return int(count)
def compressContentStreams(self):
    """Replace this page's ``/Contents`` with its ``flateEncode()`` result.

    Does nothing when the page has no content. Content that is not already
    a ``ContentStream`` is wrapped in one first.
    """
    content = self.getContents()
    if content is None:
        return
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    encoded = content.flateEncode()
    self[NameObject("/Contents")] = encoded
def setRectangle(self, name, value):
    """Store ``value`` under ``name`` in ``self``.

    :param name: key to store under; wrapped in ``NameObject`` unless it
        already is one.
    :param value: the value to store.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value
def write(self, stream):
    """Write the document to ``stream``.

    The output consists of the header, every object in ``self._objects``
    (numbered from 1, generation 0), a cross-reference table, the trailer
    and the ``startxref`` / ``%%EOF`` footer.

    :param stream: output object providing ``write()`` and ``tell()``;
        ``tell()`` supplies the byte offsets recorded in the xref table.
    """
    externalReferenceMap = {}

    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations).  Those will be
    # indirect references to objects that we've recreated in this PDF.  To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references.  This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    for objIndex in xrange(len(self._objects)):
        obj = self._objects[objIndex]
        if isinstance(obj, PageObject) and obj.indirectRef is not None:
            data = obj.indirectRef
            # Map layout: source pdf -> generation -> id number -> new ref.
            externalReferenceMap.setdefault(data.pdf, {})
            externalReferenceMap[data.pdf].setdefault(data.generation, {})
            externalReferenceMap[data.pdf][data.generation][data.idnum] = \
                IndirectObject(objIndex + 1, 0, self)

    # self.stack exists only for the duration of the sweep.
    self.stack = []
    self._sweepIndirectReferences(externalReferenceMap, self._root)
    del self.stack

    # Begin writing:
    object_positions = []
    stream.write(self._header + b_("\n"))
    for i in range(len(self._objects)):
        idnum = (i + 1)
        obj = self._objects[i]
        # Offset of this object, needed later for the xref table.
        object_positions.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        key = None
        # The encryption dictionary itself is written without a key.
        if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
            # Per-object key: document key + low 3 bytes of the object
            # number + low 2 bytes of the generation (0), MD5-hashed and
            # truncated to min(16, key length + 5) bytes.
            # NOTE(review): appears to follow the standard security
            # handler's per-object key derivation -- confirm against spec.
            pack1 = struct.pack("<i", i + 1)[:3]
            pack2 = struct.pack("<i", 0)[:2]
            key = self._encrypt_key + pack1 + pack2
            assert len(key) == (len(self._encrypt_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
        if obj is not None:
            obj.writeToStream(stream, key)
        stream.write(b_("\nendobj\n"))

    # xref table
    xref_location = stream.tell()
    stream.write(b_("xref\n"))
    stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
    # Entry 0 is the free-list head, hence the "+ 1" in the count above.
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in object_positions:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))

    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(len(self._objects) + 1),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info
    })
    # /ID and /Encrypt are only present once encrypt() has set them.
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if hasattr(self, "_encrypt"):
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)

    # eof
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))