class PdfFileMerger(object):
    """
    Merge pages from several PDF documents into a single output PDF.

    Source documents can be concatenated, restricted to a page range, or
    inserted at an arbitrary position, in any combination.  Queue pages with
    "merge" (or "append"), produce the result with "write", and release the
    input streams with "close".
    """

    def __init__(self, strict=True):
        """
        Create an empty merger.

        strict -- forwarded as the "strict" argument of every PdfFileReader
                  that "merge" creates.
        """
        # (stream, PdfFileReader, opened_by_this_object) tuples; see close().
        self.inputs = []
        # _MergedPage objects, in the order they will be written.
        self.pages = []
        self.output = PdfFileWriter()
        # Nested structure: a list entry holds the children of the bookmark
        # that precedes it.
        self.bookmarks = []
        self.named_dests = []
        # Next id handed out to a _MergedPage.
        self.id_count = 0
        self.strict = strict

    def merge(self, position, fileobj, bookmark=None, pages=None,
              import_bookmarks=True):
        """
        Insert pages of a source document into the output at "position".

        position         -- index in the output page list at which the
                            pages are inserted.
        fileobj          -- a path (str/unicode), a file object, a
                            PdfFileReader, or any other stream accepted by
                            PdfFileReader.  File objects and readers are
                            copied into an in-memory StringIO; a reader's
                            stream position is restored afterwards.  Any
                            other object is handed to PdfFileReader as is.
        bookmark         -- optional text of a bookmark that points at the
                            first merged page.
        pages            -- optional tuple of arguments for range() that
                            selects the source pages, e.g. (start, end).
                            Defaults to every page of the source.
        import_bookmarks -- when False, the outline of the source document
                            is not imported.

        Raises TypeError if "pages" is a number or a string.
        """
        # Validate before acquiring any resource so that a bad argument
        # cannot leave an opened file behind.
        if isinstance(pages, (int, float, str, unicode)):
            raise TypeError('"pages" must be a tuple of (start, end)')

        # True when the stream handed to PdfFileReader was created here and
        # therefore has to be closed by close().
        my_file = False

        if isinstance(fileobj, (str, unicode)):
            fileobj = open(fileobj, 'rb')
            my_file = True
        elif isinstance(fileobj, file):
            fileobj.seek(0)
            filecontent = fileobj.read()
            fileobj = StringIO(filecontent)
            my_file = True
        elif isinstance(fileobj, PdfFileReader):
            orig_tell = fileobj.stream.tell()
            fileobj.stream.seek(0)
            filecontent = StringIO(fileobj.stream.read())
            # Put the caller's stream back where it was.
            fileobj.stream.seek(orig_tell)
            fileobj = filecontent
            my_file = True

        # Build a reader on the stream (file or StringIO) prepared above.
        try:
            pdfr = PdfFileReader(fileobj, strict=self.strict)
        except Exception:
            if my_file:
                fileobj.close()
            raise

        # Register the input right away so that close() releases the stream
        # even if one of the steps below raises.
        self.inputs.append((fileobj, pdfr, my_file))

        if pages is None:
            pages = (0, pdfr.getNumPages())

        srcpages = []
        if bookmark:
            # self.id_count is the id the first merged page will receive.
            bookmark = Bookmark(TextStringObject(bookmark),
                                NumberObject(self.id_count),
                                NameObject('/Fit'))

        outline = []
        if import_bookmarks:
            outline = pdfr.getOutlines()
            outline = self._trim_outline(pdfr, outline, pages)

        if bookmark:
            # The imported outline becomes the children of the new bookmark.
            self.bookmarks += [bookmark, outline]
        else:
            self.bookmarks += outline

        dests = pdfr.namedDestinations
        dests = self._trim_dests(pdfr, dests, pages)
        self.named_dests += dests

        # Gather all the pages that are going to be merged.
        for i in range(*pages):
            pg = pdfr.getPage(i)
            page_id = self.id_count
            self.id_count += 1
            srcpages.append(_MergedPage(pg, pdfr, page_id))

        self._associate_dests_to_pages(srcpages)
        self._associate_bookmarks_to_pages(srcpages)

        # Slice assignment inserts the pages at the requested position.
        self.pages[position:position] = srcpages

    def append(self, fileobj, bookmark=None, pages=None,
               import_bookmarks=True):
        """
        Same as "merge", but always adds the pages after the last page
        currently queued instead of taking a position.
        """
        self.merge(len(self.pages), fileobj, bookmark, pages,
                   import_bookmarks)

    def write(self, fileobj):
        """
        Write everything merged so far to "fileobj".

        fileobj -- a path (str/unicode), which is opened in 'wb' mode and
                   closed again by this method, or a file-like object, which
                   is left open.
        """
        my_file = False
        if isinstance(fileobj, (str, unicode)):
            fileobj = open(fileobj, 'wb')
            my_file = True

        try:
            # Add the pages to the PdfFileWriter and remember a reference to
            # each written page.  The reference is obtained through
            # getReference() on the last "/Kids" entry, which keeps
            # PdfFileMerger working with PyPdf 1.13.
            for page in self.pages:
                self.output.addPage(page.pagedata)
                page.out_pagedata = self.output.getReference(
                    self.output._pages.getObject()["/Kids"][-1].getObject())

            # Destinations and bookmarks need the page references set above.
            self._write_dests()
            self._write_bookmarks()

            self.output.write(fileobj)
        finally:
            # Close a file opened here even when writing fails.
            if my_file:
                fileobj.close()

    def close(self):
        """
        Close every input stream created by "merge" and reset the merger.

        Streams supplied by the caller are not closed.  The page list and
        the input list are emptied and the writer is dropped.
        """
        self.pages = []
        for fo, pdfr, mine in self.inputs:
            if mine:
                fo.close()
        self.inputs = []
        self.output = None

    def _trim_dests(self, pdf, dests, pages):
        """
        Return the named destinations that point into the selected pages.

        pdf   -- reader the destinations come from.
        dests -- mapping of name to destination.
        pages -- tuple of range() arguments selecting the pages.

        The "/Page" entry of every kept destination is replaced by the
        object it resolves to.
        """
        new_dests = []
        for k, o in dests.items():
            for j in range(*pages):
                if pdf.getPage(j).getObject() == o['/Page'].getObject():
                    o[NameObject('/Page')] = o['/Page'].getObject()
                    # Internal sanity check: the mapping key is expected to
                    # match the destination's title.
                    assert str(k) == str(o['/Title'])
                    new_dests.append(o)
                    break
        return new_dests

    def _trim_outline(self, pdf, outline, pages):
        """
        Return the outline entries that point into the selected pages.

        "outline" is a nested list in which a list entry holds the children
        of the entry before it.  A parent whose own page is outside the
        selection is still kept when at least one of its children is kept.
        The "/Page" entry of every kept bookmark whose page is selected is
        replaced by the object it resolves to.
        """
        new_outline = []
        prev_header_added = True
        for i, o in enumerate(outline):
            if isinstance(o, list):
                sub = self._trim_outline(pdf, o, pages)
                if sub:
                    if not prev_header_added:
                        # Keep the parent so the children stay attached.
                        new_outline.append(outline[i - 1])
                    new_outline.append(sub)
            else:
                prev_header_added = False
                for j in range(*pages):
                    if pdf.getPage(j).getObject() == o['/Page'].getObject():
                        o[NameObject('/Page')] = o['/Page'].getObject()
                        new_outline.append(o)
                        prev_header_added = True
                        break
        return new_outline

    def _write_dests(self):
        """
        Add every named destination whose page is part of the output to the
        writer, after pointing its "/Page" entry at the written page.
        """
        for v in self.named_dests:
            resolved = False
            if '/Page' in v:
                for p in self.pages:
                    if p.id == v['/Page']:
                        v[NameObject('/Page')] = p.out_pagedata
                        resolved = True
                        # Page ids are unique, so the search can stop here.
                        break
            if resolved:
                self.output.addNamedDestinationObject(v)

    def _write_bookmarks(self, bookmarks=None, parent=None):
        """
        Add the bookmarks whose page is part of the output to the writer.

        bookmarks -- nested bookmark list to write; defaults to
                     self.bookmarks.
        parent    -- value passed to addBookmarkDestination as the parent of
                     the bookmarks of this level.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        last_added = None
        for b in bookmarks:
            if isinstance(b, list):
                # A list holds the children of the bookmark written last.
                self._write_bookmarks(b, last_added)
                continue

            resolved = False
            if '/Page' in b:
                for p in self.pages:
                    if p.id == b['/Page']:
                        b[NameObject('/Page')] = p.out_pagedata
                        resolved = True
                        # Page ids are unique, so the search can stop here.
                        break
            if resolved:
                last_added = self.output.addBookmarkDestination(b, parent)

    def _associate_dests_to_pages(self, pages):
        """
        Replace the "/Page" entry of every named destination by the id of
        the matching page in "pages".

        Destinations whose "/Page" already is a NumberObject are skipped.
        Raises ValueError if a destination matches none of the pages.
        """
        for nd in self.named_dests:
            pageno = None
            np = nd['/Page']

            if isinstance(np, NumberObject):
                continue

            for p in pages:
                if np.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is not None:
                nd[NameObject('/Page')] = NumberObject(pageno)
            else:
                raise ValueError("Unresolved named destination '%s'"
                                 % nd['/Title'])

    def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
        """
        Replace the "/Page" entry of every bookmark by the id of the
        matching page in "pages", recursing into nested lists.

        bookmarks -- nested bookmark list to process; defaults to
                     self.bookmarks.

        Bookmarks whose "/Page" already is a NumberObject are skipped.
        Raises ValueError if a bookmark matches none of the pages.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        for b in bookmarks:
            if isinstance(b, list):
                self._associate_bookmarks_to_pages(pages, b)
                continue

            pageno = None
            bp = b['/Page']

            if isinstance(bp, NumberObject):
                continue

            for p in pages:
                if bp.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is not None:
                b[NameObject('/Page')] = NumberObject(pageno)
            else:
                raise ValueError("Unresolved bookmark '%s'" % b['/Title'])

    def findBookmark(self, bookmark, root=None):
        """
        Locate a bookmark in the nested bookmark list.

        bookmark -- a bookmark object or a bookmark title.
        root     -- nested list to search; defaults to self.bookmarks.

        Returns the list of indices leading to the first match, or None
        when there is no match.
        """
        if root is None:
            root = self.bookmarks

        for i, b in enumerate(root):
            if isinstance(b, list):
                res = self.findBookmark(bookmark, b)
                if res is not None:
                    return [i] + res
            # A nested list has no '/Title', so it must never reach this
            # test; only real bookmark entries are compared.
            elif b == bookmark or b['/Title'] == bookmark:
                return [i]

        return None

    def addBookmark(self, title, pagenum, parent=None):
        """
        Add a bookmark with the given title that points at page "pagenum".

        parent -- None for a top-level bookmark; otherwise the bookmark to
                  nest under, given as a list of indices (as returned by
                  findBookmark), a bookmark object, or a title.

        Raises ValueError if a parent given by object or title is not found.
        """
        if parent is None:
            iloc = None
        elif isinstance(parent, list):
            iloc = parent
        else:
            iloc = self.findBookmark(parent)
            if iloc is None:
                raise ValueError("Parent bookmark %r not found" % (parent,))

        # 826 is the hard-coded argument that follows '/FitH' (presumably the
        # top coordinate of the view -- TODO confirm).
        dest = Bookmark(TextStringObject(title), NumberObject(pagenum),
                        NameObject('/FitH'), NumberObject(826))

        if iloc is None:
            self.bookmarks.append(dest)
            return

        # Walk down to the list that contains the parent.
        bmparent = self.bookmarks
        for i in iloc[:-1]:
            bmparent = bmparent[i]

        # The children of a bookmark live in the list right after it; reuse
        # that list when it exists, otherwise create it.
        npos = iloc[-1] + 1
        if npos < len(bmparent) and isinstance(bmparent[npos], list):
            bmparent[npos].append(dest)
        else:
            bmparent.insert(npos, [dest])

    def addNamedDestination(self, title, pagenum):
        """
        Add a named destination with the given title that points at page
        "pagenum".
        """
        # 826 is the hard-coded argument that follows '/FitH' (presumably the
        # top coordinate of the view -- TODO confirm).
        dest = Destination(TextStringObject(title), NumberObject(pagenum),
                           NameObject('/FitH'), NumberObject(826))
        self.named_dests.append(dest)