def getobj(self, objid):
    """Return the object stored under ``objid``.

    A cached ``(object, generation)`` pair is reused when one exists.
    Otherwise each entry of ``self.xrefs`` is asked for the object's
    position, in order.  An entry that does not know the id (``KeyError``)
    or whose object fails to load (``PSEOF`` / ``PDFSyntaxError``) is
    skipped in favour of the next one.  Whenever ``self.decipher`` is set,
    the result is passed through ``decipher_all`` before being returned.

    Raises:
        AssertionError: if ``objid`` is 0.
        PDFException: if ``self.xrefs`` is empty.
        PDFObjectNotFound: if no xref entry yields the object.
    """
    assert objid != 0
    if not self.xrefs:
        raise PDFException('PDFDocument is not initialized')
    if 2 <= self.debug:
        print >>sys.stderr, 'getobj: objid=%r' % (objid)
    if objid not in self._cached_objs:
        for table in self.xrefs:
            try:
                location = table.get_pos(objid)
            except KeyError:
                # This table has no entry for the id; try the next one.
                continue
            (container_id, position, genno) = location
            try:
                if container_id is None:
                    # Stand-alone object: parse it at the given position.
                    obj = self._getobj_parse(position, objid)
                else:
                    # The object lives inside another stream object.
                    container = stream_value(self.getobj(container_id))
                    obj = self._getobj_objstm(container, position, objid)
                if isinstance(obj, PDFStream):
                    obj.set_objid(objid, genno)
            except (PSEOF, PDFSyntaxError):
                # Unusable entry; fall back to the next table.
                continue
            else:
                break
        else:
            # Every table was tried without success.
            raise PDFObjectNotFound(objid)
        if 2 <= self.debug:
            print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
        if self.caching:
            self._cached_objs[objid] = (obj, genno)
    else:
        (obj, genno) = self._cached_objs[objid]
    if self.decipher:
        obj = decipher_all(self.decipher, objid, genno, obj)
    return obj
def get_pages(self):
    """Yield a ``PDFPage`` for every page found under the catalog's 'Pages'.

    The tree is walked depth-first.  While descending, every key listed in
    ``self.INHERITABLE_ATTRS`` that a node does not define itself is copied
    from its parent.  Nodes whose 'Type' is ``LITERAL_PAGES`` and that have
    'Kids' are descended into; nodes whose 'Type' is ``LITERAL_PAGE`` are
    yielded.  Nothing is yielded when the catalog has no 'Pages' entry.

    Because this is a generator, the ``PDFException`` for an uninitialized
    document is raised on the first iteration, not at call time.

    Raises:
        PDFException: if ``self.xrefs`` is empty.
    """
    if not self.xrefs:
        raise PDFException('PDFDocument is not initialized')

    def search(obj, parent, ancestors=()):
        # ``obj`` is either a bare object id or something carrying ``.objid``.
        if isinstance(obj, int):
            objid = obj
        else:
            objid = obj.objid
        # A child that refers back to one of its own ancestors would make
        # the walk recurse without end; skip it.  Only the current path is
        # checked, so trees without such a loop are walked exactly as before.
        if objid is not None and objid in ancestors:
            return
        if isinstance(obj, int):
            tree = dict_value(self.getobj(objid)).copy()
        else:
            tree = dict_value(obj).copy()
        # Fill in inheritable attributes the node does not set itself.
        for (k, v) in parent.iteritems():
            if k in self.INHERITABLE_ATTRS and k not in tree:
                tree[k] = v
        node_type = tree.get('Type')
        if node_type is LITERAL_PAGES and 'Kids' in tree:
            if 1 <= self.debug:
                print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
            for c in list_value(tree['Kids']):
                for x in search(c, tree, ancestors + (objid,)):
                    yield x
        elif node_type is LITERAL_PAGE:
            if 1 <= self.debug:
                print >>sys.stderr, 'Page: %r' % tree
            yield (objid, tree)

    if 'Pages' not in self.catalog:
        return
    for (pageid, tree) in search(self.catalog['Pages'], self.catalog):
        yield PDFPage(self, pageid, tree)
def get_outlines(self):
    """Return a generator over the entries reachable from the catalog's 'Outlines'.

    Each yielded item is a tuple ``(level, title, dest, action, se)``.  An
    entry is reported only when it has a 'Title' and at least one of 'A' or
    'Dest'.  ``level`` starts at 0 for the 'Outlines' object itself and
    grows by one for each 'First' link followed.  Entries come out in
    document order: an entry, then its children, then its 'Next' sibling.

    Raises:
        PDFException: immediately, if the catalog has no 'Outlines' entry.
    """
    if 'Outlines' not in self.catalog:
        raise PDFException('No /Outlines defined!')

    def search(entry, level):
        # Siblings are walked in a loop rather than by recursion, so a long
        # flat list of entries does not exhaust the recursion limit.
        seen = set()
        while True:
            # Stop if a 'Next' chain leads back to an entry already visited
            # on this level; otherwise the loop would never end.
            objid = getattr(entry, 'objid', None)
            if objid is not None:
                if objid in seen:
                    return
                seen.add(objid)
            entry = dict_value(entry)
            if 'Title' in entry and ('A' in entry or 'Dest' in entry):
                title = decode_text(str_value(entry['Title']))
                dest = entry.get('Dest')
                action = entry.get('A')
                se = entry.get('SE')
                yield (level, title, dest, action, se)
            # Children are visited only when both ends of the child list
            # are given.
            if 'First' in entry and 'Last' in entry:
                for x in search(entry['First'], level + 1):
                    yield x
            if 'Next' not in entry:
                return
            entry = entry['Next']

    return search(self.catalog['Outlines'], 0)
def getobj(self, objid):
    """Return the object stored under ``objid``, or None if it cannot be read.

    A cached object is reused when available.  Otherwise the entries of
    ``self.xrefs`` are asked in order for the object's position, and the
    object is read either out of another stream object (when the position
    names one) or directly from ``self._parser`` at the given offset.
    Whenever ``self.decipher`` is set, the result is passed through
    ``decipher_all`` before being returned.

    Returns None when no xref entry knows the id, when the index into the
    containing stream is out of range (both only if ``STRICT`` is false),
    or when the parser hits end of data while reading the object.

    Raises:
        PDFException: if ``self.xrefs`` is empty.
        PDFSyntaxError: for an invalid object header, and, when ``STRICT``
            is true, for the lookup/stream problems described above.
    """
    if not self.xrefs:
        raise PDFException('PDFDocument is not initialized')
    if 2 <= self.debug:
        print >>sys.stderr, 'getobj: objid=%r' % (objid)
    if objid in self._cached_objs:
        # Only the object itself is cached, so generation 0 is used on a hit.
        genno = 0
        obj = self._cached_objs[objid]
    else:
        for xref in self.xrefs:
            try:
                (strmid, index) = xref.get_pos(objid)
                break
            except KeyError:
                pass
        else:
            if STRICT:
                raise PDFSyntaxError('Cannot locate objid=%r' % objid)
            # return null for a nonexistent reference.
            return None
        if strmid:
            # The object is stored inside the stream object ``strmid``.
            stream = stream_value(self.getobj(strmid))
            if stream.get('Type') is not LITERAL_OBJSTM:
                if STRICT:
                    raise PDFSyntaxError('Not a stream object: %r' % stream)
            try:
                n = stream['N']
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                n = 0
            if strmid in self._parsed_objs:
                objs = self._parsed_objs[strmid]
            else:
                # Read everything the stream contains, until end of data.
                parser = PDFStreamParser(stream.get_data())
                parser.set_document(self)
                objs = []
                try:
                    while True:
                        (_, obj) = parser.nextobject()
                        objs.append(obj)
                except PSEOF:
                    pass
                if self.caching:
                    self._parsed_objs[strmid] = objs
            genno = 0
            # The wanted object sits ``n*2`` items into the parsed list,
            # plus its index.
            i = n*2+index
            try:
                obj = objs[i]
            except IndexError:
                if STRICT:
                    raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
                # return None for an invalid object number
                return None
            if isinstance(obj, PDFStream):
                obj.set_objid(objid, 0)
        else:
            # The object is stored directly at offset ``index``.
            self._parser.seek(index)
            (_, objid1) = self._parser.nexttoken()  # objid
            (_, genno) = self._parser.nexttoken()  # genno
            (_, kwd) = self._parser.nexttoken()
            # #### hack around malformed pdf files
            if objid1 != objid:
                # The offset did not land on the expected header.  Read on
                # until the 'obj' keyword, keeping every token seen so far.
                header = [objid1, genno, kwd]
                while kwd is not self.KEYWORD_OBJ:
                    (_, kwd) = self._parser.nexttoken()
                    header.append(kwd)
                # The last token is the keyword itself, so the generation
                # number is the one just before it.
                genno = header[-2]
            # #### end hack around malformed pdf files
            if kwd is not self.KEYWORD_OBJ:
                raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
            try:
                (_, obj) = self._parser.nextobject()
                if isinstance(obj, PDFStream):
                    obj.set_objid(objid, genno)
            except PSEOF:
                return None
        if 2 <= self.debug:
            print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
        if self.caching:
            self._cached_objs[objid] = obj
    if self.decipher:
        obj = decipher_all(self.decipher, objid, genno, obj)
    return obj