def _getobj_parse(self, pos, objid): self._parser.seek(pos) (_, objid1) = self._parser.nexttoken() # objid if objid1 != objid: raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid)) (_, genno) = self._parser.nexttoken() # genno (_, kwd) = self._parser.nexttoken() if kwd is not self.KEYWORD_OBJ: raise PDFSyntaxError('Invalid object spec: offset=%r' % pos) (_, obj) = self._parser.nextobject() return obj
def __init__(self, parser, password='', caching=True, fallback=True): "Set the document to use a given PDFParser object." self.caching = caching self.xrefs = [] self.info = [] self.catalog = None self.encryption = None self.decipher = None self._parser = None self._cached_objs = {} self._parsed_objs = {} self._parser = parser self._parser.set_document(self) self.is_printable = self.is_modifiable = self.is_extractable = True # Retrieve the information of each header that was appended # (maybe multiple times) at the end of the document. try: pos = self.find_xref(parser) self.read_xref_from(parser, pos, self.xrefs) except PDFNoValidXRef: fallback = True if fallback: parser.fallback = True xref = PDFXRefFallback() xref.load(parser) self.xrefs.append(xref) for xref in self.xrefs: trailer = xref.get_trailer() if not trailer: continue # If there's an encryption info, remember it. if 'Encrypt' in trailer: #assert not self.encryption self.encryption = (list_value(trailer['ID']), dict_value(trailer['Encrypt'])) self._initialize_password(password) if 'Info' in trailer: self.info.append(dict_value(trailer['Info'])) if 'Root' in trailer: # Every PDF file must have exactly one /Root dictionary. self.catalog = dict_value(trailer['Root']) break else: raise PDFSyntaxError('No /Root object! - Is this really a PDF?') if self.catalog.get('Type') is not LITERAL_CATALOG: if STRICT: raise PDFSyntaxError('Catalog not found!') return
def _get_objects(self, stream): if stream.get('Type') is not LITERAL_OBJSTM: if STRICT: raise PDFSyntaxError('Not a stream object: %r' % stream) try: n = stream['N'] except KeyError: if STRICT: raise PDFSyntaxError('N is not defined: %r' % stream) n = 0 parser = PDFStreamParser(stream.get_data()) parser.set_document(self) objs = [] try: while 1: (_, obj) = parser.nextobject() objs.append(obj) except PSEOF: pass return (objs, n)
def _getobj_objstm(self, stream, index, objid): if stream.objid in self._parsed_objs: (objs, n) = self._parsed_objs[stream.objid] else: (objs, n) = self._get_objects(stream) if self.caching: self._parsed_objs[stream.objid] = (objs, n) i = n * 2 + index try: obj = objs[i] except IndexError: raise PDFSyntaxError('index too big: %r' % index) return obj
def load(self, parser, debug=0): parser.seek(0) while 1: try: (pos, line) = parser.nextline() except PSEOF: break if line.startswith('trailer'): parser.seek(pos) self.load_trailer(parser) if 1 <= debug: print >> sys.stderr, 'trailer: %r' % self.get_trailer() break m = self.PDFOBJ_CUE.match(line) if not m: continue (objid, genno) = m.groups() objid = int(objid) genno = int(genno) self.offsets[objid] = (None, pos, genno) # expand ObjStm. parser.seek(pos) (_, obj) = parser.nextobject() if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM: stream = stream_value(obj) try: n = stream['N'] except KeyError: if STRICT: raise PDFSyntaxError('N is not defined: %r' % stream) n = 0 parser1 = PDFStreamParser(stream.get_data()) objs = [] try: while 1: (_, obj) = parser1.nextobject() objs.append(obj) except PSEOF: pass n = min(n, len(objs) // 2) for index in xrange(n): objid1 = objs[index * 2] self.offsets[objid1] = (objid, index, 0) return
def load(self, parser, debug=0): (_, objid) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() (_, stream) = parser.nextobject() if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF: raise PDFNoValidXRef('Invalid PDF stream spec.') size = stream['Size'] index_array = stream.get('Index', (0, size)) if len(index_array) % 2 != 0: raise PDFSyntaxError('Invalid index number') self.ranges.extend(choplist(2, index_array)) (self.fl1, self.fl2, self.fl3) = stream['W'] self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs if 1 <= debug: print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % (', '.join(map(repr, self.ranges)), self.fl1, self.fl2, self.fl3)) return