def readindirect(self, objnum, gennum):
    ''' Return the indirect object identified by (objnum, gennum).

        The object is parsed from the file data the first time it is
        requested and stored in its record; later requests return
        the stored value without parsing again.
    '''
    fdata = self.fdata
    objnum = int(objnum)
    gennum = int(gennum)
    record = self.indirect_objects[fdata, objnum, gennum]
    cached = record[1]
    if cached is not self.unresolved:
        return cached

    def remember(value):
        # Callback given to the object parsers: store the value in
        # the record as soon as it is known.
        record[1] = value

    def plain(source, setobj, obj):
        # Fallback for a token that has no entry in self.special:
        # the token itself is the object.
        setobj(obj)
        return obj

    # The object must start with "<objnum> <gennum> obj"
    tokens = PdfTokens(fdata, record[0])
    header = tokens.multiple(3)
    assert int(header[0]) == objnum, header
    assert int(header[1]) == gennum, header
    assert header[2] == 'obj', header

    # Dispatch on the first token of the body; self.special supplies
    # the readers for tokens that need special handling.
    first = tokens.next()
    handler = self.special.get(first, plain)
    result = handler(tokens, remember, first)

    # Consume the 'endobj' (or the stream) that follows the object
    self.readstream(result, tokens)
    result.indirect = True
    return result
def readxref(fdata):
    ''' Find and validate the 'startxref' section at the end of fdata.

        Returns a (startloc, tokens) pair: startloc is the index of
        the last 'startxref' keyword in fdata, and tokens is a
        PdfTokens object created with the numeric offset that
        follows the keyword.
    '''
    startloc = fdata.rindex('startxref')

    # Exactly three tokens are expected: the keyword, the offset
    # and the end-of-file marker.
    trailer = list(PdfTokens(fdata, startloc, False))
    assert len(trailer) == 3, trailer
    assert trailer[0] == 'startxref', trailer[0]
    assert trailer[1].isdigit(), trailer[1]
    assert trailer[2].rstrip() == '%%EOF', repr(trailer[2])

    xref_offset = int(trailer[1])
    return startloc, PdfTokens(fdata, xref_offset)
def readstream(obj, source):
    ''' Read optional stream following a dictionary object.

        obj is the object that was just parsed and source is the
        tokenizer it was read from.  If the next token is 'endobj'
        there is no stream and obj is left untouched.  Otherwise
        the stream data is stored in obj._stream.

        When the /Length value does not lead to 'endstream endobj',
        the stream is located by searching for 'endstream' instead,
        and obj.Length is rewritten to the length actually found.

        Malformed input trips one of the assertions below; a missing
        'endstream' keyword makes fdata.index() fail.
    '''
    tok = source.next()
    if tok == 'endobj':
        return  # No stream

    assert isinstance(obj, PdfDict)
    assert tok == 'stream', tok
    fdata = source.fdata

    # Position just past the 'stream' keyword that was consumed
    floc = fdata.rindex(tok, 0, source.floc) + len(tok)
    ch = fdata[floc]
    if ch == '\r':
        floc += 1
        ch = fdata[floc]
    # The keyword must be followed by LF or CRLF
    assert ch == '\n'
    startstream = floc + 1

    # First attempt: trust the /Length entry
    endstream = startstream + int(obj.Length)
    obj._stream = fdata[startstream:endstream]
    source = PdfTokens(fdata, endstream)
    endit = source.multiple(2)
    if endit != 'endstream endobj'.split():
        # /Length attribute is broken, try to read stream
        # anyway disregarding the specified value
        # TODO: issue warning here once we have some kind of
        #       logging
        endstream = fdata.index('endstream', startstream)

        # Drop the end-of-line in front of 'endstream', but only
        # when it is part of the stream.  For an empty stream the
        # preceding characters are the EOL of the 'stream' keyword,
        # and stripping them would produce a negative Length.
        if endstream > startstream:
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1

        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        assert endit == 'endstream endobj'.split()
        obj.Length = str(endstream-startstream)
        obj._stream = fdata[startstream:endstream]
def readstream(obj, source):
    ''' Read optional stream following a dictionary object.

        source is the tokenizer the object obj was read from.  An
        'endobj' token means that there is no stream, and the
        function returns without changing obj.  Otherwise the data
        between the 'stream' and 'endstream' keywords is stored in
        obj._stream.

        The /Length entry is tried first.  If it does not point at
        'endstream endobj', the 'endstream' keyword is searched for
        and obj.Length is replaced with the length actually found.

        Malformed input trips one of the assertions below; a missing
        'endstream' keyword makes fdata.index() fail.
    '''
    tok = source.next()
    if tok == 'endobj':
        return  # No stream

    assert isinstance(obj, PdfDict)
    assert tok == 'stream', tok
    fdata = source.fdata
    expected = 'endstream endobj'.split()

    # Skip the LF or CRLF that terminates the 'stream' keyword
    floc = fdata.rindex(tok, 0, source.floc) + len(tok)
    ch = fdata[floc]
    if ch == '\r':
        floc += 1
        ch = fdata[floc]
    assert ch == '\n'
    startstream = floc + 1

    # Use the declared /Length and check what follows the data
    endstream = startstream + int(obj.Length)
    obj._stream = fdata[startstream:endstream]
    source = PdfTokens(fdata, endstream)
    endit = source.multiple(2)
    if endit == expected:
        return

    # /Length attribute is broken, try to read stream
    # anyway disregarding the specified value
    # TODO: issue warning here once we have some kind of
    #       logging
    endstream = fdata.index('endstream', startstream)

    # Strip the end-of-line before 'endstream' only if it belongs to
    # the stream.  With an empty stream the preceding characters are
    # the EOL of the 'stream' keyword; removing them would move
    # endstream in front of startstream and store a negative Length.
    if endstream > startstream:
        if fdata[endstream - 2:endstream] == '\r\n':
            endstream -= 2
        elif fdata[endstream - 1] in ['\n', '\r']:
            endstream -= 1

    source = PdfTokens(fdata, endstream)
    endit = source.multiple(2)
    assert endit == expected
    obj.Length = str(endstream - startstream)
    obj._stream = fdata[startstream:endstream]