def _readInlineImage(self, stream):
    """Parse an inline image that follows a ``BI`` operator.

    Key/value pairs are read with ``readObject`` until a token starting
    with ``I`` (the ``ID`` operator) is found; the raw image characters
    are then collected up to the two-character ``EI`` marker.

    Returns:
        dict with ``'settings'`` (a ``DictionaryObject`` of the pairs read)
        and ``'data'`` (the collected image characters as one string).

    Raises:
        utils.PdfReadError: if the stream ends before ``EI`` is found.
        AssertionError: if the marker after the settings is not ``ID``.
    """
    settings = DictionaryObject()
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # An empty read means end of stream; without this check the
            # loop would keep re-parsing the final character.
            raise utils.PdfReadError(
                'Stream has ended unexpectedly in inline image settings')
        stream.seek(-1, 1)
        if tok == 'I':
            break
        key = readObject(stream, self.pdf)
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, self.pdf)
        settings[key] = value
    # The stream is positioned on "ID"; the third character is the single
    # separator that precedes the image data.
    tmp = stream.read(3)
    assert tmp[:2] == 'ID'
    chunks = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise utils.PdfReadError(
                'Stream has ended unexpectedly in inline image data')
        if tok == 'E':
            following = stream.read(1)
            if following == 'I':
                break
            if not following:
                # Seeking back here would land on the "E" again and loop
                # forever, so report the truncation instead.
                raise utils.PdfReadError(
                    'Stream has ended unexpectedly in inline image data')
            stream.seek(-1, 1)
        chunks.append(tok)
    # Leave the stream on the first non-whitespace character after "EI".
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    return {'settings': settings, 'data': ''.join(chunks)}
def readObjectHeader(self, stream):
    """Read an object header and return ``(idnum, generation)`` as ints.

    The cross-reference table should point exactly at the header, but
    some files are off by a few whitespace bytes, so a comment and any
    whitespace are skipped before and between the two numbers.  If either
    ``utils.skipOverWhitespace`` call returns a truthy value and
    ``self.strict`` is set, a ``utils.PdfReadWarning`` is issued; this is
    not treated as a fatal error.
    """
    utils.skipOverComment(stream)
    surplus = False
    surplus |= utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    raw_id = readUntilWhitespace(stream)
    surplus |= utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    raw_generation = readUntilWhitespace(stream)
    # Step over the three characters after the generation number, then
    # park the stream on the first non-whitespace character behind them.
    stream.read(3)
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    if surplus and self.strict:
        message = "Superfluous whitespace found in object header %s %s" % (
            raw_id, raw_generation)
        warnings.warn(message, utils.PdfReadWarning)
    return int(raw_id), int(raw_generation)
def readObjectHeader(self, stream):
    """Read an object header and return ``(idnum, generation)`` as ints.

    Two whitespace-terminated tokens are read, the next three characters
    are skipped (presumably the ``obj`` keyword), and the stream is left
    on the first non-whitespace character that follows.
    """
    header_fields = [readUntilWhitespace(stream), readUntilWhitespace(stream)]
    stream.read(3)
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    object_id, generation = (int(field) for field in header_fields)
    return object_id, generation
def _readInlineImage(self, stream):
    """Parse an inline image; reading begins just after the ``BI`` operator.

    First the dictionary of settings is read as key/value pairs until a
    token starting with ``I`` (the ``ID`` operator) is met, then the raw
    image characters are collected up to the two-character ``EI`` marker.

    Returns:
        dict with ``"settings"`` (a ``DictionaryObject``) and ``"data"``
        (the collected image characters as one string).

    Raises:
        utils.PdfReadError: if the stream ends before ``EI`` is found.
        AssertionError: if the marker after the settings is not ``ID``.
    """
    settings = DictionaryObject()
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # End of stream: stop instead of re-parsing the last character.
            raise utils.PdfReadError(
                "Stream has ended unexpectedly in inline image settings")
        stream.seek(-1, 1)
        if tok == "I":
            # "ID" - begin of image data
            break
        key = readObject(stream, self.pdf)
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, self.pdf)
        settings[key] = value
    # left at beginning of ID
    tmp = stream.read(3)
    assert tmp[:2] == "ID"
    pieces = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise utils.PdfReadError(
                "Stream has ended unexpectedly in inline image data")
        if tok == "E":
            lookahead = stream.read(1)
            if lookahead == "I":
                break
            if not lookahead:
                # Nothing was read, so seeking back would revisit the "E"
                # forever; treat it as a truncated stream.
                raise utils.PdfReadError(
                    "Stream has ended unexpectedly in inline image data")
            stream.seek(-1, 1)
        pieces.append(tok)
    # Position the stream on the first non-whitespace character after "EI".
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    return {"settings": settings, "data": "".join(pieces)}
def _readInlineImage(self, stream):
    """Parse an inline image; reading begins just after the ``BI`` operator.

    The settings are read as key/value pairs until a token starting with
    ``I`` (the ``ID`` operator) appears.  The image characters that follow
    are gathered until the two-character ``EI`` marker.

    Returns:
        dict with ``"settings"`` (a ``DictionaryObject``) and ``"data"``
        (the gathered image characters as one string).

    Raises:
        utils.PdfReadError: if the stream ends before ``EI`` is found.
        AssertionError: if the marker after the settings is not ``ID``.
    """
    # first read the dictionary of settings.
    settings = DictionaryObject()
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # An empty token means the stream is exhausted.
            raise utils.PdfReadError(
                "Stream has ended unexpectedly in inline image settings")
        stream.seek(-1, 1)
        if tok == "I":
            # "ID" - begin of image data
            break
        key = readObject(stream, self.pdf)
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        settings[key] = readObject(stream, self.pdf)
    # left at beginning of ID
    tmp = stream.read(3)
    assert tmp[:2] == "ID"
    collected = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise utils.PdfReadError(
                "Stream has ended unexpectedly in inline image data")
        if tok == "E":
            after_e = stream.read(1)
            if after_e == "I":
                break
            if not after_e:
                # A trailing "E" at end of stream: seeking back would
                # re-read it endlessly.
                raise utils.PdfReadError(
                    "Stream has ended unexpectedly in inline image data")
            stream.seek(-1, 1)
        collected.append(tok)
    # Leave the stream on the first non-whitespace character after "EI".
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    return {"settings": settings, "data": "".join(collected)}
def readFromStream(stream, pdf):
    """Read a ``<< ... >>`` dictionary, plus its stream data if present.

    Key/value pairs are read with ``readObject`` until ``>>``.  If the
    ``stream`` keyword follows, ``/Length`` characters are read as the
    stream data and the ``endstream`` marker is verified.

    Returns:
        ``StreamObject.initializeFromDictionary(data)`` when stream data
        was read, otherwise a ``DictionaryObject`` holding the pairs.

    Raises:
        utils.PdfReadError: if the input does not start with ``<<``, the
            stream ends inside the dictionary, a key is defined twice, or
            ``endstream`` cannot be found.
        AssertionError: if the EOL after ``stream`` or the ``/Length``
            entry is missing.
    """
    tmp = stream.read(2)
    if tmp != '<<':
        raise utils.PdfReadError('dictionary read error')
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # End of stream inside the dictionary: report it rather than
            # seeking back and re-parsing the final character.
            raise utils.PdfReadError('Stream has ended unexpectedly')
        if tok == '>':
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key in data:
            raise utils.PdfReadError('multiple definitions in dictionary')
        data[key] = value
    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == 's' and stream.read(5) == 'tream':
        eol = stream.read(1)
        # Tolerate spaces between the 'stream' keyword and its EOL.
        while eol == ' ':
            eol = stream.read(1)
        assert eol in ('\n', '\r')
        if eol == '\r':
            # consume the character that follows the carriage return
            stream.read(1)
        assert '/Length' in data
        length = data['/Length']
        if isinstance(length, IndirectObject):
            # Resolving the reference may move the stream; restore it.
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data['__streamdata__'] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if e + ndstream != 'endstream':
            # The declared length may be one too long: look one character
            # further back for the marker and trim the data if it is there.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == 'endstream':
                data['__streamdata__'] = data['__streamdata__'][:-1]
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError("Unable to find 'endstream' marker after stream.")
    else:
        stream.seek(pos, 0)
    if '__streamdata__' in data:
        return StreamObject.initializeFromDictionary(data)
    else:
        retval = DictionaryObject()
        retval.update(data)
        return retval
def readObjectHeader(self, stream):
    """Read an object header and return ``(idnum, generation)`` as ints.

    The cross-reference table should put the stream exactly on the header,
    but some files are off by whitespace bytes, so leading whitespace is
    skipped first.  After the two numbers, three characters are skipped
    (presumably the ``obj`` keyword) and the stream is left on the next
    non-whitespace character.
    """
    # Skip stray whitespace and step back onto the first significant
    # character.
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    numbers = []
    for _ in range(2):
        numbers.append(readUntilWhitespace(stream))
    stream.read(3)
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    return int(numbers[0]), int(numbers[1])
def getObject(self, indirectReference):
    """Resolve *indirectReference* to the object it points to.

    Lookup order: the ``self.resolvedObjects`` cache, then (for generation
    0 entries listed in ``self.xref_objStm``) the containing object
    stream, then the byte offset recorded in ``self.xref``.  Objects read
    from the file are decrypted when the document is encrypted and are
    stored through ``self.cacheIndirectObject``.

    Raises:
        Exception: if the file is encrypted and no decryption key is set.
        AssertionError: if the object stream or object header does not
            match what the cross-reference data promised.
    """
    retval = self.resolvedObjects.get(
        indirectReference.generation, {}).get(indirectReference.idnum, None)
    if retval is not None:
        return retval
    if indirectReference.generation == 0 and \
            indirectReference.idnum in self.xref_objStm:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self.xref_objStm[indirectReference.idnum]
        objStm = IndirectObject(stmnum, 0, self).getObject()
        assert objStm['/Type'] == '/ObjStm'
        assert idx < objStm['/N']
        streamData = StringIO(objStm.getData())
        for _ in range(objStm['/N']):
            # Each index entry is "<object number> <offset>".
            objnum = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            offset = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            # Jump to the object, read it, then return to the index.
            t = streamData.tell()
            streamData.seek(objStm['/First'] + offset, 0)
            obj = readObject(streamData, self)
            self.resolvedObjects[0][objnum] = obj
            streamData.seek(t, 0)
        return self.resolvedObjects[0][indirectReference.idnum]
    start = self.xref[indirectReference.generation][indirectReference.idnum]
    self.stream.seek(start, 0)
    idnum, generation = self.readObjectHeader(self.stream)
    assert idnum == indirectReference.idnum
    assert generation == indirectReference.generation
    retval = readObject(self.stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self.isEncrypted:
        # if we don't have the encryption key:
        if not hasattr(self, '_decryption_key'):
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here...
        import struct
        # Per-object key: document key + low 3 bytes of the object number
        # + low 2 bytes of the generation, hashed with MD5.
        pack1 = struct.pack("<i", indirectReference.idnum)[:3]
        pack2 = struct.pack("<i", indirectReference.generation)[:2]
        key = self._decryption_key + pack1 + pack2
        assert len(key) == (len(self._decryption_key) + 5)
        md5_hash = md5(key).digest()
        key = md5_hash[:min(16, len(self._decryption_key) + 5)]
        retval = self._decryptObject(retval, key)

    self.cacheIndirectObject(generation, idnum, retval)
    return retval
def readFromStream(stream, pdf):
    """Read an indirect reference (``<id> <generation> R``) from *stream*.

    Returns ``IndirectObject(int(id), int(generation), pdf)``.

    Raises:
        PdfStreamError: if the stream ends while either number is read.
        utils.PdfReadError: if the next non-whitespace character after the
            numbers is not ``R``.
    """
    def next_byte():
        # One character from the stream; an empty read means it ended early.
        byte = stream.read(1)
        if not byte:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        return byte

    # Object number: everything up to the first whitespace character.
    id_digits = b_("")
    byte = next_byte()
    while not byte.isspace():
        id_digits += byte
        byte = next_byte()

    # Generation: leading whitespace is ignored, then read up to the next
    # whitespace character.
    gen_digits = b_("")
    while True:
        byte = next_byte()
        if byte.isspace():
            if gen_digits:
                break
            continue
        gen_digits += byte

    keyword = readNonWhitespace(stream)
    if keyword != b_("R"):
        raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
    return IndirectObject(int(id_digits), int(gen_digits), pdf)
def readFromStream(stream, pdf):
    """Read an indirect reference (``<id> <generation> R``) from *stream*.

    Returns ``IndirectObject(int(id), int(generation), pdf)``.

    Raises:
        PdfStreamError: if the stream ends while either number is read.
        utils.PdfReadError: if the next non-whitespace character after the
            numbers is not ``R``.
    """
    # Object number: accumulate characters until whitespace.
    number_part = b_("")
    while True:
        ch = stream.read(1)
        if not ch:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if ch.isspace():
            break
        number_part += ch

    # Generation: skip whitespace before the first digit, stop at the
    # first whitespace after it.
    generation_part = b_("")
    while True:
        ch = stream.read(1)
        if not ch:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if not ch.isspace():
            generation_part += ch
        elif generation_part:
            break

    marker = readNonWhitespace(stream)
    if marker == b_("R"):
        return IndirectObject(int(number_part), int(generation_part), pdf)
    raise utils.PdfReadError(
        "Error reading indirect object reference at byte %s"
        % utils.hexStr(stream.tell()))
def readObject(stream, pdf):
    """Read one object, choosing the reader from its first character.

    The first character is peeked (the stream is seeked back afterwards)
    and the matching ``readFromStream`` / ``read...FromStream`` function
    is called.  A comment is skipped and the object after it is read.
    Anything else is treated as an indirect reference when the next
    characters look like ``<int> <int> R``, otherwise as a number.

    Raises:
        utils.PdfReadError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)
    if tok == 't' or tok == 'f':
        return BooleanObject.readFromStream(stream)
    if tok == '(':
        return readStringFromStream(stream)
    if tok == '/':
        return NameObject.readFromStream(stream)
    if tok == '[':
        return ArrayObject.readFromStream(stream, pdf)
    if tok == 'n':
        return NullObject.readFromStream(stream)
    if tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        return readHexStringFromStream(stream)
    if tok == '%':
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if not tok:
                # An empty read never matches an EOL, so without this
                # check the loop would spin forever at end of stream.
                raise utils.PdfReadError('Stream has ended unexpectedly')
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    if tok == '+' or tok == '-':
        return NumberObject.readFromStream(stream)
    # number object OR indirect reference
    peek = stream.read(20)
    stream.seek(-len(peek), 1)
    if re.match('(\\d+)\\s(\\d+)\\sR[^a-zA-Z]', peek) is not None:
        return IndirectObject.readFromStream(stream, pdf)
    return NumberObject.readFromStream(stream)
def __parseContentStream(self, stream):
    """Split a content stream into ``(operands, operator)`` operations.

    The stream is read from its start.  Objects are collected as operands
    until an operator token is met, then the pair is appended to
    ``self.operations``.  ``BI`` is special: the inline image is read with
    ``self._readInlineImage`` and stored as ``(image, 'INLINE IMAGE')``.
    Comments are skipped here because the token after a comment may be an
    operator rather than an object.
    """
    stream.seek(0, 0)
    operands = []
    while True:
        peek = readNonWhitespace(stream)
        if peek == '':
            break
        stream.seek(-1, 1)
        if peek.isalpha() or peek == "'" or peek == '"':
            operator = ''
            while True:
                tok = stream.read(1)
                if tok == '':
                    # End of stream; checked first so an empty read is
                    # never mistaken for a delimiter.
                    break
                if tok.isspace() or tok in NameObject.delimiterCharacters:
                    stream.seek(-1, 1)
                    break
                operator += tok
            if operator == 'BI':
                assert operands == []
                ii = self._readInlineImage(stream)
                self.operations.append((ii, 'INLINE IMAGE'))
            else:
                self.operations.append((operands, operator))
                operands = []
        elif peek == '%':
            # Skip to end of line.  The empty string ends the loop too, so
            # a comment without a trailing EOL cannot hang the parser.
            while peek not in ('\r', '\n', ''):
                peek = stream.read(1)
        else:
            operands.append(readObject(stream, None))
def readHexStringFromStream(stream):
    """Read a hexadecimal string (``<...>``) and return a string object.

    The opening character is skipped; non-whitespace characters are then
    taken two at a time as hex digits until ``>``.  A single leftover
    digit is padded with a trailing ``0``.

    Raises:
        utils.PdfReadError: if the stream ends before ``>`` is found.
    """
    stream.read(1)
    chars = []
    x = ""
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # An empty token means end of stream; without this check the
            # loop would never terminate on an unterminated string.
            raise utils.PdfReadError("Stream has ended unexpectedly")
        if tok == ">":
            break
        x += tok
        if len(x) == 2:
            chars.append(chr(int(x, base=16)))
            x = ""
    if len(x) == 1:
        x += "0"
    if len(x) == 2:
        chars.append(chr(int(x, base=16)))
    return createStringObject("".join(chars))
def readHexStringFromStream(stream):
    """Read a hexadecimal string (``<...>``) and return a string object.

    After skipping the opening character, non-whitespace characters are
    paired up as hex digits until ``>`` is met.  An odd trailing digit is
    completed with ``0``.

    Raises:
        utils.PdfReadError: if the stream ends before ``>`` is found.
    """
    stream.read(1)
    decoded = []
    pair = ""
    while True:
        digit = readNonWhitespace(stream)
        if not digit:
            # End of stream before the closing ">": stop instead of
            # looping on empty reads.
            raise utils.PdfReadError("Stream has ended unexpectedly")
        if digit == ">":
            break
        pair += digit
        if len(pair) == 2:
            decoded.append(chr(int(pair, base=16)))
            pair = ""
    if len(pair) == 1:
        pair += "0"
    if len(pair) == 2:
        decoded.append(chr(int(pair, base=16)))
    return createStringObject("".join(decoded))
def readObject(stream, pdf):
    """Read one object, choosing the reader from its first character.

    The first character is peeked and the stream is seeked back before
    dispatching.  Returns ``None`` when nothing could be read.  A comment
    is skipped and the object after it is returned.  Anything not matched
    by a prefix is an indirect reference when the next characters look
    like ``<int> <int> R``, otherwise a number.

    Raises:
        utils.PdfReadError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == '':
        return None
    elif tok == 't' or tok == 'f':
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == '(':
        # string object
        return readStringFromStream(stream)
    elif tok == '/':
        # name object
        return NameObject.readFromStream(stream)
    elif tok == '[':
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == 'n':
        # null object
        return NullObject.readFromStream(stream)
    elif tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif tok == '%':
        # comment
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if not tok:
                # An empty read never matches an EOL; stop here instead
                # of looping forever at end of stream.
                raise utils.PdfReadError("Stream has ended unexpectedly")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == '+' or tok == '-':
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
def readObject(stream, pdf):
    """Read one object, choosing the reader from its first character.

    The first character is peeked (the stream is seeked back afterwards)
    and its position in ``ObjectPrefix`` selects the reader.  A comment is
    skipped and the object after it is returned.  A character not found
    in ``ObjectPrefix`` starts a number, or an indirect reference when the
    next characters match ``IndirectPattern``.

    Raises:
        utils.PdfReadError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif idx == 5:
        # string object
        return readStringFromStream(stream)
    elif idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    elif idx == 7:
        # comment
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # An empty read never matches an EOL; stop here instead
                # of looping forever at end of stream.
                raise utils.PdfReadError("Stream has ended unexpectedly")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok in NumberSigns:
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if IndirectPattern.match(peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
def readObject(stream, pdf):
    """Read one object, choosing the reader from its first character.

    The position of the peeked first character in ``ObjectPrefix`` picks
    the reader; the stream is seeked back before the reader runs.  A
    comment is skipped and the object after it is returned.  A character
    not found in ``ObjectPrefix`` starts a number, or an indirect
    reference when the next characters match ``IndirectPattern``.

    Raises:
        utils.PdfReadError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    if idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        return readHexStringFromStream(stream)
    if idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    if idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    if idx == 5:
        # string object
        return readStringFromStream(stream)
    if idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    if idx == 7:
        # comment
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # End of stream inside the comment: an empty read would
                # otherwise keep this loop running forever.
                raise utils.PdfReadError("Stream has ended unexpectedly")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    # number object OR indirect reference
    if tok in NumberSigns:
        # number
        return NumberObject.readFromStream(stream)
    peek = stream.read(20)
    stream.seek(-len(peek), 1)  # reset to start
    if IndirectPattern.match(peek) is not None:
        return IndirectObject.readFromStream(stream, pdf)
    return NumberObject.readFromStream(stream)
def readObject(stream, pdf):
    """Read one object, choosing the reader from its first character.

    The first character is peeked and the stream is seeked back before
    dispatching.  A comment is skipped and the object after it is
    returned.  Anything not matched by a prefix is an indirect reference
    when the next characters look like ``<int> <int> R``, otherwise a
    number.

    Raises:
        utils.PdfReadError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == 't' or tok == 'f':
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == '(':
        # string object
        return readStringFromStream(stream)
    elif tok == '/':
        # name object
        return NameObject.readFromStream(stream)
    elif tok == '[':
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == 'n':
        # null object
        return NullObject.readFromStream(stream)
    elif tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif tok == '%':
        # comment
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if not tok:
                # An empty read never matches an EOL; stop here instead
                # of looping forever at end of stream.
                raise utils.PdfReadError("Stream has ended unexpectedly")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == '+' or tok == '-':
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
def readObject(stream, pdf):
    """Read one object, choosing the reader from its first character.

    The first character is peeked and the stream is seeked back before
    dispatching.  A comment is skipped and the object after it is
    returned.  Anything not matched by a prefix is an indirect reference
    when the next characters look like ``<int> <int> R``, otherwise a
    number.

    Raises:
        utils.PdfReadError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == b_('t') or tok == b_('f'):
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == b_('('):
        # string object
        return readStringFromStream(stream)
    elif tok == b_('/'):
        # name object
        return NameObject.readFromStream(stream)
    elif tok == b_('['):
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == b_('n'):
        # null object
        return NullObject.readFromStream(stream)
    elif tok == b_('<'):
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif tok == b_('%'):
        # comment
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # An empty read never matches an EOL; stop here instead
                # of looping forever at end of stream.
                raise utils.PdfReadError("Stream has ended unexpectedly")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == b_('+') or tok == b_('-'):
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if re.match(b_(r"(\d+)\s(\d+)\sR[^a-zA-Z]"), peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
def readObject(stream, pdf):
    """Read one object, choosing the reader from its first character.

    The first character is peeked and the stream is seeked back before
    dispatching.  A comment is skipped and the object after it is
    returned.  Anything not matched by a prefix is an indirect reference
    when the next characters look like ``<int> <int> R``, otherwise a
    number.

    Raises:
        utils.PdfReadError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == "t" or tok == "f":
        # boolean object
        return BooleanObject.readFromStream(stream)
    if tok == "(":
        # string object
        return readStringFromStream(stream)
    if tok == "/":
        # name object
        return NameObject.readFromStream(stream)
    if tok == "[":
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    if tok == "n":
        # null object
        return NullObject.readFromStream(stream)
    if tok == "<":
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == "<<":
            return DictionaryObject.readFromStream(stream, pdf)
        return readHexStringFromStream(stream)
    if tok == "%":
        # comment
        while tok not in ("\r", "\n"):
            tok = stream.read(1)
            if not tok:
                # End of stream inside the comment: an empty read would
                # otherwise keep this loop running forever.
                raise utils.PdfReadError("Stream has ended unexpectedly")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    # number object OR indirect reference
    if tok == "+" or tok == "-":
        # number
        return NumberObject.readFromStream(stream)
    peek = stream.read(20)
    stream.seek(-len(peek), 1)  # reset to start
    if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
        return IndirectObject.readFromStream(stream, pdf)
    return NumberObject.readFromStream(stream)
def readHexStringFromStream(stream):
    """Read a hexadecimal string (``<...>``) and return a string object.

    The opening character is skipped, then non-whitespace characters are
    consumed two at a time as hex digits until ``>``.  A lone trailing
    digit is padded with ``0``.

    Raises:
        PdfStreamError: if the stream ends before ``>`` is found.
    """
    stream.read(1)
    decoded = []
    pair = b_("")
    while True:
        digit = readNonWhitespace(stream)
        if not digit:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if digit == b_(">"):
            break
        pair += digit
        if len(pair) == 2:
            decoded.append(chr(int(pair, base=16)))
            pair = b_("")
    # Flush a leftover half pair.
    if len(pair) == 1:
        pair += b_("0")
    if len(pair) == 2:
        decoded.append(chr(int(pair, base=16)))
    return createStringObject(b_("".join(decoded)))
def readHexStringFromStream(stream):
    """Decode a ``<...>`` hexadecimal string into a string object.

    Whitespace between digits is ignored.  Digits are converted pairwise;
    if the digit count is odd, the final digit is treated as if followed
    by ``0``.

    Raises:
        PdfStreamError: if the stream ends before the closing ``>``.
    """
    stream.read(1)  # step over the opening character
    text = ""
    pending = b_("")
    tok = readNonWhitespace(stream)
    while tok and tok != b_(">"):
        pending += tok
        if len(pending) == 2:
            text += chr(int(pending, base=16))
            pending = b_("")
        tok = readNonWhitespace(stream)
    if not tok:
        # stream has truncated prematurely
        raise PdfStreamError("Stream has ended unexpectedly")
    if len(pending) == 1:
        pending += b_("0")
    if len(pending) == 2:
        text += chr(int(pending, base=16))
    return createStringObject(b_(text))
def __parseContentStream(self, stream):
    """Split a content stream into ``(operands, operator)`` operations.

    Reads from the start of *stream*.  Objects are gathered as operands
    until an operator token (at most two characters are requested from
    ``readUntilWhitespace``) is met, then the pair is appended to
    ``self.operations``.  ``BI`` begins an inline image, which needs its
    own parser and is stored as ``(image, "INLINE IMAGE")``.
    """
    stream.seek(0, 0)
    pending = []
    while True:
        lead = readNonWhitespace(stream)
        if lead == "":
            break
        stream.seek(-1, 1)
        if not (lead.isalpha() or lead == "'" or lead == '"'):
            # Not an operator: read it as an operand.
            pending.append(readObject(stream, None))
            continue
        op_name = readUntilWhitespace(stream, maxchars=2)
        if op_name != "BI":
            self.operations.append((pending, op_name))
            pending = []
            continue
        # begin inline image - a completely different parsing mechanism
        # is required
        assert pending == []
        image = self._readInlineImage(stream)
        self.operations.append((image, "INLINE IMAGE"))
def __parseContentStream(self, stream):
    """Split a content stream into ``(operands, operator)`` operations.

    The stream is read from its start.  Objects are collected as operands
    until an operator token is met, then the pair is appended to
    ``self.operations``.  ``BI`` is special: the inline image is read with
    ``self._readInlineImage`` and stored as ``(image, "INLINE IMAGE")``.
    """
    stream.seek(0, 0)
    operands = []
    while True:
        peek = readNonWhitespace(stream)
        if peek == '':
            break
        stream.seek(-1, 1)
        if peek.isalpha() or peek == "'" or peek == '"':
            operator = ""
            while True:
                tok = stream.read(1)
                if tok == '':
                    # End of stream; checked first so an empty read is
                    # never mistaken for a delimiter.
                    break
                if tok.isspace() or tok in NameObject.delimiterCharacters:
                    stream.seek(-1, 1)
                    break
                operator += tok
            if operator == "BI":
                # begin inline image - a completely different parsing
                # mechanism is required
                assert operands == []
                ii = self._readInlineImage(stream)
                self.operations.append((ii, "INLINE IMAGE"))
            else:
                self.operations.append((operands, operator))
                operands = []
        elif peek == '%':
            # If we encounter a comment in the content stream, we have to
            # handle it here.  Typically, readObject will handle
            # encountering a comment -- but readObject assumes that
            # following the comment must be the object we're trying to
            # read.  In this case, it could be an operator instead.
            # The empty string also ends the loop, so a comment without a
            # trailing EOL cannot hang the parser.
            while peek not in ('\r', '\n', ''):
                peek = stream.read(1)
        else:
            operands.append(readObject(stream, None))
def __parseContentStream(self, stream):
    """Split a content stream into ``(operands, operator)`` operations.

    Starting from the beginning of *stream*, objects are accumulated as
    operands until an operator token appears; the pair is then appended
    to ``self.operations``.  The ``BI`` operator hands over to
    ``self._readInlineImage`` and is stored as ``(image, "INLINE IMAGE")``.
    """
    stream.seek(0, 0)
    operands = []
    while True:
        peek = readNonWhitespace(stream)
        if peek == '':
            break
        stream.seek(-1, 1)
        if peek.isalpha() or peek == "'" or peek == '"':
            operator = ""
            while True:
                tok = stream.read(1)
                if tok == '':
                    # Test for end of stream before the delimiter test so
                    # an empty read can never count as a delimiter.
                    break
                if tok.isspace() or tok in NameObject.delimiterCharacters:
                    stream.seek(-1, 1)
                    break
                operator += tok
            if operator == "BI":
                # begin inline image - a completely different parsing
                # mechanism is required
                assert operands == []
                ii = self._readInlineImage(stream)
                self.operations.append((ii, "INLINE IMAGE"))
            else:
                self.operations.append((operands, operator))
                operands = []
        elif peek == '%':
            # Comments are handled here rather than in readObject, because
            # readObject assumes the token after a comment is the object
            # being read, whereas here it could be an operator.
            # Stopping on the empty string as well keeps a final comment
            # with no EOL from looping forever.
            while peek not in ('\r', '\n', ''):
                peek = stream.read(1)
        else:
            operands.append(readObject(stream, None))
def readFromStream(stream, pdf):
    """Read a ``<< ... >>`` dictionary, plus its stream data if present.

    Key/value pairs are read with ``readObject`` until ``>>``.  If the
    ``stream`` keyword follows, ``/Length`` characters are read as the
    stream data and the ``endstream`` marker is verified.

    Returns:
        ``StreamObject.initializeFromDictionary(data)`` when stream data
        was read, otherwise a ``DictionaryObject`` holding the pairs.

    Raises:
        utils.PdfReadError: if the input does not start with ``<<``, the
            stream ends inside the dictionary, a key is defined twice, or
            ``endstream`` cannot be found.
        AssertionError: if the EOL after ``stream`` or the ``/Length``
            entry is missing.
    """
    tmp = stream.read(2)
    if tmp != "<<":
        raise utils.PdfReadError("dictionary read error")
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # End of stream inside the dictionary: report it rather than
            # seeking back and re-parsing the final character.
            raise utils.PdfReadError("Stream has ended unexpectedly")
        if tok == ">":
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key in data:
            # multiple definitions of key not permitted
            raise utils.PdfReadError("multiple definitions in dictionary")
        data[key] = value
    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == 's' and stream.read(5) == 'tream':
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but before EOL.
        # patch provided by Danial Sandler
        while eol == ' ':
            eol = stream.read(1)
        assert eol in ("\n", "\r")
        if eol == "\r":
            # read \n after
            stream.read(1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # Resolving the reference may move the stream; restore it.
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != "endstream":
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == "endstream":
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError("Unable to find 'endstream' marker after stream.")
    else:
        stream.seek(pos, 0)
    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    else:
        retval = DictionaryObject()
        retval.update(data)
        return retval
def getObject(self, indirectReference):
    """Resolve *indirectReference* to the object it points to.

    Lookup order: the ``self.resolvedObjects`` cache, then (for generation
    0 entries listed in ``self.xref_objStm``) the containing object
    stream, then the byte offset recorded in ``self.xref``.  Objects read
    from the file are decrypted when the document is encrypted and are
    stored through ``self.cacheIndirectObject``.

    Returns:
        The resolved object, or ``None`` (after a ``utils.PdfReadWarning``)
        when the cross-reference data has no entry for the reference.

    Raises:
        utils.PdfReadError: if the object header found at the recorded
            offset carries a different object number (tolerated only when
            ``self.xrefIndex`` is set and ``self.strict`` is false).
        Exception: if the file is encrypted and no decryption key is set.
        AssertionError: if the object stream is malformed or the
            generation number in the header does not match.
    """
    retval = self.resolvedObjects.get(
        indirectReference.generation, {}).get(indirectReference.idnum, None)
    if retval is not None:
        return retval
    if indirectReference.generation == 0 \
            and indirectReference.idnum in self.xref_objStm:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self.xref_objStm[indirectReference.idnum]
        objStm = IndirectObject(stmnum, 0, self).getObject()
        assert objStm['/Type'] == '/ObjStm'
        assert idx < objStm['/N']
        streamData = StringIO(objStm.getData())
        for _ in range(objStm['/N']):
            # Each index entry is "<object number> <offset>".
            objnum = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            offset = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            # Jump to the object, read it, then return to the index.
            t = streamData.tell()
            streamData.seek(objStm['/First'] + offset, 0)
            obj = readObject(streamData, self)
            self.resolvedObjects[0][objnum] = obj
            streamData.seek(t, 0)
        return self.resolvedObjects[0][indirectReference.idnum]
    # .get(): a generation with no cross-reference entries at all is
    # reported like any other undefined object instead of a KeyError.
    if indirectReference.idnum \
            not in self.xref.get(indirectReference.generation, {}):
        warnings.warn(
            "Object %d %d not defined."
            % (indirectReference.idnum, indirectReference.generation),
            utils.PdfReadWarning)
        return None
    start = self.xref[indirectReference.generation][
        indirectReference.idnum]
    self.stream.seek(start, 0)
    idnum, generation = self.readObjectHeader(self.stream)
    # Explicit comparison rather than try/assert, so the check still runs
    # when assertions are disabled (python -O).
    if idnum != indirectReference.idnum:
        if not self.xrefIndex:
            # some other problem
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does not "
                " match actual (%d %d)."
                % (indirectReference.idnum, indirectReference.generation,
                   idnum, generation))
        # Xref table probably had bad indexes due to not being
        # zero-indexed
        if self.strict:
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does "
                "not match actual (%d %d); xref "
                "table not zero-indexed."
                % (indirectReference.idnum, indirectReference.generation,
                   idnum, generation))
        # non-strict: should not happen since the xref table is corrected
        # in non-strict mode, so the mismatch is tolerated
    assert generation == indirectReference.generation
    retval = readObject(self.stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self.isEncrypted:
        # if we don't have the encryption key:
        if not hasattr(self, '_decryption_key'):
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here...
        # Per-object key: document key + low 3 bytes of the object number
        # + low 2 bytes of the generation, hashed with MD5.
        pack1 = struct.pack("<i", indirectReference.idnum)[:3]
        pack2 = struct.pack("<i", indirectReference.generation)[:2]
        key = self._decryption_key + pack1 + pack2
        assert len(key) == (len(self._decryption_key) + 5)
        md5_hash = md5(key).digest()
        key = md5_hash[:min(16, len(self._decryption_key) + 5)]
        retval = self._decryptObject(retval, key)

    self.cacheIndirectObject(generation, idnum, retval)
    return retval
def read(self, stream):
    """Load every cross-reference section and trailer found in *stream*.

    Works backwards from the end of the file: finds the ``%%EOF`` marker
    and the ``startxref`` offset, then follows the chain of ``/Prev``
    links, handling both classic ``xref`` tables and cross-reference
    streams.  Fills ``self.xref`` (generation -> {object number -> byte
    offset}), ``self.xref_objStm`` (object number -> [object stream
    number, index]) and ``self.trailer``.  Sections are visited newest
    first and existing entries are never overwritten.

    NOTE(review): ``self.xrefIndex`` and ``self.strict`` are read here and
    are presumably initialised elsewhere in the class.

    Raises:
        utils.PdfReadError: if the EOF marker, the ``startxref`` keyword
            or a cross-reference table cannot be found.
        AssertionError: if a cross-reference stream is not of type
            ``/XRef``.
    """
    # start at the end:
    stream.seek(-1, 2)
    line = b_('')
    while not line:
        line = self.readNextEndLine(stream)
    if line[:5] != b_("%%EOF"):
        raise utils.PdfReadError("EOF marker not found")

    # find startxref entry - the location of the xref table
    line = self.readNextEndLine(stream)
    startxref = int(line)
    line = self.readNextEndLine(stream)
    if line[:9] != b_("startxref"):
        raise utils.PdfReadError("startxref not found")

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while 1:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == b_("x"):
            # standard cross-reference table
            ref = stream.read(4)
            if ref[:3] != b_("ref"):
                raise utils.PdfReadError("xref table read error")
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            # check if the first time looking at the xref table
            firsttime = True
            while True:
                num = readObject(stream, self)
                if firsttime and num != 0:
                    self.xrefIndex = num
                    warnings.warn(
                        "Xref table not zero-indexed. ID "
                        "numbers for objects will %sbe "
                        "corrected." % ("" if not self.strict else "not "),
                        utils.PdfReadWarning)
                    # if table not zero indexed, could be due to
                    # error from when PDF was created
                    # which will lead to mismatched indices later on
                firsttime = False
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)

                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes (as of PDF 1.7). However, some files have
                    # 21-byte entries (or more) due to the use of \r\n
                    # (CRLF) EOL's. Detect that case, and adjust the line
                    # until it does not begin with a \r (CR) or \n (LF).
                    while line[0] in b_("\x0D\x0A"):
                        stream.seek(-20 + 1, 1)
                        line = stream.read(20)

                    # On the other hand, some malformed PDF files
                    # use a single character EOL without a preceding
                    # space. Detect that case, and seek the stream
                    # back one character. (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in b_("0123456789t"):
                        stream.seek(-1, 1)

                    offset, generation = line[:16].split(b_(" "))
                    offset, generation = int(offset), int(generation)
                    self.xref.setdefault(generation, {})
                    if num in self.xref[generation]:
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        pass
                    else:
                        self.xref[generation][num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag != b_("trailer"):
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                self.trailer.setdefault(key, value)
            if "/Prev" in newTrailer:
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            # Without /Index, a single subsection [0, /Size] is used.
            idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                cnt = 0
                while cnt < size:
                    # Decode the fields of one entry; the first field is
                    # the entry type and decides what the others mean.
                    for i, width in enumerate(entrySizes):
                        d = streamData.read(width)
                        di = convertToInt(d, width)
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            if xref_type == 0:
                                # next_free_object = di
                                pass
                            elif xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            if xref_type == 0:
                                # next_generation = di
                                pass
                            elif xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 0:
                        pass
                    elif xref_type == 1:
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if "/Prev" in xrefstream:
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref. Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find(b_("xref"))
            if xref_loc != -1:
                startxref -= (10 - xref_loc)
                continue
            # no xref table found at specified location.  Raised
            # explicitly: an assert would be stripped under -O and the
            # reader would carry on with no cross-reference data.
            raise utils.PdfReadError(
                "Could not find xref table at specified location")
    # if not zero-indexed, verify that the table is correct
    # change it if necessary
    if self.xrefIndex and not self.strict:
        loc = stream.tell()
        for gen in self.xref:
            if gen == 65535:
                continue
            for obj_id in self.xref[gen]:
                stream.seek(self.xref[gen][obj_id], 0)
                pid, pgen = self.readObjectHeader(stream)
                if pid == obj_id - self.xrefIndex:
                    self._zeroXref(gen)
                    break
                # if not, then either it's just plain wrong,
                # or the non-zero-index is actually correct
        stream.seek(loc, 0)  # return to where it was
def getObject(self, indirectReference):
    """Resolve an indirect reference to the object it designates.

    The lookup order is: the cache of already resolved objects, then
    object streams (``self.xref_objStm``), then the byte offset recorded
    in ``self.xref``, read from ``self.stream``.

    :param indirectReference: reference exposing ``idnum`` and
        ``generation`` attributes.
    :return: the resolved object, or ``None`` (after a
        ``utils.PdfReadWarning``) when the cross-reference data holds no
        entry for the reference.
    :raises utils.PdfReadError: when the header found at the recorded
        offset carries another object ID (unless the xref table is known
        to be non-zero-indexed and the reader is not strict).
    :raises Exception: when the file is encrypted and no decryption key
        has been set.
    """
    ref_id = indirectReference.idnum
    ref_gen = indirectReference.generation

    retval = self.resolvedObjects.get(ref_gen, {}).get(ref_id, None)
    if retval is not None:
        return retval

    if ref_gen == 0 and ref_id in self.xref_objStm:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self.xref_objStm[ref_id]
        objStm = IndirectObject(stmnum, 0, self).getObject()
        assert objStm['/Type'] == '/ObjStm'
        assert idx < objStm['/N']
        streamData = StringIO(objStm.getData())
        for _ in range(objStm['/N']):
            objnum = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            offset = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            # Remember the position in the (number, offset) table, jump
            # to the object itself, then come back for the next pair.
            t = streamData.tell()
            streamData.seek(objStm['/First'] + offset, 0)
            obj = readObject(streamData, self)
            # The generation-0 cache may not exist yet if nothing of that
            # generation has been cached so far.
            self.resolvedObjects.setdefault(0, {})[objnum] = obj
            streamData.seek(t, 0)
        return self.resolvedObjects[0][ref_id]

    # A generation without any xref entry is treated like a missing
    # object number: warn and return None.
    if ref_id not in self.xref.get(ref_gen, {}):
        warnings.warn("Object %d %d not defined." % (
            ref_id, ref_gen), utils.PdfReadWarning)
        return None

    start = self.xref[ref_gen][ref_id]
    self.stream.seek(start, 0)
    idnum, generation = self.readObjectHeader(self.stream)
    # Explicit comparison instead of assert/except AssertionError, so the
    # check is not removed when Python runs with -O.
    if idnum != ref_id:
        if self.xrefIndex:
            # Xref table probably had bad indexes due to not
            # being zero-indexed
            if self.strict:
                raise utils.PdfReadError(
                    "Expected object ID (%d %d) does "
                    "not match actual (%d %d); xref "
                    "table not zero-indexed." % (
                        ref_id, ref_gen, idnum, generation))
            # else: should not happen since the xref table is corrected
            # in non-strict mode
        else:
            # some other problem
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does not "
                " match actual (%d %d)." % (
                    ref_id, ref_gen, idnum, generation))
    assert generation == ref_gen
    retval = readObject(self.stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self.isEncrypted:
        # if we don't have the encryption key:
        if not hasattr(self, '_decryption_key'):
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here...
        # Per-object key: file key + low 3 bytes of the object number +
        # low 2 bytes of the generation, hashed with MD5.
        pack1 = struct.pack("<i", ref_id)[:3]
        pack2 = struct.pack("<i", ref_gen)[:2]
        key = self._decryption_key + pack1 + pack2
        assert len(key) == (len(self._decryption_key) + 5)
        md5_hash = md5(key).digest()
        key = md5_hash[:min(16, len(self._decryption_key) + 5)]
        retval = self._decryptObject(retval, key)

    self.cacheIndirectObject(generation, idnum, retval)
    return retval
def readFromStream(stream, pdf):
    """Parse a ``<< ... >>`` dictionary, plus trailing stream data if any.

    :param stream: seekable binary stream positioned on the opening
        ``<<``.
    :param pdf: reader handed to ``readObject`` and used to resolve an
        indirect ``/Length``.
    :return: ``StreamObject.initializeFromDictionary(data)`` when a
        ``stream`` keyword follows the dictionary, otherwise a
        ``DictionaryObject`` holding the parsed entries.
    :raises utils.PdfReadError: if the dictionary does not start with
        ``<<``, a key is defined twice, or ``endstream`` cannot be found.
    :raises utils.PdfStreamError: if the stream ends inside the
        dictionary.
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise utils.PdfReadError(
            "Dictionary read error at byte %s: "
            "stream must begin with '<<'" % utils.hexStr(stream.tell()))
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise utils.PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            # consume the second '>' of the closing '>>'
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        # skip whitespace between key and value, then step back onto
        # the first byte of the value
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key in data:
            # multiple definitions of key not permitted
            raise utils.PdfReadError(
                "Multiple definitions in "
                "dictionary at byte %s for key %s"
                % (utils.hexStr(stream.tell()), key))
        data[key] = value

    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream'
        # keyword but before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        assert eol in (b_("\n"), b_("\r"))
        if eol == b_("\r"):
            # read \n after; compare as bytes so that the LF of a CRLF
            # is really consumed on Python 3 as well
            if stream.read(1) != b_("\n"):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # resolving the length moves the stream; restore afterwards
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError(
                    "Unable to find 'endstream' marker after "
                    "stream at byte %s." % utils.hexStr(stream.tell()))
    else:
        # plain dictionary: undo the look-ahead
        stream.seek(pos, 0)

    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    retval = DictionaryObject()
    retval.update(data)
    return retval
def read(self, stream):
    """Load cross-reference data and trailer entries from *stream*.

    Rebuilds ``self.xref`` (generation -> {object number -> offset}),
    ``self.xref_objStm`` (object number -> [object stream number, index])
    and ``self.trailer`` by starting at the ``startxref`` offset stored
    at the end of the file and following ``/Prev`` links.

    :param stream: seekable stream holding the document.
    :raises utils.PdfReadError: for an empty file, a missing ``%%EOF``
        or ``startxref`` line, or a malformed ``xref`` keyword.
    """
    stream.seek(0, 2)
    if stream.tell() == 0:
        raise utils.PdfReadError('Empty file')

    # Walk backwards from the end: %%EOF, the offset, then "startxref".
    stream.seek(-1, 2)
    line = ''
    while not line:
        line = self.readNextEndLine(stream)
    if line[:5] != '%%EOF':
        raise utils.PdfReadError('EOF marker not found')
    startxref = int(self.readNextEndLine(stream))
    line = self.readNextEndLine(stream)
    if line[:9] != 'startxref':
        raise utils.PdfReadError('startxref not found')

    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()

    while True:
        stream.seek(startxref, 0)
        first = stream.read(1)

        if first == 'x':
            # classic cross-reference table
            if stream.read(4)[:3] != 'ref':
                raise utils.PdfReadError('xref table read error')
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            while True:
                # subsection header: first object number and entry count
                num = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                done = 0
                while done < size:
                    line = stream.read(20)
                    # a digit or 't' in the last position means the read
                    # ran into the following entry or the "trailer" word
                    if line[-1] in '0123456789t':
                        stream.seek(-1, 1)
                    raw_offset, raw_gen = line[:16].split(' ')
                    offset = int(raw_offset)
                    generation = int(raw_gen)
                    gen_table = self.xref.setdefault(generation, {})
                    # entries recorded earlier take precedence
                    if num not in gen_table:
                        gen_table[num] = offset
                    done += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                if stream.read(7) == 'trailer':
                    break
                # not the trailer yet: another subsection follows
                stream.seek(-7, 1)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                if key not in self.trailer:
                    self.trailer[key] = value
            if '/Prev' not in newTrailer:
                break
            startxref = newTrailer['/Prev']

        elif first.isdigit():
            # cross-reference stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream['/Type'] == '/XRef'
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            idx_pairs = xrefstream.get('/Index',
                                       [0, xrefstream.get('/Size')])
            entrySizes = xrefstream.get('/W')
            for num, size in self._pairs(idx_pairs):
                done = 0
                while done < size:
                    for i, width in enumerate(entrySizes):
                        di = convertToInt(streamData.read(width), width)
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            # field 2 of a type-0 entry is not used
                            if xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            # field 3 of a type-0 entry is not used
                            if xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 1:
                        gen_table = self.xref.setdefault(generation, {})
                        if num not in gen_table:
                            gen_table[num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    done += 1
                    num += 1
            for key in ('/Root', '/Encrypt', '/Info', '/ID'):
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if '/Prev' not in xrefstream:
                break
            startxref = xrefstream['/Prev']

        else:
            # unexpected byte at startxref: look for "xref" in the 20
            # bytes around it and retry from there
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find('xref')
            if xref_loc == -1:
                assert False
                break
            startxref -= 10 - xref_loc
def readFromStream(stream, pdf):
    """Parse a ``<< ... >>`` dictionary, plus trailing stream data if any.

    :param stream: seekable binary stream positioned on the opening
        ``<<``.
    :param pdf: reader handed to ``readObject`` and used to resolve an
        indirect ``/Length``.
    :return: ``StreamObject.initializeFromDictionary(data)`` when a
        ``stream`` keyword follows the dictionary, otherwise a
        ``DictionaryObject`` holding the parsed entries.
    :raises utils.PdfReadError: if the dictionary does not start with
        ``<<``, a key is defined twice, or ``endstream`` cannot be found.
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise utils.PdfReadError(
            "Dictionary read error at byte %s: stream must begin with '<<'"
            % utils.hexStr(stream.tell()))
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_(">"):
            # consume the second '>' of the closing '>>'
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        # skip whitespace between key and value, then step back onto
        # the first byte of the value
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key in data:
            # multiple definitions of key not permitted
            raise utils.PdfReadError(
                "Multiple definitions in dictionary at byte %s for key %s"
                % (utils.hexStr(stream.tell()), key))
        data[key] = value

    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but
        # before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        assert eol in (b_("\n"), b_("\r"))
        if eol == b_("\r"):
            # read \n after; if the next byte is not an LF it already
            # belongs to the stream data, so put it back
            if stream.read(1) != b_("\n"):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # resolving the length moves the stream; restore afterwards
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError(
                    "Unable to find 'endstream' marker after stream at byte %s."
                    % utils.hexStr(stream.tell()))
    else:
        # plain dictionary: undo the look-ahead
        stream.seek(pos, 0)

    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    retval = DictionaryObject()
    retval.update(data)
    return retval
def read(self, stream):
    """Read the cross-reference information and trailer(s) from *stream*.

    Starting at the ``startxref`` offset stored at the end of the file
    and following ``/Prev`` links, fills ``self.xref``
    (generation -> {object number -> byte offset}), ``self.xref_objStm``
    (object number -> [object stream number, index]) and
    ``self.trailer``. When the first table subsection does not start at
    object 0, ``self.xrefIndex`` is set and, in non-strict mode, the
    table is checked against the file and re-based via
    ``self._zeroXref``.

    :param stream: seekable binary stream holding the document.
    :raises utils.PdfReadError: if ``%%EOF`` or ``startxref`` is
        missing, the ``xref`` keyword is malformed, or no xref table can
        be located near the ``startxref`` offset.
    """
    # start at the end:
    stream.seek(-1, 2)
    line = b_('')
    while not line:
        line = self.readNextEndLine(stream)
    if line[:5] != b_("%%EOF"):
        raise utils.PdfReadError("EOF marker not found")

    # find startxref entry - the location of the xref table
    line = self.readNextEndLine(stream)
    startxref = int(line)
    line = self.readNextEndLine(stream)
    if line[:9] != b_("startxref"):
        raise utils.PdfReadError("startxref not found")

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while True:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == b_("x"):
            # standard cross-reference table
            ref = stream.read(4)
            if ref[:3] != b_("ref"):
                raise utils.PdfReadError("xref table read error")
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            # check if the first time looking at the xref table
            firsttime = True
            while True:
                num = readObject(stream, self)
                if firsttime and num != 0:
                    self.xrefIndex = num
                    warnings.warn(
                        "Xref table not zero-indexed. ID "
                        "numbers for objects will %sbe "
                        "corrected." % ("" if not self.strict else "not "),
                        utils.PdfReadWarning)
                    # if table not zero indexed, could be due to
                    # error from when PDF was created
                    # which will lead to mismatched indices later on
                firsttime = False
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)

                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes (as of PDF 1.7). However, some files have
                    # 21-byte entries (or more) due to the use of \r\n
                    # (CRLF) EOL's. Detect that case, and adjust the line
                    # until it does not begin with a \r (CR) or \n (LF).
                    while line[0] in b_("\x0D\x0A"):
                        stream.seek(-20 + 1, 1)
                        line = stream.read(20)

                    # On the other hand, some malformed PDF files
                    # use a single character EOL without a preceding
                    # space. Detect that case, and seek the stream
                    # back one character. (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in b_("0123456789t"):
                        stream.seek(-1, 1)

                    offset, generation = line[:16].split(b_(" "))
                    offset, generation = int(offset), int(generation)
                    gen_table = self.xref.setdefault(generation, {})
                    # It really seems like we should allow the last
                    # xref table in the file to override previous
                    # ones. Since we read the file backwards, assume
                    # any existing key is already set correctly.
                    if num not in gen_table:
                        gen_table[num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag != b_("trailer"):
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                self.trailer.setdefault(key, value)
            if "/Prev" in newTrailer:
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            idx_pairs = xrefstream.get("/Index",
                                       [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                cnt = 0
                while cnt < size:
                    for i, width in enumerate(entrySizes):
                        d = streamData.read(width)
                        di = convertToInt(d, width)
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            # for type 0 this field is the next free
                            # object number, which is not needed
                            if xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            # for type 0 this field is the next
                            # generation number, which is not needed
                            if xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 1:
                        gen_table = self.xref.setdefault(generation, {})
                        if num not in gen_table:
                            gen_table[num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if "/Prev" in xrefstream:
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find(b_("xref"))
            if xref_loc == -1:
                # no xref table found at specified location; raise
                # explicitly so the failure is not lost under -O
                raise utils.PdfReadError(
                    "Could not find xref table at specified location")
            startxref -= (10 - xref_loc)

    # if not zero-indexed, verify that the table is correct
    # change it if necessary
    if self.xrefIndex and not self.strict:
        loc = stream.tell()
        for gen in self.xref:
            if gen == 65535:
                continue
            for obj_num in self.xref[gen]:
                stream.seek(self.xref[gen][obj_num], 0)
                pid = self.readObjectHeader(stream)[0]
                if pid == obj_num - self.xrefIndex:
                    self._zeroXref(gen)
                    break
                # if not, then either it's just plain wrong,
                # or the non-zero-index is actually correct
        stream.seek(loc, 0)  # return to where it was
def readFromStream(stream, pdf):
    """Parse a ``<< ... >>`` dictionary, plus trailing stream data if any.

    This revision is lenient: NUL bytes between entries are skipped, the
    first definition of a duplicated key is kept, and a missing
    ``endstream`` only produces a warning when ``pdf.strict`` is false.

    :param stream: seekable binary stream positioned on the opening
        ``<<``.
    :param pdf: reader handed to ``readObject``, used to resolve an
        indirect ``/Length`` and consulted for ``strict``.
    :return: ``StreamObject.initializeFromDictionary(data)`` when a
        ``stream`` keyword follows the dictionary, otherwise a
        ``DictionaryObject`` holding the parsed entries.
    :raises utils.PdfReadError: if the dictionary does not start with
        ``<<``, or ``endstream`` cannot be found in strict mode.
    :raises PdfStreamError: if the stream ends inside the dictionary.
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise utils.PdfReadError(
            "Dictionary read error at byte %s: stream must begin with '<<'"
            % utils.hexStr(stream.tell()))
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_('\x00'):
            continue
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            # consume the second '>' of the closing '>>'
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        # skip whitespace between key and value, then step back onto
        # the first byte of the value
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        # the first definition of a key wins; later ones are ignored
        if key not in data:
            data[key] = value

    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but
        # before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        assert eol in (b_("\n"), b_("\r"))
        if eol == b_("\r"):
            # read \n after; compare as bytes so that the LF of a CRLF
            # is really consumed on Python 3 as well
            if stream.read(1) != b_("\n"):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # resolving the length moves the stream; restore afterwards
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            elif not pdf.strict:
                warnings.warn("Ignoring missing endstream. "
                              "This could affect PDF output.")
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError(
                    "Unable to find 'endstream' marker after stream at byte %s."
                    % utils.hexStr(stream.tell()))
    else:
        # plain dictionary: undo the look-ahead
        stream.seek(pos, 0)

    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    retval = DictionaryObject()
    retval.update(data)
    return retval
def readFromStream(stream, pdf):
    """Parse a ``<< ... >>`` dictionary, plus trailing stream data if any.

    :param stream: seekable stream positioned on the opening ``<<``.
    :param pdf: reader handed to ``readObject`` and used to resolve an
        indirect ``/Length``.
    :return: ``StreamObject.initializeFromDictionary(data)`` when a
        ``stream`` keyword follows the dictionary, otherwise a
        ``DictionaryObject`` holding the parsed entries.
    :raises utils.PdfReadError: if the dictionary does not start with
        ``<<``, a key is defined twice, or ``endstream`` cannot be found.
    """
    tmp = stream.read(2)
    if tmp != "<<":
        raise utils.PdfReadError("dictionary read error")
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == ">":
            # consume the second '>' of the closing '>>'
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        # skip whitespace between key and value, then step back onto
        # the first byte of the value
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key in data:
            # multiple definitions of key not permitted
            raise utils.PdfReadError("multiple definitions in dictionary")
        data[key] = value

    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == 's' and stream.read(5) == 'tream':
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but
        # before EOL.
        # patch provided by Danial Sandler
        while eol == ' ':
            eol = stream.read(1)
        assert eol in ("\n", "\r")
        if eol == "\r":
            # read \n after; if the next character is not an LF it
            # already belongs to the stream data, so put it back
            if stream.read(1) != "\n":
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # resolving the length moves the stream; restore afterwards
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != "endstream":
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == "endstream":
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError(
                    "Unable to find 'endstream' marker after stream.")
    else:
        # plain dictionary: undo the look-ahead
        stream.seek(pos, 0)

    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    retval = DictionaryObject()
    retval.update(data)
    return retval
def read(self, stream):
    """Load cross-reference data and trailer entries from *stream*.

    Rebuilds ``self.xref`` (generation -> {object number -> offset}),
    ``self.xref_objStm`` (object number -> [object stream number, index])
    and ``self.trailer`` by starting at the ``startxref`` offset stored
    at the end of the file and following ``/Prev`` links.

    :param stream: seekable stream holding the document.
    :raises utils.PdfReadError: for a missing ``%%EOF`` or ``startxref``
        line, or a malformed ``xref`` keyword.
    """
    # start at the end:
    stream.seek(-1, 2)
    line = ''
    while not line:
        line = self.readNextEndLine(stream)
    if line[:5] != "%%EOF":
        raise utils.PdfReadError("EOF marker not found")

    # find startxref entry - the location of the xref table
    startxref = int(self.readNextEndLine(stream))
    line = self.readNextEndLine(stream)
    if line[:9] != "startxref":
        raise utils.PdfReadError("startxref not found")

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()

    while True:
        # load the xref table
        stream.seek(startxref, 0)
        lead = stream.read(1)

        if lead == "x":
            # standard cross-reference table
            if stream.read(4)[:3] != "ref":
                raise utils.PdfReadError("xref table read error")
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            while True:
                # subsection header: first object number and entry count
                num = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                seen = 0
                while seen < size:
                    line = stream.read(20)
                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes.  However... some malformed PDF files
                    # use a single character EOL without a preceding
                    # space.  Detect that case, and seek the stream
                    # back one character.  (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in "0123456789t":
                        stream.seek(-1, 1)
                    raw_offset, raw_gen = line[:16].split(" ")
                    offset = int(raw_offset)
                    generation = int(raw_gen)
                    gen_table = self.xref.setdefault(generation, {})
                    # It really seems like we should allow the last
                    # xref table in the file to override previous
                    # ones.  Since we read the file backwards, assume
                    # any existing key is already set correctly.
                    if num not in gen_table:
                        gen_table[num] = offset
                    seen += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                if stream.read(7) == "trailer":
                    break
                # more xrefs!
                stream.seek(-7, 1)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                if key not in self.trailer:
                    self.trailer[key] = value
            if "/Prev" not in newTrailer:
                break
            startxref = newTrailer["/Prev"]

        elif lead.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            idx_pairs = xrefstream.get("/Index",
                                       [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                seen = 0
                while seen < size:
                    for i, width in enumerate(entrySizes):
                        di = convertToInt(streamData.read(width), width)
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            # field 2 of a type-0 entry is not used
                            if xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            # field 3 of a type-0 entry is not used
                            if xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 1:
                        gen_table = self.xref.setdefault(generation, {})
                        if num not in gen_table:
                            gen_table[num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    seen += 1
                    num += 1
            for key in ("/Root", "/Encrypt", "/Info", "/ID"):
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if "/Prev" not in xrefstream:
                break
            startxref = xrefstream["/Prev"]

        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find("xref")
            if xref_loc == -1:
                # no xref table found at specified location
                assert False
                break
            startxref -= (10 - xref_loc)
def read(self, stream):
    """Load cross-reference data and trailer entries from *stream*.

    Rebuilds ``self.xref`` (generation -> {object number -> offset}),
    ``self.xref_objStm`` (object number -> [object stream number, index])
    and ``self.trailer`` by starting at the ``startxref`` offset stored
    at the end of the file and following ``/Prev`` links. Because newer
    sections are read first, an entry that is already recorded is never
    replaced by one from an older section.

    :param stream: seekable stream holding the document.
    :raises AssertionError: for a missing ``%%EOF`` or ``startxref``
        line, a malformed ``xref`` keyword, or when no xref table can be
        found near the ``startxref`` offset.
    """
    # start at the end:
    stream.seek(-1, 2)
    line = ""
    while not line:
        line = self.readNextEndLine(stream)
    assert line[:5] == "%%EOF"

    # find startxref entry - the location of the xref table
    line = self.readNextEndLine(stream)
    startxref = int(line)
    line = self.readNextEndLine(stream)
    assert line[:9] == "startxref"

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = {}
    while True:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == "x":
            # standard cross-reference table
            ref = stream.read(4)
            assert ref[:3] == "ref"
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            while True:
                num = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)
                    offset, generation = line[:16].split(" ")
                    offset, generation = int(offset), int(generation)
                    gen_table = self.xref.setdefault(generation, {})
                    # It really seems like we should allow the last
                    # xref table in the file to override previous
                    # ones.  Since we read the file backwards, assume
                    # any existing key is already set correctly.
                    if num not in gen_table:
                        gen_table[num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag != "trailer":
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                if key not in self.trailer:
                    self.trailer[key] = value
            if NameObject("/Prev") in newTrailer:
                startxref = newTrailer[NameObject("/Prev")]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            # /Index is a flat list of (first object number, count)
            # pairs; a stream may describe several subsections.
            idx_pairs = xrefstream.get("/Index",
                                       [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for pair_start in range(0, len(idx_pairs), 2):
                num = idx_pairs[pair_start]
                size = idx_pairs[pair_start + 1]
                cnt = 0
                while cnt < size:
                    for i, width in enumerate(entrySizes):
                        d = streamData.read(width)
                        di = convertToInt(d, width)
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            # field 2 of a type-0 entry is not used
                            if xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            # field 3 of a type-0 entry is not used
                            if xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    # As for classic tables: sections are read newest
                    # first, so keep whatever is already recorded.
                    if xref_type == 1:
                        gen_table = self.xref.setdefault(generation, {})
                        if num not in gen_table:
                            gen_table[num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream[key]
            if "/Prev" in xrefstream:
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find("xref")
            if xref_loc != -1:
                startxref -= 10 - xref_loc
                continue
            else:
                # no xref table found at specified location
                assert False
                break
def readFromStream(stream, pdf):
    """Parse a ``<< ... >>`` dictionary, plus trailing stream data if any.

    A duplicated key is an error only when ``pdf.strict`` is true;
    otherwise the later definition replaces the earlier one. When
    ``endstream`` is not found right after the declared length, the
    positions one byte earlier and one byte later are tried as well.

    :param stream: seekable binary stream positioned on the opening
        ``<<``.
    :param pdf: reader handed to ``readObject``, used to resolve an
        indirect ``/Length`` and consulted for ``strict``.
    :return: ``StreamObject.initializeFromDictionary(data)`` when a
        ``stream`` keyword follows the dictionary, otherwise a
        ``DictionaryObject`` holding the parsed entries.
    :raises utils.PdfReadError: if the dictionary does not start with
        ``<<``, a key is defined twice in strict mode, or ``endstream``
        cannot be found.
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise utils.PdfReadError(
            "Dictionary read error at byte %s: stream must begin with '<<'"
            % utils.hexStr(stream.tell()))
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_(">"):
            # consume the second '>' of the closing '>>'
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        # skip whitespace between key and value, then step back onto
        # the first byte of the value
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key in data and pdf.strict:
            # multiple definitions of key not permitted
            raise utils.PdfReadError(
                "Multiple definitions in dictionary at byte %s for key %s"
                % (utils.hexStr(stream.tell()), key))
        data[key] = value

    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but
        # before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        assert eol in (b_("\n"), b_("\r"))
        if eol == b_("\r"):
            # read \n after; compare as bytes so that the LF of a CRLF
            # is really consumed on Python 3 as well
            if stream.read(1) != b_("\n"):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # resolving the length moves the stream; restore afterwards
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                # still not found, try looking forward one character
                stream.seek(pos + 1, 0)
                end = stream.read(9)
                if end == b_("endstream"):
                    # we found it by looking forward one character, add
                    # skipped character to the end of the stream data
                    stream.seek(-10, 1)
                    data["__streamdata__"] = (data["__streamdata__"]
                                              + stream.read(1))
                    stream.seek(9, 1)
                else:
                    # give up looking for misplaced "endstream" token
                    stream.seek(pos, 0)
                    raise utils.PdfReadError(
                        "Unable to find 'endstream' marker after stream at byte %s."
                        % utils.hexStr(stream.tell()))
    else:
        # plain dictionary: undo the look-ahead
        stream.seek(pos, 0)

    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    retval = DictionaryObject()
    retval.update(data)
    return retval
def readObjectHeader(self, stream):
    """Read an object header from *stream* and return its two numbers.

    Skips a leading comment and whitespace, reads the object number and
    the generation number, then consumes the three bytes that follow and
    any whitespace after them, leaving the stream on the next
    non-whitespace byte.

    :param stream: seekable stream positioned at (or slightly before)
        the object header.
    :return: ``(idnum, generation)`` as ints.
    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header.  In reality... some files have stupid cross
    # reference tables that are off by whitespace bytes.
    utils.skipOverComment(stream)
    ws_before_id = utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    idnum = readUntilWhitespace(stream)

    ws_before_gen = utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    generation = readUntilWhitespace(stream)

    # three bytes after the generation (presumably the "obj" keyword)
    stream.read(3)
    readNonWhitespace(stream)
    stream.seek(-1, 1)

    superfluous = ws_before_id | ws_before_gen
    if superfluous and self.strict:
        # not a fatal error
        warnings.warn(
            "Superfluous whitespace found in "
            "object header %s %s" % (idnum, generation),
            utils.PdfReadWarning)
    return int(idnum), int(generation)