def writeToStream(self, stream, encryption_key):
    """Write this byte string to ``stream`` as a hex string in angle brackets.

    When ``encryption_key`` is truthy the bytes are first passed through
    ``RC4_encrypt`` with that key; otherwise they are written as-is.
    """
    payload = RC4_encrypt(encryption_key, self) if encryption_key else self
    stream.write(b_("<"))
    stream.write(utils.hexencode(payload))
    stream.write(b_(">"))
def _decrypt(self, password):
    """Try ``password`` first as the user password, then as the owner password.

    Returns 1 when it authenticates as the user password, 2 when it
    authenticates as the owner password, and 0 otherwise.  On success the
    key returned by ``_authenticateUserPassword`` is stored in
    ``self._decryption_key``.

    Raises NotImplementedError when the ``/Filter`` is not ``/Standard`` or
    ``/V`` is not 1 or 2.
    """
    encrypt = self.trailer['/Encrypt'].getObject()
    if encrypt['/Filter'] != '/Standard':
        raise NotImplementedError(
            "only Standard PDF encryption handler is available")
    if encrypt['/V'] not in (1, 2):
        raise NotImplementedError(
            "only algorithm code 1 and 2 are supported")

    user_ok, key = self._authenticateUserPassword(password)
    if user_ok:
        self._decryption_key = key
        return 1

    # Not the user password: derive the user password from the /O entry,
    # treating ``password`` as the owner password.
    rev = encrypt['/R'].getObject()
    keylen = 5 if rev == 2 else encrypt['/Length'].getObject() // 8
    key = _alg33_1(password, rev, keylen)
    owner_entry = encrypt["/O"].getObject()
    if rev == 2:
        userpass = utils.RC4_encrypt(key, owner_entry)
    else:
        userpass = owner_entry
        # 20 RC4 passes, counter running from 19 down to 0; each pass uses
        # the key with every byte XOR-ed with the counter.
        for counter in range(19, -1, -1):
            round_key = b_('')
            for byte in key:
                round_key += b_(chr(utils.ord_(byte) ^ counter))
            userpass = utils.RC4_encrypt(round_key, userpass)

    owner_ok, key = self._authenticateUserPassword(userpass)
    if owner_ok:
        self._decryption_key = key
        return 2
    return 0
def readFromStream(stream, pdf):
    """Read an indirect reference (``<id> <generation> R``) from ``stream``.

    Whitespace before the generation number is skipped, and the ``R``
    keyword is located with ``readNonWhitespace``.

    Raises PdfStreamError when the stream ends inside either number and
    utils.PdfReadError when the token after the numbers is not ``R``.
    """
    # Object number: everything up to the first whitespace byte.
    obj_id = b_("")
    ch = stream.read(1)
    while ch and not ch.isspace():
        obj_id += ch
        ch = stream.read(1)
    if not ch:
        # stream has truncated prematurely
        raise PdfStreamError("Stream has ended unexpectedly")

    # Generation number: leading whitespace is ignored, the first
    # whitespace byte after at least one character terminates it.
    gen = b_("")
    while True:
        ch = stream.read(1)
        if not ch:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if not ch.isspace():
            gen += ch
        elif gen:
            break

    keyword = readNonWhitespace(stream)
    if keyword != b_("R"):
        raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
    return IndirectObject(int(obj_id), int(gen), pdf)
def readFromStream(stream):
    """Read a boolean keyword (``true`` or ``false``) from ``stream``.

    Four bytes are read; for ``fals`` one more byte (the trailing ``e``)
    is consumed.  Returns the matching ``BooleanObject``.

    Raises utils.PdfReadError when the four bytes are neither ``true`` nor
    ``fals``.  An explicit exception is used instead of ``assert False``
    because assertions are stripped under ``python -O``, which would make
    the function silently return ``None`` on malformed input.
    """
    word = stream.read(4)
    if word == b_("true"):
        return BooleanObject(True)
    if word == b_("fals"):
        # Consume the final "e" of "false".
        stream.read(1)
        return BooleanObject(False)
    raise utils.PdfReadError('Could not read Boolean object')
def writeToStream(self, stream, encryption_key):
    """Write this dictionary to ``stream`` between ``<<`` and ``>>``.

    Each entry goes on its own line as ``key value``; keys and values
    serialise themselves through their own ``writeToStream`` and receive
    the same ``encryption_key``.
    """
    emit = stream.write
    emit(b_("<<\n"))
    for name, entry in self.items():
        name.writeToStream(stream, encryption_key)
        emit(b_(" "))
        entry.writeToStream(stream, encryption_key)
        emit(b_("\n"))
    emit(b_(">>"))
def encode_pdfdocencoding(unicode_string):
    """Encode ``unicode_string`` using the ``_pdfDocEncoding_rev`` table.

    Every character is looked up in the table and the resulting code is
    appended as a single byte.

    Raises UnicodeEncodeError when a character is missing from the table.
    """
    encoded = b_('')
    for char in unicode_string:
        try:
            encoded += b_(chr(_pdfDocEncoding_rev[char]))
        except KeyError:
            raise UnicodeEncodeError("pdfdocencoding", char, -1, -1,
                                     "does not exist in translation table")
    return encoded
def readFromStream(stream):
    """Read a boolean keyword (``true`` or ``false``) from ``stream``.

    Raises utils.PdfReadError when the first four bytes are neither
    ``true`` nor ``fals``.
    """
    head = stream.read(4)
    if head == b_("true"):
        return BooleanObject(True)
    if head != b_("fals"):
        raise utils.PdfReadError('Could not read Boolean object')
    # Swallow the remaining byte of "false".
    stream.read(1)
    return BooleanObject(False)
def write_to_file(self, f: io.BufferedIOBase) -> None:
    """Write this object to ``f`` wrapped in ``<obj_no> <gen_no> obj`` / ``endobj``.

    The wrapped value serialises itself through its own ``write_to_file``.

    ``f`` must be a writable binary stream.  The previous annotation,
    ``io.BufferedReader``, described a read-only stream even though this
    method only ever calls ``f.write``.
    """
    # TODO: EOL is not needed in some cases
    f.write(b_(self.obj_no))
    f.write(b' ')
    f.write(b_(self.gen_no))
    f.write(b' obj')
    f.write(b'\n')
    self.value.write_to_file(f)
    f.write(b'\n')
    f.write(b'endobj')
def writeToStream(self, stream, encryption_key):
    """Write the stream dictionary followed by the ``stream``/``endstream`` body.

    A ``/Length`` entry holding ``len(self._data)`` is present only while
    the dictionary is being written; it is deleted again right after.  The
    data is passed through ``RC4_encrypt`` when ``encryption_key`` is
    truthy.
    """
    self[NameObject("/Length")] = NumberObject(len(self._data))
    DictionaryObject.writeToStream(self, stream, encryption_key)
    del self["/Length"]

    stream.write(b_("\nstream\n"))
    if encryption_key:
        body = RC4_encrypt(encryption_key, self._data)
    else:
        body = self._data
    stream.write(body)
    stream.write(b_("\nendstream"))
def readFromStream(stream):
    """Read a numeric token from ``stream``.

    Bytes are consumed while they are digits, ``+``, ``-`` or ``.``.  The
    first other byte is pushed back by seeking one byte backwards.
    Returns a ``FloatObject`` when the token contains ``.`` and a
    ``NumberObject`` otherwise.

    At end of stream ``read(1)`` returns an empty value and nothing was
    consumed, so no seek is performed; seeking back in that case would
    un-read the last character of the number.
    """
    numeric_chars = (b_('+'), b_('-'), b_('.'))
    name = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # End of stream: nothing to push back.
            break
        if tok not in numeric_chars and not tok.isdigit():
            stream.seek(-1, 1)
            break
        name += tok
    if b_(".") in name:
        return FloatObject(name)
    return NumberObject(name)
def readFromStream(stream):
    """Read a numeric token from ``stream``.

    Bytes are consumed while they are digits, ``+``, ``-`` or ``.``.  The
    first other byte is pushed back by seeking one byte backwards.
    Returns a ``FloatObject`` when the token contains ``.`` and a
    ``NumberObject`` otherwise.

    At end of stream ``read(1)`` returns an empty value and nothing was
    consumed, so no seek is performed; seeking back in that case would
    un-read the last character of the number.
    """
    numeric_chars = (b_('+'), b_('-'), b_('.'))
    num = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # End of stream: nothing to push back.
            break
        if tok not in numeric_chars and not tok.isdigit():
            stream.seek(-1, 1)
            break
        num += tok
    if b_(".") in num:
        return FloatObject(num)
    return NumberObject(num)
def writeToStream(self, stream, encryption_key):
    """Write a dictionary holding ``/D`` (destination array) and ``/S /GoTo``.

    The destination array comes from ``self.getDestArray()``.  No
    separator is written between the array and the ``/S`` key.
    """
    emit = stream.write
    emit(b_("<<\n"))

    NameObject('/D').writeToStream(stream, encryption_key)
    emit(b_(" "))
    self.getDestArray().writeToStream(stream, encryption_key)

    NameObject("/S").writeToStream(stream, encryption_key)
    emit(b_(" "))
    NameObject("/GoTo").writeToStream(stream, encryption_key)

    emit(b_("\n"))
    emit(b_(">>"))
def __init__(self):
    """Set up an empty document: header, page tree root, info and catalog.

    The three dictionaries are registered through ``self._addObject`` in
    the order pages, info, root; the values it returns are kept in
    ``self._pages``, ``self._info`` and ``self._root``.
    """
    self._header = b_("%PDF-1.3")
    # Indirect objects, in registration order.
    self._objects = []

    # Root node of the page tree.
    page_tree = DictionaryObject()
    page_tree.update({
        NameObject("/Type"): NameObject("/Pages"),
        NameObject("/Count"): NumberObject(0),
        NameObject("/Kids"): ArrayObject()
    })
    self._pages = self._addObject(page_tree)

    # Document information dictionary.
    doc_info = DictionaryObject()
    doc_info.update({
        NameObject("/Producer"): createStringObject(
            u"Python PDF Library - http://pybrary.net/pyPdf/")
    })
    self._info = self._addObject(doc_info)

    # Catalog, pointing at the page tree.
    catalog = DictionaryObject()
    catalog.update({
        NameObject("/Type"): NameObject("/Catalog"),
        NameObject("/Pages"): self._pages
    })
    self._root = self._addObject(catalog)
def readHexStringFromStream(stream):
    """Read a hexadecimal string (``<...>``) from ``stream``.

    The opening byte is skipped, then non-whitespace characters are
    collected in pairs and converted to characters until ``>`` is found.
    A dangling single hex digit is padded with ``0``.  The result is
    handed to ``createStringObject``.

    Raises PdfStreamError when the stream ends before the closing ``>``;
    without that check a truncated hex string would keep the loop running
    forever on empty reads.
    """
    stream.read(1)
    txt = ""
    x = b_("")
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            break
        x += tok
        if len(x) == 2:
            txt += chr(int(x, base=16))
            x = b_("")
    if len(x) == 1:
        # Odd number of digits: the missing final digit counts as 0.
        x += b_("0")
    if len(x) == 2:
        txt += chr(int(x, base=16))
    return createStringObject(b_(txt))
def readFromStream(stream, pdf):
    """Read an indirect reference (``<id> <generation> R``) from ``stream``.

    Each number runs up to the next whitespace byte; the byte after the
    generation number must be ``R``.

    Raises PdfStreamError when the stream ends inside either number (an
    empty read is not whitespace, so without this check the loops would
    never terminate) and utils.PdfReadError when ``R`` is missing.
    """
    idnum = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok.isspace():
            break
        idnum += tok
    generation = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok.isspace():
            break
        generation += tok
    r = stream.read(1)
    if r != b_("R"):
        raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
    return IndirectObject(int(idnum), int(generation), pdf)
def readObject(stream, pdf):
    """Peek at the next byte of ``stream`` and dispatch to the matching reader.

    The byte is looked up in ``ObjectPrefix``; its index selects the
    object type.  Bytes not found there are treated as the start of a
    number or of an indirect reference.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # un-read the peeked byte
    idx = ObjectPrefix.find(tok)

    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    if idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # un-read the peeked bytes
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        return readHexStringFromStream(stream)
    if idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    if idx in (3, 4):
        # boolean object
        return BooleanObject.readFromStream(stream)
    if idx == 5:
        # string object
        return readStringFromStream(stream)
    if idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    if idx == 7:
        # comment: skip to the end of the line, then read what follows
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)

    # number object OR indirect reference
    if tok in NumberSigns:
        # a sign can only start a number
        return NumberObject.readFromStream(stream)
    peek = stream.read(20)
    stream.seek(-len(peek), 1)  # un-read the peeked bytes
    if IndirectPattern.match(peek) is not None:
        return IndirectObject.readFromStream(stream, pdf)
    return NumberObject.readFromStream(stream)
class NameObject(str, PdfObject):
    """A PDF name (``/Something``), stored as a string including the slash."""

    # Bytes that end a name token when reading.
    delimiterCharacters = (
        b_("("), b_(")"), b_("<"), b_(">"), b_("["), b_("]"),
        b_("{"), b_("}"), b_("/"), b_("%"),
    )

    def __init__(self, data):
        str.__init__(data)

    def writeToStream(self, stream, encryption_key):
        """Write the name to ``stream``; ``encryption_key`` is not used."""
        stream.write(b_(self))

    @staticmethod
    def readFromStream(stream):
        """Read a name token from ``stream``.

        The token starts with ``/`` and ends before the first whitespace
        or delimiter byte, which is pushed back.  The bytes are decoded
        as UTF-8.

        Raises utils.PdfReadError when the first byte is not ``/`` and
        PdfStreamError when the stream ends inside the name.
        """
        name = stream.read(1)
        if name != b_("/"):
            raise utils.PdfReadError("name read error")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace() or tok in NameObject.delimiterCharacters:
                stream.seek(-1, 1)
                break
            name += tok
        return NameObject(name.decode('utf-8'))
def readFromStream(stream, pdf):
    """Read an array (``[ ... ]``) from ``stream`` into an ``ArrayObject``.

    Elements are parsed with ``readObject`` until ``]`` is found.

    Raises utils.PdfReadError when the first byte is not ``[``.
    """
    result = ArrayObject()
    opener = stream.read(1)
    if opener != b_("["):
        raise utils.PdfReadError("Could not read array")
    while True:
        # Skip whitespace in front of the next element.
        ch = stream.read(1)
        while ch.isspace():
            ch = stream.read(1)
        stream.seek(-1, 1)
        # A closing bracket ends the array.
        if stream.read(1) == b_("]"):
            break
        stream.seek(-1, 1)
        # Anything else is an element.
        result.append(readObject(stream, pdf))
    return result
def readFromStream(stream, pdf):
    """Read an indirect reference (``<id> <generation> R``) from ``stream``.

    Each number runs up to the next whitespace byte; the byte after the
    generation number must be ``R``.

    Raises PdfStreamError when the stream ends inside either number (an
    empty read is not whitespace, so without this check the loops would
    never terminate) and utils.PdfReadError when ``R`` is missing.
    """
    fields = []
    for _ in range(2):
        digits = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                break
            digits += tok
        fields.append(digits)
    idnum, generation = fields
    r = stream.read(1)
    if r != b_("R"):
        raise utils.PdfReadError(
            "Error reading indirect object reference at byte %s"
            % utils.hexStr(stream.tell()))
    return IndirectObject(int(idnum), int(generation), pdf)
def readFromStream(stream, pdf):
    """Read an array (``[ ... ]``) from ``stream`` into an ``ArrayObject``.

    Elements are parsed with ``readObject`` until ``]`` is found.

    Raises utils.PdfReadError when the first byte is not ``[``.  The
    exception is raised with call syntax, which is valid on both Python 2
    and Python 3 (the old ``raise E, "msg"`` form is a syntax error on 3).
    """
    arr = ArrayObject()
    tmp = stream.read(1)
    if tmp != b_("["):
        raise utils.PdfReadError("error reading array")
    while True:
        # skip leading whitespace
        tok = stream.read(1)
        while tok.isspace():
            tok = stream.read(1)
        stream.seek(-1, 1)
        # check for array ending
        peekahead = stream.read(1)
        if peekahead == b_("]"):
            break
        stream.seek(-1, 1)
        # read and append obj
        arr.append(readObject(stream, pdf))
    return arr
def readFromStream(stream):
    """Read a name token from ``stream`` and return it as a ``NameObject``.

    The token starts with ``/`` and ends before the first whitespace or
    delimiter byte, which is pushed back.  The bytes are decoded as UTF-8.

    Raises utils.PdfReadError when the first byte is not ``/`` and
    PdfStreamError when the stream ends inside the name (an empty read is
    neither whitespace nor a delimiter, so the loop would otherwise never
    end).
    """
    name = stream.read(1)
    if name != b_("/"):
        raise utils.PdfReadError("name read error")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok.isspace() or tok in NameObject.delimiterCharacters:
            stream.seek(-1, 1)
            break
        name += tok
    return NameObject(name.decode('utf-8'))
def writeToStream(self, stream, encryption_key):
    """Write this text string to ``stream``.

    The text is encoded with ``encode_pdfdocencoding`` when possible and
    as UTF-16BE with a byte order mark otherwise.  With a truthy
    ``encryption_key`` the bytes are RC4-encrypted and written through a
    ``ByteStringObject``; without one they are written as a literal string
    in parentheses, with non-alphanumeric characters as octal escapes.
    """
    # PDFDocEncoding is tried first because it is nicer to look at in the
    # file, at the cost of the attempt.
    try:
        raw = encode_pdfdocencoding(self)
    except UnicodeEncodeError:
        raw = codecs.BOM_UTF16_BE + self.encode("utf-16be")

    if encryption_key:
        encrypted = RC4_encrypt(encryption_key, raw)
        ByteStringObject(encrypted).writeToStream(stream, None)
        return

    stream.write(b_("("))
    for c in raw:
        if chr_(c).isalnum() or c == b_(' '):
            stream.write(b_(chr_(c)))
        else:
            stream.write(b_("\\%03o" % ord_(c)))
    stream.write(b_(")"))
def readHexStringFromStream(stream):
    """Read a hexadecimal string (``<...>``) from ``stream``.

    After skipping the opening byte, non-whitespace characters are taken
    two at a time and converted to characters until ``>`` is found.  A
    dangling single digit is padded with ``0``.

    Raises PdfStreamError when the stream ends before ``>``.
    """
    stream.read(1)  # opening "<"
    chars = []
    pair = b_("")
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            break
        pair += tok
        if len(pair) == 2:
            chars.append(chr(int(pair, base=16)))
            pair = b_("")
    if len(pair) == 1:
        pair += b_("0")
    if len(pair) == 2:
        chars.append(chr(int(pair, base=16)))
    return createStringObject(b_("".join(chars)))
def __new__(cls, s):
    """Create the name object from ``s``.

    ``bytes`` and ``bytearray`` values are used unchanged.  A ``str`` is
    encoded as UTF-8 and every byte in the ranges 0-32 and 127-255 is
    replaced by ``#`` followed by its two-digit upper-case hex code.

    The code is always zero-padded to two digits: the previous
    ``hex(c)[2:]`` produced a single digit for bytes below 0x10 (for
    example ``#9`` instead of ``#09``).

    Raises ValueError for any other type.
    """
    if isinstance(s, (bytes, bytearray)):
        b = s
    elif isinstance(s, str):
        b = bytearray(s, 'utf_8')
        from itertools import chain
        for c in chain(range(33), range(127, 256)):
            if c in b:
                b = b.replace(bytes([c]), b_('#%02X' % c))
    else:
        raise ValueError()
    return PdfNameObjectBase.__new__(cls, b)
def readFromStream(stream, pdf):
    """Read an indirect reference (``<id> <generation> R``) from ``stream``.

    Raises PdfStreamError when the stream ends inside either number and
    utils.PdfReadError when the byte after the generation is not ``R``.
    """
    numbers = []
    # The object number and the generation are read the same way: bytes
    # up to the next whitespace byte.
    for _ in range(2):
        digits = b_("")
        ch = stream.read(1)
        while ch and not ch.isspace():
            digits += ch
            ch = stream.read(1)
        if not ch:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        numbers.append(digits)
    obj_id, gen = numbers

    keyword = stream.read(1)
    if keyword != b_("R"):
        raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
    return IndirectObject(int(obj_id), int(gen), pdf)
class NameObject(str, PdfObject):
    """A PDF name (``/Something``), stored as a string including the slash."""

    # Bytes that end a name token when reading.
    delimiterCharacters = b_("("), b_(")"), b_("<"), b_(">"), b_("["), \
        b_("]"), b_("{"), b_("}"), b_("/"), b_("%")

    def __init__(self, data):
        str.__init__(data)

    def writeToStream(self, stream, encryption_key):
        """Write the name to ``stream``; ``encryption_key`` is not used."""
        stream.write(b_(self))

    @staticmethod
    def readFromStream(stream):
        """Read a name token from ``stream``.

        The token starts with ``/`` and ends before the first whitespace
        or delimiter byte, which is pushed back.  The bytes are decoded
        as UTF-8.

        Raises utils.PdfReadError when the first byte is not ``/`` and
        PdfStreamError when the stream ends inside the name (an empty
        read is neither whitespace nor a delimiter, so the loop would
        otherwise never end).
        """
        name = stream.read(1)
        if name != b_("/"):
            raise utils.PdfReadError("name read error")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace() or tok in NameObject.delimiterCharacters:
                stream.seek(-1, 1)
                break
            name += tok
        return NameObject(name.decode('utf-8'))
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry,
        metadata_encrypt):
    """Compute the /U entry value and the encryption key (Algorithm 3.5).

    Returns a tuple ``(u_value, key)`` where ``u_value`` is the 16-byte RC4
    result followed by 16 null bytes.

    NOTE(review): ``metadata_encrypt`` is accepted but never used here; it
    is not forwarded to ``_alg32``.
    """
    # Step 1: encryption key from the user password (Algorithm 3.2).
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)

    # Steps 2-3: MD5 over the padding string followed by the first element
    # of the file identifier.
    digest = md5()
    digest.update(_encryption_padding)
    digest.update(id1_entry.original_bytes)

    # Step 4: RC4-encrypt the hash with the key from step 1.
    val = utils.RC4_encrypt(key, digest.digest())

    # Step 5: 19 further RC4 passes; pass i uses the key with every byte
    # XOR-ed with i (1 to 19).
    for counter in range(1, 20):
        round_key = b_('')
        for byte in key:
            round_key += b_(chr(utils.ord_(byte) ^ counter))
        val = utils.RC4_encrypt(round_key, val)

    # Step 6: the specification asks for 16 bytes of "arbitrary padding";
    # null bytes are used.
    return val + (b_('\x00') * 16), key
def readFromStream(stream):
    """Read a name token from ``stream`` and return it as a ``NameObject``.

    The token starts with ``/`` and ends before the first whitespace or
    delimiter byte, which is pushed back.  The bytes are decoded as UTF-8.

    Raises utils.PdfReadError when the first byte is not ``/`` and
    PdfStreamError when the stream ends inside the name.

    The permanently disabled ``debug`` prints were dropped and the raise
    uses call syntax, so the function parses on Python 3 as well.
    """
    name = stream.read(1)
    if name != b_("/"):
        raise utils.PdfReadError("name read error")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok.isspace() or tok in NameObject.delimiterCharacters:
            stream.seek(-1, 1)
            break
        name += tok
    return NameObject(name.decode('utf-8'))
def __init__(self):
    """Set up an empty document: header, page tree root, info and catalog.

    The three dictionaries are registered through ``self._addObject`` in
    the order pages, info, root.
    """
    self._header = b_("%PDF-1.3")
    self._objects = []  # indirect objects, in registration order

    # Page tree root.
    page_tree_entries = {
        NameObject("/Type"): NameObject("/Pages"),
        NameObject("/Count"): NumberObject(0),
        NameObject("/Kids"): ArrayObject(),
    }
    page_tree = DictionaryObject()
    page_tree.update(page_tree_entries)
    self._pages = self._addObject(page_tree)

    # Document information dictionary.
    info_entries = {
        NameObject("/Producer"): createStringObject(
            u"Python PDF Library - http://pybrary.net/pyPdf/"),
    }
    doc_info = DictionaryObject()
    doc_info.update(info_entries)
    self._info = self._addObject(doc_info)

    # Catalog, pointing at the page tree.
    catalog_entries = {
        NameObject("/Type"): NameObject("/Catalog"),
        NameObject("/Pages"): self._pages,
    }
    catalog = DictionaryObject()
    catalog.update(catalog_entries)
    self._root = self._addObject(catalog)
def writeToStream(self, stream, encryption_key):
    """Write this item to ``stream`` as a dictionary.

    Of ``/Title``, ``/Parent``, ``/First``, ``/Last``, ``/Next`` and
    ``/Prev`` only the keys present in ``self`` are written, in that
    order, using the values from ``self.raw_get``.  A ``/Dest`` entry
    built from ``self.getDestArray()`` is always written last.

    Membership is tested with ``in`` rather than ``has_key``, which does
    not exist on Python 3 dictionaries.
    """
    stream.write(b_("<<\n"))
    optional_keys = ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev']
    for key in [NameObject(x) for x in optional_keys if x in self]:
        key.writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        value = self.raw_get(key)
        value.writeToStream(stream, encryption_key)
        stream.write(b_("\n"))
    key = NameObject('/Dest')
    key.writeToStream(stream, encryption_key)
    stream.write(b_(" "))
    value = self.getDestArray()
    value.writeToStream(stream, encryption_key)
    stream.write(b_("\n"))
    stream.write(b_(">>"))
def readNextEndLine(self, stream):
    """Collect bytes while walking ``stream`` backwards up to an end-of-line.

    Every step reads one byte and then seeks two bytes back, so the
    stream position moves towards the start.  Bytes are prepended to the
    result until a CR or LF is read; the run of CR/LF bytes is then
    skipped and the position adjusted before the collected bytes are
    returned.
    """
    eol_bytes = (b_('\n'), b_('\r'))  # \n = LF; \r = CR
    collected = b_("")
    while True:
        ch = stream.read(1)
        stream.seek(-2, 1)
        if ch not in eol_bytes:
            collected = ch + collected
            continue

        saw_second_eol = False
        while ch in eol_bytes:
            ch = stream.read(1)
            if ch in eol_bytes:
                # account for CR+LF
                stream.seek(-1, 1)
                saw_second_eol = True
            stream.seek(-2, 1)
        # Move forward 2 bytes when a second EOL byte was seen, else 1.
        stream.seek(2 if saw_second_eol else 1, 1)
        break
    return collected
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry,
        metadata_encrypt=True):
    """Compute the encryption key from a password (Algorithm 3.2).

    Returns the first ``keylen`` bytes of the final MD5 digest.
    """
    # Step 1: pad with the padding string and cut to exactly 32 bytes.
    padded = (password + _encryption_padding)[:32]
    # Step 2: start the MD5 hash with the padded password.
    hasher = md5(padded)
    # Step 3: add the /O entry.
    hasher.update(owner_entry.original_bytes)
    # Step 4: add /P as 4 bytes, low-order byte first.  The value is
    # packed with the signed format '<i'.
    hasher.update(struct.pack('<i', p_entry))
    # Step 5: add the first element of the file identifier.
    hasher.update(id1_entry.original_bytes)
    # Step 6 (revision 3 or greater): add 0xFFFFFFFF when the document
    # metadata is not encrypted.
    if rev >= 3 and not metadata_encrypt:
        hasher.update(b_("\xff\xff\xff\xff"))
    # Step 7: finish the hash.
    digest = hasher.digest()
    # Step 8 (revision 3 or greater): re-hash the first ``keylen`` bytes
    # 50 times.
    if rev >= 3:
        for _ in range(50):
            digest = md5(digest[:keylen]).digest()
    # Step 9: the key is the first ``keylen`` bytes of the last digest.
    return digest[:keylen]
def as_numeric(self):
    """Return this object's value as an ``int``, parsed from ``repr(self)``."""
    text = b_(repr(self))
    return int(text)
def readObject(stream, pdf):
    """Peek at the next byte of ``stream`` and dispatch to the matching reader.

    Bytes that do not introduce any other object type are treated as the
    start of a number or of an indirect reference.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # un-read the peeked byte

    if tok in (b_('t'), b_('f')):
        # boolean object
        return BooleanObject.readFromStream(stream)
    if tok == b_('('):
        # string object
        return readStringFromStream(stream)
    if tok == b_('/'):
        # name object
        return NameObject.readFromStream(stream)
    if tok == b_('['):
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    if tok == b_('n'):
        # null object
        return NullObject.readFromStream(stream)
    if tok == b_('<'):
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # un-read the peeked bytes
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        return readHexStringFromStream(stream)
    if tok == b_('%'):
        # comment: skip to the end of the line, then read what follows
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)

    # number object OR indirect reference
    if tok in (b_('+'), b_('-')):
        # a sign can only start a number
        return NumberObject.readFromStream(stream)
    peek = stream.read(20)
    stream.seek(-len(peek), 1)  # un-read the peeked bytes
    if re.match(b_(r"(\d+)\s(\d+)\sR[^a-zA-Z]"), peek) is not None:
        return IndirectObject.readFromStream(stream, pdf)
    return NumberObject.readFromStream(stream)
def writeToStream(self, stream, encryption_key):
    """Write ``true`` or ``false`` to ``stream`` depending on ``self.value``.

    ``encryption_key`` is not used.
    """
    keyword = b_("true") if self.value else b_("false")
    stream.write(keyword)
def readStringFromStream(stream):
    """Read a literal string (``(...)``) from ``stream``.

    Nested parentheses are balanced, backslash escapes are decoded and an
    escaped line break is dropped.  The collected bytes are handed to
    ``createStringObject``.

    Two defects of the octal escape handling are fixed: the first
    non-digit byte read while looking for further octal digits is pushed
    back instead of being swallowed, and the value is reduced modulo 256
    because high-order overflow is to be ignored.

    Raises PdfStreamError when the stream ends inside the string and
    utils.PdfReadError for an unknown escape sequence.
    """
    simple_escapes = {
        b_("n"): b_("\n"),
        b_("r"): b_("\r"),
        b_("t"): b_("\t"),
        b_("b"): b_("\b"),
        b_("f"): b_("\f"),
        b_("("): b_("("),
        b_(")"): b_(")"),
        b_("\\"): b_("\\"),
    }
    # odd/unnecessary escape sequences we have encountered
    odd_escapes = (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["),
                   b_("]"))

    stream.read(1)  # opening parenthesis
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if tok in simple_escapes:
                tok = simple_escapes[tok]
            elif tok in odd_escapes:
                tok = b_(tok)
            elif tok.isdigit():
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for i in range(2):
                    ntok = stream.read(1)
                    if ntok.isdigit():
                        tok += ntok
                    else:
                        if ntok:
                            # Not part of the escape: leave it for the
                            # main loop.
                            stream.seek(-1, 1)
                        break
                tok = b_(chr(int(tok, base=8) % 256))
            elif tok in b_("\n\r"):
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if not tok in b_("\n\r"):
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = b_('')
            else:
                raise utils.PdfReadError("Unexpected escaped string")
        txt += tok
    return createStringObject(txt)
def readFromStream(stream, pdf):
    """Read a dictionary (``<< ... >>``) and an optional stream body.

    Keys and values are parsed with ``readObject``.  When the dictionary
    is followed by the ``stream`` keyword, ``/Length`` bytes are read as
    the stream data and a ``StreamObject`` is returned; otherwise a
    ``DictionaryObject`` is returned and the stream position is restored
    to just after the dictionary.

    Raises utils.PdfReadError when the input does not start with ``<<``,
    when a key is defined twice, or when ``endstream`` cannot be found.

    Changes: Python 2-only syntax (``raise E, msg``, ``print``,
    ``has_key``) was replaced by forms valid on Python 2 and 3, the
    permanently disabled debug prints were dropped, and the byte after a
    CR is consumed only when it is LF so that a lone CR does not eat the
    first byte of the data.
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise utils.PdfReadError(
            "Dictionary read error at byte %s: stream must begin with '<<'"
            % utils.hexStr(stream.tell()))
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_(">"):
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key in data:
            # multiple definitions of key not permitted
            raise utils.PdfReadError(
                "Multiple definitions in dictionary at byte %s for key %s"
                % (utils.hexStr(stream.tell()), key))
        data[key] = value
    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        assert eol in (b_("\n"), b_("\r"))
        if eol == b_("\r"):
            # read \n after; anything else already belongs to the data
            following = stream.read(1)
            if following and following != b_("\n"):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # Resolving the reference may move the stream position.
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError(
                    "Unable to find 'endstream' marker after stream at byte %s."
                    % utils.hexStr(stream.tell()))
    else:
        stream.seek(pos, 0)
    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    else:
        retval = DictionaryObject()
        retval.update(data)
        return retval
def writeToStream(self, stream, encryption_key):
    """Write this reference to ``stream`` as ``<idnum> <generation> R``.

    ``encryption_key`` is not used.
    """
    reference = "%s %s R" % (self.idnum, self.generation)
    stream.write(b_(reference))
def writeToStream(self, stream, encryption_key):
    """Write this array to ``stream`` as ``[ item item ... ]``.

    Every element is preceded by a space and serialises itself through
    its own ``writeToStream`` with the same ``encryption_key``.
    """
    emit = stream.write
    emit(b_("["))
    for element in self:
        emit(b_(" "))
        element.writeToStream(stream, encryption_key)
    emit(b_(" ]"))
def writeToStream(self, stream, encryption_key):
    """Write the ``null`` keyword to ``stream``; ``encryption_key`` is not used."""
    keyword = b_("null")
    stream.write(keyword)
def readFromStream(stream, pdf):
    """Read a dictionary (``<< ... >>``) and an optional stream body.

    Keys and values are parsed with ``readObject``; NUL bytes between
    entries are skipped and, for a repeated key, the first value wins.
    When the dictionary is followed by the ``stream`` keyword,
    ``/Length`` bytes are read as the stream data and a ``StreamObject``
    is returned; otherwise a ``DictionaryObject`` is returned and the
    stream position is restored to just after the dictionary.

    Raises utils.PdfReadError when the input does not start with ``<<``
    or, unless ``pdf.strict`` is False, when ``endstream`` cannot be
    found; raises PdfStreamError when the stream ends inside the
    dictionary.

    Changes: Python 2-only syntax (``raise E, msg``, ``print``,
    ``has_key``) was replaced by forms valid on Python 2 and 3, the
    permanently disabled debug prints were dropped, and the byte after a
    CR is compared with ``b_("\\n")`` instead of the str ``'\\n'``, which
    never equals a bytes value on Python 3.
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise utils.PdfReadError(
            "Dictionary read error at byte %s: stream must begin with '<<'"
            % utils.hexStr(stream.tell()))
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_('\x00'):
            continue
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if key not in data:
            data[key] = value
    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        assert eol in (b_("\n"), b_("\r"))
        if eol == b_("\r"):
            # read \n after; anything else already belongs to the data
            following = stream.read(1)
            if following and following != b_("\n"):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert "/Length" in data
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # Resolving the reference may move the stream position.
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                if pdf.strict == False:
                    warnings.warn("Ignoring missing endstream. This could affect PDF output.")
                else:
                    stream.seek(pos, 0)
                    raise utils.PdfReadError(
                        "Unable to find 'endstream' marker after stream at byte %s."
                        % utils.hexStr(stream.tell()))
    else:
        stream.seek(pos, 0)
    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    else:
        retval = DictionaryObject()
        retval.update(data)
        return retval
def as_numeric(self):
    """Return this object's value as a ``float``, parsed from ``repr(self)``."""
    text = b_(repr(self))
    return float(text)
def writeToStream(self, stream, encryption_key):
    """Write ``repr(self)`` to ``stream``; ``encryption_key`` is not used."""
    text = repr(self)
    stream.write(b_(text))
def write(self, stream):
    """Write the whole document to ``stream``.

    Writes the header, every object in ``self._objects`` as
    ``<n> 0 obj ... endobj``, the cross-reference table, the trailer and
    the ``startxref``/``%%EOF`` footer.  When ``self._encrypt`` exists, a
    per-object key is derived for every object except the encryption
    dictionary itself.

    The first loop uses ``enumerate`` instead of ``xrange``, which does
    not exist on Python 3.
    """
    externalReferenceMap = {}

    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations).  Those will be
    # indirect references to objects that we've recreated in this PDF.  To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references.  This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    for objIndex, obj in enumerate(self._objects):
        if isinstance(obj, PageObject) and obj.indirectRef is not None:
            data = obj.indirectRef
            by_generation = externalReferenceMap.setdefault(data.pdf, {})
            by_idnum = by_generation.setdefault(data.generation, {})
            by_idnum[data.idnum] = IndirectObject(objIndex + 1, 0, self)

    self.stack = []
    self._sweepIndirectReferences(externalReferenceMap, self._root)
    del self.stack

    # Begin writing:
    object_positions = []
    stream.write(self._header + b_("\n"))
    for i in range(len(self._objects)):
        idnum = (i + 1)
        obj = self._objects[i]
        object_positions.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        key = None
        if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
            # Per-object key: document key + 3 low bytes of the object
            # number + 2 low bytes of the generation (0), hashed with MD5.
            pack1 = struct.pack("<i", i + 1)[:3]
            pack2 = struct.pack("<i", 0)[:2]
            key = self._encrypt_key + pack1 + pack2
            assert len(key) == (len(self._encrypt_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
        if obj is not None:
            obj.writeToStream(stream, key)
        stream.write(b_("\nendobj\n"))

    # xref table
    xref_location = stream.tell()
    stream.write(b_("xref\n"))
    stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in object_positions:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))

    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(len(self._objects) + 1),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info
    })
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if hasattr(self, "_encrypt"):
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)

    # eof
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
def readStringFromStream(stream):
    """Read a literal string (``(...)``) from ``stream``.

    Nested parentheses are balanced, backslash escapes are decoded and an
    escaped line break is dropped.  The collected bytes are handed to
    ``createStringObject``.

    Fixes: the stream ending inside the string now raises PdfStreamError
    (an empty read matches none of the branches, so the loop would never
    end); the first non-digit byte read while looking for further octal
    digits is pushed back instead of being swallowed; and the octal value
    is reduced modulo 256 because high-order overflow is to be ignored.

    Raises utils.PdfReadError for an unknown escape sequence.
    """
    stream.read(1)  # opening parenthesis
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if tok == b_("n"):
                tok = b_("\n")
            elif tok == b_("r"):
                tok = b_("\r")
            elif tok == b_("t"):
                tok = b_("\t")
            elif tok == b_("b"):
                tok = b_("\b")
            elif tok == b_("f"):
                tok = b_("\f")
            elif tok == b_("("):
                tok = b_("(")
            elif tok == b_(")"):
                tok = b_(")")
            elif tok == b_("\\"):
                tok = b_("\\")
            elif tok.isdigit():
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for i in range(2):
                    ntok = stream.read(1)
                    if ntok.isdigit():
                        tok += ntok
                    else:
                        if ntok:
                            # Not part of the escape: leave it for the
                            # main loop.
                            stream.seek(-1, 1)
                        break
                tok = b_(chr(int(tok, base=8) % 256))
            elif tok in b_("\n\r"):
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if not tok in b_("\n\r"):
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = b_('')
            else:
                raise utils.PdfReadError("Unexpected escaped string")
        txt += tok
    return createStringObject(txt)