def readFromStream(stream, pdf): idnum = b_("") while True: tok = stream.read(1) if not tok: # stream has truncated prematurely raise PdfStreamError("Stream has ended unexpectedly") if tok.isspace(): break idnum += tok generation = b_("") while True: tok = stream.read(1) if not tok: # stream has truncated prematurely raise PdfStreamError("Stream has ended unexpectedly") if tok.isspace(): break generation += tok r = stream.read(1) if r != b_("R"): raise utils.PdfReadError( "Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell())) return IndirectObject(int(idnum), int(generation), pdf)
def readStringFromStream(stream): tok = stream.read(1) parens = 1 txt = b_("") while True: tok = stream.read(1) if not tok: # stream has truncated prematurely raise PdfStreamError("Stream has ended unexpectedly") if tok == b_("("): parens += 1 elif tok == b_(")"): parens -= 1 if parens == 0: break elif tok == b_("\\"): tok = stream.read(1) if tok == b_("n"): tok = b_("\n") elif tok == b_("r"): tok = b_("\r") elif tok == b_("t"): tok = b_("\t") elif tok == b_("b"): tok = b_("\b") elif tok == b_("f"): tok = b_("\f") elif tok == b_("("): tok = b_("(") elif tok == b_(")"): tok = b_(")") elif tok == b_("\\"): tok = b_("\\") elif tok.isdigit(): # "The number ddd may consist of one, two, or three # octal digits; high-order overflow shall be ignored. # Three octal digits shall be used, with leading zeros # as needed, if the next character of the string is also # a digit." (PDF reference 7.3.4.2, p 16) for i in range(2): ntok = stream.read(1) if ntok.isdigit(): tok += ntok else: break tok = b_(chr(int(tok, base=8))) elif tok in b_("\n\r"): # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the # second character: tok = stream.read(1) if not tok in b_("\n\r"): stream.seek(-1, 1) # Then don't add anything to the actual string, since this # line break was escaped: tok = b_('') else: raise utils.PdfReadError("Unexpected escaped string") txt += tok return createStringObject(txt)
def readFromStream(stream): word = stream.read(4) if word == b_("true"): return BooleanObject(True) elif word == b_("fals"): stream.read(1) return BooleanObject(False) else: raise utils.PdfReadError('Could not read Boolean object')
def readFromStream(stream, pdf): debug = False if debug: print((stream.tell())) name = stream.read(1) if name != NameObject.surfix: raise utils.PdfReadError("name read error") name += utils.readUntilRegex(stream, NameObject.delimiterPattern, ignore_eof=True) if debug: print(name) try: return NameObject(name.decode('utf-8')) except (UnicodeEncodeError, UnicodeDecodeError) as e: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number if not pdf.strict: warnings.warn("Illegal character in Name Object", utils.PdfReadWarning) return NameObject(name) else: raise utils.PdfReadError("Illegal character in Name Object")
def readStringFromStream(stream): tok = stream.read(1) parens = 1 txt = "" while True: tok = stream.read(1) if tok == "(": parens += 1 elif tok == ")": parens -= 1 if parens == 0: break elif tok == "\\": tok = stream.read(1) if tok == "n": tok = "\n" elif tok == "r": tok = "\r" elif tok == "t": tok = "\t" elif tok == "b": tok == "\b" elif tok == "f": tok = "\f" elif tok == "(": tok = "(" elif tok == ")": tok = ")" elif tok == "\\": tok = "\\" elif tok.isdigit(): tok += stream.read(2) tok = chr(int(tok, base=8)) elif tok in "\n\r": # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the # second character: tok = stream.read(1) if not tok in "\n\r": stream.seek(-1, 1) # Then don't add anything to the actual string, since this # line break was escaped: tok = '' else: raise utils.PdfReadError("Unexpected escaped string") txt += tok return createStringObject(txt)
def readFromStream(stream, pdf): idnum = "" while True: tok = stream.read(1) if tok.isspace(): break idnum += tok generation = "" while True: tok = stream.read(1) if tok.isspace(): break generation += tok r = stream.read(1) if r != "R": raise utils.PdfReadError("error reading indirect object reference") return IndirectObject(int(idnum), int(generation), pdf)
def readFromStream(stream, pdf): idnum = b_("") while True: tok = stream.read(1) if tok.isspace(): break idnum += tok generation = b_("") while True: tok = stream.read(1) if tok.isspace(): break generation += tok r = stream.read(1) if r != b_("R"): raise utils.PdfReadError( "Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell())) return IndirectObject(int(idnum), int(generation), pdf)
def readFromStream(stream, pdf): arr = ArrayObject() tmp = stream.read(1) if tmp != b_("["): raise utils.PdfReadError("Could not read array") while True: # skip leading whitespace tok = stream.read(1) while tok.isspace(): tok = stream.read(1) stream.seek(-1, 1) # check for array ending peekahead = stream.read(1) if peekahead == b_("]"): break stream.seek(-1, 1) # read and append obj arr.append(readObject(stream, pdf)) return arr
def __init__(self, title, page, typ, *args): DictionaryObject.__init__(self) self[NameObject("/Title")] = title self[NameObject("/Page")] = page self[NameObject("/Type")] = typ # from table 8.2 of the PDF 1.6 reference. if typ == "/XYZ": (self[NameObject("/Left")], self[NameObject("/Top")], self[NameObject("/Zoom")]) = args elif typ == "/FitR": (self[NameObject("/Left")], self[NameObject("/Bottom")], self[NameObject("/Right")], self[NameObject("/Top")]) = args elif typ in ["/FitH", "FitBH"]: self[NameObject("/Top")], = args elif typ in ["/FitV", "FitBV"]: self[NameObject("/Left")], = args elif typ in ["/Fit", "FitB"]: pass else: raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
def _buildOutline(self, node): dest, title, outline = None, None, None if "/A" in node and "/Title" in node: # Action, section 8.5 (only type GoTo supported) title = node["/Title"] action = node["/A"] if action["/S"] == "/GoTo": dest = action["/D"] elif "/Dest" in node and "/Title" in node: # Destination, section 8.2.1 title = node["/Title"] dest = node["/Dest"] # if destination found, then create outline if dest: if isinstance(dest, ArrayObject): outline = self._buildDestination(title, dest) elif isinstance(dest, unicode) and dest in self._namedDests: outline = self._namedDests[dest] outline[NameObject("/Title")] = title else: raise utils.PdfReadError("Unexpected destination %r" % dest) return outline
def setData(self, data): raise utils.PdfReadError( "Creating EncodedStreamObject is not currently supported")
def readFromStream(stream, pdf): debug = False tmp = stream.read(2) if tmp != b_("<<"): raise utils.PdfReadError( "Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell())) data = {} while True: tok = readNonWhitespace(stream) if tok == b_('\x00'): continue elif tok == b_('%'): stream.seek(-1, 1) skipOverComment(stream) continue if not tok: # stream has truncated prematurely raise PdfStreamError("Stream has ended unexpectedly") if debug: print(("Tok:", tok)) if tok == b_(">"): stream.read(1) break stream.seek(-1, 1) key = readObject(stream, pdf) tok = readNonWhitespace(stream) stream.seek(-1, 1) value = readObject(stream, pdf) if not data.get(key): data[key] = value elif pdf.strict: # multiple definitions of key not permitted raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \ % (utils.hexStr(stream.tell()), key)) else: warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \ % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning) pos = stream.tell() s = readNonWhitespace(stream) if s == b_('s') and stream.read(5) == b_('tream'): eol = stream.read(1) # odd PDF file output has spaces after 'stream' keyword but before EOL. # patch provided by Danial Sandler while eol == b_(' '): eol = stream.read(1) assert eol in (b_("\n"), b_("\r")) if eol == b_("\r"): # read \n after if stream.read(1) != b_('\n'): stream.seek(-1, 1) # this is a stream object, not a dictionary assert "/Length" in data length = data["/Length"] if debug: print(data) if isinstance(length, IndirectObject): t = stream.tell() length = pdf.getObject(length) stream.seek(t, 0) data["__streamdata__"] = stream.read(length) if debug: print("here") #if debug: print(binascii.hexlify(data["__streamdata__"])) e = readNonWhitespace(stream) ndstream = stream.read(8) if (e + ndstream) != b_("endstream"): # (sigh) - the odd PDF file has a length that is too long, so # we need to read backwards to find the "endstream" ending. # ReportLab (unknown version) generates files with this bug, # and Python users into PDF files tend to be our audience. # we need to do this to correct the streamdata and chop off # an extra character. pos = stream.tell() stream.seek(-10, 1) end = stream.read(9) if end == b_("endstream"): # we found it by looking back one character further. data["__streamdata__"] = data["__streamdata__"][:-1] else: if debug: print(("E", e, ndstream, debugging.toHex(end))) stream.seek(pos, 0) raise utils.PdfReadError( "Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell())) else: stream.seek(pos, 0) if "__streamdata__" in data: return StreamObject.initializeFromDictionary(data) else: retval = DictionaryObject() retval.update(data) return retval
def readFromStream(stream): nulltxt = stream.read(4) if nulltxt != b_("null"): raise utils.PdfReadError("Could not read Null object") return NullObject()
def convertToInt(d, size): if size > 8: raise utils.PdfReadError("invalid size in convertToInt") d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d d = d[-8:] return struct.unpack(">q", d)[0]
def getObject(self, indirectReference): retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) if retval is not None: return retval if indirectReference.generation == 0 \ and indirectReference.idnum in self.xref_objStm: # indirect reference to object in object stream # read the entire object stream into memory stmnum, idx = self.xref_objStm[indirectReference.idnum] objStm = IndirectObject(stmnum, 0, self).getObject() assert objStm['/Type'] == '/ObjStm' assert idx < objStm['/N'] streamData = StringIO(objStm.getData()) for i in range(objStm['/N']): objnum = NumberObject.readFromStream(streamData) readNonWhitespace(streamData) streamData.seek(-1, 1) offset = NumberObject.readFromStream(streamData) readNonWhitespace(streamData) streamData.seek(-1, 1) t = streamData.tell() streamData.seek(objStm['/First'] + offset, 0) obj = readObject(streamData, self) self.resolvedObjects[0][objnum] = obj streamData.seek(t, 0) return self.resolvedObjects[0][indirectReference.idnum] if indirectReference.idnum \ not in self.xref[indirectReference.generation]: warnings.warn( "Object %d %d not defined." % (indirectReference.idnum, indirectReference.generation), utils.PdfReadWarning) return None start = self.xref[indirectReference.generation][ indirectReference.idnum] self.stream.seek(start, 0) idnum, generation = self.readObjectHeader(self.stream) try: assert idnum == indirectReference.idnum except AssertionError: if self.xrefIndex: # Xref table probably had bad indexes due to not # being zero-indexed if self.strict: raise utils.PdfReadError( "Expected object ID (%d %d) does " "not match actual (%d %d); xref " "table not zero-indexed." % (indirectReference.idnum, indirectReference.generation, idnum, generation)) else: # should not happen since the xref table is corrected in # non-strict mode pass else: # some other problem raise utils.PdfReadError( "Expected object ID (%d %d) does not " " match actual (%d %d)." % (indirectReference.idnum, indirectReference.generation, idnum, generation)) assert generation == indirectReference.generation retval = readObject(self.stream, self) # override encryption is used for the /Encrypt dictionary if not self._override_encryption and self.isEncrypted: # if we don't have the encryption key: if not hasattr(self, '_decryption_key'): raise Exception("file has not been decrypted") # otherwise, decrypt here... pack1 = struct.pack("<i", indirectReference.idnum)[:3] pack2 = struct.pack("<i", indirectReference.generation)[:2] key = self._decryption_key + pack1 + pack2 assert len(key) == (len(self._decryption_key) + 5) md5_hash = md5(key).digest() key = md5_hash[:min(16, len(self._decryption_key) + 5)] retval = self._decryptObject(retval, key) self.cacheIndirectObject(generation, idnum, retval) return retval