def readFromStream(stream, pdf):
    """
    Read an indirect reference of the form ``<id> <generation> R``.

    :param stream: seekable byte stream positioned on the first character
        of the object id.
    :param pdf: the owning reader object, passed through to the resulting
        ``IndirectObject``.
    :return: an ``IndirectObject`` built from the parsed id and generation.
    :raises PdfStreamError: if the stream ends before both numbers are read.
    :raises PdfReadError: if the numbers are not followed by ``R``.
    """
    # Object id: every character up to (not including) the first whitespace.
    obj_id = b_("")
    ch = stream.read(1)
    while ch and not ch.isspace():
        obj_id += ch
        ch = stream.read(1)
    if not ch:
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)

    # Generation number: skip any extra leading whitespace, then collect
    # characters until the next whitespace.
    gen = b_("")
    while True:
        ch = stream.read(1)
        if not ch:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if not ch.isspace():
            gen += ch
        elif gen:
            break

    if readNonWhitespace(stream) != b_("R"):
        raise PdfReadError(
            "Error reading indirect object reference at byte %s"
            % utils.hexStr(stream.tell()))
    return IndirectObject(int(obj_id), int(gen), pdf)
def decode(data, decodeParms=None):
    """
    Decode ASCIIHexDecode data into a string.

    Whitespace between hex digits is skipped and decoding stops at the
    ``>`` end-of-data marker; anything after the marker is ignored. If an
    odd number of hex digits precedes the marker, the final digit is
    treated as if it were followed by ``0``.

    :param data: a str sequence of hexadecimal digits terminated by ``>``.
    :param decodeParms: unused by this function.
    :return: a str with one character per decoded hex pair, each character
        ``v`` satisfying ``0 <= ord(v) <= 255``.
    :raises PdfStreamError: if ``data`` ends before a ``>`` marker is seen.
    """
    chars = []
    hex_pair = ""
    for c in data:
        if c == ">":
            break
        if c.isspace():
            continue
        hex_pair += c
        if len(hex_pair) == 2:
            chars.append(chr(int(hex_pair, base=16)))
            hex_pair = ""
    else:
        # The loop ran off the end of the input without meeting '>'.
        raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")

    if hex_pair:
        # Odd digit count: pad the dangling digit with a trailing zero
        # instead of aborting on an assertion.
        chars.append(chr(int(hex_pair + "0", base=16)))
    return "".join(chars)
def readObject(stream, pdf):
    """
    Read the next object from ``stream``, dispatching on its first byte.

    The first byte is looked up in ``ObjectPrefix`` and the matching
    reader is invoked with the stream rewound to the start of the object.
    Comments are skipped and the object following them is returned.

    :param stream: seekable byte stream positioned at the start of an object.
    :param pdf: the owning reader object, forwarded to readers that need it.
    :return: whatever the selected reader returns.
    :raises PdfStreamError: if the stream is already at end-of-file, or ends
        inside or directly after a comment.
    """
    tok = stream.read(1)
    if not tok:
        # At EOF nothing was consumed, so seeking back would rewind over
        # real data, and find() of an empty token returns 0, which would
        # wrongly select the name-object reader.
        raise PdfStreamError("File ended unexpectedly.")
    stream.seek(-1, 1)  # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_("<<"):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif idx == 5:
        # string object
        return readStringFromStream(stream)
    elif idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    elif idx == 7:
        # comment: consume through the end of the line
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            # Prevents an infinite loop by raising an error if the stream
            # is at the EOF
            if not tok:
                raise PdfStreamError("File ended unexpectedly.")
        tok = readNonWhitespace(stream)
        if not tok:
            # Nothing follows the comment; do not seek back over it.
            raise PdfStreamError("File ended unexpectedly.")
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if IndirectPattern.match(peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
def readHexStringFromStream(stream):
    """
    Read a hexadecimal string (``<...>``) from ``stream``.

    Whitespace between digits is skipped. A dangling final digit is padded
    with ``0`` before being decoded.

    :param stream: byte stream positioned on the opening ``<``.
    :return: the result of ``createStringObject`` for the decoded text.
    :raises PdfStreamError: if the stream ends before the closing ``>``.
    """
    stream.read(1)  # consume the opening delimiter
    chars = []
    pending = b_("")
    while True:
        digit = readNonWhitespace(stream)
        if not digit:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if digit == b_(">"):
            break
        pending += digit
        if len(pending) == 2:
            chars.append(chr(int(pending, base=16)))
            pending = b_("")

    if len(pending) == 1:
        # Odd number of digits: the missing low nibble is taken as zero.
        pending += b_("0")
    if len(pending) == 2:
        chars.append(chr(int(pending, base=16)))
    return createStringObject(b_("".join(chars)))
def readUntilRegex(stream, regex, ignore_eof=False):
    """
    Read from ``stream`` until ``regex`` matches, excluding the match itself.

    Data is consumed in 16-byte chunks and each chunk is searched on its
    own, so a match that spans two chunks is not detected. On a match the
    stream is repositioned to the first byte of the match.

    :param stream: seekable byte stream to read from.
    :param regex: compiled pattern whose ``search`` is applied to each chunk.
    :param bool ignore_eof: if true, reaching end-of-file returns what has
        been read so far instead of raising.
    :return: everything read before the match (or before end-of-file).
    :raises PdfStreamError: on premature end-of-file when ``ignore_eof`` is
        false.
    """
    collected = b_('')
    chunk = stream.read(16)
    while chunk:
        found = regex.search(chunk)
        if found is not None:
            cut = found.start()
            collected += chunk[:cut]
            # Rewind so the next read starts at the matched text.
            stream.seek(cut - len(chunk), 1)
            return collected
        collected += chunk
        chunk = stream.read(16)

    # The stream ran out before the pattern was found.
    if not ignore_eof:
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
    return collected
def readFromStream(stream, pdf):
    """
    Read a dictionary, plus the stream payload that may follow it.

    Key/value pairs are read with ``readObject`` until the closing ``>>``.
    If the dictionary is followed by the ``stream`` keyword, the payload is
    read using the dictionary's length entry and stored under
    ``"__streamdata__"``; otherwise the stream position is restored to just
    after the dictionary.

    :param stream: seekable byte stream positioned on the opening ``<<``.
    :param pdf: the owning reader; its ``strict`` flag decides whether a
        repeated key is an error or a warning, and ``getObject`` resolves
        an indirect length.
    :return: ``StreamObject.initializeFromDictionary(data)`` when stream
        data was read, otherwise a ``DictionaryObject`` with the entries.
    :raises PdfReadError: if the input does not start with ``<<``, a key is
        repeated in strict mode, or ``endstream`` cannot be located.
    :raises PdfStreamError: if the input ends inside the dictionary, the
        ``stream`` keyword is not followed by an end-of-line, or the
        dictionary has no length entry.
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise PdfReadError(
            "Dictionary read error at byte %s: stream must begin with '<<'"
            % utils.hexStr(stream.tell()))

    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_('\x00'):
            continue
        elif tok == b_('%'):
            stream.seek(-1, 1)
            skipOverComment(stream)
            continue
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b_(">"):
            stream.read(1)  # consume the second '>' of the closing '>>'
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        # NOTE(review): an existing entry whose value is falsy is silently
        # overwritten rather than reported as a duplicate - confirm that
        # this is intended before tightening it to a membership test.
        if not data.get(key):
            data[key] = value
        else:
            duplicate_msg = (
                "Multiple definitions in dictionary at byte %s for key %s"
                % (utils.hexStr(stream.tell()), key))
            if pdf.strict:
                # multiple definitions of key not permitted
                raise PdfReadError(duplicate_msg)
            warnings.warn(duplicate_msg, PdfReadWarning)

    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        # Explicit check instead of an assert: the file content is
        # untrusted and asserts disappear under "python -O".
        if eol not in (b_("\n"), b_("\r")):
            raise PdfStreamError(
                "'stream' keyword must be followed by an end-of-line "
                "marker at byte %s" % utils.hexStr(stream.tell()))
        if eol == b_("\r"):
            # read \n after
            if stream.read(1) != b_('\n'):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        if SA.LENGTH not in data:
            raise PdfStreamError("Stream length not defined")
        length = data[SA.LENGTH]
        if isinstance(length, IndirectObject):
            # Resolving the reference may move the stream; restore it.
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # so tolerate it: look one character further back and, if the
            # marker is there, chop the extra character off the data.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                stream.seek(pos, 0)
                raise PdfReadError(
                    "Unable to find 'endstream' marker after stream at byte %s."
                    % utils.hexStr(stream.tell()))
    else:
        # Not a stream: undo the look-ahead.
        stream.seek(pos, 0)

    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    else:
        retval = DictionaryObject()
        retval.update(data)
        return retval
def readStringFromStream(stream):
    """
    Read a literal string (``(...)``) from ``stream``.

    Balanced nested parentheses are kept as part of the string. Backslash
    escapes are translated through a fixed table; ``\\ddd`` octal escapes
    of one to three octal digits are decoded with high-order overflow
    ignored; a backslash followed by a line break contributes nothing.

    :param stream: seekable byte stream positioned on the opening ``(``.
    :return: the result of ``createStringObject`` for the collected bytes.
    :raises PdfStreamError: if the stream ends before the closing ``)``.
    :raises PdfReadError: on a backslash followed by an unsupported
        character.
    """
    # Built once per call rather than once per escape sequence.
    escapes = {
        b_("n"): b_("\n"),
        b_("r"): b_("\r"),
        b_("t"): b_("\t"),
        b_("b"): b_("\b"),
        b_("f"): b_("\f"),
        b_("c"): b_(r"\c"),
        b_("("): b_("("),
        b_(")"): b_(")"),
        b_("/"): b_("/"),
        b_("\\"): b_("\\"),
        b_(" "): b_(" "),
        b_("%"): b_("%"),
        b_("<"): b_("<"),
        b_(">"): b_(">"),
        b_("["): b_("["),
        b_("]"): b_("]"),
        b_("#"): b_("#"),
        b_("_"): b_("_"),
        b_("&"): b_("&"),
        b_('$'): b_('$'),
    }
    octal_digits = b_("01234567")

    stream.read(1)  # consume the opening parenthesis
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok in escapes:
                tok = escapes[tok]
            elif tok in octal_digits:
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for _ in range(2):
                    ntok = stream.read(1)
                    if ntok and ntok in octal_digits:
                        tok += ntok
                    else:
                        # Not part of the escape: push it back so the main
                        # loop processes it instead of losing it.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                tok = b_(chr(int(tok, base=8) & 0xFF))
            elif tok in b_("\n\r"):
                # This case is hit when a backslash followed by a line
                # break occurs. If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if tok not in b_("\n\r"):
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = b_('')
            else:
                raise PdfReadError(r"Unexpected escaped string: %s" % tok)
        txt += tok
    return createStringObject(txt)