def read_from_stream(stream, pdf_object):
    """Parse a PDF dictionary (``<< ... >>``) from the current stream position.

    Key/value pairs are read with ``read_object`` until a closing token is
    met. The collected mapping is then passed, together with the reader and
    the stream, to ``read_stream_object_with``. If the mapping contains
    ``_STREAM_KEY`` afterwards the result comes from
    ``initialize_from_dictionary``; otherwise a plain ``DictObject`` holding
    the pairs is returned.

    :param stream: seekable binary stream positioned on the opening ``<<``.
    :param pdf_object: reader object forwarded to ``read_object``.
    :return: result of ``initialize_from_dictionary`` or a ``DictObject``.
    :raises _u.PdfReadError: when the opening ``<<`` is missing or a key is
        defined more than once.
    """
    if stream.read(2) != b'<<':
        raise _u.PdfReadError("dictionary read error")

    entries = {}
    while True:
        lookahead = _u.read_non_whitespace(stream)
        # NOTE: this is a containment test, so an empty token (b'') is
        # accepted here as well as b'>'.
        if lookahead in b'>':
            # One more byte is consumed; presumably the second '>' of '>>'.
            stream.read(1)
            break

        # The byte just read belongs to the key: step back so that
        # read_object sees it again.
        stream.seek(-1, io.SEEK_CUR)
        name = read_object(stream, pdf_object)
        _u.seek_token(stream)
        item = read_object(stream, pdf_object)

        # multiple definitions of key not permitted
        if name in entries:
            raise _u.PdfReadError("multiple definitions in dictionary")
        entries[name] = item

    read_stream_object_with(entries, pdf_object, stream)

    if _STREAM_KEY in entries:
        return initialize_from_dictionary(entries)

    result = DictObject()
    result.update(entries)
    return result
def _read_object_header(stream):
    """Read an ``<id> <generation> obj`` header and return both numbers.

    It should never be necessary to skip whitespace first, because the
    cross-reference table is supposed to point exactly at the object header.
    In practice some files carry cross-reference offsets that are off by a
    few whitespace bytes, so the stream is advanced to the next token before
    reading.

    :param stream: binary stream positioned at (or just before) the header.
    :return: tuple ``(idnum, generation)`` as ints.
    :raises ValueError: if the three bytes after the two numbers are not
        ``obj`` (``int()`` may also raise it for non-numeric fields).
    """
    _u.seek_token(stream)
    raw_id = _u.read_until_whitespace(stream)
    raw_generation = _u.read_until_whitespace(stream)

    keyword = stream.read(3)
    if keyword != b'obj':
        raise ValueError(f'Expecting obj here. Got {keyword}')

    # Leave the stream on the first token of the object body.
    _u.seek_token(stream)
    return int(raw_id), int(raw_generation)
def __parse_xref_table(self):
    """Parse a standard (table-style) cross-reference section and its trailer.

    Reads the remainder of the ``xref`` keyword, then one or more
    subsections (``<first object number> <count>`` followed by ``count``
    entries). Each entry is recorded in ``self._xref[generation][num]``
    unless that slot is already filled, so existing entries win. After the
    ``trailer`` keyword the trailer dictionary is read and merged into
    ``self._trailer`` (again without overwriting existing keys).

    :return: the trailer's ``/Prev`` value if present, otherwise ``None``.
    :raises _u.PdfReadError: if the keyword is not ``ref``-prefixed, or the
        data ends in the middle of the table or before ``trailer``.
    """
    # standard cross-reference table
    ref = self._stream.read(4)
    if ref[:3] != b'ref':
        raise _u.PdfReadError("xref table read error")
    _u.seek_token(self._stream)

    while True:
        # Subsection header: first object number and number of entries.
        num = read_object(self._stream, self)
        _u.seek_token(self._stream)
        size = read_object(self._stream, self)
        _u.seek_token(self._stream)

        cnt = 0
        while cnt < size:
            line = self._stream.read(20)
            if not line:
                # Truncated table: fail with the parser's own error type
                # instead of an IndexError from line[-1] below.
                raise _u.PdfReadError(
                    "xref table read error: unexpected end of data in entry")
            # It's very clear in section 3.4.3 of the PDF spec that all
            # cross-reference table lines are a fixed 20 bytes. However,
            # some malformed PDF files use a single-character EOL without
            # a preceding space. Detect that case and seek the stream
            # back one character. (0-9 means we've bled into the next xref
            # entry, t means we've bled into the text "trailer".)
            # Indexing bytes yields an int, and ``int in bytes`` tests for
            # that byte value.
            if line[-1] in b'0123456789t':
                self._stream.seek(-1, io.SEEK_CUR)

            offset, generation = line[:16].split(b' ')
            offset, generation = int(offset), int(generation)

            if generation not in self._xref:
                self._xref[generation] = {}
            if num not in self._xref[generation]:
                self._xref[generation][num] = offset

            cnt += 1
            num += 1

        _u.seek_token(self._stream)
        trailertag = self._stream.read(7)
        if trailertag == b'trailer':
            break
        if not trailertag:
            # Nothing left to read: there is no trailer to find.
            raise _u.PdfReadError(
                "xref table read error: unexpected end of data before trailer")
        # more xrefs! Rewind by the number of bytes actually read; a short
        # read near the end of the data would otherwise overshoot.
        self._stream.seek(-len(trailertag), io.SEEK_CUR)

    _u.seek_token(self._stream)
    new_trailer = read_object(self._stream, self)
    for key, value in new_trailer.items():
        if key not in self._trailer:
            self._trailer[key] = value

    if b'/Prev' in new_trailer:
        return new_trailer[b'/Prev']
    return None
def read_object(stream, pdf_reader):
    """Read one PDF object from ``stream``, dispatching on its first byte.

    The first byte is peeked (read and rewound) and the matching reader is
    called with the stream positioned at the start of the object:

    * ``t`` / ``f``  -> ``BooleanObject``
    * ``(``          -> literal string
    * ``/``          -> ``NameObject``
    * ``[``          -> ``ArrayObject``
    * ``n``          -> ``NullObject``
    * ``<``          -> ``DictObject`` when followed by a second ``<``,
      otherwise a hexadecimal string
    * ``%``          -> comment: skipped, then the next object is read
    * anything else  -> ``NumberObject``, or ``RefObject`` when the next
      bytes look like ``<int> <int> R``

    :param stream: seekable binary stream.
    :param pdf_reader: reader passed on to the array, dictionary and
        reference readers.
    :return: whatever the selected reader returns.
    """
    tok = stream.read(1)
    # Rewind by what was actually read. At end of data nothing was read, so
    # the position must stay put instead of moving one byte backwards.
    stream.seek(-len(tok), io.SEEK_CUR)

    # NOTE: these are containment tests, so an empty ``tok`` (end of data)
    # satisfies the first one and is handed to the boolean reader.
    if tok in b'tf':
        # boolean object
        return BooleanObject.read_from_stream(stream)
    if tok in b'(':
        # string object
        return read_string_from_stream(stream)
    if tok in b'/':
        # name object
        return NameObject.read_from_stream(stream)
    if tok in b'[':
        # array object
        return ArrayObject.read_from_stream(stream, pdf_reader)
    if tok in b'n':
        # null object
        return NullObject.read_from_stream(stream)
    if tok in b'<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        # A lone '<' at the very end yields a 1-byte peek; rewinding a
        # fixed 2 bytes would step before the start of the object.
        stream.seek(-len(peek), io.SEEK_CUR)
        if peek == b'<<':
            return DictObject.read_from_stream(stream, pdf_reader)
        return read_hex_string_from_stream(stream)
    if tok in b'%':
        # comment: consume bytes up to and including the first CR or LF
        # (an empty read at end of data also ends the loop).
        while tok not in b'\r\n':
            tok = stream.read(1)
        _u.seek_token(stream)
        return read_object(stream, pdf_reader)

    # number object OR indirect reference
    if tok in b'+-':
        # A sign can only start a number.
        return NumberObject.read_from_stream(stream)

    peek = stream.read(20)
    stream.seek(-len(peek), io.SEEK_CUR)  # reset to start
    # "<digits> <digits> R" followed by a non-letter is an indirect
    # reference; the trailing class keeps operators such as "RG" out.
    if re.match(br'(\d+)\s(\d+)\sR[^a-zA-Z]', peek) is not None:
        return RefObject.read_from_stream(stream, pdf_reader)
    return NumberObject.read_from_stream(stream)
def get_obj_of(self, reference: RefObject):
    """Return the object that ``reference`` points to, loading it if needed.

    Lookup order:

    1. ``self._resolved_objects[generation][idnum]`` if already present
       (a cached value of ``None`` is treated as "not cached").
    2. For generation 0 references listed in ``self._xref_obj_stream``:
       the containing object stream is loaded, every object in it is
       parsed and cached under generation 0, and the requested one is
       returned.
    3. Otherwise the object is read from ``self._stream`` at the offset
       recorded in ``self._xref``, decrypted when the file is encrypted,
       cached via ``self._cache_indirect_object`` and returned.

    :param reference: indirect reference providing ``idnum`` and
        ``generation``.
    :raises AssertionError: if the object stream or the object header does
        not match what the cross-reference data promised.
    :raises Exception: if the file is encrypted and no decryption key is set.
    """
    retval = self._resolved_objects.get(reference.generation, {}).get(reference.idnum, None)
    if retval is not None:
        return retval

    if reference.generation == 0 and reference.idnum in self._xref_obj_stream:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self._xref_obj_stream[reference.idnum]
        obj_stm = RefObject(stmnum, 0, self).get_object()
        assert obj_stm[_k.TYPE] == b'/ObjStm'
        count = obj_stm[b'/N']
        assert idx < count
        first = obj_stm[b'/First']
        stream_data = BytesIO(obj_stm.get_data())

        for _ in range(count):
            # The stream starts with `count` pairs of
            # "<object number> <offset>"; offsets are added to /First.
            objnum = NumberObject.read_from_stream(stream_data)
            _u.seek_token(stream_data)
            offset = NumberObject.read_from_stream(stream_data)
            _u.seek_token(stream_data)

            # Remember where the pair table continues, parse the object
            # body, then come back for the next pair.
            resume_at = stream_data.tell()
            stream_data.seek(first + offset, io.SEEK_SET)
            obj = read_object(stream_data, self)
            # The read path above tolerates a missing generation bucket,
            # so create bucket 0 on demand instead of assuming it exists.
            self._resolved_objects.setdefault(0, {})[objnum] = obj
            stream_data.seek(resume_at, io.SEEK_SET)

        return self._resolved_objects[0][reference.idnum]

    start = self._xref[reference.generation][reference.idnum]
    self._stream.seek(start, io.SEEK_SET)
    idnum, generation = _read_object_header(self._stream)
    assert idnum == reference.idnum
    assert generation == reference.generation
    retval = read_object(self._stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self._is_encrypted:
        # if we don't have the encryption key:
        if self._decryption_key is None:
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here...
        # The per-object key is derived from the low-order 3 bytes of the
        # object number and the low-order 2 bytes of the generation
        # (little-endian).
        pack1 = struct.pack("<i", reference.idnum)[:3]
        pack2 = struct.pack("<i", reference.generation)[:2]
        key = _u.encrypt(self._decryption_key, pack1, pack2)
        retval = self._decrypt_object(retval, key)

    self._cache_indirect_object(generation, idnum, retval)
    return retval
def __read_inline_image(self, stream):
    """Read an inline image, starting just after its ``BI`` (begin image).

    First the image's settings are collected as key/value pairs until the
    next token is ``I`` (the start of ``ID``, begin of image data). The
    ``ID`` marker plus one following byte is then consumed and the image
    bytes are fetched with ``_read_image_data``.

    :param stream: binary stream positioned right after ``BI``.
    :return: dict with ``b'settings'`` (a ``DictObject``) and ``b'data'``.
    :raises AssertionError: if the settings are not followed by ``ID``.
    """
    settings = DictObject()
    # seek_token's return value is compared against b'I'; anything else is
    # taken as the start of another settings key.
    while utils.seek_token(stream) != b'I':
        name = read_object(stream, self.pdf)
        utils.seek_token(stream)
        settings[name] = read_object(stream, self.pdf)

    # left at beginning of ID: take the marker and the byte after it
    marker = stream.read(3)
    assert marker[:2] == b'ID'

    image_bytes = _read_image_data(stream)
    utils.debug(len(image_bytes))
    utils.seek_token(stream)
    return {b'settings': settings, b'data': image_bytes}