def read_from_stream(stream, pdf_object):
    """Parse a ``<< ... >>`` dictionary from ``stream``.

    Keys and values are read alternately with ``read_object``.  Once the
    dictionary is closed, ``read_stream_object_with`` gets the chance to
    attach stream data to the parsed entries.

    Returns ``initialize_from_dictionary(entries)`` when ``_STREAM_KEY`` ended
    up in the entries, otherwise a ``DictObject`` populated with them.

    Raises ``_u.PdfReadError`` when the opening ``<<`` is missing or a key is
    defined more than once.
    """
    opening = stream.read(2)
    if opening != b'<<':
        raise _u.PdfReadError("dictionary read error")

    entries = {}
    while True:
        tok = _u.read_non_whitespace(stream)
        # Containment test, not equality: an empty token also satisfies it.
        if tok in b'>':
            # Skip the byte that follows (the second '>' of '>>' in
            # well-formed input).
            stream.read(1)
            break
        # Push back the byte just consumed so read_object sees the whole key.
        stream.seek(-1, io.SEEK_CUR)
        key = read_object(stream, pdf_object)
        _u.seek_token(stream)
        value = read_object(stream, pdf_object)
        if key in entries:
            # multiple definitions of key not permitted
            raise _u.PdfReadError("multiple definitions in dictionary")
        entries[key] = value

    read_stream_object_with(entries, pdf_object, stream)
    if _STREAM_KEY not in entries:
        plain = DictObject()
        plain.update(entries)
        return plain
    return initialize_from_dictionary(entries)
def _read_cross_reference(self):
    """Locate the cross-reference data from the end of the stream and load it.

    Reads backward from the end of ``self._stream`` for the ``%%EOF`` marker,
    the xref offset and the ``startxref`` keyword, then repeatedly seeks to
    the current offset and parses either a cross-reference table or a
    cross-reference stream until a parser reports no further offset (``None``).

    Raises ``_u.PdfReadError`` when ``%%EOF`` or ``startxref`` is missing, and
    ``ValueError`` when no ``xref`` keyword can be found near the offset.
    """
    stream = self._stream

    # Begin at the end of the file and skip trailing empty lines.
    stream.seek(-1, io.SEEK_END)
    line = b''
    while not line:
        line = _read_backward_for_line(stream)
    if line[-5:] != b'%%EOF':
        raise _u.PdfReadError(f'EOF marker not found: {line}')

    # The line before the marker holds the xref offset; the one before that
    # must be the 'startxref' keyword.
    line = _read_backward_for_line(stream)
    startxref = int(line)
    line = _read_backward_for_line(stream)
    if line[:9] != b'startxref':
        raise _u.PdfReadError("Token 'startxref' not found")

    # Walk every cross-reference section and its trailer.
    while True:
        stream.seek(startxref, io.SEEK_SET)
        first = stream.read(1)
        if first in b'x':
            startxref = self.__parse_xref_table()
        elif first.isdigit():
            startxref = self.__parse_xref_stream()
        else:
            _u.debug(f''' Bad xref character at startxref. Let\'s see if we can find the xref table nearby, as we\'ve observed this error with an off-by-one before. ''')
            # Inspect the 20 bytes surrounding the advertised offset.
            stream.seek(-11, io.SEEK_CUR)
            window = stream.read(20)
            xref_loc = window.find(b'xref')
            if xref_loc == -1:
                raise ValueError('No xref table found at specified location')
            # The window starts 10 bytes before the advertised offset.
            startxref -= (10 - xref_loc)
            continue
        if startxref is None:
            break
def read_from_stream(stream):
    """Parse a name token (``/Something``) from ``stream``.

    Bytes are accumulated until whitespace, a byte found in
    ``_u.DELIMITERS`` or the end of the input.  A terminating whitespace or
    delimiter byte is pushed back so the caller can read it again.

    Returns a ``NameObject`` built from the collected bytes, including the
    leading ``/``.

    Raises ``_u.PdfReadError`` when the first byte is not ``/`` (this
    includes an empty read at end of input).
    """
    name = stream.read(1)
    # Exact comparison: an empty read must not pass as a valid name start.
    if name != b'/':
        raise _u.PdfReadError("name read error")
    while True:
        tok = stream.read(1)
        if not tok:
            # End of input terminates the name.  Nothing was consumed, so
            # there is nothing to push back.
            break
        if tok.isspace() or tok in _u.DELIMITERS:
            stream.seek(-1, io.SEEK_CUR)
            break
        name += tok
    return NameObject(name)
def __parse_xref_table(self):
    """Parse a classic cross-reference table plus the trailer that follows it.

    The stream is expected to sit just after the leading ``x`` of ``xref``.
    Every subsection's entries are recorded in ``self._xref`` (keyed by
    generation, then object number) unless an entry for that object already
    exists.  Trailer keys not yet present are copied into ``self._trailer``.

    Returns the trailer's ``/Prev`` value, or ``None`` when it has none.

    Raises ``_u.PdfReadError`` when the ``ref`` keyword remainder is missing.
    """
    stream = self._stream

    # standard cross-reference table
    keyword_rest = stream.read(4)
    if keyword_rest[:3] != b'ref':
        raise _u.PdfReadError("xref table read error")
    _u.seek_token(stream)

    while True:
        # Subsection header: first object number and entry count.
        num = read_object(stream, self)
        _u.seek_token(stream)
        size = read_object(stream, self)
        _u.seek_token(stream)

        seen = 0
        while seen < size:
            line = stream.read(20)
            # Section 3.4.3 of the PDF spec is clear that every
            # cross-reference entry is exactly 20 bytes.  Some malformed
            # files nevertheless use a one-character EOL without the
            # preceding space.  When that happens the 20th byte already
            # belongs to what follows (a digit: the next entry; 't': the
            # word "trailer"), so step back one byte.
            if line[-1] in b'0123456789t':
                stream.seek(-1, io.SEEK_CUR)
            raw_offset, raw_generation = line[:16].split(b' ')
            offset = int(raw_offset)
            generation = int(raw_generation)
            if generation not in self._xref:
                self._xref[generation] = {}
            per_generation = self._xref[generation]
            # An entry that is already recorded is left untouched.
            if num not in per_generation:
                per_generation[num] = offset
            seen += 1
            num += 1

        _u.seek_token(stream)
        if stream.read(7) == b'trailer':
            break
        # more xrefs! Rewind and read the next subsection header.
        stream.seek(-7, io.SEEK_CUR)

    _u.seek_token(stream)
    new_trailer = read_object(stream, self)
    for key, value in new_trailer.items():
        if key not in self._trailer:
            self._trailer[key] = value
    return new_trailer[b'/Prev'] if b'/Prev' in new_trailer else None
def read_from_stream(stream, pdf):
    """Parse an array (``[ ... ]``) from ``stream``.

    Elements are read one at a time with ``read_object`` until the closing
    ``]`` is consumed.

    Returns the populated ``ArrayObject``.

    Raises ``_u.PdfReadError`` when the opening ``[`` is missing or the input
    ends before the closing ``]``.
    """
    arr = ArrayObject()
    # Exact comparison: an empty read must not pass as the opening bracket.
    if stream.read(1) != b'[':
        raise _u.PdfReadError("error reading array")
    while True:
        # skip leading whitespace
        tok = stream.read(1)
        while tok.isspace():
            tok = stream.read(1)
        if not tok:
            # Without this check an empty read at end of input would make the
            # seek below step back onto the previous byte and re-parse it
            # over and over.
            raise _u.PdfReadError(
                "unexpected end of stream while reading array")
        if tok == b']':
            break
        # Push the byte back so read_object sees the complete element.
        stream.seek(-1, io.SEEK_CUR)
        arr.append(read_object(stream, pdf))
    return arr
def read_from_stream(stream, pdf):
    """Parse an indirect reference (``<id> <generation> R``) from ``stream``.

    Returns ``RefObject(int(idnum), int(generation), pdf)``.

    Raises ``_u.PdfReadError`` when the input ends inside either field or the
    byte after the generation is not ``R``.  ``int()`` raises ``ValueError``
    if a field does not hold a valid integer.
    """
    def read_field():
        # Collect bytes up to the next whitespace byte, which is consumed.
        collected = b''
        while True:
            tok = stream.read(1)
            if not tok:
                # An empty read is never whitespace, so without this check a
                # truncated input would loop forever.
                raise _u.PdfReadError(
                    "error reading indirect object reference")
            if tok.isspace():
                return collected
            collected += tok

    idnum = read_field()
    generation = read_field()
    # Exact comparison: an empty read must not pass as the 'R' keyword.
    if stream.read(1) != b'R':
        raise _u.PdfReadError("error reading indirect object reference")
    return RefObject(int(idnum), int(generation), pdf)
def read_stream_object_with(data, pdf_object, stream):
    """Attach stream data to ``data`` if a ``stream`` keyword follows.

    When the next non-whitespace bytes spell ``stream``, ``data[b'/Length']``
    bytes are read and stored under ``data[_STREAM_KEY]``, and the trailing
    ``endstream`` keyword is checked.  Otherwise the stream position is
    restored and ``data`` is left untouched.

    A ``/Length`` given as a ``RefObject`` is resolved through
    ``pdf_object.get_obj_of``; the stream position is restored afterwards.

    Raises ``_u.PdfReadError`` when ``endstream`` cannot be found, and
    ``AssertionError`` when the EOL after ``stream`` is unexpected or
    ``/Length`` is missing.
    """
    start = stream.tell()
    keyword = _u.read_non_whitespace(stream) + stream.read(5)
    if keyword != b'stream':
        # Not a stream object: rewind as if nothing had been read.
        stream.seek(start, io.SEEK_SET)
        return

    eol = stream.read(1)
    # Some producers emit spaces between the 'stream' keyword and the EOL.
    # patch provided by Danial Sandler
    while eol == b' ':
        eol = stream.read(1)
    # NOTE(review): asserts are stripped under ``python -O``.
    assert eol in b'\n\r'
    if eol in b'\r':
        # read \n after
        stream.read(1)

    # this is a stream object, not a dictionary
    assert b'/Length' in data
    length = data[b'/Length']
    if isinstance(length, RefObject):
        # Resolving the reference may move the stream; restore it afterwards.
        resume_at = stream.tell()
        length = pdf_object.get_obj_of(length)
        stream.seek(resume_at, io.SEEK_SET)
    data[_STREAM_KEY] = stream.read(length)

    closing = _u.read_non_whitespace(stream)
    closing += stream.read(8)
    if closing == b'endstream':
        return

    # Some files declare a /Length that is one byte too long (seen in output
    # from an unknown ReportLab version).  Look one byte further back for
    # "endstream" and, if it is there, drop the surplus byte from the data.
    after_probe = stream.tell()
    stream.seek(-10, io.SEEK_CUR)
    if stream.read(9) == b'endstream':
        # we found it by looking back one character further.
        data[_STREAM_KEY] = data[_STREAM_KEY][:-1]
        return
    stream.seek(after_probe, io.SEEK_SET)
    raise _u.PdfReadError(
        "Unable to find 'endstream' marker after stream.")
def __init__(self, title, page, position_type, *args):
    """Build a destination entry.

    ``title``, ``page`` and ``position_type`` are stored under the
    ``_k.TITLE``, ``_k.PAGE`` and ``_k.TYPE`` names.  ``args`` supplies the
    coordinates required by ``position_type`` (table 8.2 of the PDF 1.6
    reference):

    * ``/XYZ``: left, top, zoom
    * ``/FitR``: left, bottom, right, top
    * ``/FitH``, ``/FitBH``: top
    * ``/FitV``, ``/FitBV``: left
    * ``/Fit``, ``/FitB``: no arguments

    Raises ``_u.PdfReadError`` for an unknown ``position_type`` and
    ``ValueError`` when ``args`` has the wrong number of items for a type
    that takes coordinates.
    """
    DictObject.__init__(self)
    self[NameObject(_k.TITLE)] = title
    self[NameObject(_k.PAGE)] = page
    self[NameObject(_k.TYPE)] = position_type

    # from table 8.2 of the PDF 1.6 reference.
    # The /FitB* names used to be matched without their leading slash, so the
    # valid types were rejected.  The slash-less spellings stay accepted so
    # existing callers keep working.
    if position_type == b'/XYZ':
        (self[NameObject(b'/Left')],
         self[NameObject(b'/Top')],
         self[NameObject(b'/Zoom')]) = args
    elif position_type == b'/FitR':
        (self[NameObject(b'/Left')],
         self[NameObject(b'/Bottom')],
         self[NameObject(b'/Right')],
         self[NameObject(b'/Top')]) = args
    elif position_type in (b'/FitH', b'/FitBH', b'FitBH'):
        self[NameObject(b'/Top')], = args
    elif position_type in (b'/FitV', b'/FitBV', b'FitBV'):
        self[NameObject(b'/Left')], = args
    elif position_type in (b'/Fit', b'/FitB', b'FitB'):
        pass
    else:
        raise _u.PdfReadError("Unknown DestObject Type: %r" % position_type)
def _build_outline(self, node):
    """Build the outline entry described by ``node``.

    The destination comes from a ``/GoTo`` action under ``/A`` or from the
    ``_k.DEST`` entry; either way ``node`` must also carry ``_k.TITLE``.

    Returns the outline entry, or ``None`` when ``node`` has no usable
    (truthy) destination.

    Raises ``_u.PdfReadError`` when the destination is neither an
    ``ArrayObject`` nor a ``str`` present in ``self._named_dests``.
    """
    dest, title, outline = None, None, None

    if b'/A' in node and _k.TITLE in node:
        # Action, section 8.5 (only type GoTo supported)
        title = node[_k.TITLE]
        action = node[b'/A']
        if action[b'/S'] == b'/GoTo':
            dest = action[b'/D']
    elif _k.DEST in node and _k.TITLE in node:
        # DestObject, section 8.2.1
        title = node[_k.TITLE]
        dest = node[_k.DEST]

    # if destination found, then create outline
    if dest:
        if isinstance(dest, ArrayObject):
            outline = _build_destination(title, dest)
        # The condition used to read ``dest in isinstance(dest, str) and ...``,
        # a membership test against a bool that raises TypeError.
        # NOTE(review): only ``str`` is accepted, as before - confirm whether
        # bytes-based names should be looked up as well.
        elif isinstance(dest, str) and dest in self._named_dests:
            outline = self._named_dests[dest]
            outline[NameObject(_k.TITLE)] = title
        else:
            raise _u.PdfReadError("Unexpected destination %r" % dest)
    return outline
def read_from_stream(stream):
    """Consume the ``null`` keyword from ``stream``.

    Returns a new ``NullObject``.

    Raises ``_u.PdfReadError`` when the next four bytes are not ``null``.
    """
    keyword = stream.read(4)
    if keyword == b'null':
        return NullObject()
    raise _u.PdfReadError("error reading null object")
def set_data(self, data):
    """Reject any attempt to set data; this always raises.

    Raises ``_u.PdfReadError`` unconditionally.  ``data`` is not inspected.
    """
    message = "Creating EncodedStreamObject is not currently supported"
    raise _u.PdfReadError(message)
def _convert_to_int(d, size): if size > 8: raise _u.PdfReadError("invalid size in convertToInt") d = b'\x00\x00\x00\x00\x00\x00\x00\x00' + d d = d[-8:] return struct.unpack('>q', d)[0]