Ejemplo n.º 1
0
 def read_from_stream(stream, pdf_object):
     """Parse a ``<< ... >>`` dictionary starting at the current position.

     The first two bytes must be ``<<``.  Key/value pairs are then read with
     ``read_object`` until the next non-whitespace byte is ``>``, after which
     one further byte is consumed (presumably the second ``>`` of the
     closing ``>>`` -- it is not checked).

     :param stream: seekable binary stream positioned on the opening ``<<``.
     :param pdf_object: passed through to ``read_object`` and
         ``read_stream_object_with``.
     :return: ``initialize_from_dictionary(data)`` when ``_STREAM_KEY`` ends
         up in the parsed data, otherwise a ``DictObject`` holding the pairs.
     :raises _u.PdfReadError: if the opening ``<<`` is missing or a key is
         defined more than once.
     """
     opener = stream.read(2)
     if opener != b'<<':
         raise _u.PdfReadError("dictionary read error")

     entries = {}
     lead = _u.read_non_whitespace(stream)
     # NOTE: the membership test is also true for an empty read, so running
     # out of input ends the dictionary the same way a ``>`` does.
     while lead not in b'>':
         # Put the byte back so read_object sees the whole key.
         stream.seek(-1, io.SEEK_CUR)
         name = read_object(stream, pdf_object)
         _u.seek_token(stream)
         item = read_object(stream, pdf_object)
         if name in entries:
             # a key may only be defined once per dictionary
             raise _u.PdfReadError("multiple definitions in dictionary")
         entries[name] = item
         lead = _u.read_non_whitespace(stream)
     # Consume the byte following the first '>'.
     stream.read(1)

     # Gives the helper a chance to attach stream content to the dictionary
     # (presumably this is what adds _STREAM_KEY -- confirm in the helper).
     read_stream_object_with(entries, pdf_object, stream)
     if _STREAM_KEY not in entries:
         plain = DictObject()
         plain.update(entries)
         return plain
     return initialize_from_dictionary(entries)
Ejemplo n.º 2
0
def _read_object_header(stream):
    """Read an ``<id> <generation> obj`` header and return both numbers.

    In a well-formed file the cross-reference table points exactly at the
    header, so no whitespace would need skipping.  Some files carry offsets
    that are off by a few whitespace bytes, hence the initial
    ``seek_token`` call.

    :param stream: binary stream positioned at (or just before) the header.
    :return: ``(idnum, generation)`` as a tuple of ints.
    :raises ValueError: if the three bytes after the two numbers are not
        ``obj``, or if either number cannot be converted with ``int``.
    """
    _u.seek_token(stream)
    raw_id = _u.read_until_whitespace(stream)
    raw_generation = _u.read_until_whitespace(stream)

    tok_obj = stream.read(3)
    if tok_obj != b'obj':
        raise ValueError(f'Expecting obj here. Got {tok_obj}')

    # Leave the stream on the first token of the object body before
    # converting the numbers.
    _u.seek_token(stream)
    header = (int(raw_id), int(raw_generation))
    return header
Ejemplo n.º 3
0
 def __parse_xref_table(self):
     """Parse a classic cross-reference table and the trailer after it.

     Reads four bytes from ``self._stream``, of which the first three must
     be ``ref`` (the caller has presumably consumed the leading ``x`` of
     ``xref`` already).  Each subsection is a start object number, an entry
     count, and that many fixed-width entries.  Offsets are stored in
     ``self._xref[generation][object_number]``; an entry that is already
     present is left untouched.  Trailer keys not yet in ``self._trailer``
     are copied into it.

     :return: the trailer's ``/Prev`` value, or ``None`` when it has none.
     :raises _u.PdfReadError: if the ``ref`` keyword is missing or the data
         ends in the middle of the table.
     """
     # standard cross-reference table
     ref = self._stream.read(4)
     if not ref[:3] == b'ref':
         raise _u.PdfReadError("xref table read error")
     _u.seek_token(self._stream)
     while True:
         # Subsection header: first object number, then number of entries.
         num = read_object(self._stream, self)
         _u.seek_token(self._stream)
         size = read_object(self._stream, self)
         _u.seek_token(self._stream)
         cnt = 0
         while cnt < size:
             line = self._stream.read(20)
             if not line:
                 # Truncated file: report it as a parse error instead of
                 # letting line[-1] below fail with a bare IndexError.
                 raise _u.PdfReadError(
                     "unexpected end of data in xref table")
             # It's very clear in section 3.4.3 of the PDF spec
             # that all cross-reference table lines are a fixed
             # 20 bytes.  However... some malformed PDF files
             # use a single character EOL without a preceding
             # space.  Detect that case, and seek the stream
             # back one character.  (0-9 means we've bled into
             # the next xref entry, t means we've bled into the
             # text "trailer"):
             if line[-1] in b'0123456789t':
                 self._stream.seek(-1, io.SEEK_CUR)
             offset, generation = line[:16].split(b' ')
             offset, generation = int(offset), int(generation)
             if generation not in self._xref:
                 self._xref[generation] = {}
             # The first definition seen for an object wins.
             if num not in self._xref[generation]:
                 self._xref[generation][num] = offset
             cnt += 1
             num += 1
         _u.seek_token(self._stream)
         trailertag = self._stream.read(7)
         if trailertag == b'trailer':
             break
         if not trailertag:
             # Nothing left to read, so there is no further subsection
             # and no trailer either.
             raise _u.PdfReadError(
                 "unexpected end of data in xref table")
         # more xrefs!  Rewind by the number of bytes actually read: a
         # short read near the end of the data returns fewer than 7.
         self._stream.seek(-len(trailertag), io.SEEK_CUR)
     _u.seek_token(self._stream)
     new_trailer = read_object(self._stream, self)
     for key, value in new_trailer.items():
         if key not in self._trailer:
             self._trailer[key] = value
     if b'/Prev' in new_trailer:
         return new_trailer[b'/Prev']
     return None
Ejemplo n.º 4
0
def read_object(stream, pdf_reader):
    """Read the next PDF object from ``stream``, chosen by its first byte.

    The first byte is peeked (read, then un-read) and selects the reader:

    * ``t`` / ``f``  -> ``BooleanObject``
    * ``(``          -> literal string
    * ``/``          -> ``NameObject``
    * ``[``          -> ``ArrayObject``
    * ``n``          -> ``NullObject``
    * ``<``          -> ``DictObject`` when followed by a second ``<``,
      otherwise a hexadecimal string
    * ``%``          -> comment: skipped, then the following object is read
    * anything else  -> ``RefObject`` when the upcoming bytes look like
      ``<int> <int> R``, otherwise ``NumberObject``

    :param stream: seekable binary stream positioned on the object.
    :param pdf_reader: handed to the readers that need it (arrays,
        dictionaries, indirect references).
    :return: whatever the selected reader returns.
    """
    lead = stream.read(1)
    stream.seek(-1, io.SEEK_CUR)  # un-read the peeked byte

    # NOTE: an empty read also satisfies this membership test, so running
    # out of input is routed to the boolean reader.
    if lead in b'tf':
        return BooleanObject.read_from_stream(stream)
    if lead in b'(':
        return read_string_from_stream(stream)
    if lead in b'/':
        return NameObject.read_from_stream(stream)
    if lead in b'[':
        return ArrayObject.read_from_stream(stream, pdf_reader)
    if lead in b'n':
        return NullObject.read_from_stream(stream)

    if lead in b'<':
        # '<<' opens a dictionary, a single '<' opens a hex string.
        opener = stream.read(2)
        stream.seek(-2, io.SEEK_CUR)  # un-read
        if opener == b'<<':
            return DictObject.read_from_stream(stream, pdf_reader)
        return read_hex_string_from_stream(stream)

    if lead in b'%':
        # Comment: consume up to and including the first CR or LF (or until
        # nothing more can be read), then parse whatever follows.
        consumed = lead
        while consumed not in b'\r\n':
            consumed = stream.read(1)
        _u.seek_token(stream)
        return read_object(stream, pdf_reader)

    # Remaining cases: a number or an indirect reference.
    if lead in b'+-':
        # A sign can only start a number.
        return NumberObject.read_from_stream(stream)

    lookahead = stream.read(20)
    stream.seek(-len(lookahead), io.SEEK_CUR)  # un-read
    if re.match(br'(\d+)\s(\d+)\sR[^a-zA-Z]', lookahead) is None:
        return NumberObject.read_from_stream(stream)
    return RefObject.read_from_stream(stream, pdf_reader)
Ejemplo n.º 5
0
    def get_obj_of(self, reference: RefObject):
        """Return the object that ``reference`` points to, loading it if needed.

        Lookup order:

        1. ``self._resolved_objects[generation][idnum]`` if already cached.
        2. For generation 0 objects listed in ``self._xref_obj_stream``: the
           containing object stream is loaded, every object in it is parsed
           and cached, and the requested one is returned.
        3. Otherwise the object is read from ``self._stream`` at the offset
           recorded in ``self._xref``, decrypted when the file is encrypted,
           and cached through ``self._cache_indirect_object``.

        :param reference: indirect reference; its ``idnum`` and
            ``generation`` attributes are used.
        :return: the resolved object.
        :raises Exception: if the file is encrypted and no decryption key
            has been set.
        :raises AssertionError: if the object stream or the object header
            does not match what the cross-reference data promised.
        """
        retval = self._resolved_objects.get(reference.generation, {}).get(reference.idnum, None)
        if retval is not None:
            return retval
        if reference.generation == 0 and reference.idnum in self._xref_obj_stream:
            # indirect reference to object in object stream
            # read the entire object stream into memory
            stmnum, idx = self._xref_obj_stream[reference.idnum]
            obj_stm = RefObject(stmnum, 0, self).get_object()
            assert obj_stm[_k.TYPE] == b'/ObjStm'
            assert idx < obj_stm[b'/N']
            stream_data = BytesIO(obj_stm.get_data())
            for _ in range(obj_stm[b'/N']):
                # The stream starts with /N pairs of "<objnum> <offset>".
                objnum = NumberObject.read_from_stream(stream_data)
                _u.seek_token(stream_data)
                offset = NumberObject.read_from_stream(stream_data)
                _u.seek_token(stream_data)
                # Remember where the pair table continues, jump to the
                # object body (offsets are relative to /First), and return.
                t = stream_data.tell()
                stream_data.seek(obj_stm[b'/First'] + offset, io.SEEK_SET)
                obj = read_object(stream_data, self)
                # The generation-0 bucket may not exist yet (the lookup at
                # the top tolerates a missing bucket), so create it on
                # demand rather than failing with KeyError.
                self._resolved_objects.setdefault(0, {})[objnum] = obj
                stream_data.seek(t, io.SEEK_SET)
            return self._resolved_objects[0][reference.idnum]
        start = self._xref[reference.generation][reference.idnum]
        self._stream.seek(start, io.SEEK_SET)
        idnum, generation = _read_object_header(self._stream)
        assert idnum == reference.idnum
        assert generation == reference.generation
        retval = read_object(self._stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self._is_encrypted:
            # if we don't have the encryption key:
            if self._decryption_key is None:
                raise Exception("file has not been decrypted")
            # otherwise, decrypt here...
            # Low 3 bytes of the object number and low 2 bytes of the
            # generation, little-endian; presumably combined with the file
            # key into a per-object key by _u.encrypt -- confirm there.
            pack1 = struct.pack("<i", reference.idnum)[:3]
            pack2 = struct.pack("<i", reference.generation)[:2]
            key = _u.encrypt(self._decryption_key, pack1, pack2)
            retval = self._decrypt_object(retval, key)

        self._cache_indirect_object(generation, idnum, retval)
        return retval
Ejemplo n.º 6
0
 def __read_inline_image(self, stream):
     """Read an inline image: its settings, then its raw data.

     Must be called with ``stream`` positioned just after the ``BI``
     (begin image) operator.

     :param stream: binary content stream being parsed.
     :return: ``{b'settings': DictObject, b'data': <image data>}`` where the
         data is whatever ``_read_image_data`` returns.
     :raises AssertionError: if the settings are not followed by ``ID``.
     """
     params = DictObject()
     # Settings are key/value pairs; they end when the next token starts
     # with "I", i.e. the "ID" (image data) operator.
     while utils.seek_token(stream) != b'I':
         name = read_object(stream, self.pdf)
         utils.seek_token(stream)
         params[name] = read_object(stream, self.pdf)

     # The stream is now at the start of "ID": consume it together with the
     # single byte that follows.
     marker = stream.read(3)
     assert marker[:2] == b'ID'

     payload = _read_image_data(stream)
     utils.debug(len(payload))
     utils.seek_token(stream)
     return {b'settings': params, b'data': payload}