Exemple #1
0
 def read_from_stream(stream, pdf_object):
     """Parse a ``<< ... >>`` dictionary starting at the current stream position.

     Key/value pairs are read with ``read_object`` until the closing marker
     is reached.  ``read_stream_object_with`` is then given the chance to
     attach stream data to the parsed entries.

     :param stream: seekable binary stream positioned on the opening ``<<``.
     :param pdf_object: passed through to ``read_object`` and
         ``read_stream_object_with``.
     :return: ``initialize_from_dictionary(entries)`` when the entries carry
         ``_STREAM_KEY``; otherwise a ``DictObject`` holding the entries.
     :raises _u.PdfReadError: if the opening ``<<`` is missing or a key is
         defined more than once.
     """
     if stream.read(2) != b'<<':
         raise _u.PdfReadError("dictionary read error")

     entries = {}
     while True:
         marker = _u.read_non_whitespace(stream)
         # A '>' byte ends the dictionary (an empty value also satisfies
         # this containment test); swallow the second '>' and stop.
         if marker in b'>':
             stream.read(1)
             break
         # Not a terminator: push the byte back so the key parser sees it.
         stream.seek(-1, io.SEEK_CUR)
         name = read_object(stream, pdf_object)
         _u.seek_token(stream)
         entry = read_object(stream, pdf_object)
         if name in entries:
             # multiple definitions of key not permitted
             raise _u.PdfReadError("multiple definitions in dictionary")
         entries[name] = entry

     read_stream_object_with(entries, pdf_object, stream)
     if _STREAM_KEY in entries:
         return initialize_from_dictionary(entries)
     plain = DictObject()
     plain.update(entries)
     return plain
Exemple #2
0
    def _read_cross_reference(self):
        """Locate and load every cross-reference section reachable from EOF.

        Reads ``self._stream`` backwards from its last byte to find the
        ``%%EOF`` marker, the numeric offset line before it and the
        ``startxref`` keyword line before that.  Starting at that offset it
        then repeatedly dispatches to the table or stream parser; the value
        each parser returns is used as the next offset, and ``None`` ends
        the loop.

        :raises _u.PdfReadError: if ``%%EOF`` or ``startxref`` is not found.
        :raises ValueError: if no ``xref`` keyword is found near the offset
            (or the offset line is not an integer).
        """
        # Start on the final byte and skip any empty trailing lines.
        self._stream.seek(-1, io.SEEK_END)
        line = _read_backward_for_line(self._stream)
        while not line:
            line = _read_backward_for_line(self._stream)
        if line[-5:] != b'%%EOF':
            raise _u.PdfReadError(f'EOF marker not found: {line}')

        # The line above %%EOF holds the offset; the one above that must be
        # the 'startxref' keyword.
        offset_line = _read_backward_for_line(self._stream)
        startxref = int(offset_line)
        keyword_line = _read_backward_for_line(self._stream)
        if keyword_line[:9] != b'startxref':
            raise _u.PdfReadError("Token 'startxref' not found")

        # Follow the chain of cross-reference sections.
        while True:
            self._stream.seek(startxref, io.SEEK_SET)
            lead = self._stream.read(1)
            if lead in b'x':
                startxref = self.__parse_xref_table()
            elif lead.isdigit():
                startxref = self.__parse_xref_stream()
            else:
                _u.debug(f'''
                Bad xref character at startxref.
                Let\'s see if we can find the xref table nearby,
                as we\'ve observed this error with an off-by-one before.
                ''')
                # Inspect the 20 bytes centred on the bad offset: the window
                # begins 10 bytes before ``startxref``.
                self._stream.seek(-11, io.SEEK_CUR)
                window = self._stream.read(20)
                xref_loc = window.find(b'xref')
                if xref_loc == -1:
                    raise ValueError('No xref table found at specified location')
                # Re-aim at the keyword that was found and try again.
                startxref -= (10 - xref_loc)
                continue
            if startxref is None:
                break
Exemple #3
0
 def read_from_stream(stream):
     """Parse a PDF name token (``/Something``) from ``stream``.

     Bytes are accumulated after the leading ``/`` until a whitespace byte,
     a byte found in ``_u.DELIMITERS`` or the end of the stream is reached.
     The terminating byte is pushed back so the caller can read it.

     :param stream: seekable binary stream positioned on the ``/``.
     :return: ``NameObject`` built from the raw bytes, including the ``/``.
     :raises _u.PdfReadError: if the first byte is not ``/`` (this includes
         an empty read at end of stream).
     """
     name = stream.read(1)
     # Compare for equality: ``b'' in b'/'`` is True, so a containment test
     # would let an empty read through.
     if name != b'/':
         raise _u.PdfReadError("name read error")
     while True:
         tok = stream.read(1)
         if not tok:
             # End of stream: nothing was consumed, so there is no byte to
             # push back.  The name simply ends here.
             break
         if tok.isspace() or tok in _u.DELIMITERS:
             stream.seek(-1, io.SEEK_CUR)
             break
         name += tok
     return NameObject(name)
Exemple #4
0
 def __parse_xref_table(self):
     """Read a classic cross-reference table and the trailer that follows.

     The caller has already consumed the leading ``x`` of ``xref``.  Each
     subsection header (first object number, entry count) is read with
     ``read_object``; entries are recorded in ``self._xref`` keyed by
     generation and then object number, without overwriting entries that
     are already present.  Trailer keys are merged into ``self._trailer``
     the same way.

     :return: the trailer's ``/Prev`` value when present, otherwise ``None``.
     :raises _u.PdfReadError: if the bytes after ``x`` do not start with
         ``ref``.
     """
     # standard cross-reference table
     keyword_rest = self._stream.read(4)
     if keyword_rest[:3] != b'ref':
         raise _u.PdfReadError("xref table read error")
     _u.seek_token(self._stream)

     while True:
         # Subsection header: first object number, then number of entries.
         num = read_object(self._stream, self)
         _u.seek_token(self._stream)
         size = read_object(self._stream, self)
         _u.seek_token(self._stream)

         done = 0
         while done < size:
             line = self._stream.read(20)
             # Section 3.4.3 of the PDF spec fixes every cross-reference
             # entry at 20 bytes.  Some malformed files use a one-byte EOL
             # with no preceding space, so this read runs one byte into
             # what follows: a digit means the next entry, 't' means the
             # word "trailer".  Step back one byte in that case.
             if line[-1] in b'0123456789t':
                 self._stream.seek(-1, io.SEEK_CUR)
             raw_offset, raw_generation = line[:16].split(b' ')
             offset = int(raw_offset)
             generation = int(raw_generation)
             if generation not in self._xref:
                 self._xref[generation] = {}
             # Entries recorded earlier take precedence.
             if num not in self._xref[generation]:
                 self._xref[generation][num] = offset
             done += 1
             num += 1

         _u.seek_token(self._stream)
         if self._stream.read(7) == b'trailer':
             break
         # more xrefs! Rewind so the next subsection header is re-read.
         self._stream.seek(-7, io.SEEK_CUR)

     _u.seek_token(self._stream)
     new_trailer = read_object(self._stream, self)
     for key, value in new_trailer.items():
         if key not in self._trailer:
             self._trailer[key] = value
     return new_trailer[b'/Prev'] if b'/Prev' in new_trailer else None
Exemple #5
0
 def read_from_stream(stream, pdf):
     """Parse a ``[ ... ]`` array from ``stream``.

     Elements are read one at a time with ``read_object`` until the closing
     ``]`` is consumed.

     :param stream: seekable binary stream positioned on the opening ``[``.
     :param pdf: passed through to ``read_object`` for each element.
     :return: ``ArrayObject`` holding the parsed elements in order.
     :raises _u.PdfReadError: if the first byte is not ``[`` or the stream
         ends before the closing ``]``.
     """
     arr = ArrayObject()
     # Compare for equality: ``b'' in b'['`` is True, so a containment test
     # would accept an empty read.
     if stream.read(1) != b'[':
         raise _u.PdfReadError("error reading array")
     while True:
         tok = stream.read(1)  # skip leading whitespace
         while tok.isspace():
             tok = stream.read(1)
         if not tok:
             # ``b''.isspace()`` is False, so end of stream lands here.
             # Seeking back would re-read the previous byte as an element.
             raise _u.PdfReadError("stream ended before array was closed")
         if tok == b']':
             break
         # First byte of an element: push it back for the element parser.
         stream.seek(-1, io.SEEK_CUR)
         arr.append(read_object(stream, pdf))
     return arr
Exemple #6
0
 def read_from_stream(stream, pdf):
     """Parse an indirect reference of the form ``<id> <generation> R``.

     :param stream: binary stream positioned on the first byte of the
         object number.
     :param pdf: stored on the returned reference.
     :return: ``RefObject(int(idnum), int(generation), pdf)``.
     :raises _u.PdfReadError: if the stream ends inside either number or the
         byte after the generation is not ``R``.
     :raises ValueError: if either field is not a valid integer literal.
     """
     def read_field():
         # Collect bytes up to the next whitespace byte, which is consumed.
         field = b''
         while True:
             tok = stream.read(1)
             if not tok:
                 # ``b''.isspace()`` is False, so without this check the
                 # loop would spin forever at end of stream.
                 raise _u.PdfReadError(
                     "error reading indirect object reference")
             if tok.isspace():
                 return field
             field += tok

     idnum = read_field()
     generation = read_field()
     # Compare for equality: ``b'' in b'R'`` is True, so a containment test
     # would accept an empty read.
     if stream.read(1) != b'R':
         raise _u.PdfReadError("error reading indirect object reference")
     return RefObject(int(idnum), int(generation), pdf)
Exemple #7
0
def read_stream_object_with(data, pdf_object, stream):
    """Attach stream bytes to ``data`` if a ``stream`` keyword follows.

    When the next non-whitespace token is ``stream``, ``data[b'/Length']``
    bytes are read into ``data[_STREAM_KEY]`` and the ``endstream`` keyword
    is verified.  Otherwise the stream position is restored and ``data`` is
    left untouched.

    :param data: mapping of the dictionary entries already parsed; mutated
        in place when a stream body is found.
    :param pdf_object: used to resolve ``/Length`` via ``get_obj_of`` when
        it is a ``RefObject``.
    :param stream: seekable binary stream positioned just after the
        dictionary.
    :raises _u.PdfReadError: if the ``stream`` keyword is not followed by an
        end-of-line, ``/Length`` is missing, or ``endstream`` is not found.
    """
    pos = stream.tell()
    s = _u.read_non_whitespace(stream)
    if (s + stream.read(5)) != b'stream':
        # Plain dictionary: undo the look-ahead.
        stream.seek(pos, io.SEEK_SET)
        return

    eol = stream.read(1)
    # odd PDF file output has spaces after 'stream' keyword but before EOL.
    # patch provided by Danial Sandler
    while eol == b' ':
        eol = stream.read(1)
    # Explicit check instead of ``assert``: asserts vanish under ``-O`` and
    # ``b'' in b'\n\r'`` is True, which let an empty read through.
    if eol not in (b'\n', b'\r'):
        raise _u.PdfReadError("'stream' keyword not followed by end-of-line")
    if eol == b'\r':
        # A CR should be followed by LF.  Only swallow the next byte when it
        # really is the LF; otherwise it is the first byte of the data.
        following = stream.read(1)
        if following and following != b'\n':
            stream.seek(-1, io.SEEK_CUR)

    # this is a stream object, not a dictionary
    if b'/Length' not in data:
        raise _u.PdfReadError("stream object has no /Length entry")
    length = data[b'/Length']
    if isinstance(length, RefObject):
        # Resolving the reference may move the stream; restore the position.
        t = stream.tell()
        length = pdf_object.get_obj_of(length)
        stream.seek(t, io.SEEK_SET)
    data[_STREAM_KEY] = stream.read(length)

    e = _u.read_non_whitespace(stream)
    end_stream = stream.read(8)
    if (e + end_stream) == b'endstream':
        return

    # (sigh) - the odd PDF file has a length that is too long, so
    # we need to read backwards to find the "endstream" ending.
    # ReportLab (unknown version) generates files with this bug,
    # and Python users into PDF files tend to be our audience.
    # we need to do this to correct the streamdata and chop off
    # an extra character.
    pos = stream.tell()
    stream.seek(-10, io.SEEK_CUR)
    end = stream.read(9)
    if end == b'endstream':
        # we found it by looking back one character further.
        data[_STREAM_KEY] = data[_STREAM_KEY][:-1]
    else:
        stream.seek(pos, io.SEEK_SET)
        raise _u.PdfReadError(
            "Unable to find 'endstream' marker after stream.")
Exemple #8
0
    def __init__(self, title, page, position_type, *args):
        """Build a destination entry for the given page and fit type.

        The title, page and type are stored under the ``_k.TITLE``,
        ``_k.PAGE`` and ``_k.TYPE`` names.  The remaining positional
        arguments are the coordinates that the fit type requires:

        * ``/XYZ``: left, top, zoom
        * ``/FitR``: left, bottom, right, top
        * ``/FitH``, ``/FitBH``: top
        * ``/FitV``, ``/FitBV``: left
        * ``/Fit``, ``/FitB``: none

        :param title: value stored as the destination title.
        :param page: value stored as the destination page.
        :param position_type: fit type name as bytes, with leading slash.
        :param args: coordinates for ``position_type`` (see list above).
        :raises _u.PdfReadError: if ``position_type`` is not one of the
            types listed above.
        :raises ValueError: if ``args`` has the wrong number of values for
            the fit type.
        """
        DictObject.__init__(self)
        self[NameObject(_k.TITLE)] = title
        self[NameObject(_k.PAGE)] = page
        self[NameObject(_k.TYPE)] = position_type

        # from table 8.2 of the PDF 1.6 reference.
        if position_type == b'/XYZ':
            (self[NameObject(b'/Left')], self[NameObject(b'/Top')],
             self[NameObject(b'/Zoom')]) = args
        elif position_type == b'/FitR':
            (self[NameObject(b'/Left')], self[NameObject(b'/Bottom')],
             self[NameObject(b'/Right')], self[NameObject(b'/Top')]) = args
        # The bounding-box variants carry a leading slash like every other
        # name; without it they fell through to the "unknown type" error.
        elif position_type in (b'/FitH', b'/FitBH'):
            self[NameObject(b'/Top')], = args
        elif position_type in (b'/FitV', b'/FitBV'):
            self[NameObject(b'/Left')], = args
        elif position_type in (b'/Fit', b'/FitB'):
            pass
        else:
            raise _u.PdfReadError("Unknown DestObject Type: %r" %
                                  position_type)
Exemple #9
0
    def _build_outline(self, node):
        """Turn one outline node into a destination, if it has one.

        A node is usable when it has a title together with either a
        ``/GoTo`` action (``/A``) or a direct destination (``_k.DEST``).

        :param node: mapping for a single outline item.
        :return: the destination built by ``_build_destination`` for an
            explicit (array) destination, the matching entry of
            ``self._named_dests`` (with its title overwritten) for a named
            one, or ``None`` when the node yields no destination.
        :raises _u.PdfReadError: if a destination is present but is neither
            an array nor a known named destination.
        """
        dest, title, outline = None, None, None

        if b'/A' in node and _k.TITLE in node:
            # Action, section 8.5 (only type GoTo supported)
            title = node[_k.TITLE]
            action = node[b'/A']
            if action[b'/S'] == b'/GoTo':
                dest = action[b'/D']
        elif _k.DEST in node and _k.TITLE in node:
            # DestObject, section 8.2.1
            title = node[_k.TITLE]
            dest = node[_k.DEST]

        # if destination found, then create outline
        if dest:
            if isinstance(dest, ArrayObject):
                outline = _build_destination(title, dest)
            # Type check first, then the lookup.  The previous expression
            # evaluated ``dest in <bool>`` and raised TypeError.
            # NOTE(review): only ``str`` names are looked up; confirm whether
            # bytes-typed destination names also need to be accepted.
            elif isinstance(dest, str) and dest in self._named_dests:
                outline = self._named_dests[dest]
                outline[NameObject(_k.TITLE)] = title
            else:
                raise _u.PdfReadError("Unexpected destination %r" % dest)
        return outline
Exemple #10
0
 def read_from_stream(stream):
     """Consume the four-byte ``null`` keyword and return a ``NullObject``.

     :param stream: binary stream positioned on the keyword.
     :raises _u.PdfReadError: if the next four bytes are not ``null``.
     """
     if stream.read(4) == b'null':
         return NullObject()
     raise _u.PdfReadError("error reading null object")
Exemple #11
0
 def set_data(self, data):
     """Reject the assignment: this always raises.

     :param data: ignored.
     :raises _u.PdfReadError: unconditionally.
     """
     message = "Creating EncodedStreamObject is not currently supported"
     raise _u.PdfReadError(message)
Exemple #12
0
def _convert_to_int(d, size):
    if size > 8:
        raise _u.PdfReadError("invalid size in convertToInt")
    d = b'\x00\x00\x00\x00\x00\x00\x00\x00' + d
    d = d[-8:]
    return struct.unpack('>q', d)[0]