Ejemplo n.º 1
0
    def _readInlineImage(self, stream):
        """Parse an inline image out of a content stream.

        Key/value pairs are read with ``readObject`` until a token starting
        with ``I`` (the ``ID`` operator) is seen; the raw image data is then
        collected byte by byte up to, but not including, the first ``EI``
        sequence.

        :param stream: seekable stream positioned at the first setting key.
        :return: dict with ``'settings'`` (a ``DictionaryObject``) and
            ``'data'`` (the raw data found between ``ID`` and ``EI``).
        :raises utils.PdfReadError: if the stream ends before ``EI`` is found
            (previously this case looped forever).
        """
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            if tok == 'I':
                # start of the "ID" operator: no more settings to read
                break
            key = readObject(stream, self.pdf)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value

        # Consume "ID" plus the one character after it (presumably the
        # whitespace separator that precedes the image data).
        tmp = stream.read(3)
        assert tmp[:2] == 'ID'

        truncated = "Stream ended before the 'EI' marker of an inline image"
        chunks = []
        while True:
            tok = stream.read(1)
            if not tok:
                # read() returns an empty string at EOF; without this check
                # the loop would never terminate on a truncated stream.
                raise utils.PdfReadError(truncated)
            if tok == 'E':
                following = stream.read(1)
                if following == 'I':
                    break
                if not following:
                    raise utils.PdfReadError(truncated)
                # Not the end marker: un-read the look-ahead character and
                # keep the 'E' as ordinary data.
                stream.seek(-1, 1)
            chunks.append(tok)
        data = ''.join(chunks)

        # Leave the stream positioned on the next non-whitespace character.
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return {'settings': settings,
         'data': data}
Ejemplo n.º 2
0
    def readObjectHeader(self, stream):
        """Read an object header and return ``(idnum, generation)`` as ints.

        Comments and whitespace in front of the two numbers are skipped.  If
        any of the ``utils.skipOverWhitespace`` calls reports a truthy result
        and ``self.strict`` is set, a ``utils.PdfReadWarning`` is issued; the
        header is still returned.

        :param stream: seekable stream positioned at (or just before) the
            object header.
        :return: tuple ``(int(idnum), int(generation))``.
        """
        # In theory the cross-reference table points exactly at the header and
        # no whitespace needs skipping.  In practice some files carry
        # cross-reference offsets that are off by a few whitespace bytes.
        saw_extra_whitespace = False
        utils.skipOverComment(stream)

        saw_extra_whitespace |= utils.skipOverWhitespace(stream)
        stream.seek(-1, 1)
        id_text = readUntilWhitespace(stream)

        saw_extra_whitespace |= utils.skipOverWhitespace(stream)
        stream.seek(-1, 1)
        generation_text = readUntilWhitespace(stream)

        # Skip three characters (presumably the "obj" keyword), then park the
        # stream on the first non-whitespace character after them.
        stream.read(3)
        readNonWhitespace(stream)
        stream.seek(-1, 1)

        if saw_extra_whitespace and self.strict:
            # not a fatal error
            message = ("Superfluous whitespace found in "
                       "object header %s %s" % (id_text, generation_text))
            warnings.warn(message, utils.PdfReadWarning)
        return int(id_text), int(generation_text)
Ejemplo n.º 3
0
 def readObjectHeader(self, stream):
     """Read an object header and return ``(idnum, generation)`` as ints.

     Two whitespace-delimited tokens are read, three further characters are
     skipped (presumably the ``obj`` keyword), and the stream is left on the
     next non-whitespace character.
     """
     id_text = readUntilWhitespace(stream)
     generation_text = readUntilWhitespace(stream)
     stream.read(3)
     # Advance to the first non-whitespace character, then step back onto it.
     readNonWhitespace(stream)
     stream.seek(-1, 1)
     return int(id_text), int(generation_text)
Ejemplo n.º 4
0
 def _readInlineImage(self, stream):
     """Parse an inline image out of a content stream.

     Reading begins just after the "BI" (begin image) operator.  Key/value
     pairs are read with ``readObject`` until the "ID" operator; the raw
     image data is then collected byte by byte up to, but not including,
     the first "EI" sequence.

     :param stream: seekable stream positioned just after "BI".
     :return: dict with ``"settings"`` (a ``DictionaryObject``) and
         ``"data"`` (the raw data found between "ID" and "EI").
     :raises utils.PdfReadError: if the stream ends before "EI" is found
         (previously this case looped forever).
     """
     # first read the dictionary of settings.
     settings = DictionaryObject()
     while True:
         tok = readNonWhitespace(stream)
         stream.seek(-1, 1)
         if tok == "I":
             # "ID" - begin of image data
             break
         key = readObject(stream, self.pdf)
         readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, self.pdf)
         settings[key] = value

     # left at beginning of ID: consume it plus the one character after it
     # (presumably the whitespace separator that precedes the image data).
     tmp = stream.read(3)
     assert tmp[:2] == "ID"

     truncated = "Stream ended before the 'EI' marker of an inline image"
     chunks = []
     while True:
         tok = stream.read(1)
         if not tok:
             # read() returns an empty string at EOF; without this check the
             # loop would never terminate on a truncated stream.
             raise utils.PdfReadError(truncated)
         if tok == "E":
             following = stream.read(1)
             if following == "I":
                 break
             if not following:
                 raise utils.PdfReadError(truncated)
             # Not the end marker: un-read the look-ahead character and keep
             # the "E" as ordinary data.
             stream.seek(-1, 1)
         chunks.append(tok)
     data = "".join(chunks)

     # Leave the stream positioned on the next non-whitespace character.
     readNonWhitespace(stream)
     stream.seek(-1, 1)
     return {"settings": settings, "data": data}
Ejemplo n.º 5
0
 def _readInlineImage(self, stream):
     """Read one inline image (settings plus raw data) from *stream*.

     Reading begins just after the "BI" (begin image) operator.  Settings
     are read as key/value pairs via ``readObject`` until "ID" is reached;
     everything after "ID" up to the first "EI" sequence is returned as the
     image data.

     :param stream: seekable stream positioned just after "BI".
     :return: dict with ``"settings"`` (a ``DictionaryObject``) and
         ``"data"`` (the raw data found between "ID" and "EI").
     :raises utils.PdfReadError: if the stream runs out before "EI" is seen
         (previously this case looped forever).
     """
     # first read the dictionary of settings.
     settings = DictionaryObject()
     while True:
         tok = readNonWhitespace(stream)
         stream.seek(-1, 1)
         if tok == "I":
             # "ID" - begin of image data
             break
         key = readObject(stream, self.pdf)
         readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, self.pdf)
         settings[key] = value

     # left at beginning of ID: consume it plus the one character after it
     # (presumably the whitespace separator that precedes the image data).
     tmp = stream.read(3)
     assert tmp[:2] == "ID"

     truncated = "Stream ended before the 'EI' marker of an inline image"
     pieces = []
     while True:
         tok = stream.read(1)
         if not tok:
             # An empty read means EOF; bail out instead of spinning forever.
             raise utils.PdfReadError(truncated)
         if tok == "E":
             lookahead = stream.read(1)
             if lookahead == "I":
                 break
             if not lookahead:
                 raise utils.PdfReadError(truncated)
             # False alarm: push the look-ahead character back and treat the
             # "E" as image data.
             stream.seek(-1, 1)
         pieces.append(tok)
     data = "".join(pieces)

     # Leave the stream positioned on the next non-whitespace character.
     readNonWhitespace(stream)
     stream.seek(-1, 1)
     return {"settings": settings, "data": data}
Ejemplo n.º 6
0
    def readFromStream(stream, pdf):
        """Parse a ``<< ... >>`` dictionary, and its stream body if one follows.

        :param stream: seekable stream positioned on the opening ``<<``.
        :param pdf: passed through to ``readObject``; also used to resolve an
            indirect ``/Length``.
        :return: the result of ``StreamObject.initializeFromDictionary`` when
            stream data was read, otherwise a ``DictionaryObject``.
        :raises utils.PdfReadError: on a missing ``<<``, a duplicated key, or
            a missing ``endstream`` marker.
        """
        if stream.read(2) != '<<':
            raise utils.PdfReadError('dictionary read error')

        entries = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == '>':
                # consume the second character of the closing '>>'
                stream.read(1)
                break
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            if key in entries:
                raise utils.PdfReadError('multiple definitions in dictionary')
            entries[key] = value

        after_dict = stream.tell()
        first = readNonWhitespace(stream)
        if first == 's' and stream.read(5) == 'tream':
            # Skip any spaces between the 'stream' keyword and its line end.
            eol = stream.read(1)
            while eol == ' ':
                eol = stream.read(1)

            assert eol in ('\n', '\r')
            if eol == '\r':
                # read the character following the carriage return
                stream.read(1)
            assert '/Length' in entries
            length = entries['/Length']
            if isinstance(length, IndirectObject):
                # Resolving the length may move the stream; restore afterwards.
                saved = stream.tell()
                length = pdf.getObject(length)
                stream.seek(saved, 0)
            entries['__streamdata__'] = stream.read(length)
            marker = readNonWhitespace(stream) + stream.read(8)
            if marker != 'endstream':
                # Retry one character earlier, in case /Length was one byte
                # too long; if that matches, drop the surplus data byte.
                here = stream.tell()
                stream.seek(-10, 1)
                if stream.read(9) == 'endstream':
                    entries['__streamdata__'] = entries['__streamdata__'][:-1]
                else:
                    stream.seek(here, 0)
                    raise utils.PdfReadError("Unable to find 'endstream' marker after stream.")
        else:
            # Not a stream object: rewind to just after the dictionary.
            stream.seek(after_dict, 0)

        if '__streamdata__' in entries:
            return StreamObject.initializeFromDictionary(entries)
        result = DictionaryObject()
        result.update(entries)
        return result
Ejemplo n.º 7
0
 def readObjectHeader(self, stream):
     """Read an object header and return ``(idnum, generation)`` as ints.

     Leading whitespace is skipped first, then two whitespace-delimited
     tokens are read, three further characters are skipped (presumably the
     ``obj`` keyword), and the stream is left on the next non-whitespace
     character.
     """
     # In theory the cross-reference table points exactly at the header and
     # no whitespace needs skipping.  In practice some files carry
     # cross-reference offsets that are off by a few whitespace bytes.
     readNonWhitespace(stream)
     stream.seek(-1, 1)

     id_text = readUntilWhitespace(stream)
     generation_text = readUntilWhitespace(stream)
     stream.read(3)

     readNonWhitespace(stream)
     stream.seek(-1, 1)
     return int(id_text), int(generation_text)
Ejemplo n.º 8
0
    def getObject(self, indirectReference):
        """Return the object that *indirectReference* points to.

        Lookup order: the ``resolvedObjects`` cache, then object streams
        (generation 0 ids listed in ``xref_objStm``), then the file offset
        recorded in ``xref``.  Objects read from the file are decrypted when
        the document is encrypted and encryption is not overridden, and are
        cached through ``cacheIndirectObject``.

        :param indirectReference: object exposing ``idnum`` and ``generation``.
        :raises Exception: if the file is encrypted and no decryption key has
            been set on this reader.
        """
        ref_gen = indirectReference.generation
        ref_id = indirectReference.idnum

        cached = self.resolvedObjects.get(ref_gen, {}).get(ref_id, None)
        if cached != None:
            return cached

        if ref_gen == 0 and ref_id in self.xref_objStm:
            # indirect reference to object in object stream
            # read the entire object stream into memory
            stmnum, idx = self.xref_objStm[ref_id]
            objStm = IndirectObject(stmnum, 0, self).getObject()
            assert objStm['/Type'] == '/ObjStm'
            assert idx < objStm['/N']
            streamData = StringIO(objStm.getData())
            for _ in range(objStm['/N']):
                # Each index entry is an object number followed by an offset
                # that is applied relative to /First.
                objnum = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                offset = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                # Jump to the object body, parse it, then return to the index.
                index_pos = streamData.tell()
                streamData.seek(objStm['/First'] + offset, 0)
                self.resolvedObjects[0][objnum] = readObject(streamData, self)
                streamData.seek(index_pos, 0)
            return self.resolvedObjects[0][ref_id]

        start = self.xref[ref_gen][ref_id]
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        assert idnum == ref_id
        assert generation == ref_gen
        retval = readObject(self.stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise Exception("file has not been decrypted")
            # otherwise, decrypt here...
            import struct
            base_key = self._decryption_key
            # Per-object key: base key + low 3 bytes of the id + low 2 bytes
            # of the generation, hashed with MD5 and truncated.
            pack1 = struct.pack("<i", ref_id)[:3]
            pack2 = struct.pack("<i", ref_gen)[:2]
            key = base_key + pack1 + pack2
            assert len(key) == (len(base_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(base_key) + 5)]
            retval = self._decryptObject(retval, key)

        self.cacheIndirectObject(generation, idnum, retval)
        return retval
Ejemplo n.º 9
0
 def readFromStream(stream, pdf):
     """Parse an indirect reference of the form ``<id> <generation> R``.

     :param stream: stream positioned on the first character of the id.
     :param pdf: stored on the returned ``IndirectObject``.
     :raises PdfStreamError: if the stream ends inside either number.
     :raises utils.PdfReadError: if the token after the numbers is not ``R``.
     """
     eof_message = "Stream has ended unexpectedly"

     # Object number: everything up to the first whitespace character.
     id_bytes = b_("")
     tok = stream.read(1)
     while tok and not tok.isspace():
         id_bytes += tok
         tok = stream.read(1)
     if not tok:
         # stream has truncated prematurely
         raise PdfStreamError(eof_message)

     # Generation: whitespace before the first digit is skipped, whitespace
     # after at least one character ends the number.
     gen_bytes = b_("")
     while True:
         tok = stream.read(1)
         if not tok:
             # stream has truncated prematurely
             raise PdfStreamError(eof_message)
         if not tok.isspace():
             gen_bytes += tok
         elif gen_bytes:
             break

     marker = readNonWhitespace(stream)
     if marker != b_("R"):
         raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
     return IndirectObject(int(id_bytes), int(gen_bytes), pdf)
Ejemplo n.º 10
0
 def readFromStream(stream, pdf):
     """Read ``<id> <generation> R`` from *stream* as an ``IndirectObject``.

     :param stream: stream positioned on the first character of the id.
     :param pdf: stored on the returned ``IndirectObject``.
     :raises PdfStreamError: if the stream ends inside either number.
     :raises utils.PdfReadError: if the token after the numbers is not ``R``.
     """
     truncated = "Stream has ended unexpectedly"

     # Object number: everything up to the first whitespace character.
     number_text = b_("")
     tok = stream.read(1)
     while tok and not tok.isspace():
         number_text += tok
         tok = stream.read(1)
     if not tok:
         # stream has truncated prematurely
         raise PdfStreamError(truncated)

     # Generation: leading whitespace is ignored; the first whitespace seen
     # after some content terminates the number.
     generation_text = b_("")
     while True:
         tok = stream.read(1)
         if not tok:
             # stream has truncated prematurely
             raise PdfStreamError(truncated)
         if not tok.isspace():
             generation_text += tok
         elif generation_text:
             break

     keyword = readNonWhitespace(stream)
     if keyword != b_("R"):
         position = utils.hexStr(stream.tell())
         raise utils.PdfReadError(
             "Error reading indirect object reference at byte %s" % position)
     return IndirectObject(int(number_text), int(generation_text), pdf)
Ejemplo n.º 11
0
def readObject(stream, pdf):
    """Read the next PDF object from *stream*, dispatching on its first char.

    The first character is peeked (read, then un-read) and the matching
    ``readFromStream`` / helper is called.  A leading ``%`` comment is
    skipped and the object after it is returned.

    :param stream: seekable stream positioned on the object's first character.
    :param pdf: passed through to the array, dictionary and indirect-reference
        readers.
    :raises utils.PdfReadError: if the stream ends inside a comment
        (previously this case looped forever).
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # peek only
    if tok == 't' or tok == 'f':
        return BooleanObject.readFromStream(stream)
    if tok == '(':
        return readStringFromStream(stream)
    if tok == '/':
        return NameObject.readFromStream(stream)
    if tok == '[':
        return ArrayObject.readFromStream(stream, pdf)
    if tok == 'n':
        return NullObject.readFromStream(stream)
    if tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        return readHexStringFromStream(stream)
    if tok == '%':
        # comment: skip to the end of the line
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if not tok:
                # read() yields '' at EOF, which never matches a line
                # ending; stop instead of spinning forever.
                raise utils.PdfReadError(
                    'Stream ended unexpectedly inside a comment')

        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    if tok == '+' or tok == '-':
        return NumberObject.readFromStream(stream)
    # number object OR indirect reference: look ahead to tell them apart
    peek = stream.read(20)
    stream.seek(-len(peek), 1)
    if re.match('(\\d+)\\s(\\d+)\\sR[^a-zA-Z]', peek) is not None:
        return IndirectObject.readFromStream(stream, pdf)
    return NumberObject.readFromStream(stream)
Ejemplo n.º 12
0
    def __parseContentStream(self, stream):
        """Split a content stream into ``(operands, operator)`` entries.

        Operands are read with ``readObject`` and accumulated until an
        operator is found; each pair is appended to ``self.operations``.
        ``BI`` is handled separately via ``_readInlineImage`` and stored as
        ``(image_dict, 'INLINE IMAGE')``.  Comments are skipped.

        :param stream: seekable stream; it is rewound to the start first.
        """
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == '':
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == "'" or peek == '"':
                # Operator: read up to whitespace, a delimiter, or EOF.
                operator = ''
                while True:
                    tok = stream.read(1)
                    if tok.isspace() or tok in NameObject.delimiterCharacters:
                        stream.seek(-1, 1)
                        break
                    elif tok == '':
                        break
                    operator += tok

                if operator == 'BI':
                    # begin inline image - parsed by a dedicated routine
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, 'INLINE IMAGE'))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == '%':
                # Comments are handled here rather than in readObject because
                # what follows may be an operator, not an object.  The empty
                # string is included so that a comment running to EOF without
                # a line ending stops the loop instead of hanging.
                while peek not in ('\r', '\n', ''):
                    peek = stream.read(1)

            else:
                operands.append(readObject(stream, None))
Ejemplo n.º 13
0
def readHexStringFromStream(stream):
    """Decode a ``<...>`` hexadecimal string and wrap it as a string object.

    Whitespace between digits is ignored (digits come from
    ``readNonWhitespace``).  An odd trailing digit is padded with ``0``.

    :param stream: stream positioned on the opening ``<``.
    :return: the result of ``createStringObject`` on the decoded characters.
    :raises utils.PdfReadError: if the stream ends before the closing ``>``
        (previously this case looped forever).
    """
    stream.read(1)  # skip the opening '<'
    decoded = []
    pair = ""
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # An empty token means EOF; without this check the loop would
            # keep appending nothing and never terminate.
            raise utils.PdfReadError("Stream has ended unexpectedly")
        if tok == ">":
            break
        pair += tok
        if len(pair) == 2:
            decoded.append(chr(int(pair, base=16)))
            pair = ""
    if len(pair) == 1:
        # odd number of digits: the missing last digit counts as 0
        pair += "0"
    if len(pair) == 2:
        decoded.append(chr(int(pair, base=16)))
    return createStringObject("".join(decoded))
Ejemplo n.º 14
0
def readHexStringFromStream(stream):
    """Read a hexadecimal string (``<...>``) and return it as a string object.

    Digits are fetched with ``readNonWhitespace``, so whitespace inside the
    string is skipped.  A dangling final digit is completed with ``0``.

    :param stream: stream positioned on the opening ``<``.
    :return: the result of ``createStringObject`` on the decoded characters.
    :raises utils.PdfReadError: if the stream runs out before ``>`` is seen
        (previously this case looped forever).
    """
    stream.read(1)  # skip the opening '<'
    characters = []
    digits = ""
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # EOF: stop instead of looping on empty reads indefinitely.
            raise utils.PdfReadError("Stream has ended unexpectedly")
        if tok == ">":
            break
        digits += tok
        if len(digits) == 2:
            characters.append(chr(int(digits, base=16)))
            digits = ""
    if len(digits) == 1:
        # odd number of digits: the missing last digit counts as 0
        digits += "0"
    if len(digits) == 2:
        characters.append(chr(int(digits, base=16)))
    return createStringObject("".join(characters))
Ejemplo n.º 15
0
def readObject(stream, pdf):
    """Read the next PDF object from *stream*, dispatching on its first char.

    The first character is peeked (read, then un-read) and the matching
    reader is called.  Returns ``None`` when nothing could be read.  A
    leading ``%`` comment is skipped and the object after it is returned.

    :param stream: seekable stream positioned on the object's first character.
    :param pdf: passed through to the array, dictionary and indirect-reference
        readers.
    :raises utils.PdfReadError: if the stream ends inside a comment
        (previously this case looped forever).
    """
    tok = stream.read(1)
    stream.seek(-1, 1) # reset to start
    if tok == '':
        return None
    elif tok == 't' or tok == 'f':
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == '(':
        # string object
        return readStringFromStream(stream)
    elif tok == '/':
        # name object
        return NameObject.readFromStream(stream)
    elif tok == '[':
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == 'n':
        # null object
        return NullObject.readFromStream(stream)
    elif tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1) # reset to start
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif tok == '%':
        # comment: skip to the end of the line
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if not tok:
                # read() yields '' at EOF, which never matches a line
                # ending; stop instead of spinning forever.
                raise utils.PdfReadError(
                    'Stream ended unexpectedly inside a comment')
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == '+' or tok == '-':
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1) # reset to start
        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
Ejemplo n.º 16
0
def readObject(stream, pdf):
    """Read the next PDF object from *stream*, dispatching on its first byte.

    The first byte is peeked (read, then un-read) and looked up in
    ``ObjectPrefix``; the index found selects the reader.  A leading comment
    is skipped and the object after it is returned.

    :param stream: seekable stream positioned on the object's first byte.
    :param pdf: passed through to the name, array, dictionary and
        indirect-reference readers.
    :raises utils.PdfReadError: if the stream ends inside a comment
        (previously this case looped forever).
    """
    tok = stream.read(1)
    stream.seek(-1, 1) # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1) # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif idx == 5:
        # string object
        return readStringFromStream(stream)
    elif idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    elif idx == 7:
        # comment: skip to the end of the line
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # An empty read means EOF, which never matches a line
                # ending; stop instead of spinning forever.
                raise utils.PdfReadError(
                    "Stream ended unexpectedly inside a comment")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok in NumberSigns:
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1) # reset to start
        if IndirectPattern.match(peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
Ejemplo n.º 17
0
def readObject(stream, pdf):
    """Parse and return the next PDF object found in *stream*.

    The first byte is peeked (read, then un-read); its position in
    ``ObjectPrefix`` decides which reader handles the object.  A leading
    comment is skipped and the object following it is returned.

    :param stream: seekable stream positioned on the object's first byte.
    :param pdf: passed through to the name, array, dictionary and
        indirect-reference readers.
    :raises utils.PdfReadError: if the stream ends inside a comment
        (previously this case looped forever).
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif idx == 5:
        # string object
        return readStringFromStream(stream)
    elif idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    elif idx == 7:
        # comment: discard everything up to the end of the line
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # EOF reached before a line ending; an empty read would
                # otherwise repeat forever.
                raise utils.PdfReadError(
                    "Stream ended unexpectedly inside a comment")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok in NumberSigns:
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if IndirectPattern.match(peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
Ejemplo n.º 18
0
def readObject(stream, pdf):
    """Read the next PDF object from *stream*, dispatching on its first char.

    The first character is peeked (read, then un-read) and the matching
    reader is called.  A leading ``%`` comment is skipped and the object
    after it is returned.

    :param stream: seekable stream positioned on the object's first character.
    :param pdf: passed through to the array, dictionary and indirect-reference
        readers.
    :raises utils.PdfReadError: if the stream ends inside a comment
        (previously this case looped forever).
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == 't' or tok == 'f':
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == '(':
        # string object
        return readStringFromStream(stream)
    elif tok == '/':
        # name object
        return NameObject.readFromStream(stream)
    elif tok == '[':
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == 'n':
        # null object
        return NullObject.readFromStream(stream)
    elif tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif tok == '%':
        # comment: skip to the end of the line
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if not tok:
                # read() yields '' at EOF, which never matches a line
                # ending; stop instead of spinning forever.
                raise utils.PdfReadError(
                    'Stream ended unexpectedly inside a comment')
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == '+' or tok == '-':
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
Ejemplo n.º 19
0
def readObject(stream, pdf):
    """Read the next PDF object from *stream*, dispatching on its first byte.

    The first byte is peeked (read, then un-read) and compared against the
    ``b_``-converted prefix characters to pick a reader.  A leading ``%``
    comment is skipped and the object after it is returned.

    :param stream: seekable stream positioned on the object's first byte.
    :param pdf: passed through to the array, dictionary and indirect-reference
        readers.
    :raises utils.PdfReadError: if the stream ends inside a comment
        (previously this case looped forever).
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == b_('t') or tok == b_('f'):
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == b_('('):
        # string object
        return readStringFromStream(stream)
    elif tok == b_('/'):
        # name object
        return NameObject.readFromStream(stream)
    elif tok == b_('['):
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == b_('n'):
        # null object
        return NullObject.readFromStream(stream)
    elif tok == b_('<'):
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif tok == b_('%'):
        # comment: skip to the end of the line
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # An empty read means EOF, which never matches a line
                # ending; stop instead of spinning forever.
                raise utils.PdfReadError(
                    "Stream ended unexpectedly inside a comment")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == b_('+') or tok == b_('-'):
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if re.match(b_(r"(\d+)\s(\d+)\sR[^a-zA-Z]"), peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
Ejemplo n.º 20
0
def readObject(stream, pdf):
    """Parse and return the next PDF object found in *stream*.

    The first character is peeked (read, then un-read) to decide which
    reader handles the object.  A leading ``%`` comment is skipped and the
    object following it is returned.

    :param stream: seekable stream positioned on the object's first character.
    :param pdf: passed through to the array, dictionary and indirect-reference
        readers.
    :raises utils.PdfReadError: if the stream ends inside a comment
        (previously this case looped forever).
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == "t" or tok == "f":
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == "(":
        # string object
        return readStringFromStream(stream)
    elif tok == "/":
        # name object
        return NameObject.readFromStream(stream)
    elif tok == "[":
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == "n":
        # null object
        return NullObject.readFromStream(stream)
    elif tok == "<":
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == "<<":
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif tok == "%":
        # comment: discard everything up to the end of the line
        while tok not in ("\r", "\n"):
            tok = stream.read(1)
            if not tok:
                # EOF reached before a line ending; an empty read would
                # otherwise repeat forever.
                raise utils.PdfReadError(
                    "Stream ended unexpectedly inside a comment")
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == "+" or tok == "-":
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
Ejemplo n.º 21
0
def readHexStringFromStream(stream):
    """Decode a ``<...>`` hexadecimal string and wrap it as a string object.

    Digits come from ``readNonWhitespace``, so whitespace inside the string
    is skipped.  An odd trailing digit is padded with ``0``.

    :param stream: stream positioned on the opening ``<``.
    :return: the result of ``createStringObject`` on the decoded value.
    :raises PdfStreamError: if the stream ends before the closing ``>``.
    """
    stream.read(1)  # skip the opening '<'
    decoded = []
    digits = b_("")
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            break
        digits += tok
        if len(digits) == 2:
            decoded.append(chr(int(digits, base=16)))
            digits = b_("")
    if len(digits) == 1:
        # odd number of digits: the missing last digit counts as 0
        digits += b_("0")
    if len(digits) == 2:
        decoded.append(chr(int(digits, base=16)))
    return createStringObject(b_("".join(decoded)))
Ejemplo n.º 22
0
def readHexStringFromStream(stream):
    """Read a hexadecimal string (``<...>``) and return it as a string object.

    Whitespace between digits is ignored because digits are fetched with
    ``readNonWhitespace``.  A dangling final digit is completed with ``0``.

    :param stream: stream positioned on the opening ``<``.
    :return: the result of ``createStringObject`` on the decoded value.
    :raises PdfStreamError: if the stream runs out before ``>`` is seen.
    """
    stream.read(1)  # skip the opening '<'
    characters = []
    pair = b_("")
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            break
        pair += tok
        if len(pair) == 2:
            characters.append(chr(int(pair, base=16)))
            pair = b_("")
    if len(pair) == 1:
        # odd number of digits: the missing last digit counts as 0
        pair += b_("0")
    if len(pair) == 2:
        characters.append(chr(int(pair, base=16)))
    return createStringObject(b_("".join(characters)))
Ejemplo n.º 23
0
 def __parseContentStream(self, stream):
     """Split a content stream into ``(operands, operator)`` entries.

     Operands are read with ``readObject`` and collected until an operator
     token (read with ``readUntilWhitespace(stream, maxchars=2)``) is found;
     each pair is appended to ``self.operations``.  ``BI`` is delegated to
     ``_readInlineImage`` and stored as ``(image_dict, "INLINE IMAGE")``.

     :param stream: seekable stream; it is rewound to the start first.
     """
     stream.seek(0, 0)
     pending = []
     while True:
         peek = readNonWhitespace(stream)
         if peek == "":
             break
         stream.seek(-1, 1)

         starts_operator = peek.isalpha() or peek == "'" or peek == '"'
         if not starts_operator:
             pending.append(readObject(stream, None))
             continue

         operator = readUntilWhitespace(stream, maxchars=2)
         if operator != "BI":
             self.operations.append((pending, operator))
             pending = []
             continue

         # begin inline image - parsed by a dedicated routine
         assert pending == []
         ii = self._readInlineImage(stream)
         self.operations.append((ii, "INLINE IMAGE"))
Ejemplo n.º 24
0
 def __parseContentStream(self, stream):
     """Split a content stream into ``(operands, operator)`` entries.

     Operands are read with ``readObject`` and accumulated until an operator
     is found; each pair is appended to ``self.operations``.  ``BI`` is
     handled separately via ``_readInlineImage`` and stored as
     ``(image_dict, "INLINE IMAGE")``.  Comments are skipped.

     :param stream: seekable stream; it is rewound to the start first.
     """
     stream.seek(0, 0)
     operands = []
     while True:
         peek = readNonWhitespace(stream)
         if peek == '':
             break
         stream.seek(-1, 1)
         if peek.isalpha() or peek == "'" or peek == '"':
             # Operator: read up to whitespace, a delimiter, or EOF.
             operator = ""
             while True:
                 tok = stream.read(1)
                 if tok.isspace() or tok in NameObject.delimiterCharacters:
                     stream.seek(-1, 1)
                     break
                 elif tok == '':
                     break
                 operator += tok
             if operator == "BI":
                 # begin inline image - a completely different parsing
                 # mechanism is required
                 assert operands == []
                 ii = self._readInlineImage(stream)
                 self.operations.append((ii, "INLINE IMAGE"))
             else:
                 self.operations.append((operands, operator))
                 operands = []
         elif peek == '%':
             # If we encounter a comment in the content stream, we have to
             # handle it here.  Typically, readObject will handle
             # encountering a comment -- but readObject assumes that
             # following the comment must be the object we're trying to
             # read.  In this case, it could be an operator instead.
             # The empty string is included so a comment that runs to EOF
             # without a line ending stops the loop instead of hanging.
             while peek not in ('\r', '\n', ''):
                 peek = stream.read(1)
         else:
             operands.append(readObject(stream, None))
Ejemplo n.º 25
0
 def __parseContentStream(self, stream):
     """Parse *stream* into operations stored on ``self.operations``.

     Each non-image entry is ``(operands, operator)``, where the operands
     are the objects read via ``readObject`` since the previous operator.
     The ``BI`` operator is delegated to ``_readInlineImage`` and recorded
     as ``(image_dict, "INLINE IMAGE")``.  Comments are skipped.

     :param stream: seekable stream; it is rewound to the start first.
     """
     stream.seek(0, 0)
     operands = []
     while True:
         peek = readNonWhitespace(stream)
         if peek == '':
             break
         stream.seek(-1, 1)
         if peek.isalpha() or peek == "'" or peek == '"':
             # Operator: collect characters until whitespace, a delimiter,
             # or EOF.
             operator = ""
             while True:
                 tok = stream.read(1)
                 if tok.isspace() or tok in NameObject.delimiterCharacters:
                     stream.seek(-1, 1)
                     break
                 elif tok == '':
                     break
                 operator += tok
             if operator == "BI":
                 # begin inline image - a completely different parsing
                 # mechanism is required
                 assert operands == []
                 ii = self._readInlineImage(stream)
                 self.operations.append((ii, "INLINE IMAGE"))
             else:
                 self.operations.append((operands, operator))
                 operands = []
         elif peek == '%':
             # Comments are consumed here rather than by readObject, because
             # readObject assumes an object follows the comment, whereas in
             # a content stream an operator may follow instead.
             # '' is part of the stop set: at EOF read() keeps returning an
             # empty string, which would otherwise never end this loop.
             while peek not in ('\r', '\n', ''):
                 peek = stream.read(1)
         else:
             operands.append(readObject(stream, None))
Ejemplo n.º 26
0
 def readFromStream(stream, pdf):
     """Parse a ``<< ... >>`` dictionary, and its stream body if one follows.

     :param stream: seekable stream positioned on the opening ``<<``.
     :param pdf: passed through to ``readObject``; also used to resolve an
         indirect ``/Length``.
     :return: the result of ``StreamObject.initializeFromDictionary`` when
         stream data was read, otherwise a ``DictionaryObject``.
     :raises utils.PdfReadError: on a missing ``<<``, a duplicated key, or a
         missing ``endstream`` marker.
     """
     if stream.read(2) != "<<":
         raise utils.PdfReadError("dictionary read error")

     entries = {}
     while True:
         tok = readNonWhitespace(stream)
         if tok == ">":
             # consume the second character of the closing '>>'
             stream.read(1)
             break
         stream.seek(-1, 1)
         key = readObject(stream, pdf)
         readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, pdf)
         if key in entries:
             # multiple definitions of key not permitted
             raise utils.PdfReadError("multiple definitions in dictionary")
         entries[key] = value

     after_dict = stream.tell()
     first = readNonWhitespace(stream)
     if first == 's' and stream.read(5) == 'tream':
         # odd PDF file output has spaces after 'stream' keyword but before
         # EOL; skip them.
         eol = stream.read(1)
         while eol == ' ':
             eol = stream.read(1)
         assert eol in ("\n", "\r")
         if eol == "\r":
             # read \n after
             stream.read(1)
         # this is a stream object, not a dictionary
         assert "/Length" in entries
         length = entries["/Length"]
         if isinstance(length, IndirectObject):
             # Resolving the length may move the stream; restore afterwards.
             saved = stream.tell()
             length = pdf.getObject(length)
             stream.seek(saved, 0)
         entries["__streamdata__"] = stream.read(length)
         marker = readNonWhitespace(stream) + stream.read(8)
         if marker != "endstream":
             # The odd PDF file has a length that is too long, so look one
             # character further back for the "endstream" ending and, if it
             # is there, chop the extra character off the stream data.
             here = stream.tell()
             stream.seek(-10, 1)
             if stream.read(9) == "endstream":
                 entries["__streamdata__"] = entries["__streamdata__"][:-1]
             else:
                 stream.seek(here, 0)
                 raise utils.PdfReadError("Unable to find 'endstream' marker after stream.")
     else:
         # Not a stream object: rewind to just after the dictionary.
         stream.seek(after_dict, 0)

     if "__streamdata__" in entries:
         return StreamObject.initializeFromDictionary(entries)
     result = DictionaryObject()
     result.update(entries)
     return result
Ejemplo n.º 27
0
    def getObject(self, indirectReference):
        """Resolve an indirect reference to the object it points at.

        Sources are tried in order: the ``resolvedObjects`` cache, the
        object-stream index ``xref_objStm`` (generation 0 only), and finally
        the byte-offset table ``xref``.

        :param indirectReference: reference exposing ``idnum`` and
            ``generation`` attributes.
        :return: the resolved object, or ``None`` (after emitting a
            ``utils.PdfReadWarning``) when no xref entry exists for it.
        :raises utils.PdfReadError: if the object header found at the xref
            offset carries a different object number, unless the table was
            flagged as not zero-indexed and the reader is not strict.
        :raises Exception: if the file is encrypted and no decryption key
            has been set on the reader.
        """
        wanted_id = indirectReference.idnum
        wanted_gen = indirectReference.generation

        retval = self.resolvedObjects.get(wanted_gen, {}).get(wanted_id, None)
        if retval is not None:
            return retval

        if wanted_gen == 0 and wanted_id in self.xref_objStm:
            # indirect reference to object in object stream
            # read the entire object stream into memory
            stmnum, idx = self.xref_objStm[wanted_id]
            objStm = IndirectObject(stmnum, 0, self).getObject()
            assert objStm['/Type'] == '/ObjStm'
            assert idx < objStm['/N']
            streamData = StringIO(objStm.getData())
            # Do not rely on the generation-0 bucket having been created
            # as a side effect of loading the object stream itself.
            gen0_cache = self.resolvedObjects.setdefault(0, {})
            for _ in range(objStm['/N']):
                # The stream starts with (object number, offset) pairs.
                objnum = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                offset = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                # Remember where the pair list continues, jump to the
                # object body, parse it, then come back.
                t = streamData.tell()
                streamData.seek(objStm['/First'] + offset, 0)
                gen0_cache[objnum] = readObject(streamData, self)
                streamData.seek(t, 0)
            return gen0_cache[wanted_id]

        # A generation absent from the table means the object is undefined,
        # exactly like a missing object number; report it the same way
        # instead of failing with a bare KeyError.
        if wanted_id not in self.xref.get(wanted_gen, {}):
            warnings.warn(
                "Object %d %d not defined." % (wanted_id, wanted_gen),
                utils.PdfReadWarning)
            return None

        start = self.xref[wanted_gen][wanted_id]
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        # Explicit comparison rather than ``assert``: assertions are removed
        # under ``python -O``, which silently disabled this check.
        if idnum != wanted_id:
            if not self.xrefIndex:
                # some other problem
                raise utils.PdfReadError(
                    "Expected object ID (%d %d) does not "
                    " match actual (%d %d)." %
                    (wanted_id, wanted_gen, idnum, generation))
            if self.strict:
                # Xref table probably had bad indexes due to not
                # being zero-indexed
                raise utils.PdfReadError(
                    "Expected object ID (%d %d) does "
                    "not match actual (%d %d); xref "
                    "table not zero-indexed." %
                    (wanted_id, wanted_gen, idnum, generation))
            # Non-strict mode: should not happen since the xref table is
            # corrected in non-strict mode, so the mismatch is tolerated.
        assert generation == wanted_gen
        retval = readObject(self.stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise Exception("file has not been decrypted")
            # otherwise, decrypt here...
            # Per-object key: file key + low 3 bytes of the object number
            # + low 2 bytes of the generation, hashed with MD5.
            pack1 = struct.pack("<i", wanted_id)[:3]
            pack2 = struct.pack("<i", wanted_gen)[:2]
            key = self._decryption_key + pack1 + pack2
            assert len(key) == (len(self._decryption_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, key)

        self.cacheIndirectObject(generation, idnum, retval)
        return retval
Ejemplo n.º 28
0
    def read(self, stream):
        """Load the cross-reference data and trailer of a PDF from *stream*.

        Works backwards from the end of the stream: checks the ``%%EOF``
        marker, reads the ``startxref`` offset, then follows the chain of
        xref sections through their ``/Prev`` entries.  Both classic
        ``xref`` tables and cross-reference streams are handled.  Results
        are stored in ``self.xref`` (generation -> {object number: byte
        offset}), ``self.xref_objStm`` (object number -> [object stream
        number, index]) and ``self.trailer``.  Entries encountered first
        are kept, so the section nearest the end of the file wins.

        :param stream: seekable stream holding the PDF file.
        :raises utils.PdfReadError: if the ``%%EOF`` marker, the
            ``startxref`` keyword or an xref section cannot be found.
        """
        # start at the end:
        stream.seek(-1, 2)
        line = b_('')
        while not line:
            line = self.readNextEndLine(stream)
        if line[:5] != b_("%%EOF"):
            raise utils.PdfReadError("EOF marker not found")
        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        if line[:9] != b_("startxref"):
            raise utils.PdfReadError("startxref not found")

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while True:
            # load the xref table
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == b_("x"):
                # standard cross-reference table
                ref = stream.read(4)
                if ref[:3] != b_("ref"):
                    raise utils.PdfReadError("xref table read error")
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                # check if the first time looking at the xref table
                firsttime = True
                while True:
                    # Each subsection starts with "<first object> <count>".
                    num = readObject(stream, self)
                    if firsttime and num != 0:
                        self.xrefIndex = num
                        warnings.warn(
                            "Xref table not zero-indexed. ID "
                            "numbers for objects will %sbe "
                            "corrected." % ("" if not self.strict else "not "),
                            utils.PdfReadWarning)
                        # if table not zero indexed, could be due to
                        # error from when PDF was created
                        # which will lead to mismatched indices later on
                    firsttime = False
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)
                        # It's very clear in section 3.4.3 of the PDF spec
                        # that all cross-reference table lines are a fixed
                        # 20 bytes (as of PDF 1.7). However, some files have
                        # 21-byte entries (or more) due to the use of \r\n
                        # (CRLF) EOL's. Detect that case, and adjust the line
                        # until it does not begin with a \r (CR) or \n (LF).
                        while line[0] in b_("\x0D\x0A"):
                            stream.seek(-20 + 1, 1)
                            line = stream.read(20)
                        # On the other hand, some malformed PDF files
                        # use a single character EOL without a preceding
                        # space.  Detect that case, and seek the stream
                        # back one character.  (0-9 means we've bled into
                        # the next xref entry, t means we've bled into the
                        # text "trailer"):
                        if line[-1] in b_("0123456789t"):
                            stream.seek(-1, 1)
                        offset, generation = line[:16].split(b_(" "))
                        offset, generation = int(offset), int(generation)
                        self.xref.setdefault(generation, {})
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag == b_("trailer"):
                        break
                    # more xrefs!
                    stream.seek(-7, 1)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                for key, value in newTrailer.items():
                    self.trailer.setdefault(key, value)
                if "/Prev" in newTrailer:
                    startxref = newTrailer["/Prev"]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                idx_pairs = xrefstream.get("/Index",
                                           [0, xrefstream.get("/Size")])
                entrySizes = xrefstream.get("/W")
                for num, size in self._pairs(idx_pairs):
                    cnt = 0
                    while cnt < size:
                        # One entry = one field per /W width.  Field 0 is
                        # the entry type; fields 1 and 2 depend on it.
                        for i, width in enumerate(entrySizes):
                            d = streamData.read(width)
                            di = convertToInt(d, width)
                            if i == 0:
                                xref_type = di
                            elif i == 1:
                                if xref_type == 0:
                                    # next_free_object = di
                                    pass
                                elif xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 0:
                                    # next_generation = di
                                    pass
                                elif xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 0:
                            pass
                        elif xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if num not in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if num not in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find(b_("xref"))
                if xref_loc == -1:
                    # no xref table found at specified location.  Raise
                    # explicitly: an ``assert False`` vanishes under
                    # ``python -O`` and would leave the tables empty.
                    raise utils.PdfReadError(
                        "Could not find xref table at specified location")
                # tmp starts 10 bytes before startxref, hence the shift.
                startxref -= (10 - xref_loc)
        # if not zero-indexed, verify that the table is correct
        # change it if necessary
        if self.xrefIndex and not self.strict:
            loc = stream.tell()
            for gen in self.xref:
                if gen == 65535:
                    continue
                for obj_id in self.xref[gen]:
                    stream.seek(self.xref[gen][obj_id], 0)
                    pid, _ = self.readObjectHeader(stream)
                    if pid == obj_id - self.xrefIndex:
                        # NOTE(review): _zeroXref presumably renumbers this
                        # generation's entries - confirm against its
                        # definition.  We stop iterating right after it.
                        self._zeroXref(gen)
                        break
                    # if not, then either it's just plain wrong,
                    # or the non-zero-index is actually correct
            stream.seek(loc, 0)  # return to where it was
Ejemplo n.º 29
0
    def getObject(self, indirectReference):
        """Return the object designated by *indirectReference*.

        The lookup consults, in order, the ``resolvedObjects`` cache, the
        ``xref_objStm`` index of objects stored inside object streams
        (generation 0 only) and the ``xref`` table of byte offsets.
        Objects read through the ``xref`` table are decrypted when the
        file is encrypted, then cached.

        :param indirectReference: reference exposing ``idnum`` and
            ``generation`` attributes.
        :return: the resolved object, or ``None`` (after a
            ``utils.PdfReadWarning``) if the xref table does not define it.
        :raises utils.PdfReadError: if the header read at the xref offset
            names another object number; tolerated only when the table was
            flagged as not zero-indexed and the reader is not strict.
        :raises Exception: if the file is encrypted but no decryption key
            is available.
        """
        ref_id = indirectReference.idnum
        ref_gen = indirectReference.generation

        cached = self.resolvedObjects.get(ref_gen, {}).get(ref_id, None)
        if cached is not None:
            return cached

        if ref_gen == 0 and ref_id in self.xref_objStm:
            # indirect reference to object in object stream
            # read the entire object stream into memory
            stmnum, idx = self.xref_objStm[ref_id]
            objStm = IndirectObject(stmnum, 0, self).getObject()
            assert objStm['/Type'] == '/ObjStm'
            assert idx < objStm['/N']
            streamData = StringIO(objStm.getData())
            # Create the generation-0 bucket if needed instead of assuming
            # an earlier call already did.
            bucket = self.resolvedObjects.setdefault(0, {})
            for _ in range(objStm['/N']):
                # Header pair: object number, then offset of its body
                # relative to /First.
                objnum = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                offset = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                # Parse the body, then resume reading the header pairs.
                resume_at = streamData.tell()
                streamData.seek(objStm['/First'] + offset, 0)
                bucket[objnum] = readObject(streamData, self)
                streamData.seek(resume_at, 0)
            return bucket[ref_id]

        # An unknown generation is reported like an unknown object number
        # rather than escaping as a KeyError.
        if ref_id not in self.xref.get(ref_gen, {}):
            warnings.warn("Object %d %d not defined." % (ref_id, ref_gen),
                          utils.PdfReadWarning)
            return None

        self.stream.seek(self.xref[ref_gen][ref_id], 0)
        idnum, generation = self.readObjectHeader(self.stream)
        # Plain ``if`` instead of try/assert/except AssertionError: the
        # assert form is compiled out under ``python -O`` and the mismatch
        # would then go unnoticed.
        if idnum != ref_id:
            if self.xrefIndex:
                # Xref table probably had bad indexes due to not
                # being zero-indexed
                if self.strict:
                    raise utils.PdfReadError(
                        "Expected object ID (%d %d) does "
                        "not match actual (%d %d); xref "
                        "table not zero-indexed." % (
                            ref_id, ref_gen, idnum, generation))
                # else: should not happen since the xref table is
                # corrected in non-strict mode
            else:  # some other problem
                raise utils.PdfReadError("Expected object ID (%d %d) does not "
                                         " match actual (%d %d)." % (
                                             ref_id, ref_gen,
                                             idnum, generation))
        assert generation == ref_gen
        retval = readObject(self.stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise Exception("file has not been decrypted")
            # otherwise, decrypt here...
            # The object key is MD5(file key + 3 low bytes of the object
            # number + 2 low bytes of the generation), truncated below.
            pack1 = struct.pack("<i", ref_id)[:3]
            pack2 = struct.pack("<i", ref_gen)[:2]
            key = self._decryption_key + pack1 + pack2
            assert len(key) == (len(self._decryption_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, key)

        self.cacheIndirectObject(generation, idnum, retval)
        return retval
Ejemplo n.º 30
0
 def readFromStream(stream, pdf):
     """Parse a dictionary, or a stream object, starting at ``<<``.

     Key/value pairs are read until ``>>``.  If the dictionary is followed
     by the ``stream`` keyword, ``/Length`` bytes of payload are read as
     well and the ``endstream`` marker is checked.

     :param stream: seekable stream positioned on the opening ``<<``.
     :param pdf: reader handed to ``readObject`` and used to resolve an
         indirect ``/Length``.
     :return: the result of ``StreamObject.initializeFromDictionary`` when
         stream data was read, otherwise a ``DictionaryObject``.
     :raises utils.PdfReadError: if the input does not start with ``<<``,
         a key is defined twice, or ``endstream`` cannot be found.
     :raises utils.PdfStreamError: if the input ends inside the dictionary.
     """
     tmp = stream.read(2)
     if tmp != b_("<<"):
         raise utils.PdfReadError(
             ("Dictionary read error at byte %s: "
              "stream must begin with '<<'" %
                 utils.hexStr(stream.tell())))
     data = {}
     while True:
         tok = readNonWhitespace(stream)
         if not tok:
             # stream has truncated prematurely
             raise utils.PdfStreamError("Stream has ended unexpectedly")
         if tok == b_(">"):
             # consume the second '>' of the closing '>>'
             stream.read(1)
             break
         stream.seek(-1, 1)
         key = readObject(stream, pdf)
         tok = readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, pdf)
         if key in data:
             # multiple definitions of key not permitted
             raise utils.PdfReadError(
                 "Multiple definitions in "
                 "dictionary at byte %s for key %s"
                 % (utils.hexStr(stream.tell()), key))
         data[key] = value
     pos = stream.tell()
     s = readNonWhitespace(stream)
     if s == b_('s') and stream.read(5) == b_('tream'):
         eol = stream.read(1)
         # odd PDF file output has spaces after 'stream'
         # keyword but before EOL.
         # patch provided by Danial Sandler
         while eol == b_(' '):
             eol = stream.read(1)
         assert eol in (b_("\n"), b_("\r"))
         if eol == b_("\r"):
             # read \n after; compare against b_() so that the check also
             # works when the stream yields bytes (a str literal never
             # equals bytes, which left the LF unconsumed).
             if stream.read(1) != b_("\n"):
                 stream.seek(-1, 1)
         # this is a stream object, not a dictionary
         assert "/Length" in data
         length = data["/Length"]
         if isinstance(length, IndirectObject):
             # Resolving the length moves the file position; restore it.
             t = stream.tell()
             length = pdf.getObject(length)
             stream.seek(t, 0)
         data["__streamdata__"] = stream.read(length)
         e = readNonWhitespace(stream)
         ndstream = stream.read(8)
         if (e + ndstream) != b_("endstream"):
             # (sigh) - the odd PDF file has a length that is too long, so
             # we need to read backwards to find the "endstream" ending.
             # ReportLab (unknown version) generates files with this bug,
             # and Python users into PDF files tend to be our audience.
             # we need to do this to correct the streamdata and chop off
             # an extra character.
             after_marker = stream.tell()
             stream.seek(-10, 1)
             end = stream.read(9)
             if end == b_("endstream"):
                 # we found it by looking back one character further.
                 data["__streamdata__"] = data["__streamdata__"][:-1]
             else:
                 stream.seek(after_marker, 0)
                 raise utils.PdfReadError(
                     "Unable to find 'endstream' marker after "
                     "stream at byte %s." % utils.hexStr(stream.tell()))
     else:
         # Not a stream object: un-read whatever followed the dictionary.
         stream.seek(pos, 0)
     if "__streamdata__" in data:
         return StreamObject.initializeFromDictionary(data)
     else:
         retval = DictionaryObject()
         retval.update(data)
         return retval
Ejemplo n.º 31
0
    def read(self, stream):
        """Load the cross-reference data and trailer of a PDF from *stream*.

        Starting from the end of the stream, checks the ``%%EOF`` marker and
        the ``startxref`` offset, then walks every xref section reachable
        through ``/Prev``.  Classic ``xref`` tables and cross-reference
        streams are both handled.  Fills ``self.xref`` (generation ->
        {object number: byte offset}), ``self.xref_objStm`` (object number
        -> [object stream number, index]) and ``self.trailer``; entries
        seen first are never overwritten.

        :param stream: seekable stream holding the PDF file.
        :raises utils.PdfReadError: if the stream is empty, or the
            ``%%EOF`` marker, the ``startxref`` keyword or an xref section
            cannot be found.
        """
        stream.seek(0, 2)
        if stream.tell() == 0:
            raise utils.PdfReadError('Empty file')
        stream.seek(-1, 2)
        line = ''
        while not line:
            line = self.readNextEndLine(stream)

        if line[:5] != '%%EOF':
            raise utils.PdfReadError('EOF marker not found')
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        if line[:9] != 'startxref':
            raise utils.PdfReadError('startxref not found')
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while True:
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == 'x':
                # Classic cross-reference table.
                ref = stream.read(4)
                if ref[:3] != 'ref':
                    raise utils.PdfReadError('xref table read error')
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                while True:
                    # Subsection header: "<first object number> <count>".
                    num = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)
                        # A trailing digit or 't' means the entry was one
                        # byte short and we read into the next entry or
                        # into "trailer"; step back one byte.
                        if line[-1] in '0123456789t':
                            stream.seek(-1, 1)
                        offset, generation = line[:16].split(' ')
                        offset, generation = int(offset), int(generation)
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        # Keep the entry that was recorded first.
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1

                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag == 'trailer':
                        break
                    # Another subsection follows; un-read the 7 bytes.
                    stream.seek(-7, 1)

                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                for key, value in newTrailer.items():
                    if key not in self.trailer:
                        self.trailer[key] = value

                if '/Prev' in newTrailer:
                    startxref = newTrailer['/Prev']
                else:
                    break
            elif x.isdigit():
                # Cross-reference stream: an indirect object of type /XRef.
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream['/Type'] == '/XRef'
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                idx_pairs = xrefstream.get('/Index', [0, xrefstream.get('/Size')])
                entrySizes = xrefstream.get('/W')
                for num, size in self._pairs(idx_pairs):
                    cnt = 0
                    while cnt < size:
                        # One entry = one field per /W width.  Field 0 is
                        # the entry type; fields 1 and 2 depend on it.
                        # Type 0 entries are read but not recorded.
                        for i, width in enumerate(entrySizes):
                            d = streamData.read(width)
                            di = convertToInt(d, width)
                            if i == 0:
                                xref_type = di
                            elif i == 1:
                                if xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di

                        if xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if num not in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if num not in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1

                trailerKeys = ('/Root', '/Encrypt', '/Info', '/ID')
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)

                if '/Prev' in xrefstream:
                    startxref = xrefstream['/Prev']
                else:
                    break
            else:
                # Neither "xref" nor an object header at startxref: look
                # for "xref" in the 20 bytes starting 10 bytes earlier.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find('xref')
                if xref_loc == -1:
                    # Raise explicitly: ``assert False`` disappears under
                    # ``python -O`` and would leave the tables empty.
                    raise utils.PdfReadError(
                        'Could not find xref table at specified location')
                startxref -= 10 - xref_loc
Ejemplo n.º 32
0
 def readFromStream(stream, pdf):
     """Parse a dictionary, or a stream object, starting at ``<<``.

     Key/value pairs are read until ``>>``.  If the dictionary is followed
     by the ``stream`` keyword, ``/Length`` bytes of payload are read as
     well and the ``endstream`` marker is checked.

     :param stream: seekable stream positioned on the opening ``<<``.
     :param pdf: reader handed to ``readObject`` and used to resolve an
         indirect ``/Length``.
     :return: the result of ``StreamObject.initializeFromDictionary`` when
         stream data was read, otherwise a ``DictionaryObject``.
     :raises utils.PdfReadError: if the input does not start with ``<<``,
         ends inside the dictionary, defines a key twice, or lacks the
         ``endstream`` marker.
     """
     tmp = stream.read(2)
     if tmp != b_("<<"):
         raise utils.PdfReadError(
             "Dictionary read error at byte %s: stream must begin with '<<'"
             % utils.hexStr(stream.tell()))
     data = {}
     while True:
         tok = readNonWhitespace(stream)
         if not tok:
             # Nothing left to read: stop instead of parsing past the end.
             # NOTE(review): assumes readNonWhitespace returns an empty
             # value at end of input - confirm against its definition.
             raise utils.PdfReadError("Stream has ended unexpectedly")
         if tok == b_(">"):
             # consume the second '>' of the closing '>>'
             stream.read(1)
             break
         stream.seek(-1, 1)
         key = readObject(stream, pdf)
         tok = readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, pdf)
         if key in data:
             # multiple definitions of key not permitted
             raise utils.PdfReadError(
                 "Multiple definitions in dictionary at byte %s for key %s"
                 % (utils.hexStr(stream.tell()), key))
         data[key] = value
     pos = stream.tell()
     s = readNonWhitespace(stream)
     if s == b_('s') and stream.read(5) == b_('tream'):
         eol = stream.read(1)
         # odd PDF file output has spaces after 'stream' keyword but before EOL.
         # patch provided by Danial Sandler
         while eol == b_(' '):
             eol = stream.read(1)
         assert eol in (b_("\n"), b_("\r"))
         if eol == b_("\r"):
             # read \n after - but only swallow it if it really is an LF;
             # after a lone CR the next byte already belongs to the data.
             if stream.read(1) != b_("\n"):
                 stream.seek(-1, 1)
         # this is a stream object, not a dictionary
         assert "/Length" in data
         length = data["/Length"]
         if isinstance(length, IndirectObject):
             # Resolving the length moves the file position; restore it.
             t = stream.tell()
             length = pdf.getObject(length)
             stream.seek(t, 0)
         data["__streamdata__"] = stream.read(length)
         e = readNonWhitespace(stream)
         ndstream = stream.read(8)
         if (e + ndstream) != b_("endstream"):
             # (sigh) - the odd PDF file has a length that is too long, so
             # we need to read backwards to find the "endstream" ending.
             # ReportLab (unknown version) generates files with this bug,
             # and Python users into PDF files tend to be our audience.
             # we need to do this to correct the streamdata and chop off
             # an extra character.
             after_marker = stream.tell()
             stream.seek(-10, 1)
             end = stream.read(9)
             if end == b_("endstream"):
                 # we found it by looking back one character further.
                 data["__streamdata__"] = data["__streamdata__"][:-1]
             else:
                 stream.seek(after_marker, 0)
                 raise utils.PdfReadError(
                     "Unable to find 'endstream' marker after stream at byte %s."
                     % utils.hexStr(stream.tell()))
     else:
         # Not a stream object: un-read whatever followed the dictionary.
         stream.seek(pos, 0)
     if "__streamdata__" in data:
         return StreamObject.initializeFromDictionary(data)
     else:
         retval = DictionaryObject()
         retval.update(data)
         return retval
Ejemplo n.º 33
0
    def read(self, stream):
        # start at the end:
        stream.seek(-1, 2)
        line = b_('')
        while not line:
            line = self.readNextEndLine(stream)
        if line[:5] != b_("%%EOF"):
            raise utils.PdfReadError, "EOF marker not found"
        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        if line[:9] != b_("startxref"):
            raise utils.PdfReadError, "startxref not found"

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while 1:
            # load the xref table
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == b_("x"):
                # standard cross-reference table
                ref = stream.read(4)
                if ref[:3] != b_("ref"):
                    raise utils.PdfReadError, "xref table read error"
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                # check if the first time looking at the xref table
                firsttime = True
                while True:
                    num = readObject(stream, self)
                    if firsttime and num != 0:
                        self.xrefIndex = num
                        warnings.warn("Xref table not zero-indexed. ID "
                                      "numbers for objects will %sbe "
                                      "corrected." %
                                      ("" if not self.strict else "not "),
                                      utils.PdfReadWarning)
                         # if table not zero indexed, could be due to
                         # error from when PDF was created
                         # which will lead to mismatched indices later on
                    firsttime = False
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)
                        # It's very clear in section 3.4.3 of the PDF spec
                        # that all cross-reference table lines are a fixed
                        # 20 bytes (as of PDF 1.7). However, some files have
                        # 21-byte entries (or more) due to the use of \r\n
                        # (CRLF) EOL's. Detect that case, and adjust the line
                        # until it does not begin with a \r (CR) or \n (LF).
                        while line[0] in b_("\x0D\x0A"):
                            stream.seek(-20 + 1, 1)
                            line = stream.read(20)
                        # On the other hand, some malformed PDF files
                        # use a single character EOL without a preceeding
                        # space.  Detect that case, and seek the stream
                        # back one character.  (0-9 means we've bled into
                        # the next xref entry, t means we've bled into the
                        # text "trailer"):
                        if line[-1] in b_("0123456789t"):
                            stream.seek(-1, 1)
                        offset, generation = line[:16].split(b_(" "))
                        offset, generation = int(offset), int(generation)
                        self.xref.setdefault(generation, {})
                        if num in self.xref[generation]:
                            # It really seems like we should allow the last
                            # xref table in the file to override previous
                            # ones. Since we read the file backwards, assume
                            # any existing key is already set correctly.
                            pass
                        else:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag != b_("trailer"):
                        # more xrefs!
                        stream.seek(-7, 1)
                    else:
                        break
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                for key, value in newTrailer.items():
                    self.trailer.setdefault(key, value)
                if "/Prev" in newTrailer:
                    startxref = newTrailer["/Prev"]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                idx_pairs = xrefstream.get("/Index",
                                           [0, xrefstream.get("/Size")])
                entrySizes = xrefstream.get("/W")
                for num, size in self._pairs(idx_pairs):
                    cnt = 0
                    while cnt < size:
                        for i in range(len(entrySizes)):
                            d = streamData.read(entrySizes[i])
                            di = convertToInt(d, entrySizes[i])
                            if i == 0:
                                xref_type = di
                            elif i == 1:
                                if xref_type == 0:
                                    # next_free_object = di
                                    pass
                                elif xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 0:
                                    # next_generation = di
                                    pass
                                elif xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 0:
                            pass
                        elif xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if not num in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if not num in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find(b_("xref"))
                if xref_loc != -1:
                    startxref -= (10 - xref_loc)
                    continue
                else:
                    # no xref table found at specified location
                    assert False
                    break
        # if not zero-indexed, verify that the table is correct
        # change it if necessary
        if self.xrefIndex and not self.strict:
            loc = stream.tell()
            for gen in self.xref:
                if gen == 65535:
                    continue
                for id in self.xref[gen]:
                    stream.seek(self.xref[gen][id], 0)
                    pid, pgen = self.readObjectHeader(stream)
                    if pid == id - self.xrefIndex:
                        self._zeroXref(gen)
                        break
                    # if not, then either it's just plain wrong,
                    # or the non-zero-index is actually correct
            stream.seek(loc, 0)  # return to where it was
Ejemplo n.º 34
0
    def readFromStream(stream, pdf):
        """Parse a PDF dictionary, or a stream object, from ``stream``.

        The stream must be positioned on the opening ``<<``.  Key/value pairs
        are read with ``readObject`` until the closing ``>>``; when a key is
        defined more than once, the first value read is kept.  If the
        dictionary is followed by the ``stream`` keyword, ``/Length`` bytes
        of payload are read as well and stored under ``"__streamdata__"``.

        :param stream: seekable binary file-like object.
        :param pdf: reader used to resolve an indirect ``/Length`` (through
            ``pdf.getObject``); its ``strict`` flag decides whether a missing
            ``endstream`` marker is a warning or an error.
        :return: ``StreamObject.initializeFromDictionary(data)`` when stream
            data was read, otherwise a ``DictionaryObject`` holding the
            parsed entries.
        :raises utils.PdfReadError: if the input does not start with ``<<``,
            or ``endstream`` cannot be found and ``pdf.strict`` is not False.
        :raises PdfStreamError: if the input ends inside the dictionary.
        :raises AssertionError: if the ``stream`` keyword is not followed by
            an end-of-line byte, or ``/Length`` is missing.
        """
        tmp = stream.read(2)
        if tmp != b_("<<"):
            raise utils.PdfReadError(
                "Dictionary read error at byte %s: stream must begin with '<<'"
                % utils.hexStr(stream.tell()))
        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_('\x00'):
                # NUL bytes between entries are skipped
                continue
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")

            if tok == b_(">"):
                # consume the second '>' of the closing '>>'
                stream.read(1)
                break
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            # first definition wins; later duplicates are ignored
            if key not in data:
                data[key] = value
        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == b_('s') and stream.read(5) == b_('tream'):
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == b_(' '):
                eol = stream.read(1)
            assert eol in (b_("\n"), b_("\r"))
            if eol == b_("\r"):
                # Swallow the LF of a CRLF pair.  The comparison must go
                # through b_() like every other byte comparison here: a plain
                # str literal never equals a bytes object on Python 3, which
                # left the LF at the front of the stream data.
                if stream.read(1) != b_("\n"):
                    stream.seek(-1, 1)
            # this is a stream object, not a dictionary
            assert "/Length" in data
            length = data["/Length"]
            if isinstance(length, IndirectObject):
                # resolving the reference moves the file position; restore it
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != b_("endstream"):
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == b_("endstream"):
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                elif pdf.strict == False:
                    warnings.warn("Ignoring missing endstream. This could affect PDF output.")
                else:
                    stream.seek(pos, 0)
                    raise utils.PdfReadError(
                        "Unable to find 'endstream' marker after stream at byte %s."
                        % utils.hexStr(stream.tell()))
        else:
            # plain dictionary: undo the look-ahead for the 'stream' keyword
            stream.seek(pos, 0)
        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        else:
            retval = DictionaryObject()
            retval.update(data)
            return retval
Ejemplo n.º 35
0
 def readFromStream(stream, pdf):
     """Parse a PDF dictionary, or a stream object, from ``stream``.

     The stream must be positioned on the opening ``<<``.  Key/value pairs
     are read with ``readObject`` until the closing ``>>``.  If the
     dictionary is followed by the ``stream`` keyword, ``/Length`` bytes of
     payload are read as well and stored under ``"__streamdata__"``.

     :param stream: seekable file-like object.
     :param pdf: reader used to resolve an indirect ``/Length`` through
         ``pdf.getObject``.
     :return: ``StreamObject.initializeFromDictionary(data)`` when stream
         data was read, otherwise a ``DictionaryObject`` holding the parsed
         entries.
     :raises utils.PdfReadError: if the input does not start with ``<<``, a
         key is defined twice, or ``endstream`` cannot be found.
     :raises AssertionError: if the ``stream`` keyword is not followed by an
         end-of-line character, or ``/Length`` is missing.
     """
     tmp = stream.read(2)
     if tmp != "<<":
         raise utils.PdfReadError("dictionary read error")
     data = {}
     while True:
         tok = readNonWhitespace(stream)
         if tok == ">":
             # consume the second '>' of the closing '>>'
             stream.read(1)
             break
         stream.seek(-1, 1)
         key = readObject(stream, pdf)
         readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, pdf)
         if key in data:
             # multiple definitions of key not permitted
             raise utils.PdfReadError("multiple definitions in dictionary")
         data[key] = value
     pos = stream.tell()
     s = readNonWhitespace(stream)
     if s == 's' and stream.read(5) == 'tream':
         eol = stream.read(1)
         # odd PDF file output has spaces after 'stream' keyword but before EOL.
         # patch provided by Danial Sandler
         while eol == ' ':
             eol = stream.read(1)
         assert eol in ("\n", "\r")
         if eol == "\r":
             # Swallow the LF of a CRLF pair, but only if it really is an LF:
             # after a lone CR the next character already belongs to the
             # stream data and must not be dropped.
             if stream.read(1) != "\n":
                 stream.seek(-1, 1)
         # this is a stream object, not a dictionary
         assert "/Length" in data
         length = data["/Length"]
         if isinstance(length, IndirectObject):
             # resolving the reference moves the file position; restore it
             t = stream.tell()
             length = pdf.getObject(length)
             stream.seek(t, 0)
         data["__streamdata__"] = stream.read(length)
         e = readNonWhitespace(stream)
         ndstream = stream.read(8)
         if (e + ndstream) != "endstream":
             # (sigh) - the odd PDF file has a length that is too long, so
             # we need to read backwards to find the "endstream" ending.
             # ReportLab (unknown version) generates files with this bug,
             # and Python users into PDF files tend to be our audience.
             # we need to do this to correct the streamdata and chop off
             # an extra character.
             pos = stream.tell()
             stream.seek(-10, 1)
             end = stream.read(9)
             if end == "endstream":
                 # we found it by looking back one character further.
                 data["__streamdata__"] = data["__streamdata__"][:-1]
             else:
                 stream.seek(pos, 0)
                 raise utils.PdfReadError(
                     "Unable to find 'endstream' marker after stream.")
     else:
         # plain dictionary: undo the look-ahead for the 'stream' keyword
         stream.seek(pos, 0)
     if "__streamdata__" in data:
         return StreamObject.initializeFromDictionary(data)
     else:
         retval = DictionaryObject()
         retval.update(data)
         return retval
Ejemplo n.º 36
0
    def read(self, stream):
        """Load the cross-reference data and trailer of a PDF file.

        Lines are taken from the end of the file with ``readNextEndLine``:
        first ``%%EOF``, then the xref offset, then the ``startxref``
        keyword.  Starting at that offset, every cross-reference section is
        read (classic ``xref`` tables as well as PDF 1.5+ cross-reference
        streams), following ``/Prev`` links to older sections.  Sections are
        visited newest first, so entries and trailer keys that are already
        present are never overwritten.

        Populates ``self.xref`` (generation -> {object number -> offset}),
        ``self.xref_objStm`` (object number -> [object stream number, index])
        and ``self.trailer``.

        :param stream: seekable file-like object containing the PDF.
        :raises utils.PdfReadError: if the EOF marker, the ``startxref``
            keyword or the xref table itself cannot be found.
        :raises AssertionError: if a cross-reference stream object does not
            have ``/Type`` ``/XRef``.
        """
        # start at the end:
        stream.seek(-1, 2)
        line = ''
        while not line:
            line = self.readNextEndLine(stream)
        if line[:5] != "%%EOF":
            raise utils.PdfReadError("EOF marker not found")

        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        if line[:9] != "startxref":
            raise utils.PdfReadError("startxref not found")

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while 1:
            # load the xref table
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == "x":
                # standard cross-reference table
                ref = stream.read(4)
                if ref[:3] != "ref":
                    raise utils.PdfReadError("xref table read error")
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                while 1:
                    # subsection header: first object number, entry count
                    num = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)
                        # It's very clear in section 3.4.3 of the PDF spec
                        # that all cross-reference table lines are a fixed
                        # 20 bytes.  However... some malformed PDF files
                        # use a single character EOL without a preceding
                        # space.  Detect that case, and seek the stream
                        # back one character.  (0-9 means we've bled into
                        # the next xref entry, t means we've bled into the
                        # text "trailer"):
                        if line[-1] in "0123456789t":
                            stream.seek(-1, 1)
                        offset, generation = line[:16].split(" ")
                        offset, generation = int(offset), int(generation)
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag != "trailer":
                        # more xrefs!
                        stream.seek(-7, 1)
                    else:
                        break
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                # keys from newer trailers take precedence
                for key, value in newTrailer.items():
                    if key not in self.trailer:
                        self.trailer[key] = value
                if "/Prev" in newTrailer:
                    startxref = newTrailer["/Prev"]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
                entrySizes = xrefstream.get("/W")
                for num, size in self._pairs(idx_pairs):
                    cnt = 0
                    while cnt < size:
                        # one entry = one field per /W width, in order:
                        # type, then two type-dependent fields
                        for i, width in enumerate(entrySizes):
                            d = streamData.read(width)
                            di = convertToInt(d, width)
                            if i == 0:
                                xref_type = di
                            elif i == 1:
                                if xref_type == 0:
                                    # free entry: value is not used
                                    pass
                                elif xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 0:
                                    # free entry: value is not used
                                    pass
                                elif xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 0:
                            pass
                        elif xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if num not in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if num not in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find("xref")
                if xref_loc != -1:
                    startxref -= (10 - xref_loc)
                    continue
                # No xref table found at specified location.  Raise instead
                # of asserting: an assert disappears under ``python -O`` and
                # the loop would then end with empty xref data.
                raise utils.PdfReadError(
                    "Could not find xref table at specified location")
Ejemplo n.º 37
0
    def read(self, stream):
        """Load the cross-reference data and trailer of a PDF file.

        Lines are taken from the end of the file with ``readNextEndLine``:
        first ``%%EOF``, then the xref offset, then the ``startxref``
        keyword.  Starting at that offset, every cross-reference section is
        read (classic ``xref`` tables as well as PDF 1.5+ cross-reference
        streams), following ``/Prev`` links to older sections.  Sections are
        visited newest first, so entries and trailer keys that are already
        present are never overwritten.

        Populates ``self.xref`` (generation -> {object number -> offset}),
        ``self.xref_objStm`` (object number -> [object stream number, index])
        and ``self.trailer``.

        :param stream: seekable file-like object containing the PDF.
        :raises AssertionError: if the EOF marker, the ``startxref`` keyword
            or the xref section cannot be found, or a cross-reference stream
            object does not have ``/Type`` ``/XRef``.
        """
        # start at the end:
        stream.seek(-1, 2)
        line = ""
        while not line:
            line = self.readNextEndLine(stream)
        assert line[:5] == "%%EOF"

        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        assert line[:9] == "startxref"

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = {}
        while 1:
            # load the xref table
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == "x":
                # standard cross-reference table
                ref = stream.read(4)
                assert ref[:3] == "ref"
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                while 1:
                    # subsection header: first object number, entry count
                    num = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        # each table entry is read as a fixed 20 bytes
                        line = stream.read(20)
                        offset, generation = line[:16].split(" ")
                        offset, generation = int(offset), int(generation)
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag != "trailer":
                        # more xrefs!
                        stream.seek(-7, 1)
                    else:
                        break
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                # keys from newer trailers take precedence
                for key, value in newTrailer.items():
                    if key not in self.trailer:
                        self.trailer[key] = value
                if NameObject("/Prev") in newTrailer:
                    startxref = newTrailer[NameObject("/Prev")]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                index = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
                entrySizes = xrefstream.get("/W")
                # /Index may list several (first object number, count)
                # pairs, one per subsection; walk it two items at a time
                # instead of assuming a single pair.
                for pair in range(0, len(index), 2):
                    num, size = index[pair], index[pair + 1]
                    cnt = 0
                    while cnt < size:
                        # one entry = one field per /W width, in order:
                        # type, then two type-dependent fields
                        for i, width in enumerate(entrySizes):
                            d = streamData.read(width)
                            di = convertToInt(d, width)
                            if i == 0:
                                xref_type = di
                            elif i == 1:
                                if xref_type == 0:
                                    # free entry: value is not used
                                    pass
                                elif xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 0:
                                    # free entry: value is not used
                                    pass
                                elif xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 0:
                            pass
                        elif xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            # same rule as for classic tables above: an
                            # entry recorded from a newer section must not
                            # be replaced by one from an older /Prev section
                            if num not in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if num not in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream[key]
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find("xref")
                if xref_loc != -1:
                    startxref -= 10 - xref_loc
                    continue
                else:
                    # no xref table found at specified location
                    assert False
                    # only reached when assertions are disabled (python -O)
                    break
Ejemplo n.º 38
0
 def readFromStream(stream, pdf):
     """Parse a PDF dictionary, or a stream object, from ``stream``.

     The stream must be positioned on the opening ``<<``.  Key/value pairs
     are read with ``readObject`` until the closing ``>>``; a repeated key
     is an error when ``pdf.strict`` is set, otherwise the last value wins.
     If the dictionary is followed by the ``stream`` keyword, ``/Length``
     bytes of payload are read as well and stored under
     ``"__streamdata__"``.

     :param stream: seekable binary file-like object.
     :param pdf: reader providing ``strict`` and, for an indirect
         ``/Length``, ``getObject``.
     :return: ``StreamObject.initializeFromDictionary(data)`` when stream
         data was read, otherwise a ``DictionaryObject`` holding the parsed
         entries.
     :raises utils.PdfReadError: if the input does not start with ``<<``, a
         key is repeated in strict mode, or ``endstream`` cannot be found.
     :raises AssertionError: if the ``stream`` keyword is not followed by an
         end-of-line byte, or ``/Length`` is missing.
     """
     tmp = stream.read(2)
     if tmp != b_("<<"):
         raise utils.PdfReadError(
             "Dictionary read error at byte %s: stream must begin with '<<'"
             % utils.hexStr(stream.tell()))
     data = {}
     while True:
         tok = readNonWhitespace(stream)
         if tok == b_(">"):
             # consume the second '>' of the closing '>>'
             stream.read(1)
             break
         stream.seek(-1, 1)
         key = readObject(stream, pdf)
         readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, pdf)
         if key in data and pdf.strict:
             # multiple definitions of key not permitted
             raise utils.PdfReadError(
                 "Multiple definitions in dictionary at byte %s for key %s"
                 % (utils.hexStr(stream.tell()), key))
         data[key] = value
     pos = stream.tell()
     s = readNonWhitespace(stream)
     if s == b_('s') and stream.read(5) == b_('tream'):
         eol = stream.read(1)
         # odd PDF file output has spaces after 'stream' keyword but before EOL.
         # patch provided by Danial Sandler
         while eol == b_(' '):
             eol = stream.read(1)
         assert eol in (b_("\n"), b_("\r"))
         if eol == b_("\r"):
             # Swallow the LF of a CRLF pair.  The comparison must go through
             # b_() like every other byte comparison here: a plain str
             # literal never equals a bytes object on Python 3, which left
             # the LF at the front of the stream data.
             if stream.read(1) != b_("\n"):
                 stream.seek(-1, 1)
         # this is a stream object, not a dictionary
         assert "/Length" in data
         length = data["/Length"]
         if isinstance(length, IndirectObject):
             # resolving the reference moves the file position; restore it
             t = stream.tell()
             length = pdf.getObject(length)
             stream.seek(t, 0)
         data["__streamdata__"] = stream.read(length)
         e = readNonWhitespace(stream)
         ndstream = stream.read(8)
         if (e + ndstream) != b_("endstream"):
             # (sigh) - the odd PDF file has a length that is too long, so
             # we need to read backwards to find the "endstream" ending.
             # ReportLab (unknown version) generates files with this bug,
             # and Python users into PDF files tend to be our audience.
             # we need to do this to correct the streamdata and chop off
             # an extra character.
             pos = stream.tell()
             stream.seek(-10, 1)
             end = stream.read(9)
             if end == b_("endstream"):
                 # we found it by looking back one character further.
                 data["__streamdata__"] = data["__streamdata__"][:-1]
             else:
                 # still not found, try looking forward one character
                 # NOTE(review): this probes the 9 bytes starting at
                 # pos + 1, and on a match only the single byte at pos is
                 # appended to the stream data -- confirm this matches the
                 # malformed files it was written for.
                 stream.seek(pos + 1, 0)
                 end = stream.read(9)
                 if end == b_("endstream"):
                     # we found it by looking forward one character, add
                     # skipped character to the end of the stream data
                     stream.seek(-10, 1)
                     data["__streamdata__"] = data["__streamdata__"] + stream.read(1)
                     stream.seek(9, 1)
                 else:
                     # give up looking for misplaced "endstream" token
                     stream.seek(pos, 0)
                     raise utils.PdfReadError(
                         "Unable to find 'endstream' marker after stream at byte %s."
                         % utils.hexStr(stream.tell()))
     else:
         # plain dictionary: undo the look-ahead for the 'stream' keyword
         stream.seek(pos, 0)
     if "__streamdata__" in data:
         return StreamObject.initializeFromDictionary(data)
     else:
         retval = DictionaryObject()
         retval.update(data)
         return retval
Ejemplo n.º 39
-4
    def readObjectHeader(self, stream):
        """Read an indirect object's header and return ``(idnum, generation)``.

        In a well-formed file the cross-reference table points straight at
        the header, so nothing should need skipping.  Some files have xref
        offsets that are off by a few whitespace bytes, so a leading comment
        and whitespace are skipped first; when ``self.strict`` is set, such
        stray whitespace is reported as a ``PdfReadWarning``.

        :param stream: seekable file-like object positioned at (or just
            before) the object header.
        :return: tuple of two ints: object number and generation number.
        """
        utils.skipOverComment(stream)

        # object number
        stray_whitespace = False
        stray_whitespace |= utils.skipOverWhitespace(stream)
        stream.seek(-1, 1)
        raw_idnum = readUntilWhitespace(stream)

        # generation number
        stray_whitespace |= utils.skipOverWhitespace(stream)
        stream.seek(-1, 1)
        raw_generation = readUntilWhitespace(stream)

        # skip three bytes (presumably the "obj" keyword -- confirm), then
        # leave the stream on the first non-whitespace byte that follows
        stream.read(3)
        readNonWhitespace(stream)
        stream.seek(-1, 1)

        if stray_whitespace and self.strict:
            # not a fatal error
            message = ("Superfluous whitespace found in "
                       "object header %s %s" % (raw_idnum, raw_generation))
            warnings.warn(message, utils.PdfReadWarning)

        return int(raw_idnum), int(raw_generation)