Beispiel #1
0
 def readFromStream(stream, pdf):
     """Read an indirect reference (``<idnum> <generation> R``) from ``stream``.

     Two whitespace-terminated fields are read byte by byte, then a single
     ``R`` byte is expected.

     Returns ``IndirectObject(int(idnum), int(generation), pdf)``.

     Raises ``PdfStreamError`` when the stream ends inside either field and
     ``utils.PdfReadError`` when the byte after the generation is not ``R``.
     """
     def read_field():
         # Collect bytes up to the next whitespace byte, which is consumed
         # but not included in the result.
         collected = b_("")
         while True:
             byte = stream.read(1)
             if not byte:
                 # stream has truncated prematurely
                 raise PdfStreamError("Stream has ended unexpectedly")
             if byte.isspace():
                 return collected
             collected += byte

     idnum = read_field()
     generation = read_field()
     if stream.read(1) != b_("R"):
         raise utils.PdfReadError(
             "Error reading indirect object reference at byte %s" %
             utils.hexStr(stream.tell()))
     return IndirectObject(int(idnum), int(generation), pdf)
Beispiel #2
0
def readStringFromStream(stream):
    """Read a PDF literal string ``( ... )`` from ``stream``.

    The first byte (the opening parenthesis) is consumed without being
    checked.  Bytes are then accumulated until the matching closing
    parenthesis; unescaped nested parentheses are kept and balanced, and
    backslash escape sequences are decoded.

    Returns the result of ``createStringObject`` applied to the decoded
    bytes.

    Raises ``PdfStreamError`` if the stream ends before the string is
    closed, and ``utils.PdfReadError`` for an unrecognised escape.
    """
    stream.read(1)  # discard the opening "("
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if tok == b_("n"):
                tok = b_("\n")
            elif tok == b_("r"):
                tok = b_("\r")
            elif tok == b_("t"):
                tok = b_("\t")
            elif tok == b_("b"):
                tok = b_("\b")
            elif tok == b_("f"):
                tok = b_("\f")
            elif tok == b_("("):
                tok = b_("(")
            elif tok == b_(")"):
                tok = b_(")")
            elif tok == b_("\\"):
                tok = b_("\\")
            elif tok.isdigit():
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for _ in range(2):
                    ntok = stream.read(1)
                    if ntok.isdigit():
                        tok += ntok
                    else:
                        # ntok is not part of the escape (it may even be
                        # the closing parenthesis), so hand it back to the
                        # main loop.  At EOF nothing was read, so there is
                        # nothing to rewind.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                # Mask to one byte: high-order overflow is ignored.
                tok = b_(chr(int(tok, base=8) & 0xFF))
            elif tok in b_("\n\r"):
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if tok not in b_("\n\r"):
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = b_('')
            else:
                raise utils.PdfReadError("Unexpected escaped string")
        txt += tok
    return createStringObject(txt)
Beispiel #3
0
 def readFromStream(stream):
     """Read a boolean keyword from ``stream``.

     Four bytes are read.  ``true`` yields ``BooleanObject(True)``;
     ``fals`` yields ``BooleanObject(False)`` after one more byte has been
     consumed.  Anything else raises ``utils.PdfReadError``.
     """
     keyword = stream.read(4)
     if keyword == b_("true"):
         return BooleanObject(True)
     if keyword == b_("fals"):
         # Swallow the fifth byte of "false"; its value is not checked.
         stream.read(1)
         return BooleanObject(False)
     raise utils.PdfReadError('Could not read Boolean object')
Beispiel #4
0
 def readFromStream(stream, pdf):
     """Read a name object from ``stream``.

     The first byte must equal ``NameObject.surfix``; the remainder is
     whatever ``utils.readUntilRegex`` returns for
     ``NameObject.delimiterPattern`` (with ``ignore_eof=True``).

     Returns a ``NameObject`` built from the UTF-8 decoded bytes.  If that
     fails with a Unicode error and ``pdf.strict`` is false, a
     ``utils.PdfReadWarning`` is issued and the ``NameObject`` is built
     from the raw bytes instead.

     Raises ``utils.PdfReadError`` when the leading byte is wrong, or on
     a Unicode error while ``pdf.strict`` is true.
     """
     lead = stream.read(1)
     if lead != NameObject.surfix:
         raise utils.PdfReadError("name read error")
     raw = lead + utils.readUntilRegex(stream,
                                       NameObject.delimiterPattern,
                                       ignore_eof=True)
     try:
         return NameObject(raw.decode('utf-8'))
     except (UnicodeEncodeError, UnicodeDecodeError):
         # Name objects should represent irregular characters
         # with a '#' followed by the symbol's hex number
         if pdf.strict:
             raise utils.PdfReadError("Illegal character in Name Object")
         warnings.warn("Illegal character in Name Object",
                       utils.PdfReadWarning)
         return NameObject(raw)
Beispiel #5
0
def readStringFromStream(stream):
    """Read a PDF literal string ``( ... )`` from ``stream``.

    The first character (the opening parenthesis) is consumed without
    being checked.  Characters are then accumulated until the matching
    closing parenthesis; unescaped nested parentheses are kept and
    balanced, and backslash escape sequences are decoded.

    Returns the result of ``createStringObject`` applied to the decoded
    text.

    Raises ``utils.PdfReadError`` if the stream ends before the string is
    closed or if an unrecognised escape sequence is found.
    """
    # Single-character escapes and the character each one stands for.
    simple_escapes = {
        "n": "\n",
        "r": "\r",
        "t": "\t",
        "b": "\b",
        "f": "\f",
        "(": "(",
        ")": ")",
        "\\": "\\",
    }
    stream.read(1)  # discard the opening "("
    parens = 1
    txt = ""
    while True:
        tok = stream.read(1)
        if not tok:
            # Truncated stream: without this check the loop would keep
            # appending empty reads forever.
            raise utils.PdfReadError("Stream has ended unexpectedly")
        if tok == "(":
            parens += 1
        elif tok == ")":
            parens -= 1
            if parens == 0:
                break
        elif tok == "\\":
            tok = stream.read(1)
            if tok in simple_escapes:
                tok = simple_escapes[tok]
            elif tok.isdigit():
                # An octal escape has one to three digits, so read at
                # most two more and stop at the first non-digit.
                for _ in range(2):
                    ntok = stream.read(1)
                    if ntok.isdigit():
                        tok += ntok
                    else:
                        # Not part of the escape: hand it back to the
                        # main loop (nothing to rewind at EOF).
                        if ntok:
                            stream.seek(-1, 1)
                        break
                # Mask to one byte: high-order overflow is ignored.
                tok = chr(int(tok, base=8) & 0xFF)
            elif tok in "\n\r":
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if tok not in "\n\r":
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = ''
            else:
                raise utils.PdfReadError("Unexpected escaped string")
        txt += tok
    return createStringObject(txt)
Beispiel #6
0
 def readFromStream(stream, pdf):
     """Read an indirect reference (``<idnum> <generation> R``) from ``stream``.

     Two whitespace-terminated fields are read character by character,
     then a single ``R`` is expected.

     Returns ``IndirectObject(int(idnum), int(generation), pdf)``.

     Raises ``utils.PdfReadError`` when the stream ends inside either
     field or when the character after the generation is not ``R``.
     """
     idnum = ""
     while True:
         tok = stream.read(1)
         if not tok:
             # An empty read means EOF; "".isspace() is False, so without
             # this check the loop would never terminate.
             raise utils.PdfReadError("Stream has ended unexpectedly")
         if tok.isspace():
             break
         idnum += tok
     generation = ""
     while True:
         tok = stream.read(1)
         if not tok:
             raise utils.PdfReadError("Stream has ended unexpectedly")
         if tok.isspace():
             break
         generation += tok
     r = stream.read(1)
     if r != "R":
         raise utils.PdfReadError("error reading indirect object reference")
     return IndirectObject(int(idnum), int(generation), pdf)
Beispiel #7
0
 def readFromStream(stream, pdf):
     """Read an indirect reference (``<idnum> <generation> R``) from ``stream``.

     Two whitespace-terminated fields are read byte by byte, then a single
     ``R`` byte is expected.

     Returns ``IndirectObject(int(idnum), int(generation), pdf)``.

     Raises ``utils.PdfReadError`` when the stream ends inside either
     field or when the byte after the generation is not ``R``.
     """
     idnum = b_("")
     while True:
         tok = stream.read(1)
         if not tok:
             # An empty read means EOF; an empty value is not whitespace,
             # so without this check the loop would never terminate.
             raise utils.PdfReadError("Stream has ended unexpectedly")
         if tok.isspace():
             break
         idnum += tok
     generation = b_("")
     while True:
         tok = stream.read(1)
         if not tok:
             raise utils.PdfReadError("Stream has ended unexpectedly")
         if tok.isspace():
             break
         generation += tok
     r = stream.read(1)
     if r != b_("R"):
         raise utils.PdfReadError(
             "Error reading indirect object reference at byte %s" %
             utils.hexStr(stream.tell()))
     return IndirectObject(int(idnum), int(generation), pdf)
Beispiel #8
0
 def readFromStream(stream, pdf):
     """Read an array ``[ ... ]`` from ``stream``.

     The first byte must be ``[``.  Elements are then parsed one after
     another with ``readObject(stream, pdf)`` until a ``]`` is found.

     Returns the populated ``ArrayObject``.

     Raises ``utils.PdfReadError`` when the opening bracket is missing or
     when the stream ends before the closing bracket.
     """
     arr = ArrayObject()
     tmp = stream.read(1)
     if tmp != b_("["):
         raise utils.PdfReadError("Could not read array")
     while True:
         # skip leading whitespace
         tok = stream.read(1)
         while tok.isspace():
             tok = stream.read(1)
         if not tok:
             # EOF before "]": rewinding here would re-parse the previous
             # byte over and over, so fail instead.
             raise utils.PdfReadError("Stream has ended unexpectedly")
         # check for array ending
         if tok == b_("]"):
             break
         # tok is the first byte of the next element: push it back and
         # let readObject parse the whole element.
         stream.seek(-1, 1)
         arr.append(readObject(stream, pdf))
     return arr
Beispiel #9
0
    def __init__(self, title, page, typ, *args):
        """Build a destination dictionary.

        ``title``, ``page`` and ``typ`` are stored under ``/Title``,
        ``/Page`` and ``/Type``.  ``args`` supplies the extra coordinates
        required by the destination type:

        * ``/XYZ``: left, top, zoom
        * ``/FitR``: left, bottom, right, top
        * ``/FitH``, ``/FitBH``: top
        * ``/FitV``, ``/FitBV``: left
        * ``/Fit``, ``/FitB``: none

        Raises ``utils.PdfReadError`` for any other ``typ``.  A wrong
        number of ``args`` surfaces as the ``ValueError`` raised by tuple
        unpacking.
        """
        DictionaryObject.__init__(self)
        self[NameObject("/Title")] = title
        self[NameObject("/Page")] = page
        self[NameObject("/Type")] = typ

        # from table 8.2 of the PDF 1.6 reference.
        # The slash-less "FitBH" / "FitBV" / "FitB" spellings were the only
        # ones accepted before the missing slash was fixed; they are kept
        # so existing callers keep working.
        if typ == "/XYZ":
            (self[NameObject("/Left")], self[NameObject("/Top")],
             self[NameObject("/Zoom")]) = args
        elif typ == "/FitR":
            (self[NameObject("/Left")], self[NameObject("/Bottom")],
             self[NameObject("/Right")], self[NameObject("/Top")]) = args
        elif typ in ("/FitH", "/FitBH", "FitBH"):
            self[NameObject("/Top")], = args
        elif typ in ("/FitV", "/FitBV", "FitBV"):
            self[NameObject("/Left")], = args
        elif typ in ("/Fit", "/FitB", "FitB"):
            pass
        else:
            raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
Beispiel #10
0
    def _buildOutline(self, node):
        dest, title, outline = None, None, None
        if "/A" in node and "/Title" in node:
            # Action, section 8.5 (only type GoTo supported)
            title = node["/Title"]
            action = node["/A"]
            if action["/S"] == "/GoTo":
                dest = action["/D"]
        elif "/Dest" in node and "/Title" in node:
            # Destination, section 8.2.1
            title = node["/Title"]
            dest = node["/Dest"]

        # if destination found, then create outline
        if dest:
            if isinstance(dest, ArrayObject):
                outline = self._buildDestination(title, dest)
            elif isinstance(dest, unicode) and dest in self._namedDests:
                outline = self._namedDests[dest]
                outline[NameObject("/Title")] = title
            else:
                raise utils.PdfReadError("Unexpected destination %r" % dest)
        return outline
Beispiel #11
0
 def setData(self, data):
     """Reject any attempt to assign data to this object.

     Always raises ``utils.PdfReadError``; ``data`` is ignored.
     """
     message = "Creating EncodedStreamObject is not currently supported"
     raise utils.PdfReadError(message)
Beispiel #12
0
    def readFromStream(stream, pdf):
        """Read a dictionary (``<< ... >>``) and, if present, its stream data.

        Key/value pairs are parsed with ``readObject`` until a ``>`` token
        is found.  If the dictionary is followed by the ``stream`` keyword,
        ``/Length`` bytes are read as the payload and stored under the
        internal ``"__streamdata__"`` key.

        Returns ``StreamObject.initializeFromDictionary(data)`` when stream
        data was read, otherwise a ``DictionaryObject`` holding the parsed
        entries.

        Raises ``utils.PdfReadError`` when the input does not start with
        ``<<``, when a key is defined twice and ``pdf.strict`` is true, or
        when no ``endstream`` marker is found after the payload;
        ``PdfStreamError`` when the stream ends inside the dictionary; and
        ``AssertionError`` from the sanity checks on the stream header.
        """
        debug = False
        tmp = stream.read(2)
        if tmp != b_("<<"):
            raise utils.PdfReadError(
                "Dictionary read error at byte %s: stream must begin with '<<'"
                % utils.hexStr(stream.tell()))
        data = {}
        while True:
            # presumably the first non-whitespace byte - confirm against
            # readNonWhitespace
            tok = readNonWhitespace(stream)
            if tok == b_('\x00'):
                # NUL bytes between entries are skipped
                continue
            elif tok == b_('%'):
                # a comment: rewind onto the '%' and let the helper skip it
                stream.seek(-1, 1)
                skipOverComment(stream)
                continue
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")

            if debug: print(("Tok:", tok))
            if tok == b_(">"):
                # consume the byte after the first '>' (its value is not
                # checked) and stop reading entries
                stream.read(1)
                break
            # tok was the first byte of the key: push it back before parsing
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            # NOTE(review): a key whose stored value is falsy is treated as
            # absent here, so a later duplicate replaces it without the
            # error/warning below - `key not in data` may be what was meant.
            if not data.get(key):
                data[key] = value
            elif pdf.strict:
                # multiple definitions of key not permitted
                raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
                                           % (utils.hexStr(stream.tell()), key))
            else:
                # non-strict: warn and keep the first definition
                warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
                                           % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)

        # remember where the dictionary ended so we can rewind if no
        # 'stream' keyword follows
        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == b_('s') and stream.read(5) == b_('tream'):
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == b_(' '):
                eol = stream.read(1)
            # NOTE(review): asserts are stripped under `python -O`; these
            # checks validate file content, not programmer errors.
            assert eol in (b_("\n"), b_("\r"))
            if eol == b_("\r"):
                # read \n after
                if stream.read(1) != b_('\n'):
                    stream.seek(-1, 1)
            # this is a stream object, not a dictionary
            assert "/Length" in data
            length = data["/Length"]
            if debug: print(data)
            if isinstance(length, IndirectObject):
                # save and restore the read position around the lookup of
                # the indirect /Length value
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            if debug: print("here")
            #if debug: print(binascii.hexlify(data["__streamdata__"]))
            # the payload must be followed by 'endstream': one byte from
            # readNonWhitespace plus the next eight
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != b_("endstream"):
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == b_("endstream"):
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    if debug: print(("E", e, ndstream, debugging.toHex(end)))
                    # restore the position so the error reports the offset
                    # just after the failed 'endstream' read
                    stream.seek(pos, 0)
                    raise utils.PdfReadError(
                        "Unable to find 'endstream' marker after stream at byte %s."
                        % utils.hexStr(stream.tell()))
        else:
            # plain dictionary: undo the look-ahead
            stream.seek(pos, 0)
        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        else:
            retval = DictionaryObject()
            retval.update(data)
            return retval
Beispiel #13
0
 def readFromStream(stream):
     """Read the ``null`` keyword from ``stream``.

     Four bytes are consumed.  Returns a new ``NullObject`` when they
     spell ``null``; otherwise raises ``utils.PdfReadError``.
     """
     if stream.read(4) == b_("null"):
         return NullObject()
     raise utils.PdfReadError("Could not read Null object")
Beispiel #14
0
def convertToInt(d, size):
    """Interpret ``d`` as a big-endian integer.

    ``d`` is left-padded with zero bytes to eight bytes (only the last
    eight bytes are used if it is longer) and unpacked as a signed 64-bit
    big-endian value.

    ``d`` may be ``bytes``/``bytearray``; a text string is encoded as
    latin-1 first so each character maps to one byte.  ``size`` is only
    validated, not used to slice ``d``.

    Raises ``utils.PdfReadError`` when ``size`` is greater than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    if not isinstance(d, (bytes, bytearray)):
        # struct needs a bytes-like buffer, and text cannot be
        # concatenated with the bytes padding on Python 3.
        d = d.encode("latin-1")
    # Pad on the left, then keep exactly the eight bytes ">q" expects.
    d = (b"\x00" * 8 + d)[-8:]
    return struct.unpack(">q", d)[0]
Beispiel #15
0
    def getObject(self, indirectReference):
        """Resolve ``indirectReference`` to the object it points at.

        Lookup order:

        1. ``self.resolvedObjects`` (cache keyed by generation, then id).
        2. Object streams: for a generation-0 id listed in
           ``self.xref_objStm`` the containing object stream is loaded,
           every object in it is parsed into ``self.resolvedObjects[0]``,
           and the requested one is returned.
        3. The cross-reference table ``self.xref``: the object is read from
           ``self.stream`` at the recorded offset, decrypted when the file
           is encrypted, cached via ``self.cacheIndirectObject`` and
           returned.

        Returns ``None`` (after a ``utils.PdfReadWarning``) when the id is
        not in ``self.xref`` for the requested generation.

        Raises ``utils.PdfReadError`` when the object header found at the
        offset carries a different id (in strict mode, or whenever
        ``self.xrefIndex`` is falsy), ``Exception`` when the file is
        encrypted but no decryption key is set, and ``AssertionError`` from
        the remaining sanity checks.
        """
        retval = self.resolvedObjects.get(indirectReference.generation,
                                          {}).get(indirectReference.idnum,
                                                  None)
        if retval is not None:
            return retval
        if indirectReference.generation == 0 \
                and indirectReference.idnum in self.xref_objStm:
            # indirect reference to object in object stream
            # read the entire object stream into memory
            stmnum, idx = self.xref_objStm[indirectReference.idnum]
            objStm = IndirectObject(stmnum, 0, self).getObject()
            # NOTE(review): plain asserts are stripped under `python -O`.
            assert objStm['/Type'] == '/ObjStm'
            assert idx < objStm['/N']
            streamData = StringIO(objStm.getData())
            for i in range(objStm['/N']):
                # each header entry is an "<object number> <offset>" pair
                objnum = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                offset = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                # jump to the object body, parse it, then come back to
                # the header table for the next pair
                t = streamData.tell()
                streamData.seek(objStm['/First'] + offset, 0)
                obj = readObject(streamData, self)
                self.resolvedObjects[0][objnum] = obj
                streamData.seek(t, 0)
            return self.resolvedObjects[0][indirectReference.idnum]
        if indirectReference.idnum \
                not in self.xref[indirectReference.generation]:
            warnings.warn(
                "Object %d %d not defined." %
                (indirectReference.idnum, indirectReference.generation),
                utils.PdfReadWarning)
            return None
        start = self.xref[indirectReference.generation][
            indirectReference.idnum]
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        # Explicit comparison rather than assert/except AssertionError:
        # asserts are removed under `python -O`, which would silently
        # disable this check.
        if idnum != indirectReference.idnum:
            if self.xrefIndex:
                # Xref table probably had bad indexes due to not
                # being zero-indexed
                if self.strict:
                    raise utils.PdfReadError(
                        "Expected object ID (%d %d) does "
                        "not match actual (%d %d); xref "
                        "table not zero-indexed." %
                        (indirectReference.idnum, indirectReference.generation,
                         idnum, generation))
                # non-strict: should not happen since the xref table is
                # corrected in non-strict mode, so carry on
            else:  # some other problem
                raise utils.PdfReadError(
                    "Expected object ID (%d %d) does not "
                    " match actual (%d %d)." %
                    (indirectReference.idnum, indirectReference.generation,
                     idnum, generation))
        assert generation == indirectReference.generation
        retval = readObject(self.stream, self)
        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise Exception("file has not been decrypted")
            # otherwise, decrypt here...
            # per-object key: file key + low 3 bytes of the object number
            # + low 2 bytes of the generation, hashed with MD5
            pack1 = struct.pack("<i", indirectReference.idnum)[:3]
            pack2 = struct.pack("<i", indirectReference.generation)[:2]
            key = self._decryption_key + pack1 + pack2
            assert len(key) == (len(self._decryption_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, key)

        self.cacheIndirectObject(generation, idnum, retval)
        return retval