Exemple #1
0
    def decompress_stream(self, key):
        """
        Decode the stream payload of the object stored under ``key``.

        The text between the first ``stream`` keyword and the closing
        ``endstream`` keyword (or the end of the object when no closing
        keyword follows) is stripped and passed through every filter name
        returned by ``self.get_compMethod(key, data)``, in order. The result
        is left in ``self.data``. ``self.key``, ``self.buff``,
        ``self.methods`` and ``self.method`` are updated as side effects.

        When no filter is reported, the text preceding the ``stream`` keyword
        is handed to ``self.handle_evasion``.

        Payloads shorter than two characters are ignored. Any exception raised
        while looking up or decoding the object is swallowed (best effort),
        in which case ``self.data`` may hold a partially decoded value.
        """
        self.key = key
        try:
            data = self.objects[self.key]
            start = data.find("stream")
            # Cut the payload at the closing keyword so that "endstream" and
            # whatever follows it is not fed to the decoders.
            end = data.rfind("endstream")
            if end > start:
                payload = data[start + 6:end]
            else:
                payload = data[start + 6:]
            self.buff = payload.strip()
            if len(self.buff) < 2:
                return

            self.methods = self.get_compMethod(key, data)
            self.data = self.buff.strip()
            # The loop variable is kept as an attribute on purpose: the
            # original code exposed the filter being applied as self.method.
            for self.method in self.methods:
                name = self.method.lower()
                if name == 'fl':
                    self.data = decompress(self.data)
                elif name == 'ascii85decode':
                    self.data = ascii85decode(self.data)
                elif name == 'asciihexdecode':
                    self.data = asciihexdecode(self.data)
                elif name == 'lzwdecode':
                    self.data = lzwdecode(self.data)

            if len(self.methods) == 0:
                self.handle_evasion(key, data[:start])

        except Exception:
            # Best effort: a stream that cannot be decoded must not abort the
            # caller.
            pass
Exemple #2
0
 def decode(self):
   """Decrypt and decode self.rawdata, storing the result in self.data.

   The raw bytes are first passed through self.decipher (when set) and
   then through every filter listed under 'Filter' in self.dic, in order.
   After each filter the predictor described by 'DP' (or 'DecodeParms')
   is applied. Only predictor 12 is supported, and only rows tagged with
   byte 2 are added to the previous row; other rows are copied unchanged.

   On return self.rawdata is reset to None.

   Raises:
     PDFNotImplementedError: for /Crypt, an unknown filter, or a
       predictor other than 12.
     PDFValueError: when predictor 12 is requested without 'Columns'.
   """
   assert self.data is None and self.rawdata is not None
   data = self.rawdata
   if self.decipher:
     # Handle encryption
     data = self.decipher(self.objid, self.genno, data)
   if 'Filter' not in self.dic:
     self.data = data
     self.rawdata = None
     return
   filters = self.dic['Filter']
   if not isinstance(filters, list):
     filters = [ filters ]
   for f in filters:
     if f in LITERALS_FLATE_DECODE:
       # will get errors if the document is encrypted.
       data = self.decomp(data)
     elif f in LITERALS_LZW_DECODE:
       try:
         from cStringIO import StringIO
       except ImportError:
         from StringIO import StringIO
       data = ''.join(LZWDecoder(StringIO(data)).run())
     elif f in LITERALS_ASCII85_DECODE:
       import ascii85
       data = ascii85.ascii85decode(data)
     elif f in LITERALS_ASCIIHEX_DECODE:
       import ascii85
       data = ascii85.asciihexdecode(data)
     elif f == LITERAL_CRYPT:
       raise PDFNotImplementedError('/Crypt filter is unsupported')
     else:
       raise PDFNotImplementedError('Unsupported filter: %r' % f)
     # apply predictors
     if 'DP' in self.dic:
       params = self.dic['DP']
     else:
       params = self.dic.get('DecodeParms', {})
     if 'Predictor' in params:
       pred = int_value(params['Predictor'])
       if pred:
         if pred != 12:
           raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
         if 'Columns' not in params:
           raise PDFValueError('Columns undefined for predictor=12')
         columns = int_value(params['Columns'])
         # Each row is one tag byte followed by `columns` data bytes.
         # Rows are collected in a list and joined once to avoid the
         # quadratic cost of repeated string concatenation.
         rows = []
         prev_row = '\x00' * columns
         for i in xrange(0, len(data), columns+1):
           row_tag = data[i]
           row = data[i+1:i+1+columns]
           if row_tag == '\x02':
             row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(prev_row,row) )
           rows.append(row)
           prev_row = row
         data = ''.join(rows)
   self.data = data
   self.rawdata = None
   return
 def decode(self):
     """Decrypt and decode self.rawdata, storing the result in self.data.

     The raw bytes go through self.decipher (when set) and then through
     every filter returned by self.get_filters(), in order. A Flate stream
     that zlib cannot decompress is replaced by an empty string. After each
     filter, predictor 12 is applied when both 'Predictor' and 'Columns'
     are present in the decode parameters; only rows tagged with byte 2 are
     added to the previous row, other rows are copied unchanged.

     On return self.rawdata is reset to None.

     Raises:
         PDFNotImplementedError: for CCITTFax, /Crypt, an unknown filter,
             or a predictor other than 12.
     """
     assert self.data is None and self.rawdata is not None
     data = self.rawdata
     if self.decipher:
         # Handle encryption
         data = self.decipher(self.objid, self.genno, data)
     filters = self.get_filters()
     if not filters:
         self.data = data
         self.rawdata = None
         return
     for f in filters:
         if f in LITERALS_FLATE_DECODE:
             # will get errors if the document is encrypted.
             try:
                 data = zlib.decompress(data)
             except zlib.error:
                 data = ''
         elif f in LITERALS_LZW_DECODE:
             data = lzwdecode(data)
         elif f in LITERALS_ASCII85_DECODE:
             data = ascii85decode(data)
         elif f in LITERALS_ASCIIHEX_DECODE:
             data = asciihexdecode(data)
         elif f in LITERALS_RUNLENGTH_DECODE:
             data = rldecode(data)
         elif f in LITERALS_CCITTFAX_DECODE:
             # TODO: ccittfaxdecode(data) is not wired in yet.
             raise PDFNotImplementedError('Unsupported filter: %r' % f)
         elif f == LITERAL_CRYPT:
             # not yet..
             raise PDFNotImplementedError('/Crypt filter is unsupported')
         else:
             raise PDFNotImplementedError('Unsupported filter: %r' % f)
         # apply predictors
         params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
         if 'Predictor' in params and 'Columns' in params:
             pred = int_value(params['Predictor'])
             columns = int_value(params['Columns'])
             if pred:
                 if pred != 12:
                     raise PDFNotImplementedError(
                         'Unsupported predictor: %r' % pred)
                 # Each row is one tag byte followed by `columns` data
                 # bytes. Collect rows and join once instead of growing a
                 # string with += (quadratic on large streams).
                 rows = []
                 prev_row = '\x00' * columns
                 for i in xrange(0, len(data), columns + 1):
                     row_tag = data[i]
                     row = data[i + 1:i + 1 + columns]
                     if row_tag == '\x02':
                         row = ''.join(
                             chr((ord(a) + ord(b)) & 255)
                             for (a, b) in zip(prev_row, row))
                     rows.append(row)
                     prev_row = row
                 data = ''.join(rows)
     self.data = data
     self.rawdata = None
     return
Exemple #4
0
 def decode(self):
     """Turn self.rawdata into decoded bytes stored in self.data.

     Steps: optional decryption through self.decipher, then each filter
     from self.get_filters() in order, each followed by the predictor named
     in the decode parameters. Predictor 1 is a no-op, predictors >= 10 are
     delegated to apply_png_predictor, anything else is rejected. DCT
     streams are passed through untouched. self.rawdata is cleared at the
     end.

     Raises:
         PDFException: when STRICT is set and zlib rejects a Flate stream
             (otherwise the stream becomes an empty string).
         PDFNotImplementedError: for /Crypt, an unknown filter, or an
             unsupported predictor.
     """
     assert self.data is None and self.rawdata is not None
     payload = self.rawdata
     if self.decipher:
         # Encrypted document: decrypt before any filter runs.
         payload = self.decipher(self.objid, self.genno, payload, self.attrs)
     filter_chain = self.get_filters()
     if filter_chain:
         for filt in filter_chain:
             params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
             if filt in LITERALS_FLATE_DECODE:
                 # Decompression fails here if the document is encrypted.
                 try:
                     payload = zlib.decompress(payload)
                 except zlib.error as e:
                     if STRICT:
                         raise PDFException('Invalid zlib bytes: %r, %r' % (e, payload))
                     payload = ''
             elif filt in LITERALS_LZW_DECODE:
                 payload = lzwdecode(payload)
             elif filt in LITERALS_ASCII85_DECODE:
                 payload = ascii85decode(payload)
             elif filt in LITERALS_ASCIIHEX_DECODE:
                 payload = asciihexdecode(payload)
             elif filt in LITERALS_RUNLENGTH_DECODE:
                 payload = rldecode(payload)
             elif filt in LITERALS_CCITTFAX_DECODE:
                 payload = ccittfaxdecode(payload, params)
             elif filt in LITERALS_DCT_DECODE:
                 # Probably a JPG stream: hand it back as-is rather than
                 # decoding it a second time.
                 pass
             elif filt == LITERAL_CRYPT:
                 # Not supported yet.
                 raise PDFNotImplementedError('/Crypt filter is unsupported')
             else:
                 raise PDFNotImplementedError('Unsupported filter: %r' % filt)
             # Predictor handling for this filter.
             if 'Predictor' not in params:
                 continue
             pred = int_value(params['Predictor'])
             if pred == 1:
                 # Predictor 1 means "no prediction".
                 continue
             if not 10 <= pred:
                 raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
             # PNG predictor family.
             colors = int_value(params.get('Colors', 1))
             columns = int_value(params.get('Columns', 1))
             bitspercomponent = int_value(params.get('BitsPerComponent', 8))
             payload = apply_png_predictor(pred, colors, columns, bitspercomponent, payload)
     self.data = payload
     self.rawdata = None
     return
Exemple #5
0
 def decode(self):
     """Decrypt and decode self.rawdata, storing the result in self.data.

     The raw bytes go through self.decipher (when set) and then through
     every filter returned by self.get_filters(), in order. After each
     filter, predictor 12 is applied when both "Predictor" and "Columns"
     are present in the decode parameters; only rows tagged with byte 2 are
     added to the previous row, other rows are copied unchanged.

     On return self.rawdata is reset to None.

     Raises:
         PDFException: when STRICT is set and zlib rejects a Flate stream
             (otherwise the stream becomes an empty string).
         PDFNotImplementedError: for CCITTFax, /Crypt, an unknown filter,
             or a predictor other than 12.
     """
     assert self.data is None and self.rawdata is not None
     data = self.rawdata
     if self.decipher:
         # Handle encryption
         data = self.decipher(self.objid, self.genno, data)
     filters = self.get_filters()
     if not filters:
         self.data = data
         self.rawdata = None
         return
     for f in filters:
         if f in LITERALS_FLATE_DECODE:
             # will get errors if the document is encrypted.
             try:
                 data = zlib.decompress(data)
             except zlib.error as e:
                 if STRICT:
                     raise PDFException("Invalid zlib bytes: %r, %r" % (e, data))
                 data = ""
         elif f in LITERALS_LZW_DECODE:
             data = lzwdecode(data)
         elif f in LITERALS_ASCII85_DECODE:
             data = ascii85decode(data)
         elif f in LITERALS_ASCIIHEX_DECODE:
             data = asciihexdecode(data)
         elif f in LITERALS_RUNLENGTH_DECODE:
             data = rldecode(data)
         elif f in LITERALS_CCITTFAX_DECODE:
             # TODO: ccittfaxdecode(data) is not wired in yet.
             raise PDFNotImplementedError("Unsupported filter: %r" % f)
         elif f == LITERAL_CRYPT:
             # not yet..
             raise PDFNotImplementedError("/Crypt filter is unsupported")
         else:
             raise PDFNotImplementedError("Unsupported filter: %r" % f)
         # apply predictors
         params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
         if "Predictor" in params and "Columns" in params:
             pred = int_value(params["Predictor"])
             columns = int_value(params["Columns"])
             if pred:
                 if pred != 12:
                     raise PDFNotImplementedError("Unsupported predictor: %r" % pred)
                 # Rows are one tag byte plus `columns` data bytes. They are
                 # gathered in a list and joined once, because growing a
                 # string with += is quadratic on large streams.
                 decoded_rows = []
                 previous = "\x00" * columns
                 for i in xrange(0, len(data), columns + 1):
                     tag = data[i]
                     current = data[i + 1 : i + 1 + columns]
                     if tag == "\x02":
                         current = "".join(chr((ord(a) + ord(b)) & 255) for (a, b) in zip(previous, current))
                     decoded_rows.append(current)
                     previous = current
                 data = "".join(decoded_rows)
     self.data = data
     self.rawdata = None
     return
Exemple #6
0
 def decode(self):
   """Decrypt and decode self.rawdata, storing the result in self.data.

   The raw bytes are first passed through self.decipher (when set) and
   then through every filter listed under 'Filter' in self.dic, in order
   (Flate, LZW and ASCII85 are handled). After each filter the predictor
   described by 'DP' (or 'DecodeParms') is applied. Only predictor 12 is
   supported, and only rows tagged with byte 2 are added to the previous
   row; other rows are copied unchanged.

   On return self.rawdata is reset to None.

   Raises:
     PDFNotImplementedError: for /Crypt, an unknown filter, or a
       predictor other than 12.
     PDFValueError: when predictor 12 is requested without 'Columns'.
   """
   assert self.data is None and self.rawdata is not None
   data = self.rawdata
   if self.decipher:
     # Handle encryption
     data = self.decipher(self.objid, self.genno, data)
   if 'Filter' not in self.dic:
     self.data = data
     self.rawdata = None
     return
   filters = self.dic['Filter']
   if not isinstance(filters, list):
     filters = [ filters ]
   for f in filters:
     if f in LITERALS_FLATE_DECODE:
       # will get errors if the document is encrypted.
       data = zlib.decompress(data)
     elif f in LITERALS_LZW_DECODE:
       try:
         from cStringIO import StringIO
       except ImportError:
         from StringIO import StringIO
       data = ''.join(LZWDecoder(StringIO(data)).run())
     elif f in LITERALS_ASCII85_DECODE:
       import ascii85
       data = ascii85.ascii85decode(data)
     elif f == LITERAL_CRYPT:
       raise PDFNotImplementedError('/Crypt filter is unsupported')
     else:
       raise PDFNotImplementedError('Unsupported filter: %r' % f)
     # apply predictors
     if 'DP' in self.dic:
       params = self.dic['DP']
     else:
       params = self.dic.get('DecodeParms', {})
     if 'Predictor' in params:
       pred = int_value(params['Predictor'])
       if pred:
         if pred != 12:
           raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
         if 'Columns' not in params:
           raise PDFValueError('Columns undefined for predictor=12')
         columns = int_value(params['Columns'])
         # One tag byte precedes every `columns`-byte row. Join the rows
         # once at the end; += on a string is quadratic for big streams.
         out_rows = []
         last_row = '\x00' * columns
         for i in xrange(0, len(data), columns+1):
           tag = data[i]
           this_row = data[i+1:i+1+columns]
           if tag == '\x02':
             this_row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(last_row,this_row) )
           out_rows.append(this_row)
           last_row = this_row
         data = ''.join(out_rows)
   self.data = data
   self.rawdata = None
   return
Exemple #7
0
 def decode(self):
     """Decrypt and decode self.rawdata, storing the result in self.data.

     The raw bytes go through self.decipher (when set) and then through
     every filter returned by self.get_filters(), in order. A Flate stream
     that zlib cannot decompress is replaced by an empty string. After each
     filter, predictor 12 is applied when both 'Predictor' and 'Columns'
     are present in the decode parameters; only rows tagged with byte 2 are
     added to the previous row, other rows are copied unchanged.

     On return self.rawdata is reset to None.

     Raises:
         PDFNotImplementedError: for CCITTFax, /Crypt, an unknown filter,
             or a predictor other than 12.
     """
     assert self.data is None and self.rawdata is not None
     data = self.rawdata
     if self.decipher:
         # Handle encryption
         data = self.decipher(self.objid, self.genno, data)
     filters = self.get_filters()
     if not filters:
         self.data = data
         self.rawdata = None
         return
     for f in filters:
         if f in LITERALS_FLATE_DECODE:
             # will get errors if the document is encrypted.
             try:
                 data = zlib.decompress(data)
             except zlib.error:
                 data = ''
         elif f in LITERALS_LZW_DECODE:
             data = lzwdecode(data)
         elif f in LITERALS_ASCII85_DECODE:
             data = ascii85decode(data)
         elif f in LITERALS_ASCIIHEX_DECODE:
             data = asciihexdecode(data)
         elif f in LITERALS_RUNLENGTH_DECODE:
             data = rldecode(data)
         elif f in LITERALS_CCITTFAX_DECODE:
             # TODO: ccittfaxdecode(data) is not wired in yet.
             raise PDFNotImplementedError('Unsupported filter: %r' % f)
         elif f == LITERAL_CRYPT:
             # not yet..
             raise PDFNotImplementedError('/Crypt filter is unsupported')
         else:
             raise PDFNotImplementedError('Unsupported filter: %r' % f)
         # apply predictors
         params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
         if 'Predictor' in params and 'Columns' in params:
             pred = int_value(params['Predictor'])
             columns = int_value(params['Columns'])
             if pred:
                 if pred != 12:
                     raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
                 # A tag byte leads each `columns`-byte row. Accumulate the
                 # rows in a list: repeated += on a string is quadratic.
                 pieces = []
                 above = '\x00' * columns
                 for i in xrange(0, len(data), columns+1):
                     row_type = data[i]
                     line = data[i+1:i+1+columns]
                     if row_type == '\x02':
                         line = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(above,line) )
                     pieces.append(line)
                     above = line
                 data = ''.join(pieces)
     self.data = data
     self.rawdata = None
     return
class PDFStream(PDFObject):
    """A PDF stream: an attribute dictionary plus its payload.

    ``rawdata`` holds the payload as handed to the constructor and ``data``
    starts out as ``None``. ``objid`` and ``genno`` are unset until
    ``set_objid`` is called.
    """

    def __init__(self, attrs, rawdata, decipher=None):
        """Store the attribute dict, the raw payload and an optional decipher.

        ``decipher``, when given, is called by ``decode`` as
        ``decipher(objid, genno, data)``.
        """
        assert isinstance(attrs, dict)
        self.attrs = attrs
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        """Record the object id and generation number of this stream."""
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        if self.data is None:
            assert self.rawdata is not None
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
        else:
            return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        """Return attribute ``name``, or ``default`` when it is absent."""
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the value of the first of ``names`` present in the attributes."""
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self):
        """Return the filters ('F' or 'Filter' attribute) as a list, possibly empty."""
        filters = self.get_any(('F', 'Filter'))
        if not filters:
            return []
        if isinstance(filters, list):
            return filters
        return [filters]

    def decode(self):
        """Decrypt the raw payload and run it through the stream's filters.

        Without filters the (deciphered) payload is stored in ``self.data``
        and ``self.rawdata`` is cleared.

        NOTE(review): as visible here, the filter loop handles only Flate,
        LZW and ASCII85 and ends without storing its result in
        ``self.data``; the rest of the method appears to be missing from
        this excerpt - confirm against the full source.

        Raises:
            PDFException: when STRICT is set and zlib rejects a Flate stream
                (otherwise the stream becomes an empty string).
        """
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        filters = self.get_filters()
        if not filters:
            self.data = data
            self.rawdata = None
            return
        for f in filters:
            params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)
                except zlib.error as e:
                    if STRICT:
                        raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
                    data = ''
            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
Exemple #9
0
def getdictionary(file, followlinks=False):
    """Read a dictionary, and the stream that may follow it, from ``file``.

    Key/object pairs are read with ``readobject`` until either element of a
    pair reads back as '>'; that terminating pair is stored in the result as
    well. If the next word is 'stream', ``Length`` bytes are read and passed
    through each entry of ``Filter`` (FlateDecode, ASCII85Decode and
    LZWDecode are handled; other filters are only reported). A Flate stream
    that fails to decompress is logged and left undecoded.

    What happens to the decoded stream depends on ``Type``:
      * 'XRef'   - stored as-is under the "Stream" key;
      * 'ObjStm' - written to the file ".pdfaudit" and walked with
        ``iterateobjstm``; nothing is stored in the dictionary;
      * anything else - decoded as UTF-8 (errors ignored) and stored under
        "Stream".

    The dictionary is passed to ``checkdictionary`` before being returned.

    Note: ``followlinks`` is not used by the body; link following is decided
    per key through the module-level ``followlinkslist``.
    """
    # sequence of key - object pairs, of which at least the key is a /name (without slash)
    # it may be followed by a stream, which is encapsulated by the words "stream" and "endstream"
    # TODO: handling of specific keys and return values (Length, Size): from global to variables to returns from this function (as this function may be called recursively)
    # TODO: the stream itself needs to be checked "in the context to which it is referred to"
    global globaldictvaluesize  # TODO: from global variable to return'd value from this function
    vprint("[DICT]", 2, '')
    getkey = ""
    getobject = ""
    dictionary = {}
    while getkey != '>' and getobject != '>':
        getkey = readobject(file)
        getobject = readobject(file, getkey in followlinkslist)
        dictionary[getkey] = getobject
    nword, _ = getnexttwowords(file)
    if nword == 'stream':
        #TODO: followsymlinks handling in case stream can have symlinks
        length = num(dictionary.get("Length"))
        getword(file)  # actually read the word 'stream' (including trailing delimiter)
        file.seek(-1, 1)  # stream follows after 'stream\r\n' or 'stream\n'
        if ord(nextchar(file)) == 13:  # \r
            file.read(2)  # read 'stream\r\n'
        else:
            file.read(1)  # read 'stream\n'
        stream = file.read(length)
        vprint(" ", 2)
        vprint("[STREAM] " + str(length) + " bytes", 2)
        if "Filter" in dictionary:
            filterlist = dictionary.get("Filter")
            if isinstance(filterlist, str):
                filterlist = [filterlist]
            for streamfilter in filterlist:
                vprint("[DECODE]: " + streamfilter + " ", 2, '')
                if streamfilter == "FlateDecode":
                    #TODO: Predictor filters not implemented yet, see getxrefstream(). It is possible we encounter an xrefstream as we might not explicitly have searched and parsed one.
                    # Best effort: report a failed decompression and carry on
                    # with the undecoded stream. `except Exception` keeps
                    # KeyboardInterrupt/SystemExit propagating.
                    try:
                        stream = zlib.decompress(stream)
                        vprint(makeprintable(stream), 3)
                    except Exception:
                        try:
                            vprint(
                                "zlib error; streamlength: " +
                                str(len(stream)) + ", firstbyte: " +
                                str(stream[0]) + ", lastbyte: " +
                                str(stream[-1]) +
                                ", ZLIB runtime version: " +
                                zlib.ZLIB_RUNTIME_VERSION, 2)
                        except Exception:
                            vprint("No return stream given by zlib", 2)
                elif streamfilter == 'ASCII85Decode':
                    stream = ascii85decode(stream)
                elif streamfilter == 'LZWDecode':
                    stream = lzwdecode(stream)
                # TODO: CCITTFaxDecode is not handled; ccittfaxdecode(stream)
                # needs additional arguments.
                elif streamfilter == '/':
                    pass
                else:
                    vprint(
                        "Filter not implemented: " + streamfilter +
                        ", found in object: " + str(currentobject[0]) + " " +
                        str(currentobject[1]), 1)
                    # TODO: use counttable instead to give list of unimplemented filters with objects at the end of the scan.
                    # TODO: need to break here if multiple compressions are used of which one fails to prevent error out.
        getword(file)  # the word endstream
        vprint("[STREAM: end]", 2)
        if dictionary.get("Type") == "XRef":
            vprint("[XRef]", 2)
            dictionary["Stream"] = stream
        elif dictionary.get("Type") == "ObjStm":
            vprint("[STREAM]: open ObjStm", 2)
            # The context manager closes the scratch file even when
            # iterateobjstm raises.
            with open(".pdfaudit", "w+b") as f:
                f.write(stream)
                f.write(bytearray([13, 13, 13]))
                f.seek(0)
                iterateobjstm(f, num(dictionary.get("N")))
            #TODO: delete file
            vprint("[STREAM]: close ObjStm", 2)
        else:
            dictionary["Stream"] = stream.decode('utf-8', 'ignore')
        #TODO: followsymlinks handling in case stream can have symlinks
    vprint("[DICT: end]", 2, '')
    checkdictionary(dictionary)
    return dictionary  # TODO: check if we can return more here