def decompress_stream(self, key):
    """
    Decompress compressed object Streams.

    The raw object text is read from ``self.objects[key]`` and the payload
    between the ``stream`` and ``endstream`` keywords is extracted into
    ``self.buff``.  Every method name returned by
    ``self.get_compMethod(key, data)`` is then applied, in order, and the
    result is left in ``self.data``.  Nothing is returned.

    When no method is reported, the text preceding the ``stream`` keyword
    is passed to ``self.handle_evasion`` instead.

    Decoding is best effort: any exception raised along the way is
    swallowed, leaving ``self.data`` at whatever stage had been reached.
    """
    self.key = key
    try:
        data = self.objects[self.key]
        start = data.find("stream")
        end = data.find("endstream")
        payload_start = start + 6  # skip over the "stream" keyword itself
        if end >= payload_start:
            # Stop at "endstream" so the keyword (and anything after it)
            # is not fed to the decoders as if it were payload.
            self.buff = data[payload_start:end].strip()
        else:
            # No usable "endstream" marker: keep everything after "stream".
            self.buff = data[payload_start:].strip()
        if len(self.buff) < 2:
            return
        self.methods = self.get_compMethod(key, data)
        self.data = self.buff.strip()
        for method in self.methods:
            # The method currently being applied stays visible on the
            # instance, as before.
            self.method = method
            name = method.lower()
            if name == 'fl':
                self.data = decompress(self.data)
            elif name == 'ascii85decode':
                self.data = ascii85decode(self.data)
            elif name == 'asciihexdecode':
                self.data = asciihexdecode(self.data)
            elif name == 'lzwdecode':
                self.data = lzwdecode(self.data)
        if not self.methods:
            self.handle_evasion(key, data[:start])
    except Exception:
        # Deliberate best effort: a malformed stream must not abort the
        # caller, so decoding errors are ignored here.
        pass
def decode(self):
    """Run ``self.rawdata`` through the decipher and the stream filters.

    Must only be called while ``self.data`` is still ``None`` and
    ``self.rawdata`` is set.  If ``self.decipher`` is set it is applied
    first.  When ``self.get_filters()`` reports no filters, the
    (deciphered) bytes are stored in ``self.data`` and ``self.rawdata`` is
    cleared.

    Raises:
        PDFException: if Flate data is invalid and ``STRICT`` is true.
    """
    assert self.data is None and self.rawdata is not None
    data = self.rawdata
    if self.decipher:
        # Handle encryption
        data = self.decipher(self.objid, self.genno, data)
    filters = self.get_filters()
    if not filters:
        self.data = data
        self.rawdata = None
        return
    for f in filters:
        if isinstance(f, PDFObjRef):
            # An indirect reference: its resolved filters are appended to
            # the list being iterated, so they are processed later in this
            # same loop.
            filters += f.resolve()
            continue
        # Fetched here but not used in the visible part of this method.
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        if f in LITERALS_FLATE_DECODE:
            # will get errors if the document is encrypted.
            try:
                data = zlib.decompress(data)
            except zlib.error as e:
                if STRICT:
                    raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
                # Non-strict mode: treat undecodable data as empty.
                data = ''
        elif f in LITERALS_LZW_DECODE:
            data = lzwdecode(data)
    # NOTE(review): the visible body ends here without storing `data` back
    # on `self`; the remainder of this method appears to be missing --
    # confirm against the full source.
def decode(self):
    """Decode ``self.rawdata`` into ``self.data`` and clear ``self.rawdata``.

    Must only be called while ``self.data`` is still ``None`` and
    ``self.rawdata`` is set.  If ``self.decipher`` is set it is applied
    first; then every filter from ``self.get_filters()`` is applied in
    order.  After each filter the decode parameters are consulted and, if
    both ``Predictor`` and ``Columns`` are present, predictor 12 is undone
    row by row.

    Raises:
        PDFNotImplementedError: for CCITTFax, /Crypt or unknown filters,
            and for any non-zero predictor other than 12.
    """
    assert self.data is None and self.rawdata is not None
    data = self.rawdata
    if self.decipher:
        # Handle encryption
        data = self.decipher(self.objid, self.genno, data)
    filters = self.get_filters()
    if not filters:
        self.data = data
        self.rawdata = None
        return
    for f in filters:
        if f in LITERALS_FLATE_DECODE:
            # will get errors if the document is encrypted.
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Undecodable Flate data is treated as an empty stream.
                data = ''
        elif f in LITERALS_LZW_DECODE:
            data = lzwdecode(data)
        elif f in LITERALS_ASCII85_DECODE:
            data = ascii85decode(data)
        elif f in LITERALS_ASCIIHEX_DECODE:
            data = asciihexdecode(data)
        elif f in LITERALS_RUNLENGTH_DECODE:
            data = rldecode(data)
        elif f in LITERALS_CCITTFAX_DECODE:
            raise PDFNotImplementedError('Unsupported filter: %r' % f)
        elif f == LITERAL_CRYPT:
            # not yet..
            raise PDFNotImplementedError('/Crypt filter is unsupported')
        else:
            raise PDFNotImplementedError('Unsupported filter: %r' % f)
        # apply predictors
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        if 'Predictor' in params and 'Columns' in params:
            pred = int_value(params['Predictor'])
            columns = int_value(params['Columns'])
            if pred:
                if pred != 12:
                    raise PDFNotImplementedError(
                        'Unsupported predictor: %r' % pred)
                # Each encoded row is one tag byte followed by `columns`
                # data bytes.  Rows are collected in a list and joined once
                # instead of growing a string with repeated `+=`.
                rows = []
                prev_row = '\x00' * columns
                for i in range(0, len(data), columns + 1):
                    row_tag = data[i]
                    row = data[i + 1:i + 1 + columns]
                    if row_tag == '\x02':
                        # Tag 2: add the previous decoded row, modulo 256.
                        row = ''.join(
                            chr((ord(a) + ord(b)) & 255)
                            for (a, b) in zip(prev_row, row))
                    # Any other tag leaves the row bytes unchanged.
                    rows.append(row)
                    prev_row = row
                data = ''.join(rows)
    self.data = data
    self.rawdata = None
    return
def decode(self):
    """Turn ``self.rawdata`` into decoded ``self.data``.

    Preconditions (asserted): ``self.data`` is ``None`` and
    ``self.rawdata`` is not.  The optional ``self.decipher`` callback runs
    first, then each filter from ``self.get_filters()`` in order; after
    each filter a ``Predictor`` entry in the decode parameters, if any, is
    applied.  On completion ``self.rawdata`` is reset to ``None``.

    Raises:
        PDFException: invalid Flate data while ``STRICT`` is true.
        PDFNotImplementedError: /Crypt or unknown filter, or a predictor
            other than 1 that is below 10.
    """
    assert self.data is None and self.rawdata is not None
    payload = self.rawdata
    if self.decipher:
        # Handle encryption
        payload = self.decipher(self.objid, self.genno, payload, self.attrs)
    filter_chain = self.get_filters()
    if filter_chain:
        for filt in filter_chain:
            params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
            if filt in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    payload = zlib.decompress(payload)
                except zlib.error as e:
                    if STRICT:
                        raise PDFException(
                            'Invalid zlib bytes: %r, %r' % (e, payload))
                    payload = ''
            elif filt in LITERALS_LZW_DECODE:
                payload = lzwdecode(payload)
            elif filt in LITERALS_ASCII85_DECODE:
                payload = ascii85decode(payload)
            elif filt in LITERALS_ASCIIHEX_DECODE:
                payload = asciihexdecode(payload)
            elif filt in LITERALS_RUNLENGTH_DECODE:
                payload = rldecode(payload)
            elif filt in LITERALS_CCITTFAX_DECODE:
                payload = ccittfaxdecode(payload, params)
            elif filt in LITERALS_DCT_DECODE:
                # This is probably a JPG stream - it does not need to be
                # decoded twice.  Just return the stream to the user.
                pass
            elif filt == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % filt)
            # apply predictors
            if 'Predictor' not in params:
                continue
            pred = int_value(params['Predictor'])
            if pred == 1:
                # no predictor
                continue
            if pred < 10:
                raise PDFNotImplementedError(
                    'Unsupported predictor: %r' % pred)
            # PNG predictor
            colors = int_value(params.get('Colors', 1))
            columns = int_value(params.get('Columns', 1))
            bitspercomponent = int_value(params.get('BitsPerComponent', 8))
            payload = apply_png_predictor(
                pred, colors, columns, bitspercomponent, payload)
    self.data = payload
    self.rawdata = None
    return
def decode(self):
    """Decode ``self.rawdata`` into ``self.data`` and clear ``self.rawdata``.

    Must only be called while ``self.data`` is still ``None`` and
    ``self.rawdata`` is set.  If ``self.decipher`` is set it is applied
    first; then every filter from ``self.get_filters()`` is applied in
    order.  After each filter the decode parameters are consulted and, if
    both ``Predictor`` and ``Columns`` are present, predictor 12 is undone
    row by row.

    Raises:
        PDFException: if Flate data is invalid and ``STRICT`` is true.
        PDFNotImplementedError: for CCITTFax, /Crypt or unknown filters,
            and for any non-zero predictor other than 12.
    """
    assert self.data is None and self.rawdata is not None
    data = self.rawdata
    if self.decipher:
        # Handle encryption
        data = self.decipher(self.objid, self.genno, data)
    filters = self.get_filters()
    if not filters:
        self.data = data
        self.rawdata = None
        return
    for f in filters:
        if f in LITERALS_FLATE_DECODE:
            # will get errors if the document is encrypted.
            try:
                data = zlib.decompress(data)
            except zlib.error as e:
                if STRICT:
                    raise PDFException("Invalid zlib bytes: %r, %r" % (e, data))
                # Non-strict mode: treat undecodable data as empty.
                data = ""
        elif f in LITERALS_LZW_DECODE:
            data = lzwdecode(data)
        elif f in LITERALS_ASCII85_DECODE:
            data = ascii85decode(data)
        elif f in LITERALS_ASCIIHEX_DECODE:
            data = asciihexdecode(data)
        elif f in LITERALS_RUNLENGTH_DECODE:
            data = rldecode(data)
        elif f in LITERALS_CCITTFAX_DECODE:
            raise PDFNotImplementedError("Unsupported filter: %r" % f)
        elif f == LITERAL_CRYPT:
            # not yet..
            raise PDFNotImplementedError("/Crypt filter is unsupported")
        else:
            raise PDFNotImplementedError("Unsupported filter: %r" % f)
        # apply predictors
        params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
        if "Predictor" in params and "Columns" in params:
            pred = int_value(params["Predictor"])
            columns = int_value(params["Columns"])
            if pred:
                if pred != 12:
                    raise PDFNotImplementedError("Unsupported predictor: %r" % pred)
                # Each encoded row is one tag byte followed by `columns`
                # data bytes.  Rows are collected in a list and joined once
                # instead of growing a string with repeated `+=`.
                rows = []
                prev_row = "\x00" * columns
                for i in range(0, len(data), columns + 1):
                    row_tag = data[i]
                    row = data[i + 1 : i + 1 + columns]
                    if row_tag == "\x02":
                        # Tag 2: add the previous decoded row, modulo 256.
                        row = "".join(
                            chr((ord(a) + ord(b)) & 255)
                            for (a, b) in zip(prev_row, row)
                        )
                    # Any other tag leaves the row bytes unchanged.
                    rows.append(row)
                    prev_row = row
                data = "".join(rows)
    self.data = data
    self.rawdata = None
    return
def decode(self):
    """Decode ``self.rawdata`` into ``self.data`` and clear ``self.rawdata``.

    Must only be called while ``self.data`` is still ``None`` and
    ``self.rawdata`` is set.  If ``self.decipher`` is set it is applied
    first; then every filter from ``self.get_filters()`` is applied in
    order.  After each filter the decode parameters are consulted and, if
    both ``Predictor`` and ``Columns`` are present, predictor 12 is undone
    row by row.

    Raises:
        PDFNotImplementedError: for CCITTFax, /Crypt or unknown filters,
            and for any non-zero predictor other than 12.
    """
    assert self.data is None and self.rawdata is not None
    data = self.rawdata
    if self.decipher:
        # Handle encryption
        data = self.decipher(self.objid, self.genno, data)
    filters = self.get_filters()
    if not filters:
        self.data = data
        self.rawdata = None
        return
    for f in filters:
        if f in LITERALS_FLATE_DECODE:
            # will get errors if the document is encrypted.
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Undecodable Flate data is treated as an empty stream.
                data = ''
        elif f in LITERALS_LZW_DECODE:
            data = lzwdecode(data)
        elif f in LITERALS_ASCII85_DECODE:
            data = ascii85decode(data)
        elif f in LITERALS_ASCIIHEX_DECODE:
            data = asciihexdecode(data)
        elif f in LITERALS_RUNLENGTH_DECODE:
            data = rldecode(data)
        elif f in LITERALS_CCITTFAX_DECODE:
            raise PDFNotImplementedError('Unsupported filter: %r' % f)
        elif f == LITERAL_CRYPT:
            # not yet..
            raise PDFNotImplementedError('/Crypt filter is unsupported')
        else:
            raise PDFNotImplementedError('Unsupported filter: %r' % f)
        # apply predictors
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        if 'Predictor' in params and 'Columns' in params:
            pred = int_value(params['Predictor'])
            columns = int_value(params['Columns'])
            if pred:
                if pred != 12:
                    raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
                # Each encoded row is one tag byte followed by `columns`
                # data bytes.  Rows are collected in a list and joined once
                # instead of growing a string with repeated `+=`.
                decoded_rows = []
                above = '\x00' * columns
                for i in range(0, len(data), columns + 1):
                    tag = data[i]
                    current = data[i + 1:i + 1 + columns]
                    if tag == '\x02':
                        # Tag 2: add the previous decoded row, modulo 256.
                        current = ''.join(
                            chr((ord(a) + ord(b)) & 255)
                            for (a, b) in zip(above, current))
                    # Any other tag leaves the row bytes unchanged.
                    decoded_rows.append(current)
                    above = current
                data = ''.join(decoded_rows)
    self.data = data
    self.rawdata = None
    return
def lzwDecode(stream, parameters):
    '''
    Method to decode streams using the LZW algorithm

    @param stream: A PDF stream
    @param parameters: Mapping of decode parameters ('/Predictor', '/Columns', '/Colors', '/BitsPerComponent') whose values expose getRawValue(); may be None or empty
    @return: A tuple (status,statusContent), where statusContent is the decoded PDF stream in case status = 0 or an error in case status = -1
    '''
    try:
        decodedStream = lzw.lzwdecode(stream)
    except Exception:
        # Any decoder failure is reported through the status tuple.
        return (-1, 'Error decompressing string')
    if parameters is None or parameters == {}:
        return (0, decodedStream)
    if '/Predictor' in parameters:
        predictor = parameters['/Predictor'].getRawValue()
    else:
        predictor = 1
    # Columns = number of samples per row
    if '/Columns' in parameters:
        columns = parameters['/Columns'].getRawValue()
    else:
        columns = 1
    # Colors = number of components per sample
    if '/Colors' in parameters:
        colors = parameters['/Colors'].getRawValue()
        if colors < 1:
            colors = 1
    else:
        colors = 1
    # BitsPerComponent: number of bits per color component
    if '/BitsPerComponent' in parameters:
        bits = parameters['/BitsPerComponent'].getRawValue()
        if bits not in [1, 2, 4, 8, 16]:
            bits = 8
    else:
        bits = 8
    # '/EarlyChange' is not consulted: lzw.lzwdecode() is called with the
    # stream only, so the value would have no effect here.
    if predictor is not None and predictor != 1:
        return post_prediction(decodedStream, predictor, columns, colors, bits)
    return (0, decodedStream)
def lzwDecode(stream, parameters):
    '''
    Method to decode streams using the LZW algorithm

    @param stream: A PDF stream
    @param parameters: Mapping of decode parameters ('/Predictor', '/Columns', '/Colors', '/BitsPerComponent') whose values expose getRawValue(); may be None or empty
    @return: A tuple (status,statusContent), where statusContent is the decoded PDF stream in case status = 0 or an error in case status = -1
    '''
    try:
        decodedStream = lzw.lzwdecode(stream)
    except Exception:
        # Any decoder failure is reported through the status tuple.
        return (-1, 'Error decompressing string')
    if parameters is None or parameters == {}:
        return (0, decodedStream)

    # Predictor defaults to 1, which means "no prediction".
    predictor = 1
    if '/Predictor' in parameters:
        predictor = parameters['/Predictor'].getRawValue()

    # Columns = number of samples per row
    columns = 1
    if '/Columns' in parameters:
        columns = parameters['/Columns'].getRawValue()

    # Colors = number of components per sample
    colors = 1
    if '/Colors' in parameters:
        colors = parameters['/Colors'].getRawValue()
        if colors < 1:
            colors = 1

    # BitsPerComponent: number of bits per color component
    bits = 8
    if '/BitsPerComponent' in parameters:
        bits = parameters['/BitsPerComponent'].getRawValue()
        if bits not in [1, 2, 4, 8, 16]:
            bits = 8

    # '/EarlyChange' is not consulted: lzw.lzwdecode() is called with the
    # stream only, so the value would have no effect here.
    if predictor is not None and predictor != 1:
        return post_prediction(decodedStream, predictor, columns, colors, bits)
    return (0, decodedStream)
def lzwDecode(stream, parameters):
    """
    Method to decode streams using the LZW algorithm

    @param stream: A PDF stream
    @param parameters: Mapping of decode parameters ("/Predictor", "/Columns", "/Colors", "/BitsPerComponent") whose values expose getRawValue(); may be None or empty
    @return: A tuple (status,statusContent), where statusContent is the decoded PDF stream in case status = 0 or an error in case status = -1
    """
    try:
        decodedStream = lzw.lzwdecode(stream)
    except Exception:
        # Any decoder failure is reported through the status tuple.
        return (-1, "Error decompressing string")
    if parameters is None or parameters == {}:
        return (0, decodedStream)
    if "/Predictor" in parameters:
        predictor = parameters["/Predictor"].getRawValue()
    else:
        predictor = 1
    # Columns = number of samples per row
    if "/Columns" in parameters:
        columns = parameters["/Columns"].getRawValue()
    else:
        columns = 1
    # Colors = number of components per sample
    if "/Colors" in parameters:
        colors = parameters["/Colors"].getRawValue()
        if colors < 1:
            colors = 1
    else:
        colors = 1
    # BitsPerComponent: number of bits per color component
    if "/BitsPerComponent" in parameters:
        bits = parameters["/BitsPerComponent"].getRawValue()
        if bits not in [1, 2, 4, 8, 16]:
            bits = 8
    else:
        bits = 8
    # "/EarlyChange" is not consulted: lzw.lzwdecode() is called with the
    # stream only, so the value would have no effect here.
    if predictor is not None and predictor != 1:
        return post_prediction(decodedStream, predictor, columns, colors, bits)
    return (0, decodedStream)
def decode(self):
    """Run ``self.rawdata`` through the decipher and the stream filters.

    Must only be called while ``self.data`` is still ``None`` and
    ``self.rawdata`` is set.  If ``self.decipher`` is set it is applied
    first.  When ``self.get_filters()`` reports no filters, the
    (deciphered) bytes are stored in ``self.data`` and ``self.rawdata`` is
    cleared.

    Raises:
        PDFException: if Flate data is invalid and ``STRICT`` is true.
    """
    assert self.data is None and self.rawdata is not None
    data = self.rawdata
    if self.decipher:
        # Handle encryption
        data = self.decipher(self.objid, self.genno, data)
    filters = self.get_filters()
    if not filters:
        self.data = data
        self.rawdata = None
        return
    for f in filters:
        if f in LITERALS_FLATE_DECODE:
            # will get errors if the document is encrypted.
            try:
                data = zlib.decompress(data)
            except zlib.error as e:
                if STRICT:
                    raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
                # Non-strict mode: treat undecodable data as empty.
                data = ''
        elif f in LITERALS_LZW_DECODE:
            data = lzwdecode(data)
    # NOTE(review): the visible body ends here without storing `data` back
    # on `self`; the remainder of this method appears to be missing --
    # confirm against the full source.
def getdictionary(file, followlinks=False):
    """Read a dictionary, and the stream that may follow it, from ``file``.

    A dictionary is a sequence of key - object pairs, of which at least the
    key is a /name (without slash).  It may be followed by a stream, which
    is encapsulated by the words "stream" and "endstream".

    Pairs are read with ``readobject`` until either the key or the object
    reads back as '>'.  If the next word is 'stream', ``Length`` bytes are
    read and passed through every filter named under ``Filter``.  The
    result is stored under the "Stream" key: raw for ``Type`` "XRef",
    decoded as UTF-8 (undecodable bytes ignored) for other types.  For
    ``Type`` "ObjStm" the stream is written to the scratch file
    ".pdfaudit" and handed to ``iterateobjstm``; no "Stream" key is set.

    Args:
        file: Seekable binary file object positioned at the dictionary.
        followlinks: Accepted but not used in this function body.

    Returns:
        The dictionary, after it has been passed to ``checkdictionary``.
    """
    # TODO: handling of specific keys and return values (Length, Size): from global to variables to returns from this function (as this function may be called recursively)
    # TODO: the stream itself needs to be checked "in the context to which it is referred to"
    global globaldictvaluesize  # TODO: from global variable to return'd value from this function
    vprint("[DICT]", 2, '')
    getkey = ""
    getobject = ""
    dictionary = {}
    while getkey != '>' and getobject != '>':
        getkey = readobject(file)
        getobject = readobject(file, getkey in followlinkslist)
        dictionary[getkey] = getobject
    nword, nnword = getnexttwowords(file)
    if nword == 'stream':
        # TODO: followsymlinks handling in case stream can have symlinks
        length = num(dictionary.get("Length"))
        getword(file)  # actually read the word 'stream' (including trailing delimiter)
        file.seek(-1, 1)
        # stream follows after 'stream\r\n' or 'stream\n'
        if ord(nextchar(file)) == 13:  # \r
            file.read(2)  # read 'stream\r\n'
        else:
            file.read(1)  # read 'stream\n'
        stream = file.read(length)
        vprint(" ", 2)
        vprint("[STREAM] " + str(length) + " bytes", 2)
        if "Filter" in dictionary:
            filterlist = dictionary.get("Filter")
            if isinstance(filterlist, str):
                # A single filter name is treated as a one-element list.
                filterlist = [filterlist]
            for streamfilter in filterlist:
                vprint("[DECODE]: " + streamfilter + " ", 2, '')
                if streamfilter == "FlateDecode":
                    # TODO: Predictor filters not implemented yet, see getxrefstream(). It is possible we encounter an xrefstream as we might not explicitly have searched and parsed one.
                    try:
                        stream = zlib.decompress(stream)
                        vprint(makeprintable(stream), 3)
                    except Exception:
                        # Best effort: report the failure and carry on with
                        # the stream as it is.
                        try:
                            vprint(
                                "zlib error; streamlength: " + str(len(stream)) +
                                ", firstbyte: " + str(stream[0]) +
                                ", lastbyte: " + str(stream[len(stream) - 1]) +
                                ", ZLIB runtime version: " +
                                zlib.ZLIB_RUNTIME_VERSION, 2)
                        except Exception:
                            # Building the diagnostic itself failed (for
                            # example on an empty stream).
                            vprint("No return stream given by zlib", 2)
                elif streamfilter == 'ASCII85Decode':
                    stream = ascii85decode(stream)
                elif streamfilter == 'LZWDecode':
                    stream = lzwdecode(stream)
                # TODO: CCITTFaxDecode is not handled; ccittfaxdecode(stream) needs additional arguments
                elif streamfilter == '/':
                    pass
                else:
                    vprint(
                        "Filter not implemented: " + streamfilter +
                        ", found in object: " + str(currentobject[0]) + " " +
                        str(currentobject[1]), 1)
                    # TODO: use counttable instead to give list of unimplemented filters with objects at the end of the scan.
                    # TODO: need to break here if multiple compressions are used of which one fails to prevent error out.
        getword(file)  # the word endstream
        vprint("[STREAM: end]", 2)
        if dictionary.get("Type") == "XRef":
            vprint("[XRef]", 2)
            dictionary["Stream"] = stream
        elif dictionary.get("Type") == "ObjStm":
            vprint("[STREAM]: open ObjStm", 2)
            # The context manager closes the scratch file even when
            # iterateobjstm() raises.
            with open(".pdfaudit", "w+b") as f:
                f.write(stream)
                f.write(bytearray([13, 13, 13]))
                f.seek(0)
                iterateobjstm(f, num(dictionary.get("N")))
            # TODO: delete file
            vprint("[STREAM]: close ObjStm", 2)
        else:
            dictionary["Stream"] = stream.decode('utf-8', 'ignore')
            # TODO: followsymlinks handling in case stream can have symlinks
    vprint("[DICT: end]", 2, '')
    checkdictionary(dictionary)
    return dictionary  # TODO: check if we can return more here