def handle_pdf(self, url, content):
    """Analyse a PDF payload with peepdf and log the resulting report.

    The content is first classified with ``log.ThugLogging.build_sample``;
    anything that is not reported as a ``'PDF'`` sample is ignored. The
    payload is written to a temporary file because ``PDFParser.parse`` is
    given a path, and that file is always removed before returning.

    Args:
        url: URL the content was fetched from; forwarded to the sample
            builder, the XML report generator and ``swf_mastah``.
        content: Raw bytes of the downloaded document.

    Returns:
        ``None`` when the content is not a PDF sample, ``False`` when peepdf
        fails to parse it, ``True`` once the analysis has been logged.
    """
    sample = log.ThugLogging.build_sample(content, url)
    if sample is None or sample['type'] not in ('PDF', ):
        return

    # mkstemp() returns an already-open OS-level descriptor. Wrap it in a
    # file object so it is closed instead of being leaked.
    fd, rfile = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as handle:
            handle.write(content)

        pdfparser = PDFParser()
        try:
            _, pdf = pdfparser.parse(rfile, forceMode=True, looseMode=True)
        except Exception:  # pylint:disable=broad-except
            return False

        statsDict = pdf.getStats()
        analysis = self.getPeepXML(statsDict, url)

        log_dir = os.path.join(log.ThugLogging.baseDir, "analysis", "pdf")
        log.ThugLogging.log_peepdf(log_dir, sample, analysis)

        self.swf_mastah(pdf, statsDict, url)
        return True
    finally:
        # Runs on every path above, including failures while building or
        # logging the report, so the temporary file never lingers.
        os.remove(rfile)
def DumpStream(path, objid):
    """Print the stream of object *objid* from the PDF stored at *path*.

    Diagnostics and the stream itself are written to standard output.
    ``print`` is called as a function with a single argument, which behaves
    the same on Python 2 and Python 3.

    Args:
        path: Filesystem path of the PDF document.
        objid: Identifier of the object whose stream should be dumped.

    Returns:
        ``0`` on success, ``2`` when the parser yields no document, ``1`` for
        every other failure (missing file, missing object, object that is
        not a stream, or any exception raised while parsing).
    """
    if not os.path.isfile(path):
        print('{0} not a file!'.format(path))
        return 1

    try:
        pdfParser = PDFParser()
        _, pdf = pdfParser.parse(path, True)
        if not pdf:
            return 2

        # get object
        obj = pdf.getObject(objid, None)
        if not obj:
            print('{0} stream not found!'.format(objid))
            return 1
        if obj.getType() != 'stream':
            print('{0} is not a stream!'.format(objid))
            return 1

        value = obj.getStream()
        # getStream() result of -1 is treated as "nothing to print".
        if value != -1:
            print(value)
    except Exception as ex:
        print(str(ex))
        return 1
    return 0
def DumpStream(path, objid):
    """Write the stream of PDF object *objid* found in *path* to stdout.

    Output uses the single-argument ``print(...)`` form so the function is
    valid on both Python 2 and Python 3.

    Args:
        path: Filesystem path of the PDF document.
        objid: Identifier of the object whose stream should be dumped.

    Returns:
        ``0`` on success; ``2`` when parsing produces no document; ``1`` when
        *path* is not a file, the object is missing or is not a stream, or
        an exception is raised along the way.
    """
    if not os.path.isfile(path):
        print('{0} not a file!'.format(path))
        return 1

    try:
        _, pdf = PDFParser().parse(path, True)
        if not pdf:
            return 2

        # get object
        obj = pdf.getObject(objid, None)
        if not obj:
            print('{0} stream not found!'.format(objid))
            return 1

        if obj.getType() != 'stream':
            print('{0} is not a stream!'.format(objid))
            return 1

        value = obj.getStream()
        # A getStream() result of -1 is skipped rather than printed.
        if value != -1:
            print(value)
    except Exception as ex:
        print(str(ex))
        return 1

    return 0
def handle_pdf(self, url, content):
    """Analyse a PDF payload with peepdf and store the XML report on disk.

    The payload is written to ``<baseDir>/<md5 of content>`` so that peepdf
    can parse it from a path. The XML produced by ``self.getPeepXML`` is
    saved as ``<baseDir>/analysis/pdf/<MD5>.xml``, and the temporary copy of
    the payload is always removed afterwards.

    Args:
        url: URL the content was fetched from; forwarded to ``getPeepXML``.
        content: Raw bytes of the downloaded document.

    Returns:
        ``False`` when peepdf fails to parse the document, ``True`` once the
        report has been written.
    """
    md5sum = hashlib.md5(content).hexdigest()

    rfile = os.path.join(log.ThugLogging.baseDir, md5sum)
    with open(rfile, 'wb') as fd:
        fd.write(content)

    try:
        pdfparser = PDFParser()
        try:
            _, pdf = pdfparser.parse(rfile, forceMode=True, looseMode=True)
        except Exception:  # pylint:disable=broad-except
            return False

        statsDict = pdf.getStats()
        analysis = self.getPeepXML(statsDict, url)

        pdflogdir = os.path.join(log.ThugLogging.baseDir, "analysis", "pdf")
        try:
            os.makedirs(pdflogdir)
        except OSError:
            # Typically the directory already exists. A genuine failure
            # surfaces just below when the report file is opened.
            pass

        report = os.path.join(pdflogdir, "%s.xml" % (statsDict["MD5"], ))
        with open(report, 'wb') as fd:
            fd.write(analysis)
        return True
    finally:
        # Remove the temporary payload on every path, including errors
        # raised while building or writing the report.
        os.remove(rfile)
def handle_pdf(self, url, content):
    """Run peepdf over a PDF payload and write its XML report to disk.

    A copy of the payload named after its MD5 digest is created under
    ``log.ThugLogging.baseDir`` for peepdf to parse, and is deleted again
    before this method returns. The report is written to
    ``<baseDir>/analysis/pdf/<MD5>.xml``.

    Args:
        url: URL the content was fetched from; forwarded to ``getPeepXML``.
        content: Raw bytes of the downloaded document.

    Returns:
        ``False`` when peepdf fails to parse the document, ``True`` once the
        report has been written.
    """
    digest = hashlib.md5()
    digest.update(content)

    rfile = os.path.join(log.ThugLogging.baseDir, digest.hexdigest())
    with open(rfile, 'wb') as fd:
        fd.write(content)

    try:
        try:
            _, pdf = PDFParser().parse(rfile, forceMode=True, looseMode=True)
        except Exception:  # pylint:disable=broad-except
            return False

        statsDict = pdf.getStats()
        analysis = self.getPeepXML(statsDict, url)

        pdflogdir = os.path.join(log.ThugLogging.baseDir, "analysis", "pdf")
        try:
            os.makedirs(pdflogdir)
        except OSError:
            # Usually "already exists". If the directory really could not
            # be created, opening the report below raises.
            pass

        report = os.path.join(pdflogdir, "%s.xml" % (statsDict["MD5"], ))
        with open(report, 'wb') as fd:
            fd.write(analysis)
        return True
    finally:
        # Guarantees the temporary payload is removed even when report
        # generation or writing fails.
        os.remove(rfile)
def handle_pdf(self, url, content):
    """Analyse a PDF payload with peepdf and store the XML report.

    The payload is saved through ``log.ThugLogging.store_content`` under a
    name derived from its MD5 digest, parsed with peepdf, and the XML from
    ``self.getPeepXML`` is stored as ``analysis/pdf/<MD5>.xml`` below
    ``log.ThugLogging.baseDir``. ``self.swf_mastah`` is then run on the
    parsed document. The stored payload is always removed before returning.

    Args:
        url: URL the content was fetched from; forwarded to ``getPeepXML``.
        content: Raw bytes of the downloaded document.

    Returns:
        ``False`` when peepdf fails to parse the document, ``True`` when the
        whole analysis completed.
    """
    md5sum = hashlib.md5(content).hexdigest()
    rfile = log.ThugLogging.store_content(log.ThugLogging.baseDir, md5sum, content)

    try:
        pdfparser = PDFParser()
        try:
            _, pdf = pdfparser.parse(rfile, forceMode=True, looseMode=True)
        except Exception:  # pylint:disable=broad-except
            return False

        statsDict = pdf.getStats()
        analysis = self.getPeepXML(statsDict, url)

        pdflogdir = os.path.join(log.ThugLogging.baseDir, "analysis", "pdf")
        try:
            os.makedirs(pdflogdir)
        except OSError:
            # Best effort: usually the directory already exists.
            pass

        log.ThugLogging.store_content(pdflogdir, "%s.xml" % (statsDict["MD5"], ), analysis)
        self.swf_mastah(pdf, statsDict)
        return True
    finally:
        # Remove the stored payload on every path, including failures in
        # the report or swf_mastah steps.
        os.remove(rfile)
def handle_pdf(self, url, content):
    """Run peepdf over a PDF payload and log the XML analysis.

    Content that ``log.ThugLogging.build_sample`` does not classify as a
    ``'PDF'`` sample is skipped. Otherwise the bytes are written to a
    temporary file, parsed with peepdf, logged through
    ``log.ThugLogging.log_peepdf`` and handed to ``self.swf_mastah``. The
    temporary file is removed on every exit path.

    Args:
        url: URL the content was fetched from.
        content: Raw bytes of the downloaded document.

    Returns:
        ``None`` when the content is not a PDF sample, ``False`` when peepdf
        fails to parse it, ``True`` once the analysis has been logged.
    """
    sample = log.ThugLogging.build_sample(content, url)
    if sample is None or sample['type'] not in ('PDF', ):
        return

    # The descriptor from mkstemp() is already open; reuse it through
    # os.fdopen() so it gets closed rather than leaked.
    fd, rfile = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as handle:
            handle.write(content)

        try:
            _, pdf = PDFParser().parse(rfile, forceMode=True, looseMode=True)
        except Exception:  # pylint:disable=broad-except
            return False

        statsDict = pdf.getStats()
        analysis = self.getPeepXML(statsDict, url)

        log_dir = os.path.join(log.ThugLogging.baseDir, "analysis", "pdf")
        log.ThugLogging.log_peepdf(log_dir, sample, analysis)

        self.swf_mastah(pdf, statsDict, url)
        return True
    finally:
        # Cleanup happens here so that errors after parsing cannot leave
        # the temporary file behind.
        os.remove(rfile)
def handle_pdf(self, url, content):
    """Run peepdf over a PDF payload and store the XML report.

    The payload is stored via ``log.ThugLogging.store_content`` using its
    MD5 digest as the file name. After parsing, the XML report is stored as
    ``analysis/pdf/<MD5>.xml`` below ``log.ThugLogging.baseDir`` and
    ``self.swf_mastah`` is invoked on the parsed document. The stored
    payload is deleted before returning, whatever the outcome.

    Args:
        url: URL the content was fetched from; forwarded to ``getPeepXML``.
        content: Raw bytes of the downloaded document.

    Returns:
        ``False`` when peepdf fails to parse the document, ``True`` when the
        whole analysis completed.
    """
    digest = hashlib.md5()
    digest.update(content)
    rfile = log.ThugLogging.store_content(log.ThugLogging.baseDir, digest.hexdigest(), content)

    try:
        try:
            _, pdf = PDFParser().parse(rfile, forceMode=True, looseMode=True)
        except Exception:  # pylint:disable=broad-except
            return False

        statsDict = pdf.getStats()
        analysis = self.getPeepXML(statsDict, url)

        pdflogdir = os.path.join(log.ThugLogging.baseDir, "analysis", "pdf")
        try:
            os.makedirs(pdflogdir)
        except OSError:
            # Best effort: the directory normally exists already.
            pass

        log.ThugLogging.store_content(pdflogdir, "%s.xml" % (statsDict["MD5"], ), analysis)
        self.swf_mastah(pdf, statsDict)
        return True
    finally:
        # Always drop the stored payload, even if a later step raised.
        os.remove(rfile)
def fakeFile_check(filePath):
    """Parse *filePath* with peepdf and return the parsed document.

    The peepdf import happens inside the ``try`` so that a missing peepdf
    installation is handled like any other failure.

    Args:
        filePath: Path of the file to parse.

    Returns:
        The second element of ``PDFParser().parse(filePath)``, or ``None``
        when importing peepdf or parsing raises an ``Exception``.
    """
    try:
        from peepdf.PDFCore import PDFParser
        _status, document = PDFParser().parse(filePath)
    except Exception:
        return None
    else:
        return document
def get_streams():
    """List the stream objects of the PDF opened in the current session.

    This function is brutally ripped from Brandon Dixon's swf_mastah.py.

    Every object of type ``"stream"`` yields one entry of the form
    ``[object id, offset, size, get_type(decoded stream)[:100]]``. When
    ``arg_open`` or ``arg_dump`` is set, the stripped decoded stream is also
    written to disk and the dump path is appended to the entry.

    Returns:
        A list with one entry per stream, in iteration order.
    """
    # Initialize peepdf parser and parse the currently opened PDF document.
    parser = PDFParser()
    _, pdf = parser.parse(__sessions__.current.file.path, True, False)

    # Generate statistics.
    stats = pdf.getStats()

    results = []

    # "Versions" is the per-revision list and lines up with pdf.body.
    # "Version" is the PDF version string, whose length has nothing to do
    # with the number of bodies.
    for version in range(len(stats["Versions"])):
        objects = pdf.body[version].objects

        for index in objects:
            entry = objects[index]
            oid = entry.id
            offset = entry.offset
            size = entry.size
            details = entry.object

            if details.type == "stream":
                decoded_stream = details.decodedStream

                result = [oid, offset, size, get_type(decoded_stream)[:100]]

                # If the stream needs to be dumped or opened, we do it
                # and expand the results with the path to the stream dump.
                if arg_open or arg_dump:
                    # If instructed to dump, we already have a base folder.
                    if arg_dump:
                        folder = arg_dump
                    # Otherwise we just generate a temporary one.
                    else:
                        folder = tempfile.mkdtemp()

                    # Dump stream to this path.
                    # TODO: sometimes there appear to be multiple streams
                    # with the same object ID. Is that even possible?
                    # It will cause conflicts.
                    dump_path = "{0}/{1}_{2}_stream.bin".format(folder, __sessions__.current.file.md5, oid)

                    with open(dump_path, "wb") as handle:
                        handle.write(decoded_stream.strip())

                    # Add dump path to the stream attributes.
                    result.append(dump_path)

                # Update list of streams.
                results.append(result)

    return results
def get_data(f_list):
    """Copy every entry of *f_list* that peepdf can parse to the classify folder.

    Each name is resolved as ``file_root + name`` and copied to
    ``os.getcwd() + '/' + file_classify + name``. Entries that fail to parse
    or to copy are skipped silently.

    Args:
        f_list: Iterable of file names relative to ``file_root``.

    Returns:
        The destination prefix ``os.getcwd() + '/' + file_classify``.
    """
    # Computed once up front: it does not depend on the loop variable, and
    # it keeps the return value defined when f_list is empty or no entry
    # parses.
    newfile = os.getcwd() + '/' + file_classify

    for i in f_list:
        T_file = file_root + i
        pdfParser = PDFParser()
        try:
            _, pdf = pdfParser.parse(T_file)
            # Copy the file to the new folder (use shutil.move to move it
            # instead). copyfile is used because T_file is a regular file;
            # copytree only accepts directories.
            shutil.copyfile(T_file, newfile + i)
        except Exception:
            continue
    return newfile
def get_data(f_list):
    """Delete every entry of *f_list* that peepdf cannot parse.

    Each name is resolved as ``file_root + name``. When parsing raises, the
    file is removed from disk and its name is printed.

    Args:
        f_list: Iterable of file names relative to ``file_root``.

    Returns:
        The last name taken from *f_list*, or ``None`` when it is empty.
    """
    # Pre-bind the loop variable so an empty f_list returns None instead of
    # raising UnboundLocalError at the return statement.
    i = None
    for i in f_list:
        T_file = file_root + i
        pdfParser = PDFParser()
        try:
            _, pdf = pdfParser.parse(T_file)
        except Exception:
            os.remove(file_root + i)
            # Alternative kept from the original author: relocate the file
            # instead of deleting it, e.g.
            # shutil.move(file_root + i, file_classify + i), or copyfile.
            print(i)
            continue
    return i
def ProcessFile(path):
    """Parse the PDF at *path* and print a JSON description of its objects.

    The JSON document printed to standard output has a ``valid`` flag and a
    ``data`` member holding the ``streams`` and ``errors`` lists. ``info``
    carries the peepdf statistics as a JSON string. When an exception is
    raised, a JSON object with ``valid`` set to false and an ``error``
    message is printed instead.

    Args:
        path: Filesystem path of the PDF document.

    Returns:
        ``0`` when a report was printed, ``1`` when an exception was raised,
        ``2`` when *path* is not a file.
    """
    if not os.path.isfile(path):
        print('{0} not a file!'.format(path))
        return 2

    try:
        data = {}
        data['valid'] = True
        pdfdata = {}

        pdfParser = PDFParser()
        _, pdf = pdfParser.parse(path, True)
        if not pdf:
            data['valid'] = False
        else:
            errors = []
            streams = []

            # general info
            statsDict = pdf.getStats()
            try:
                data['info'] = json.dumps(statsDict, indent=4, sort_keys=False)
            except Exception as e:
                # Store the message, not the exception object: the object
                # is not JSON serialisable and would make the final
                # json.dumps(data) fail.
                data['info'] = str(e)

            # enumerate errors
            if hasattr(pdf, 'errors'):
                errors.extend(pdf.errors)

            # enumerate streams
            for versionId, statsVersion in enumerate(statsDict['Versions']):
                for objid in statsVersion['Objects'][1]:
                    obj = pdf.getObject(objid, versionId)
                    if not obj:
                        continue

                    stream = {}
                    stream['id'] = objid
                    stream['type'] = obj.getType()
                    stream['attributes'] = {}
                    stream['has_js'] = obj.containsJScode
                    if hasattr(obj, 'elements'):
                        for key in obj.elements:
                            element = obj.elements[key]
                            stream['attributes'][key] = convert_to_printable(
                                element.value)
                    if obj.getType() == 'stream':
                        stream['data_len'] = obj.size
                        value = obj.getStream()
                        # A getStream() result of -1 is not recorded.
                        if value != -1:
                            stream['data'] = convert_to_printable(value)
                    streams.append(stream)

            pdfdata['streams'] = streams
            pdfdata['errors'] = errors

        data['data'] = pdfdata
        encoded = json.dumps(data)
        print(encoded)
    except Exception as ex:
        data = {}
        data['valid'] = False
        data['error'] = str(ex)
        print(json.dumps(data))
        return 1
    return 0
def _appendTraceback(logPath):
    """Append the traceback of the exception being handled to *logPath*."""
    with open(logPath, 'a') as logFile:
        traceback.print_exc(file=logFile)


def main():
    """Command-line entry point of peepdf.

    Parses the command-line options, optionally parses the given PDF file
    and checks its MD5 on VirusTotal, and then does one of the following:
    writes the XML or JSON report to standard output, runs a script file or
    a list of commands through ``PDFConsole``, or prints the textual
    statistics and, with ``-i``, starts the interactive console.

    Tracebacks of unexpected exceptions are appended to ``errorsFile``. If
    that file exists when the function finishes, the process exits with a
    message asking the user to report the errors.
    """
    global COLORIZED_OUTPUT

    argsParser = optparse.OptionParser(usage='Usage: peepdf.py [options] PDF_file', description=versionHeader)
    argsParser.add_option('-i', '--interactive', action='store_true', dest='isInteractive', default=False,
                          help='Sets console mode.')
    argsParser.add_option('-s', '--load-script', action='store', type='string', dest='scriptFile',
                          help='Loads the commands stored in the specified file and execute them.')
    argsParser.add_option('-c', '--check-vt', action='store_true', dest='checkOnVT', default=False,
                          help='Checks the hash of the PDF file on VirusTotal.')
    argsParser.add_option('-f', '--force-mode', action='store_true', dest='isForceMode', default=False,
                          help='Sets force parsing mode to ignore errors.')
    argsParser.add_option('-l', '--loose-mode', action='store_true', dest='isLooseMode', default=False,
                          help='Sets loose parsing mode to catch malformed objects.')
    argsParser.add_option('-m', '--manual-analysis', action='store_true', dest='isManualAnalysis', default=False,
                          help='Avoids automatic Javascript analysis. Useful with eternal loops like heap spraying.')
    argsParser.add_option('-g', '--grinch-mode', action='store_true', dest='avoidColors', default=False,
                          help='Avoids colorized output in the interactive console.')
    argsParser.add_option('-v', '--version', action='store_true', dest='version', default=False,
                          help='Shows program\'s version number.')
    argsParser.add_option('-x', '--xml', action='store_true', dest='xmlOutput', default=False,
                          help='Shows the document information in XML format.')
    argsParser.add_option('-j', '--json', action='store_true', dest='jsonOutput', default=False,
                          help='Shows the document information in JSON format.')
    argsParser.add_option('-C', '--command', action='append', type='string', dest='commands',
                          help='Specifies a command from the interactive console to be executed.')
    (options, args) = argsParser.parse_args()

    stats = ""
    pdf = None
    fileName = None
    statsDict = None
    vtJsonDict = None

    try:
        # Avoid colors in the output
        if not COLORIZED_OUTPUT or options.avoidColors:
            warningColor = ''
            errorColor = ''
            alertColor = ''
            staticColor = ''
            resetColor = ''
        else:
            warningColor = Fore.YELLOW
            errorColor = Fore.RED
            alertColor = Fore.RED
            staticColor = Fore.BLUE
            resetColor = Style.RESET_ALL

        if options.version:
            print(peepdfHeader)
        else:
            if len(args) == 1:
                fileName = args[0]
                if not os.path.exists(fileName):
                    sys.exit('Error: The file "' + fileName + '" does not exist!!')
            elif len(args) > 1 or (len(args) == 0 and not options.isInteractive):
                sys.exit(argsParser.print_help())

            if options.scriptFile is not None:
                if not os.path.exists(options.scriptFile):
                    sys.exit('Error: The script file "' + options.scriptFile + '" does not exist!!')

            if fileName is not None:
                pdfParser = PDFParser()
                ret, pdf = pdfParser.parse(fileName, options.isForceMode, options.isLooseMode,
                                           options.isManualAnalysis)
                if options.checkOnVT:
                    # Checks the MD5 on VirusTotal
                    md5Hash = pdf.getMD5()
                    ret = vtcheck(md5Hash, VT_KEY)
                    if ret[0] == -1:
                        pdf.addError(ret[1])
                    else:
                        vtJsonDict = ret[1]
                        if "response_code" in vtJsonDict:
                            if vtJsonDict['response_code'] == 1:
                                if "positives" in vtJsonDict and "total" in vtJsonDict:
                                    pdf.setDetectionRate([vtJsonDict['positives'], vtJsonDict['total']])
                                else:
                                    pdf.addError('Missing elements in the response from VirusTotal!!')
                                if "permalink" in vtJsonDict:
                                    pdf.setDetectionReport(vtJsonDict['permalink'])
                            else:
                                pdf.setDetectionRate(None)
                        else:
                            pdf.addError('Bad response from VirusTotal!!')
                statsDict = pdf.getStats()

            if options.xmlOutput:
                try:
                    xml = getPeepXML(statsDict, _version, revision)
                    sys.stdout.write(xml)
                except:
                    errorMessage = '*** Error: Exception while generating the XML file!!'
                    _appendTraceback(errorsFile)
                    raise Exception('PeepException', 'Send me an email ;)')
            elif options.jsonOutput and not options.commands:
                try:
                    jsonReport = getPeepJSON(statsDict, _version, revision)
                    sys.stdout.write(jsonReport)
                except:
                    errorMessage = '*** Error: Exception while generating the JSON report!!'
                    _appendTraceback(errorsFile)
                    raise Exception('PeepException', 'Send me an email ;)')
            else:
                if COLORIZED_OUTPUT and not options.avoidColors:
                    try:
                        init()
                    except:
                        COLORIZED_OUTPUT = False
                if options.scriptFile is not None:
                    from peepdf.PDFConsole import PDFConsole

                    # The context manager closes the script on success too,
                    # not only when the console raises.
                    with open(options.scriptFile, 'rb') as scriptFileObject:
                        console = PDFConsole(pdf, VT_KEY, options.avoidColors, stdin=scriptFileObject)
                        try:
                            console.cmdloop()
                        except:
                            errorMessage = '*** Error: Exception not handled using the batch mode!!'
                            _appendTraceback(errorsFile)
                            raise Exception('PeepException', 'Send me an email ;)')
                elif options.commands is not None:
                    from .PDFConsole import PDFConsole

                    console = PDFConsole(pdf, VT_KEY, options.avoidColors)
                    try:
                        for command in options.commands:
                            console.onecmd(command)
                    except:
                        errorMessage = '*** Error: Exception not handled using the batch commands!!'
                        _appendTraceback(errorsFile)
                        raise Exception('PeepException', 'Send me an email ;)')
                else:
                    if statsDict is not None:
                        if COLORIZED_OUTPUT and not options.avoidColors:
                            beforeStaticLabel = staticColor
                        else:
                            beforeStaticLabel = ''

                        if not JS_MODULE:
                            warningMessage = 'Warning: PyV8 is not installed!!'
                            stats += warningColor + warningMessage + resetColor + newLine
                        if not EMU_MODULE:
                            warningMessage = 'Warning: pylibemu is not installed!!'
                            stats += warningColor + warningMessage + resetColor + newLine
                        if not PIL_MODULE:
                            warningMessage = 'Warning: Python Imaging Library (PIL) is not installed!!'
                            stats += warningColor + warningMessage + resetColor + newLine
                        errors = statsDict['Errors']
                        for error in errors:
                            if error.find('Decryption error') != -1:
                                stats += errorColor + error + resetColor + newLine
                        if stats != '':
                            stats += newLine
                        statsDict = pdf.getStats()

                        stats += beforeStaticLabel + 'File: ' + resetColor + statsDict['File'] + newLine
                        stats += beforeStaticLabel + 'MD5: ' + resetColor + statsDict['MD5'] + newLine
                        stats += beforeStaticLabel + 'SHA1: ' + resetColor + statsDict['SHA1'] + newLine
                        stats += beforeStaticLabel + 'SHA256: ' + resetColor + statsDict['SHA256'] + newLine
                        stats += beforeStaticLabel + 'Size: ' + resetColor + statsDict['Size'] + ' bytes' + newLine
                        if options.checkOnVT:
                            if statsDict['Detection'] != []:
                                detectionReportInfo = ''
                                if statsDict['Detection'] is not None:
                                    detectionColor = ''
                                    if COLORIZED_OUTPUT and not options.avoidColors:
                                        # Floor division keeps the level an
                                        # integer bucket, which the equality
                                        # checks below rely on. True division
                                        # would yield a float that almost
                                        # never equals 0 or 1.
                                        bucketSize = statsDict['Detection'][1] // 3
                                        if bucketSize:
                                            detectionLevel = statsDict['Detection'][0] // bucketSize
                                            if detectionLevel == 0:
                                                detectionColor = alertColor
                                            elif detectionLevel == 1:
                                                detectionColor = warningColor
                                    detectionRate = '%s%d%s/%d' % (
                                        detectionColor,
                                        statsDict['Detection'][0],
                                        resetColor,
                                        statsDict['Detection'][1])
                                    if statsDict['Detection report'] != '':
                                        detectionReportInfo = (
                                            beforeStaticLabel + 'Detection report: ' + resetColor +
                                            statsDict['Detection report'] + newLine
                                        )
                                else:
                                    detectionRate = 'File not found on VirusTotal'
                                stats += beforeStaticLabel + 'Detection: ' + resetColor + detectionRate + newLine
                                stats += detectionReportInfo
                        stats += beforeStaticLabel + 'Version: ' + resetColor + statsDict['Version'] + newLine
                        stats += beforeStaticLabel + 'Binary: ' + resetColor + statsDict['Binary'] + newLine
                        stats += beforeStaticLabel + 'Linearized: ' + resetColor + statsDict['Linearized'] + newLine
                        stats += beforeStaticLabel + 'Encrypted: ' + resetColor + statsDict['Encrypted']
                        if statsDict['Encryption Algorithms'] != []:
                            stats += ' ('
                            for algorithmInfo in statsDict['Encryption Algorithms']:
                                stats += algorithmInfo[0] + ' ' + str(algorithmInfo[1]) + ' bits, '
                            # Drop the trailing ", " before closing the list.
                            stats = stats[:-2] + ')'
                        stats += newLine
                        stats += beforeStaticLabel + 'Updates: ' + resetColor + statsDict['Updates'] + newLine
                        stats += beforeStaticLabel + 'Objects: ' + resetColor + statsDict['Objects'] + newLine
                        stats += beforeStaticLabel + 'Streams: ' + resetColor + statsDict['Streams'] + newLine
                        stats += beforeStaticLabel + 'URIs: ' + resetColor + statsDict['URIs'] + newLine
                        stats += beforeStaticLabel + 'Comments: ' + resetColor + statsDict['Comments'] + newLine
                        stats += beforeStaticLabel + 'Errors: ' + resetColor + str(
                            len(statsDict['Errors'])) + newLine * 2
                        for version in range(len(statsDict['Versions'])):
                            statsVersion = statsDict['Versions'][version]
                            stats += beforeStaticLabel + 'Version ' + resetColor + str(version) + ':' + newLine
                            if statsVersion['Catalog'] is not None:
                                stats += beforeStaticLabel + '\tCatalog: ' + resetColor + statsVersion[
                                    'Catalog'] + newLine
                            else:
                                stats += beforeStaticLabel + '\tCatalog: ' + resetColor + 'No' + newLine
                            if statsVersion['Info'] is not None:
                                stats += beforeStaticLabel + '\tInfo: ' + resetColor + statsVersion['Info'] + newLine
                            else:
                                stats += beforeStaticLabel + '\tInfo: ' + resetColor + 'No' + newLine
                            stats += beforeStaticLabel + '\tObjects (' + statsVersion['Objects'][
                                0] + '): ' + resetColor + str(statsVersion['Objects'][1]) + newLine
                            if statsVersion['Compressed Objects'] is not None:
                                stats += beforeStaticLabel + '\tCompressed objects (' + statsVersion['Compressed Objects'][
                                    0] + '): ' + resetColor + str(statsVersion['Compressed Objects'][1]) + newLine
                            if statsVersion['Errors'] is not None:
                                stats += beforeStaticLabel + '\t\tErrors (' + statsVersion['Errors'][
                                    0] + '): ' + resetColor + str(statsVersion['Errors'][1]) + newLine
                            stats += beforeStaticLabel + '\tStreams (' + statsVersion['Streams'][
                                0] + '): ' + resetColor + str(statsVersion['Streams'][1])
                            if statsVersion['Xref Streams'] is not None:
                                stats += newLine + beforeStaticLabel + '\t\tXref streams (' + statsVersion['Xref Streams'][
                                    0] + '): ' + resetColor + str(statsVersion['Xref Streams'][1])
                            if statsVersion['Object Streams'] is not None:
                                stats += (
                                    newLine + beforeStaticLabel + '\t\tObject streams (' +
                                    statsVersion['Object Streams'][0] + '): ' + resetColor +
                                    str(statsVersion['Object Streams'][1])
                                )
                            if int(statsVersion['Streams'][0]) > 0:
                                stats += (
                                    newLine + beforeStaticLabel + '\t\tEncoded (' + statsVersion['Encoded'][0] +
                                    '): ' + resetColor + str(statsVersion['Encoded'][1])
                                )
                                if statsVersion['Decoding Errors'] is not None:
                                    stats += (
                                        newLine + beforeStaticLabel + '\t\tDecoding errors (' +
                                        statsVersion['Decoding Errors'][0] + '): ' + resetColor +
                                        str(statsVersion['Decoding Errors'][1])
                                    )
                            if statsVersion['URIs'] is not None:
                                stats += (
                                    newLine + beforeStaticLabel + '\tObjects with URIs (' +
                                    statsVersion['URIs'][0] + '): ' + resetColor + str(statsVersion['URIs'][1])
                                )
                            # Suspicious items are labelled with the warning
                            # colour; it is reset to the static colour below.
                            if COLORIZED_OUTPUT and not options.avoidColors:
                                beforeStaticLabel = warningColor
                            if statsVersion['Objects with JS code'] is not None:
                                stats += (
                                    newLine + beforeStaticLabel + '\tObjects with JS code (' +
                                    statsVersion['Objects with JS code'][0] + '): ' + resetColor +
                                    str(statsVersion['Objects with JS code'][1])
                                )
                            actions = statsVersion['Actions']
                            events = statsVersion['Events']
                            vulns = statsVersion['Vulns']
                            elements = statsVersion['Elements']
                            if events is not None or actions is not None or vulns is not None or elements is not None:
                                stats += newLine + beforeStaticLabel + '\tSuspicious elements:' + resetColor + newLine
                                if events is not None:
                                    for event in events:
                                        stats += (
                                            '\t\t' + beforeStaticLabel + event + ' (%d): ' % len(events[event]) +
                                            resetColor + str(events[event]) + newLine
                                        )
                                if actions is not None:
                                    for action in actions:
                                        stats += (
                                            '\t\t' + beforeStaticLabel + action + ' (%d): ' % len(actions[action]) +
                                            resetColor + str(actions[action]) + newLine
                                        )
                                if vulns is not None:
                                    for vuln in vulns:
                                        if vuln in vulnsDict:
                                            vulnName = vulnsDict[vuln][0]
                                            vulnCVEList = vulnsDict[vuln][1]
                                            stats += '\t\t' + beforeStaticLabel + vulnName + ' ('
                                            for vulnCVE in vulnCVEList:
                                                stats += vulnCVE + ','
                                            # Drop the trailing comma.
                                            stats = stats[:-1] + ') (%d): ' % len(vulns[vuln]) + resetColor + str(
                                                vulns[vuln]) + newLine
                                        else:
                                            stats += (
                                                '\t\t' + beforeStaticLabel + vuln + ' (%d): ' % len(vulns[vuln]) +
                                                resetColor + str(vulns[vuln]) + newLine
                                            )
                                if elements is not None:
                                    for element in elements:
                                        if element in vulnsDict:
                                            vulnName = vulnsDict[element][0]
                                            vulnCVEList = vulnsDict[element][1]
                                            stats += '\t\t' + beforeStaticLabel + vulnName + ' ('
                                            for vulnCVE in vulnCVEList:
                                                stats += vulnCVE + ','
                                            # Drop the trailing comma.
                                            stats = stats[:-1] + '): ' + resetColor + str(elements[element]) + newLine
                                        else:
                                            stats += '\t\t' + beforeStaticLabel + element + ': ' + resetColor + str(
                                                elements[element]) + newLine
                            if COLORIZED_OUTPUT and not options.avoidColors:
                                beforeStaticLabel = staticColor
                            urls = statsVersion['URLs']
                            if urls is not None:
                                stats += newLine + beforeStaticLabel + '\tFound URLs:' + resetColor + newLine
                                for url in urls:
                                    stats += '\t\t' + url + newLine
                            stats += newLine * 2
                    if fileName is not None:
                        print(stats)
                    if options.isInteractive:
                        from peepdf.PDFConsole import PDFConsole

                        console = PDFConsole(pdf, VT_KEY, options.avoidColors)
                        while not console.leaving:
                            try:
                                console.cmdloop()
                            except KeyboardInterrupt as e:
                                sys.exit()
                            except:
                                errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!!'
                                print(errorColor + errorMessage + resetColor + newLine)
                                _appendTraceback(errorsFile)
    except Exception as e:
        # Exceptions raised above as ('PeepException', reason) have already
        # been logged; anything else is logged and reported here.
        if len(e.args) == 2:
            excName, excReason = e.args
        else:
            excName = None
        if excName is None or excName != 'PeepException':
            errorMessage = '*** Error: Exception not handled!!'
            _appendTraceback(errorsFile)
            print(errorColor + errorMessage + resetColor + newLine)
    finally:
        if os.path.exists(errorsFile):
            # NOTE(review): the mailto target below reads "[email protected]",
            # which looks mangled; confirm the intended address.
            message = newLine + 'Please, don\'t forget to report errors if found:' + newLine * 2
            message += '\t- Sending the file "%s" to the author (mailto:[email protected])%s' % (
                errorsFile, newLine)
            message += '\t- And/Or creating an issue on the project webpage (https://github.com/jesparza/peepdf/issues)' + newLine
            message = errorColor + message + resetColor
            sys.exit(message)
def peepdf_parse(filepath: str, pdfresult: Dict[str, Any]) -> Dict[str, Any]:
    """Collect JavaScript streams, link URIs and metadata from a PDF via peepdf.

    Every stream object is run through ``analyseJS`` when V8 support is
    available; the first script it returns is stored with the object's id,
    offset and size. Dictionary objects that are ``/Link`` annotations with
    a ``/URI`` action contribute their URI, prefixed with the document's
    base URI.

    Args:
        filepath: Path of the PDF document to parse.
        pdfresult: Result dictionary to extend; returned untouched when
            peepdf is not available.

    Returns:
        *pdfresult*, with ``JSStreams`` set and, when present, the
        ``Info`` creator/producer/author fields, ``JS_URLs`` and
        ``Annot_URLs`` filled in.
    """
    if not HAVE_PEEPDF:
        return pdfresult

    log.debug("About to parse with PDFParser")
    parser = PDFParser()
    _, pdf = parser.parse(filepath, forceMode=True, looseMode=True, manualAnalysis=False)

    js_urls = set()
    annot_uris = set()
    js_streams = []
    metadata = {}

    base_uri = _set_base_uri(pdf)

    for version, body in enumerate(pdf.body):
        # The metadata of the last version that has any wins.
        version_meta = pdf.getBasicMetadata(version)
        if version_meta:
            metadata = version_meta

        objects = body.objects
        for key in objects:
            entry = objects[key]
            obj_data = {
                "Object ID": entry.id,
                "Offset": entry.offset,
                "Size": entry.size,
            }
            details = entry.object

            if details.type == "stream":
                decoded_stream = details.decodedStream
                if not HAVE_V8PY:
                    continue

                jsdata = None
                try:
                    jslist, unescapedbytes, urlsfound, errors, ctxdummy = analyseJS(decoded_stream.strip())
                    jsdata = jslist[0]
                except Exception as e:
                    log.error(e, exc_info=True)
                    continue

                if len(errors) > 0 or jsdata is None:
                    continue

                js_urls.update(urlsfound)

                # "JSONify" the string returned from PyV8: characters above
                # 127 are replaced by an escaped hex form. A plain
                # "string_escape" style encoding is avoided because it would
                # also mangle the new lines used to beautify the JavaScript
                # in the web interface.
                obj_data["Data"] = "".join(
                    f"\\x{char.encode().hex()}" if ord(char) > 127 else char
                    for char in jsdata
                )
                js_streams.append(obj_data)

            elif details.type == "dictionary" and details.hasElement("/A"):
                # verify it to be a link type annotation
                subtype_elem = details.getElementByName("/Subtype")
                type_elem = details.getElementByName("/Type")
                if not subtype_elem or not type_elem:
                    continue

                subtype_elem = _get_obj_val(pdf, version, subtype_elem)
                type_elem = _get_obj_val(pdf, version, type_elem)
                is_link_annotation = (
                    subtype_elem.getValue() == "/Link"
                    and type_elem.getValue() == "/Annot"
                )
                if not is_link_annotation:
                    continue

                a_elem = _get_obj_val(pdf, version, details.getElementByName("/A"))
                if a_elem.type == "dictionary" and a_elem.hasElement("/URI"):
                    uri_elem = _get_obj_val(pdf, version, a_elem.getElementByName("/URI"))
                    annot_uris.add(base_uri + uri_elem.getValue())

    pdfresult["JSStreams"] = js_streams

    for meta_key, info_key in (("creator", "Creator"), ("producer", "Producer"), ("author", "Author")):
        if meta_key in metadata:
            pdfresult["Info"][info_key] = convert_to_printable(_clean_string(metadata[meta_key]))

    if js_urls:
        pdfresult["JS_URLs"] = list(js_urls)
    if annot_uris:
        pdfresult["Annot_URLs"] = list(annot_uris)

    return pdfresult
def ProcessFile(path):
    """Print a JSON report describing the objects of the PDF at *path*.

    On success the printed JSON has ``valid`` (bool), ``info`` (peepdf
    statistics as a JSON string) and ``data`` with the ``streams`` and
    ``errors`` lists. If an exception is raised, a JSON object with
    ``valid`` set to false and an ``error`` message is printed instead.
    ``print`` is called as a single-argument function so the code runs on
    both Python 2 and Python 3.

    Args:
        path: Filesystem path of the PDF document.

    Returns:
        ``0`` when a report was printed, ``1`` when an exception was raised,
        ``2`` when *path* is not a file.
    """
    if not os.path.isfile(path):
        print('{0} not a file!'.format(path))
        return 2

    try:
        data = {'valid': True}
        pdfdata = {}

        _, pdf = PDFParser().parse(path, True)
        if not pdf:
            data['valid'] = False
        else:
            errors = []
            streams = []

            # general info
            statsDict = pdf.getStats()
            try:
                data['info'] = json.dumps(statsDict, indent=4, sort_keys=False)
            except Exception as e:
                # Keep only the message: an exception object cannot be
                # serialised by the final json.dumps(data).
                data['info'] = str(e)

            # enumerate errors
            if hasattr(pdf, 'errors'):
                errors.extend(pdf.errors)

            # enumerate streams
            for versionId, statsVersion in enumerate(statsDict['Versions']):
                for objid in statsVersion['Objects'][1]:
                    obj = pdf.getObject(objid, versionId)
                    if not obj:
                        continue

                    stream = {
                        'id': objid,
                        'type': obj.getType(),
                        'attributes': {},
                        'has_js': obj.containsJScode,
                    }
                    if hasattr(obj, 'elements'):
                        for key in obj.elements:
                            element = obj.elements[key]
                            stream['attributes'][key] = convert_to_printable(element.value)
                    if obj.getType() == 'stream':
                        stream['data_len'] = obj.size
                        value = obj.getStream()
                        # A getStream() result of -1 is not recorded.
                        if value != -1:
                            stream['data'] = convert_to_printable(value)
                    streams.append(stream)

            pdfdata['streams'] = streams
            pdfdata['errors'] = errors

        data['data'] = pdfdata
        print(json.dumps(data))
    except Exception as ex:
        print(json.dumps({'valid': False, 'error': str(ex)}))
        return 1
    return 0
# main(): command-line entry point of peepdf.
#
# NOTE(review): this function is stored with its line breaks collapsed, so
# the nesting described here is read from keyword order, not indentation.
#
# Flow, as far as the statements below show:
#   * Declares an optparse.OptionParser with the flags -i/--interactive,
#     -s/--load-script, -c/--check-vt, -f/--force-mode, -l/--loose-mode,
#     -m/--manual-analysis, -g/--grinch-mode, -v/--version, -x/--xml,
#     -j/--json and -C/--command (action='append', so it may be repeated).
#   * Colour prefixes come from Fore/Style unless COLORIZED_OUTPUT is false
#     or -g was given, in which case they are empty strings.
#   * --version prints peepdfHeader. Otherwise exactly one positional PDF
#     path is accepted; more than one argument, or none without -i, goes
#     through sys.exit(argsParser.print_help()). A missing PDF or script
#     file ends in sys.exit() with an error string.
#   * The file is parsed with PDFParser().parse(fileName, isForceMode,
#     isLooseMode, isManualAnalysis). With -c the MD5 is passed to
#     vtcheck(md5Hash, VT_KEY) and the detection rate / report link, or an
#     error message, is recorded on the pdf object.
#   * Output selection: -x writes getPeepXML(...) to stdout; -j (only when no
#     -C commands were given) writes getPeepJSON(...); -s feeds the script
#     file to a PDFConsole as stdin; -C runs each command via
#     console.onecmd(); otherwise a text summary is accumulated in `stats`
#     when statsDict is not None and printed when a file name was given,
#     and -i then loops on console.cmdloop() while console.leaving is false.
#   * Failures append a traceback to errorsFile; the inner handlers re-raise
#     Exception('PeepException', ...). The finally clause calls sys.exit()
#     with a "please report" message whenever errorsFile exists.
#
# NOTE(review): each traceback.print_exc(file=open(errorsFile, 'a')) opens a
#   file object that is never explicitly closed.
# NOTE(review): detectionLevel divides by statsDict['Detection'][1] / 3;
#   under integer division that divisor is 0 when the total is below 3 --
#   looks like a possible ZeroDivisionError, confirm.
# NOTE(review): PDFConsole is imported from `peepdf.PDFConsole` in the -s and
#   -i branches but from plain `PDFConsole` in the -C branch -- confirm which
#   module path is the intended one.
# NOTE(review): scriptFileObject is closed only in the error path of the -s
#   branch.
def main(): global COLORIZED_OUTPUT argsParser = optparse.OptionParser( usage='Usage: peepdf.py [options] PDF_file', description=versionHeader) argsParser.add_option('-i', '--interactive', action='store_true', dest='isInteractive', default=False, help='Sets console mode.') argsParser.add_option( '-s', '--load-script', action='store', type='string', dest='scriptFile', help='Loads the commands stored in the specified file and execute them.' ) argsParser.add_option( '-c', '--check-vt', action='store_true', dest='checkOnVT', default=False, help='Checks the hash of the PDF file on VirusTotal.') argsParser.add_option('-f', '--force-mode', action='store_true', dest='isForceMode', default=False, help='Sets force parsing mode to ignore errors.') argsParser.add_option( '-l', '--loose-mode', action='store_true', dest='isLooseMode', default=False, help='Sets loose parsing mode to catch malformed objects.') argsParser.add_option( '-m', '--manual-analysis', action='store_true', dest='isManualAnalysis', default=False, help= 'Avoids automatic Javascript analysis. Useful with eternal loops like heap spraying.' ) argsParser.add_option( '-g', '--grinch-mode', action='store_true', dest='avoidColors', default=False, help='Avoids colorized output in the interactive console.') argsParser.add_option('-v', '--version', action='store_true', dest='version', default=False, help='Shows program\'s version number.') argsParser.add_option('-x', '--xml', action='store_true', dest='xmlOutput', default=False, help='Shows the document information in XML format.') argsParser.add_option( '-j', '--json', action='store_true', dest='jsonOutput', default=False, help='Shows the document information in JSON format.') argsParser.add_option( '-C', '--command', action='append', type='string', dest='commands', help='Specifies a command from the interactive console to be executed.' 
) (options, args) = argsParser.parse_args() stats = "" pdf = None fileName = None statsDict = None vtJsonDict = None try: # Avoid colors in the output if not COLORIZED_OUTPUT or options.avoidColors: warningColor = '' errorColor = '' alertColor = '' staticColor = '' resetColor = '' else: warningColor = Fore.YELLOW errorColor = Fore.RED alertColor = Fore.RED staticColor = Fore.BLUE resetColor = Style.RESET_ALL if options.version: print peepdfHeader else: if len(args) == 1: fileName = args[0] if not os.path.exists(fileName): sys.exit('Error: The file "' + fileName + '" does not exist!!') elif len(args) > 1 or (len(args) == 0 and not options.isInteractive): sys.exit(argsParser.print_help()) if options.scriptFile is not None: if not os.path.exists(options.scriptFile): sys.exit('Error: The script file "' + options.scriptFile + '" does not exist!!') if fileName is not None: pdfParser = PDFParser() ret, pdf = pdfParser.parse(fileName, options.isForceMode, options.isLooseMode, options.isManualAnalysis) if options.checkOnVT: # Checks the MD5 on VirusTotal md5Hash = pdf.getMD5() ret = vtcheck(md5Hash, VT_KEY) if ret[0] == -1: pdf.addError(ret[1]) else: vtJsonDict = ret[1] if "response_code" in vtJsonDict: if vtJsonDict['response_code'] == 1: if "positives" in vtJsonDict and "total" in vtJsonDict: pdf.setDetectionRate([ vtJsonDict['positives'], vtJsonDict['total'] ]) else: pdf.addError( 'Missing elements in the response from VirusTotal!!' ) if "permalink" in vtJsonDict: pdf.setDetectionReport( vtJsonDict['permalink']) else: pdf.setDetectionRate(None) else: pdf.addError('Bad response from VirusTotal!!') statsDict = pdf.getStats() if options.xmlOutput: try: xml = getPeepXML(statsDict, _version, revision) sys.stdout.write(xml) except: errorMessage = '*** Error: Exception while generating the XML file!!' 
traceback.print_exc(file=open(errorsFile, 'a')) raise Exception('PeepException', 'Send me an email ;)') elif options.jsonOutput and not options.commands: try: jsonReport = getPeepJSON(statsDict, _version, revision) sys.stdout.write(jsonReport) except: errorMessage = '*** Error: Exception while generating the JSON report!!' traceback.print_exc(file=open(errorsFile, 'a')) raise Exception('PeepException', 'Send me an email ;)') else: if COLORIZED_OUTPUT and not options.avoidColors: try: init() except: COLORIZED_OUTPUT = False if options.scriptFile is not None: from peepdf.PDFConsole import PDFConsole scriptFileObject = open(options.scriptFile, 'rb') console = PDFConsole(pdf, VT_KEY, options.avoidColors, stdin=scriptFileObject) try: console.cmdloop() except: errorMessage = '*** Error: Exception not handled using the batch mode!!' scriptFileObject.close() traceback.print_exc(file=open(errorsFile, 'a')) raise Exception('PeepException', 'Send me an email ;)') elif options.commands is not None: from PDFConsole import PDFConsole console = PDFConsole(pdf, VT_KEY, options.avoidColors) try: for command in options.commands: console.onecmd(command) except: errorMessage = '*** Error: Exception not handled using the batch commands!!' traceback.print_exc(file=open(errorsFile, 'a')) raise Exception('PeepException', 'Send me an email ;)') else: if statsDict is not None: if COLORIZED_OUTPUT and not options.avoidColors: beforeStaticLabel = staticColor else: beforeStaticLabel = '' if not JS_MODULE: warningMessage = 'Warning: PyV8 is not installed!!' stats += warningColor + warningMessage + resetColor + newLine if not EMU_MODULE: warningMessage = 'Warning: pylibemu is not installed!!' stats += warningColor + warningMessage + resetColor + newLine if not PIL_MODULE: warningMessage = 'Warning: Python Imaging Library (PIL) is not installed!!' 
stats += warningColor + warningMessage + resetColor + newLine errors = statsDict['Errors'] for error in errors: if error.find('Decryption error') != -1: stats += errorColor + error + resetColor + newLine if stats != '': stats += newLine statsDict = pdf.getStats() stats += beforeStaticLabel + 'File: ' + resetColor + statsDict[ 'File'] + newLine stats += beforeStaticLabel + 'MD5: ' + resetColor + statsDict[ 'MD5'] + newLine stats += beforeStaticLabel + 'SHA1: ' + resetColor + statsDict[ 'SHA1'] + newLine stats += beforeStaticLabel + 'SHA256: ' + resetColor + statsDict[ 'SHA256'] + newLine stats += beforeStaticLabel + 'Size: ' + resetColor + statsDict[ 'Size'] + ' bytes' + newLine if options.checkOnVT: if statsDict['Detection'] != []: detectionReportInfo = '' if statsDict['Detection'] is not None: detectionColor = '' if COLORIZED_OUTPUT and not options.avoidColors: detectionLevel = statsDict[ 'Detection'][0] / ( statsDict['Detection'][1] / 3) if detectionLevel == 0: detectionColor = alertColor elif detectionLevel == 1: detectionColor = warningColor detectionRate = '%s%d%s/%d' % ( detectionColor, statsDict['Detection'][0], resetColor, statsDict['Detection'][1]) if statsDict['Detection report'] != '': detectionReportInfo = ( beforeStaticLabel + 'Detection report: ' + resetColor + statsDict['Detection report'] + newLine) else: detectionRate = 'File not found on VirusTotal' stats += beforeStaticLabel + 'Detection: ' + resetColor + detectionRate + newLine stats += detectionReportInfo stats += beforeStaticLabel + 'Version: ' + resetColor + statsDict[ 'Version'] + newLine stats += beforeStaticLabel + 'Binary: ' + resetColor + statsDict[ 'Binary'] + newLine stats += beforeStaticLabel + 'Linearized: ' + resetColor + statsDict[ 'Linearized'] + newLine stats += beforeStaticLabel + 'Encrypted: ' + resetColor + statsDict[ 'Encrypted'] if statsDict['Encryption Algorithms'] != []: stats += ' (' for algorithmInfo in statsDict[ 'Encryption Algorithms']: stats += algorithmInfo[0] + ' ' 
+ str( algorithmInfo[1]) + ' bits, ' stats = stats[:-2] + ')' stats += newLine stats += beforeStaticLabel + 'Updates: ' + resetColor + statsDict[ 'Updates'] + newLine stats += beforeStaticLabel + 'Objects: ' + resetColor + statsDict[ 'Objects'] + newLine stats += beforeStaticLabel + 'Streams: ' + resetColor + statsDict[ 'Streams'] + newLine stats += beforeStaticLabel + 'URIs: ' + resetColor + statsDict[ 'URIs'] + newLine stats += beforeStaticLabel + 'Comments: ' + resetColor + statsDict[ 'Comments'] + newLine stats += beforeStaticLabel + 'Errors: ' + resetColor + str( len(statsDict['Errors'])) + newLine * 2 for version in range(len(statsDict['Versions'])): statsVersion = statsDict['Versions'][version] stats += beforeStaticLabel + 'Version ' + resetColor + str( version) + ':' + newLine if statsVersion['Catalog'] is not None: stats += beforeStaticLabel + '\tCatalog: ' + resetColor + statsVersion[ 'Catalog'] + newLine else: stats += beforeStaticLabel + '\tCatalog: ' + resetColor + 'No' + newLine if statsVersion['Info'] is not None: stats += beforeStaticLabel + '\tInfo: ' + resetColor + statsVersion[ 'Info'] + newLine else: stats += beforeStaticLabel + '\tInfo: ' + resetColor + 'No' + newLine stats += beforeStaticLabel + '\tObjects (' + statsVersion[ 'Objects'][0] + '): ' + resetColor + str( statsVersion['Objects'][1]) + newLine if statsVersion['Compressed Objects'] is not None: stats += beforeStaticLabel + '\tCompressed objects (' + statsVersion[ 'Compressed Objects'][ 0] + '): ' + resetColor + str( statsVersion['Compressed Objects'] [1]) + newLine if statsVersion['Errors'] is not None: stats += beforeStaticLabel + '\t\tErrors (' + statsVersion[ 'Errors'][0] + '): ' + resetColor + str( statsVersion['Errors'][1]) + newLine stats += beforeStaticLabel + '\tStreams (' + statsVersion[ 'Streams'][0] + '): ' + resetColor + str( statsVersion['Streams'][1]) if statsVersion['Xref Streams'] is not None: stats += newLine + beforeStaticLabel + '\t\tXref streams (' + statsVersion[ 
'Xref Streams'][ 0] + '): ' + resetColor + str( statsVersion['Xref Streams'][1]) if statsVersion['Object Streams'] is not None: stats += ( newLine + beforeStaticLabel + '\t\tObject streams (' + statsVersion['Object Streams'][0] + '): ' + resetColor + str(statsVersion['Object Streams'][1])) if int(statsVersion['Streams'][0]) > 0: stats += (newLine + beforeStaticLabel + '\t\tEncoded (' + statsVersion['Encoded'][0] + '): ' + resetColor + str(statsVersion['Encoded'][1])) if statsVersion['Decoding Errors'] is not None: stats += ( newLine + beforeStaticLabel + '\t\tDecoding errors (' + statsVersion['Decoding Errors'][0] + '): ' + resetColor + str(statsVersion['Decoding Errors'][1]) ) if statsVersion['URIs'] is not None: stats += (newLine + beforeStaticLabel + '\tObjects with URIs (' + statsVersion['URIs'][0] + '): ' + resetColor + str(statsVersion['URIs'][1])) if COLORIZED_OUTPUT and not options.avoidColors: beforeStaticLabel = warningColor if statsVersion[ 'Objects with JS code'] is not None: stats += ( newLine + beforeStaticLabel + '\tObjects with JS code (' + statsVersion['Objects with JS code'][0] + '): ' + resetColor + str(statsVersion['Objects with JS code'] [1])) actions = statsVersion['Actions'] events = statsVersion['Events'] vulns = statsVersion['Vulns'] elements = statsVersion['Elements'] if events is not None or actions is not None or vulns is not None or elements is not None: stats += newLine + beforeStaticLabel + '\tSuspicious elements:' + resetColor + newLine if events is not None: for event in events: stats += ( '\t\t' + beforeStaticLabel + event + ' (%d): ' % len(events[event]) + resetColor + str(events[event]) + newLine) if actions is not None: for action in actions: stats += ( '\t\t' + beforeStaticLabel + action + ' (%d): ' % len(actions[action]) + resetColor + str(actions[action]) + newLine) if vulns is not None: for vuln in vulns: if vuln in vulnsDict: vulnName = vulnsDict[vuln][0] vulnCVEList = vulnsDict[vuln][1] stats += '\t\t' + beforeStaticLabel 
+ vulnName + ' (' for vulnCVE in vulnCVEList: stats += vulnCVE + ',' stats = stats[:-1] + ') (%d): ' % len( vulns[vuln]) + resetColor + str( vulns[vuln]) + newLine else: stats += ( '\t\t' + beforeStaticLabel + vuln + ' (%d): ' % len(vulns[vuln]) + resetColor + str(vulns[vuln]) + newLine) if elements is not None: for element in elements: if element in vulnsDict: vulnName = vulnsDict[element][0] vulnCVEList = vulnsDict[element][1] stats += '\t\t' + beforeStaticLabel + vulnName + ' (' for vulnCVE in vulnCVEList: stats += vulnCVE + ',' stats = stats[: -1] + '): ' + resetColor + str( elements[element] ) + newLine else: stats += '\t\t' + beforeStaticLabel + element + ': ' + resetColor + str( elements[element]) + newLine if COLORIZED_OUTPUT and not options.avoidColors: beforeStaticLabel = staticColor urls = statsVersion['URLs'] if urls is not None: stats += newLine + beforeStaticLabel + '\tFound URLs:' + resetColor + newLine for url in urls: stats += '\t\t' + url + newLine stats += newLine * 2 if fileName is not None: print stats if options.isInteractive: from peepdf.PDFConsole import PDFConsole console = PDFConsole(pdf, VT_KEY, options.avoidColors) while not console.leaving: try: console.cmdloop() except KeyboardInterrupt as e: sys.exit() except: errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!!' print errorColor + errorMessage + resetColor + newLine traceback.print_exc(file=open(errorsFile, 'a')) except Exception as e: if len(e.args) == 2: excName, excReason = e.args else: excName = None if excName is None or excName != 'PeepException': errorMessage = '*** Error: Exception not handled!!' 
traceback.print_exc(file=open(errorsFile, 'a')) print errorColor + errorMessage + resetColor + newLine finally: if os.path.exists(errorsFile): message = newLine + 'Please, don\'t forget to report errors if found:' + newLine * 2 message += '\t- Sending the file "%s" to the author (mailto:[email protected])%s' % ( errorsFile, newLine) message += '\t- And/Or creating an issue on the project webpage (https://github.com/jesparza/peepdf/issues)' + newLine message = errorColor + message + resetColor sys.exit(message)
def get_streams():
    """List the stream objects of the PDF opened in the current session.

    The file at ``__sessions__.current.file.path`` is parsed with peepdf and
    every indirect object whose inner object has type ``'stream'`` yields one
    row ``[counter, object id, offset, size, type description]``; the
    description is ``get_type(decodedStream)`` cut to 100 characters and the
    counter starts at 1.

    When ``arg_open`` or ``arg_dump`` is truthy, the stripped decoded stream
    is also written to ``<folder>/<md5>_<counter>_pdf_stream.bin`` and that
    path is appended to the row. ``folder`` is ``arg_dump`` when set,
    otherwise the system temporary directory.

    Note: ``self``, ``arg_open``, ``arg_dump``, ``get_type`` and
    ``__sessions__`` are not parameters; they must be supplied by the
    surrounding scope.

    Returns:
        The list of rows. If the dump folder cannot be created, or the dump
        path exists but is not a directory, an error is logged and the rows
        collected so far are returned.
    """
    # Logic originally taken from Brandon Dixon's swf_mastah.py.
    _status, pdf = PDFParser().parse(__sessions__.current.file.path, True, False)

    results = []
    stream_number = 1

    for body in pdf.body:
        entries = body.objects
        for key in entries:
            entry = entries[key]
            oid, offset, size, details = (entry.id, entry.offset,
                                          entry.size, entry.object)

            # Only stream objects are reported.
            if details.type != 'stream':
                continue

            decoded_stream = details.decodedStream
            row = [stream_number, oid, offset, size,
                   get_type(decoded_stream)[:100]]

            if arg_open or arg_dump:
                # An explicit dump folder wins; otherwise use the temp dir.
                folder = arg_dump or tempfile.gettempdir()

                if os.path.exists(folder):
                    if not os.path.isdir(folder):
                        self.log('error', "You need to specify a folder not a file")
                        return results
                else:
                    try:
                        os.makedirs(folder)
                    except Exception as e:
                        self.log('error', "Unable to create directory at {0}: {1}".format(folder, e))
                        return results

                # TODO: several streams may share one object ID, which is
                # why the running counter, not the ID, names the file.
                dump_path = '{0}/{1}_{2}_pdf_stream.bin'.format(
                    folder, __sessions__.current.file.md5, stream_number)
                with open(dump_path, 'wb') as handle:
                    handle.write(decoded_stream.strip())

                row.append(dump_path)

            results.append(row)
            stream_number += 1

    return results
def get_streams():
    """Return one row per stream object found in the session's PDF.

    Parses ``__sessions__.current.file.path`` with peepdf, walks every body
    of the document and, for each object of type ``'stream'``, records
    ``[counter, object id, offset, size, get_type(decodedStream)[:100]]``.
    The counter starts at 1 and only advances on stream objects.

    If ``arg_open`` or ``arg_dump`` is truthy the decoded stream (stripped)
    is additionally saved as ``<folder>/<md5>_<counter>_pdf_stream.bin`` and
    the path becomes the last element of the row; ``folder`` is ``arg_dump``
    if given, else ``tempfile.gettempdir()``.

    Note: ``self``, ``arg_open``, ``arg_dump``, ``get_type`` and
    ``__sessions__`` are free names expected from the enclosing scope.

    Returns:
        The rows gathered so far. The function returns early, after logging
        an error, when the dump folder cannot be created or when the dump
        path exists and is not a directory.
    """
    # Derived from Brandon Dixon's swf_mastah.py.
    pdf_parser = PDFParser()
    _ret, document = pdf_parser.parse(__sessions__.current.file.path, True, False)

    collected = []
    next_id = 1
    want_dump = lambda: arg_open or arg_dump

    for body_index in range(len(document.body)):
        table = document.body[body_index].objects

        for ref in table:
            indirect = table[ref]
            oid = indirect.id
            offset = indirect.offset
            size = indirect.size
            inner = indirect.object

            if inner.type == 'stream':
                payload = inner.decodedStream
                record = [next_id, oid, offset, size]
                record.append(get_type(payload)[:100])

                if want_dump():
                    # Explicit dump target if provided, temp directory if not.
                    if arg_dump:
                        target_dir = arg_dump
                    else:
                        target_dir = tempfile.gettempdir()

                    # Make sure the target is an existing directory.
                    if not os.path.exists(target_dir):
                        try:
                            os.makedirs(target_dir)
                        except Exception as e:
                            message = "Unable to create directory at {0}: {1}".format(target_dir, e)
                            self.log('error', message)
                            return collected
                    elif not os.path.isdir(target_dir):
                        self.log('error', "You need to specify a folder not a file")
                        return collected

                    # TODO: streams sharing an object ID would collide if the
                    # ID were used in the name; the counter keeps it unique.
                    dump_path = '{0}/{1}_{2}_pdf_stream.bin'.format(
                        target_dir, __sessions__.current.file.md5, next_id)

                    with open(dump_path, 'wb') as out:
                        out.write(payload.strip())

                    record.append(dump_path)

                collected.append(record)
                next_id += 1

    return collected