Python unescapeHTMLEntities 예제들, PDFUtils.unescapeHTMLEntities Python 예제들

예제 #1

0

파일 보기

파일: build_objs.py 프로젝트: toejamhoney/peepdf-js_analyse

def get_annots(app, root):
    """Append every annotation found in *root* to ``app['doc']['annots']``.

    For each ``<key>`` element whose text is ``"Annots"``, the element that
    follows it in its parent is read as a list of references.  Every
    ``<object>`` whose ``id`` matches a reference is flattened into a plain
    dict: the children of the object's first element are read as alternating
    name/value pairs, ``size`` of them.

    Values are resolved as follows:
      * ``literal`` -> its unescaped text,
      * ``ref``     -> the unescaped text of the ``data`` descendants of the
                       referenced object (the last one wins),
      * otherwise   -> the string ``"Unknown tag: <tag>"``.

    The ``Subj`` entry, when present, is renamed to ``subject``.

    @param app: dict that already contains ``app['doc']['annots']`` (a list)
    @param root: element tree root; needs ``getparent``/``iterdescendants``,
                 so presumably an lxml element -- TODO confirm
    @return: None, *app* is modified in place
    """
    for annot in root.iterfind(".//key"):
        if annot.text != "Annots":
            continue
        parent = annot.getparent()
        # The value belonging to a key is the next sibling of that key.
        ref_list = parent[parent.index(annot) + 1][0]
        for ref in ref_list:
            ref_id = ref.get("id")
            for obj in root.iterfind(".//object"):
                if obj.get("id") != ref_id:
                    continue
                dictionary = obj[0]
                # Any '%' in the size attribute is dropped before conversion.
                size = int(re.sub("%", "", dictionary.get("size")))
                children = list(dictionary)
                new = {}
                for i in range(size):
                    name = children[2 * i].text
                    value = children[2 * i + 1][0]
                    if value.tag == "literal":
                        new[name] = unescapeHTMLEntities(value.text)
                    elif value.tag == "ref":
                        target_id = value.get("id")
                        for ob in root.iterfind(".//object"):
                            if ob.get("id") == target_id:
                                for child in ob.iterdescendants(tag="data"):
                                    new[name] = unescapeHTMLEntities(child.text)
                    else:
                        new[name] = "Unknown tag: " + value.tag
                # Not every annotation carries a subject; only rename the
                # entry when it exists instead of failing with KeyError.
                if "Subj" in new:
                    new["subject"] = new.pop("Subj")
                app['doc']['annots'].append(new)

예제 #2

0

파일 보기

파일: build_objs.py 프로젝트: toejamhoney/peepdf-js_analyse

def get_fields(root):
    """Map each declared field name to the unescaped text of its element.

    Every direct ``<field>`` child of *root* names, through its ``name``
    attribute, a path that is looked up again on *root*.  Matching elements
    that have text contribute ``name -> unescapeHTMLEntities(text)``; when
    several elements match, the last one with text wins.

    @param root: element exposing ``iterfind`` (ElementTree-style API)
    @return: dict of field name -> unescaped text
    """
    ret = {}
    for key in root.iterfind("field"):
        name = key.get("name")
        # Trace output kept from the original; the call form works on both
        # Python 2 and Python 3.
        print(name)
        if name is None:
            # A field without a name cannot be looked up.
            continue
        for elem in root.iterfind(name):
            if elem.text is not None:
                ret[name] = unescapeHTMLEntities(elem.text)
    return ret

예제 #3

0

파일 보기

파일: build_objs.py 프로젝트: toejamhoney/peepdf-js_analyse

def create_info_obj(tree):
    """Build ``{"info": {...}}`` from the document information keys in *tree*.

    Each attribute name is capitalised (first letter only) and looked up with
    ``search_tree(tree, ".//key", Name)`` -- defined elsewhere, presumably
    returning the matching ``<key>`` element or None.  The value is taken
    from the first child of the element following the key in its parent:

      * ``string`` with text -> the unescaped text,
      * ``ref``              -> the unescaped text of the ``data``
                                descendants of the referenced object,
      * anything else (including a ``string`` without text)
                             -> ``"Unknown tag: <tag>"``.

    @param tree: element tree root; needs ``getparent``/``iterdescendants``,
                 so presumably an lxml element -- TODO confirm
    @return: dict with a single ``"info"`` key holding the values found
    """
    info_attrs = ["author", "creator", "creationDate", "Date", "keywords", "modDate", "producer", "subject", "title", "trapped"]
    info = {}
    for item in info_attrs:
        elem = search_tree(tree, ".//key", item[0].upper() + item[1:])
        if elem is None:
            continue
        parent = elem.getparent()
        # The value belonging to a key is the next sibling of that key.
        sibling = parent[parent.index(elem) + 1][0]
        if sibling.tag == "string" and sibling.text is not None:
            info[item] = unescapeHTMLEntities(sibling.text)
        elif sibling.tag == "ref":
            for ob in tree.iterfind(".//object"):
                if ob.get("id") == sibling.get("id"):
                    for child in ob.iterdescendants(tag="data"):
                        # Skip empty data nodes, as create_event_obj does,
                        # rather than handing None to the unescaper.
                        if child.text is not None:
                            info[item] = unescapeHTMLEntities(child.text)
        else:
            info[item] = "Unknown tag: " + sibling.tag
    return {"info": info}

예제 #4

0

파일 보기

파일: build_objs.py 프로젝트: toejamhoney/peepdf-js_analyse

def create_event_obj(tree):
    """Build ``{"target": {...}}`` from the event-related keys in *tree*.

    Each attribute name is capitalised (first letter only) and looked up with
    ``search_tree(tree, ".//key", Name)``.  The value is read from the first
    child of the element that follows the key inside its parent:

      * ``string`` with text -> the unescaped text,
      * ``ref``              -> the unescaped text of the non-empty ``data``
                                descendants of the referenced object,
      * anything else        -> ``"Unknown tag: <tag>"``.

    @param tree: element tree root to search
    @return: dict with a single ``"target"`` key holding the values found
    """
    attribute_names = (
        "author", "calculate", "creator", "creationDate", "delay", "dirty",
        "external", "filesize", "keywords", "modDate", "numFields",
        "numPages", "numTemplates", "path", "pageNum", "producer", "subject",
        "title", "zoom", "zoomType",
    )
    target = {}
    for name in attribute_names:
        key_elem = search_tree(tree, ".//key", name[0].upper() + name[1:])
        if key_elem is None:
            continue

        container = key_elem.getparent()
        value = container[container.index(key_elem) + 1][0]
        kind = value.tag

        if kind == "string" and value.text is not None:
            target[name] = unescapeHTMLEntities(value.text)
            continue
        if kind != "ref":
            target[name] = "Unknown tag: " + kind
            continue

        # Reference: take the text from the data nodes of the object it names.
        for candidate in tree.iterfind(".//object"):
            if candidate.get("id") != value.get("id"):
                continue
            for data in candidate.iterdescendants(tag="data"):
                if data.text is not None:
                    target[name] = unescapeHTMLEntities(data.text)
    return {"target": target}

예제 #5

0

파일 보기

파일: build_objs.py 프로젝트: toejamhoney/peepdf-js_analyse

def create_app_obj(tree):
    """Build the ``app`` dictionary describing the emulated viewer.

    Application attributes found in *tree* (looked up through
    ``search_tree`` with the first letter capitalised) are stored, unescaped,
    under ``app['doc']``.  Fixed viewer values are then added and the
    annotations are collected by ``get_annots``.

    @param tree: element tree root to search
    @return: the populated ``app`` dict
    """
    wanted = (
        "calculate", "formsVersion", "fullscreen", "language", "numPlugins",
        "openInPlace", "platform", "toolbar", "toolbarHorizontal",
        "toolbarVertical",
    )
    document = {}
    for attr in wanted:
        key_elem = search_tree(tree, ".//key", attr[0].upper() + attr[1:])
        if key_elem is None:
            continue
        container = key_elem.getparent()
        value_node = container[container.index(key_elem) + 1][0]
        document[attr] = unescapeHTMLEntities(value_node.text)

    document['annots'] = []
    document['viewerType'] = 'Reader'

    app = {}
    app['doc'] = document
    app['viewerType'] = 'Reader'
    app['viewerVersion'] = 5.0
    app['plugIns'] = [{'version': 6.0}, {'version': 7.5}, {'version': 8.7}, {'version': 9.1}]
    # NOTE(review): parsed 'language'/'platform' values live in app['doc'],
    # so these top-level defaults are always applied -- confirm that is the
    # intended behaviour.
    app.setdefault('language', "ENU")
    app.setdefault('platform', "WIN")

    get_annots(app, tree)
    return app

예제 #6

0

파일 보기

파일: JSAnalysis.py 프로젝트: KillerInstinct/elastic-cuckoo-modified

def extractJS(code):
    '''
        Return the given code unchanged.

        The function returned its argument on the first statement; the
        extraction pipeline that followed (HTML entity unescaping, <script>
        extraction and beautifying) could never run and has been removed.

        @param code: The Javascript code (string)
        @return: The same object that was passed in
    '''
    return code

예제 #7

0

파일 보기

def analyseJS(code, context=None, manualAnalysis=False):
    '''
        Hooks the eval function and search for obfuscated elements in the Javascript code

        @param code: The Javascript code (string)
        @param context: PyV8 context to reuse; a new one is created when it is None
        @param manualAnalysis: If true the code is only beautified: it is neither executed nor scanned for unescape calls
        @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound,errors,context], where
                JSCode is a list with the several stages Javascript code,
                unescapedBytes is a list with the parameters of unescape functions,
                urlsFound is a list with the URLs found in the unescaped bytes,
                errors is a list of errors,
                context is the context of execution of the Javascript code.
    '''
    errors = []
    JSCode = []
    unescapedBytes = []
    urlsFound = []

    try:
        code = unescapeHTMLEntities(code)
        scriptElements = re.findall(reJSscript, code,
                                    re.DOTALL | re.IGNORECASE)
        if scriptElements:
            # Only the content of the script elements is analysed
            code = ''.join(scriptElement + '\n\n'
                           for scriptElement in scriptElements)
        code = jsbeautifier.beautify(code)
        JSCode.append(code)

        if code is not None and JS_MODULE and not manualAnalysis:
            if context is None:
                with PyV8.JSLocker():
                    context = PyV8.JSContext(Global())
            with PyV8.JSLocker():
                context.enter()
                try:
                    # Hooking the eval function
                    context.eval('eval=evalOverride')
                    # Each pass executes the current stage and reads back
                    # what the hooked eval captured as the next stage
                    while True:
                        try:
                            context.eval(code)
                            evalCode = context.eval('evalCode')
                            evalCode = jsbeautifier.beautify(evalCode)
                            if evalCode != '' and evalCode != code:
                                code = evalCode
                                JSCode.append(code)
                            else:
                                break
                        except:
                            # Deliberately broad: any failure while running
                            # the sample ends the unpacking and is reported
                            error = str(sys.exc_info()[1])
                            with open('jserror.log', 'ab') as errorLog:
                                errorLog.write(error + newLine)
                            errors.append(error)
                            break
                finally:
                    # Leave the context even if hooking eval failed
                    context.leave()
            if code != '':
                escapedVars = re.findall(
                    r'(\w*?)\s*?=\s*?(unescape\((.*?)\))', code, re.DOTALL)
                for var in escapedVars:
                    argument = var[2]
                    if '+' in argument or '%' not in argument:
                        # Not a plain escaped literal: resolve its content
                        payload = getVarContent(code, argument)
                    else:
                        # Escaped literal: drop first and last character
                        # (presumably the quotes)
                        payload = argument[1:-1]
                    if len(payload) > 150:
                        ret = unescape(payload)
                        if ret[0] != -1:
                            decoded = ret[1]
                            urls = re.findall('https?://.*$', decoded,
                                              re.DOTALL)
                            if decoded not in unescapedBytes:
                                unescapedBytes.append(decoded)
                            for url in urls:
                                if url not in urlsFound:
                                    urlsFound.append(url)
    except:
        with open(errorsFile, 'a') as errorLog:
            traceback.print_exc(file=errorLog)
        errors.append('Unexpected error in the JSAnalysis module!!')
    finally:
        # Build a new list: removing entries while iterating over the same
        # list skips the element that follows each removed one
        JSCode = [js for js in JSCode if js is not None and js != '']
    return [JSCode, unescapedBytes, urlsFound, errors, context]

예제 #8

0

파일 보기

파일: JSAnalysis.py 프로젝트: cmu-sei/pdfrankenstein

def analyseJS(code, context=None, manualAnalysis=False):
    '''
        Hooks the eval function and search for obfuscated elements in the Javascript code

        The scan for unescape calls was disabled in this variant (it sat
        behind an "if False:" guard) and has been removed, so unescapedBytes
        and urlsFound are always returned empty.

        @param code: The Javascript code (string)
        @param context: PyV8 context to reuse; a new one is created when it is None
        @param manualAnalysis: If true the code is only beautified, not executed
        @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound,errors,context], where
                JSCode is a list with the several stages Javascript code,
                unescapedBytes is an empty list (see above),
                urlsFound is an empty list (see above),
                errors is a list of errors,
                context is the context of execution of the Javascript code.
    '''
    errors = []
    JSCode = []
    unescapedBytes = []
    urlsFound = []

    try:
        code = unescapeHTMLEntities(code)
        scriptElements = re.findall(reJSscript, code, re.DOTALL | re.IGNORECASE)
        if scriptElements:
            # Only the content of the script elements is analysed
            code = ''.join(scriptElement + '\n\n' for scriptElement in scriptElements)
        code = jsbeautifier.beautify(code)
        JSCode.append(code)

        if code is not None and JS_MODULE and not manualAnalysis:
            if context is None:
                context = PyV8.JSContext(Global())
            context.enter()
            # Hooking the eval function
            context.eval('eval=evalOverride')
            # Each pass executes the current stage and reads back what the
            # hooked eval captured as the next stage
            while True:
                try:
                    context.eval(code)
                    evalCode = context.eval('evalCode')
                    evalCode = jsbeautifier.beautify(evalCode)
                    if evalCode != '' and evalCode != code:
                        code = evalCode
                        JSCode.append(code)
                    else:
                        break
                except:
                    # Deliberately broad: any failure while running the
                    # sample ends the unpacking and is reported
                    error = str(sys.exc_info()[1])
                    with open('jserror.log', 'ab') as errorLog:
                        errorLog.write(error + newLine)
                    errors.append(error)
                    break
    except:
        with open(errorsFile, 'a') as errorLog:
            traceback.print_exc(file=errorLog)
        errors.append('Unexpected error in the JSAnalysis module!!')
    finally:
        # Build a new list: removing entries while iterating over the same
        # list skips the element that follows each removed one
        JSCode = [js for js in JSCode if js is not None and js != '']
    return [JSCode, unescapedBytes, urlsFound, errors, context]

예제 #9

0

파일 보기

def analyseJS(code, context=None, manualAnalysis=False):
    '''
        Hooks the eval function and searches for obfuscated elements in the
        Javascript code.

        @param code: The Javascript code (string)
        @param context: PyV8 context to reuse; a new one is created when it is None
        @param manualAnalysis: If true the code is only beautified: it is neither executed nor scanned for unescape calls
        @return: [jsCode, unESbs, urlsFound, errors, context], where
                jsCode is the list of code stages found,
                unESbs is the list of decoded unescape parameters,
                urlsFound is the list of URLs found in those decoded bytes,
                errors is a list of error messages,
                context is the context the code was executed in.
    '''
    errors = []
    jsCode = []
    unESbs = []
    urlsFound = []

    try:
        code = unescapeHTMLEntities(code)
        scriptElements = re.findall(reJSscript, code,
                                    re.DOTALL | re.IGNORECASE)
        if scriptElements:
            # Only the content of the script elements is analysed
            code = ''.join(scriptElement + '\n\n'
                           for scriptElement in scriptElements)
        code = jsbeautifier.beautify(code)
        jsCode.append(code)

        # Execute only when there is code, the PyV8 module is available and
        # the caller did not ask for manual analysis
        if code is not None and JS_MODULE and not manualAnalysis:
            if context is None:
                context = PyV8.JSContext(Global())
            context.enter()
            # Hooking the eval function
            context.eval('eval=evalOverride')
            # Each pass executes the current stage and reads back what the
            # hooked eval stored in 'reCode' as the next stage
            while True:
                try:
                    context.eval(code)
                    reCode = context.eval('reCode')
                    reCode = jsbeautifier.beautify(reCode)
                    if reCode != '' and reCode != code:
                        code = reCode
                        jsCode.append(code)
                    else:
                        break
                except:
                    # Deliberately broad: any failure while running the
                    # sample ends the unpacking and is reported
                    error = str(sys.exc_info()[1])
                    with open('jserror.log', 'ab') as errorLog:
                        errorLog.write(error + newLine)
                    errors.append(error)
                    break

            if code != '':
                # Look for variables assigned from unescape(...) so that
                # their content can be decoded
                escapedVars = re.findall(
                    r'([-a-zA-Z0-9]*?)\s*?=\s*?(unescape\((.*?)\))', code,
                    re.DOTALL)
                for var in escapedVars:
                    argument = var[2]
                    if '+' in argument or '%' not in argument:
                        # Not a plain escaped literal: resolve its content
                        payload = getVarContent(code, argument)
                    else:
                        # Escaped literal: drop first and last character
                        # (presumably the quotes)
                        payload = argument[1:-1]
                    if len(payload) > 150:
                        ret = unescape(payload)
                        if ret[0] != -1:
                            decoded = ret[1]
                            urls = re.findall('https?://.*$', decoded,
                                              re.DOTALL)
                            if decoded not in unESbs:
                                unESbs.append(decoded)
                            for url in urls:
                                if url not in urlsFound:
                                    urlsFound.append(url)
    except:
        with open(errorsFile, 'a') as errorLog:
            traceback.print_exc(file=errorLog)
        errors.append('Unexpected error in the JSAnalysis module!!')
    finally:
        # Build a new list: removing entries while iterating over the same
        # list skips the element that follows each removed one
        jsCode = [js for js in jsCode if js is not None and js != '']
    return [jsCode, unESbs, urlsFound, errors, context]

예제 #10

0

파일 보기

def analyseJS(code):
    '''
        Search for obfuscated functions in the Javascript code

        While the analysis runs sys.stderr is redirected to an in-memory
        buffer; it is restored before the function exits.

        @param code: The Javascript code (string)
        @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound], where JSCode is a list with the several stages Javascript code, unescapedBytes is a list with the parameters of unescape functions, and urlsFound is a list with the URLs found in the unescaped bytes.
    '''
    # NOTE(review): the visible part of this function ends without the
    # documented return statement -- the excerpt looks truncated; confirm
    # against the full file.
    errors = []
    JSCode = []
    unescapedBytes = []
    urlsFound = []
    oldStdErr = sys.stderr
    errorFile = StringIO()
    sys.stderr = errorFile

    try:
        scriptCode = re.findall(reJSscript, code, re.DOTALL | re.IGNORECASE)
        if scriptCode != []:
            for c in scriptCode:
                # Keep the processed text: the results of these two calls
                # used to be thrown away and the raw element was stored
                code = unescapeHTMLEntities(c)
                code = jsbeautifier.beautify(code)
                JSCode.append(code)

        else:
            # No script elements: take the text that follows each "/JS"
            # marker, trimmed of a leading "(" and of the last ")" with
            # whatever follows it
            candidates = [
                re.sub(r'^\s*\(', '',
                       re.sub(r'\)[^\)]+$', '',
                              a.split('JavaScript')[0]))
                for a in re.split(r'/\s*JS', code)[1:]
            ]
            # Entries starting with two numbers are dropped (presumably
            # object references -- TODO confirm). A real list is built so
            # the emptiness test below also works where filter() is lazy.
            code_items = [
                item for item in candidates
                if re.match(r'^\s*\d+\s+\d+', item) is None
            ]
            if code_items:
                for ci in code_items:
                    # Undo the backslash escaping of the string
                    ci = ci.replace("\\\\", "\\").replace("\\(", "(").replace(
                        "\\)",
                        ")").replace("\\ ",
                                     " ").replace("\\r",
                                                  "\r").replace("\\n", "\n")
                    ci = unescapeHTMLEntities(ci)
                    ci = jsbeautifier.beautify(ci)

                    JSCode.append(ci)
            else:
                code = unescapeHTMLEntities(code)
                code = jsbeautifier.beautify(code)
                JSCode.append(code)

        # JSCode grows while it is being iterated: every stage appended
        # below is analysed in turn by this same loop
        for code in JSCode:
            if code != None and JS_MODULE:
                r = Runtime()
                context = r.new_context()
                while True:
                    evalFunctionsData = searchObfuscatedFunctions(code, 'eval')
                    originalElement = code
                    for evalFunctionData in evalFunctionsData:
                        # First attempt: which rewrite is tried first depends
                        # on the flag in evalFunctionData[2]
                        if not evalFunctionData[2]:
                            modifiedCode = evalFunctionData[1][0].replace(
                                evalFunctionData[0], 'return')
                            code = originalElement.replace(
                                evalFunctionData[1][0], modifiedCode)
                        else:
                            code = originalElement.replace(
                                evalFunctionData[1][0],
                                evalFunctionData[1][1] + ';')
                        try:
                            executedJS = context.eval_script(code)
                            if executedJS == None:
                                raise Exception
                            break
                        except:
                            # Second attempt: the other rewrite
                            if evalFunctionData[2]:
                                modifiedCode = evalFunctionData[1][0].replace(
                                    evalFunctionData[0], 'return')
                                code = originalElement.replace(
                                    evalFunctionData[1][0], modifiedCode)
                            else:
                                code = originalElement.replace(
                                    evalFunctionData[1][0],
                                    evalFunctionData[1][1] + ';')
                            try:
                                executedJS = context.eval_script(code)
                                if executedJS == None:
                                    raise Exception
                            except:
                                code = originalElement
                                continue
                    else:
                        # The for loop ended without a break on the first
                        # attempt: stop unpacking this stage
                        break
                    if executedJS != originalElement and executedJS != None and executedJS != '':
                        code = executedJS
                        JSCode.append(code)
                    else:
                        break

                if code != None:
                    escapedVars = re.findall(
                        r'(\w*?)\s*?=\s*?(unescape\((.*?)\))', code, re.DOTALL)
                    for var in escapedVars:
                        argument = var[2]
                        if '+' in argument:
                            # Concatenation: resolve its content
                            payload = getVarContent(code, argument)
                        else:
                            # Drop first and last character (presumably the
                            # quotes)
                            payload = argument[1:-1]
                        if len(payload) > 150:
                            ret = unescape(payload)
                            if ret[0] != -1:
                                decoded = ret[1]
                                urls = re.findall('https?://.*$', decoded,
                                                  re.DOTALL)
                                if decoded not in unescapedBytes:
                                    unescapedBytes.append(decoded)
                                for url in urls:
                                    if url not in urlsFound:
                                        urlsFound.append(url)
    except Exception as e:
        errors.append('Unknown error!! [%s]' % e)
    finally:
        # Never leave the process-wide stderr pointing at the buffer
        sys.stderr = oldStdErr

예제 #11

0

파일 보기

파일: JSAnalysis.py 프로젝트: Tholep/mpeepdf

def _buildInfoPreamble(infoObjects):
    '''
    Build the Javascript statements exposing the PDF info entries.

    For every entry present, "info.<name>" is assigned the value wrapped in
    String("...") and the aliases are then assigned from it.

    NOTE(review): the value is inserted into the Javascript source without
    escaping, so a value containing a double quote changes the generated
    code -- confirm this is acceptable for the analysed samples.

    @param infoObjects: iterable of objects providing getElements()
    @return: the generated Javascript code (string)
    '''
    fields = (
        ('/Creator', 'creator',
         ('this.creator', 'info.Creator', 'app.doc.creator',
          'app.doc.Creator')),
        ('/Title', 'title',
         ('this.title', 'info.Title', 'app.doc.title', 'app.doc.Title')),
        ('/Subject', 'subject',
         ('this.subject', 'info.Subject', 'app.doc.subject',
          'app.doc.Subject')),
        ('/Author', 'author',
         ('this.author', 'info.Author', 'app.doc.author',
          'app.doc.Author')),
        ('/CreationDate', 'creationdate',
         ('this.creationdate', 'info.CreationDate', 'app.doc.creationdate',
          'app.doc.CreationDate', 'app.doc.creationDate',
          'info.creationDate')),
    )
    statements = []
    for obj in infoObjects:
        elements = obj.getElements()
        for key, name, aliases in fields:
            if key not in elements:
                continue
            value = elements[key].getValue()
            statements.append('info.%s = String("%s");\n' % (name, str(value)))
            for alias in aliases:
                statements.append('%s = info.%s;\n' % (alias, name))
    return ''.join(statements)


def _recordUnescaped(payload, unescapedBytes, urlsFound):
    '''
    Decode an escaped payload and record it together with the URLs in it.

    Payloads of 150 characters or fewer, and payloads that unescape() fails
    on, are ignored. Both lists are updated in place without duplicates.
    '''
    if len(payload) <= 150:
        return
    ret = unescape(payload)
    if ret[0] == -1:
        return
    decoded = ret[1]
    if decoded not in unescapedBytes:
        unescapedBytes.append(decoded)
    for url in re.findall(r'https?://[a-zA-Z0-9\./]+', decoded, re.DOTALL):
        if url not in urlsFound:
            urlsFound.append(str(url))


def JSUnpack(code,
             rawCode=None,
             infoObjects=None,
             annotsInPagesMaster='[]',
             annotsNameInPagesMaster='[]',
             manualAnalysis=False):
    '''
    Hooks the eval function with multiple app versions and search for obfuscated elements in the Javascript code.
    Also take data from XFA, object info and getAnnot(s) in a PDF to an original code. The idea is mainly taken from JSUnpack

    @param code: the Javascript code (string)
    @param rawCode: The raw Javascript code, may contains HTML, XML elements (string)
    @param infoObjects: is list of infoObjects of a PDF (None means no info objects)
    @param annotsInPagesMaster: is a list of annotation per page
    @param annotsNameInPagesMaster: is a dictionary of annotation by name
    @param manualAnalysis: analyse manually or automatic (boolean)
    @return: Dictionary indexed by viewer version ('9.0', '10.0', '11.0'); each value is a list [jsCode,unescapedBytes,urlsFound,errors], where
            jsCode is a list with the several stages Javascript code,
            unescapedBytes is a list with the parameters of unescape functions,
            urlsFound is a list with the URLs found in the unescaped bytes,
            errors is a list of errors.
            The dictionary is empty when manualAnalysis is true.
    '''

    # One entry per app.viewerVersion, see @return
    valuesFoundByViewerVersion = {}
    if infoObjects is None:
        infoObjects = []
    # Take variable name(s) of xml elements (e.g. in XFA, Acroform)
    XMLVar = ''
    # Take annotation data
    annotsInPagesMaster = "var annotsInPagesMaster = %s;\n" % (
        str(annotsInPagesMaster))
    annotsNameInPagesMaster = "var annotsNameInPagesMaster = %s;\n" % (
        str(annotsNameInPagesMaster))

    # Version strings
    pdfVersions = ['9.0', '10.0', '11.0']

    # Defined before the try block so that the error handler below can use
    # it even when the failure happens before the first version is processed
    errors = []

    # Pre-code with data from the information objects
    preInfo = _buildInfoPreamble(infoObjects)

    # Get xml variable name
    if rawCode is not None:
        try:
            doc = xml.dom.minidom.parseString(rawCode)
            for script in doc.getElementsByTagNameNS("*", "script"):
                nameVar = script.parentNode.parentNode.getAttribute('name')
                if nameVar:
                    XMLVar += nameVar + " = this;\n"
        except Exception:
            # Best effort: rawCode that cannot be processed as XML simply
            # contributes no (further) variable names
            pass

    # Pre-process input code, same as in analyseJS
    try:
        code = unescapeHTMLEntities(code)
        scriptElements = re.findall(reJSscript, code,
                                    re.DOTALL | re.IGNORECASE)
        if scriptElements:
            code = ''.join(scriptElement + '\n\n'
                           for scriptElement in scriptElements)
        code = jsbeautifier.beautify(code)

        if code is not None and not manualAnalysis:
            originalCode = code
            for version in pdfVersions:
                # Initialize 4 lists for each PDF version
                errors = []
                jsCode = []
                unescapedBytes = []
                urlsFound = []

                code = originalCode
                jsCode.append(code)
                viewerVersion = 'app.viewerVersion = Number(%s);\n' % (version)
                while True:
                    # Detect shellcode in code
                    if code != '':
                        # Detect shellcode and embedded URL(s) in case of using unescape function. e.g. x = unescape(%u0A0A%0B0B)
                        escapedVars = re.findall(
                            r'(\w*?)\s*?=\s*?(unescape\((.*?)\))', code,
                            re.DOTALL)
                        for var in escapedVars:
                            argument = var[2]
                            if '+' in argument or '%' not in argument:
                                # Not a plain escaped literal: resolve it
                                payload = getVarContent(code, argument)
                            else:
                                # Drop first and last character (presumably
                                # the quotes)
                                payload = argument[1:-1]
                            _recordUnescaped(payload, unescapedBytes,
                                             urlsFound)
                        # Detect shellcode in case of finding variable assigned to an escaped string
                        # post.js produce a signature. e.g. #//shellcode len 767 (including any NOPs) payload = %u0A0A%u0A0A%u0A0A%uE1D9%u34D9%u5824%u5858
                        escapedVars = re.findall(
                            r'//shellcode (pdf|len) (\d+) .*? = (.*)$', code,
                            re.DOTALL)
                        for var in escapedVars:
                            _recordUnescaped(str(var[2]), unescapedBytes,
                                             urlsFound)

                    if not isJavascript(code):
                        break
                    # Prepare JS code
                    code = viewerVersion + preInfo + annotsInPagesMaster + annotsNameInPagesMaster + XMLVar + code
                    # Hook eval and run Javascript
                    _, evalCode, error = evalJS(code)
                    evalCode = jsbeautifier.beautify(evalCode)
                    if error != "":
                        errors.append(error)

                    # Stop when there is no next stage or it did not change
                    if evalCode is None or evalCode == '' or evalCode == code:
                        break
                    # Assign code to the next stage and evaluate it again
                    code = evalCode
                    if isJavascript(code):
                        jsCode.append(code)
                valuesFoundByViewerVersion[version] = [
                    jsCode, unescapedBytes, urlsFound, errors
                ]
    except:
        with open(errorsFile, 'a') as errorLog:
            traceback.print_exc(file=errorLog)
        # NOTE(review): results of the version being processed when the
        # failure happened are not stored, so this message is only visible
        # in the errors file -- confirm this is intended
        errors.append('Unexpected error in the JSUnpack module!!')

    return valuesFoundByViewerVersion

예제 #12

0

파일 보기

파일: JSAnalysis.py 프로젝트: Marhaus524/peepdf

def _collectUnescapedData(escapedContent, unescapedBytes, urlsFound):
    """
    Unescapes a long escaped string and records the result and the URLs in it

    @param escapedContent: The escaped string to decode; content of 150 characters or fewer is ignored
    @param unescapedBytes: List updated in place with the unescaped content (duplicates are skipped)
    @param urlsFound: List updated in place with the URLs found in the unescaped content (duplicates are skipped)
    """
    if len(escapedContent) <= 150:
        return
    ret = unescape(escapedContent)
    # A first element of -1 is treated as a failed unescape and ignored
    if ret[0] == -1:
        return
    unescaped = ret[1]
    urls = re.findall("https?://.*$", unescaped, re.DOTALL)
    if unescaped not in unescapedBytes:
        unescapedBytes.append(unescaped)
    for url in urls:
        if url not in urlsFound:
            urlsFound.append(url)


def analyseJS(code, context=None, manualAnalysis=False):
    """
    Hooks the eval function and search for obfuscated elements in the Javascript code

    @param code: The Javascript code (string)
    @param context: The execution context to reuse; a new one is created when it is None
                    and the Javascript engine is going to be used
    @param manualAnalysis: If True the code is only cleaned and beautified: it is not
                           executed and no unescape parameters are searched
    @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound,errors,context], where
            JSCode is a list with the several stages Javascript code,
            unescapedBytes is a list with the parameters of unescape functions,
            urlsFound is a list with the URLs found in the unescaped bytes,
            errors is a list of errors,
            context is the context of execution of the Javascript code.
    """
    errors = []
    jsCode = []
    unescapedBytes = []
    urlsFound = []

    try:
        code = unescapeHTMLEntities(code)
        scriptElements = re.findall(reJSscript, code,
                                    re.DOTALL | re.IGNORECASE)
        if scriptElements:
            # Keep only the content of the script elements
            code = "".join(scriptElement + "\n\n"
                           for scriptElement in scriptElements)
        code = jsbeautifier.beautify(code)
        jsCode.append(code)

        if code is not None and JS_MODULE and not manualAnalysis:
            if context is None:
                context = PyV8.JSContext(Global())
            # NOTE(review): the context is entered but never left here; it is
            # returned to the caller, who presumably keeps using it - confirm
            context.enter()
            # Hooking the eval function
            context.eval("eval=evalOverride")
            # context.eval(preDefinedCode)
            # Each pass runs the current stage; whatever it handed to eval
            # becomes the next stage, until nothing new shows up
            while True:
                try:
                    context.eval(code)
                    evalCode = context.eval("evalCode")
                    evalCode = jsbeautifier.beautify(evalCode)
                    if evalCode != "" and evalCode != code:
                        code = evalCode
                        jsCode.append(code)
                    else:
                        break
                except Exception:
                    error = str(sys.exc_info()[1])
                    with open("jserror.log", "ab") as jsErrorLog:
                        jsErrorLog.write(error + newLine)
                    errors.append(error)
                    break

            if code != "":
                escapedVars = re.findall(
                    r"(\w*?)\s*?=\s*?(unescape\((.*?)\))", code, re.DOTALL)
                for var in escapedVars:
                    unescapeArg = var[2]
                    if unescapeArg.find("+") != -1 or unescapeArg.find("%") == -1:
                        # The argument is not a plain escaped literal, so its
                        # content is resolved from the code first
                        escapedContent = getVarContent(code, unescapeArg)
                    else:
                        # Plain literal: drop the first and last characters
                        # (presumably the surrounding quotes)
                        escapedContent = unescapeArg[1:-1]
                    _collectUnescapedData(escapedContent, unescapedBytes,
                                          urlsFound)
    except Exception:
        with open(errorsFile, "a") as errorLog:
            traceback.print_exc(file=errorLog)
        errors.append("Unexpected error in the JSAnalysis module!!")

    # Drop empty stages by building a new list: removing elements while
    # iterating over the same list skips the element after each removal
    jsCode = [js for js in jsCode if js is not None and js != ""]
    return [jsCode, unescapedBytes, urlsFound, errors, context]