# JSAnalysis.py

try:
    import PyV8
except ImportError as e:
    print str(e)
    PyV8 = None

import re
import lxml.etree as ET

import build_pdf_objects
from util.str_utils import unescapeHTMLEntities

# Pattern for <script> elements whose contentType attribute is
# "application/x-javascript"; the single group captures the script body.
# isJavascript() applies it with re.DOTALL | re.IGNORECASE.
reJSscript = '<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'

#Mimic native Adobe objects and add them to the context
def create_objs(context, tree):
    '''
        Inject stand-ins for the Adobe "app", document info and "event"
        objects into the given JavaScript context.

        Each of the three objects is built by build_pdf_objects from tree and
        installed independently: a failure while installing one of them stops
        the remaining statements of that group only.

        @param context: Object exposing eval(), used to run the JS statements
        @param tree: Value handed to the build_pdf_objects factory functions
    '''
    # --- app object plus the stub methods scripts commonly call on it ---
    try:
        app_obj = build_pdf_objects.create_app_obj(tree)
        app_statements = (
            "app = " + str(app_obj) + ";",
            "app.doc.syncAnnotScan = function () {}",
            "app.doc.getAnnots = function () { return app.doc.annots;}",
            "app.eval = function (string) { eval(string);}",
            "app.newDoc = function () { return '';}",
            "app.getString = function () { ret = \"\"; for(var prop in app){ ret += app[prop]; } return ret;}",
        )
        for statement in app_statements:
            context.eval(statement)
    except Exception:
        # Best effort: analysis continues without (or with a partial) app object.
        pass

    # --- document info, also mirrored as top-level properties of "this" ---
    try:
        doc_info = build_pdf_objects.create_info_obj(tree)
        context.eval("this.info = " + str(doc_info) + ";")
        for name in doc_info:
            escaped_value = re.escape(doc_info[name])
            context.eval("this." + name + "= '" + escaped_value + "';")
        context.eval("this.eval = eval")
    except Exception as e:
        print("Info: " + e.message)

    # --- event object; its target shares the info object installed above ---
    try:
        event_obj = build_pdf_objects.create_event_obj(tree)
        for statement in ("event = " + str(event_obj) + ";",
                          "event.target.info = this.info"):
            context.eval(statement)
    except Exception:
        # Best effort, same as for the app object.
        pass

def _error_message(err):
    '''
        Return the message text of an exception raised by context.eval().
        Falls back to str(err) when the exception has no "message" attribute.
    '''
    message = getattr(err, "message", None)
    if message is None:
        message = str(err)
    return message


def _comment_out_error_line(code, message):
    '''
        Comment out the line of code named by the "@ <line> " marker in message.

        @return: The patched code (unchanged if the line number is out of
                 range), or None when message carries no usable line number
    '''
    found = re.findall(r"@\s(\d*?)\s", message)
    if not found or not found[0]:
        return None
    line_num = int(found[0])
    lines = code.split("\n")
    if 1 <= line_num <= len(lines):
        # Patch by index: the line text is never used as a regex pattern or
        # replacement template, and identical text elsewhere is left alone.
        lines[line_num - 1] = "//" + lines[line_num - 1]
    return "\n".join(lines)


def _error_source_line(message):
    '''
        Return the text following "-> " in message, or None if it is absent
        or empty.
    '''
    found = re.findall(r"->\s(.*)", message)
    if not found or not found[0]:
        return None
    return found[0]


def _swap_fragment(code, source_line, pattern, replacement):
    '''
        Find the first group match of pattern inside source_line, replace that
        fragment with replacement, and apply the patched line to code.

        @return: The patched code, or None when there is nothing to patch
    '''
    if source_line is None:
        return None
    fragments = re.findall(pattern, source_line)
    if not fragments or not fragments[0]:
        return None
    # Literal replacement: the fragment comes from the analysed script and
    # may contain regex metacharacters.
    patched_line = source_line.replace(fragments[0], replacement)
    return code.replace(source_line, patched_line)


def _repair_type_error(code, message):
    '''
        Patch code according to the kind of TypeError described by message.

        @return: The patched code, or None when no repair is known/possible
    '''
    source_line = _error_source_line(message)
    if "called on null or undefined" in message:
        #in Adobe undefined objects become app object
        if source_line is None:
            return None
        patched_line, count = re.subn(r"=\s?.\(.*?\)", "=app", source_line)
        if count < 1:
            patched_line = re.sub(r"=.*", "=app", source_line)
        return code.replace(source_line, patched_line)
    if "undefined is not a function" in message:
        #sub in eval as a guess
        return _swap_fragment(code, source_line, r"[\s=]?(.*?)\(", "eval")
    if "Cannot read property" in message:
        #undefined becomes app
        return _swap_fragment(code, source_line, r"[=\s](.*?)\[", "app")
    return None


def eval_loop (code, context, old_msg = "", limit=10):
    '''
        Eval the code and handle any exceptions it throws.

        On ReferenceError, TypeError or SyntaxError the code is patched
        (offending line commented out, undefined names replaced by "app" or
        "eval", "$" aliased to "this") and evaluated again. Retrying stops
        when the same error message repeats, when no repair applies, or when
        limit repair rounds have been used.

        @param code: JavaScript source to evaluate
        @param context: Object exposing eval(); must define "evalCode"
        @param old_msg: Error message of the previous attempt
        @param limit: Maximum number of repair-and-retry rounds
        @return: The value of the JS variable "evalCode" in context
    '''
    try:
        context.eval(code)
        return context.eval("evalCode")
    #catch exceptions and attempt to fix them
    except (ReferenceError, TypeError, SyntaxError) as err:
        message = _error_message(err)
        repaired = None
        if message != old_msg and limit > 0:
            if isinstance(err, ReferenceError):
                if '$' in message:
                    context.eval("$ = this;")
                    repaired = code
                else:
                    #try commenting out line
                    repaired = _comment_out_error_line(code, message)
            elif isinstance(err, TypeError):
                repaired = _repair_type_error(code, message)
            else:
                #try commenting out the line number with the error
                repaired = _comment_out_error_line(code, message)
        if repaired is None:
            # Give up and hand back whatever was captured so far.
            return context.eval("evalCode")
        return eval_loop(repaired, context, message, limit - 1)
    except Exception:
        return context.eval("evalCode")

def analyse (js, tree):
    '''
        Run js in a fresh PyV8 context whose eval() is replaced by a function
        that appends its argument to the JS variable "evalCode", and return
        what was collected.

        @param js: JavaScript source to analyse
        @param tree: Passed to create_objs() to build the emulated Adobe
                     objects; skipped when None
        @return: The collected "evalCode" value, or '' when PyV8 is not
                 available, nothing was collected, or the analysis failed
    '''
    if not PyV8:
        return ''
    with PyV8.JSIsolate():
        context = PyV8.JSContext()
        context.enter()
        try:
            # Redirect eval() so code the script tries to run is recorded
            # instead of executed.
            context.eval('evalCode = \'\';')
            context.eval('evalOverride = function (expression) { evalCode += expression; return;}')
            context.eval('eval=evalOverride')
            try:
                if tree is not None:
                    create_objs(context, tree)
                ret = eval_loop(js, context)
            except Exception:
                # Analysis is best effort: report "nothing found" on failure.
                return ''
        finally:
            # Always leave the context, whichever path is taken above.
            context.leave()
        return '' if ret is None else ret

def isJavascript(content):
    '''
        Heuristically decide whether a string contains Javascript code.

        The content is HTML-unescaped first. It is accepted immediately if it
        contains a <script> element matched by reJSscript, and rejected if it
        holds any character outside printable ASCII (apart from a few
        whitespace/control characters) or lacks any of ';', '(' and ')'.
        Otherwise it is accepted only if typical Javascript strings occur
        often enough and with enough variety.

        @param content: A string
        @return: A boolean, True if it seems to contain Javascript code or False in the other case
    '''
    JSStrings = ['var ',';',')','(','function ','=','{','}','if ','else','return','while ','for ',',','eval', 'unescape', '.replace']
    keyStrings = [';','(',')']
    allowedControlChars = ['\n','\r','\t','\f','\x00']
    limit = 15
    minDistinctStringsFound = 5

    content = unescapeHTMLEntities(content)

    # An explicit Javascript <script> element settles the question.
    if re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE):
        return True

    # Binary-looking content is not treated as source code.
    for char in content:
        codePoint = ord(char)
        if codePoint >= 127 or (codePoint < 32 and char not in allowedControlChars):
            return False

    totalHits = 0
    distinctFound = set()
    for token in JSStrings:
        hits = content.count(token)
        if hits > 0:
            totalHits += hits
            distinctFound.add(token)
        elif token in keyStrings:
            # Every key string must be present at least once.
            return False

    return totalHits > limit and len(distinctFound) >= minDistinctStringsFound