def get_value(elem, root):
    """Convert an XML element into a plain Python value.

    The conversion is selected by ``elem.tag``:

    * ``literal`` / ``number`` / ``keyword``: the element text, passed
      through ``unescapeHTMLEntities``.
    * ``string``: the element text, base64-decoded and then unescaped.
    * ``ref``: the object returned by ``get_ref_object(id, root)`` is looked
      up and the value of its first item is returned.
    * ``stream``: the text of the element's second child, base64-decoded
      and then unescaped.
    * ``dict``: a dict mapping each child's tag to the value of that child's
      first sub-element.
    * ``list``: a list with the value of each child.

    Inside ``dict`` and ``list`` containers, entries whose value is ``None``
    are dropped. At most ``size`` children (the element's ``size``
    attribute) are read, and never more than are actually present.

    @param elem: The element to convert
    @param root: The root passed on to get_ref_object to resolve references
    @return: The converted value, or None for any tag not listed above
    """
    tag = elem.tag
    if tag in ("literal", "number", "keyword"):
        return unescapeHTMLEntities(elem.text)
    if tag == "string":
        # NOTE: str.decode('base64') only exists on Python 2 strings.
        return unescapeHTMLEntities(elem.text.decode('base64'))
    if tag == "ref":
        # find the referenced object and return its value
        # NOTE(review): a reference chain that loops back on itself would
        # recurse without bound here -- confirm whether callers guard this.
        obj = get_ref_object(elem.get('id'), root)
        return get_value(obj[0], root)
    if tag == "stream":
        # NOTE: str.decode('base64') only exists on Python 2 strings.
        return unescapeHTMLEntities(elem[1].text.decode('base64'))
    if tag == "dict":
        # build the dictionary
        ret = {}
        for entry in _declared_children(elem):
            if len(entry) == 0:
                # An entry without a value element has nothing to convert.
                continue
            val = get_value(entry[0], root)
            if val is not None:
                ret[entry.tag] = val
        return ret
    if tag == "list":
        # build the list
        ret = []
        for item in _declared_children(elem):
            val = get_value(item, root)
            if val is not None:
                ret.append(val)
        return ret
    # some tags not accounted for: Rect, field, xfa, Media, etc
    return None


def _declared_children(elem):
    """Return the children of elem, capped at its declared ``size``."""
    # list(elem) replaces the deprecated Element.getchildren().
    children = list(elem)
    size = elem.get("size")
    if size is None:
        # No declared size: fall back to the children actually present.
        return children
    # The attribute may contain '%' characters; strip them before parsing.
    declared = int(size.replace("%", ""))
    # Slicing never reads past the real children, and a negative count
    # yields nothing (as range() did with a negative bound).
    return children[:max(declared, 0)]
def isJavascript(content):
    '''
        Heuristically decide whether a string looks like Javascript code.

        The text is first run through unescapeHTMLEntities. If reJSscript
        matches it, the answer is True straight away. Otherwise the text is
        rejected when it holds a character with code 127 or above, or a
        control character other than newline, carriage return, tab, form
        feed and NUL. Finally the occurrences of a set of typical Javascript
        tokens are counted: ';', '(' and ')' must all be present, more than
        15 occurrences are needed overall and at least 5 distinct tokens
        must appear.

        @param content: A string
        @return: A boolean, True if it seems to contain Javascript code or False in the other case
    '''
    tokens = ('var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else',
              'return', 'while ', 'for ', ',', 'eval', 'unescape', '.replace')
    mandatory = (';', '(', ')')
    allowedControls = ('\n', '\r', '\t', '\f', '\x00')
    minOccurrences = 15
    minDistinct = 5

    text = unescapeHTMLEntities(content)
    if re.findall(reJSscript, text, re.DOTALL | re.IGNORECASE):
        return True

    # Anything outside printable ASCII (plus a few control chars) rules it out.
    if any(ord(ch) >= 127 or (ord(ch) < 32 and ch not in allowedControls)
           for ch in text):
        return False

    # Every mandatory token has to show up at least once.
    if any(token not in text for token in mandatory):
        return False

    counts = [text.count(token) for token in tokens]
    total = sum(counts)
    distinct = sum(1 for count in counts if count > 0)
    return total > minOccurrences and distinct >= minDistinct
# Example #3
# 0
def isJavascript(content):
    """
    Guess whether the given string contains Javascript code.

    The string is unescaped with unescapeHTMLEntities and accepted at once
    when reJSscript matches it. It is rejected when it contains a character
    with code 127 or above, or a control character other than newline,
    carriage return, tab, form feed and NUL. Otherwise typical Javascript
    tokens are counted: the string is rejected as soon as ';', '(' or ')'
    turns out to be missing, and accepted only if there are more than 15
    occurrences overall spread over at least 5 distinct tokens.

    :param content: A string
    :return: A boolean, True if it seems to contain Javascript code or False in the other case
    """
    js_tokens = [
        'var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else',
        'return', 'while ', 'for ', ',', 'eval', 'unescape', '.replace'
    ]
    required_tokens = [';', '(', ')']
    tolerated_controls = ['\n', '\r', '\t', '\f', '\x00']
    occurrence_threshold = 15
    distinct_threshold = 5

    text = unescapeHTMLEntities(content)
    if re.findall(reJSscript, text, re.DOTALL | re.IGNORECASE):
        return True

    for symbol in text:
        code = ord(symbol)
        if code >= 127:
            return False
        if code < 32 and symbol not in tolerated_controls:
            return False

    # Map each token that occurs to how many times it occurs.
    hits_by_token = {}
    for token in js_tokens:
        hits = text.count(token)
        if hits > 0:
            hits_by_token[token] = hits
        elif token in required_tokens:
            return False

    if sum(hits_by_token.values()) <= occurrence_threshold:
        return False
    return len(hits_by_token) >= distinct_threshold