def get_value(elem, root):
    '''
        Convert an XML element into a plain Python value, chosen by its tag:

          - literal / number / keyword: the element text passed through
            unescapeHTMLEntities
          - string: the element text, base64-decoded and then unescaped
          - ref: the value of the first item returned by get_ref_object for
            the element's "id" attribute (resolved recursively)
          - stream: the base64-decoded, unescaped text of the second child
          - dict: a dictionary mapping each entry's tag to the value of that
            entry's first child
          - list: a list with the value of each child
          - any other tag (Rect, field, xfa, Media, etc): None

        For dict and list only the first "size" children are read, where
        "size" is the element's size attribute with any '%' removed. Children
        whose value is None are left out of the result.

        @param elem: The element to convert (must expose tag, text, get() and indexed/iterable children)
        @param root: Passed through to get_ref_object when resolving references
        @return: A string, dictionary or list depending on the tag, or None for unsupported tags
        @raise IndexError: If the declared size is larger than the number of children
    '''
    tag = elem.tag

    if tag in ("literal", "number", "keyword"):
        return unescapeHTMLEntities(elem.text)

    if tag == "string":
        # NOTE(review): str.decode('base64') only exists on Python 2 strings;
        # this branch needs porting before it can run on Python 3.
        return unescapeHTMLEntities(elem.text.decode('base64'))

    if tag == "ref":
        # Find the referenced object and return its value.
        obj = get_ref_object(elem.get('id'), root)
        return get_value(obj[0], root)

    if tag == "stream":
        # The encoded stream content is read from the second child.
        return unescapeHTMLEntities(elem[1].text.decode('base64'))

    if tag not in ("dict", "list"):
        # Some tags are not accounted for: Rect, field, xfa, Media, etc.
        return None

    # The size attribute may carry '%' characters that must go before int().
    count = int(re.sub("%", "", elem.get("size")))
    # list(elem) replaces Element.getchildren(), which was removed from
    # xml.etree.ElementTree in Python 3.9; both yield the direct children.
    children = list(elem)

    if tag == "dict":
        ret = {}
        # Index by the declared size (not len(children)) so that trailing
        # children are ignored and an over-declared size still raises.
        for i in range(count):
            entry = children[i]
            val = get_value(entry[0], root)
            if val is not None:
                ret[entry.tag] = val
        return ret

    ret = []
    for i in range(count):
        val = get_value(children[i], root)
        if val is not None:
            ret.append(val)
    return ret
def isJavascript(content):
    '''
        Heuristically decide whether a string looks like Javascript code.

        The content is first passed through unescapeHTMLEntities. It is
        accepted immediately when the reJSscript pattern matches, and rejected
        when it contains a character with code >= 127 or a control character
        other than newline, carriage return, tab, form feed or NUL. Otherwise
        typical Javascript tokens are counted: ';', '(' and ')' must all be
        present, the total number of token occurrences must exceed 15 and at
        least 5 distinct tokens must appear.

        @param content: A string
        @return: A boolean, True if it seems to contain Javascript code or False in the other case
    '''
    js_tokens = ('var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else',
                 'return', 'while ', 'for ', ',', 'eval', 'unescape', '.replace')
    mandatory_tokens = (';', '(', ')')
    allowed_control_chars = ('\n', '\r', '\t', '\f', '\x00')
    occurrence_threshold = 15  # the total count must be strictly greater
    min_distinct_tokens = 5

    text = unescapeHTMLEntities(content)

    # A match of the module-level script pattern is conclusive on its own.
    if re.search(reJSscript, text, re.DOTALL | re.IGNORECASE) is not None:
        return True

    # Non-printable or non-ASCII characters rule the content out.
    for ch in text:
        code = ord(ch)
        if code >= 127 or (code < 32 and ch not in allowed_control_chars):
            return False

    counts = [text.count(token) for token in js_tokens]

    # Every mandatory token has to occur at least once.
    for token, hits in zip(js_tokens, counts):
        if hits == 0 and token in mandatory_tokens:
            return False

    total_hits = sum(counts)
    distinct_tokens = len([hits for hits in counts if hits > 0])
    return total_hits > occurrence_threshold and distinct_tokens >= min_distinct_tokens