def scan_parser(code_content, sensitive_func, vul_lineno, file_path):
    """
    Entry point of the detection: parse the code and analyse each sensitive function.

    :param code_content: content of the file to inspect
    :param sensitive_func: sensitive functions to look for, passed in as a list
    :param vul_lineno: line number of the vulnerable function call (converted with ``int``)
    :param file_path: name of the file being inspected
    :return: the module-level ``scan_results`` list, which is reset at the start of every call
    """
    global scan_results

    try:
        scan_results = []
        php_parser = make_parser()
        ast_nodes = php_parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=with_line)

        # For every sensitive function, check whether it occurs in the code and,
        # if it does, recursively decide whether its arguments are controllable.
        # The whole AST is walked once per function, each time with a fresh
        # (empty) back-node list.
        for sensitive in sensitive_func:
            analysis(ast_nodes, sensitive, [], int(vul_lineno), file_path, function_params=None)
    except SyntaxError as err:
        logger.warning('[AST] [ERROR]:{e}'.format(e=err))

    return scan_results
def get_listener(code, fake_filename='filename.php'):
    """
    Parse ``code``, walk the resulting nodes with a fresh ``MyPHPListener`` and return it.

    :param code: source text handed to the parser
    :param fake_filename: name given to the listener
    :return: the listener after the traversal has run
    """
    php_parser = make_parser()
    ast_nodes = php_parser.parse(code, lexer=lexer.clone(), tracking=True, debug=False)

    # NOTE(review): the line map always names 'filename.php', even when a
    # different fake_filename is passed - confirm whether that is intended.
    listener = MyPHPListener(
        line_map=[(None, None), ('filename.php', 1)],
        name=fake_filename,
    )
    php_traverser.traverse(ast_nodes, listener)
    return listener
def deep_parameters_back(node, back_node, function_params, count, file_path):
    """
    Trace a parameter back through the given nodes and, when needed, through included files.

    ``parameters_back`` is run on the name of ``node`` first.  When it reports
    status ``3``, every ``php.Include`` found in ``back_node`` (searched from
    the end) is opened, parsed and searched recursively for the variable
    ``cp`` until a recursive call reports ``-1``.

    :param node: AST node whose name (via ``get_node_name``) is traced
    :param back_node: nodes to search backwards through
    :param function_params: forwarded unchanged to ``parameters_back``
    :param count: current recursion depth; the include search is skipped once
        it exceeds 20
    :param file_path: path of the file the nodes belong to; include targets are
        resolved relative to this file's directory
    :return: the ``(is_co, cp, expr_lineno)`` triple of the last back-trace performed
    """
    count += 1
    params = get_node_name(node)
    is_co, cp, expr_lineno = parameters_back(params, back_node, function_params)

    if count > 20:
        logger.warning("[Deep AST] depth too big to auto exit...")
        return is_co, cp, expr_lineno

    if is_co == 3:
        logger.debug("[Deep AST] try to find include, start deep AST")

        # Directory components of the *current* file.  They are computed once so
        # that every include is resolved against the same directory; the
        # previous code overwrote file_path inside the loop, which made later
        # includes resolve relative to the previously included file.
        base_parts = re.split(r"[\/\\]", file_path)[:-1]

        for candidate in back_node[::-1]:
            if not isinstance(candidate, php.Include):
                continue

            include_path = "/".join(base_parts + [candidate.expr])

            try:
                logger.debug("[Deep AST] open new file {file_path}".format(
                    file_path=include_path))
                # The context manager closes the handle even if read() fails.
                with open(include_path, 'r') as f:
                    file_content = f.read()
            except (IOError, OSError, ValueError):
                # Missing/unreadable file, undecodable content or an invalid
                # path: skip this include and try the next one.
                logger.warning(
                    "[Deep AST] error to open new file...continue")
                continue

            parser = make_parser()
            all_nodes = parser.parse(file_content, debug=False, lexer=lexer.clone(), tracking=with_line)

            # Continue the search for the still-unresolved variable inside the
            # included file.
            is_co, cp, expr_lineno = deep_parameters_back(
                php.Variable(cp), all_nodes, function_params, count, include_path)

            if is_co == -1:
                break

    return is_co, cp, expr_lineno
def scan(code_content, sensitive_func):
    """
    Entry point of the detection: parse the code, dump the AST and analyse each function.

    :param code_content: content of the file to inspect
    :param sensitive_func: sensitive functions to look for, passed in as a list
    :return: None
    """
    ast_nodes = make_parser().parse(code_content, debug=False, lexer=lexer.clone(), tracking=with_line)
    pprint.pprint(ast_nodes)

    # For every sensitive function, check whether it occurs in the code and,
    # if it does, recursively decide whether its arguments are controllable.
    for target in sensitive_func:
        analysis(ast_nodes, target)
def analyze(path):
    """
    Parse the file at ``path`` and print every top-level item of the result.

    The file is read as latin1.  Items that expose a ``generic`` method are
    printed in the form returned by ``generic(with_lineno=True)``; all other
    items are printed as they are.  ``SyntaxError`` and ``ValueError`` raised
    by the parser are reported on stdout instead of being propagated.

    :param path: path of the file to parse
    :return: None
    """
    with codecs.open(path, 'r', 'latin1') as f:
        src = f.read()

    # print is always called with one pre-formatted argument so the output is
    # the same whether print is a statement (Python 2) or a function (Python 3).
    try:
        items = parser.parse(src, tracking=True, lexer=lexer.clone())
    except SyntaxError as e:
        print("Syntax Error %s %s" % (e.filename, e))
    except ValueError as e:
        print("Syntax Error %s" % (e,))
    else:
        for ast in items:
            if hasattr(ast, 'generic'):
                item = ast.generic(with_lineno=True)
            else:
                item = ast
            print(item)
def pre_ast(self):
    """
    Pre-parse every PHP file listed in ``self.file_list``.

    For each file of a ``".php"`` group the method stores the language, the
    file content and (when parsing succeeds) the AST nodes under
    ``self.pre_result[<full path>]``, and records every top-level
    ``define(name, value)`` call in ``self.define_dict``.  Groups with any
    other extension are ignored.

    :return: None
    """
    for fileext in self.file_list:
        if ".php" != fileext[0]:
            continue

        # Handling of PHP files.
        for filepath in fileext[1]['list']:
            filepath = os.path.join(self.target_directory, filepath)
            self.pre_result[filepath] = {}
            self.pre_result[filepath]['language'] = 'php'
            # Always present, so consumers need not special-case files that
            # failed to parse.
            self.pre_result[filepath]['ast_nodes'] = []

            # The context manager makes sure the handle is closed.
            with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                code_content = fi.read()

            self.pre_result[filepath]['content'] = code_content

            # Reset per file: without this a SyntaxError left all_nodes unbound
            # (NameError below) or still pointing at the previous file's nodes.
            all_nodes = []

            try:
                parser = make_parser()
                all_nodes = parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=True)

                # Merge the nodes into the result dictionary.
                self.pre_result[filepath]['ast_nodes'] = all_nodes
            except SyntaxError:
                logger.warning('[AST] [ERROR] parser {}: {}'.format(
                    filepath, traceback.format_exc()))

            # Collect all constants declared through define().
            for node in all_nodes:
                if isinstance(node, php.FunctionCall) and node.name == "define":
                    define_params = node.params
                    logger.debug(
                        "[AST][Pretreatment] new define {}={}".format(
                            define_params[0].node, define_params[1].node))

                    self.define_dict[define_params[0].node] = define_params[1].node
def parse_code(self, filename):
    """
    Return the parse tree for ``filename``, using a pickle cache in the doctree directory.

    The cache file lives in ``env.doctreedir`` under the name produced by
    ``basename(filename, 'parse')``.  When ``is_same_mtime`` reports that the
    source and the cache carry the same modification time the cached tree is
    loaded; otherwise the file is parsed, the tree is pickled and the cache's
    timestamps are set to the source's mtime.

    :param filename: path of the source file to parse (read as UTF-8)
    :return: the tree returned by ``parser.parse`` (or its cached copy)
    """
    basedir = self.state.document.settings.env.doctreedir
    cachename = os.path.join(basedir, basename(filename, 'parse'))

    if is_same_mtime(filename, cachename):
        # NOTE: unpickling executes arbitrary code; this is only safe while the
        # doctree directory is written exclusively by this tool.
        with open(cachename, 'rb') as f:
            return pickle.load(f)

    with codecs.open(filename, 'r', 'utf-8') as f:
        tree = parser.parse(f.read(), lexer=lexer.clone())

    with open(cachename, 'wb') as f:
        pickle.dump(tree, f)

    # Stamp the cache with the source's mtime so the next call can detect
    # whether the source has changed since.
    mtime = os.stat(filename).st_mtime
    os.utime(cachename, (mtime, mtime))

    return tree
def scan_parser(code_content, sensitive_func, vul_lineno):
    """
    Entry point of the detection: parse the code and analyse each sensitive function.

    :param code_content: content of the file to inspect
    :param sensitive_func: sensitive functions to look for, passed in as a list
    :param vul_lineno: line number of the vulnerable function call (converted with ``int``)
    :return: the module-level ``scan_results`` list, which is reset at the start of every call
    """
    global scan_results

    try:
        scan_results = []
        php_parser = make_parser()
        ast_nodes = php_parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=with_line)

        # For every sensitive function, check whether it occurs in the code and,
        # if it does, recursively decide whether its arguments are controllable.
        # The whole AST is walked once per function, each time with a fresh
        # (empty) back-node list.
        for sensitive in sensitive_func:
            analysis(ast_nodes, sensitive, [], int(vul_lineno), function_params=None)
    except SyntaxError as err:
        logger.warning('[AST] [ERROR]:{e}'.format(e=err))

    return scan_results
def anlysis_params(param, code_content, file_path, lineno):
    """
    Prepare the data for a deep back-trace of ``param`` (per the original note,
    used as a relay when called from "cast").

    The code is parsed, only the top-level nodes located before ``lineno`` are
    kept, and ``deep_parameters_back`` is started on them with a depth of 0
    and no function parameters.

    :param param: name wrapped in ``php.Variable`` before tracing
    :param code_content: source text to parse
    :param file_path: path forwarded to ``deep_parameters_back``
    :param lineno: line limit; nodes at or after it are discarded
    :return: the ``(is_co, cp, expr_lineno)`` triple from ``deep_parameters_back``
    """
    count = 0
    function_params = None
    param = php.Variable(param)

    php_parser = make_parser()
    ast_nodes = php_parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=with_line)

    # Keep only what precedes the line of interest.
    vul_nodes = [candidate for candidate in ast_nodes if candidate.lineno < int(lineno)]

    is_co, cp, expr_lineno = deep_parameters_back(
        param, vul_nodes, function_params, count, file_path, lineno)

    return is_co, cp, expr_lineno
def create_graph(path, file):
    """
    Build the graph for ``file`` by preprocessing, parsing and traversing it.

    :param path: argument handed to ``Preprocessor``
    :param file: file passed to ``preprocess_file``; also used as the listener's name
    :return: the result of ``get_graph()`` on the listener after traversal
    """
    # Preprocess first so that includes are taken into account.  The line map
    # relates line numbers to the original file and original line number.
    line_map, file_str = Preprocessor(path).preprocess_file(file)

    # Every new graph starts from a clean definition register.
    DefinitionRegister.reset()

    php_parser = make_parser()
    php_lexer = lexer.clone()
    ast_nodes = php_parser.parse(file_str, lexer=php_lexer, tracking=True, debug=False)

    graph_listener = MyPHPListener(line_map=line_map, name=file)
    php_traverser.traverse(ast_nodes, graph_listener)

    return graph_listener.get_graph()
async def pre_ast(self):
    """
    Drain ``self.target_queue`` and pre-process every queued file group.

    Each queue item is a ``(extension, {'count': ..., 'list': [...]})`` pair.
    Depending on the extension and on the languages enabled in ``self.lan``:

    * PHP files are parsed; their AST nodes go to ``self.pre_result`` and
      top-level ``define()`` constants to ``self.define_dict``.
    * Chrome extensions are unzipped, their ``manifest.json`` is read and the
      referenced js / html files are queued as new ``.js`` / ``.html`` groups.
    * HTML files have their external script paths collected and their inline
      scripts appended to a ``tmp.js`` next to the page; the result is queued
      as a new ``.js`` group.
    * JavaScript files are beautified into ``<file>.pretty`` (if that file
      does not exist yet) and parsed with esprima.

    Processing stops immediately when ``self.lan`` is empty.

    NOTE(review): ``self.target_queue.get()`` is called without ``await``, so
    the queue is presumably a synchronous one - confirm.

    :return: True
    """
    while not self.target_queue.empty():

        fileext = self.target_queue.get()

        if not self.lan:
            break

        if fileext[0] in ext_dict['php'] and 'php' in self.lan:
            # Handling of PHP files.
            for filepath in fileext[1]['list']:
                all_nodes = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'php'
                self.pre_result[filepath]['ast_nodes'] = []

                with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                    code_content = fi.read()

                try:
                    parser = make_parser()
                    all_nodes = parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=True)

                    # Merge the nodes into the result dictionary.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes
                except (SyntaxError, AssertionError):
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except Exception:
                    # Best effort: any other parser failure is logged and the
                    # file is left with an empty node list.  KeyboardInterrupt
                    # and task cancellation are deliberately not swallowed.
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))

                # Collect all constants declared through define().
                for node in all_nodes:
                    if isinstance(node, php.FunctionCall) and node.name == "define":
                        define_params = node.params
                        logger.debug(
                            "[AST][Pretreatment] new define {}={}".format(
                                define_params[0].node, define_params[1].node))

                        self.define_dict[define_params[0].node] = define_params[1].node

        elif fileext[0] in ext_dict['chromeext'] and 'chromeext' in self.lan:
            # Pre-processing of Chrome extensions: extract the js and html
            # files they reference.
            for filepath in fileext[1]['list']:
                child_files = []
                child_files_html = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'chromeext'

                # First unpack the crx archive.
                try:
                    target_files_path = un_zip(filepath)
                    self.pre_result[filepath]['target_files_path'] = target_files_path
                except zipfile.BadZipFile:
                    logger.warning(
                        "[Pretreatment][Chrome Ext] file {} not zip".format(filepath))
                    continue
                except OSError:
                    logger.warning(
                        "[Pretreatment][Chrome Ext] file {} unzip error".format(filepath))
                    continue

                # Analyse manifest.json.
                manifest_path = os.path.join(target_files_path, "manifest.json")

                # The target may be a single file, which needs special handling.
                if not (self.target_directory.endswith("/")
                        or self.target_directory.endswith("\\")
                        ) and not os.path.isdir(self.target_directory):
                    path_list = re.split(r'[\\|/]', self.target_directory)
                    relative_path = os.path.join(path_list[-1] + "_files")
                else:
                    relative_path = target_files_path.split(self.target_directory)[-1]

                    if relative_path.startswith(('\\', "/")):
                        relative_path = relative_path[1:]

                if os.path.isfile(manifest_path):
                    with codecs.open(manifest_path, "r", encoding='utf-8', errors='ignore') as fi:
                        manifest_content = fi.read()

                    try:
                        # The content is already text; json.loads() no longer
                        # accepts an ``encoding`` argument on Python >= 3.9.
                        manifest = json.loads(manifest_content)
                    except json.decoder.JSONDecodeError:
                        logger.warning(
                            "[Pretreatment][Chrome Ext] File {} parse error..."
                            .format(target_files_path))
                        continue

                    self.pre_result[filepath]["manifest"] = manifest

                    # Shortcut: when only the extension itself is scanned there
                    # is no need to dig into its js and html files.
                    if len(self.lan) and self.lan == 'chromeext':
                        logger.debug(
                            "[Pretreatment][Chrome Ext] pass js & html scan..."
                        )
                        continue

                    # content scripts
                    if "content_scripts" in manifest:
                        for script in manifest["content_scripts"]:
                            if "js" in script:
                                child_files.extend([
                                    os.path.join(relative_path, js)
                                    for js in script['js']
                                ])

                    if "background" in manifest:
                        # background js
                        if "scripts" in manifest["background"]:
                            child_files.extend([
                                os.path.join(relative_path, js)
                                for js in manifest["background"]["scripts"]
                            ])

                        # background html
                        if "page" in manifest["background"]:
                            child_files_html.append(
                                os.path.join(relative_path, manifest["background"]["page"]))

                    # popup.html
                    if "browser_action" in manifest:
                        if "default_popup" in manifest["browser_action"]:
                            child_files_html.append(
                                os.path.join(
                                    relative_path,
                                    manifest["browser_action"]["default_popup"]))

                    # web_accessible_resources
                    if "web_accessible_resources" in manifest:
                        for resource in manifest["web_accessible_resources"]:
                            if ".js" in resource:
                                child_files.append(os.path.join(relative_path, resource))

                            if ".html" in resource:
                                child_files_html.append(os.path.join(relative_path, resource))

                    # chrome_url_overrides
                    if "chrome_url_overrides" in manifest:
                        for key in manifest["chrome_url_overrides"]:
                            child_files_html.append(
                                os.path.join(
                                    relative_path,
                                    manifest["chrome_url_overrides"][key]))

                    self.pre_result[filepath]["child_files"] = child_files

                    if len(child_files):
                        # Queue the collected scripts as a new .js group.
                        self.target_queue.put(('.js', {
                            'count': len(child_files),
                            'list': child_files
                        }))

                        # Also record the group in self.file_list (the original
                        # note says this mutates the caller's list through a
                        # shallow copy - TODO confirm).
                        self.file_list.append(('.js', {
                            'count': len(child_files),
                            'list': child_files
                        }))

                    if len(child_files_html):
                        self.target_queue.put(('.html', {
                            'count': len(child_files_html),
                            'list': child_files_html
                        }))

                else:
                    logger.warning(
                        "[Pretreatment][Chrome Ext] File {} parse error..."
                        .format(target_files_path))
                    continue

        elif fileext[0] in ext_dict['html'] and 'javascript' in self.lan:
            # For html files only the javascript is of interest.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)
                script_list = []

                try:
                    with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                        code_content = fi.read()
                except OSError:
                    # Covers FileNotFoundError as well.
                    continue

                # tmp.js collects all inline javascript code.
                tmp_path = os.path.join(os.path.dirname(filepath), "tmp.js")
                fi2 = codecs.open(tmp_path, "a", encoding='utf-8', errors='ignore')

                try:
                    # ``with`` closes tmp.js even when parsing the page fails.
                    with fi2:
                        soup = BeautifulSoup(code_content, "html.parser")
                        script_tag_list = soup.find_all('script')

                        for script_tag in script_tag_list:
                            script_attrs = script_tag.attrs

                            if 'src' in script_attrs:
                                parents_path = os.path.normpath("\\".join(
                                    re.split(r'[\\|/]', filepath)[:-1]))

                                script_path = os.path.join(parents_path, script_attrs['src'])
                                script_list.append(script_path)
                            else:
                                # No src attribute means inline javascript.
                                script_content = script_tag.string
                                fi2.write(" \n{}\n ".format(script_content))

                    if tmp_path not in script_list:
                        script_list.append(tmp_path)

                    # Queue the collected scripts as a new .js group.
                    self.target_queue.put(('.js', {
                        'count': len(script_list),
                        'list': script_list
                    }))

                    # Also record the group in self.file_list (see the note in
                    # the Chrome extension branch).
                    self.file_list.append(('.js', {
                        'count': len(script_list),
                        'list': script_list
                    }))

                except Exception:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))
                    continue

        elif fileext[0] in ext_dict['javascript'] and 'javascript' in self.lan:
            # Pre-processing of javascript: the code needs semantic analysis.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)

                if not filepath.endswith(".js"):
                    continue

                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'javascript'
                self.pre_result[filepath]['ast_nodes'] = []

                try:
                    with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                        code_content = fi.read()
                except OSError:
                    # Covers FileNotFoundError as well.
                    continue

                # Beautify the code and write it to a new file.
                new_filepath = filepath + ".pretty"

                try:
                    if not os.path.isfile(new_filepath):
                        # Beautify before creating the file so that a failure
                        # does not leave an empty .pretty file behind (which
                        # would suppress beautification on later runs).
                        code_content = jsbeautifier.beautify(code_content)
                        with codecs.open(new_filepath, "w", encoding='utf-8', errors='ignore') as fi2:
                            fi2.write(code_content)

                    all_nodes = esprima.parse(code_content, {"loc": True})

                    # Merge the nodes into the result dictionary.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except (SyntaxError, AssertionError):
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))

                except esprima.error_handler.Error:
                    logger.warning(
                        '[AST] [ERROR] Invalid regular expression in {}...'
                        .format(filepath))

                except KeyboardInterrupt:
                    # Logger.log() requires a level as first argument; the
                    # previous single-argument call would raise TypeError on a
                    # standard logging.Logger (assumed here - TODO confirm).
                    logger.warning('[AST stop...')
                    exit()

                except Exception:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))
                    continue

        # Manual garbage collection after each file group.
        gc.collect()

    return True
def scan(code_content, vul_function, vul_function_line):
    """
    Parse ``code_content``, export the AST and run ``traversal`` over it.

    :param code_content: source text to parse
    :param vul_function: forwarded to ``traversal``
    :param vul_function_line: forwarded to ``traversal``
    :return: whatever ``traversal`` returns
    """
    php_parser = make_parser()
    raw_nodes = php_parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=with_line)
    exported_nodes = export(raw_nodes)
    return traversal(exported_nodes, vul_function, vul_function_line)
async def pre_ast(self):
    """
    Drain ``self.target_queue`` and pre-process every queued file group.

    Each queue item is a ``(extension, {'count': ..., 'list': [...]})`` pair:

    * PHP files are parsed; content and AST nodes go to ``self.pre_result``
      and top-level ``define()`` constants to ``self.define_dict``.
    * Chrome extensions are unzipped, their ``manifest.json`` is read and the
      content scripts it lists are queued as a new ``.js`` group.
    * JavaScript files are beautified into ``<file>.pretty`` (if that file
      does not exist yet) and parsed with esprima.

    NOTE(review): ``self.target_queue.get()`` is called without ``await``, so
    the queue is presumably a synchronous one - confirm.

    :return: True
    """
    while not self.target_queue.empty():

        fileext = self.target_queue.get()

        if fileext[0] in ext_dict['php']:
            # Handling of PHP files.
            for filepath in fileext[1]['list']:
                all_nodes = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'php'
                self.pre_result[filepath]['ast_nodes'] = []

                with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                    code_content = fi.read()

                self.pre_result[filepath]['content'] = code_content

                try:
                    parser = make_parser()
                    all_nodes = parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=True)

                    # Merge the nodes into the result dictionary.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes
                except (SyntaxError, AssertionError):
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except Exception:
                    # Best effort: any other parser failure is logged and the
                    # file is left with an empty node list.  KeyboardInterrupt
                    # and task cancellation are deliberately not swallowed.
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))

                # Collect all constants declared through define().
                for node in all_nodes:
                    if isinstance(node, php.FunctionCall) and node.name == "define":
                        define_params = node.params
                        logger.debug(
                            "[AST][Pretreatment] new define {}={}".format(
                                define_params[0].node, define_params[1].node))

                        self.define_dict[define_params[0].node] = define_params[1].node

        elif fileext[0] in ext_dict['chromeext']:
            # Pre-processing of Chrome extensions: extract the js files they
            # reference.
            for filepath in fileext[1]['list']:
                child_files = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'chromeext'

                # First unpack the crx archive.
                try:
                    target_files_path = un_zip(filepath)
                    self.pre_result[filepath]['target_files_path'] = target_files_path
                except zipfile.BadZipFile:
                    logger.warning(
                        "[Pretreatment][Chrome Ext] file {} not zip".format(filepath))
                    continue

                # Analyse manifest.json.
                manifest_path = os.path.join(target_files_path, "manifest.json")

                # The target may be a single file, which needs special handling.
                if not (self.target_directory.endswith("/")
                        or self.target_directory.endswith("\\")
                        ) and not os.path.isdir(self.target_directory):
                    relative_path = os.path.join(
                        re.split(r'[\\|/]', self.target_directory)[-1] + "_files")
                else:
                    relative_path = target_files_path.split(self.target_directory)[-1]

                    if relative_path.startswith(('\\', "/")):
                        relative_path = relative_path[1:]

                if os.path.isfile(manifest_path):
                    with codecs.open(manifest_path, "r", encoding='utf-8', errors='ignore') as fi:
                        manifest_content = fi.read()

                    try:
                        # The content is already text; json.loads() no longer
                        # accepts an ``encoding`` argument on Python >= 3.9.
                        manifest = json.loads(manifest_content)
                    except json.decoder.JSONDecodeError:
                        logger.warning(
                            "[Pretreatment][Chrome Ext] File {} parse error..."
                            .format(target_files_path))
                        continue

                    self.pre_result[filepath]["manifest"] = manifest

                    if "content_scripts" in manifest:
                        for script in manifest["content_scripts"]:
                            if 'js' in script:
                                child_files.extend([
                                    os.path.join(relative_path, js)
                                    for js in script['js']
                                ])

                    self.pre_result[filepath]["child_files"] = child_files

                    # Queue the collected content scripts as a new .js group.
                    self.target_queue.put(('.js', {
                        'count': len(child_files),
                        'list': child_files
                    }))

                    # Also record the group in self.file_list (the original
                    # note says this mutates the caller's list through a
                    # shallow copy - TODO confirm).
                    self.file_list.append(('.js', {
                        'count': len(child_files),
                        'list': child_files
                    }))

                else:
                    logger.warning(
                        "[Pretreatment][Chrome Ext] File {} parse error..."
                        .format(target_files_path))
                    continue

        elif fileext[0] in ext_dict['javascript']:
            # Pre-processing of javascript: the code needs semantic analysis.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'javascript'
                self.pre_result[filepath]['ast_nodes'] = []

                with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                    code_content = fi.read()

                # Beautify the code and write it to a new file.
                new_filepath = filepath + ".pretty"

                if not os.path.isfile(new_filepath):
                    # Beautify before creating the file so that a failure does
                    # not leave an empty .pretty file behind (which would
                    # suppress beautification on later runs).
                    code_content = jsbeautifier.beautify(code_content)
                    with codecs.open(new_filepath, "w", encoding='utf-8', errors='ignore') as fi2:
                        fi2.write(code_content)

                self.pre_result[filepath]['content'] = code_content

                try:
                    all_nodes = esprima.parse(code_content, {"loc": True})

                    # Merge the nodes into the result dictionary.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except (SyntaxError, AssertionError):
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))

                except esprima.error_handler.Error:
                    logger.warning(
                        '[AST] [ERROR] Invalid regular expression in {}...'
                        .format(filepath))

                except KeyboardInterrupt:
                    # Logger.log() requires a level as first argument; the
                    # previous single-argument call would raise TypeError on a
                    # standard logging.Logger (assumed here - TODO confirm).
                    logger.warning('[AST stop...')
                    exit()

                except Exception:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))
                    continue

        # Manual garbage collection after each file group.
        gc.collect()

    return True
def pre_ast(self, lan=None):
    """
    Pre-process every file group in ``self.file_list`` according to its extension.

    * PHP files are parsed; content and AST nodes go to ``self.pre_result``
      and top-level ``define()`` constants to ``self.define_dict``.
    * Chrome extensions are unzipped, their ``manifest.json`` is read and the
      content scripts it lists are stored as ``child_files``.
    * JavaScript files are parsed with esprima.

    :param lan: optional collection of target languages; when given and none
        of them is in ``could_ast_pase_lans`` nothing is pre-processed
    :return: True when the pre-treatment is skipped because of ``lan``;
        otherwise the function ends without an explicit return value
    """
    if lan is not None:
        # Check whether any requested language is in the AST-parsable list.
        if not list(set(lan).intersection(set(could_ast_pase_lans))):
            logger.info("[AST][Pretreatment] Current scan target language does not require ast pretreatment...")
            return True

    for fileext in self.file_list:

        if fileext[0] in ext_dict['php']:
            # Handling of PHP files.
            for filepath in fileext[1]['list']:
                all_nodes = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'php'
                self.pre_result[filepath]['ast_nodes'] = []

                # The context manager makes sure the handle is closed.
                with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                    code_content = fi.read()

                self.pre_result[filepath]['content'] = code_content

                try:
                    parser = make_parser()
                    all_nodes = parser.parse(code_content, debug=False, lexer=lexer.clone(), tracking=True)

                    # Merge the nodes into the result dictionary.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes
                except (SyntaxError, AssertionError):
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(filepath, traceback.format_exc()))
                except Exception:
                    # Best effort: any other parser failure is logged and the
                    # file is left with an empty node list.
                    logger.warning('[AST] something error, {}'.format(traceback.format_exc()))

                # Collect all constants declared through define().
                for node in all_nodes:
                    if isinstance(node, php.FunctionCall) and node.name == "define":
                        define_params = node.params
                        logger.debug("[AST][Pretreatment] new define {}={}".format(define_params[0].node, define_params[1].node))

                        self.define_dict[define_params[0].node] = define_params[1].node

        elif fileext[0] in ext_dict['chromeext']:
            # Pre-processing of Chrome extensions: extract the js files they
            # reference.
            for filepath in fileext[1]['list']:
                # One list per extension: a single shared list made every
                # extension accumulate (and alias) the scripts of the others.
                child_files = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'chromeext'

                # First unpack the crx archive.
                target_files_path = un_zip(filepath)
                self.pre_result[filepath]['target_files_path'] = target_files_path

                # Analyse manifest.json.
                manifest_path = os.path.join(target_files_path, "manifest.json")
                relative_path = target_files_path.split(self.target_directory)[-1]

                if relative_path.startswith(('\\', "/")):
                    relative_path = relative_path[1:]

                if os.path.isfile(manifest_path):
                    with codecs.open(manifest_path, "r", encoding='utf-8', errors='ignore') as fi:
                        manifest_content = fi.read()

                    manifest = json.loads(manifest_content)
                    self.pre_result[filepath]["manifest"] = manifest

                    if "content_scripts" in manifest:
                        for script in manifest["content_scripts"]:
                            # Content scripts may declare only css; skip those
                            # instead of failing with KeyError.
                            if 'js' in script:
                                child_files.extend([os.path.join(relative_path, js) for js in script['js']])

                    self.pre_result[filepath]["child_files"] = child_files
                else:
                    logger.warning("[Pretreatment][Chrome Ext] File {} parse error...".format(target_files_path))
                    continue

        elif fileext[0] in ext_dict['javascript']:
            # Pre-processing of javascript: the code needs semantic analysis.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'javascript'
                self.pre_result[filepath]['ast_nodes'] = []

                with codecs.open(filepath, "r", encoding='utf-8', errors='ignore') as fi:
                    code_content = fi.read()

                self.pre_result[filepath]['content'] = code_content

                try:
                    all_nodes = esprima.parse(code_content, {"loc": True})

                    # Merge the nodes into the result dictionary.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes
                except (SyntaxError, AssertionError):
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(filepath, traceback.format_exc()))
                except Exception:
                    logger.warning('[AST] something error, {}'.format(traceback.format_exc()))