def parseConfiguration(self):
    dsn_re = re.compile(r"^mysql:host=([^;]+);dbname=(.+)$")

    def array_get(array, key):
        if isinstance(array, Array):
            for element in array.nodes:
                if element.key == key:
                    return element.value
        return None

    parser = make_parser()
    with open(self.fsconfig) as f:
        ast = parser.parse(f.read(), lexer=phplex.lexer.clone())

    if ast and isinstance(ast[0], Return):
        r = ast[0].node

        self.title = array_get(r, "name")
        if not self.title:
            raise HHConfigError(self, "no title given")
        logging.debug("HH.parseConfiguration: title={}".format(self.title))

        components = array_get(r, "components")
        if components:
            db = array_get(components, "db")
            if db:
                dsn = array_get(db, "dsn")
                if dsn:
                    logging.debug("HH.parseConfiguration: dsn={}".format(dsn))
                    m = dsn_re.match(dsn)
                    if m:
                        self.dbhost = m.group(1)
                        self.dbname = m.group(2)
                self.dbuser = array_get(db, "username")
                self.dbpass = array_get(db, "password")

    if not self.dbhost or not self.dbname:
        raise HHConfigError(self, "no database given")
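For context, a minimal sketch of the AST shape that parseConfiguration() walks, assuming phply is installed; the sample PHP string and printed output are illustrative, not taken from the original project:

# Hedged sketch: a PHP config that returns an array surfaces as a Return
# node whose .node is an Array of ArrayElement(key, value) pairs, which is
# exactly what array_get() above iterates over.
from phply import phplex
from phply.phpparse import make_parser
from phply.phpast import Return, Array

code = '<?php return array("name" => "demo"); ?>'
nodes = make_parser().parse(code, lexer=phplex.lexer.clone())
ret = nodes[0]
assert isinstance(ret, Return) and isinstance(ret.node, Array)
for element in ret.node.nodes:
    print(element.key, "=>", element.value)  # name => demo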
def scan_parser(code_content, sensitive_func, vul_lineno, file_path):
    """
    Entry point of the scan.
    :param code_content: content of the file to scan
    :param sensitive_func: sensitive functions to check, passed as a list
    :param vul_lineno: line number of the vulnerable function
    :param file_path: file name
    :return:
    """
    try:
        global scan_results
        scan_results = []

        parser = make_parser()
        all_nodes = parser.parse(code_content, debug=False,
                                 lexer=lexer.clone(), tracking=with_line)

        # Loop over the sensitive functions; if one occurs in the code,
        # recursively check whether its arguments are controllable.
        # The file content is traversed once per function.
        for func in sensitive_func:
            back_node = []
            analysis(all_nodes, func, back_node, int(vul_lineno), file_path,
                     function_params=None)
    except SyntaxError as e:
        logger.warning('[AST] [ERROR]:{e}'.format(e=e))

    return scan_results
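A standalone sketch of the failure mode the try/except above guards against, assuming phply: the parser raises SyntaxError on code it cannot parse (the sample PHP fragment is made up):

from phply import phplex
from phply.phpparse import make_parser

try:
    make_parser().parse('<?php if (', lexer=phplex.lexer.clone())
except SyntaxError as e:
    # The exception carries .lineno and .text, which later snippets
    # in this listing also rely on.
    print('parse failed:', e)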
def get_listener(code, fake_filename='filename.php'):
    parser = make_parser()
    line_map = [(None, None), ('filename.php', 1)]
    nodes = parser.parse(code, lexer=lexer.clone(), tracking=True, debug=False)
    listener = MyPHPListener(line_map=line_map, name=fake_filename)
    php_traverser.traverse(nodes, listener)
    return listener
def deep_parameters_back(node, back_node, function_params, count, file_path):
    """
    Deep recursive traversal.
    :param node:
    :param back_node:
    :param function_params:
    :param count:
    :param file_path:
    :return:
    """
    count += 1
    params = get_node_name(node)
    is_co, cp, expr_lineno = parameters_back(params, back_node, function_params)

    if count > 20:
        logger.warning("[Deep AST] depth too big to auto exit...")
        return is_co, cp, expr_lineno

    if is_co == 3:
        logger.debug("[Deep AST] try to find include, start deep AST")

        for node in back_node[::-1]:
            if isinstance(node, php.Include):
                filename = node.expr
                file_path = re.split(r"[\/\\]", file_path)
                file_path.pop()
                file_path.append(filename)
                file_path = "/".join(file_path)

                try:
                    logger.debug("[Deep AST] open new file {file_path}".format(
                        file_path=file_path))
                    f = open(file_path, 'r')
                    file_content = f.read()
                except:
                    logger.warning("[Deep AST] error to open new file...continue")
                    continue

                parser = make_parser()
                all_nodes = parser.parse(file_content, debug=False,
                                         lexer=lexer.clone(), tracking=with_line)

                node = php.Variable(cp)
                is_co, cp, expr_lineno = deep_parameters_back(
                    node, all_nodes, function_params, count, file_path)

                if is_co == -1:
                    break

    return is_co, cp, expr_lineno
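A side sketch of the node shape the include-following loop relies on, assuming phply: an include statement becomes a php.Include node whose .expr holds the filename when it is a plain string (the sample path is illustrative):

from phply import phplex
from phply.phpparse import make_parser
from phply import phpast as php

nodes = make_parser().parse('<?php include "lib/helper.php";',
                            lexer=phplex.lexer.clone())
inc = nodes[0]
assert isinstance(inc, php.Include)
print(inc.expr)  # lib/helper.php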
def parser(filename):
    if not os.path.exists(filename):
        return {}
    with open(filename) as f:
        code = f.read()
    # Reload the lexer module so its state is reset between files
    # (Python 2 built-in reload).
    reload(phply.phplex)
    logger.debug('Parse file: %s' % filename)
    return export(make_parser().parse(code, lexer=phply.phplex.lexer,
                                      tracking=True))
def scan(code_content, sensitive_func):
    """
    Entry point of the scan.
    :param code_content: content of the file to scan
    :param sensitive_func: sensitive functions to check, passed as a list
    :return:
    """
    parser = make_parser()
    all_nodes = parser.parse(code_content, debug=False,
                             lexer=lexer.clone(), tracking=with_line)
    pprint.pprint(all_nodes)

    # Loop over the sensitive functions; if one occurs in the code,
    # recursively check whether its arguments are controllable.
    for func in sensitive_func:
        analysis(all_nodes, func)
def pre_ast(self):
    for fileext in self.file_list:
        if ".php" == fileext[0]:
            # Handling logic for PHP files.
            for filepath in fileext[1]['list']:
                all_nodes = []  # keep the constant scan below safe if parsing fails

                filepath = os.path.join(self.target_directory, filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'php'

                fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                code_content = fi.read()
                self.pre_result[filepath]['content'] = code_content

                try:
                    parser = make_parser()
                    all_nodes = parser.parse(code_content, debug=False,
                                             lexer=lexer.clone(), tracking=True)

                    # Merge into the result dict.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except SyntaxError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))

                # Collect all constants.
                for node in all_nodes:
                    if isinstance(node, php.FunctionCall) and node.name == "define":
                        define_params = node.params
                        logger.debug("[AST][Pretreatment] new define {}={}".format(
                            define_params[0].node, define_params[1].node))

                        self.define_dict[define_params[0].node] = define_params[1].node
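The define() collection at the end of pre_ast() can be exercised on its own. A minimal sketch assuming phply, with a made-up constant:

from phply import phplex
from phply.phpparse import make_parser
from phply import phpast as php

nodes = make_parser().parse('<?php define("DB_HOST", "localhost");',
                            lexer=phplex.lexer.clone())
define_dict = {}
for node in nodes:
    if isinstance(node, php.FunctionCall) and node.name == "define":
        # Each call parameter wraps its value in a .node attribute.
        define_dict[node.params[0].node] = node.params[1].node
print(define_dict)  # {'DB_HOST': 'localhost'}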
def count_function_calls(self):
    # with open(self.input_file, "r") as fin:
    #     _file = fin.read()
    # with open(self.input_file, "a") as fin:
    #     if not _file.rstrip(" ").rstrip("\n").endswith("?>"):
    #         fin.write("?>")
    with open(self.input_file, "r") as fin:
        _file = fin.read()
    parser = make_parser()
    lexer = phplex.lexer.clone()
    lexer.filename = self.input_file.replace("\\", "/")
    output = parser.parse(_file, lexer=lexer)
    try:
        resolve_magic_constants(output)
        function_calls = str(output).count("FunctionCall")
    except RuntimeError:
        function_calls = None
    return function_calls
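resolve_magic_constants() is used the same way across these snippets: set a filename on the cloned lexer before parsing, then let it substitute magic constants after the parse. A minimal sketch assuming phply ('demo.php' is a placeholder name):

from phply import phplex
from phply.phpparse import make_parser, resolve_magic_constants

lex = phplex.lexer.clone()
lex.filename = 'demo.php'
nodes = make_parser().parse('<?php echo __FILE__;', lexer=lex)
resolve_magic_constants(nodes)
print(nodes)  # __FILE__ should now be resolved against 'demo.php'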
def parseFile(filepath):
    allHTML = ""
    try:
        parser = make_parser()
        parsed = parser.parse(open(filepath).read(), debug=False,
                              lexer=lexer, tracking=True)
        print(parsed)
        for node in parsed:
            if isinstance(node, InlineHTML):
                allHTML += node.data
        parser = None
        parsed = None
    except Exception as ex:
        errored.append(filepath)
        print(ex)
    return allHTML
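The InlineHTML extraction above can be seen in isolation; a sketch assuming phply, where everything outside <?php ... ?> parses as InlineHTML nodes carrying a .data payload:

from phply import phplex
from phply.phpparse import make_parser
from phply.phpast import InlineHTML

nodes = make_parser().parse('<b>hi</b><?php echo 1; ?><i>bye</i>',
                            lexer=phplex.lexer.clone())
html = ''.join(n.data for n in nodes if isinstance(n, InlineHTML))
print(html)  # <b>hi</b><i>bye</i>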
def scan_parser(code_content, sensitive_func, vul_lineno):
    """
    Entry point of the scan.
    :param code_content: content of the file to scan
    :param sensitive_func: sensitive functions to check, passed as a list
    :param vul_lineno: line number of the vulnerable function
    :return:
    """
    try:
        global scan_results
        scan_results = []

        parser = make_parser()
        all_nodes = parser.parse(code_content, debug=False,
                                 lexer=lexer.clone(), tracking=with_line)

        # Loop over the sensitive functions; if one occurs in the code,
        # recursively check whether its arguments are controllable.
        # The file content is traversed once per function.
        for func in sensitive_func:
            back_node = []
            analysis(all_nodes, func, back_node, int(vul_lineno),
                     function_params=None)
    except SyntaxError as e:
        logger.warning('[AST] [ERROR]:{e}'.format(e=e))

    return scan_results
def create_graph(path, file):
    # Preprocess the file so includes are taken into account.
    pre = Preprocessor(path)
    # line_map maps each line number to the original file and original line number.
    line_map, file_str = pre.preprocess_file(file)
    # Reset the definition register with every new graph.
    DefinitionRegister.reset()
    # Make a parser and a lexer.
    parser = make_parser()
    l = lexer.clone()
    nodes = parser.parse(file_str, lexer=l, tracking=True, debug=False)
    listener = MyPHPListener(line_map=line_map, name=file)
    php_traverser.traverse(nodes, listener)
    return listener.get_graph()
def anlysis_params(param, code_content, file_path, lineno):
    """
    Intermediate data preprocessing when invoked from CAST.
    :param lineno:
    :param param:
    :param code_content:
    :param file_path:
    :return:
    """
    count = 0
    function_params = None
    param = php.Variable(param)

    parser = make_parser()
    all_nodes = parser.parse(code_content, debug=False,
                             lexer=lexer.clone(), tracking=with_line)

    # Keep only the nodes that occur before the vulnerable line.
    vul_nodes = []
    for node in all_nodes:
        if node.lineno < int(lineno):
            vul_nodes.append(node)

    is_co, cp, expr_lineno = deep_parameters_back(param, vul_nodes,
                                                  function_params, count,
                                                  file_path, lineno)

    return is_co, cp, expr_lineno
def phpparse(data):
    parser = make_parser(debug=False)
    s = data
    lexer = phplexer
    lexer.lineno = 1

    try:
        result = parser.parse(s, lexer=lexer.clone(), debug=False)
    except SyntaxError as e:
        if e.lineno is not None:
            logger.error(e, 'near', repr(e.text))
        else:
            logger.error(e)
        raise
    except:
        logger.error("Critical error")
        raise

    import pprint
    for item in result:
        if hasattr(item, 'generic'):
            item = item.generic()
        pprint.pprint(item)

    parser.restart()
def compute_avg_argument_length(self):
    # with open(self.input_file, "r") as fin:
    #     _file = fin.read()
    # with open(self.input_file, "a") as fin:
    #     if not _file.rstrip(" ").rstrip("\n").endswith("?>"):
    #         fin.write("?>")
    with open(self.input_file, "r") as fin:
        _file = fin.read()
    parser = make_parser()
    lexer = phplex.lexer.clone()
    lexer.filename = self.input_file
    output = parser.parse(_file, lexer=lexer)
    try:
        resolve_magic_constants(output)
    except RuntimeError:
        avg_length_of_arguments_to_function = None
        return avg_length_of_arguments_to_function

    # Find the end position of every "FunctionCall" in the AST's string form.
    indexes = [m.end() for m in re.finditer('FunctionCall', str(output))]
    func_args = []
    for index in indexes:
        pars = 1
        count = 0
        for char in str(output)[index + 1:]:
            count += 1
            if char == '(':
                pars += 1
            elif char == ')':
                pars -= 1
            if pars == 0:
                func_args.append(str(output)[index:index + count + 1])
                break

    # Extract the quoted function names from the matched call snippets.
    functions = []
    for func in func_args:
        for ind, char2 in enumerate(func):
            if char2 == "'":
                function_name = ""
                for char in func[ind + 1:]:
                    if char == "'":
                        break
                    function_name += char
                functions.append(function_name)
                break
    functions = list(set(functions))
    functions = [x for x in functions if not x.startswith("$")]

    # Locate every occurrence of each function name in the source file.
    func_dict = {}
    for func in functions:
        indexes = [(m.start(), m.end()) for m in re.finditer(func, _file)]
        func_dict[func] = indexes

    # Drop occurrences that are function definitions rather than calls.
    for key, value in func_dict.items():
        for tup in value[:]:  # iterate over a copy; the list is mutated below
            start_char = tup[0]
            if _file[start_char - 9:start_char - 1] == "function":
                func_dict[key].remove(tup)

    # Collect the argument text of each call by matching parentheses.
    func_args_dict = {}
    for key, value in func_dict.items():
        func_args_dict[key] = []
        for tup in value:
            end_line = tup[1]
            pars = 1
            count = 0
            arguments = ""
            for char in _file[end_line + 1:]:
                count += 1
                if char == '(':
                    pars += 1
                elif char == ')':
                    pars -= 1
                if pars == 0:
                    arguments = _file[end_line + 1:end_line + count]
                    break
            func_args_dict[key].append(arguments)

    total_length = 0
    for key, value in func_args_dict.items():
        for arg in value:
            total_length += len(arg)

    if len(func_args_dict) == 0:
        avg_length_of_arguments_to_function = 0
    else:
        avg_length_of_arguments_to_function = float(total_length) / len(func_args_dict)
    return avg_length_of_arguments_to_function
def parse(self, phpsrc):
    """Read the source of a PHP file and include its entries as units."""
    def handle_array(prefix, nodes, lexer):
        prefix += lexer.extract_array()
        for item in nodes:
            assert isinstance(item, ArrayElement)
            # Skip empty keys
            if item.key == '':
                continue
            if isinstance(item.key, BinaryOp):
                name = '\'{0}\''.format(concatenate(item.key))
            elif isinstance(item.key, (int, float)):
                name = '{0}'.format(item.key)
            else:
                name = '\'{0}\''.format(item.key)
            if prefix:
                name = '{0}->{1}'.format(prefix, name)
            if isinstance(item.value, Array):
                handle_array(name, item.value.nodes, lexer)
            elif isinstance(item.value, six.string_types):
                self.create_and_add_unit(
                    name,
                    item.value,
                    lexer.extract_quote(),
                    lexer.extract_comments(item.lexpositions[1]),
                )

    def concatenate(item):
        if isinstance(item, six.string_types):
            return item
        elif isinstance(item, Variable):
            return item.name
        assert isinstance(item, BinaryOp)
        return concatenate(item.left) + concatenate(item.right)

    parser = make_parser()
    for item in parser.productions:
        item.callable = wrap_production(item.callable)
    lexer = PHPLexer()
    tree = parser.parse(phpsrc.decode(self.encoding), lexer=lexer, tracking=True)

    # Handle text without PHP start
    if len(tree) == 1 and isinstance(tree[0], InlineHTML):
        return self.parse(b'<?php\n' + phpsrc)

    for item in tree:
        if isinstance(item, FunctionCall):
            if item.name == 'define':
                self.create_and_add_unit(
                    lexer.extract_name('COMMA', *item.lexpositions),
                    item.params[1].node,
                    lexer.extract_quote(),
                    lexer.extract_comments(item.lexpositions[1]),
                )
        elif isinstance(item, Assignment):
            if isinstance(item.node, ArrayOffset):
                name = lexer.extract_name('EQUALS', *item.lexpositions)
                if isinstance(item.expr, six.string_types):
                    self.create_and_add_unit(
                        name,
                        item.expr,
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
                elif isinstance(item.expr, BinaryOp) and item.expr.op == '.':
                    self.create_and_add_unit(
                        name,
                        concatenate(item.expr),
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
            elif isinstance(item.node, Variable):
                name = lexer.extract_name('EQUALS', *item.lexpositions)
                if isinstance(item.expr, Array):
                    handle_array(name, item.expr.nodes, lexer)
                elif isinstance(item.expr, six.string_types):
                    self.create_and_add_unit(
                        name,
                        item.expr,
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
                elif isinstance(item.expr, BinaryOp) and item.expr.op == '.':
                    self.create_and_add_unit(
                        name,
                        concatenate(item.expr),
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
        elif isinstance(item, Return):
            if isinstance(item.node, Array):
                handle_array('return', item.node.nodes, lexer)
async def pre_ast(self):
    while not self.target_queue.empty():
        fileext = self.target_queue.get()

        if not self.lan:
            break

        if fileext[0] in ext_dict['php'] and 'php' in self.lan:
            # Handling logic for PHP files.
            for filepath in fileext[1]['list']:
                all_nodes = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'php'
                self.pre_result[filepath]['ast_nodes'] = []

                fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                code_content = fi.read()
                fi.close()

                # self.pre_result[filepath]['content'] = code_content

                try:
                    parser = make_parser()
                    all_nodes = parser.parse(code_content, debug=False,
                                             lexer=lexer.clone(), tracking=True)

                    # Merge into the result dict.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except SyntaxError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except AssertionError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))

                # Collect all constants.
                for node in all_nodes:
                    if isinstance(node, php.FunctionCall) and node.name == "define":
                        define_params = node.params
                        logger.debug("[AST][Pretreatment] new define {}={}".format(
                            define_params[0].node, define_params[1].node))

                        self.define_dict[define_params[0].node] = define_params[1].node

        elif fileext[0] in ext_dict['chromeext'] and 'chromeext' in self.lan:
            # Pretreatment for Chrome extensions:
            # extract the JS and HTML they contain.
            for filepath in fileext[1]['list']:
                child_files = []
                child_files_html = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'chromeext'

                # First, try to unzip the crx.
                try:
                    target_files_path = un_zip(filepath)
                    self.pre_result[filepath]['target_files_path'] = target_files_path
                except zipfile.BadZipFile:
                    logger.warning("[Pretreatment][Chrome Ext] file {} not zip".format(filepath))
                    continue
                except OSError:
                    logger.warning("[Pretreatment][Chrome Ext] file {} unzip error".format(filepath))
                    continue

                # Analyse manifest.json.
                manifest_path = os.path.join(target_files_path, "manifest.json")

                # The target may be a single file; handle it specially.
                if not (self.target_directory.endswith("/") or
                        self.target_directory.endswith("\\")) and not os.path.isdir(self.target_directory):
                    path_list = re.split(r'[\\|/]', self.target_directory)
                    relative_path = os.path.join(path_list[-1] + "_files")
                else:
                    relative_path = target_files_path.split(self.target_directory)[-1]

                    if relative_path.startswith('\\') or relative_path.startswith("/"):
                        relative_path = relative_path[1:]

                if os.path.isfile(manifest_path):
                    fi = codecs.open(manifest_path, "r", encoding='utf-8', errors='ignore')
                    manifest_content = fi.read()
                    fi.close()

                    try:
                        manifest = json.loads(manifest_content, encoding='utf-8')
                    except json.decoder.JSONDecodeError:
                        logger.warning("[Pretreatment][Chrome Ext] File {} parse error...".format(target_files_path))
                        continue

                    self.pre_result[filepath]["manifest"] = manifest

                    # Possible optimisation: if the JS and HTML do not need
                    # deeper inspection, skip them.
                    if len(self.lan) and self.lan == 'chromeext':
                        logger.debug("[Pretreatment][Chrome Ext] pass js & html scan...")
                        continue

                    # content scripts
                    if "content_scripts" in manifest:
                        for script in manifest["content_scripts"]:
                            if "js" in script:
                                child_files.extend([os.path.join(relative_path, js)
                                                    for js in script['js']])

                    # background js
                    if "background" in manifest:
                        if "scripts" in manifest["background"]:
                            child_files.extend([os.path.join(relative_path, js)
                                                for js in manifest["background"]["scripts"]])

                        # background html
                        if "page" in manifest["background"]:
                            child_files_html.append(os.path.join(relative_path,
                                                                 manifest["background"]["page"]))

                    # popup.html
                    if "browser_action" in manifest:
                        if "default_popup" in manifest["browser_action"]:
                            child_files_html.append(os.path.join(relative_path,
                                                                 manifest["browser_action"]["default_popup"]))

                    # web_accessible_resources
                    if "web_accessible_resources" in manifest:
                        for resource in manifest["web_accessible_resources"]:
                            if ".js" in resource:
                                child_files.append(os.path.join(relative_path, resource))
                            if ".html" in resource:
                                child_files_html.append(os.path.join(relative_path, resource))

                    # chrome_url_overrides
                    if "chrome_url_overrides" in manifest:
                        for key in manifest["chrome_url_overrides"]:
                            child_files_html.append(os.path.join(relative_path,
                                                                 manifest["chrome_url_overrides"][key]))

                    self.pre_result[filepath]["child_files"] = child_files

                    # Add the content scripts to the file list.
                    if len(child_files):
                        self.target_queue.put(('.js', {
                            'count': len(child_files),
                            'list': child_files
                        }))

                        # Mutate the externally supplied files via shallow copy.
                        self.file_list.append(('.js', {
                            'count': len(child_files),
                            'list': child_files
                        }))

                    if len(child_files_html):
                        self.target_queue.put(('.html', {
                            'count': len(child_files_html),
                            'list': child_files_html
                        }))
                else:
                    logger.warning("[Pretreatment][Chrome Ext] File {} parse error...".format(target_files_path))
                    continue

        elif fileext[0] in ext_dict['html'] and 'javascript' in self.lan:
            # For HTML, only the JS it contains is of interest.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)
                script_list = []

                try:
                    fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                    code_content = fi.read()
                    fi.close()
                except FileNotFoundError:
                    continue
                except OSError:
                    continue

                # tmp.js collects all inline JavaScript code.
                tmp_path = os.path.join(os.path.dirname(filepath), "tmp.js")
                fi2 = codecs.open(tmp_path, "a", encoding='utf-8', errors='ignore')

                try:
                    soup = BeautifulSoup(code_content, "html.parser")
                    script_tag_list = soup.find_all('script')

                    for script_tag in script_tag_list:
                        script_attrs = script_tag.attrs

                        if 'src' in script_attrs:
                            parents_path = os.path.normpath("\\".join(
                                re.split(r'[\\|/]', filepath)[:-1]))
                            script_path = os.path.join(parents_path, script_attrs['src'])
                            script_list.append(script_path)
                        else:
                            # Without src, the script is inline JS.
                            script_content = script_tag.string
                            fi2.write(" \n{}\n ".format(script_content))

                    fi2.close()

                    if tmp_path not in script_list:
                        script_list.append(tmp_path)

                    # Add the collected scripts to the file list.
                    self.target_queue.put(('.js', {
                        'count': len(script_list),
                        'list': script_list
                    }))

                    # Mutate the externally supplied files via shallow copy.
                    self.file_list.append(('.js', {
                        'count': len(script_list),
                        'list': script_list
                    }))
                except:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))
                    continue

        elif fileext[0] in ext_dict['javascript'] and 'javascript' in self.lan:
            # Pretreatment for JavaScript:
            # the JS needs semantic analysis.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)

                if not filepath.endswith(".js"):
                    continue

                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'javascript'
                self.pre_result[filepath]['ast_nodes'] = []

                try:
                    fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                    code_content = fi.read()
                    fi.close()
                except FileNotFoundError:
                    continue
                except OSError:
                    continue

                # Beautify the code and write it to a new file.
                new_filepath = filepath + ".pretty"

                try:
                    if not os.path.isfile(new_filepath):
                        fi2 = codecs.open(new_filepath, "w", encoding='utf-8', errors='ignore')
                        code_content = jsbeautifier.beautify(code_content)
                        fi2.write(code_content)
                        fi2.close()

                    # self.pre_result[filepath]['content'] = code_content

                    all_nodes = esprima.parse(code_content, {"loc": True})

                    # Merge into the result dict.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except SyntaxError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except AssertionError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except esprima.error_handler.Error:
                    logger.warning('[AST] [ERROR] Invalid regular expression in {}...'.format(filepath))
                except KeyboardInterrupt:
                    logger.log('[AST] stop...')
                    exit()
                except:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))
                    continue

    # Manual garbage collection.
    gc.collect()

    return True
def scan(code_content, vul_function, vul_function_line):
    parser = make_parser()
    all_nodes = export(parser.parse(code_content, debug=False,
                                    lexer=lexer.clone(), tracking=with_line))
    return traversal(all_nodes, vul_function, vul_function_line)
async def pre_ast(self):
    while not self.target_queue.empty():
        fileext = self.target_queue.get()

        if fileext[0] in ext_dict['php']:
            # Handling logic for PHP files.
            for filepath in fileext[1]['list']:
                all_nodes = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'php'
                self.pre_result[filepath]['ast_nodes'] = []

                fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                code_content = fi.read()
                fi.close()

                self.pre_result[filepath]['content'] = code_content

                try:
                    parser = make_parser()
                    all_nodes = parser.parse(code_content, debug=False,
                                             lexer=lexer.clone(), tracking=True)

                    # Merge into the result dict.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except SyntaxError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except AssertionError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))

                # Collect all constants.
                for node in all_nodes:
                    if isinstance(node, php.FunctionCall) and node.name == "define":
                        define_params = node.params
                        logger.debug("[AST][Pretreatment] new define {}={}".format(
                            define_params[0].node, define_params[1].node))

                        self.define_dict[define_params[0].node] = define_params[1].node

        elif fileext[0] in ext_dict['chromeext']:
            # Pretreatment for Chrome extensions:
            # extract the JS and HTML they contain.
            for filepath in fileext[1]['list']:
                child_files = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'chromeext'

                # First, try to unzip the crx.
                try:
                    target_files_path = un_zip(filepath)
                    self.pre_result[filepath]['target_files_path'] = target_files_path
                except zipfile.BadZipFile:
                    logger.warning("[Pretreatment][Chrome Ext] file {} not zip".format(filepath))
                    continue

                # Analyse manifest.json.
                manifest_path = os.path.join(target_files_path, "manifest.json")

                # The target may be a single file; handle it specially.
                if not (self.target_directory.endswith("/") or
                        self.target_directory.endswith("\\")) and not os.path.isdir(self.target_directory):
                    relative_path = os.path.join(
                        re.split(r'[\\|/]', self.target_directory)[-1] + "_files")
                else:
                    relative_path = target_files_path.split(self.target_directory)[-1]

                    if relative_path.startswith('\\') or relative_path.startswith("/"):
                        relative_path = relative_path[1:]

                if os.path.isfile(manifest_path):
                    fi = codecs.open(manifest_path, "r", encoding='utf-8', errors='ignore')
                    manifest_content = fi.read()
                    fi.close()

                    try:
                        manifest = json.loads(manifest_content, encoding='utf-8')
                    except json.decoder.JSONDecodeError:
                        logger.warning("[Pretreatment][Chrome Ext] File {} parse error...".format(target_files_path))
                        continue

                    self.pre_result[filepath]["manifest"] = manifest

                    if "content_scripts" in manifest:
                        for script in manifest["content_scripts"]:
                            if 'js' in script:
                                child_files.extend([os.path.join(relative_path, js)
                                                    for js in script['js']])

                    self.pre_result[filepath]["child_files"] = child_files

                    # Add the content scripts to the file list.
                    self.target_queue.put(('.js', {
                        'count': len(child_files),
                        'list': child_files
                    }))

                    # Mutate the externally supplied files via shallow copy.
                    self.file_list.append(('.js', {
                        'count': len(child_files),
                        'list': child_files
                    }))
                else:
                    logger.warning("[Pretreatment][Chrome Ext] File {} parse error...".format(target_files_path))
                    continue

        elif fileext[0] in ext_dict['javascript']:
            # Pretreatment for JavaScript:
            # the JS needs semantic analysis.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'javascript'
                self.pre_result[filepath]['ast_nodes'] = []

                fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                code_content = fi.read()
                fi.close()

                # Beautify the code and write it to a new file.
                new_filepath = filepath + ".pretty"

                if not os.path.isfile(new_filepath):
                    fi2 = codecs.open(new_filepath, "w", encoding='utf-8', errors='ignore')
                    code_content = jsbeautifier.beautify(code_content)
                    fi2.write(code_content)
                    fi2.close()

                self.pre_result[filepath]['content'] = code_content

                try:
                    all_nodes = esprima.parse(code_content, {"loc": True})

                    # Merge into the result dict.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except SyntaxError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except AssertionError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except esprima.error_handler.Error:
                    logger.warning('[AST] [ERROR] Invalid regular expression in {}...'.format(filepath))
                except KeyboardInterrupt:
                    logger.log('[AST] stop...')
                    exit()
                except:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))
                    continue

    # Manual garbage collection.
    gc.collect()

    return True
def pre_ast(self, lan=None):
    if lan is not None:
        # Check whether the language is in the list of AST-parseable languages.
        if not list(set(lan).intersection(set(could_ast_pase_lans))):
            logger.info("[AST][Pretreatment] Current scan target language does not require ast pretreatment...")
            return True

    for fileext in self.file_list:
        if fileext[0] in ext_dict['php']:
            # Handling logic for PHP files.
            for filepath in fileext[1]['list']:
                all_nodes = []

                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'php'
                self.pre_result[filepath]['ast_nodes'] = []

                fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                code_content = fi.read()
                self.pre_result[filepath]['content'] = code_content

                try:
                    parser = make_parser()
                    all_nodes = parser.parse(code_content, debug=False,
                                             lexer=lexer.clone(), tracking=True)

                    # Merge into the result dict.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except SyntaxError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except AssertionError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))

                # Collect all constants.
                for node in all_nodes:
                    if isinstance(node, php.FunctionCall) and node.name == "define":
                        define_params = node.params
                        logger.debug("[AST][Pretreatment] new define {}={}".format(
                            define_params[0].node, define_params[1].node))

                        self.define_dict[define_params[0].node] = define_params[1].node

        elif fileext[0] in ext_dict['chromeext']:
            child_files = []

            # Pretreatment for Chrome extensions:
            # extract the JS and HTML they contain.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'chromeext'

                # First, unzip the crx.
                target_files_path = un_zip(filepath)
                self.pre_result[filepath]['target_files_path'] = target_files_path

                # Analyse manifest.json.
                manifest_path = os.path.join(target_files_path, "manifest.json")
                relative_path = target_files_path.split(self.target_directory)[-1]

                if relative_path.startswith('\\') or relative_path.startswith("/"):
                    relative_path = relative_path[1:]

                if os.path.isfile(manifest_path):
                    fi = codecs.open(manifest_path, "r", encoding='utf-8', errors='ignore')
                    manifest_content = fi.read()
                    manifest = json.loads(manifest_content)

                    self.pre_result[filepath]["manifest"] = manifest

                    if "content_scripts" in manifest:
                        for script in manifest["content_scripts"]:
                            child_files.extend([os.path.join(relative_path, js)
                                                for js in script['js']])

                    self.pre_result[filepath]["child_files"] = child_files
                else:
                    logger.warning("[Pretreatment][Chrome Ext] File {} parse error...".format(target_files_path))
                    continue

        elif fileext[0] in ext_dict['javascript']:
            # Pretreatment for JavaScript:
            # the JS needs semantic analysis.
            for filepath in fileext[1]['list']:
                filepath = self.get_path(filepath)
                self.pre_result[filepath] = {}
                self.pre_result[filepath]['language'] = 'javascript'
                self.pre_result[filepath]['ast_nodes'] = []

                fi = codecs.open(filepath, "r", encoding='utf-8', errors='ignore')
                code_content = fi.read()
                self.pre_result[filepath]['content'] = code_content

                try:
                    all_nodes = esprima.parse(code_content, {"loc": True})

                    # Merge into the result dict.
                    self.pre_result[filepath]['ast_nodes'] = all_nodes

                except SyntaxError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except AssertionError as e:
                    logger.warning('[AST] [ERROR] parser {}: {}'.format(
                        filepath, traceback.format_exc()))
                except:
                    logger.warning('[AST] something error, {}'.format(
                        traceback.format_exc()))
def parse(self, phpsrc):
    """Read the source of a PHP file and include its entries as units."""
    def handle_array(prefix, nodes, lexer):
        prefix += lexer.extract_array()
        for item in nodes:
            assert isinstance(item, ArrayElement)
            if item.key is None:
                name = []
            else:
                # To update lexer current position
                lexer.extract_name("DOUBLE_ARROW", *item.lexpositions)
                if isinstance(item.key, BinaryOp):
                    name = "'{}'".format(concatenate(item.key))
                elif isinstance(item.key, (int, float)):
                    name = f"{item.key}"
                else:
                    name = f"'{item.key}'"
            if prefix:
                name = f"{prefix}->{name}"
            if isinstance(item.value, Array):
                handle_array(name, item.value.nodes, lexer)
            elif isinstance(item.value, str):
                self.create_and_add_unit(
                    name,
                    item.value,
                    lexer.extract_quote(),
                    lexer.extract_comments(item.lexpositions[1]),
                )

    def concatenate(item):
        if isinstance(item, str):
            return item
        elif isinstance(item, Variable):
            return item.name
        assert isinstance(item, BinaryOp)
        return concatenate(item.left) + concatenate(item.right)

    parser = make_parser()
    for item in parser.productions:
        item.callable = wrap_production(item.callable)
    lexer = PHPLexer()
    tree = parser.parse(phpsrc.decode(self.encoding), lexer=lexer, tracking=True)

    # Handle text without PHP start
    if len(tree) == 1 and isinstance(tree[0], InlineHTML):
        return self.parse(b"<?php\n" + phpsrc)

    for item in tree:
        if isinstance(item, FunctionCall):
            if item.name == "define":
                self.create_and_add_unit(
                    lexer.extract_name("COMMA", *item.lexpositions),
                    item.params[1].node,
                    lexer.extract_quote(),
                    lexer.extract_comments(item.lexpositions[1]),
                )
        elif isinstance(item, Assignment):
            if isinstance(item.node, ArrayOffset):
                name = lexer.extract_name("EQUALS", *item.lexpositions)
                if isinstance(item.expr, Array):
                    handle_array(name, item.expr.nodes, lexer)
                elif isinstance(item.expr, str):
                    self.create_and_add_unit(
                        name,
                        item.expr,
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
                elif isinstance(item.expr, BinaryOp) and item.expr.op == ".":
                    self.create_and_add_unit(
                        name,
                        concatenate(item.expr),
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
            elif isinstance(item.node, Variable):
                name = lexer.extract_name("EQUALS", *item.lexpositions)
                if isinstance(item.expr, Array):
                    handle_array(name, item.expr.nodes, lexer)
                elif isinstance(item.expr, str):
                    self.create_and_add_unit(
                        name,
                        item.expr,
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
                elif isinstance(item.expr, BinaryOp) and item.expr.op == ".":
                    self.create_and_add_unit(
                        name,
                        concatenate(item.expr),
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
        elif isinstance(item, Return):
            if isinstance(item.node, Array):
                # Adjust extractor position
                lexer.extract_name("RETURN", *item.lexpositions)
                handle_array("return", item.node.nodes, lexer)
def parse(self, phpsrc):
    """Read the source of a PHP file and include its entries as units."""
    def handle_array(prefix, nodes, lexer):
        prefix += lexer.extract_array()
        for item in nodes:
            assert isinstance(item, ArrayElement)
            # Skip empty keys
            if item.key == '':
                continue
            if isinstance(item.key, BinaryOp):
                name = '\'{0}\''.format(concatenate(item.key))
            elif isinstance(item.key, (int, float)):
                name = '{0}'.format(item.key)
            else:
                name = '\'{0}\''.format(item.key)
            if prefix:
                name = '{0}->{1}'.format(prefix, name)
            if isinstance(item.value, Array):
                handle_array(name, item.value.nodes, lexer)
            elif isinstance(item.value, str):
                self.create_and_add_unit(
                    name,
                    item.value,
                    lexer.extract_quote(),
                    lexer.extract_comments(item.lexpositions[1]),
                )

    def concatenate(item):
        if isinstance(item, str):
            return item
        elif isinstance(item, Variable):
            return item.name
        assert isinstance(item, BinaryOp)
        return concatenate(item.left) + concatenate(item.right)

    parser = make_parser()
    for item in parser.productions:
        item.callable = wrap_production(item.callable)
    lexer = PHPLexer()
    tree = parser.parse(phpsrc.decode(self.encoding), lexer=lexer, tracking=True)

    # Handle text without PHP start
    if len(tree) == 1 and isinstance(tree[0], InlineHTML):
        return self.parse(b'<?php\n' + phpsrc)

    for item in tree:
        if isinstance(item, FunctionCall):
            if item.name == 'define':
                self.create_and_add_unit(
                    lexer.extract_name('COMMA', *item.lexpositions),
                    item.params[1].node,
                    lexer.extract_quote(),
                    lexer.extract_comments(item.lexpositions[1]),
                )
        elif isinstance(item, Assignment):
            if isinstance(item.node, ArrayOffset):
                name = lexer.extract_name('EQUALS', *item.lexpositions)
                if isinstance(item.expr, str):
                    self.create_and_add_unit(
                        name,
                        item.expr,
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
                elif isinstance(item.expr, BinaryOp) and item.expr.op == '.':
                    self.create_and_add_unit(
                        name,
                        concatenate(item.expr),
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
            elif isinstance(item.node, Variable):
                name = lexer.extract_name('EQUALS', *item.lexpositions)
                if isinstance(item.expr, Array):
                    handle_array(name, item.expr.nodes, lexer)
                elif isinstance(item.expr, str):
                    self.create_and_add_unit(
                        name,
                        item.expr,
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
                elif isinstance(item.expr, BinaryOp) and item.expr.op == '.':
                    self.create_and_add_unit(
                        name,
                        concatenate(item.expr),
                        lexer.extract_quote(),
                        lexer.extract_comments(item.lexpositions[1]),
                    )
        elif isinstance(item, Return):
            if isinstance(item.node, Array):
                handle_array('return', item.node.nodes, lexer)
#!/usr/bin/env python
# php2python.py - Converts PHP to Python using unparse.py
# Usage: php2python.py < input.php > output.py

import sys
sys.path.append('..')

from phply.phplex import lexer
from phply.phpparse import make_parser
from phply import pythonast

from ast import Module
from unparse import Unparser

input = sys.stdin
output = sys.stdout

parser = make_parser()
body = [pythonast.from_phpast(ast)
        for ast in parser.parse(input.read(), lexer=lexer)]
Unparser(body, output)
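The conversion step of the script can also be run in isolation; a hedged sketch assuming phply's pythonast module behaves as imported above (the sample PHP statement is made up):

from phply.phplex import lexer
from phply.phpparse import make_parser
from phply import pythonast

php_nodes = make_parser().parse('<?php $x = 1 + 2;', lexer=lexer.clone())
py_nodes = [pythonast.from_phpast(node) for node in php_nodes]
print(py_nodes)  # a list of Python ast nodes ready for unparsing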
logging.info("No config file given, using default") config_file = DEFAULT_CONFIG return readConfig(config_file) if __name__ == "__main__": enableDebug() lexer = phplex.lexer phpFile = sys.argv[1] with open(phpFile, "r") as f: code = f.read() if code: # FIXME: assuming it's php. Handle php inside HTML parser = make_parser() if (not code.strip().startswith('<')): parser.parse('<?', lexer=lexer) lexer.lineno = 1 config = chooseConfigFile() try: rootNode = parser.parse(code, lexer=lexer) for rule in config: tryAnalyse(rule, rootNode, code) except SyntaxError as e: print(e, 'near', repr(e.text)) except: traceback.print_exc()
def run(self):
    from phply.phpparse import make_parser
    make_parser(debug=False)