def reformat(apis_file, all_sources, all_sinks, json_result_file, outfile):
    """Convert the JSON results of progpilot into a text-format ModuleStatic proto.

    Args:
        apis_file: path to the text-format AstLookupConfig listing the checked APIs.
        all_sources: the sources that were checked; forwarded to set_result / set_summary.
        all_sinks: the sinks that were checked; forwarded to set_result / set_summary.
        json_result_file: path to the JSON file holding the progpilot results.
        outfile: path the combined ModuleStatic is written to (text format).

    Returns:
        None. If the JSON results cannot be loaded, the failure is logged and
        nothing is written.
    """
    try:
        # use a context manager so the handle is closed even when parsing fails
        with open(json_result_file, 'r') as result_fd:
            results = json.load(result_fd)
    except Exception:
        # best effort: report the failure (with its traceback) and give up on this file
        logging.error("failed to load progpilot results in json: %s", json_result_file,
                      exc_info=True)
        return None
    logging.warning("there are %d sources and %d sinks checked!", len(all_sources), len(all_sinks))

    # load the astgen config from file
    config = AstLookupConfig()
    read_proto_from_file(config, apis_file, binary=False)
    logging.warning("loaded config with %d apis to check!", len(config.apis))

    result = ModuleResult()
    set_result(result=result, apis=config.apis, all_sources=all_sources, all_sinks=all_sinks,
               flows=results)
    summary = ModuleSummary()
    set_summary(summary=summary, apis=config.apis, all_sources=all_sources, all_sinks=all_sinks,
                new_sources=None, new_sinks=None)

    # merge the flow results and the summary into a single message
    static = ModuleStatic()
    static.flows.MergeFrom(result.flows)
    static.dangers.MergeFrom(result.dangers)
    static.sources.MergeFrom(summary.sources)
    static.sinks.MergeFrom(summary.sinks)
    static.taint_wrappers.MergeFrom(summary.taint_wrappers)
    write_proto_to_file(proto=static, filename=outfile, binary=False)
def astgen(self, inpath, outfile, root=None, configpath=None, pkg_name=None, pkg_version=None,
           evaluate_smt=False):
    """Run astgen.php on the input and store the result in outfile as text protobuf.

    Args:
        inpath: path to analyze (normalized by _sanitize_astgen_args).
        outfile: path of the result file; written in binary by astgen.php and
            then re-written in text format.
        root: optional value passed to astgen.php with -b.
        configpath: path to the text-format AstLookupConfig.
        pkg_name: optional value passed to astgen.php with -n.
        pkg_version: optional value passed to astgen.php with -v.
        evaluate_smt: when true, evaluate the smt formula and record the
            outcome in the result's config.
    """
    sanitized_args = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)
    analyze_path, is_decompress_path, outfile, root, configpath = sanitized_args

    # ./vendor/nikic/php-parser/bin/php-parse -d ../testdata/test-eval-exec.php
    lookup_config = AstLookupConfig()
    binary_configpath = configpath + '.bin'
    # astgen.php is handed the binary config, so derive it from the text one
    self._pb_text_to_bin(proto=lookup_config, infile=configpath, outfile=binary_configpath)

    command = ['php', 'astgen.php', '-c', binary_configpath, '-i', analyze_path, '-o', outfile]
    # optional flags are appended only when a value was supplied
    for flag, value in (('-b', root), ('-n', pkg_name), ('-v', pkg_version)):
        if value is not None:
            command.extend([flag, value])
    exec_command("php astgen", command, cwd="static_proxy")

    # the php side wrote binary output; load it so it can be saved as text
    ast_results = PkgAstResults()
    read_proto_from_file(ast_results, filename=outfile, binary=True)

    # optionally evaluate smt formula
    if evaluate_smt:
        ast_results.pkgs[0].config.smt_satisfied = self._check_smt(
            astgen_results=[ast_results], configpath=configpath)

    # save the results in text format
    write_proto_to_file(ast_results, filename=outfile, binary=False)

    # clean up residues
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
def _check_smt(astgen_results, configpath=None):
    """Evaluate the config's smt formula against the APIs found by astgen.

    Every run of digits in the formula is treated as an API id and rewritten
    to a membership test against the ids of the APIs that were actually used;
    the rewritten expression is then evaluated.

    Args:
        astgen_results: list of astgen results. Only the first package
            (pkgs[0]) of each entry is inspected.
        configpath: optional path to a text-format AstLookupConfig. When it is
            not given, the config embedded in the first astgen result is used.

    Returns:
        The value the rewritten formula evaluates to, or False when
        astgen_results is empty or None.
    """
    if not astgen_results:
        logging.warning("no astgen_results specified, returning False!")
        return False

    # if configpath is not specified, use the config in any of the astgen result, o.w. use configpath
    if configpath:
        configpb = AstLookupConfig()
        read_proto_from_file(configpb, configpath, binary=False)
    else:
        configpb = astgen_results[0].pkgs[0].config
    logging.warning("checking satisfiability of smt formula %s", configpb.smt_formula)

    used_apis = set()
    # FIXME: works if each astgen_result has only one pkg
    # Get the results from the different packages in the astgen results
    for astgen_result in astgen_results:
        pkg = astgen_result.pkgs[0]
        if not pkg.api_results:
            continue
        if pkg.config.func_only:
            # func only match: a partial name stands for every full name it maps to
            partial_name2full_names = StaticAnalyzer._get_partial_name2full_names(pkg.config.apis)
            for api_result in pkg.api_results:
                partial_name = StaticAnalyzer._get_api_partial_name(api_result)
                used_apis.update(partial_name2full_names[partial_name])
        else:
            # full name match
            used_apis.update(api_result.full_name for api_result in pkg.api_results)

    # Transform the names found the astgen results to the numbers used in the formula
    logging.warning("there are %d used apis: %s", len(used_apis), used_apis)
    used_apis_numerical = [api.id for api in configpb.apis if api.full_name in used_apis]

    # Transform the formula (the variable that will be evaluated is used_apis_numerical)
    smt_formula = re.sub(r'(\d+)', r'(\1 in used_apis_numerical)', configpb.smt_formula)
    # NOTE(security): eval() executes whatever expression the config contains, so the
    # config must come from a trusted source. Only the id list is exposed to the
    # expression (builtins stay available); this is not a sandbox.
    satisfied = eval(smt_formula, {'used_apis_numerical': used_apis_numerical})
    logging.warning("satisfiability = %s", satisfied)
    return satisfied
def ast_to_trigger_words(config_path, trigger_words_path):
    """Write the sources and sinks of an astgen config as a JSON trigger-word file.

    Each API contributes its name followed by "(": the short name when the
    config is in func_only mode, the full name otherwise. SOURCE APIs go into
    the "sources" list; SINK and DANGER APIs become keys of the "sinks"
    mapping (each mapped to an empty dict). Other APIs are ignored.

    Args:
        config_path: path to the text-format AstLookupConfig.
        trigger_words_path: path of the JSON file to write.
    """
    config = AstLookupConfig()
    read_proto_from_file(config, config_path, binary=False)

    source_set = set()
    sink_set = set()
    for api in config.apis:
        # TODO: add support for instantiable field in API comparison
        if api.functionality == ast_pb2.SOURCE:
            bucket = source_set
        elif api.functionality in (ast_pb2.SINK, ast_pb2.DANGER):
            bucket = sink_set
        else:
            continue
        api_name = api.name if config.func_only else api.full_name
        bucket.add(api_name + "(")

    trigger_words = {}
    trigger_words["sources"] = list(source_set)
    trigger_words["sinks"] = {key: {} for key in sink_set}
    # close (and thereby flush) the output file deterministically
    with open(trigger_words_path, 'w') as trigger_fd:
        json.dump(trigger_words, trigger_fd, indent=2)
def taint(self, inpath, outfile, configpath=None, pkg_name=None, pkg_version=None):
    """Run the jsprime wrapper for static taint analysis and save the result as text protobuf.

    Args:
        inpath: path to analyze (normalized by _sanitize_astgen_args).
        outfile: path of the result file; the wrapper's binary ModuleStatic
            output is read from it and re-written in text format.
        configpath: path to the text-format AstLookupConfig.
        pkg_name: package name handed to jsprime_wrapper.js. When None, the
            basename of the analyzed path is used instead
            (NOTE(review): presumably exec_command needs string arguments,
            so None could not be passed through -- confirm).
        pkg_version: accepted but not used by this method.
    """
    analyze_path, is_decompress_path, outfile, _, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=None, configpath=configpath, language=self.language)

    try:
        if pkg_name is None:
            pkg_name = os.path.basename(analyze_path)

        # convert the config to binary
        configpb = AstLookupConfig()
        configpath_bin = configpath + '.bin'
        # create binary config from text format
        self._pb_text_to_bin(proto=configpb, infile=configpath, outfile=configpath_bin)

        try:
            # perform static taint analysis
            taint_cmd = ['node', 'jsprime_wrapper.js', pkg_name, analyze_path, configpath_bin,
                         outfile]
            exec_command("javascript taint", taint_cmd, cwd="static_proxy/jsprime")
            pkg_static = ModuleStatic()
            read_proto_from_file(pkg_static, outfile, binary=True)
            logging.warning("taint analysis results: %s", pkg_static)

            # save resultpb
            write_proto_to_file(pkg_static, filename=outfile, binary=False)
        finally:
            # the binary config is only a temporary artifact; drop it even on failure
            os.remove(configpath_bin)
    finally:
        # clean up residues, also when the analysis failed
        self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
def _gen_combined_configpath(self, configpath, dep_taint_results):
    """Create a config file that extends configpath with APIs from taint results.

    The nodes of all sources and sinks found in dep_taint_results are appended
    to the APIs of the config loaded from configpath, and the combined config
    is written to a new temporary file in text format.

    Args:
        configpath: path to the text-format AstLookupConfig to start from.
        dep_taint_results: iterable of taint results (module_pb2.ModuleStatic).

    Returns:
        Path of the newly written temporary config file. The file is not
        deleted by this method.
    """
    # load the old config
    configpb = AstLookupConfig()
    read_proto_from_file(configpb, configpath, binary=False)

    # iterate through the taint results to update configpb
    num_new_sources = 0
    num_new_sinks = 0
    for dep_taint_result in dep_taint_results:
        # dep_taint_result is of type module_pb2.ModuleStatic
        for new_source in dep_taint_result.sources:
            configpb.apis.append(new_source.node)
            num_new_sources += 1
        for new_sink in dep_taint_result.sinks:
            configpb.apis.append(new_sink.node)
            num_new_sinks += 1
    if num_new_sources + num_new_sinks > 0:
        logging.warning("added %d new sources and %d new sinks!", num_new_sources, num_new_sinks)

    # generate the new config file
    outf = tempfile.NamedTemporaryFile(prefix='configpath-', delete=False)
    # only the unique name is needed; close the handle so it is not leaked
    # (delete=False keeps the file on disk after closing)
    outf.close()
    write_proto_to_file(proto=configpb, filename=outf.name, binary=False)
    return outf.name
# the add() function returns reference sink = summary.sinks.add() sink.node.type = AstNode.FUNCTION_DECL sink.node.name = "post" sink.node.full_name = "requests.post" sink.node.base_type = "requests" sink.node.arguments.extend(["url", "mode", "data"]) # TODO: set the source range for sink.node reachable_old_sink = sink.reachable_sinks.add() reachable_old_sink.CopyFrom(get_mock_sink_node()) reachable_old_sink.sink_type = ast_pb.SINK_NETWORK # load the astgen config from file config = AstLookupConfig() read_proto_from_file(config, '../../../config/astgen_python_smt.config', binary=False) print("loaded config with %d apis to check!" % len(config.apis)) # initialize result and summary result = ModuleResult() summary = ModuleSummary() static = ModuleStatic() # compute and fill the results into protobuf set_result(result) set_summary(summary) static.flows.MergeFrom(result.flows) static.dangers.MergeFrom(result.dangers)
def ast_to_progpilot(
        config_path, out_path, new_sources_path, new_sinks_path, new_configuration_path,
        sources_path=join(package_directory, "../../config/php_api/progpilot_sources.json"),
        sinks_path=join(package_directory, "../../config/php_api/progpilot_sinks.json"),
        configuration_path=join(package_directory, "../../config/static_php_progpilot.yml")):
    """Generate progpilot source/sink/configuration files from an astgen config.

    The SOURCE APIs and the SINK/DANGER APIs of the astgen config are converted
    into progpilot entries and appended to the entries loaded from sources_path
    and sinks_path, skipping entries for which is_in_progpilot_entries reports a
    match. Sinks without arg_nodes are skipped.

    Args:
        config_path: path to the text-format AstLookupConfig.
        out_path: value stored as the progpilot output file in the new configuration.
        new_sources_path: path the combined sources JSON is written to.
        new_sinks_path: path the combined sinks JSON is written to.
        new_configuration_path: path the updated progpilot YAML configuration is written to.
        sources_path: path to the base progpilot sources JSON.
        sinks_path: path to the base progpilot sinks JSON.
        configuration_path: path to the base progpilot YAML configuration.

    Returns:
        Tuple (all_sources, all_sinks) with the combined entry lists.

    Raises:
        Exception: if a SOURCE API has arg_nodes, which is not supported.
    """
    with open(sources_path, 'r') as sources_fd:
        progpilot_sources = json.load(sources_fd)
    with open(sinks_path, 'r') as sinks_fd:
        progpilot_sinks = json.load(sinks_fd)

    config = AstLookupConfig()
    read_proto_from_file(config, config_path, binary=False)

    maloss_sources = []
    maloss_sinks = []
    for api in config.apis:
        # TODO: add support for instantiable field in API comparison
        if api.functionality == ast_pb2.SOURCE:
            api_json = {'name': api.name, 'is_function': True, 'language': 'php'}
            if not config.func_only and api.base_type:
                api_json['instanceof'] = api.base_type
            if api.arg_nodes:
                raise Exception("Cannot handle arg_nodes for Sources now: %s" % api)
            maloss_sources.append(api_json)
        elif api.functionality in (ast_pb2.SINK, ast_pb2.DANGER):
            if not api.arg_nodes:
                # TODO: all sinks must have parameters (?)
                continue
            api_json = {
                'name': api.name,
                'language': 'php',
                'attack': 'maloss_sink',
                'cwe': 'CWE_89'
            }
            if not config.func_only and api.base_type:
                api_json['instanceof'] = api.base_type
            api_json['parameters'] = [{'id': api_arg.id} for api_arg in api.arg_nodes]
            maloss_sinks.append(api_json)

    # combine to get all sources and sinks
    all_sources = progpilot_sources['sources'] + [
        ms for ms in maloss_sources
        if not is_in_progpilot_entries(progpilot_entries=progpilot_sources['sources'],
                                       new_entry=ms)
    ]
    all_sinks = progpilot_sinks['sinks'] + [
        ms for ms in maloss_sinks
        if not is_in_progpilot_entries(progpilot_entries=progpilot_sinks['sinks'], new_entry=ms)
    ]
    with open(new_sources_path, 'w') as new_sources_fd:
        json.dump({'sources': all_sources}, new_sources_fd, indent=2)
    with open(new_sinks_path, 'w') as new_sinks_fd:
        json.dump({'sinks': all_sinks}, new_sinks_fd, indent=2)

    # safe_load: the configuration is plain data, and yaml.load without an explicit
    # Loader is unsafe on old PyYAML releases and rejected by recent ones
    with open(configuration_path, 'r') as configuration_fd:
        progpilot_config = yaml.safe_load(configuration_fd)
    progpilot_config['inputs']['setSources'] = new_sources_path
    progpilot_config['inputs']['setSinks'] = new_sinks_path
    progpilot_config['outputs']['setOutfile'] = out_path
    with open(new_configuration_path, 'w') as new_configuration_fd:
        yaml.dump(progpilot_config, new_configuration_fd)
    return all_sources, all_sinks
"--package_version", dest="package_version", help="Package version of the specified input.") parser.add_argument( "-c", "--configpath", dest="configpath", help= "Optional path to the filter of nodes, stored in proto buffer format (AstLookupConfig in ast.proto)." ) return parser.parse_args(argv) if __name__ == "__main__": # Parse options args = parse_args(sys.argv[1:]) # Load config pb configpath = args.configpath configpb = AstLookupConfig() read_proto_from_file(configpb, configpath, binary=False) logging.debug("loaded lookup config from %s:\n%s", configpath, configpb) # Run the ast generation py3_astgen(inpath=args.inpath, outfile=args.outfile, configpb=configpb, root=args.root, pkg_name=args.package_name, pkg_version=args.package_version)
def astgen(self, inpath, outfile, root=None, configpath=None, pkg_name=None, pkg_version=None,
           evaluate_smt=False):
    """Collect the configured API usages from Python sources and save them to outfile.

    The input files are parsed with the ast module of the running interpreter.
    If any file raises a SyntaxError, the whole analysis is handed over to
    astgen_py3.py through a python3 subprocess. Any other exception raised
    while analyzing is logged and not re-raised.

    Args:
        inpath: path to analyze (normalized by _sanitize_astgen_args).
        outfile: path of the result file (PkgAstResults in text format).
        root: optional root passed to _get_infiles and _get_filepb; forwarded
            to the python3 fallback with -b.
        configpath: path to the text-format AstLookupConfig.
        pkg_name: package name; defaults to the basename of the analyzed path.
        pkg_version: optional package version.
        evaluate_smt: when true, re-read outfile, evaluate the smt formula and
            store the outcome in the result's config.
    """
    analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)

    # try python2
    try:
        # load the config proto
        configpb = AstLookupConfig()
        read_proto_from_file(configpb, configpath, binary=False)
        logging.debug("loaded lookup config from %s:\n%s", configpath, configpb)

        # invoke the language specific ast generators to call functions
        # get input files
        infiles, root = self._get_infiles(inpath=analyze_path, root=root, language=self.language)

        # initialize resultpb
        resultpb = PkgAstResults()
        pkg = resultpb.pkgs.add()
        pkg.config.CopyFrom(configpb)
        pkg.pkg_name = pkg_name if pkg_name is not None else basename(analyze_path)
        if pkg_version is not None:
            pkg.pkg_version = pkg_version
        pkg.language = ast_pb2.PYTHON
        for infile in infiles:
            # read through a context manager so no handle is leaked per source file
            with open(infile, 'r') as source_fd:
                all_source = source_fd.read()
            try:
                tree = ast.parse(all_source, filename=infile)
            except SyntaxError as se:
                logging.warning("Syntax error %s parsing file %s in python2!", se, infile)
                # propagate to the outer handler, which switches to python3
                raise

            # mark the tree with tokens information
            asttok = asttokens.ASTTokens(source_text=all_source, tree=tree, filename=infile)
            visitor = PythonDeclRefVisitor(asttok=asttok, configpb=configpb)
            visitor.visit(tree)
            logging.warning("collected functions: %s", Counter(visitor.get_declrefs()).items())

            filepb = self._get_filepb(infile, root)
            for base, name, args, source_text, source_range in visitor.get_declrefs():
                api_result = self._get_api_result(base, name, args, source_text, source_range,
                                                  filepb)
                pkg.api_results.add().CopyFrom(api_result)

        # save resultpb
        write_proto_to_file(resultpb, outfile, binary=False)

    # try python3
    except SyntaxError as se:
        logging.error("Syntax error %s, now trying to parse %s again in python3!", se,
                      analyze_path)
        astgen_py3_cmd = ['python3', 'astgen_py3.py', analyze_path, outfile, '-c', configpath]
        if root is not None:
            astgen_py3_cmd.extend(['-b', root])
        if pkg_name is not None:
            astgen_py3_cmd.extend(['-n', pkg_name])
        if pkg_version is not None:
            astgen_py3_cmd.extend(['-v', pkg_version])
        exec_command("python3 astgen", astgen_py3_cmd, cwd="static_proxy")
    except Exception as e:
        logging.error("Fatal error %s running astgen for %s!", e, analyze_path)

    # optionally evaluate smt formula
    if evaluate_smt:
        # outfile may have been produced by the python3 fallback, so reload it
        resultpb = PkgAstResults()
        read_proto_from_file(resultpb, filename=outfile, binary=False)
        satisfied = self._check_smt(astgen_results=[resultpb], configpath=configpath)
        resultpb.pkgs[0].config.smt_satisfied = satisfied
        write_proto_to_file(resultpb, filename=outfile, binary=False)

    # clean up residues
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
def reformat(apis_file, json_result_file, outfile):
    """Convert the JSON results of pyt into a text-format ModuleStatic proto.

    Args:
        apis_file: path to the text-format AstLookupConfig listing the checked APIs.
        json_result_file: path to the JSON file holding the pyt results; its
            'vulnerabilities' list is processed.
        outfile: path the combined ModuleStatic is written to (text format).

    Returns:
        None. If the JSON results cannot be loaded, the failure is logged and
        nothing is written.
    """
    try:
        # use a context manager so the handle is closed even when parsing fails
        with open(json_result_file, 'r') as result_fd:
            results = json.load(result_fd)
    except Exception:
        # best effort: report the failure (with its traceback) and give up on this file
        logging.error("failed to load pyt results in json: %s", json_result_file, exc_info=True)
        return None

    # load the astgen config from file
    config = AstLookupConfig()
    read_proto_from_file(config, apis_file, binary=False)
    logging.warning("loaded config with %d apis to check!", len(config.apis))

    # convert list of apis into dictionary with key=id, value=full_name for easier identification
    source_dict = {}
    sink_dict = {}
    for entry in config.apis:
        # FIXME: should we support func_only mode
        if entry.functionality == ast_pb2.SOURCE:
            source_dict[entry.id] = entry.full_name
        elif entry.functionality in (ast_pb2.SINK, ast_pb2.DANGER):
            sink_dict[entry.id] = entry.full_name

    nodes = []
    # dictionary with key=name of file within package found to contain vulnerabilities and
    # value=tuple of (tree, asttok) for that file
    vuln_files_ASTs = {}
    for entry in results['vulnerabilities']:
        source = entry['source']  # source['label'], source['line_number'], source['path']
        source_trigger_word = entry['source_trigger_word']
        sink = entry['sink']  # sink['label'], sink['line_number'], sink['path']
        sink_trigger_word = entry['sink_trigger_word']
        api_type = entry['type']
        reassignment_nodes = entry['reassignment_nodes']  # of type dict
        # placeholders; the parsed trees are filled in below
        vuln_files_ASTs[source['path']] = ()
        vuln_files_ASTs[sink['path']] = ()
        nodes.append(
            Vulnerability(source, source_trigger_word, sink, sink_trigger_word, api_type,
                          reassignment_nodes))

    # initiate AST visitors (one tree per vulnerable file within package)
    for vuln_file in vuln_files_ASTs:
        with open(vuln_file, 'r') as source_fd:
            src_ast = source_fd.read()
        tree = ast.parse(src_ast, filename=vuln_file)
        asttok = asttokens.ASTTokens(source_text=src_ast, tree=tree, filename=vuln_file)
        # only values of existing keys are replaced, so the dict size is unchanged while iterating
        vuln_files_ASTs[vuln_file] = (tree, asttok)

    # initialize result and summary
    result = ModuleResult()
    set_result(result, config.apis, source_dict, sink_dict, nodes, vuln_files_ASTs)
    summary = ModuleSummary()
    set_summary(summary, config.apis, source_dict, sink_dict, nodes, vuln_files_ASTs)

    # merge the flow results and the summary into a single message
    static = ModuleStatic()
    static.flows.MergeFrom(result.flows)
    static.dangers.MergeFrom(result.dangers)
    static.sources.MergeFrom(summary.sources)
    static.sinks.MergeFrom(summary.sinks)
    static.taint_wrappers.MergeFrom(summary.taint_wrappers)
    write_proto_to_file(proto=static, filename=outfile, binary=False)
def astgen(self, inpath, outfile, root=None, configpath=None, pkg_name=None, pkg_version=None,
           evaluate_smt=False):
    """Collect the configured API usages from JavaScript sources and save them to outfile.

    There are two ways to implement the javascript ast parsing, each with its
    pros and cons. One is to use the npm esprima module directly, the other is
    to use the pypi esprima module.
    1. The npm module is the latest version and has lots of features to use
       directly, but it doesn't have a visitor and requires a manual
       implementation.
    2. The pypi module is claimed to be a line by line translation of esprima
       in python, but it may be outdated and inactively maintained. However,
       it contains a visitor similar to python ast.NodeVisitor that we can
       use directly.
    To minimize the effort, the latter is currently used.

    Files that fail to parse are logged and skipped.

    Args:
        inpath: path to analyze (normalized by _sanitize_astgen_args).
        outfile: path of the result file (PkgAstResults in text format).
        root: optional root passed to _get_infiles and _get_filepb.
        configpath: path to the text-format AstLookupConfig.
        pkg_name: package name; defaults to the basename of the analyzed path.
        pkg_version: optional package version.
        evaluate_smt: when true, evaluate the smt formula and store the
            outcome in the result's config before saving.
    """
    analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)

    # load the config proto
    configpb = AstLookupConfig()
    read_proto_from_file(configpb, configpath, binary=False)
    logging.debug("loaded lookup config from %s:\n%s", configpath, configpb)

    # invoke the language specific ast generators to call functions
    # FIXME: current testdata sometimes fails the analyzer, inspect it!
    # get input files
    infiles, root = self._get_infiles(inpath=analyze_path, root=root, language=self.language)

    # initialize resultpb
    resultpb = PkgAstResults()
    pkg = resultpb.pkgs.add()
    pkg.config.CopyFrom(configpb)
    pkg.pkg_name = pkg_name if pkg_name is not None else basename(analyze_path)
    if pkg_version is not None:
        pkg.pkg_version = pkg_version
    pkg.language = ast_pb2.JAVASCRIPT
    for infile in infiles:
        # read through a context manager so no handle is leaked per source file
        with open(infile, 'r') as source_fd:
            all_source = source_fd.read()
        try:
            # tree = esprima.parseModule(), esprima.parseScript()
            tree = esprima.parse(all_source, options={'loc': True})
        except Exception as e:
            logging.error("Fatal error %s parsing file %s! Skipping this file!", e, infile)
            continue
        visitor = JavaScriptDeclRefVisitor(source=all_source, configpb=configpb)
        visitor.visit(tree)
        logging.warning("collected functions: %s", Counter(visitor.get_declrefs()).items())

        filepb = self._get_filepb(infile, root)
        for base, name, args, source_text, source_range in visitor.get_declrefs():
            api_result = self._get_api_result(base, name, args, source_text, source_range, filepb)
            pkg.api_results.add().CopyFrom(api_result)

    # optionally evaluate smt formula
    if evaluate_smt:
        satisfied = self._check_smt(astgen_results=[resultpb], configpath=configpath)
        resultpb.pkgs[0].config.smt_satisfied = satisfied

    # save resultpb
    write_proto_to_file(resultpb, outfile, binary=False)

    # clean up residues
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)