Ejemplo n.º 1
0
def reformat(apis_file, all_sources, all_sinks, json_result_file, outfile):
    """Convert progpilot JSON results into a text-format ModuleStatic proto.

    Args:
        apis_file: Path to the text-format AstLookupConfig proto.
        all_sources: Sources that were checked; forwarded unchanged to
            set_result and set_summary.
        all_sinks: Sinks that were checked; forwarded unchanged to
            set_result and set_summary.
        json_result_file: Path to the JSON file holding the progpilot flows.
        outfile: Path the resulting ModuleStatic proto is written to.

    Returns:
        Always None. When the JSON results cannot be loaded an error is
        logged and nothing is written.
    """
    try:
        # Use a context manager so the handle is closed even if parsing fails.
        with open(json_result_file, 'r') as result_fp:
            results = json.load(result_fp)
    except Exception:
        # Best effort: any failure to read or parse the results aborts the
        # conversion without raising to the caller.
        logging.error("failed to load progpilot results in json: %s",
                      json_result_file)
        return None

    logging.warning("there are %d sources and %d sinks checked!",
                    len(all_sources), len(all_sinks))
    # load the astgen config from file
    config = AstLookupConfig()
    read_proto_from_file(config, apis_file, binary=False)
    logging.warning("loaded config with %d apis to check!", len(config.apis))

    # flows/dangers come from the result proto
    result = ModuleResult()
    set_result(result=result,
               apis=config.apis,
               all_sources=all_sources,
               all_sinks=all_sinks,
               flows=results)
    # sources/sinks/taint wrappers come from the summary proto
    summary = ModuleSummary()
    set_summary(summary=summary,
                apis=config.apis,
                all_sources=all_sources,
                all_sinks=all_sinks,
                new_sources=None,
                new_sinks=None)

    # merge both into the single proto that is persisted
    static = ModuleStatic()
    static.flows.MergeFrom(result.flows)
    static.dangers.MergeFrom(result.dangers)
    static.sources.MergeFrom(summary.sources)
    static.sinks.MergeFrom(summary.sinks)
    static.taint_wrappers.MergeFrom(summary.taint_wrappers)
    write_proto_to_file(proto=static, filename=outfile, binary=False)
Ejemplo n.º 2
0
    def astgen(self,
               inpath,
               outfile,
               root=None,
               configpath=None,
               pkg_name=None,
               pkg_version=None,
               evaluate_smt=False):
        """Run the PHP astgen tool on inpath and store the results as text proto.

        The text-format lookup config is first converted to a binary file
        (configpath + '.bin'), which is handed to `php astgen.php`. The
        binary output written by the tool to outfile is then read back,
        optionally annotated with the SMT satisfiability, and rewritten to
        outfile in text format.

        Args:
            inpath: Input to analyze; sanitized by _sanitize_astgen_args.
            outfile: Path of the output proto file.
            root: Optional value passed to the tool with '-b'.
            configpath: Path to the text-format AstLookupConfig.
            pkg_name: Optional package name passed with '-n'.
            pkg_version: Optional package version passed with '-v'.
            evaluate_smt: When true, evaluate the SMT formula via _check_smt
                and record the outcome in the first package's config.
        """
        sanitized_args = self._sanitize_astgen_args(inpath=inpath,
                                                    outfile=outfile,
                                                    root=root,
                                                    configpath=configpath,
                                                    language=self.language)
        analyze_path, is_decompress_path, outfile, root, configpath = sanitized_args

        # ./vendor/nikic/php-parser/bin/php-parse  -d ../testdata/test-eval-exec.php
        lookup_config = AstLookupConfig()
        binary_config_path = configpath + '.bin'

        # the php tool consumes the binary serialization of the config
        self._pb_text_to_bin(proto=lookup_config,
                             infile=configpath,
                             outfile=binary_config_path)

        command = ['php', 'astgen.php']
        command += ['-c', binary_config_path]
        command += ['-i', analyze_path]
        command += ['-o', outfile]
        # optional arguments are only forwarded when they were provided
        optional_flags = (('-b', root), ('-n', pkg_name), ('-v', pkg_version))
        for flag, value in optional_flags:
            if value is not None:
                command.extend([flag, value])
        exec_command("php astgen", command, cwd="static_proxy")

        # the tool wrote binary output; load it so it can be re-saved as text
        ast_results = PkgAstResults()
        read_proto_from_file(ast_results, filename=outfile, binary=True)

        # optionally evaluate smt formula
        if evaluate_smt:
            is_satisfied = self._check_smt(astgen_results=[ast_results],
                                           configpath=configpath)
            ast_results.pkgs[0].config.smt_satisfied = is_satisfied

        # overwrite the binary output with the text-format results
        write_proto_to_file(ast_results, filename=outfile, binary=False)

        # clean up residues
        self._cleanup_astgen(analyze_path=analyze_path,
                             is_decompress_path=is_decompress_path)
Ejemplo n.º 3
0
    def _check_smt(astgen_results, configpath=None):
        """Evaluate the config's SMT formula against the APIs found by astgen.

        Every number in the formula is treated as an API id and rewritten to
        a membership test against the ids of the APIs that were actually
        used; the resulting Python boolean expression is then evaluated.

        NOTE(review): this function takes no ``self``; it is presumably
        decorated as a staticmethod outside this view -- confirm.

        Args:
            astgen_results: Sequence of astgen result protos. Only the first
                package (``pkgs[0]``) of each result is considered.
            configpath: Optional path to a text-format AstLookupConfig. When
                falsy, the config embedded in the first astgen result is used.

        Returns:
            False when astgen_results is empty, otherwise the value the
            rewritten formula evaluates to.
        """
        if not astgen_results:
            logging.warning("no astgen_results specified, returning False!")
            return False
        # if configpath is not specified, use the config in any of the astgen result, o.w. use configpath
        if configpath:
            configpb = AstLookupConfig()
            read_proto_from_file(configpb, configpath, binary=False)
        else:
            configpb = astgen_results[0].pkgs[0].config
        logging.warning("checking satisfiability of smt formula %s",
                        configpb.smt_formula)

        used_apis = set()

        # FIXME: works if each astgen_result has only one pkg
        # Get the results from the different packages in the astgen results
        for current_package in astgen_results:
            current_package_results = current_package.pkgs[0].api_results
            current_package_config = current_package.pkgs[0].config
            if not current_package_results:
                continue
            if current_package_config.func_only:
                # func only match: a partial name maps to every full name sharing it
                partial_name2full_names = StaticAnalyzer._get_partial_name2full_names(
                    current_package_config.apis)
                for api_result in current_package_results:
                    partial_name = StaticAnalyzer._get_api_partial_name(
                        api_result)
                    used_apis.update(partial_name2full_names[partial_name])
            else:
                # full name match
                used_apis.update(api_result.full_name
                                 for api_result in current_package_results)

        # Transform the names found the astgen results to the numbers used in the formula
        logging.warning("there are %d used apis: %s", len(used_apis),
                        used_apis)
        # A set gives O(1) membership tests for the rewritten formula. The
        # name must stay `used_apis_numerical` because the formula below
        # refers to it when evaluated in this function's scope.
        used_apis_numerical = {
            current_api.id for current_api in configpb.apis
            if current_api.full_name in used_apis
        }

        # Transform the formula (the variable that will be evaluated is used_apis_numerical)
        smt_formula = re.sub(r'(\d+)', r'(\1 in used_apis_numerical)',
                             configpb.smt_formula)

        # SECURITY NOTE(review): eval() executes the formula as arbitrary
        # Python code. This is only acceptable while the config file is
        # trusted; a config from an untrusted origin must not reach this call.
        satisfied = eval(smt_formula)
        logging.warning("satisfiability = %s", satisfied)
        return satisfied
Ejemplo n.º 4
0
def ast_to_trigger_words(config_path, trigger_words_path):
    """Write a trigger-word JSON file derived from an AstLookupConfig.

    Each SOURCE api becomes an entry of the "sources" list and each SINK or
    DANGER api becomes a key of the "sinks" mapping (with an empty dict as
    value). An entry is the api name followed by "(": the short ``name``
    when the config is in func_only mode, otherwise the ``full_name``.

    Args:
        config_path: Path to the text-format AstLookupConfig proto.
        trigger_words_path: Path of the JSON file to write.
    """
    config = AstLookupConfig()
    read_proto_from_file(config, config_path, binary=False)
    source_set = set()
    sink_set = set()
    for api in config.apis:
        # TODO: add support for instantiable field in API comparison
        trigger_word = (api.name if config.func_only else api.full_name) + "("
        if api.functionality == ast_pb2.SOURCE:
            source_set.add(trigger_word)
        elif api.functionality in (ast_pb2.SINK, ast_pb2.DANGER):
            sink_set.add(trigger_word)

    # Sort so the generated file does not depend on set iteration order.
    trigger_words = {
        "sources": sorted(source_set),
        "sinks": {key: {} for key in sorted(sink_set)},
    }
    # Close the file deterministically so the JSON is flushed to disk before
    # any consumer reads it.
    with open(trigger_words_path, 'w') as trigger_words_file:
        json.dump(trigger_words, trigger_words_file, indent=2)
Ejemplo n.º 5
0
    def taint(self,
              inpath,
              outfile,
              configpath=None,
              pkg_name=None,
              pkg_version=None):
        """Run the jsprime-based JavaScript taint analysis on inpath.

        The text-format lookup config is converted to a temporary binary
        file (configpath + '.bin') that is passed to `jsprime_wrapper.js`.
        The binary ModuleStatic written by the wrapper to outfile is read
        back and rewritten to outfile in text format.

        Args:
            inpath: Input to analyze; sanitized by _sanitize_astgen_args.
            outfile: Path of the output proto file.
            configpath: Path to the text-format AstLookupConfig.
            pkg_name: Package name passed to the wrapper. Defaults to the
                basename of the analyzed path when not given.
            pkg_version: Accepted for interface compatibility; not used here.
        """
        analyze_path, is_decompress_path, outfile, _, configpath = self._sanitize_astgen_args(
            inpath=inpath,
            outfile=outfile,
            root=None,
            configpath=configpath,
            language=self.language)

        configpath_bin = configpath + '.bin'
        try:
            # create binary config from text format
            configpb = AstLookupConfig()
            self._pb_text_to_bin(proto=configpb,
                                 infile=configpath,
                                 outfile=configpath_bin)

            # A None entry is not a valid command-line argument, so fall back
            # to a name derived from the analyzed path.
            if pkg_name is None:
                pkg_name = os.path.basename(analyze_path)

            # perform static taint analysis
            taint_cmd = [
                'node', 'jsprime_wrapper.js', pkg_name, analyze_path,
                configpath_bin, outfile
            ]
            exec_command("javascript taint",
                         taint_cmd,
                         cwd="static_proxy/jsprime")
            pkg_static = ModuleStatic()
            read_proto_from_file(pkg_static, outfile, binary=True)
            logging.warning("taint analysis results: %s", pkg_static)

            # save resultpb
            write_proto_to_file(pkg_static, filename=outfile, binary=False)
        finally:
            # clean up residues, also when the analysis failed midway
            if os.path.exists(configpath_bin):
                os.remove(configpath_bin)
            self._cleanup_astgen(analyze_path=analyze_path,
                                 is_decompress_path=is_decompress_path)
Ejemplo n.º 6
0
    def _gen_combined_configpath(self, configpath, dep_taint_results):
        """Create a temporary config extended with the dependencies' taint results.

        The config at configpath is loaded, the node of every source and
        sink found in dep_taint_results is appended to its apis, and the
        combined config is written in text format to a new temporary file.

        Args:
            configpath: Path to the text-format AstLookupConfig to extend.
            dep_taint_results: Iterable of taint results (of type
                module_pb2.ModuleStatic) whose sources and sinks are added.

        Returns:
            Path of the newly created temporary config file. The file is not
            deleted automatically; removing it is left to the caller.
        """
        # load the old config
        configpb = AstLookupConfig()
        read_proto_from_file(configpb, configpath, binary=False)

        # iterate through the taint results to update configpb
        num_new_sources = 0
        num_new_sinks = 0
        for dep_taint_result in dep_taint_results:
            # dep_taint_result is of type module_pb2.ModuleStatic
            for new_source in dep_taint_result.sources:
                configpb.apis.append(new_source.node)
                num_new_sources += 1
            for new_sink in dep_taint_result.sinks:
                configpb.apis.append(new_sink.node)
                num_new_sinks += 1
        if num_new_sources + num_new_sinks > 0:
            logging.warning("added %d new sources and %d new sinks!",
                            num_new_sources, num_new_sinks)

        # generate the new config file
        outf = tempfile.NamedTemporaryFile(prefix='configpath-', delete=False)
        # Only the reserved name is needed: close the handle right away so
        # the descriptor is not leaked and the file can be reopened by name.
        outf.close()
        write_proto_to_file(proto=configpb, filename=outf.name, binary=False)
        return outf.name
Ejemplo n.º 7
0
    # the add() function returns reference
    sink = summary.sinks.add()
    sink.node.type = AstNode.FUNCTION_DECL
    sink.node.name = "post"
    sink.node.full_name = "requests.post"
    sink.node.base_type = "requests"
    sink.node.arguments.extend(["url", "mode", "data"])
    # TODO: set the source range for sink.node
    reachable_old_sink = sink.reachable_sinks.add()
    reachable_old_sink.CopyFrom(get_mock_sink_node())
    reachable_old_sink.sink_type = ast_pb.SINK_NETWORK


# Script body: load the lookup config, then build the result and summary
# protos and merge the result's flows and dangers into one ModuleStatic.

# load the astgen config from file
# NOTE: the config path is relative, so it is resolved against the current
# working directory of the process.
config = AstLookupConfig()
read_proto_from_file(config,
                     '../../../config/astgen_python_smt.config',
                     binary=False)
print("loaded config with %d apis to check!" % len(config.apis))

# initialize result and summary
result = ModuleResult()
summary = ModuleSummary()
static = ModuleStatic()

# compute and fill the results into protobuf
# set_result/set_summary populate the protos passed to them in place
# (presumably with mock data, judging by the helper names used in this
# file -- TODO confirm).
set_result(result)
set_summary(summary)
# copy the flows and dangers of the result into the combined proto
static.flows.MergeFrom(result.flows)
static.dangers.MergeFrom(result.dangers)
Ejemplo n.º 8
0
def ast_to_progpilot(
    config_path,
    out_path,
    new_sources_path,
    new_sinks_path,
    new_configuration_path,
    sources_path=join(package_directory,
                      "../../config/php_api/progpilot_sources.json"),
    sinks_path=join(package_directory,
                    "../../config/php_api/progpilot_sinks.json"),
    configuration_path=join(package_directory,
                            "../../config/static_php_progpilot.yml")):
    """Generate progpilot source/sink/configuration files from a lookup config.

    The SOURCE and SINK/DANGER apis of the AstLookupConfig are converted to
    progpilot entries and appended to the stock progpilot sources and sinks
    (entries already present, as decided by is_in_progpilot_entries, are
    skipped). The combined lists are written as JSON, and a copy of the
    progpilot YAML configuration pointing at them is written as well.

    Args:
        config_path: Path to the text-format AstLookupConfig proto.
        out_path: Value stored as the progpilot output file setting.
        new_sources_path: Path the combined sources JSON is written to.
        new_sinks_path: Path the combined sinks JSON is written to.
        new_configuration_path: Path the new YAML configuration is written to.
        sources_path: Path to the stock progpilot sources JSON.
        sinks_path: Path to the stock progpilot sinks JSON.
        configuration_path: Path to the template progpilot YAML configuration.

    Returns:
        Tuple (all_sources, all_sinks) with the combined entry lists.

    Raises:
        Exception: If a SOURCE api declares arg_nodes, which is unsupported.
    """
    # Context managers make sure every handle is closed (and written files
    # are flushed) before the function returns.
    with open(sources_path, 'r') as sources_file:
        progpilot_sources = json.load(sources_file)
    with open(sinks_path, 'r') as sinks_file:
        progpilot_sinks = json.load(sinks_file)
    config = AstLookupConfig()
    read_proto_from_file(config, config_path, binary=False)
    maloss_sources = []
    maloss_sinks = []
    for api in config.apis:
        # TODO: add support for instantiable field in API comparison
        if api.functionality == ast_pb2.SOURCE:
            api_json = {
                'name': api.name,
                'is_function': True,
                'language': 'php'
            }
            if not config.func_only and api.base_type:
                api_json['instanceof'] = api.base_type
            if api.arg_nodes:
                raise Exception("Cannot handle arg_nodes for Sources now: %s" %
                                api)
            maloss_sources.append(api_json)
        elif api.functionality in (ast_pb2.SINK, ast_pb2.DANGER):
            if not api.arg_nodes:
                # TODO: all sinks must have parameters (?)
                continue
            api_json = {
                'name': api.name,
                'language': 'php',
                'attack': 'maloss_sink',
                'cwe': 'CWE_89'
            }
            if not config.func_only and api.base_type:
                api_json['instanceof'] = api.base_type
            api_json['parameters'] = [{
                'id': api_arg.id
            } for api_arg in api.arg_nodes]
            maloss_sinks.append(api_json)
    # combine to get all sources and sinks
    all_sources = progpilot_sources['sources'] + [
        ms for ms in maloss_sources if not is_in_progpilot_entries(
            progpilot_entries=progpilot_sources['sources'], new_entry=ms)
    ]
    all_sinks = progpilot_sinks['sinks'] + [
        ms for ms in maloss_sinks if not is_in_progpilot_entries(
            progpilot_entries=progpilot_sinks['sinks'], new_entry=ms)
    ]
    with open(new_sources_path, 'w') as new_sources_file:
        json.dump({'sources': all_sources}, new_sources_file, indent=2)
    with open(new_sinks_path, 'w') as new_sinks_file:
        json.dump({'sinks': all_sinks}, new_sinks_file, indent=2)
    # safe_load only builds plain YAML data types; yaml.load without an
    # explicit Loader can construct arbitrary objects and is rejected by
    # recent PyYAML releases.
    with open(configuration_path, 'r') as configuration_file:
        progpilot_config = yaml.safe_load(configuration_file)
    progpilot_config['inputs']['setSources'] = new_sources_path
    progpilot_config['inputs']['setSinks'] = new_sinks_path
    progpilot_config['outputs']['setOutfile'] = out_path
    with open(new_configuration_path, 'w') as new_configuration_file:
        yaml.dump(progpilot_config, new_configuration_file)
    return all_sources, all_sinks
Ejemplo n.º 9
0
                        "--package_version",
                        dest="package_version",
                        help="Package version of the specified input.")
    parser.add_argument(
        "-c",
        "--configpath",
        dest="configpath",
        help=
        "Optional path to the filter of nodes, stored in proto buffer format (AstLookupConfig in ast.proto)."
    )
    return parser.parse_args(argv)


# Script entry point: parse the command line, load the lookup config and run
# the python3 ast generation with it.
if __name__ == "__main__":
    # Parse options
    args = parse_args(sys.argv[1:])

    # Load config pb
    # NOTE(review): --configpath is described as optional in its help text,
    # yet it is read unconditionally here -- confirm that
    # read_proto_from_file copes with a missing path.
    configpath = args.configpath
    configpb = AstLookupConfig()
    read_proto_from_file(configpb, configpath, binary=False)
    logging.debug("loaded lookup config from %s:\n%s", configpath, configpb)

    # Run the ast generation
    py3_astgen(inpath=args.inpath,
               outfile=args.outfile,
               configpb=configpb,
               root=args.root,
               pkg_name=args.package_name,
               pkg_version=args.package_version)
Ejemplo n.º 10
0
    def astgen(self, inpath, outfile, root=None, configpath=None, pkg_name=None, pkg_version=None, evaluate_smt=False):
        """Collect the configured API usages from Python sources into a proto.

        The sources are first parsed in-process. If any file raises a
        SyntaxError, the whole analysis is delegated to `astgen_py3.py` run
        with python3. Any other error is logged and swallowed. The results
        are written to outfile in text format.

        Args:
            inpath: Input to analyze; sanitized by _sanitize_astgen_args.
            outfile: Path of the output proto file.
            root: Optional root used to compute per-file information.
            configpath: Path to the text-format AstLookupConfig.
            pkg_name: Optional package name; defaults to the basename of the
                analyzed path.
            pkg_version: Optional package version.
            evaluate_smt: When true, reload the results from outfile, evaluate
                the SMT formula via _check_smt and store the outcome.
        """
        analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
            inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)

        # try python2
        try:
            # load the config proto
            configpb = AstLookupConfig()
            read_proto_from_file(configpb, configpath, binary=False)
            logging.debug("loaded lookup config from %s:\n%s", configpath, configpb)
            # invoke the language specific ast generators to call functions

            # get input files
            infiles, root = self._get_infiles(inpath=analyze_path, root=root, language=self.language)

            # initialize resultpb
            resultpb = PkgAstResults()
            pkg = resultpb.pkgs.add()
            pkg.config.CopyFrom(configpb)
            pkg.pkg_name = pkg_name if pkg_name is not None else basename(analyze_path)
            if pkg_version is not None:
                pkg.pkg_version = pkg_version
            pkg.language = ast_pb2.PYTHON
            for infile in infiles:
                # close each source file as soon as it has been read
                with open(infile, 'r') as source_file:
                    all_source = source_file.read()
                try:
                    tree = ast.parse(all_source, filename=infile)
                except SyntaxError as se:
                    logging.warning("Syntax error %s parsing file %s in python2!", se, infile)
                    # re-raise so the outer handler falls back to python3
                    raise
                # mark the tree with tokens information
                asttok = asttokens.ASTTokens(source_text=all_source, tree=tree, filename=infile)
                visitor = PythonDeclRefVisitor(asttok=asttok, configpb=configpb)
                visitor.visit(tree)
                logging.warning("collected functions: %s", Counter(visitor.get_declrefs()).items())

                filepb = self._get_filepb(infile, root)
                for base, name, args, source_text, source_range in visitor.get_declrefs():
                    api_result = self._get_api_result(base, name, args, source_text, source_range, filepb)
                    pkg.api_results.add().CopyFrom(api_result)

            # save resultpb
            write_proto_to_file(resultpb, outfile, binary=False)

        # try python3
        except SyntaxError as se:
            logging.error("Syntax error %s, now trying to parse %s again in python3!", se, analyze_path)
            astgen_py3_cmd = ['python3', 'astgen_py3.py', analyze_path, outfile, '-c', configpath]
            if root is not None:
                astgen_py3_cmd.extend(['-b', root])
            if pkg_name is not None:
                astgen_py3_cmd.extend(['-n', pkg_name])
            if pkg_version is not None:
                astgen_py3_cmd.extend(['-v', pkg_version])
            exec_command("python3 astgen", astgen_py3_cmd, cwd="static_proxy")
        except Exception as e:
            # best effort: log and continue to the cleanup below
            logging.error("Fatal error %s running astgen for %s!", e, analyze_path)

        # optionally evaluate smt formula
        if evaluate_smt:
            # reload from outfile because the results may have been produced
            # by the python3 subprocess rather than in this process
            resultpb = PkgAstResults()
            read_proto_from_file(resultpb, filename=outfile, binary=False)
            satisfied = self._check_smt(astgen_results=[resultpb], configpath=configpath)
            resultpb.pkgs[0].config.smt_satisfied = satisfied
            write_proto_to_file(resultpb, filename=outfile, binary=False)

        # clean up residues
        self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
Ejemplo n.º 11
0
def reformat(apis_file, json_result_file, outfile):
    """Convert pyt JSON results into a text-format ModuleStatic proto.

    Args:
        apis_file: Path to the text-format AstLookupConfig proto.
        json_result_file: Path to the JSON file with the pyt results; its
            'vulnerabilities' list is converted.
        outfile: Path the resulting ModuleStatic proto is written to.

    Returns:
        Always None. When the JSON results cannot be loaded an error is
        logged and nothing is written.
    """
    try:
        # Use a context manager so the handle is closed even if parsing fails.
        with open(json_result_file, 'r') as result_fp:
            results = json.load(result_fp)
    except Exception:
        # Best effort: any failure to read or parse the results aborts the
        # conversion without raising to the caller.
        logging.error("failed to load pyt results in json: %s",
                      json_result_file)
        return None

    # load the astgen config from file
    config = AstLookupConfig()
    read_proto_from_file(config, apis_file, binary=False)
    logging.warning("loaded config with %d apis to check!", len(config.apis))

    # convert list of apis into dictionary with key=id, value=full_name for easier identification
    source_dict = {}
    sink_dict = {}
    for entry in config.apis:
        # FIXME: should we support func_only mode
        if entry.functionality == ast_pb2.SOURCE:
            source_dict[entry.id] = entry.full_name
        elif entry.functionality in (ast_pb2.SINK, ast_pb2.DANGER):
            sink_dict[entry.id] = entry.full_name

    nodes = []
    # dictionary with key=name of file within package found to contain vulnerabilities and value=tuple of (tree, asttok) for that file
    vuln_files_ASTs = {}
    for entry in results['vulnerabilities']:
        source = entry['source']
        # source['label'], source['line_number'], source['path']
        source_trigger_word = entry['source_trigger_word']
        sink = entry['sink']
        # sink['label'], sink['line_number'], sink['path']
        sink_trigger_word = entry['sink_trigger_word']
        api_type = entry['type']
        reassignment_nodes = entry['reassignment_nodes']
        # register both files; the placeholders are filled in below
        vuln_files_ASTs[source['path']] = ()
        vuln_files_ASTs[sink['path']] = ()
        nodes.append(
            Vulnerability(source, source_trigger_word, sink, sink_trigger_word,
                          api_type, reassignment_nodes))

    # parse every vulnerable file once (one tree per file within package)
    for vuln_path in vuln_files_ASTs:
        with open(vuln_path, 'r') as vuln_file:
            src_ast = vuln_file.read()
        tree = ast.parse(src_ast, filename=vuln_path)
        asttok = asttokens.ASTTokens(source_text=src_ast,
                                     tree=tree,
                                     filename=vuln_path)
        # Only values of existing keys are replaced, so the dict does not
        # change size while it is being iterated.
        vuln_files_ASTs[vuln_path] = (tree, asttok)

    # initialize result and summary
    result = ModuleResult()
    set_result(result, config.apis, source_dict, sink_dict, nodes,
               vuln_files_ASTs)
    summary = ModuleSummary()
    set_summary(summary, config.apis, source_dict, sink_dict, nodes,
                vuln_files_ASTs)

    # merge both into the single proto that is persisted
    static = ModuleStatic()
    static.flows.MergeFrom(result.flows)
    static.dangers.MergeFrom(result.dangers)
    static.sources.MergeFrom(summary.sources)
    static.sinks.MergeFrom(summary.sinks)
    static.taint_wrappers.MergeFrom(summary.taint_wrappers)
    write_proto_to_file(proto=static, filename=outfile, binary=False)
Ejemplo n.º 12
0
    def astgen(self,
               inpath,
               outfile,
               root=None,
               configpath=None,
               pkg_name=None,
               pkg_version=None,
               evaluate_smt=False):
        """Collect the configured API usages from JavaScript sources into a proto.

        There are two ways to implement the javascript ast parsing, each of them has its pros and cons.
        One is to directly use the npm esprima module, the other is to use the pypi esprima module.

        1. The npm module is the latest version and has lots of features to use directly. But it doesn't have a visitor
        and requires a manual implementation.
        2. The pypi module is claimed to be a line by line translation of esprima in python, but it may be outdated and
        inactively maintained. However, it contains a visitor similar to python ast.NodeVisitor that we can directly use.

        To minimize the effort, the latter is currently used.

        Files that fail to parse are logged and skipped. The results are
        written to outfile in text format.

        Args:
            inpath: Input to analyze; sanitized by _sanitize_astgen_args.
            outfile: Path of the output proto file.
            root: Optional root used to compute per-file information.
            configpath: Path to the text-format AstLookupConfig.
            pkg_name: Optional package name; defaults to the basename of the
                analyzed path.
            pkg_version: Optional package version.
            evaluate_smt: When true, evaluate the SMT formula via _check_smt
                and store the outcome in the first package's config.
        """
        analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
            inpath=inpath,
            outfile=outfile,
            root=root,
            configpath=configpath,
            language=self.language)

        # load the config proto
        configpb = AstLookupConfig()
        read_proto_from_file(configpb, configpath, binary=False)
        logging.debug("loaded lookup config from %s:\n%s", configpath,
                      configpb)
        # invoke the language specific ast generators to call functions

        # FIXME: current testdata sometimes fails the analyzer, inspect it!
        # get input files
        infiles, root = self._get_infiles(inpath=analyze_path,
                                          root=root,
                                          language=self.language)

        # initialize resultpb
        resultpb = PkgAstResults()
        pkg = resultpb.pkgs.add()
        pkg.config.CopyFrom(configpb)
        pkg.pkg_name = pkg_name if pkg_name is not None else basename(
            analyze_path)
        if pkg_version is not None:
            pkg.pkg_version = pkg_version
        pkg.language = ast_pb2.JAVASCRIPT
        for infile in infiles:
            # close each source file as soon as it has been read
            with open(infile, 'r') as source_file:
                all_source = source_file.read()
            try:
                # tree = esprima.parseModule(), esprima.parseScript()
                tree = esprima.parse(all_source, options={'loc': True})
            except Exception as e:
                # best effort: one unparsable file must not abort the package
                logging.error(
                    "Fatal error %s parsing file %s! Skipping this file!", e,
                    infile)
                continue
            visitor = JavaScriptDeclRefVisitor(source=all_source,
                                               configpb=configpb)
            visitor.visit(tree)
            logging.warning("collected functions: %s",
                            Counter(visitor.get_declrefs()).items())

            filepb = self._get_filepb(infile, root)
            for base, name, args, source_text, source_range in visitor.get_declrefs(
            ):
                api_result = self._get_api_result(base, name, args,
                                                  source_text, source_range,
                                                  filepb)
                pkg.api_results.add().CopyFrom(api_result)

        # optionally evaluate smt formula
        if evaluate_smt:
            satisfied = self._check_smt(astgen_results=[resultpb],
                                        configpath=configpath)
            resultpb.pkgs[0].config.smt_satisfied = satisfied

        # save resultpb
        write_proto_to_file(resultpb, outfile, binary=False)

        # clean up residues
        self._cleanup_astgen(analyze_path=analyze_path,
                             is_decompress_path=is_decompress_path)