Ejemplo n.º 1
0
 def __init__(self):
     """Create the database provider and parse the command-line options.

     The parsed options are stored in ``self.args_``.
     """
     self.db_provider = DBContentsProvider()
     parser = argparse.ArgumentParser(description='Find defect callees.')
     parser.add_argument('--function', '-func',
                         help='the target argument-sensitive function name')
     parser.add_argument("-f", "--filepath",
                         help=' file of check information saved')
     # The default (0.5) is fractional, so the option must be parsed as a
     # float; with type=int a value such as "0.5" was rejected by argparse.
     parser.add_argument("-t", "--threshold", required=False, type=float, default=0.5,
                         help="the threshold of the entropy")
     self.args_ = parser.parse_args()
Ejemplo n.º 2
0
 def __init__(self, calee_featureList):
     """Keep the callee feature list and copy the module-level thresholds.

     Args:
         calee_featureList: feature records of the callees to analyse.
     """
     self.calee_featureList = calee_featureList
     self.db_provider = DBContentsProvider()

     # Ratio thresholds above which the number of paths / statements is
     # considered "clearly different".
     self.thld_stmt_ratio = G_thld_stmt_ratio
     self.thld_path_ratio = G_thld_path_ratio

     # Thresholds deciding whether each feature counts as satisfied.
     self.thld_is_notuseTwosides = G_thld_is_notuseTwosides
     self.thld_is_stmt = G_thld_is_stmt
     self.thld_is_path = G_thld_is_path
     self.thld_is_check = G_thld_is_check
Ejemplo n.º 3
0
 def __init__(self, calee_featureList):
     """Keep the callee feature list and set the fixed thresholds and weights.

     Args:
         calee_featureList: feature records of the callees to analyse.
     """
     self.calee_featureList = calee_featureList
     self.db_provider = DBContentsProvider()

     # Ratio thresholds above which the number of paths / statements is
     # considered "clearly different".
     self.thld_path_ratio = self.thld_stmt_ratio = 2

     # Thresholds deciding whether each feature counts as satisfied.
     self.thld_is_check = 0.8
     self.thld_is_path = 0.8
     self.thld_is_stmt = 0.8
     self.thld_is_useOneside = 0.8

     # Weights of the individual features.
     self.weight_useOneSide = 1
     self.weight_stmt = 0.5
     self.weight_path = 0.5
Ejemplo n.º 4
0
class DisplayEntropyInfo:
    """Rank call sites by their total entropy and print the ranking."""

    def __init__(self, entropy):
        """Store the entropy records and create the database / file helpers.

        Args:
            entropy: iterable of records indexed as
                ``[callsite_id, implicit_values, explicit_values]`` where the
                two value entries are summable sequences of numbers.
        """
        self.db_provider = DBContentsProvider()
        self.file_io_provider = ObjDataAndBinFile()
        self.entropy = entropy

    def run_gremlin_query(self, query):
        """Run ``query`` through the database provider and return its result."""
        return self.db_provider.run_gremlin_query(query)

    def query_loc_callsite(self, callee_id):
        """Return ``"<filepath>: <location>"`` for the node ``callee_id``.

        Only the first row of the query result is used.
        """
        query = """
            g.v(%s).statements.transform{[g.v(it.functionId).functionToFile.filepath, it.location]}
            """ % callee_id
        result = self.run_gremlin_query(query)
        first_row = result[0]
        return "%s: %s" % (first_row[0][0], first_row[1])

    def display_entropy(self):
        """Print a header line followed by every ranked row."""
        sorted_entropy = self.sort_entropy()
        # Single-argument print(...) behaves the same under Python 2 and is
        # also valid Python 3, unlike the bare print statement.
        print("\n# Total entropy | implict-check | explict check | callsite id | location\n")
        for en in sorted_entropy:
            print(en)

    def sort_entropy(self):
        """Return the rows sorted by total entropy, highest first.

        Each row is ``[total, implicit_values, explicit_values, callsite_id,
        location]`` where ``total`` is the sum of both value lists rounded to
        two decimals.
        """
        rows = []
        for en in self.entropy:
            total = round(sum(en[1]) + sum(en[2]), 2)
            loc = self.query_loc_callsite(en[0])
            rows.append([total, en[1], en[2], en[0], loc])
        return sorted(rows, key=lambda row: row[0], reverse=True)
Ejemplo n.º 5
0
class DisplayEntropyInfo:
    """Rank call sites by total entropy, print the ranking and export it."""

    def __init__(self, entropy):
        """Store the entropy records and create the database / file helpers.

        Args:
            entropy: iterable of records indexed as
                ``[callsite_id, value, value, ...]`` where every entry after
                the first is a number.
        """
        self.db_provider = DBContentsProvider()
        self.file_io_provider = ObjDataAndBinFile()
        self.entropy = entropy

    def run_gremlin_query(self, query):
        """Run ``query`` through the database provider and return its result."""
        return self.db_provider.run_gremlin_query(query)

    def query_loc_callsite(self, callee_id):
        """Return ``"<filepath>: <location>"`` for the node ``callee_id``.

        Only the first row of the query result is used.
        """
        query = """
            g.v(%s).statements.transform{[g.v(it.functionId).functionToFile.filepath, it.location]}
            """ % callee_id
        result = self.run_gremlin_query(query)
        first_row = result[0]
        return "%s: %s" % (first_row[0][0], first_row[1])

    def display_entropy(self):
        """Write the ranked rows to ``Data/entropy_thesis.xls`` and print them."""
        entropys = self.sort_entropy()
        # Single-argument print(...) behaves the same under Python 2 and is
        # also valid Python 3, unlike the bare print statement.
        # NOTE(review): the header names a "callsite id" column, but the rows
        # built by sort_entropy() do not contain the id -- confirm which one
        # is intended before changing either.
        print("\n# Total entropy | implict-check | explict check | callsite id | location\n")
        filename = "Data/entropy_thesis.xls"
        wXLS = writeXLS()
        wXLS.write_excel(filename, entropys)
        for en in entropys:
            print(en)

    def sort_entropy(self):
        """Return the rows sorted by total entropy, highest first.

        Each row is ``[total, value, value, ..., location]`` where ``total``
        is the sum of the record's values rounded to two decimals.
        """
        rows = []
        for en in self.entropy:
            values = en[1:]
            row = [round(sum(values), 2)]
            row.extend(values)
            row.append(self.query_loc_callsite(en[0]))
            rows.append(row)
        return sorted(rows, key=lambda row: row[0], reverse=True)
Ejemplo n.º 6
0
class MiningErrfuncShell:
    """Command-line driver that mines every callee name found in the database.

    Progress, the sorted mining results and the per-callee feature dump are
    written to text files under ``Data/``.
    """

    def __init__(self):
        """Create the database provider and parse the command-line options.

        The parsed options are stored in ``self.args_``.
        """
        self.db_provider = DBContentsProvider()
        parser = argparse.ArgumentParser(description='Find defect callees.')
        parser.add_argument('--function', '-func',
                            help='the target argument-sensitive function name')
        parser.add_argument("-f", "--filepath",
                            help=' file of check information saved')
        # The default (0.5) is fractional, so the option must be parsed as a
        # float; with type=int a value such as "0.5" was rejected by argparse.
        parser.add_argument("-t", "--threshold", required=False, type=float, default=0.5,
                            help="the threshold of the entropy")
        self.args_ = parser.parse_args()

    def run_gremlin_query(self, query):
        """Run ``query`` through the database provider and return its result."""
        return self.db_provider.run_gremlin_query(query)

    def query_allCallee_name(self):
        """Return the de-duplicated code strings of all ``Callee`` nodes."""
        query = """
            g.V.has('type','Callee').as('x').code.dedup().back('x').code.toList()
            """
        return self.run_gremlin_query(query)

    @staticmethod
    def _append_text(path, text):
        """Append ``text`` to the file at ``path`` and close it again."""
        with open(path, 'a') as handle:
            handle.write(text)

    def run(self):
        """Mine every callee, log progress, save the ranking and dump features.

        Files written: ``Data/degbug.txt`` (progress log, appended),
        ``Data/10141859.txt`` (sorted results, overwritten) and
        ``Data/detect.txt`` (feature dump, appended).
        """
        # Skip some odd functions whose bugs (related to the Joern
        # implementation) cannot be eliminated for now.
        func_unnormal = ['INCOHERENT']
        debug_path = "Data/degbug.txt"
        allCallee_name = self.query_allCallee_name()
        display_data = []
        num_func = len(allCallee_name)
        num_alalysed_func = 0
        # The original left this handle open (``f_debug.close`` without the
        # call parentheses); every write now goes through a ``with`` block.
        self._append_text(
            debug_path,
            "\nBeginTime = %s   num_func = %s\n" % (datetime.datetime.now(), num_func))
        for function_name in allCallee_name:
            if function_name in func_unnormal:
                continue
            function_name_str = function_name.encode('gbk')
            # Reuse a cached feature file when one exists for this function.
            datapath = "Data/result_libtif407/%s.data" % function_name_str
            if os.path.exists(datapath):
                feature_callees = ObjDataAndBinFile.binfile2objdata(datapath)
            else:
                extract_errfun_feature = ExtractErrFunFeatures(function_name_str)
                feature_callees = extract_errfun_feature.run(flag_thread=False)

            obj_MiningErrFunc = MiningErrFunc(feature_callees)
            # mining_result = [is_err, weight_call, ratio_ft_path, ratio_ft_stmt, ratio_ft_usedOneside]
            mining_result = obj_MiningErrFunc.run()
            tmp = [function_name]
            tmp.extend(mining_result)
            display_data.append(tmp)
            num_alalysed_func = num_alalysed_func + 1
            self._append_text(
                debug_path,
                str(tmp[1]) + str("  ") + str(tmp[2]) + "  " + str(tmp[0])
                + "  " + str(tmp[3]) + "  " + str(tmp[4]) + " " + str(tmp[5])
                + "\n")
        self._append_text(
            debug_path,
            "EndTime = %s   num_alalysed_func = %s\n" % (datetime.datetime.now(), num_alalysed_func))

        display_data = sorted(display_data, key=lambda l: (l[1], l[2]), reverse=True)
        # Save the sorted results.
        with open("Data/10141859.txt", 'w') as f:
            for data in display_data:
                f.write(str(data[0]) + str("  ") + str(data[1]) + "  " + str(data[2])
                        + "  " + str(data[3]) + "  " + str(data[4]) + " " + str(data[5])
                        + "\n")

        # debug: defect-detection part, intended for the first num_detect functions.
        num_detect = 50
        index = 0
        for func_item in display_data:
            function_name_str = func_item[0].encode('gbk')
            extract_errfun_feature = ExtractErrFunFeatures(function_name_str)
            feature_callees = extract_errfun_feature.run(flag_thread=False)
            lines = []
            for ft_callee in feature_callees:
                ft_info = "%s %s %s %s %s %s %s %s" % (
                    func_item[0], ft_callee[0], ft_callee[2][0], ft_callee[2][1],
                    ft_callee[2][2], ft_callee[3][0], ft_callee[3][1], ft_callee[3][2])
                lines.append(ft_info + "\n")
            if lines:
                # The original opened this file once per row and never closed it.
                with open("Data/detect.txt", 'a') as f_dectect:
                    f_dectect.writelines(lines)
            index = index + 1
            # NOTE(review): num_detect is never used; comparing against
            # num_func means this break cannot trigger and every function is
            # processed. Left as-is -- confirm whether the limit should be
            # num_detect.
            if index > num_func:
                break
        return
Ejemplo n.º 7
0
 def __init__(self, function_name):
     """Remember the target function name and create the helper objects.

     Args:
         function_name: name of the function whose call sites are analysed.
     """
     self.function_name = function_name
     # presumably the number of worker threads used elsewhere -- TODO confirm
     self.count_threads = 20
     self.db_provider = DBContentsProvider()
     self.file_io_provider = ObjDataAndBinFile()
Ejemplo n.º 8
0
class ExtractErrFunFeatures:
    def __init__(self, function_name):
        """Remember the target function name and create the helper objects.

        Args:
            function_name: name of the function whose call sites are analysed.
        """
        self.function_name = function_name
        # presumably the number of worker threads used elsewhere -- TODO confirm
        self.count_threads = 20
        self.db_provider = DBContentsProvider()
        self.file_io_provider = ObjDataAndBinFile()

    def set_implicit_check_pattern(self, arg_checked, arg_by):
        """Format an implicit "defined by" pattern for argument ``arg_checked``.

        ``arg_by`` is either another argument index or one of the markers
        ``"CNT"`` (constant) and ``"OutVar"`` (variable from outside the
        caller); markers are emitted verbatim, indices get an ``arg_`` prefix.
        """
        if arg_by == "CNT" or arg_by == "OutVar":
            definer = "%s" % arg_by
        else:
            definer = "arg_%s" % arg_by
        return "arg_%s DEFBY %s" % (arg_checked, definer)

    '''
    def set_explicit_check_pattern(self, arg_checked, checkinfo):

        flowlabel_code = checkinfo[0][0]
        operator_code = checkinfo[1]
        related_args = checkinfo[2]
        pattern_str = "%s %s arg_%s VS (" % (arg_checked, flowlabel_code, operator_code)
        for arg_index in related_args:
            pattern_str += " arg_%s " % arg_index
        pattern_str += ")"
        return
    '''

    def set_explicit_check_pattern(self, arg_checked, checkinfo):
        """Format an explicit check as ``"f(arg_i, ...) <op> f(value, ...)"``.

        Args:
            arg_checked: index of the checked argument (not used in the text).
            checkinfo: ``[norm_cmp_items, norm_cmp_op, norm_cmp_value]`` where
                ``norm_cmp_items`` is a list of integer argument indices and
                ``norm_cmp_value`` a list of values.

        Returns:
            The pattern string. An empty item or value list is rendered as
            ``f()`` instead of raising IndexError as the hand-written
            "all but last, then last" loops did.
        """
        cmp_items, cmp_op, cmp_values = checkinfo[0], checkinfo[1], checkinfo[2]
        items_text = "f(%s)" % ", ".join("arg_%d" % index for index in cmp_items)
        values_text = "f(%s)" % ", ".join("%s" % value for value in cmp_values)
        return "%s %s %s" % (items_text, cmp_op, values_text)

    def run_gremlin_query(self, query):
        return self.db_provider.run_gremlin_query(query)

    def save_data_to_file(self, data, file_path):
        # filename = "Data/OutStatsData_%s.data"%time.strftime('%Y%m%d-%H%M%S')
        # print "生成GetOutStatsData的原始数据文件:%s" % file_path
        self.file_io_provider.objdata2file(data, file_path)

    def query_loc_callsite(self, callee_id):
        query = """
            g.v(%s).statements.transform{[g.v(it.functionId).functionToFile.filepath, it.location]}
            """ % callee_id
        result = self.run_gremlin_query(query)
        loc = "%s: %s" % (result[0][0][0], result[0][1])
        return loc

    def query_callee_ids(self, function_name):
        query = """
        g.V().has('type','Callee').has('code','%s').id.toList()
        """ % function_name
        callee_ids = self.run_gremlin_query(query)
        return callee_ids

    def query_callsite_id(self, callee_id):
        query = """
        g.v(%s).in.in.id
        """ % callee_id
        callsite_id = self.run_gremlin_query(query)
        return callsite_id[0]

    def query_backward_paths(self, callee_id):
        query = """
        getBackwardPaths(%s)
        """ % callee_id
        all_paths = self.run_gremlin_query(query)
        return all_paths

    def query_farward_paths_from_condition(self, condition_id):
        query = """
        getFarwardPaths_from_condition(%s)
        """ % condition_id
        all_paths = self.run_gremlin_query(query)
        return all_paths

    # def_chain = src.id <--var_str-- dst.id
    def query_define_chains(self, path):
        def_chain = []
        for node_id in path:
            query = """
            g.v(%s).inE('REACHES').transform{[it.inV.id, it.var, it.outV.id]}
            """ % node_id
            def_chain_tmp = self.run_gremlin_query(query)
            # select the definition of the @path
            for chain in def_chain_tmp:
                if (chain[0][0] in path) and (chain[2][0] in path):
                    # remove the define node which dst == src, it will make some process loop forever
                    if chain[0][0] != chain[2][0]:
                        def_chain.append([chain[0][0], chain[1], chain[2][0]])

        # remove the invalid define chain from @def_chain
        invalid_chains = []
        for i in range(0, len(def_chain)):
            for j in range(0, len(def_chain)):
                if i == j:
                    continue
                # the nearest definition on the same node with the same variable is the valid one
                if def_chain[i][0] == def_chain[j][0] and def_chain[i][
                        1] == def_chain[j][1]:
                    if path.index(def_chain[i][2]) > path.index(
                            def_chain[j][2]):
                        invalid_chains.append(i)
                    else:
                        invalid_chains.append(j)
        invalid_chains = self.unique_list(invalid_chains)
        invalid_chains.sort(reverse=True)
        for i in invalid_chains:
            def_chain.remove(def_chain[i])
        return def_chain

    def query_args(self, callee_id):
        query = """
        getArgs(%s)
        """ % callee_id
        arg_ids = self.run_gremlin_query(query)
        return arg_ids

    def query_symbols_by_ids(self, ids):
        symbols_id = []
        symbols_code = []
        for arg in ids:
            query = """
            _getSymbols(%s)
            """ % arg
            s_ids = self.run_gremlin_query(query)
            symbols_id.append(s_ids)
            s_codes = []
            if s_ids:
                for vid in s_ids:
                    s_codes.append(self.query_code_by_id(vid))
            else:
                s_codes.append(u'')
            symbols_code.append(s_codes)
        return symbols_id, symbols_code

    def query_define_vars_dst_on_symbols(self, src_id, symbols, def_chain):
        """Follow ``def_chain`` from ``src_id`` for every symbol in ``symbols``.

        For each symbol id its code is resolved with ``query_code_by_id`` and
        the node defining that variable for ``src_id`` is looked up in
        ``def_chain``. From that node the chain is followed transitively via
        ``search_vars_dsts_by_src``.

        Args:
            src_id: node id the walk starts from.
            symbols: symbol ids to resolve.
            def_chain: list of ``[node, var, defining_node]`` entries.

        Returns:
            ``(define_vars, define_dst_node)``: two lists that are always
            extended together, so entry ``k`` of one belongs to entry ``k`` of
            the other. Entries may repeat.
        """
        define_vars = []
        define_dst_node = []
        callsite = src_id
        for s in symbols:
            var_symbol = self.query_code_by_id(s)
            head_node = self.search_dst_by_var_src(callsite, var_symbol,
                                                   def_chain)
            # No definition of this symbol reaches the start node: skip it.
            if not head_node:
                continue
            define_vars.append(var_symbol)
            define_dst_node.append(head_node)
            # Breadth-first walk: src_nodes is the current frontier.
            src_nodes = [head_node]
            while src_nodes:
                src_new = []
                for src in src_nodes:
                    middle_define_vars, dst_nodes = self.search_vars_dsts_by_src(
                        src, def_chain)
                    if middle_define_vars:
                        for dst in dst_nodes:
                            # Only nodes not recorded yet extend the frontier;
                            # this is what stops the walk on cyclic chains.
                            if dst not in define_dst_node:
                                src_new.extend(dst_nodes)

                            # NOTE(review): these two extends run once per dst,
                            # so the whole batch is recorded len(dst_nodes)
                            # times -- confirm the duplication is intended.
                            define_vars.extend(middle_define_vars)
                            define_dst_node.extend(dst_nodes)

                src_nodes = src_new
            #define_dst_node = self.unique_list(define_dst_node)
            #define_vars = self.unique_list(define_vars)
        return define_vars, define_dst_node

    def query_code_by_id(self, vid):
        query = """
        _getCodeById(%s)
        """ % vid
        code = self.run_gremlin_query(query)
        return code

    # search dst node from def_chain by var and src node
    def search_dst_by_var_src(self, src, var, def_chain):
        """Return the third field of the first chain matching ``src`` and ``var``.

        A chain matches when its first field equals ``src`` and its second
        field equals ``var``. ``False`` is returned when nothing matches.
        """
        matches = (entry[2] for entry in def_chain
                   if src == entry[0] and var == entry[1])
        return next(matches, False)

    def search_vars_dsts_by_src(self, src, def_chain):
        """Return the variables and target nodes of all chains starting at ``src``.

        Returns:
            ``(define_vars, dst_nodes)``: the second and third fields of every
            chain whose first field equals ``src``, in chain order.
        """
        hits = [entry for entry in def_chain if entry[0] == src]
        return [entry[1] for entry in hits], [entry[2] for entry in hits]

    def query_flowlabel_between_nodes(self, out_v, in_v):
        query = """
        _getFlowlabelOfCfgIds(%s, %s)
        """ % (out_v, in_v)
        flowlabel = self.run_gremlin_query(query)
        return flowlabel

    # parseControl return [flowlabel_code, ids_child[0], tpye_code, operator_code, children]
    # children = [id,type,code]
    def query_parsed_control(self, control_id, next_node):
        query = """
        parseControl(%s,%s)
        """ % (control_id, next_node)
        control_info = self.run_gremlin_query(query)

        flowlabel_code = control_info[0]
        id_exp = control_info[1]
        type_exp = control_info[2]
        operator_expr = control_info[3]
        children_expr = control_info[4]

        return flowlabel_code, id_exp, type_exp, operator_expr, children_expr

    # the controls are condition statements control the callsite_id
    def query_controls(self, callsite_id):
        query = """4849840
        getControlsFromCfgId(%s)
        """ % callsite_id
        controls = self.run_gremlin_query(query)
        return controls

    # get controls of the path
    def query_controls_path(self, controls, path):
        """Return the entries of ``controls`` that also occur in ``path``.

        The order of ``controls`` is kept.
        """
        return [control for control in controls if control in path]

    def query_control_symbols(self, control):
        query = """
        _getSymbols(%s)
        """ % control
        control_symbols = self.run_gremlin_query(query)
        return control_symbols

    @staticmethod
    def unique_list(old_list):
        new_list = []
        for i in old_list:
            if i not in new_list:
                new_list.append(i)
        return new_list

    @staticmethod
    def is_lists_cross(list1, list2):
        for l in list1:
            if l in list2:
                return True
        return False

    @staticmethod
    def get_index_same_items_of_list1(list1, list2):
        indexList = []
        for i in range(0, len(list1)):
            if list1[i] in list2:
                indexList.append(i)
        return indexList

    @staticmethod
    def list1_VSset_list2(list1, list2):
        if (not list1) or (not list2):
            return "xx"
        if set(list1) > set(list2):
            return '>'
        if set(list1) == set(list2):
            return '='
        if set(list1) < set(list2):
            return '<'
        else:
            return 'x'

    def query_check_patterns_path(self, callee_id, callsite_id, path,
                                  controls_path):
        """Build the implicit and explicit check patterns of a callee on one path.

        Part I derives "implicit" patterns (how each argument is defined:
        by a constant or by another argument) from the argument symbols and
        the definition chains of ``path``. Part II derives "explicit" patterns
        from the control nodes in ``controls_path`` that test a variable an
        argument is defined by.

        Args:
            callee_id: node id passed to ``query_args``.
            callsite_id: node id the definition-chain walk starts from.
            path: list of node ids; ``path.index`` is used to order nodes.
            controls_path: control node ids; each must occur in ``path``.

        Returns:
            ``(implicit_check_patterns, explicit_check_patterns)``: two lists
            holding, per argument, a de-duplicated list of pattern strings.
        """
        arg_ids = self.query_args(callee_id)
        symbols_id_of_args, symbols_code_of_args = self.query_symbols_by_ids(
            arg_ids)
        def_chain_path = self.query_define_chains(path)
        # Per argument: the variables it is defined by and the nodes holding
        # those definitions (parallel lists).
        define_vars_of_args = []
        define_dst_of_args = []
        for symbols_arg in symbols_id_of_args:
            defvars_of_arg, define_dst_of_arg = \
                self.query_define_vars_dst_on_symbols(callsite_id, symbols_arg, def_chain_path)
            define_vars_of_args.append(defvars_of_arg)
            define_dst_of_args.append(define_dst_of_arg)

        arg_num = len(arg_ids)

        # I. Implicit constraints (query_implicit_check_patterns_path)
        implicit_check_patterns = [[] for i in range(arg_num)]

        for i in range(0, arg_num):
            # 1. Decide whether this argument is a constant
            #   1.1 No symbol is used
            if not symbols_id_of_args[i]:
                implicit_check_patterns[i].append(
                    self.set_implicit_check_pattern(i, "CNT"))
                continue

            # Because Joern can not identify the Global variable/const,
            # the arg may have symbol but its define_vars_of_args is NULL.
            # Because the global variable is not recommend, used rarely,
            # so we set its check pattern as defined by const  "CNT"
            # const: type 'PrimaryExpression'
            #   1.2 The argument has symbols but no defining variables, and every
            #       symbol code is upper-case: treat it as a global constant
            #       (works around a Joern imprecision)
            if symbols_id_of_args[i] and (not define_vars_of_args[i]):
                flag_notupper = False
                for symbol in symbols_code_of_args[i]:
                    if not symbol.isupper():
                        flag_notupper = True
                        break
                if not flag_notupper:
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(i, "CNT"))
                    continue

            # If the right values of all the define nodes of the define chains' tails (define_dst_of_args[i]) are constants,
            # the arg is defined by constant
            #   1.3 Every defining statement assigns a constant, so the actual
            #       argument is a constant as well
            flag_value = True
            if define_dst_of_args[i]:
                for nodeid in define_dst_of_args[i]:
                    nodecode = self.run_gremlin_query("g.v(%s).code" % nodeid)
                    # Split "lhs = rhs" once; anything without "=" is not an
                    # assignment.
                    value = nodecode.split("=", 1)
                    if len(value) != 2:
                        flag_value = False
                        break
                    rightvalue = value[1]
                    # Known gap: string constants are not recognised here.
                    # NOTE(review): rightvalue is not stripped, so surrounding
                    # whitespace also makes isdigit() fail -- confirm.
                    if rightvalue.isdigit() != True:
                        flag_value = False
                        break
                if flag_value:
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(i, "CNT"))
                    continue
            for j in range(i + 1, arg_num):
                # 2. Determine the relation with the other arguments
                #   2.1 The symbol sets are in a containment relation
                if self.list1_VSset_list2(symbols_id_of_args[i],
                                          symbols_id_of_args[j]) == '>':
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(i, j))
                    continue
                #if self.is_lists_cross(symbols_id_of_args[i], symbols_id_of_args[j]):
                #    implicit_check_patterns[i].append(self.set_implicit_check_pattern(i, j))
                if self.list1_VSset_list2(symbols_id_of_args[i],
                                          symbols_id_of_args[j]) == '<':
                    implicit_check_patterns[j].append(
                        self.set_implicit_check_pattern(j, i))
                    continue
                if self.list1_VSset_list2(symbols_id_of_args[i],
                                          symbols_id_of_args[j]) == '=':
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(i, j))
                    implicit_check_patterns[j].append(
                        self.set_implicit_check_pattern(j, i))
                    continue

                #   2.2 Containment between the defining variables
                if self.list1_VSset_list2(define_vars_of_args[i],
                                          symbols_code_of_args[j]) == '>':
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(i, j))
                    continue
                # NOTE(review): here and in the branches below the (j, i)
                # pattern is stored in slot i, whereas section 2.1 stores it
                # in slot j -- confirm this is intended.
                if self.list1_VSset_list2(define_vars_of_args[j],
                                          symbols_code_of_args[i]) == '>':
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(j, i))
                    continue

                # Same comparison again, ignoring the first defining variable
                # of each argument.
                tmp_defvar_i = define_vars_of_args[i][
                    1:len(define_vars_of_args[i])]
                tmp_defvar_j = define_vars_of_args[j][
                    1:len(define_vars_of_args[j])]
                if tmp_defvar_j:
                    if self.list1_VSset_list2(define_vars_of_args[i],
                                              tmp_defvar_j) == '>':
                        implicit_check_patterns[i].append(
                            self.set_implicit_check_pattern(i, j))
                        continue
                if tmp_defvar_i:
                    if self.list1_VSset_list2(define_vars_of_args[j],
                                              tmp_defvar_i) == '>':
                        implicit_check_patterns[i].append(
                            self.set_implicit_check_pattern(j, i))
                        continue
                if tmp_defvar_i and tmp_defvar_j:
                    if self.list1_VSset_list2(tmp_defvar_j,
                                              tmp_defvar_i) == '=':
                        implicit_check_patterns[i].append(
                            self.set_implicit_check_pattern(i, j))
                        implicit_check_patterns[i].append(
                            self.set_implicit_check_pattern(j, i))
                        continue

                #   2.3 Mixed definitions: the two arguments share a defining variable
                if self.is_lists_cross(define_vars_of_args[i],
                                       define_vars_of_args[j]):
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(i, j))
                    implicit_check_patterns[i].append(
                        self.set_implicit_check_pattern(j, i))
                    # implicit_check_patterns[j].append(self.set_implicit_check_pattern(j, i))
                    continue

                # 3. Everything else is an unknown variable: later inter-procedural
                #    analysis must tell whether it depends on the caller's parameters
                # the default define patten is defined by "OutVar"

                # implicit_check_patterns[i].append(self.set_implicit_check_pattern(i, "OutVar"))

        # II. Explicit constraints (query_explicit_check_patterns_path):
        # If there is a define node on one symbol of the @arg, whose location is between the control node @control
        # and the callsite, then the @control is not take an explicit check on the @arg.
        # Else if the defvar(@control) ^ defvar(@arg) != [], then @control is take an explicit check on the @arg.
        explicit_check_patterns = [[] for i in range(arg_num)]
        explicit_checkinfo_args = [[] for i in range(arg_num)]
        # Pairs [index_arg, index_control] of controls accepted as checks.
        checked_arg_control = []
        symbols_id_of_controls, symbols_code_of_controls = self.query_symbols_by_ids(
            controls_path)
        '''
        define_vars_of_controls = []
        define_dst_of_controls = []
        for symbols_control in symbols_id_of_controls:
            defvars_of_control, define_dst_of_control = \
                self.query_define_vars_dst_on_symbols(callsite_id, symbols_control, def_chain_path)
            define_vars_of_controls.append(defvars_of_control)
            define_dst_of_controls.append(define_dst_of_control)
        '''
        # 1. Select the valid condition checks and associate them with the
        #    corresponding arguments
        for index_arg in range(0, arg_num):
            for index_control in range(0, len(controls_path)):
                # Positions of the argument's defining variables that the
                # control's symbols mention.
                checked_var_of_args = self.get_index_same_items_of_list1(
                    define_vars_of_args[index_arg],
                    symbols_code_of_controls[index_control])
                location_control = path.index(controls_path[index_control])
                if checked_var_of_args:
                    flag_valid_control = True
                    for index_checked_var_of_args in checked_var_of_args:
                        location_checked_dst_node = path.index(
                            define_dst_of_args[index_arg]
                            [index_checked_var_of_args])
                        # if one control check on var, but the var was defined again after the control,
                        # then the control is not consided as a valid check on var
                        if location_control > location_checked_dst_node:
                            flag_valid_control = False
                            break
                    if flag_valid_control:
                        # log the relation between arg and valid controls
                        checked_arg_control.append([index_arg, index_control])

        # 2. Parse the valid condition checks; collect the logical operator, the
        #    branch that is satisfied (True/False) and the related arguments
        # collect check info from @checked_arg_control into each arg
        for c_arg_control in checked_arg_control:
            index_arg = c_arg_control[0]
            index_control = c_arg_control[1]

            # Other arguments checked by the same control.
            args_by_control = []
            for tmp_arg_control in checked_arg_control:
                if tmp_arg_control[1] == index_control:
                    if tmp_arg_control[0] != index_arg:
                        args_by_control.append(tmp_arg_control[0])
            args_by_control = self.unique_list(args_by_control)
            args_by_control.sort()

            # [flowlabel_code, ids_child[0], tpye_code, operator_code, children]
            # children = [id,type,code]
            # NOTE(review): if the control is the first element of path this
            # index is -1 and selects the last element of path -- confirm.
            index_next_node = path.index(controls_path[index_control]) - 1
            flowlabel_code, id_exp, type_exp, operator_expr, children_expr = \
                self.query_parsed_control(controls_path[index_control], path[index_next_node])
            norm_cmp_items = []
            norm_cmp_op = ""
            norm_cmp_value = []

            # A bare identifier as condition: a NULL / not-NULL test.
            if type_exp == "Identifier":
                if flowlabel_code == "True":
                    norm_cmp_value.append("notNULL")
                    norm_cmp_op = "=="
                else:
                    norm_cmp_value.append("NULL")
                    norm_cmp_op = "=="
                norm_cmp_items = [index_arg]
                explicit_checkinfo_args[index_arg].append(
                    [norm_cmp_items, norm_cmp_op, norm_cmp_value])
                continue
            # op = !
            elif type_exp == "UnaryOp":
                if flowlabel_code == "True":
                    norm_cmp_value.append("NULL")
                else:
                    norm_cmp_value.append("notNULL")
                norm_cmp_op = "=="
                norm_cmp_items = [index_arg]
                explicit_checkinfo_args[index_arg].append(
                    [norm_cmp_items, norm_cmp_op, norm_cmp_value])
                continue

            elif type_exp == "EqualityExpression":
                # On the False branch the equality operator is flipped.
                if flowlabel_code == "True":
                    norm_cmp_op = operator_expr
                else:
                    if operator_expr == "==":
                        norm_cmp_op = "!="
                    else:
                        norm_cmp_op = "=="
                norm_cmp_items, norm_cmp_value = self.analysis_EqualityExpression(
                    args_by_control, define_vars_of_args, index_arg,
                    children_expr)
                explicit_checkinfo_args[index_arg].append(
                    [norm_cmp_items, norm_cmp_op, norm_cmp_value])

            elif type_exp == "RelationalExpression":
                norm_cmp_items, norm_cmp_value, flag_left_value = self.analysis_RelationalExpression(
                    args_by_control, define_vars_of_args, index_arg,
                    children_expr)
                # NOTE(review): the operator is mirrored (">" to "<", ">=" to
                # "<=") rather than logically negated (">" to "<=") -- confirm
                # that this is the intended normalisation.
                if flag_left_value:
                    if flowlabel_code == "True":
                        norm_cmp_op = operator_expr
                    else:
                        if operator_expr == ">": norm_cmp_op = "<"
                        elif operator_expr == ">=": norm_cmp_op = "<="
                        elif operator_expr == "<": norm_cmp_op = ">"
                        elif operator_expr == "<=": norm_cmp_op = ">="
                        else:
                            norm_cmp_op = "unknow %s" % operator_expr
                if not flag_left_value:
                    if flowlabel_code == "False":
                        norm_cmp_op = operator_expr
                    else:
                        if operator_expr == ">":
                            norm_cmp_op = "<"
                        elif operator_expr == ">=":
                            norm_cmp_op = "<="
                        elif operator_expr == "<":
                            norm_cmp_op = ">"
                        elif operator_expr == "<=":
                            norm_cmp_op = ">="
                        else:
                            norm_cmp_op = "unknow %s" % operator_expr
                explicit_checkinfo_args[index_arg].append(
                    [norm_cmp_items, norm_cmp_op, norm_cmp_value])
                continue

            else:
                print "error:unknown type of expression: %s" % type_exp

        # Turn the collected check info into pattern strings.
        for index_arg in range(0, arg_num):
            for checkinfo in explicit_checkinfo_args[index_arg]:
                if not checkinfo[2]:
                    print "error:explicit_checkinfo_args"
                explicit_check_patterns[index_arg].append(
                    self.set_explicit_check_pattern(arg_checked=index_arg,
                                                    checkinfo=checkinfo))

        # De-duplicate the patterns of every argument
        for i in range(0, len(implicit_check_patterns)):
            implicit_check_patterns[i] = self.unique_list(
                implicit_check_patterns[i])
        for i in range(0, len(explicit_check_patterns)):
            explicit_check_patterns[i] = self.unique_list(
                explicit_check_patterns[i])

        return implicit_check_patterns, explicit_check_patterns

    def analysis_EqualityExpression(self, args_by_control, define_vars_of_args,
                                    index_arg, children_expr):
        """Normalize the operands of an equality check around one argument.

        The first operand whose code contains a symbol of ``index_arg`` is
        taken as the "checked" operand; every other operand is described as
        the value it is compared against.

        Args:
            args_by_control: argument indexes to look for inside each operand.
            define_vars_of_args: symbol collections, indexed by argument index.
            index_arg: index of the argument whose check is being normalized.
            children_expr: operands of the expression, each ``[id, type, code]``.

        Returns:
            A pair ``(norm_cmp_items, norm_cmp_value)``. The first is the
            de-duplicated list of argument indexes found in the checked
            operand. The second describes the remaining operands:
            ``"arg_<n>"`` per argument mentioned (ascending), otherwise
            ``"NULL"``, the code of a ``PrimaryExpression``, or ``"Var"``.
        """

        def mentions(arg, code):
            # True when any symbol recorded for `arg` occurs in `code`
            # (membership is tested with the `in` operator).
            return any(sym in code for sym in define_vars_of_args[arg])

        checked_items = []
        compared_values = []
        anchor_id = -1
        anchor_known = False

        for operand in children_expr:
            # The first operand that mentions the checked argument becomes
            # the anchor; later operands never replace it.
            if not anchor_known and mentions(index_arg, operand[2]):
                anchor_known = True
                anchor_id = operand[0]
                checked_items.append(index_arg)

            if operand[0] == anchor_id:
                # Anchor operand: record every controlling argument it uses.
                checked_items.extend(
                    arg for arg in args_by_control
                    if mentions(arg, operand[2]))
                continue

            # Any other operand is the compared-against value.
            referenced = []
            if mentions(index_arg, operand[2]):
                referenced.append(index_arg)
            referenced.extend(
                arg for arg in args_by_control if mentions(arg, operand[2]))
            for arg in sorted(self.unique_list(referenced)):
                compared_values.append("arg_%d" % arg)

            if not referenced:
                # Joern reports "NULL" as an Identifier, so it is matched by
                # its code rather than by its node type.
                if operand[2] == "NULL":
                    compared_values.append("NULL")
                elif operand[1] == "PrimaryExpression":
                    compared_values.append(operand[2])
                else:
                    compared_values.append("Var")

        return self.unique_list(checked_items), compared_values

    def analysis_RelationalExpression(self, args_by_control,
                                      define_vars_of_args, index_arg,
                                      children_expr):
        """Normalize the operands of a relational check around one argument.

        Works like the equality variant, and additionally reports whether the
        operand holding the checked argument is considered the left one.

        Args:
            args_by_control: argument indexes to look for inside each operand.
            define_vars_of_args: symbol collections, indexed by argument index.
            index_arg: index of the argument whose check is being normalized.
            children_expr: operands of the expression, each ``[id, type, code]``.

        Returns:
            A triple ``(norm_cmp_items, norm_cmp_value, flag_left_value)``.
            ``flag_left_value`` is True exactly when the id of the operand
            that mentions ``index_arg`` equals 0.
        """

        def mentions(arg, code):
            # True when any symbol recorded for `arg` occurs in `code`.
            return any(sym in code for sym in define_vars_of_args[arg])

        checked_items = []
        compared_values = []
        anchor_id = -1
        anchor_known = False

        for operand in children_expr:
            # The first operand that mentions the checked argument is the anchor.
            if not anchor_known and mentions(index_arg, operand[2]):
                anchor_known = True
                anchor_id = operand[0]
                checked_items.append(index_arg)

            if operand[0] == anchor_id:
                # Anchor operand: record every controlling argument it uses.
                checked_items.extend(
                    arg for arg in args_by_control
                    if mentions(arg, operand[2]))
                continue

            # Any other operand is the compared-against value.
            referenced = []
            if mentions(index_arg, operand[2]):
                referenced.append(index_arg)
            referenced.extend(
                arg for arg in args_by_control if mentions(arg, operand[2]))
            for arg in sorted(self.unique_list(referenced)):
                compared_values.append("arg_%d" % arg)

            if not referenced:
                # Joern reports "NULL" as an Identifier, so it is matched by
                # its code rather than by its node type.
                if operand[2] == "NULL":
                    compared_values.append("NULL")
                elif operand[1] == "PrimaryExpression":
                    compared_values.append(operand[2])
                else:
                    compared_values.append("Var")

        # NOTE(review): this compares the operand's *id* with 0. If ids are
        # graph node ids rather than child positions this is almost never
        # true -- confirm what parseControl puts in child[0].
        flag_left_value = anchor_id == 0
        return self.unique_list(
            checked_items), compared_values, flag_left_value

    def query_check_patterns_path_thread(self, callee_id, callsite_id, path,
                                         controls_path, result, index):
        """Run ``query_check_patterns_path`` and store its output in a slot.

        Args:
            callee_id, callsite_id, path, controls_path: forwarded unchanged
                to ``query_check_patterns_path``.
            result: mutable sequence shared with the caller.
            index: position in ``result`` that receives the patterns.

        Returns:
            ``result`` itself, after ``result[index]`` has been set to
            ``[implicit_check_patterns, explicit_check_patterns]``.
        """
        implicit, explicit = self.query_check_patterns_path(
            callee_id, callsite_id, path, controls_path)
        result[index] = [implicit, explicit]
        return result

    # 返回returnVar_code 或者 false
    def query_returnVar_of_callsite(self, callee_id):
        query = """
        _getReturnVarOfCalleeId(%s)
        """ % (callee_id)
        returnVar = self.run_gremlin_query(query)
        return returnVar

    # 先判断当前位置是否为条件检查,是则返回当前节点,否则继续沿着CFG图继续搜索,返回checkpoint_id_of_var或者false
    def query_checkpoint_of_returnVar(self, callee_id, var):
        query = """
        _query_checkpoint_of_returnVar(%s,'%s')
        """ % (callee_id, var)
        #query = """
        #_getCheckPointOfCalleeId(%s,"%s")
        #""" % (callee_id, var)
        checkpoint = self.run_gremlin_query(query)
        return checkpoint

    def is_a_isparent_b(self, nodeid, var):
        query = """
        is_a_isparent_b(%s,'%s')
        """ % (nodeid, var)
        result = self.run_gremlin_query(query)
        return result

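    # feature_callee = [    callee_id, checked-before-use flag (0/1),
    #                       [correct (true-branch) paths: path count, statement count, return variable used (1/0)],
    #                       [error (false-branch) paths: path count, statement count, return variable used (1/0)]
    #                   ]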
    def run_no_thread(self, callee_ids):
        """Extract the feature record of every callee, one after another.

        For each callee id the return variable is looked up, then the
        condition that checks it, then the paths leaving that condition.
        Progress lines are printed along the way.

        Args:
            callee_ids: ids of the callee nodes to analyse.

        Returns:
            A list with one record per callee:
            ``[callee_id, checked, branch_features...]`` where ``checked`` is
            1 when a check point was found (else 0) and each branch feature
            is ``[path_count, statement_count, return_var_used]``. When no
            check point or no paths are found, two ``[0, 0, 0]`` placeholders
            are stored instead.
        """
        records = []
        # -test
        print("len(callee_ids) = %d " % (len(callee_ids)))
        # -test
        for position, callee_id in enumerate(callee_ids, 1):
            # The CFG node of the statement containing the call.
            callsite_id = self.query_callsite_id(callee_id)
            # -test
            print("%3d.%10d%10d " % (position, callee_id, callsite_id))
            # -test

            # Step 1: the variable that receives the return value.
            returnVar = self.query_returnVar_of_callsite(callee_id)
            if not returnVar:
                var_code = "Err"
            elif isinstance(returnVar, list):
                # Something like b->buf yields several symbols; only the
                # first one is considered for now.
                var_code = returnVar[0]
            else:
                var_code = returnVar

            # Step 2: the first condition that checks the variable.
            checkpoint_id = self.query_checkpoint_of_returnVar(
                callee_id, var_code)
            print("%s : \t %s %s" % (self.query_loc_callsite(callee_id),
                                     checkpoint_id, returnVar))
            if not checkpoint_id:
                records.append([callee_id, 0, [0, 0, 0], [0, 0, 0]])
                continue

            # Step 3: a check exists, so look at the paths leaving it.
            all_paths = self.query_farward_paths_from_condition(checkpoint_id)
            # TODO: switch statements are not handled, which leaves
            # all_paths falsy.
            if not all_paths:
                records.append([callee_id, 1, [0, 0, 0], [0, 0, 0]])
                continue

            print(all_paths)
            record = [callee_id, 1]
            for branch in all_paths:
                branch_feature = [0, 0, 0]
                if branch:
                    path_count = len(branch)
                    statement_count = sum(len(sub) for sub in branch)
                    # Stop at the first sub-path that uses the variable.
                    for sub_path in branch:
                        used = self.is_var_usedin_path(sub_path, var_code)
                        if used:
                            break
                    branch_feature = [path_count, statement_count, used]
                record.append(branch_feature)
            records.append(record)
        return records

    def is_var_usedin_path(self, lpath, returnVar):
        """Tell whether any statement on a path uses the return variable.

        Args:
            lpath: iterable of statement ids forming one path.
            returnVar: code of the variable, or ``"Err"`` when there is none.

        Returns:
            1 as soon as ``is_a_isparent_b`` answers truthy for a statement,
            otherwise 0. ``"Err"`` short-circuits to 0 without any query.
        """
        if returnVar == "Err":
            return 0
        for statement_id in lpath:
            if self.is_a_isparent_b(statement_id, returnVar):
                return 1
        return 0

    def run(self, flag_thread=True, *callee_from):
        """Extract, print and persist the features of the selected callees.

        Args:
            flag_thread: when true ``run_thread`` is used, otherwise
                ``run_no_thread``.
            *callee_from: optional; when the first extra argument is a
                (non-empty) list it is used as the callee ids and its first
                element names the data file. In every other case the callees
                of ``self.function_name`` are queried and that name is used
                for the data file.

        Returns:
            The feature list produced by the selected runner.
        """
        # Previously a non-list extra argument left callee_ids and filepath
        # unbound (NameError further down); it now falls back to the
        # function-name lookup, like a call without extra arguments.
        if callee_from and isinstance(callee_from[0], list):
            callee_ids = callee_from[0]
            filepath = "Data/%s.data" % callee_ids[0]
        else:
            callee_ids = self.query_callee_ids(self.function_name)
            filepath = "Data/%s.data" % self.function_name

        if flag_thread:
            feature_func = self.run_thread(callee_ids)
        else:
            feature_func = self.run_no_thread(callee_ids)

        # Show every record together with the location of its call site.
        print("feature_func =: ")
        for pattern in feature_func:
            loc = self.query_loc_callsite(pattern[0])
            print("%s:" % (loc))
            print(pattern)

        ObjDataAndBinFile.objdata2file(feature_func, filepath)
        return feature_func
Ejemplo n.º 9
0
class MiningErrFunc:
    """Decide from call-site features whether a function looks error-returning.

    Each input record has the layout
    ``[callee_id, checked, [t_paths, t_stmts, t_used], [f_paths, f_stmts, f_used]]``
    where the first triple describes the true-edge ("correct") paths and the
    second the false-edge ("error") paths of the return-value check.
    """

    def __init__(self, calee_featureList):
        """Store the feature records and load the thresholds from globals."""
        self.db_provider = DBContentsProvider()
        self.calee_featureList = calee_featureList
        # Ratio from which the two branches count as clearly different in
        # path count / statement count.
        self.thld_path_ratio = G_thld_path_ratio
        self.thld_stmt_ratio = G_thld_stmt_ratio
        # Share of call sites that must show a feature for it to count.
        self.thld_is_check = G_thld_is_check
        self.thld_is_path = G_thld_is_path
        self.thld_is_stmt = G_thld_is_stmt
        self.thld_is_notuseTwosides = G_thld_is_notuseTwosides
        # Per-feature weights (G_weight_path, G_weight_stmt,
        # G_weight_useOneSide) are intentionally not loaded: the weighting
        # step is disabled.

    def run_gremlin_query(self, query):
        """Forward a Gremlin query to the database provider."""
        return self.db_provider.run_gremlin_query(query)

    def query_loc_callsite(self, callee_id):
        """Return ``"<filepath>: <location>"`` for the statement of a callee."""
        query = """
            g.v(%s).statements.transform{[g.v(it.functionId).functionToFile.filepath, it.location]}
            """ % callee_id
        result = self.run_gremlin_query(query)
        return "%s: %s" % (result[0][0][0], result[0][1])

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator/denominator rounded to 2 places.

        A zero denominator yields 999 when the numerator is non-zero and 0
        when both are zero.
        """
        if denominator != 0:
            return round(float(numerator) / denominator, 2)
        return 999 if numerator != 0 else 0

    def get_featrue(self, featureList):
        """Turn raw per-call-site records into 0/1 feature vectors.

        Args:
            featureList: records in the layout described on the class.

        Returns:
            One ``[callee_id, checked, ft_path, ft_stmt, ft_used_oneside]``
            list per record. ``ft_path`` / ``ft_stmt`` are 1 when the ratio
            between the two branches reaches the configured threshold;
            ``ft_used_oneside`` is 0 only when both branches use the return
            variable.
        """
        ft_call = []
        for feature in featureList:
            callee_id, var_ischecked = feature[0], feature[1]
            t_paths, t_stmts, t_used = feature[2][0], feature[2][1], feature[2][2]
            f_paths, f_stmts, f_used = feature[3][0], feature[3][1], feature[3][2]

            # The branch with more statements supplies the numerators, for
            # the path ratio as well as for the statement ratio.
            if t_stmts > f_stmts:
                path_ratio = self._ratio(t_paths, f_paths)
                stmt_ratio = self._ratio(t_stmts, f_stmts)
            else:
                path_ratio = self._ratio(f_paths, t_paths)
                stmt_ratio = self._ratio(f_stmts, t_stmts)

            ft_path = 1 if path_ratio >= self.thld_path_ratio else 0
            ft_stmt = 1 if stmt_ratio >= self.thld_stmt_ratio else 0
            ft_used_oneside = 0 if (t_used == 1 and f_used == 1) else 1

            ft_call.append(
                [callee_id, var_ischecked, ft_path, ft_stmt, ft_used_oneside])
        return ft_call

    def mining_err(self, ft_call):
        """Aggregate the feature vectors and apply the mining strategy.

        Args:
            ft_call: output of ``get_featrue``.

        Returns:
            ``[callee_counts, is_err, ratio_check, ratio_path, ratio_stmt,
            ratio_notusedTwoside]``. ``is_err`` is 1 only when every ratio is
            strictly above its threshold. An empty ``ft_call`` yields
            ``[0, 0, 0.0, 0.0, 0.0, 0.0]`` instead of dividing by zero.
        """
        callee_counts = len(ft_call)
        if callee_counts == 0:
            # Nothing to aggregate: report "not an error function".
            return [0, 0, 0.0, 0.0, 0.0, 0.0]

        # Share of call sites showing each feature (columns 1..4).
        ratios = [
            round(float(sum(ft[column] for ft in ft_call)) / callee_counts, 2)
            for column in (1, 2, 3, 4)
        ]
        ratio_ft_check, ratio_ft_path, ratio_ft_stmt, ratio_ft_notusedTwoside = ratios

        # Mining strategy: all four features must be common enough.
        if (ratio_ft_check > self.thld_is_check
                and ratio_ft_path > self.thld_is_path
                and ratio_ft_stmt > self.thld_is_stmt
                and ratio_ft_notusedTwoside > self.thld_is_notuseTwosides):
            is_err = 1
        else:
            is_err = 0

        # Priority weighting of the result is no longer performed here.
        return [
            callee_counts, is_err, ratio_ft_check, ratio_ft_path,
            ratio_ft_stmt, ratio_ft_notusedTwoside
        ]

    def run(self):
        """Compute the mining result for the stored feature records."""
        ft_call = self.get_featrue(self.calee_featureList)
        # Dumping the result to Data/result.txt is currently disabled.
        return self.mining_err(ft_call)

    """
Ejemplo n.º 10
0
class ExtractErrFunFeatures:
    """Collect return-value-check features for the call sites of a function.

    Every graph lookup is a Gremlin query sent through ``DBContentsProvider``.
    A feature record has the layout::

        [callee_id, checked (0/1),
         [true-branch:  path count, statement count, return var used (1/0)],
         [false-branch: path count, statement count, return var used (1/0)]]
    """

    def __init__(self, function_name):
        """Remember the target function and create the DB and file helpers."""
        self.db_provider = DBContentsProvider()
        self.file_io_provider = ObjDataAndBinFile()
        self.function_name = function_name
        # Not read anywhere in this class; presumably reserved for the
        # threaded mode, which is not implemented yet.
        self.count_threads = 20

    def run_gremlin_query(self, query):
        """Forward a Gremlin query to the database provider."""
        return self.db_provider.run_gremlin_query(query)

    def save_data_to_file(self, data, file_path):
        """Serialize ``data`` to ``file_path`` through the file helper."""
        self.file_io_provider.objdata2file(data, file_path)

    def query_loc_callsite(self, callee_id):
        """Return ``"<filepath>: <location>"`` for the statement of a callee."""
        query = """
            g.v(%s).statements.transform{[g.v(it.functionId).functionToFile.filepath, it.location]}
            """ % callee_id
        result = self.run_gremlin_query(query)
        return "%s: %s" % (result[0][0][0], result[0][1])

    def query_callee_ids(self, function_name):
        """Return the ids of all ``Callee`` nodes whose code is the name."""
        query = """
        g.V().has('type','Callee').has('code','%s').id.toList()
        """ % function_name
        return self.run_gremlin_query(query)

    def query_callsite_id(self, callee_id):
        """Return the first id found two ``in`` steps above the callee node."""
        query = """
        g.v(%s).in.in.id
        """ % callee_id
        callsite_id = self.run_gremlin_query(query)
        return callsite_id[0]

    def query_backward_paths(self, callee_id):
        """Return the result of the ``getBackwardPaths`` step for a callee."""
        query = """
        getBackwardPaths(%s)
        """ % callee_id
        return self.run_gremlin_query(query)

    def query_farward_paths_from_condition(self, condition_id):
        """Return the forward paths leaving a condition node.

        The result holds two entries: the paths of the true branch first,
        then those of the false branch.
        """
        query = """
        getFarwardPaths_from_condition(%s)
        """ % condition_id
        return self.run_gremlin_query(query)

    def query_define_chains(self, path):
        """Collect the REACHES edges that start and end on ``path``.

        Each returned entry is ``[src, var, dst]``, read as
        ``src.id <--var-- dst.id``. When several entries share the same
        ``src`` and ``var``, only the one whose ``dst`` comes first in
        ``path`` is kept.

        Args:
            path: list of node ids.

        Returns:
            The filtered list of ``[src, var, dst]`` entries.
        """
        def_chain = []
        for node_id in path:
            query = """
            g.v(%s).inE('REACHES').transform{[it.inV.id, it.var, it.outV.id]}
            """ % node_id
            for chain in self.run_gremlin_query(query):
                src, dst = chain[0][0], chain[2][0]
                # Keep only edges lying on the path. Self-loops are dropped
                # because they would make later traversals loop forever.
                if src in path and dst in path and src != dst:
                    def_chain.append([src, chain[1], dst])

        # Mark, for every pair defining the same variable on the same node,
        # the entry whose definition is not the nearest one.
        invalid_chains = []
        for i in range(len(def_chain)):
            for j in range(len(def_chain)):
                if i == j:
                    continue
                if (def_chain[i][0] == def_chain[j][0]
                        and def_chain[i][1] == def_chain[j][1]):
                    if path.index(def_chain[i][2]) > path.index(
                            def_chain[j][2]):
                        invalid_chains.append(i)
                    else:
                        invalid_chains.append(j)

        # Delete by position, highest index first. Deleting by value
        # (list.remove) hit the first *equal* entry instead, which shifted
        # the remaining indexes and dropped the wrong entries whenever the
        # chain contained duplicates.
        for i in sorted(set(invalid_chains), reverse=True):
            del def_chain[i]
        return def_chain

    def query_symbols_by_ids(self, ids):
        """Look up the symbols of each node id.

        Returns:
            ``(symbols_id, symbols_code)``: two lists parallel to ``ids``.
            The first holds the raw ``_getSymbols`` result per id; the second
            the code of each symbol, or ``[u'']`` when there are none.
        """
        symbols_id = []
        symbols_code = []
        for arg in ids:
            query = """
            _getSymbols(%s)
            """ % arg
            s_ids = self.run_gremlin_query(query)
            symbols_id.append(s_ids)
            if s_ids:
                s_codes = [self.query_code_by_id(vid) for vid in s_ids]
            else:
                s_codes = [u'']
            symbols_code.append(s_codes)
        return symbols_id, symbols_code

    def query_define_vars_dst_on_symbols(self, src_id, symbols, def_chain):
        """Follow the definition chain of each symbol starting at ``src_id``.

        Args:
            src_id: node the search starts from.
            symbols: symbol node ids; their code is fetched per symbol.
            def_chain: ``[src, var, dst]`` entries from ``query_define_chains``.

        Returns:
            ``(define_vars, define_dst_node)``: the variable names met along
            the chains and the nodes that define them. Duplicates are not
            removed.
        """
        define_vars = []
        define_dst_node = []
        callsite = src_id
        for s in symbols:
            var_symbol = self.query_code_by_id(s)
            head_node = self.search_dst_by_var_src(callsite, var_symbol,
                                                   def_chain)
            if not head_node:
                continue
            define_vars.append(var_symbol)
            define_dst_node.append(head_node)
            src_nodes = [head_node]
            while src_nodes:
                src_new = []
                for src in src_nodes:
                    middle_define_vars, dst_nodes = self.search_vars_dsts_by_src(
                        src, def_chain)
                    if middle_define_vars:
                        # NOTE(review): the two extends below run once per
                        # dst, so results are appended repeatedly and only
                        # the first dst can still be "new" -- kept as is,
                        # confirm whether that is intended.
                        for dst in dst_nodes:
                            if dst not in define_dst_node:
                                src_new.extend(dst_nodes)

                            define_vars.extend(middle_define_vars)
                            define_dst_node.extend(dst_nodes)

                src_nodes = src_new
        return define_vars, define_dst_node

    def query_code_by_id(self, vid):
        """Return the result of the ``_getCodeById`` step for a node id."""
        query = """
        _getCodeById(%s)
        """ % vid
        return self.run_gremlin_query(query)

    def search_dst_by_var_src(self, src, var, def_chain):
        """Return the ``dst`` of the first entry matching ``src`` and ``var``.

        Returns False when no entry matches.
        """
        for def_node in def_chain:
            if src == def_node[0] and var == def_node[1]:
                return def_node[2]
        return False

    def search_vars_dsts_by_src(self, src, def_chain):
        """Return the variables and ``dst`` nodes of all entries for ``src``.

        Returns:
            ``(define_vars, dst_nodes)``, two parallel lists.
        """
        define_vars = []
        dst_nodes = []
        for def_node in def_chain:
            if def_node[0] == src:
                define_vars.append(def_node[1])
                dst_nodes.append(def_node[2])
        return define_vars, dst_nodes

    def query_flowlabel_between_nodes(self, out_v, in_v):
        """Return the result of ``_getFlowlabelOfCfgIds`` for two CFG nodes."""
        query = """
        _getFlowlabelOfCfgIds(%s, %s)
        """ % (out_v, in_v)
        return self.run_gremlin_query(query)

    def query_parsed_control(self, control_id, next_node):
        """Parse a control statement relative to the node that follows it.

        Returns:
            ``(flowlabel_code, id_exp, type_exp, operator_expr,
            children_expr)``: the first five items of the ``parseControl``
            result. ``children_expr`` entries are ``[id, type, code]``.
        """
        query = """
        parseControl(%s,%s)
        """ % (control_id, next_node)
        control_info = self.run_gremlin_query(query)
        return (control_info[0], control_info[1], control_info[2],
                control_info[3], control_info[4])

    def query_controls(self, callsite_id):
        """Return the condition statements that control ``callsite_id``."""
        # NOTE(review): the literal "4849840" at the start of the query looks
        # like a stray paste; it is kept so the text sent to the database
        # does not change -- confirm and remove.
        query = """4849840
        getControlsFromCfgId(%s)
        """ % callsite_id
        return self.run_gremlin_query(query)

    def query_controls_path(self, controls, path):
        """Return the controls that lie on ``path``, in ``controls`` order."""
        return [c for c in controls if c in path]

    def query_control_symbols(self, control):
        """Return the result of the ``_getSymbols`` step for a control node."""
        query = """
        _getSymbols(%s)
        """ % control
        return self.run_gremlin_query(query)

    @staticmethod
    def unique_list(old_list):
        """Return a new list without duplicates, keeping first occurrences.

        Works with unhashable items because membership is tested on a list.
        """
        new_list = []
        for item in old_list:
            if item not in new_list:
                new_list.append(item)
        return new_list

    @staticmethod
    def is_lists_cross(list1, list2):
        """Return True when the two lists share at least one item."""
        return any(item in list2 for item in list1)

    @staticmethod
    def get_index_same_items_of_list1(list1, list2):
        """Return the indexes of the ``list1`` items that occur in ``list2``."""
        return [i for i, item in enumerate(list1) if item in list2]

    @staticmethod
    def list1_VSset_list2(list1, list2):
        """Compare the two lists as sets.

        Returns:
            ``"xx"`` when either list is empty; ``'>'``, ``'='`` or ``'<'``
            when ``set(list1)`` is a proper superset of, equal to, or a
            proper subset of ``set(list2)``; ``'x'`` otherwise.
        """
        if (not list1) or (not list2):
            return "xx"
        set1, set2 = set(list1), set(list2)
        if set1 > set2:
            return '>'
        if set1 == set2:
            return '='
        if set1 < set2:
            return '<'
        return 'x'

    def query_returnVar_of_callsite(self, callee_id):
        """Return the variable receiving the call's return value.

        The ``_getReturnVarOfCalleeId`` result is passed through unchanged;
        callers treat a falsy result as "no return variable".
        """
        query = """
        _getReturnVarOfCalleeId(%s)
        """ % (callee_id)
        return self.run_gremlin_query(query)

    def query_checkpoint_of_returnVar(self, callee_id, var):
        """Return the condition node that checks ``var`` after the call.

        The ``_query_checkpoint_of_returnVar`` result is passed through
        unchanged; callers treat a falsy result as "no check".
        """
        query = """
        _query_checkpoint_of_returnVar(%s,'%s')
        """ % (callee_id, var)
        return self.run_gremlin_query(query)

    def is_a_isparent_b(self, nodeid, var):
        """Return the result of the ``is_a_isparent_b`` step for node and var."""
        query = """
        is_a_isparent_b(%s,'%s')
        """ % (nodeid, var)
        return self.run_gremlin_query(query)

    def run_thread(self, callee_ids):
        """Placeholder for the threaded mode: prints an error, returns False."""
        print("错误:本程序的多线程版本尚未实现")
        return False

    def run_no_thread(self, callee_ids):
        """Extract the feature record of every callee, one after another.

        Progress is written through ``write_info`` to ``gl.G_debuginfo_path``.

        Args:
            callee_ids: ids of the callee nodes to analyse.

        Returns:
            ``(feature_func, flag_rvar_func)``: the list of feature records
            (layout on the class) and a flag that is 1 when at least one
            call site had a return variable, else 0.
        """
        feature_func = []
        # Records whether any instance has a return variable, which tells
        # whether the function is a value-returning one at all.
        falg_rvar_func = 0
        # -test
        write_info(gl.G_debuginfo_path,
                   "len(callee_ids) = %d " % (len(callee_ids)))
        # -test
        for i, callee_id in enumerate(callee_ids, 1):
            # The CFG node of the statement containing the call.
            callsite_id = self.query_callsite_id(callee_id)
            # -test
            write_info(gl.G_debuginfo_path,
                       "%3d.%10d%10d " % (i, callee_id, callsite_id))
            # -test

            # Step 1: the variable that receives the return value.
            returnVar = self.query_returnVar_of_callsite(callee_id)
            if not returnVar:
                var_code = "Err"
            else:
                falg_rvar_func = 1
                # Something like b->buf yields several symbols (b, buf);
                # only the first one is considered for now.
                if isinstance(returnVar, list):
                    var_code = returnVar[0]
                else:
                    var_code = returnVar

            # Step 2: the first condition that checks the variable.
            checkpoint_id = self.query_checkpoint_of_returnVar(
                callee_id, var_code)
            savedinfo = "%s : \t %s %s" % (self.query_loc_callsite(callee_id),
                                           checkpoint_id, returnVar)
            write_info(gl.G_debuginfo_path, savedinfo)
            if not checkpoint_id:
                feature_func.append([callee_id, 0, [0, 0, 0], [0, 0, 0]])
                continue

            # Step 3: a check exists, so look at the paths leaving it.
            # all_paths has two entries: true branch first, then false branch.
            all_paths = self.query_farward_paths_from_condition(checkpoint_id)
            write_info(gl.G_debuginfo_path, all_paths)
            # TODO: switch statements are not handled, which leaves
            # all_paths falsy.
            if not all_paths:
                feature_func.append([callee_id, 1, [0, 0, 0], [0, 0, 0]])
                continue

            feature_callee = [callee_id, 1]
            for path in all_paths:
                feature_path = [0, 0, 0]
                if path:
                    num_lpaths = len(path)
                    num_lstatements = sum(len(lpath) for lpath in path)
                    # Stop at the first sub-path that uses the variable.
                    for childpath in path:
                        flag_returnVar_used = self.is_var_usedin_path(
                            childpath, var_code)
                        if flag_returnVar_used:
                            break
                    feature_path = [
                        num_lpaths, num_lstatements, flag_returnVar_used
                    ]
                feature_callee.append(feature_path)
            feature_func.append(feature_callee)
        return feature_func, falg_rvar_func

    def is_var_usedin_path(self, lpath, returnVar):
        """Return 1 when any statement of ``lpath`` uses ``returnVar``, else 0.

        ``"Err"`` (no return variable) yields 0 without querying.
        """
        if returnVar == "Err":
            return 0
        for smt_id in lpath:
            if self.is_a_isparent_b(smt_id, returnVar):
                return 1
        return 0

    def run(self, flag_thread=False, *callee_from):
        """Extract, persist and log the features of the selected callees.

        Args:
            flag_thread: request the threaded runner. It is not implemented
                (``run_thread`` returns False), so the sequential runner is
                used as a fallback.
            *callee_from: optional; when the first extra argument is a
                (non-empty) list it is used as the callee ids and its first
                element names the data file. In every other case the callees
                of ``self.function_name`` are queried.

        Returns:
            ``(feature_func, flag_rvar_func)`` as produced by the runner.
        """
        # A non-list extra argument used to leave callee_ids unbound; it now
        # falls back to the function-name lookup.
        if callee_from and isinstance(callee_from[0], list):
            callee_ids = callee_from[0]
            filepath = "%s/%s.data" % (gl.G_prjdata_dir, callee_ids[0])
        else:
            callee_ids = self.query_callee_ids(self.function_name)
            filepath = "%s/%s.data" % (gl.G_prjdata_dir, self.function_name)

        savedinfo = "extract_errFunc_feature:启动\n"
        write_info(gl.G_debuginfo_path, savedinfo)

        # flag_thread=True used to crash below with an unbound
        # flag_rvar_func, because run_thread only returns False. Fall back to
        # the sequential runner; a future run_thread must return the same
        # (feature_func, flag_rvar_func) pair.
        extracted = self.run_thread(callee_ids) if flag_thread else False
        if not extracted:
            extracted = self.run_no_thread(callee_ids)
        feature_func, flag_rvar_func = extracted

        savedinfo = "extract_errFunc_feature:保存数据到%s\n" % filepath
        write_info(gl.G_debuginfo_path, savedinfo)
        datatmp = [feature_func, flag_rvar_func]
        ObjDataAndBinFile.objdata2file(datatmp, filepath)

        # Log the record layout, then every record with its location.
        write_info(gl.G_debuginfo_path, "extract_errFunc_feature:各实例的特征提取结果如下")
        savedinfo = "feature_callee = [\n" \
                    "\tcallee_id, \n" \
                    "\t未使用前检查(0,1),\n" \
                    "\t[正确路径的路径数量,正确路径的语句数量,正确路径中使用返回值变量(1,0)],\n" \
                    "\t[错误路径的路径数量,正确路径的语句数量,错误路径中使用了返回值变量(1,0)],\n" \
                    "]\n"
        write_info(gl.G_debuginfo_path, savedinfo)

        write_info(gl.G_debuginfo_path, "feature_callees = \n")
        for pattern in feature_func:
            loc = self.query_loc_callsite(pattern[0])
            savedinfo = "%s:" % (loc)
            write_info(gl.G_debuginfo_path, savedinfo)

            write_info(gl.G_debuginfo_path, pattern)
        return feature_func, flag_rvar_func
Ejemplo n.º 11
0
class MiningErrFunc:
    """Turn raw per-call-site features into binary features and score them.

    Each raw entry is laid out (per the original author's note) as::

        [callee_id, checked-before-use flag (0,1),
         [correct path: path count, statement count, return var used (1,0)],
         [error path:   path count, statement count, return var used (1,0)]]
    """

    def __init__(self, calee_featureList):
        """Store the raw feature list together with thresholds and weights.

        Args:
            calee_featureList: list of raw feature entries, one per call site.
        """
        self.db_provider = DBContentsProvider()
        self.calee_featureList = calee_featureList
        # A path/statement count ratio at or above these values counts as a
        # "clear difference" between the two paths.
        self.thld_path_ratio = 2
        self.thld_stmt_ratio = 2
        # Share of call sites that must show a feature for it to be
        # considered satisfied.
        self.thld_is_check = 0.8
        self.thld_is_path = 0.8
        self.thld_is_stmt = 0.8
        self.thld_is_useOneside = 0.8
        # Weight contributed by each satisfied feature.
        self.weight_path = 0.5
        self.weight_stmt = 0.5
        self.weight_useOneSide = 1

    def run_gremlin_query(self, query):
        """Run ``query`` through the database provider and return its result."""
        return self.db_provider.run_gremlin_query(query)

    def query_loc_callsite(self, callee_id):
        """Return ``"<filepath>: <location>"`` for the given callee node id."""
        query = """
            g.v(%s).statements.transform{[g.v(it.functionId).functionToFile.filepath, it.location]}
            """ % callee_id
        result = self.run_gremlin_query(query)
        loc = "%s: %s" % (result[0][0][0], result[0][1])
        return loc

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator/denominator rounded to 2 places, guarding zero.

        A zero denominator yields 999 when the numerator is non-zero and 0
        when both are zero.
        """
        if denominator != 0:
            return round(float(numerator) / denominator, 2)
        return 999 if numerator != 0 else 0

    def get_featrue(self, featureList):
        """Convert raw feature entries into binary feature rows.

        Args:
            featureList: raw entries in the layout described on the class.

        Returns:
            A list with one row per entry:
            ``[callee_id, check_flag, ft_path, ft_stmt, ft_used_oneside]``.
        """
        ft_call = []
        for feature in featureList:
            right, wrong = feature[2], feature[3]
            # The side with more statements is the numerator of BOTH ratios;
            # the path ratio deliberately follows the same direction.
            if right[1] > wrong[1]:
                larger, smaller = right, wrong
            else:
                larger, smaller = wrong, right
            path_ratio = self._ratio(larger[0], smaller[0])
            stmt_ratio = self._ratio(larger[1], smaller[1])

            ft_path = 1 if path_ratio >= self.thld_path_ratio else 0
            ft_stmt = 1 if stmt_ratio >= self.thld_stmt_ratio else 0

            # Set when the usage flag is 1 on exactly one of the two paths.
            if ((right[2] == 1 and wrong[2] == 0)
                    or (right[2] == 0 and wrong[2] == 1)):
                ft_used_oneside = 1
            else:
                ft_used_oneside = 0
            ft_call.append(
                [feature[0], feature[1], ft_path, ft_stmt, ft_used_oneside])
        return ft_call

    def mining_err(self, ft_call):
        """Aggregate binary feature rows into the mining verdict.

        Args:
            ft_call: rows as produced by ``get_featrue``.

        Returns:
            ``[is_err, weight_call, ratio_ft_path, ratio_ft_stmt,
            ratio_ft_usedOneside]``. An empty ``ft_call`` gives all-zero
            ratios instead of raising ``ZeroDivisionError``.
        """
        num = len(ft_call)

        def share(column):
            # Fraction of rows whose feature in ``column`` is set.
            if num == 0:
                return 0.0
            return round(float(sum(ft[column] for ft in ft_call)) / num, 2)

        ratio_ft_check = share(1)
        ratio_ft_path = share(2)
        ratio_ft_stmt = share(3)
        ratio_ft_usedOneside = share(4)

        # Mining strategy. The check threshold is a strict comparison,
        # the remaining thresholds are inclusive.
        is_err = 1 if ratio_ft_check > self.thld_is_check else 0

        weight_call = 0
        if ratio_ft_path >= self.thld_is_path:
            weight_call = weight_call + self.weight_path
        if ratio_ft_stmt >= self.thld_is_stmt:
            weight_call = weight_call + self.weight_stmt
        if ratio_ft_usedOneside >= self.thld_is_useOneside:
            weight_call = weight_call + self.weight_useOneSide

        return [
            is_err, weight_call, ratio_ft_path, ratio_ft_stmt,
            ratio_ft_usedOneside
        ]

    def run(self):
        """Mine ``self.calee_featureList`` and dump the result.

        Writes the verdict and, per call site, its location, raw entry and
        binary row to ``Data/result.txt``; location and raw entry are also
        printed to standard output.

        Returns:
            The list returned by ``mining_err``.
        """
        ft_call = self.get_featrue(self.calee_featureList)
        mining_result = self.mining_err(ft_call)
        # The context manager closes the report file even if a location
        # query fails part-way through.
        with open("Data/result.txt", "w") as f:
            f.write("ft_call = \n")
            f.write("%s\n" % (mining_result,))
            for raw_feature, ft in zip(self.calee_featureList, ft_call):
                loc = self.query_loc_callsite(raw_feature[0])
                f.write("%s:\n" % (loc))
                print(loc)
                f.write("%s\n" % (raw_feature,))
                print(raw_feature)
                f.write("%s\n" % (ft,))

        return mining_result

    """
Ejemplo n.º 12
0
class MiningErrfuncShell:
    """Command-line driver that identifies return-value-sensitive functions.

    The constructor parses the command line and derives the per-project
    output paths; ``run`` then analyses every selected function.
    """

    def __init__(self):
        """Parse ``--type`` / ``--projectname`` and set the project paths.

        Side effects: sets ``gl.G_prjdata_dir``, ``gl.G_result_path``,
        ``gl.G_debuginfo_path`` and ``gl.G_xcallee_txt`` from the project
        name given on the command line.
        """
        self.db_provider = DBContentsProvider()
        parser = argparse.ArgumentParser(description='识别源代码项目中的返回值敏感型函数')
        parser.add_argument(
            '-t',
            "--type",
            required=True,
            type=str,
            choices=["select", "all"],
            help='select:只对config.py中设置的函数进行识别\n all:对源代码项目中所有的函数进行识别')
        parser.add_argument("-prj",
                            "--projectname",
                            required=True,
                            type=str,
                            help="待识别函数所在的源代码项目,将以此建立同名数据文件夹")
        self.args = parser.parse_args()

        gl.G_prjdata_dir = "%s/%s" % (gl.G_alldata_dir, self.args.projectname)
        gl.G_result_path = "%s/%s" % (gl.G_prjdata_dir, "xp_err.txt")
        gl.G_debuginfo_path = "%s/%s" % (gl.G_prjdata_dir, "degbug.txt")
        gl.G_xcallee_txt = "%s/%s" % (gl.G_prjdata_dir, "xcallee.txt")

    def run_gremlin_query(self, query):
        """Run ``query`` through the database provider and return its result."""
        return self.db_provider.run_gremlin_query(query)

    def set_fuctions_bechecked(self):
        """Return the names of the functions to analyse.

        ``--type all`` takes every callee name from the database,
        ``--type select`` takes ``gl.G_func_list``. Names listed in
        ``gl.G_func_unnormal`` (functions Joern cannot process, per the
        original comment) are excluded.

        Returns:
            A new list; the source list (in particular the shared
            ``gl.G_func_list``) is not modified.
        """
        func_list = []
        if self.args.type == "all":
            func_list = self.db_provider.query_allCallee_name()
        elif self.args.type == "select":
            func_list = gl.G_func_list
        excluded = gl.G_func_unnormal
        # Build a filtered copy: this drops every occurrence of an excluded
        # name and leaves the shared configuration list untouched.
        return [name for name in func_list if name not in excluded]

    def query_loc_callsite(self, callee_id):
        """Return ``"<filepath>: <location>"`` for the given callee node id."""
        query = """
        g.v(%s).statements.transform{[g.v(it.functionId).functionToFile.filepath, it.location]}
        """ % callee_id
        result = self.run_gremlin_query(query)
        loc = "%s: %s" % (result[0][0][0], result[0][1])
        return loc

    def is_xCallee(self, function_name):
        """Obtain the features of ``function_name`` and mine them.

        Features are loaded from ``<gl.G_prjdata_dir>/<function_name>.data``
        when that file exists; otherwise they are extracted with
        ``ExtractErrFunFeatures``.

        Returns:
            Whatever ``MiningxCalleeErr(feature_callees).run()`` returns.
            The original note describes it as ``[callee_counts, is_err,
            ratio_ft_check, ratio_ft_path, ratio_ft_stmt,
            ratio_ft_notusedTwoside]`` -- TODO confirm against that class.
        """
        # Obtain the feature data.
        savedinfo = "获取%s的特征数据\n" % function_name
        write_info(gl.G_debuginfo_path, savedinfo)
        datapath = gl.G_prjdata_dir + "/%s.data" % function_name
        if os.path.exists(datapath):
            # Reuse the features saved by an earlier run.
            datatmp = ObjDataAndBinFile.binfile2objdata(datapath)
            feature_callees = datatmp[0]
            flag_rvar_func = datatmp[1]
        else:
            extract_errfun_feature = ExtractErrFunFeatures(function_name)
            feature_callees, flag_rvar_func = extract_errfun_feature.run(
                flag_thread=False)

        # Mine the return-value-sensitive verdict from the features.
        write_info(gl.G_debuginfo_path, "获取%s的识别结果\n" % function_name)
        obj_MiningErrFunc = MiningxCalleeErr(feature_callees)
        mining_result = obj_MiningErrFunc.run()
        write_info(gl.G_debuginfo_path, mining_result)
        return mining_result

    def run(self):
        """Analyse every selected function and print the per-item locations.

        Creates the project data directory, logs a header to the result
        file, then for each function prints its index and name, mines it
        with ``is_xCallee`` and prints each result item with its call-site
        location. Returns ``None``.
        """
        # Create the output directory named after the project.
        result = make_dir(gl.G_prjdata_dir)
        write_info(gl.G_debuginfo_path, result)

        allCallee_name = self.set_fuctions_bechecked()
        num_func = len(allCallee_name)
        infoSaved = "\n# *%s* 中返回值敏感型函数识别:\n起始时间 = %s  待识别函数数量 = %s"\
                    %(self.args.projectname, datetime.datetime.now(),num_func)
        write_info(gl.G_result_path, infoSaved)

        for tmp_index, function_name in enumerate(allCallee_name):
            print("%d 识别%s\n" % (tmp_index, function_name))
            xp_tmp = self.is_xCallee(function_name)
            # Assumes each result item is indexable with a callee id first
            # -- TODO confirm against MiningxCalleeErr.run().
            for item in xp_tmp:
                loc = self.query_loc_callsite(item[0])
                print("%s %s" % (item, loc))
        return