Example #1
0
    def disc_gain_rt(self, index, data):
        """Compute the gain ratio of the discrete attribute in column ``index``.

        Class labels (the last element of every row) are counted per
        attribute value, and the resulting table is handed to
        ``cal_gain_ratio`` together with ``data``.

        Args:
            index: Column of the attribute; also the key into
                ``self.disc_type`` that yields the attribute's known values.
            data: Iterable of rows; ``row[index]`` is the attribute value and
                ``row[-1]`` is the class label.

        Returns:
            ``info_gain / info_measure`` as reported by ``cal_gain_ratio``,
            or ``-1`` when ``info_measure`` equals the ``-1`` sentinel.

        Raises:
            KeyError: If a row holds an attribute value that is not listed in
                ``self.disc_type[index]``.
        """
        # Seed every known value so that values absent from ``data`` still
        # appear (with an empty count table) in the statistics.
        # Shape: {attr_value: {label: count, ...}, ...}
        label_counts = {val: {} for val in self.disc_type[index]}

        for row in data:
            counts = label_counts[row[index]]
            counts[row[-1]] = counts.get(row[-1], 0) + 1

        info_gain, info_measure = cal_gain_ratio(label_counts, data)

        if info_measure == -1:
            return -1
        return info_gain / info_measure
Example #2
0
    def disc_gain_rt(self, index, data):
        """Compute the gain ratio of the discrete attribute in column ``index``.

        Class labels (the last element of every row) are counted per
        attribute value, and the resulting table is handed to
        ``cal_gain_ratio`` together with ``data``.

        Args:
            index: Column of the attribute; also the key into
                ``self.disc_type`` that yields the attribute's known values.
            data: Iterable of rows; ``row[index]`` is the attribute value and
                ``row[-1]`` is the class label.

        Returns:
            ``info_gain / info_measure`` as reported by ``cal_gain_ratio``,
            or ``-1`` when ``info_measure`` equals the ``-1`` sentinel.

        Raises:
            KeyError: If a row holds an attribute value that is not listed in
                ``self.disc_type[index]``.
        """
        # Seed every known value so that values absent from ``data`` still
        # appear (with an empty count table) in the statistics.
        # Shape: {attr_value: {label: count, ...}, ...}
        label_counts = {val: {} for val in self.disc_type[index]}

        for row in data:
            counts = label_counts[row[index]]
            counts[row[-1]] = counts.get(row[-1], 0) + 1

        info_gain, info_measure = cal_gain_ratio(label_counts, data)

        if info_measure == -1:
            return -1
        return info_gain / info_measure
Example #3
0
    def num_gain_rt(self, index, data):
        """Pick the best binary split border for the continuous attribute ``index``.

        Rows are sorted by column ``index``; every attribute value at which
        the class label (last element of a row) differs from the previous
        row's label becomes a candidate border. Each candidate is evaluated
        through ``binary_sp`` and ``cal_gain_ratio``, and the candidate with
        the largest information gain is selected.

        Args:
            index: Column of the continuous attribute.
            data: Sequence of rows; ``row[-1]`` is the class label.

        Returns:
            A ``(gain_ratio, border)`` tuple for the selected candidate, or
            ``(-1.0, 0.0)`` when no candidate qualifies (this includes empty
            ``data``).
        """
        sorted_data = sorted(data, key=itemgetter(index))

        # An empty data set used to crash with IndexError on sorted_data[0];
        # report the regular "no split found" result instead.
        if not sorted_data:
            return -1.0, 0.0

        # Only attribute values at which the label changes are candidates.
        candidates = set()
        prev_label = sorted_data[0][-1]
        for row in sorted_data:
            if row[-1] != prev_label:
                prev_label = row[-1]
                candidates.add(row[index])

        # NOTE: sys.float_info.min is the smallest *positive* normalized
        # float, so a candidate whose gain is zero is never selected.
        max_gain, border, gain_ratio = sys.float_info.min, 0.0, -1.0
        for candidate in candidates:
            left, right = binary_sp(data, candidate, index)
            # Shape: {'left': {label: count}, 'right': {label: count}}
            split_counts = {'left': left, 'right': right}

            info_gain, info_measure = cal_gain_ratio(split_counts, data)

            # -1 is the sentinel cal_gain_ratio uses for an unusable split.
            if info_measure == -1:
                continue
            if info_gain > max_gain:
                max_gain = info_gain
                border = candidate
                gain_ratio = info_gain / info_measure

        return gain_ratio, border
Example #4
0
    def num_gain_rt(self, index, data):
        """Pick the best binary split border for the continuous attribute ``index``.

        Rows are sorted by column ``index``; every attribute value at which
        the class label (last element of a row) differs from the previous
        row's label becomes a candidate border. Each candidate is evaluated
        through ``binary_sp`` and ``cal_gain_ratio``, and the candidate with
        the largest information gain is selected.

        Args:
            index: Column of the continuous attribute.
            data: Sequence of rows; ``row[-1]`` is the class label.

        Returns:
            A ``(gain_ratio, border)`` tuple for the selected candidate, or
            ``(-1.0, 0.0)`` when no candidate qualifies (this includes empty
            ``data``).
        """
        sorted_data = sorted(data, key=itemgetter(index))

        # An empty data set used to crash with IndexError on sorted_data[0];
        # report the regular "no split found" result instead.
        if not sorted_data:
            return -1.0, 0.0

        # Only attribute values at which the label changes are candidates.
        candidates = set()
        prev_label = sorted_data[0][-1]
        for row in sorted_data:
            if row[-1] != prev_label:
                prev_label = row[-1]
                candidates.add(row[index])

        # NOTE: sys.float_info.min is the smallest *positive* normalized
        # float, so a candidate whose gain is zero is never selected.
        max_gain, border, gain_ratio = sys.float_info.min, 0.0, -1.0
        for candidate in candidates:
            left, right = binary_sp(data, candidate, index)
            # Shape: {'left': {label: count}, 'right': {label: count}}
            split_counts = {'left': left, 'right': right}

            info_gain, info_measure = cal_gain_ratio(split_counts, data)

            # -1 is the sentinel cal_gain_ratio uses for an unusable split.
            if info_measure == -1:
                continue
            if info_gain > max_gain:
                max_gain = info_gain
                border = candidate
                gain_ratio = info_gain / info_measure

        return gain_ratio, border