Esempio n. 1
0
    def __prun_tree(self, cur_node):
        '''Prune the subtree rooted at ``cur_node`` in place, top-down.

        Compares the continuity-corrected error the node would make as a
        single leaf against the corrected error summed over the leaves of
        its subtree plus one standard error. If the single leaf is no
        worse, the children are dropped and the node becomes a leaf;
        otherwise each child is examined recursively.

        cur_node -- tree node exposing ``childNode`` (dict of children),
                    ``dataset`` (samples that reached the node) and ``cls``.

        Side effect: ``cur_node.cls`` is (re)assigned for every non-leaf
        node visited, whether or not it ends up being pruned.
        '''
        if len(cur_node.childNode) == 0:  # leaf node: nothing to prune
            return

        # Error of this node if it were collapsed into a single leaf,
        # with the 0.5 continuity correction.
        cur_node.cls = get_cls_from_data(cur_node.dataset)
        cur_err_sum = get_err_sum(cur_node.cls, cur_node.dataset) + 0.5

        # Error of the subtree: leaf_err_sum() fills the list (presumably
        # one error count per leaf -- confirm against its definition);
        # each entry receives its own 0.5 correction.
        leaf_err_set = []
        self.leaf_err_sum(cur_node, leaf_err_set)
        leaf_e_sum = sum(leaf_err_set) + 0.5 * len(leaf_err_set)

        sample_num = len(cur_node.dataset)
        leaf_err_ratio = leaf_e_sum / sample_num
        # Standard error of the error *count*: sqrt(N * p * (1 - p)).
        # The corrected error can exceed N when the subtree has many
        # leaves, which makes the radicand negative; clamp it so the
        # comparison below never silently fails on NaN.
        variance = sample_num * leaf_err_ratio * (1 - leaf_err_ratio)
        std_dev = np.sqrt(max(variance, 0.0))

        # Prune when the single leaf is no worse than subtree + 1 SE.
        if leaf_e_sum + std_dev >= cur_err_sum:
            # Single pre-formatted argument prints the same text under
            # both Python 2 and Python 3.
            print("%s %s %s" % (leaf_e_sum + std_dev, cur_err_sum,
                                "  prun!!!!"))
            cur_node.childNode = {}  # cls was already set above
        else:
            for child in cur_node.childNode.values():
                self.__prun_tree(child)
Esempio n. 2
0
    def __prun_tree(self, cur_node):
        '''Prune the subtree rooted at ``cur_node`` in place, top-down.

        Compares the continuity-corrected error the node would make as a
        single leaf against the corrected error summed over the leaves of
        its subtree plus one standard error. If the single leaf is no
        worse, the children are dropped and the node becomes a leaf;
        otherwise each child is examined recursively.

        cur_node -- tree node exposing ``childNode`` (dict of children),
                    ``dataset`` (samples that reached the node) and ``cls``.

        Side effect: ``cur_node.cls`` is (re)assigned for every non-leaf
        node visited, whether or not it ends up being pruned.
        '''
        if len(cur_node.childNode) == 0:  # leaf node: nothing to prune
            return

        # Error of this node if it were collapsed into a single leaf,
        # with the 0.5 continuity correction.
        cur_node.cls = get_cls_from_data(cur_node.dataset)
        cur_err_sum = get_err_sum(cur_node.cls, cur_node.dataset) + 0.5

        # Error of the subtree: leaf_err_sum() fills the list (presumably
        # one error count per leaf -- confirm against its definition);
        # each entry receives its own 0.5 correction.
        leaf_err_set = []
        self.leaf_err_sum(cur_node, leaf_err_set)
        leaf_e_sum = sum(leaf_err_set) + 0.5 * len(leaf_err_set)

        sample_num = len(cur_node.dataset)
        leaf_err_ratio = leaf_e_sum / sample_num
        # Standard error of the error *count*: sqrt(N * p * (1 - p)).
        # The corrected error can exceed N when the subtree has many
        # leaves, which makes the radicand negative; clamp it so the
        # comparison below never silently fails on NaN.
        variance = sample_num * leaf_err_ratio * (1 - leaf_err_ratio)
        std_dev = np.sqrt(max(variance, 0.0))

        # Prune when the single leaf is no worse than subtree + 1 SE.
        if leaf_e_sum + std_dev >= cur_err_sum:
            # Single pre-formatted argument prints the same text under
            # both Python 2 and Python 3.
            print("%s %s %s" % (leaf_e_sum + std_dev, cur_err_sum,
                                "  prun!!!!"))
            cur_node.childNode = {}  # cls was already set above
        else:
            for child in cur_node.childNode.values():
                self.__prun_tree(child)
Esempio n. 3
0
    def __construct_tree(self, cur_node, attr_list):
        '''
        Recursively build the decision tree below ``cur_node``.

        Picks the attribute in ``attr_list`` with the largest gain ratio,
        partitions ``cur_node.dataset`` on it and attaches one child per
        partition to ``cur_node.childNode``.

        cur_node  -- node to expand; its ``dataset`` holds the samples.
        attr_list -- indices of the attributes still available for
                     splitting (not modified; a copy is passed down).

        Discrete attributes (those that are keys of ``self.disc_type``)
        produce one child per listed value, keyed by that value, and set
        ``attr_type`` to 1. Any other attribute is treated as numeric:
        ``attr_type`` is 0, the threshold is stored in ``demark`` and the
        children are keyed 0 (value < threshold) and 1 (otherwise).
        '''
        data = cur_node.dataset
        data_classified = {}
        # float_info.min is the smallest *positive* float, so an attribute
        # is only selected when its gain ratio is strictly positive.
        max_gain_ratio, index = sys.float_info.min, -1
        num_border = 0.0

        for idx in attr_list:
            if idx in self.disc_type:  # discrete attribute
                gain_r, border = self.disc_gain_rt(idx, data), None
            else:  # numeric attribute
                gain_r, border = self.num_gain_rt(idx, data)

            if gain_r > max_gain_ratio:
                max_gain_ratio = gain_r
                index = idx
                # Keep the threshold that belongs to the winning attribute
                # only; a later, worse numeric attribute must not
                # overwrite it.
                num_border = border

        if index == -1:  # no attribute qualifies: make this node a leaf
            cur_node.cls = get_cls_from_data(data)
            return

        cur_node.attr_index = index

        if index in self.disc_type:  # discrete attribute
            cur_node.attr_type = 1

            # One bucket per known value, so unseen values yield empty
            # children instead of missing branches.
            for val in self.disc_type[index]:
                data_classified[val] = []
            for d in data:
                data_classified[d[index]].append(d)
        else:  # numeric attribute
            cur_node.attr_type = 0
            cur_node.demark = num_border

            data_classified[0] = []
            data_classified[1] = []
            for d in data:
                data_classified[0 if d[index] < num_border else 1].append(d)

        # When this was the last attribute, children cannot split further.
        is_last_attr = len(attr_list) == 1
        if not is_last_attr:
            sub_attr = list(attr_list)
            sub_attr.remove(index)

        for k, v in data_classified.items():
            child_node = TreeNode(v)
            if len(v) == 0:
                # Empty partition: fall back to the class derived from
                # the current node's data.
                child_node.cls = get_cls_from_data(data)
            elif is_last_attr:
                child_node.cls = get_cls_from_data(v)
            elif check_purity(v) == 1:
                child_node.cls = v[0][-1]  # pure: any sample's label will do
            else:
                self.__construct_tree(child_node, sub_attr)  # recurse
            cur_node.childNode[k] = child_node
Esempio n. 4
0
    def __construct_tree(self, cur_node, attr_list):
        '''
        Recursively build the decision tree below ``cur_node``.

        Picks the attribute in ``attr_list`` with the largest gain ratio,
        partitions ``cur_node.dataset`` on it and attaches one child per
        partition to ``cur_node.childNode``.

        cur_node  -- node to expand; its ``dataset`` holds the samples.
        attr_list -- indices of the attributes still available for
                     splitting (not modified; a copy is passed down).

        Discrete attributes (those that are keys of ``self.disc_type``)
        produce one child per listed value, keyed by that value, and set
        ``attr_type`` to 1. Any other attribute is treated as numeric:
        ``attr_type`` is 0, the threshold is stored in ``demark`` and the
        children are keyed 0 (value < threshold) and 1 (otherwise).
        '''
        data = cur_node.dataset
        data_classified = {}
        # float_info.min is the smallest *positive* float, so an attribute
        # is only selected when its gain ratio is strictly positive.
        max_gain_ratio, index = sys.float_info.min, -1
        num_border = 0.0

        for idx in attr_list:
            if idx in self.disc_type:  # discrete attribute
                gain_r, border = self.disc_gain_rt(idx, data), None
            else:  # numeric attribute
                gain_r, border = self.num_gain_rt(idx, data)

            if gain_r > max_gain_ratio:
                max_gain_ratio = gain_r
                index = idx
                # Keep the threshold that belongs to the winning attribute
                # only; a later, worse numeric attribute must not
                # overwrite it.
                num_border = border

        if index == -1:  # no attribute qualifies: make this node a leaf
            cur_node.cls = get_cls_from_data(data)
            return

        cur_node.attr_index = index

        if index in self.disc_type:  # discrete attribute
            cur_node.attr_type = 1

            # One bucket per known value, so unseen values yield empty
            # children instead of missing branches.
            for val in self.disc_type[index]:
                data_classified[val] = []
            for d in data:
                data_classified[d[index]].append(d)
        else:  # numeric attribute
            cur_node.attr_type = 0
            cur_node.demark = num_border

            data_classified[0] = []
            data_classified[1] = []
            for d in data:
                data_classified[0 if d[index] < num_border else 1].append(d)

        # When this was the last attribute, children cannot split further.
        is_last_attr = len(attr_list) == 1
        if not is_last_attr:
            sub_attr = list(attr_list)
            sub_attr.remove(index)

        for k, v in data_classified.items():
            child_node = TreeNode(v)
            if len(v) == 0:
                # Empty partition: fall back to the class derived from
                # the current node's data.
                child_node.cls = get_cls_from_data(data)
            elif is_last_attr:
                child_node.cls = get_cls_from_data(v)
            elif check_purity(v) == 1:
                child_node.cls = v[0][-1]  # pure: any sample's label will do
            else:
                self.__construct_tree(child_node, sub_attr)  # recurse
            cur_node.childNode[k] = child_node