def leaf_err_sum(self, cur_node, err_set):
    """Collect the error count of every leaf below ``cur_node``.

    Helper for pessimistic pruning: the per-leaf error counts gathered here
    are what the caller sums up to estimate the error of a whole subtree.

    Args:
        cur_node: Root of the subtree to walk. It must expose ``childNode``
            (a dict of child nodes, empty for a leaf), ``dataset`` and
            ``cls``.
        err_set: List that is extended in place with one entry per leaf,
            in the iteration order of each ``childNode`` dict. A leaf whose
            dataset is empty contributes ``0``; any other leaf contributes
            ``get_err_sum(leaf.cls, leaf.dataset)``.

    Returns:
        None. The result is delivered through ``err_set``.
    """
    if not cur_node.childNode:  # leaf node
        # len() instead of truthiness: the dataset type is not visible here
        # and array-like containers do not support a plain truth test.
        if len(cur_node.dataset) == 0:
            err_set.append(0)
        else:
            err_set.append(get_err_sum(cur_node.cls, cur_node.dataset))
        return

    # Internal node: every child is handled the same way, whether it is an
    # empty leaf, a populated leaf or another internal node.
    for child in cur_node.childNode.values():
        self.leaf_err_sum(child, err_set)
def __prun_tree(self, cur_node):
    """Prune the subtree rooted at ``cur_node`` with pessimistic pruning.

    For an internal node, the error count it would have as a single leaf is
    compared with the summed error count of the leaves below it. Both
    counts get a 0.5 correction per leaf. If the subtree's count plus one
    standard deviation exceeds the node's own count, the children are
    dropped and the node becomes a leaf labelled with its majority class.
    Otherwise each child is examined recursively.

    Args:
        cur_node: Node to examine. It must expose ``childNode`` (dict of
            children), ``dataset`` and ``cls``. An internal node is expected
            to carry a non-empty dataset, because the error ratio divides
            by its length.

    Returns:
        None. The tree is modified in place; ``cur_node.cls`` is refreshed
        for every internal node that is visited.
    """
    if not cur_node.childNode:  # leaves have nothing to prune
        return

    # Error count if this node were collapsed into a single leaf.
    cur_node.cls = get_cls_from_data(cur_node.dataset)
    cur_err_sum = get_err_sum(cur_node.cls, cur_node.dataset) + 0.5

    # Error count of the existing subtree: one 0.5 correction per leaf.
    leaf_err_set = []
    self.leaf_err_sum(cur_node, leaf_err_set)
    leaf_e_sum = sum(leaf_err_set) + 0.5 * len(leaf_err_set)

    sample_num = len(cur_node.dataset)
    # The per-leaf correction can push the ratio above 1 when there are
    # many leaves and few samples; clamp it so the square root below never
    # sees a negative value (which would yield NaN and silently disable
    # pruning for exactly the worst subtrees).
    leaf_err_ratio = min(leaf_e_sum / sample_num, 1.0)
    # Standard deviation of the error *count* (binomial: sqrt(N*p*(1-p))),
    # so that it is on the same scale as the counts it is added to.
    std_dev = np.sqrt(sample_num * leaf_err_ratio * (1 - leaf_err_ratio))

    if leaf_e_sum + std_dev > cur_err_sum:
        # Same text as the former print statement, valid on Python 2 and 3.
        print("%s %s  prun!!!!" % (leaf_e_sum + std_dev, cur_err_sum))
        # cur_node.cls already holds the majority class computed above.
        cur_node.childNode = {}
    else:
        for child in cur_node.childNode.values():
            self.__prun_tree(child)