예제 #1
0
def PostPruning_PEP(Tree, labels, dataSet):
    """Prune a decision tree top-down using pessimistic error pruning (PEP).

    At each internal node the continuity-corrected error of replacing the
    node with a single leaf is compared against the corrected error of the
    subtree plus one standard error. If the single leaf is strictly better,
    the whole subtree is replaced by the majority class of ``dataSet``;
    otherwise every dict-valued child is pruned recursively.

    Note: ``Tree`` is modified in place; pruned children are written back
    into the nested dicts of the tree that was passed in.

    :param Tree: decision (sub)tree as a nested dict of the form
        ``{feature_name: {branch_key: subtree_or_leaf}}``. A branch key may
        hold several feature values joined by commas.
    :param labels: list of feature names; must contain the root feature of
        ``Tree``.
    :param dataSet: samples reaching this node; the last element of each
        sample is read as its class label.
    :return: the majority class label if this node is pruned, otherwise
        ``Tree`` itself (with its children possibly pruned).
    """
    classList = [example[-1] for example in dataSet]
    majorClass = majorityCnt(classList)  # most frequent class at this node

    # Error of node t treated as a leaf, with the 1/2 continuity correction.
    et = nodeError(dataSet) + 0.5
    Nt = getNumLeaf(Tree)  # number of leaves in subtree Tt
    # Error summed over the leaves of Tt, with 1/2 correction per leaf.
    eTt = leafError(Tree, labels, dataSet) + Nt / 2
    nt = len(dataSet)  # number of training samples at node t

    # Standard error of the subtree's corrected error. It is only defined
    # while the corrected error stays below the sample count; otherwise the
    # square-root argument would be non-positive, so 0 is used instead.
    SeTt = np.sqrt(eTt * (nt - eTt) / nt) if nt > eTt else 0

    if et < eTt + SeTt:
        # Collapsing to a leaf is strictly better: prune to the majority class.
        return majorClass

    firstFeat = list(Tree)[0]  # feature tested at the root of this subtree
    secondDict = Tree[firstFeat]  # mapping: branch key -> child
    labelIndex = labels.index(firstFeat)
    # Feature names left for the children once the root feature is removed.
    subLabels = labels[:labelIndex] + labels[labelIndex + 1:]

    # Only values of existing keys are reassigned, so iterating the live
    # dict view is safe here.
    for key, child in secondDict.items():
        if isinstance(child, dict):
            # A branch key can bundle several feature values ("a,b,c").
            items = key.split(',')
            # presumably returns the samples whose feature value is in
            # `items`, with that feature column dropped - verify against
            # splitDataSet
            subDataSet = splitDataSet(dataSet, labelIndex, items)
            secondDict[key] = PostPruning_PEP(child, subLabels, subDataSet)
    return Tree
예제 #2
0
def calErrorRatio(Tree, labels, dataSet, NT, infoSet):
    """Compute the error increase rate of a tree node and of all its internal
    descendants.

    The tree is walked depth-first. For every dict-valued child a record is
    built by a recursive call, tagged with the branch key that leads to it,
    and appended to ``infoSet``. The record for ``Tree`` itself is returned
    to the caller but is NOT appended to ``infoSet`` by this call.

    :param Tree: decision (sub)tree as a nested dict of the form
        ``{feature_name: {branch_key: subtree_or_leaf}}``.
    :param labels: list of feature names; must contain the root feature of
        ``Tree``.
    :param dataSet: samples reaching this node.
    :param NT: total number of samples in the whole data set; used as the
        denominator of both error rates.
    :param infoSet: list accumulating the records of all visited descendant
        nodes; it is appended to in place.
    :return: tuple ``(info, infoSet)`` where ``info`` is a dict with keys
        ``'Tree'`` (this subtree), ``'NumLeaf'`` (its leaf count) and ``'a'``
        (its error increase rate). Records stored in ``infoSet`` additionally
        carry ``'keys'``, the branch key under which the subtree hangs.
    """
    firstFeat = list(Tree)[0]  # feature tested at the root of this subtree
    secondDict = Tree[firstFeat]  # mapping: branch key -> child
    labelIndex = labels.index(firstFeat)
    # Feature names left for the children once the root feature is removed.
    subLabels = labels[:labelIndex] + labels[labelIndex + 1:]

    for key, child in secondDict.items():
        if isinstance(child, dict):
            # A branch key can bundle several feature values ("a,b,c").
            items = key.split(',')
            # presumably returns the samples whose feature value is in
            # `items`, with that feature column dropped - verify against
            # splitDataSet
            subDataSet = splitDataSet(dataSet, labelIndex, items)
            info, infoSet = calErrorRatio(child, subLabels, subDataSet,
                                          NT, infoSet)
            # The child's record is freshly built without a 'keys' entry, so
            # a plain assignment records the branch key leading to it.
            info['keys'] = key
            infoSet.append(info)

    Rt = nodeError(dataSet) / NT  # error rate of this node taken as a leaf
    RTt = leafError(Tree, labels, dataSet) / NT  # error rate of the subtree
    Nt = getNumLeaf(Tree)  # number of leaves in the subtree
    # With a single leaf the denominator (Nt - 1) would be zero; the fixed
    # value 2.0 is used as a placeholder instead.
    a = 2.0 if Nt == 1 else (Rt - RTt) / (Nt - 1)
    info = {'Tree': Tree, 'NumLeaf': Nt, 'a': a}
    return info, infoSet
예제 #3
0
print('trainSet=\n', labels, '\n')
for data in trainSet:
    print(data)
print('-------------------------')
print('testSet=\n', labels, '\n')
for data in testSet:
    print(data)
print('-------------------------')
'''

"""对ID3算法生成的决策树,进行剪枝"""
''''''
print('===========================================')
# Build the unpruned decision tree from the training set with ID3.
# Copies of trainSet and labels are passed so the originals stay intact.
ID3TrainTree = createID3Tree(list(trainSet), list(labels))
print('The ID3 Decision Tree:', 'Depth:', getTreeDepth(ID3TrainTree), ';Leaf:', getNumLeaf(ID3TrainTree))
print('The Node with largest number is', findKeyNode(ID3TrainTree))

# Pre-pruning: build a tree from the training set, using the test set to
# decide on pruning.
PreID3Tree = PrePruning(list(trainSet), testSet, list(labels))
print('The Pre_Pruning_ID3 Decision Tree:', 'Depth:', getTreeDepth(PreID3Tree), ';Leaf:', getNumLeaf(PreID3Tree))
print('The Node with largest number is', findKeyNode(PreID3Tree))

# Post-pruning with the REP method, using the test set.
REPID3Tree = PostPruning_REP(list(trainSet), testSet, list(labels))
print('The REP_Pruning_ID3 Decision Tree:', 'Depth:', getTreeDepth(REPID3Tree), ';Leaf:', getNumLeaf(REPID3Tree))
print('The Node with largest number is', findKeyNode(REPID3Tree))

# Post-pruning with the PEP method. Unlike the two calls above, no test set
# is passed. PostPruning_PEP writes pruned children back into the tree it
# receives, so ID3TrainTree itself is modified by this call.
# NOTE(review): this passes `dataSet` rather than `trainSet` - confirm that
# is the intended data set.
PEPID3Tree = PostPruning_PEP(ID3TrainTree, list(labels), list(dataSet))
print('The PEP_Pruning_ID3 Decision Tree:', 'Depth:', getTreeDepth(PEPID3Tree), ';Leaf:', getNumLeaf(PEPID3Tree))