def show(self):
    """Plot the fitted decision tree with matplotlib.

    Raises:
        NotFittedError: if `fit` has not been called yet.
    """
    # `is None` is the idiomatic identity test (`== None` can be fooled by
    # custom __eq__ implementations).
    if self._tree is None:
        raise NotFittedError("Estimator not fitted, call `fit` first")
    # plot the tree using matplotlib
    import treePlotter
    treePlotter.createPlot(self._tree)
def show(self, outpdf):
    """Render the fitted decision tree to `outpdf` with matplotlib.

    Raises:
        ValueError: if the estimator has not been fitted yet.
    """
    # The original guard was a no-op `pass`, letting createPlot fail later
    # on a None tree with an obscure error; fail fast instead.
    if self._tree is None:
        raise ValueError("Estimator not fitted, call `fit` first")
    # plot the tree using matplotlib
    import treePlotter
    treePlotter.createPlot(self._tree, outpdf)
def lenses():
    """Train a decision tree on the lenses dataset, print and plot it."""
    # `with` guarantees the file handle is closed (the original leaked it).
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)  # py3 print call (was a py2 print statement)
    # plot the tree using matplotlib
    import treePlotter
    treePlotter.createPlot(lensesTree)
def main():
    """Demo: build a decision tree, plot it, then classify the test set."""
    data, feature_names = createDataSet()
    # createTree mutates the label list it is given, so hand it a copy.
    tree = createTree(data, feature_names[:])
    #storeTree(tree, 'classifierStorage.txt')
    #tree = grabTree('classifierStorage.txt')
    print('desicionTree:\n', tree)
    treePlotter.createPlot(tree)
    samples = createTestSet()
    print('classifyResult:\n', classifyAll(tree, feature_names, samples))
def main():
    """Load the car dataset, grow a decision tree from it, and plot it."""
    labels = ['buying', 'maintenance', 'doors', 'persons', 'lug_boot', 'safety']
    # One record per comma-separated line of the data file.
    with open('car_data') as f:
        data_set = [row.strip().split(',') for row in f.readlines()]
    decision_tree = create_tree(data_set, labels)
    #print "decision_tree", decision_tree
    treePlotter.createPlot(decision_tree)
def main_bak():
    """Older demo entry point: train a tree on the toy dataset and plot it."""
    data_set, labels = create_data_set()
    # create_tree consumes the label list, so pass it a copy.
    decision_tree = create_tree(data_set, labels[:])
    # print "decision_tree", decision_tree
    #test_set = create_test_set()  # validation data
    #result = classify_all(decision_tree, labels, test_set)
    #print "result", result
    treePlotter.createPlot(decision_tree)
secondDict = inputTree[firstStr] #子树 featIndex = featLabels.index(firstStr) #找该属性对应的序号 for key in secondDict.keys(): #遍历子树,判断属于哪一分支 if testVec[featIndex] == key: if type(secondDict[key]).__name__=='dict': #该结点属于分支结点 classLabel = classify(secondDict[key],featLabels,testVec) else: #叶子结点 classLabel = secondDict[key] return classLabel """存储决策树""" def storeTree(inputTree,fileName): fw = open(fileName,'w') pickle.dump(inputTree,fw) fw.close() """从磁盘加载决策树""" def grabTree(fileName): fr = open(fileName) return pickle.load(fr) if __name__ == '__main__': myDat,labels = createDataSet() #myDat[0][-1] = 'maybe' #entropy = calcShannonEnt(myDat) mytree = createTree(myDat,labels) print mytree createPlot(mytree) storeTree(mytree,"./tree.model")
def abalone_parts_test(): model = { 'Viscera': { '>0.0145': { 'Shell': { '<=0.0345': { 'Viscera': { '<=0.0285': ' 5 (50.0/9.0)', '>0.0285': ' 4 (3.0)' } }, '>0.0345': { 'Sex': { '=M': ' 6 (6.0/3.0)', '=F': ' 5 (3.0)', '=I': ' 5 (59.0/12.0)' } } } }, '<=0.0145': { 'Shucked': { '>0.007': ' 4 (66.0/31.0)', '<=0.007': { 'Shucked': { '>0.0045': { 'Shucked': { '>0.005': { 'Height': { '<=0.02': ' 4 (2.0)', '>0.02': ' 3 (4.0)' } }, '<=0.005': ' 4 (3.0)' } }, '<=0.0045': { 'Height': { '<=0.025': ' 1 (2.0/1.0)', '>0.025': ' 3 (2.0)' } } } } } } } } #---------get Attribute list-------------------------- name_path = './abalone.names' feature_list = get_Attribute(name_path) #-----------get datasets------------------------ path = './abalone_parts.data' datasets = read_data(path) # #--------Start PEP_pruning--------------------------- model_pruned = PEP_result(copy.deepcopy(model), feature_list, datasets) print "剪枝前的模型=", model print "剪枝后的模型=", model_pruned createPlot(model) createPlot(model_pruned) #--------Start accuracy computation--------------------------- print "unpruned_accuracy,pruned_accuracy", accuracy_analysis( model, model_pruned, datasets, feature_list, name_path)
# The large block of commented-out matplotlib/annotation scratch experiments
# was dead code and has been removed; only the working demo remains.
import treePlotter

# Plot the canned example tree shipped with treePlotter.
treePlotter.createPlot(treePlotter.retrieveTree(0))
def plot_tree(self):
    """
    Visualize the generated CART tree with matplotlib.

    Opens a 12x12-inch, 400-dpi figure and draws `self.tree` through
    treePlotter.createPlot.
    """
    # `figure` presumably comes from a pylab-style matplotlib import — confirm.
    figure(dpi=400, figsize=(12, 12))
    treePlotter.createPlot(self.tree)
# Build the toy dataset, train a decision tree on it, and draw the result.
import treePlotter
import trees

dataset, feature_labels = trees.createDataSet()
tree = trees.createTree(dataset, feature_labels)
treePlotter.createPlot(tree)
# --- tail of splitDataSet(): return the filtered subset ---
    return retDataSet

def createTree(dataSet, labels):
    # Recursively grow a decision tree represented as nested dicts.
    classList = [example[-1] for example in dataSet] # e.g. ['N', 'N', 'Y', 'Y', 'Y', 'N', 'Y']
    if classList.count(classList[0]) == len(classList): # all class labels identical: stop splitting
        return classList[0] # e.g. splitDataSet(dataSet, 0, 0) is all 'N', return 'N'
    # if len(dataSet[0]) == 1: #[0, 0, 0, 0, 'N']
    #     # all features consumed: return the majority class
    #     return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet) #0-> 2 # feature with the largest gain ratio
    bestFeatLabel = labels[bestFeat] #outlook -> windy
    myTree = {bestFeatLabel:{}} # nested dicts encode the tree, e.g. {'outlook': {0: 'N'
    del(labels[bestFeat]) #['temperature', 'humidity', 'windy'] -> ['temperature', 'humidity']
    featValues = [example[bestFeat] for example in dataSet] #[0, 0, 1, 2, 2, 2, 1]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:] #['temperature', 'humidity', 'windy']
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels) # split the data, preparing the next level
    return myTree

dataSet, labels = createDataSet()
labels_tmp = labels[:]  # copy: createTree mutates the label list
desicionTree = createTree(dataSet, labels_tmp)
print(desicionTree)
treePlotter.createPlot(desicionTree)
# --- interior of classify(): follow branches until a leaf label is found ---
firstStr = inputTree.keys()[0]  # root split feature (py2: keys() is indexable)
secondDict = inputTree[firstStr]  # subtree under that feature
featureIndex = featureLabels.index(firstStr)  # column index of the feature
for key in secondDict.keys():
    if testVector[featureIndex] == key:
        if type(secondDict[key]).__name__ == 'dict':  # internal node: recurse
            classLabel = classify(secondDict[key], featureLabels, testVector)
        else:  # leaf: its value is the class
            classLabel = secondDict[key]
return classLabel

def storeTree(inputTree, filename):
    #store the decision tree that had been trained.
    # NOTE(review): text mode 'w'/'r' for pickle works only on Python 2;
    # Python 3 needs 'wb'/'rb'.
    import pickle
    fw = open(filename, 'w')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    #get the tree that was stored in the 'filename'.
    # (the file handle is never closed here)
    import pickle
    fr = open(filename)
    return pickle.load(fr)

if __name__=="__main__":
    fr = open('lenses.txt')  #open the file 'lenses.txt'
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]  #dispose the file.
    lensesLabels = ['age','prescript','astigmatic','tearRate']  #set labels.
    lensesTree = createTree(lenses, lensesLabels)  #create tree.
    print lensesTree,'\n\n\n'  #print lenses tree in text (Python 2 print statement)
    import treePlotter
    treePlotter.createPlot(lensesTree)  #print lenses tree in diagram
# coding=utf-8
from trees import *
import treePlotter

# Earlier experiments with the toy dataset (entropy, best split, classify)
# are retired; this script now just trains on the lenses data and plots it.
dataSet, labels = fileToDataSet("/media/yuan/Windows8_OS/machinelearninginaction/Ch03/lenses.txt")
tree = createTree(dataSet, labels)
treePlotter.createPlot(tree)
# Smoke-test the trees/treePlotter modules step by step.
# Defect fixed: the Python 2 print statements are now py3 print calls
# (all single-argument, so output is unchanged on either interpreter).
import trees
import treePlotter

myDat, labels = trees.createDataSet()
print(myDat)
print(trees.calcShannonEnt(myDat))
print(trees.splitDataSet(myDat, 0, 1))
print(trees.splitDataSet(myDat, 0, 0))
print(trees.splitDataSet(myDat, 1, 1))
print(trees.chooseBestFeatureToSplit(myDat))
print(trees.createTree(myDat, labels))
treePlotter.createPlot()
print('createPlot over')
print(treePlotter.retrieveTree(1))
myTree = treePlotter.retrieveTree(0)
print(treePlotter.getNumLeafs(myTree))
print(treePlotter.getTreeDepth(myTree))
def main():
    """Train a tree on the toy dataset and render it."""
    samples, feature_names = create_data_set()
    tree = create_tree(samples, feature_names)
    #print "my_tree", my_tree
    treePlotter.createPlot(tree)
# --- interior of createTree() ---
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # [0, 0, 0, 0, 'N']; all features used: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # 0 -> 2; feature with the largest gain ratio
    bestFeatLabel = labels[bestFeat]  # outlook -> windy
    myTree = {bestFeatLabel:{}}  # nested dicts encode the tree {'outlook': {0: 'N'
    del(labels[bestFeat])  # ['temperature', 'humidity', 'windy'] -> ['temperature', 'humidity']
    featValues = [example[bestFeat] for example in dataSet]  # [0, 0, 1, 2, 2, 2, 1]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # ['temperature', 'humidity', 'windy']
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)  # split the data, preparing the next level
    return myTree

##################################################################
## tests below
dataSet, labels = createDataSet();
labels_tmp = labels[:]
desicionTree = createTree(dataSet, labels_tmp)  # createTree del()s from labels, hence the temporary copy
treePlotter.createPlot(desicionTree)  # draw the decision tree

def classify(inputTree, featLabels, testVec):  # classify new data
    # in -> tree, feature labels, test vector; out -> class label; walks the tree
    firstStr = list(inputTree.keys())[0]  # ['outlook'], outlook
    secondDict = inputTree[firstStr]  # {0: 'N', 1: 'Y', 2: {'windy': {0: 'Y', 1: 'N'}}}
    featIndex = featLabels.index(firstStr)  # column index of outlook: 0
    for key in secondDict.keys():  # secondDict.keys() = [0, 1, 2]
        if testVec[featIndex] == key:  # follow the branch matching the test vector's feature value
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)  # still a dict: descend
            else:
                classLabel = secondDict[key]  # reached a plain class label: done
    return classLabel

print(classify(desicionTree, labels, [0, 1, 0, 0]))  # N

##################################################################
## classify many samples
def classifyAll(inputTree, featLabels, testDataSet):
    # in -> tree, feature labels, test set; out -> list of labels
    classLabelAll = []
    for testVec in testDataSet:
        classLabelAll.append(classify(inputTree, featLabels, testVec))  # classify each item in turn
# Train an ID3 classifier on the lenses data and plot the resulting tree.
# Defects fixed: the file handle was never closed, and the Python 2 print
# statements are now py3 print calls (all single-argument, same output).
import id3
import treePlotter

if __name__ == '__main__':
    with open('lenses.txt') as f:
        names = f.readline().strip().split('\t')  # header: feature names
        x = []  # feature rows
        y = []  # class labels (last column)
        for ele in f.readlines():
            t = ele.strip().split('\t')
            x.append(t[:-1])
            y.append(t[-1])
    print(names)
    print(x)
    print(y)
    Classifier = id3.ID3(names, x, y)
    ans = Classifier.result()
    print(ans)
    treePlotter.createPlot(ans)
def testC45(filename):
    """Read `filename`, build a C4.5 tree from it, and plot the result."""
    records, class_labels = trees.file2strlist(filename)
    tree = hw4.createTree(records, class_labels)
    treePlotter.createPlot(tree)
#myTree['no surfacing'][3] = 'maybe'
#tp.createPlot(myTree)

# Defects fixed: Python 2 print statements -> py3 print calls (same output),
# and the lenses file handle is now closed via `with`.
#classify
print('-------------- classify --------------------')
myDat, labels = createDataSet()
print('labels', labels)
myTree = tp.retrieveTree(0)
print('myTree ', myTree)
print('[1,0]: ', classify(myTree, labels, [1, 0]))
print('[1,1]: ', classify(myTree, labels, [1, 1]))

#store and grab tree
print('-------------- store and grab tree --------------------')
storeTree(myTree, 'classifierStorate.txt')
newTree = grabTree('classifierStorate.txt')
print('grabedTree: ', newTree)

#Example1: choose suitable lens type
print('-------------- Eg1: choose suitable lens type --------------------')
with open('lenses.txt') as fr:
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age', 'prescript', 'astog,atoc', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
print(lensesTree)
tp.createPlot(lensesTree)
# Exercise the trees module: entropy, best split, canned-tree classification,
# and a pickle round-trip of the tree. (Disabled experiments noted inline.)
print(features)
print(trees.calcShannonEnt(mydata))
# (disabled) relabel a sample and re-measure the entropy
#print(trees.splitDataSet(mydata,0,1))
index = trees.chooseBestFeatureToSplit(mydata)
#print(index)
# (disabled) build and print a tree from scratch
import treePlotter
# (disabled) plot the canned tree before and after tweaking a branch
mytree = treePlotter.retrieveTree(0)
print(trees.classify(mytree, features, [0, 0]))
print(trees.classify(mytree, features, [1, 1]))
trees.storeTree(mytree, 'classifier.txt')
grabtree = trees.grabTree('classifier.txt')
print(grabtree)
# -*- coding: utf-8 -*- import sys import os import numpy as np import matplotlib.pyplot as plt import treePlotter as tp # 配置utf-8输出环境 reload(sys) sys.setdefaultencoding('utf-8') # 绘制树 myTree = {'root': {0: 'leaf node', 1: {'level 2': {0: 'leaf node', 1: 'leaf node'}},2:{'level2': {0: 'leaf node', 1: 'leaf node'}}}} tp.createPlot(myTree)
if __name__ == '__main__':
    # Feature encoding:
    #   weather: 0-sunny, 1-windy, 2-rainny
    #   parents: 0-yes, 1-no
    #   money: 0-rich, 1-poor
    #   decison: 0-cinema, 1-tennis, 2-stay in, 3-shopping
    data = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0], [2, 0, 1], [2, 1, 0],
                     [2, 0, 1], [1, 1, 1], [1, 1, 0], [1, 0, 0], [0, 1, 0]])
    label = np.array([0, 1, 0, 0, 2, 0, 0, 3, 0, 1])
    # Fit and plot one tree per splitting criterion (ID3, C4.5, CART).
    for method in ('ID3', 'C45', 'CART'):
        model = DecisionTree(cls_method=method)
        model.fit(data, label)
        print(model.tree)
        createPlot(model.tree)
def abalone_test(m):
    # MEP (minimum error pruning) demo on a hand-written C4.5-style abalone
    # tree; `m` is the m-estimate parameter. Leaf strings use the C4.5 dump
    # format ' <class> (<n>/<errors>)'.
    model = { 'Viscera': { '>0.0145': { 'Shell': { '<=0.0345': { 'Viscera': { '<=0.0285': ' 5 (50.0/9.0)', '>0.0285': ' 4 (3.0)' } }, '>0.0345': { 'Sex': { '=M': ' 6 (6.0/3.0)', '=F': ' 5 (3.0)', '=I': ' 5 (59.0/12.0)' } } } }, '<=0.0145': { 'Shucked': { '>0.007': ' 4 (66.0/31.0)', '<=0.007': { 'Shucked': { '>0.0045': { 'Shucked': { '>0.005': { 'Height': { '<=0.02': ' 4 (2.0)', '>0.02': ' 3 (4.0)' } }, '<=0.005': ' 4 (3.0)' } }, '<=0.0045': { 'Height': { '<=0.025': ' 1 (2.0/1.0)', '>0.025': ' 3 (2.0)' } } } } } } } }
    path = "./abalone_parts.data"
    name_path = "./abalone.names"
    fea_list = get_Attribute(name_path)
    datasets = read_data(path)
    pae_dict, class_count = pae_list(
        path)  # must be fixed before pruning and kept constant throughout; do not recompute inside the recursion
    #Attention,if you want to perform Laplace Law of succession,just set:
    #pae_list=1.0/m
    #m=counts of classes of the whole original datasets
    pae_lists = [pae_dict[key] for key in pae_dict]  # list of prior probabilities
    class_list = [key for key in class_count]  # class labels of the dataset
    model_pruned = MEP_result(copy.deepcopy(model), fea_list,
                              copy.deepcopy(datasets), pae_lists, class_list, m)
    accuracy_unprune, accuracy_prune, misjudge_datasets = accuracy_analysis(
        model, model_pruned, copy.deepcopy(datasets), fea_list, name_path)
    # Python 2 print statements below.
    print "accuracy_unprune=", accuracy_unprune
    print "accuracy_prune=", accuracy_prune
    createPlot(model)
    createPlot(model_pruned)
    print "model=", model
    print "model_pruned=", model_pruned
def show(self, outpdf):
    """Render the fitted decision tree to `outpdf` with matplotlib.

    Raises:
        ValueError: if the estimator has not been fitted yet.
    """
    # The original guard was a no-op `pass`, letting createPlot fail later
    # on a None tree with an obscure error; fail fast instead.
    if self._tree is None:
        raise ValueError("Estimator not fitted, call `fit` first")
    #plot the tree using matplotlib
    import treePlotter
    treePlotter.createPlot(self._tree, outpdf)
# --- interior of the game loop: per-action bullet bookkeeping ---
# (the enclosing if/for/while headers lie before this fragment; the
# indentation below is a best-effort reconstruction — verify in context)
        elif each_round[i] == 1 or each_round[i] == -2:
            players[i].Bullet -= 1
        elif each_round[i] == 2:
            players[i].Bullet -= 2
        # print "Player %d's bullet: %d" % ((i + 1), players[i].Bullet)
        # settle the round's result
        for i in range(people):
            if players[i].status == 1:
                if (each_round[i] + Max) > 0 and each_round[i] != Max:
                    print "Player %d lose" % (i + 1)
                    players[i].status = 0
                    losers += 1
        if losers == (people - 1):
            for i in range(people):
                # players[i].Rounds += 1
                if players[i].status == 1:
                    print "Player %d win" % (i + 1)
                    result[i] = 1
                    # players[i].Vtimes += 1
                round_history[i + iteration * people][-1] = str(result[i])
            break
        count += 1
    # reset every player for the next iteration
    for i in range(people):
        players[i].Bullet = 0
        players[i].status = 1
    iteration += 1
# Train a tree on the recorded rounds, persist it, and plot it.
mytree = createTree(round_history[:7*iteration])
storeTree(mytree, "Tree.txt")
createPlot(mytree)
def showTree(tree):
    """Render `tree` (a nested-dict decision tree) via treePlotter."""
    # deferred import: the plotting dependency is only loaded when needed
    import treePlotter
    treePlotter.createPlot(tree)
# Quick check of treePlotter's canned trees and tree metrics.
# Defect fixed: Python 2 print statements -> py3 print calls
# (all single-argument, so output is unchanged on either interpreter).
import treePlotter as tp

print(tp.retrieveTree(0))
print(tp.retrieveTree(1))
myTree = tp.retrieveTree(0)
print(tp.getNumLeafs(myTree))
print(tp.getTreeDepth(myTree))
# tp.createPlot(myTree)
tp.createPlot(tp.retrieveTree(1))
####################################################### recursive decision-tree construction #####################################################
def creatDecisionTree(dataSet, featureVec):
    '''Build a decision tree (nested dicts) recursively.

    dataSet: list of samples, class label in the last column.
    featureVec: indices of the features still available for splitting.
    '''
    classList = [el[-1] for el in dataSet]  # class labels of the samples
    if (classList.count(classList[0]) == len(classList)):  # all samples share one class: stop recursing
        return classList[0]
    if (len(featureVec) == 0):  # features exhausted: stop recursing, majority vote
        return MajorityCnt(classList)
    bestFeatureIndex = calcInformationGain(dataSet, featureVec)  # best feature by information gain
    # NOTE(review): featureVec is passed down unchanged, so the same feature
    # can be re-selected in subtrees — confirm this is intended.
    DecisionTree = {FeatureLabels[bestFeatureIndex]: {}}
    SplitDataSet, SplitDataProb, SplitValueVec = splitDataSet(
        dataSet, bestFeatureIndex)
    for value in SplitValueVec:
        DecisionTree[
            FeatureLabels[bestFeatureIndex]][value] = creatDecisionTree(
                SplitDataSet[SplitValueVec.index(value)], featureVec)
    return DecisionTree

##################################################### contact-lens recommendation demo #################################################
# Defects fixed: the data file is now closed via `with`, and the Python 2
# print statement is a py3 print call.
with open('lenses.txt') as fr:
    DataMat = [line.strip().split('\t') for line in fr.readlines()]
MyTree = creatDecisionTree(DataMat, [0, 1, 2, 3])
print(MyTree)
treePlotter.createPlot(MyTree)
#This test goes with Python3
import trees
import treePlotter

if __name__ == '__main__':
    # Train on the toy dataset and draw the resulting tree.
    toy_data, toy_labels = trees.createDataSet()
    tree = trees.createTree(toy_data, toy_labels)
    treePlotter.createPlot(tree)
# --- tail of classify(): leaf versus subtree ---
    if isinstance(valueOfFeat, dict):  # internal node: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:  # leaf: its value is the class label
        classLabel = valueOfFeat
    return classLabel

def storeTree(inputTree,filename):
    # Pickle the tree to disk.
    # NOTE(review): text mode 'w'/'r' works only on Python 2; Python 3
    # pickle requires 'wb'/'rb'.
    import pickle
    fw = open(filename,'w')
    pickle.dump(inputTree,fw)
    fw.close()

def grabTree(filename):
    # Load a pickled tree (the handle is never closed here).
    import pickle
    fr = open(filename)
    return pickle.load(fr)

if __name__ == '__main__':
    #dataSet,labels=createDataSet()
    #myTree=createTree(dataSet,labels)
    #print myTree
    #---------------
    fr=open('lenses.txt')
    lenses=[inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels=['age','prescript','astigmatic','tearRate']
    lensesTree=createTree(lenses,lensesLabels)
    #print lensesTree
    treePlotter.createPlot(lensesTree)
def credit_a_test():
    # PEP (pessimistic error pruning) demo on a hand-written C4.5-style tree
    # for the credit-approval (crx) data. Inner dict keys are threshold or
    # equality tests on a feature; leaf strings use the C4.5 dump format
    # ' <class> (<n>/<errors>)'.
    model = { 'A9': { '=t': { 'A15': { '>228': ' + (106.0/2.0)', '<=228': { 'A11': { '>3': { 'A15': { '>4': { 'A15': { '<=5': ' - (2.0)', '>5': { 'A7': { '=v': ' + (5.0)', '=z': ' - (1.0)', '=dd': ' + (0.0)', '=ff': ' + (0.0)', '=o': ' + (0.0)', '=n': ' + (0.0)', '=h': ' + (3.0)', '=bb': ' + (1.0)', '=j': ' + (0.0)' } } } }, '<=4': ' + (25.0)' } }, '<=3': { 'A4': { '=u': { 'A7': { '=v': { 'A14': { '<=110': ' + (18.0/1.0)', '>110': { 'A15': { '>8': ' + (4.0)', '<=8': { 'A6': { '=aa': { 'A2': { '<=41': ' - (3.0)', '>41': ' + (2.0)' } }, '=w': { 'A12': { '=t': ' - (2.0)', '=f': ' + (3.0)' } }, '=q': { 'A12': { '=t': ' + (4.0)', '=f': ' - (2.0)' } }, '=ff': ' - (0.0)', '=r': ' - (0.0)', '=i': ' - (0.0)', '=x': ' - (0.0)', '=e': ' - (0.0)', '=d': ' - (2.0)', '=c': ' - (4.0/1.0)', '=m': { 'A13': { '=g': ' + (2.0)', '=p': ' - (0.0)', '=s': ' - (5.0)' } }, '=cc': ' + (2.0/1.0)', '=k': ' - (2.0)', '=j': ' - (0.0)' } } } } } }, '=z': ' + (1.0)', '=bb': { 'A14': { '<=164': ' + (3.4/0.4)', '>164': ' - (5.6)' } }, '=ff': ' - (1.0)', '=o': ' + (0.0)', '=n': ' + (0.0)', '=h': ' + (18.0)', '=dd': ' + (0.0)', '=j': ' - (1.0)' } }, '=l': ' + (0.0)', '=y': { 'A13': { '=g': { 'A14': { '<=204': ' - (16.0/1.0)', '>204': ' + (5.0/1.0)' } }, '=p': ' - (0.0)', '=s': ' + (2.0)' } }, '=t': ' + (0.0)' } } } } } }, '=f': { 'A13': { '=g': ' - (204.0/10.0)', '=p': { 'A2': { '<=36': ' - (4.0/1.0)', '>36': ' + (2.0)' } }, '=s': { 'A4': { '=u': { 'A6': { '=aa': ' - (0.0)', '=w': ' - (0.0)', '=q': ' - (1.0)', '=ff': ' - (2.0)', '=r': ' - (0.0)', '=i': ' - (3.0)', '=x': ' + (1.0)', '=e': ' - (0.0)', '=d': ' - (2.0)', '=c': ' - (3.0)', '=m': ' - (3.0)', '=cc': ' - (1.0)', '=k': ' - (4.0)', '=j': ' - (0.0)' } }, '=l': ' + (1.0)', '=y': ' - (8.0/1.0)', '=t': ' - (0.0)' } } } } } }
    path = "./crx.data"
    name_path = "./crx.names"
    fea_list = get_Attribute(name_path)
    datasets = read_data(path)
    # Prune a deep copy so the original model is preserved for comparison.
    model_pruned = PEP_result(copy.deepcopy(model), fea_list,
                              copy.deepcopy(datasets))
    accuracy_unprune, accuracy_prune = accuracy_analysis(
        model, model_pruned, datasets, fea_list, name_path)
    # Python 2 print statements below.
    print "accuracy_unprune=", accuracy_unprune
    print "accuracy_prune=", accuracy_prune
    print "model=", model
    print "model_pruned=", model_pruned
    createPlot(model)
    createPlot(model_pruned)
# --- interior of DecisionTree.createTree(): recursive construction ---
        rootNode = {}
        bestPropIdx = self._chooseBestProp(dataArray)  # column chosen to split on
        rootNode[bestPropIdx] = {}
        uniqValues = np.unique(dataArray[:, bestPropIdx])  # every distinct value of that column
        for oneValue in uniqValues:
            splitDataArray = self._splitData(dataArray, bestPropIdx, oneValue)
            rootNode[bestPropIdx][oneValue] = self.createTree(splitDataArray)  # recurse per branch
        return rootNode

def loadData():
    # Read the tab-separated training file into a list of string rows.
    dataMat = []
    fr = open("decisiontree.txt")
    # readlines loads the whole decisiontree.txt file into memory at once
    lines = fr.readlines()
    for line in lines:
        curLine = line.strip().split('\t')
        dataMat.append(curLine)
    return dataMat

if __name__ == '__main__':
    data = loadData()
    print(data)
    dataarray = np.array(data)
    dt = DecisionTree()
    tree = dt.createTree(dataarray)
    print(tree)
    import treePlotter as tp
    import matplotlib.pyplot as plt
    tp.createPlot(tree)
def getTreePlt(tree):
    """Plot `tree` via treePlotter and return whatever createPlot returns."""
    return treePlotter.createPlot(tree)
# --- interior of decision_tree(): recurse into each value of the best feature ---
            new_data1 = data[data[best_feature] == value]  # rows taking this value
            new_data2 = new_data1.drop(best_feature, axis=1)  # drop the consumed feature
            if len(list(new_data2.columns)) > 1:  # features remain besides the target
                feature_tree[best_feature][value] = decision_tree(new_data2, col_y)
            else:  # only the target column left: its (first) label becomes the leaf
                feature_tree[best_feature][value] = list(new_data2[col_y])[0]
            break
    return feature_tree

import pandas as pd

def createData():
    # Toy six-sample frame: features X1..X5 plus a binary target column.
    data = {
        'X1': [1, 1, 1, 0, 0, 0],
        'X2': [1, 1, 0, 1, 1, 1],
        'X3': ['yes', 'yes', 'no', 'no', 'no', 'yes'],
        'X4': ['A', 'B', 'B', 'B', 'A', 'A'],
        'X5': ['M', 'FM', 'M', 'M', 'FM', 'M'],
        'target': ['Y', 'Y', 'Y', 'N', 'N', 'N']
    }
    return pd.DataFrame(data)

data = createData()
tree = decision_tree(data, 'target')
import treePlotter
treePlotter.createPlot(tree)
# Smoke tests for the trees module, section by section.

# -- Shannon entropy --
print(calcShannonEnt(myDat))
myDat[0][-1] = 'maybe'
print(calcShannonEnt(myDat))

# -- splitting the dataset on a given feature --
myDat, labels = createDataSet()
print(splitDataSet(myDat, 0, 1))
print(splitDataSet(myDat, 0, 0))

# -- choosing the best split --
print(myDat)
print(chooseBestFeatureToSplit(myDat))

# -- tree construction --
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)
print(myTree)

# -- classification --
myDat, labels = createDataSet()
myTree = treePlotter.retrieveTree(0)
print(classify(myTree, labels, [1, 0]))
print(classify(myTree, labels, [1, 1]))

# -- pickling the tree --
storeTree(myTree, 'classifierStorage.txt')
print(grabTree('classifierStorage.txt'))

# -- predict contact-lens type --
fr = open('lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
print(lensesTree)
print(treePlotter.createPlot(lensesTree))
def lense_test():
    """Train a decision tree on the lenses dataset and plot it."""
    # `with` guarantees the file handle is closed (the original leaked it).
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = createTree(lenses, lensesLabels)
    treePlotter.createPlot(lensesTree)
# --- interior of createTree() ---
    if classList.count(classList[0]) == len(classList):
        return classList[0]  #stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:  #stop splitting when there are no more features in dataSet
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)  # the set of values this feature takes
    for value in uniqueVals:
        subLabels = labels[:]  #copy all of labels, so trees don't mess up existing labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
    return myTree

if __name__ == '__main__':
    # del removes by index; remove deletes the first matching value:
    nums = [1,0, 2 ,0 ,3,0,0]
    nums.remove(0)
    print (nums)  #[1, 2, 0, 3, 0, 0]
    del(nums[0])
    print(nums)
    # predict the contact-lens type
    fr=open('lenses.txt')
    lenses=[inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels=['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree=createTree(lenses, lensesLabels)
    print (lensesTree)
    treePlotter.createPlot(lensesTree)  # plot; see treePlotter.py
# --- tail of classify()'s docstring and body (def line lies before this chunk) ---
    inputTree: pre-generated decision tree
    featLabels: labels
    testVec: test dataset
    """
    firstStr = inputTree.keys()[0]  # root split feature (py2: keys() is indexable)
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':  # continue splitting
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:  # leaf: its value is the class label
                classLabel = secondDict[key]
    return classLabel

if __name__ == '__main__':
    # example 1
    # dataset, labels = createDataSet()
    # tree = createTree(dataset, labels)
    # tree['no surfaceing'][3] = 'maybe'
    # createPlot(tree)
    # example 2: lenses dataset (file handle is never closed here)
    fr = open('lenses.txt')
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = createTree(lenses, lensesLabels)
    createPlot(lensesTree)
# --- interior of get_leaves(): accumulate the leaves of a subtree ---
            leaves_num, leaves = self.get_leaves(sub_tree)
            leaves_count += leaves_num
            # NOTE(review): `leaves += leaves` duplicates the recursion result
            # in place instead of extending an outer accumulator — looks
            # suspicious; confirm against the full method.
            leaves += leaves
        return leaves_count, leaves

if __name__ == '__main__':
    # Titanic-style workflow: train a C4.5 tree, score against the reference
    # labels, and write a Kaggle-format submission file.
    train_set = pd.read_csv("train.csv").values
    test_set = pd.read_csv("test.csv").values
    gender_submission = pd.read_csv("gender_submission.csv").values
    test_set = test_set[1:]  # drop the first row
    decision_tree = decision_tree(
        train_set, id_index=0, label_index=1, algorithm='c45')
    decision_tree.fit()
    debug = False
    if debug:
        tp.createPlot(decision_tree.tree())
    submission = []
    submission.append(['PassengerId', 'Survived'])
    right_count = 0
    count = len(test_set)
    for i in range(count):
        label = decision_tree.classifier(test_set[i])
        submission.append([test_set[i][0], label])
        if label == gender_submission[i + 1][1]:  # compare against reference labels
            right_count += 1
    submission_df = pd.DataFrame(data=submission,
                                 columns=['PassengerId', 'Survived'])
    submission_df.to_csv('submission.csv', index=False)
    print(str(right_count) + "/" + str(count))
# --- continuation of a pruning-experiment script ---
print("CARTTree:{}".format(true_count))
# accuracy of tree2 on the test split
true_count = 0
for i in range(len(test_label)):
    predict = classify(test_data[i], tree2)
    if predict == test_label[i]:
        true_count += 1
print("C3Tree:{}".format(true_count))
#print(attribute_based_on_Giniindex(X[49:51, :], y[49:51]))

from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font (CJK-capable)
mpl.rcParams['axes.unicode_minus'] = False  # keep '-' rendering correctly in saved figures
import matplotlib.pyplot as plt
treePlotter.createPlot(a, 1)
treePlotter.createPlot(b, 2)

# pruning step
pruning(tree=tree1, alpha=4)
# pruning(tree=tree2, alpha=4)
a = printtree(tree=tree1)
# b = printtree(tree=tree2)
# re-measure tree1's accuracy after pruning
true_count = 0
for i in range(len(test_label)):
    predict = classify(test_data[i], tree1)
    if predict == test_label[i]:
        true_count += 1
print("CARTTree:{}".format(true_count))
true_count = 0
# for i in range(len(test_label)):
# --- tail of classify(): look up the branch matching the test vector ---
    featIndex = featLabels.index(firstStr)  # column index of the split feature
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):  # internal node: recurse
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:  # leaf: its value is the class label
        classLabel = valueOfFeat
    return classLabel

def storeTree(inputTree, filename):
    # Pickle the tree to disk (binary mode, as Python 3 pickle requires).
    import pickle
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    # Load a pickled tree (note: the handle is never closed here).
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)

if __name__ == '__main__':
    fr = open('lenses.txt')
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)
    createPlot(lensesTree)
def main():
    """Build a decision tree from the module-level dataSet/labels and plot it."""
    # createTree mutates its label list, so give it a copy.
    tree = createTree(dataSet, labels[:])
    treePlotter.createPlot(tree)
# (earlier commented-out plotTree scratch code removed; behavior unchanged)
import sys

import importlib
importlib.reload(sys)

from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font (CJK-capable)
mpl.rcParams['axes.unicode_minus'] = False  # keep '-' rendering correctly in saved figures

##################################
# Build the toy decision tree and draw it.
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)

import treePlotter
treePlotter.createPlot(myTree)
# --- tail of createTree(): leaf case, then recursion per feature value ---
        return classList[0]
    # otherwise recurse: one subtree per value of the best feature
    else:
        for value in bestFeatValues:
            subDataSet = splitDataSet(dataSet, bestFeat, value)
            subFeatures = features[:]  # copy: recursion must not mutate the caller's list
            myTree[bestFeatName][value] = createTree(subDataSet, subFeatures,
                                                     chooseBestFeature)
    ### END CODE HERE ###
    return myTree

data1, labels1 = createDataSet1()
ID3Tree = createTree(data1, labels1, chooseBestFeature_ID3)
treePlotter.createPlot(ID3Tree)

# ### <center> Sample Output:</center>
# ![tree0.png](attachment:tree0.png)
# ### Task 3: a C4.5 tree
#
# ID3's information-gain criterion is biased toward features with many
# distinct values; the example below illustrates that directly.
#
# Suppose some feature (e.g. wind speed) takes a different value for every
# sample, and build an ID3 tree on it.
# In[7]:
def createDataSet2():
    # (this definition continues beyond the visible chunk)
    data = [[0, 0, 1, 0, 'yes'],
            [1, 1, 0, 1, 'yes'],
            [0, 0, 0, 2, 'no'],
import treePlotter

def test():
    """Smoke-test hook; the py2 print statement is now a py3 print call."""
    print("hello world")

if __name__ == '__main__':
    # train_data, labels = trees.createDataSet()
    # my_trees = trees.createTree(train_data, labels)
    # print(my_trees)
    #trees.storeTree(my_trees, 'classifiermelon.txt')
    # Load the previously pickled melon tree and classify one sample.
    melon_tree = trees.grabTree('classifiermelon.txt')
    print(melon_tree)
    melon_labels = ['color', 'root', 'sound', 'texture', 'navel', 'touch']
    melon_feature = [1, 1, 1, 1, 1, 1]
    print("the predicted result is:",
          trees.classify(melon_tree, melon_labels, melon_feature))
    treePlotter.createPlot(melon_tree)
    # print(treePlotter.getNumLeafs(my_trees), treePlotter.getTreeDepth(my_trees))
    # ent = trees.calcShannonEnt(train_data)
    # feature1 = trees.splitDataSet(train_data, 0, 0)
    # feature2 = trees.splitDataSet(train_data, 0, 1)
    # best_feature = trees.chooseBestFeatureToSplit(train_data)
    # print(ent)
    # print(feature1, feature2)
    # print(best_feature)
# Defects fixed: Python 2 print statements -> py3 print calls (same output),
# and the lenses file handle is now closed via `with`.
print('treeDepth ', tp.getTreeDepth(myTree))
#tp.createPlot(myTree)

#update dict and plot again
#myTree['no surfacing'][3] = 'maybe'
#tp.createPlot(myTree)

#classify
print('-------------- classify --------------------')
myDat, labels = createDataSet()
print('labels', labels)
myTree = tp.retrieveTree(0)
print('myTree ', myTree)
print('[1,0]: ', classify(myTree, labels, [1, 0]))
print('[1,1]: ', classify(myTree, labels, [1, 1]))

#store and grab tree
print('-------------- store and grab tree --------------------')
storeTree(myTree, 'classifierStorate.txt')
newTree = grabTree('classifierStorate.txt')
print('grabedTree: ', newTree)

#Example1: choose suitable lens type
print('-------------- Eg1: choose suitable lens type --------------------')
with open('lenses.txt') as fr:
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age', 'prescript', 'astog,atoc', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
print(lensesTree)
tp.createPlot(lensesTree)
# Train a decision tree on the lenses data (absolute Windows path) and plot it.
import trees
import treePlotter

fr = open(
    r'C:\Users\MILI\Desktop\Machine learning\MachineLearningInAction-Camp\Week2\Reference Code\lenses.txt'
)
lenses = [line.strip().split('\t') for line in fr.readlines()]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = trees.createTree(lenses, lensesLabels)
print(lensesTree)
treePlotter.createPlot(lensesTree)
# Re-import treePlotter and draw its canned example tree.
import treePlotter
# Python 2 builtin reload: picks up in-session edits to the module.
reload(treePlotter)
myTree = treePlotter.retrieveTree(0)
treePlotter.createPlot(myTree)
def credit_a_test(m):
    # MEP (minimum error pruning) demo on a hand-written C4.5-style tree for
    # the credit-approval (crx) data; `m` is the m-estimate parameter. Inner
    # dict keys are threshold/equality tests on a feature; leaf strings use
    # the C4.5 dump format ' <class> (<n>/<errors>)'.
    model = { 'A9': { '=t': { 'A15': { '>228': ' + (106.0/2.0)', '<=228': { 'A11': { '>3': { 'A15': { '>4': { 'A15': { '<=5': ' - (2.0)', '>5': { 'A7': { '=v': ' + (5.0)', '=z': ' - (1.0)', '=dd': ' + (0.0)', '=ff': ' + (0.0)', '=o': ' + (0.0)', '=n': ' + (0.0)', '=h': ' + (3.0)', '=bb': ' + (1.0)', '=j': ' + (0.0)' } } } }, '<=4': ' + (25.0)' } }, '<=3': { 'A4': { '=u': { 'A7': { '=v': { 'A14': { '<=110': ' + (18.0/1.0)', '>110': { 'A15': { '>8': ' + (4.0)', '<=8': { 'A6': { '=aa': { 'A2': { '<=41': ' - (3.0)', '>41': ' + (2.0)' } }, '=w': { 'A12': { '=t': ' - (2.0)', '=f': ' + (3.0)' } }, '=q': { 'A12': { '=t': ' + (4.0)', '=f': ' - (2.0)' } }, '=ff': ' - (0.0)', '=r': ' - (0.0)', '=i': ' - (0.0)', '=x': ' - (0.0)', '=e': ' - (0.0)', '=d': ' - (2.0)', '=c': ' - (4.0/1.0)', '=m': { 'A13': { '=g': ' + (2.0)', '=p': ' - (0.0)', '=s': ' - (5.0)' } }, '=cc': ' + (2.0/1.0)', '=k': ' - (2.0)', '=j': ' - (0.0)' } } } } } }, '=z': ' + (1.0)', '=bb': { 'A14': { '<=164': ' + (3.4/0.4)', '>164': ' - (5.6)' } }, '=ff': ' - (1.0)', '=o': ' + (0.0)', '=n': ' + (0.0)', '=h': ' + (18.0)', '=dd': ' + (0.0)', '=j': ' - (1.0)' } }, '=l': ' + (0.0)', '=y': { 'A13': { '=g': { 'A14': { '<=204': ' - (16.0/1.0)', '>204': ' + (5.0/1.0)' } }, '=p': ' - (0.0)', '=s': ' + (2.0)' } }, '=t': ' + (0.0)' } } } } } }, '=f': { 'A13': { '=g': ' - (204.0/10.0)', '=p': { 'A2': { '<=36': ' - (4.0/1.0)', '>36': ' + (2.0)' } }, '=s': { 'A4': { '=u': { 'A6': { '=aa': ' - (0.0)', '=w': ' - (0.0)', '=q': ' - (1.0)', '=ff': ' - (2.0)', '=r': ' - (0.0)', '=i': ' - (3.0)', '=x': ' + (1.0)', '=e': ' - (0.0)', '=d': ' - (2.0)', '=c': ' - (3.0)', '=m': ' - (3.0)', '=cc': ' - (1.0)', '=k': ' - (4.0)', '=j': ' - (0.0)' } }, '=l': ' + (1.0)', '=y': ' - (8.0/1.0)', '=t': ' - (0.0)' } } } } } }
    path = "./crx.data"
    name_path = "./crx.names"
    fea_list = get_Attribute(name_path)
    datasets = read_data(path)
    print "刚读入的数据集", datasets
    pae_dict, class_count = pae_list(
        path)
    # Must be fixed before pruning starts and stay constant during pruning —
    # do not recompute inside the recursion.
    pae_lists = [pae_dict[key] for key in pae_dict]  # list of prior probabilities
    #Attention,if you want to perform Laplace Law of succession,just set:
    #pae_list=1.0/m
    #m=counts of classes of the whole original datasets
    class_list = [key for key in class_count]  # class labels of the dataset
    model_pruned = MEP_result(copy.deepcopy(model), fea_list,
                              copy.deepcopy(datasets), pae_lists, class_list, m)
    print "这里检查下数据集", datasets
    accuracy_unprune, accuracy_prune, misjudge_datasets = accuracy_analysis(
        model, model_pruned, copy.deepcopy(datasets), fea_list, name_path)
    # Python 2 print statements below.
    print "accuracy_unprune=", accuracy_unprune
    print "accuracy_prune=", accuracy_prune
    for item in misjudge_datasets:
        print item
    print "model=", model
    print "model_pruned=", model_pruned
    createPlot(model)
    createPlot(model_pruned)
# --- tail of createTree(): stop conditions, then recursion per value ---
        return classList[0]  # all samples share one class
    if len(dataset[0]) == 1:  # features exhausted: majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataset)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del (labels[bestFeat])  # this label is consumed by the split
    featValues = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion cannot clobber the caller's list
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataset, bestFeat, value), subLabels)
    return myTree

def majorityCnt(classList):
    # Most frequent class label in classList.
    # NOTE(review): dict.iteritems() exists only on Python 2; py3 needs .items().
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.iteritems(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]

if __name__ == '__main__':
    myDat, labels = createDataset()
    myTree = createTree(myDat, labels)
    treePlotter.createPlot()
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import treePlotter as tp

# Force a utf-8 stdout encoding (Python 2 idiom: reload/setdefaultencoding
# are not available like this on Python 3).
reload(sys)
sys.setdefaultencoding('utf-8')

# Draw a small hand-built tree: nested dicts, leaves are plain strings.
myTree = {
    'root': {
        0: 'leaf node',
        1: {
            'level 2': {
                0: 'leaf node',
                1: 'leaf node'
            }
        },
        2: {
            'level2': {
                0: 'leaf node',
                1: 'leaf node'
            }
        }
    }
}
tp.createPlot(myTree)
def tree():
    """Draw a tiny hard-coded example tree with tp.createPlot."""
    example = {
        'root': {
            0: 'left node',
            1: {'level2': {3: 'left node', 4: 'right node'}},
            5: 'right node',
        }
    }
    tp.createPlot(example)