def test_split(self):
    """Exercise trees.splitDataSet on feature 0 for both values and print the pieces."""
    full_set, labels = trees.createDataSet()
    print("\n dataSet == %s" % (full_set))
    subset_one = trees.splitDataSet(full_set, 0, 1)
    print("\n dataSet1 == %s" % (subset_one))
    subset_zero = trees.splitDataSet(full_set, 0, 0)
    print("\n dataSet2 == %s" % (subset_zero))
return bestFeature def majorityCnt(classList): classCount = {} for vote in classList: if vote not in classCount.keys():classCount[vote]= 0 classCount[vote]+=1 sortedClassCount = sorted(classCount.iteritems(),\ key=operator.itemgetter(1),reverse=True) return sortedClassCount[0][0] def createTree(dataSet,labels): classList = [example[-1] for example in dataset] if classList.count(classList[0]) == len(classList): return classList[0] if len(dataSet[0]) == 1: return majorityCnt(classList) bestFeatLabel = labels[be] if __name__=="__main__": importlib.reload(trees) myDat, lables = trees.createDataSet() print(myDat) print(trees.splitDataSet(myDat,0,1)) # print(trees.splitDataSet(myDat,0,0)) # 得到熵 熵越高则混合的数据越多 可以在数据中添加更多的分类,观察熵是如何变化的 # print(calcShannonEnt(myDat))
"""Demo driver: walk the full ID3 pipeline in trees.py on the toy data set.

Fix: the original was Python 2 only — the builtin ``reload`` no longer exists
and ``print x`` statements are a SyntaxError on Python 3.  ``reload`` now
comes from :mod:`importlib` and every print is the function form, matching
the other Python 3 drivers in this collection.
"""
from importlib import reload

from numpy import array
import trees

reload(trees)  # pick up edits to trees.py without restarting the interpreter
myDat, labels = trees.createDataSet()
print(myDat)
print(trees.calcShannonEnt(myDat))            # entropy of the raw set
print(trees.splitDataSet(myDat, 0, 1))        # rows where feature 0 == 1
print(trees.splitDataSet(myDat, 0, 0))        # rows where feature 0 == 0
print(trees.chooseBestFeatureToSplit(myDat))  # index of the best split feature
myTree = trees.createTree(myDat, labels)
print(myTree)
"""Transcript-style driver: exercise each stage of the trees module in turn,
reloading the module before each section so edits to trees.py take effect."""
import trees
from importlib import reload

# -- Shannon entropy -------------------------------------------------
reload(trees)
samples, labels = trees.createDataSet()
trees.calcShannonEnt(samples)
samples[0][-1] = 'maybe'   # mixing in a third class label changes the entropy
trees.calcShannonEnt(samples)

# -- splitDataSet ----------------------------------------------------
reload(trees)
samples, labels = trees.createDataSet()
trees.splitDataSet(samples, 0, 1)
trees.splitDataSet(samples, 0, 0)

# -- chooseBestFeatureToSplit ----------------------------------------
reload(trees)
samples, labels = trees.createDataSet()
trees.chooseBestFeatureToSplit(samples)

# -- createTree ------------------------------------------------------
reload(trees)
samples, labels = trees.createDataSet()
myTree = trees.createTree(samples, labels)
myTree

# -- matplotlib plotting ---------------------------------------------
import treePlotter
treePlotter.createPlot()
def testSplitDataSet(self):
    """splitDataSet must partition on feature 0 and drop the feature column."""
    data, labels = TreesTestCase.createDataSet()
    matching = trees.splitDataSet(data, 0, 1)
    non_matching = trees.splitDataSet(data, 0, 0)
    self.assertEqual(matching, [[1, 'yes'], [1, 'yes'], [0, 'no']])
    self.assertEqual(non_matching, [[1, 'no'], [1, 'no'],])
"""Smoke-test the trees module: entropy, one split, then full tree construction."""
import trees

dataset, feature_names = trees.createDataSet()
print(dataset)
print(feature_names)
print(trees.calcShannonEnt(dataset))
print(trees.splitDataSet(dataset, 0, 0))
print(trees.createTree(dataset, feature_names))
#coding=utf-8
"""Walk the decision-tree pipeline: entropy of the full set and of three
splits, the best feature, tree construction, and a stored tree.

Fix: the original mixed Python 2 ``print x`` statements with ``print(x)``
calls, which is a SyntaxError under Python 3.  Every print is now the
function form (output is identical for single-argument prints on Python 2
as well).
"""
__author__ = 'baconLIN'

import trees

myDat, labels = trees.createDataSet()
print(myDat)

shannonEnt = trees.calcShannonEnt(myDat)
print(shannonEnt)

# Entropy of the sub-sets obtained by fixing one feature value.
mySplit1 = trees.splitDataSet(myDat, 0, 1)
print(mySplit1)
shannonEntSplit1 = trees.calcShannonEnt(mySplit1)
print(shannonEntSplit1)

mySplit2 = trees.splitDataSet(myDat, 0, 0)
print(mySplit2)
shannonEntSplit2 = trees.calcShannonEnt(mySplit2)
print(shannonEntSplit2)

mySplit3 = trees.splitDataSet(myDat, 1, 1)
print(mySplit3)
shannonEntSplit3 = trees.calcShannonEnt(mySplit3)
print(shannonEntSplit3)

bestFeature = trees.chooseBestFeatureToSplit(myDat)
print(bestFeature)

myTree = trees.createTree(myDat, labels)
print(myTree)

import treePlotter
myTree2 = treePlotter.retrieveTree(0)
#程序说明 #标题:决策树的生成和绘制 #内容:1.读取文档数据生成数据集,2.利用数据集生成决策树,3.绘制决策树的图形 #时间:2018年5月30日 6月6日添加备注 import trees import treePlotter #11111111111111111111111111111111111111111111111111111 #利用手动创建的数据集生成树,绘制树的图形,测试程序过程步骤 #输出手动创建的数据集,计算香农熵 myDat,labels=trees.createDataSet() print "myDat 数据集是:",myDat print "\nlabels 标签是:",labels rawCalc =trees.calcShannonEnt(myDat) print "\ncalcShannonEnt(myDat) 数据集的原始熵是:",rawCalc print "\ntrees.splitDataSet( myDat,1,1)将数据集的按 特征[1]=1(即 flippers==1) 提取出来的矩阵是:",trees.splitDataSet(myDat,1,1) # bestLabel = trees.chooseBestFeatureToSplit(myDat) print "\nchooseBestFeatureToSplit(myDat) 数据集的bestLabel最好特征的[下标]是:",bestLabel,"\tlabels[bestLabel]最好特征是:",labels[bestLabel] # myTree = trees.createTree(myDat,labels) print "\ntrees.createTree(myDat,labels) 根据数据集创建的树是:", myTree #读取预先存储的树[0] 并绘制图形 print "\n读取预先存储的树[0] 并绘制出第一个图形:" myTree0 = treePlotter.retrieveTree(0) treePlotter.createPlot(myTree0) #读取预先存储的树[1] 并绘制图形 print "\n读取预先存储的树[1] 并绘制出第二个图形:" myTree1 = treePlotter.retrieveTree(1) treePlotter.createPlot(myTree1)
print trees.calcShannonEnt(dataSet)#0.970950594455 dataSet[0][-1] = 'maybe' print dataSet print trees.calcShannonEnt(dataSet)#1.37095059445 #熵越大,则混合的数据越多 #还原 dataSet[0][-1] = 'yes' print dataSet #[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] #划分数据集 #当第0列,值为0 的实例 print trees.splitDataSet(dataSet, 0, 0) #[[1, 'no'], [1, 'no']] #当第0列,值为1 的实例 print trees.splitDataSet(dataSet, 0, 1) #[[1, 'yes'], [1, 'yes'], [0, 'no']] print trees.chooseBestFeatureToSplit(dataSet) #0 print "---createTree---" print trees.createTree(dataSet, labels) """
# author = [email protected] # -*- coding: cp936 -*- # coding: cp936 import trees def createDataSet(): dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] label = ['no surfacing', 'flippers'] return dataSet, label mydata, labels = createDataSet() print mydata print trees.calcShannonEnt(mydata) # mydata[0][-1] = 'maybe' # print mydata # print trees.calcShannonEnt(mydata) print trees.splitDataSet(mydata, 1, 1) print trees.splitDataSet(mydata, 1, 0) print trees.chooseBestFeatureToSplit(mydata) print trees.createTree(mydata, labels)
import trees import treePlotter myDat, labels = trees.createDataSet() print myDat print trees.calcShannonEnt(myDat) print trees.splitDataSet(myDat, 0, 1) print trees.splitDataSet(myDat, 0, 0) print trees.splitDataSet(myDat, 1, 1) print trees.chooseBestFeatureToSplit(myDat) print trees.createTree(myDat, labels) treePlotter.createPlot() print 'createPlot over' print treePlotter.retrieveTree(1) myTree = treePlotter.retrieveTree(0) print treePlotter.getNumLeafs(myTree) print treePlotter.getTreeDepth(myTree)
"""Minimal driver: build the sample data set and split it on feature 0 == 1."""
import trees

a1, a2 = trees.createDataSet()
# print(a1)   # data set (debug print left disabled)
b1 = trees.splitDataSet(a1, 0, 1)
# print(b1)   # the value-1 split
# The value-0 split was also left disabled:
# c1 = trees.splitDataSet(a1,0,0)
# print(c1)
# Sample PyCharm scaffold script exercising the trees module.
import trees

if __name__ == '__main__':
    sample_data, sample_labels = trees.createDataSet()
    # Split on feature 0 == 5; presumably no row carries that value, so this
    # should print an empty list — verify against trees.createDataSet.
    print(trees.splitDataSet(sample_data, 0, 5))
    best = trees.chooseBestFeatureToSplit(sample_data)
    print("best feature: %d" % best)
if classList.count(classList[0])==len(classList): return classList[0]#stop the splitting when class are same if len(dataSet[0])==1: return majorityCnt(classList) bestFeat=chooseBestFeatureToSplit(dataSet) bestFeatLabel=labels[bestFeat] myTree={bestFeatLabel:{}} del (labels[bestFeat]) featValues=[example[bestFeat] for example in dataSet] uniqueVals=set(featValues) for value in uniqueVals: subLabels=labels[:] myTree[bestFeatLabel][value]=createTree(splitDataSet\ (dataSet,bestFeat,value),subLabels) return myTree if __name__ == "__main__": import trees myDat,labels=trees.createDataSet() print(myDat,'\n',labels) print("The shannonEnt is :",trees.calcShannonEnt(myDat)) print("\nsplitDataSet(myDat,1,1):\n",trees.splitDataSet(myDat,1,1)) #print("\nsplitDataSet(myDat,0,0):\n",trees.splitDataSet(myDat,0,0)) print("\nchooseBestFeatureToSplit: ",trees.chooseBestFeatureToSplit(myDat)) myTree=createTree(myDat,labels) print("\nmyTree:\n",myTree)
}, 1: 'no' } } } }] return listOfTrees[i] # createPlot(thisTree) myData, labels = trees.createDataSet() print(myData) print(trees.calcShannonEnt(myData)) print('---------------------------------') print(trees.splitDataSet(myData, 0, 1)) print(trees.splitDataSet(myData, 0, 0)) print(trees.splitDataSet(myData, 1, 1)) print(trees.splitDataSet(myData, 1, 0)) print('---------------------------------') myData, labels = trees.createDataSet() print(myData) print('第', trees.chooseBestFeatureToSplit(myData), '个特征是最好的用于划分数据集的特征') print('---------------------------------') myData, labels = trees.createDataSet() myTree = trees.createTree(myData, labels) print('myTree=', myTree) print('---------------------------------') # createPlot() print('---------------------------------') print(retrieveTree(1))
# Annotated walkthrough of trees (aliased DT) and treePlotter (aliased TP);
# nothing is printed — expected values are recorded in the comments.
import trees as DT

dat, dat_labels = DT.createDataSet()
entropy = DT.calcShannonEnt(dat)  # 0.9709505944546686
# Making the data messier raises the entropy (disabled demo):
# dat[0][-1] = 'maybe'
# entropy = DT.calcShannonEnt(dat)   # 1.3709505944546687

# Both value-splits of feature 0; the second assignment overwrites the first.
splittedDat = DT.splitDataSet(dat, 0, 1)  # [[1, 'yes'], [1, 'yes'], [0, 'no']]
splittedDat = DT.splitDataSet(dat, 0, 0)  # [[1, 'no'], [1, 'no']]

bestFeature = DT.chooseBestFeatureToSplit(dat)  # 0
myTree = DT.createTree(dat, dat_labels)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

import treePlotter as TP

# TP.createPlot()
myTree = TP.retrieveTree(0)  # same nested-dict tree as built above
n = TP.getNumLeafs(myTree)   # 3
d = TP.getTreeDepth(myTree)  # 2
# coding=utf-8 import trees import treePlotter dateset, labels = trees.createDataSet() print dateset print labels shannon = trees.calcShannonEnt(dateset) print shannon print '--条件熵' ha = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 5)) hb = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 3)) hc = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 1)) hxy = 1.0 / 3 * ha + 1.0 / 4 * hb + 5.0 / 12 * hc print ha, hb, hc print hxy print '--信息增益' ig = shannon - hxy print ig print '--找到最佳分类特征' feature = trees.chooseBestFeatureToSplit(dateset) print labels[feature] print '--创建决策树' labelsCopy = labels[:] tree = trees.createTree(dateset, labelsCopy) print tree # print '--画图' # treePlotter.createPlot(tree) print '--用决策树测试数据' #mytree = treePlotter.retrieveTree(0) testdata = [4, 4, 1, 'cha']