def test1():
    """Smoke test: create the sample data set, echo it, and print the best split feature."""
    sample_data, sample_labels = trees.createDataSet()
    print(sample_data)
    print(sample_labels)
    # print(trees.calcShannonEnt(sample_data))
    # print(trees.splitDataSet(sample_data, 0, 0))
    print(trees.chooseBestFeatureToSplit(sample_data))
dataset = [] #labels '年龄', '处方', '散光', '眼镜材质' labels = ['age', 'prescript', 'astigmatic', 'tearRate'] for line in fr.readlines(): d = line.strip().split('\t') dataset.append(d) fr.close() print dataset print '\n' print '数据集类的香农熵:' print trees.calcShannonEnt(dataset) print '\n' bestFeatureColumn = trees.chooseBestFeatureToSplit(dataset) print '数据集最佳分类的属性是:' print labels[bestFeatureColumn] print '\n' print '决策树:' Tree = trees.createTree(dataset, labels) print Tree firstFeature = Tree.keys()[0] print firstFeature firstFeatureValues = Tree[firstFeature].keys() print firstFeatureValues print '\n' treePlotter.createPlot(Tree)
# Exercise calcShannonEnt: perturb one class label and watch entropy rise.
myDat, labels = trees.createDataSet()
trees.calcShannonEnt(myDat)
myDat[0][-1] = 'maybe'
trees.calcShannonEnt(myDat)

# Exercise splitDataSet (reload picks up any edits made to trees.py).
reload(trees)
myDat, labels = trees.createDataSet()
trees.splitDataSet(myDat, 0, 1)
trees.splitDataSet(myDat, 0, 0)

# Exercise chooseBestFeatureToSplit.
reload(trees)
myDat, labels = trees.createDataSet()
trees.chooseBestFeatureToSplit(myDat)

# Exercise createTree.
reload(trees)
myDat, labels = trees.createDataSet()
myTree = trees.createTree(myDat, labels)
myTree

# Exercise the matplotlib demo plot.
import treePlotter
treePlotter.createPlot()

# Exercise the leaf-count and tree-depth helpers.
reload(treePlotter)
treePlotter.retrieveTree(1)
myTree = treePlotter.retrieveTree(0)
def testChooseBestFeatureToSplit(self):
    """The best split feature of the sample data set should be column 0."""
    sample_set, _labels = TreesTestCase.createDataSet()
    self.assertEqual(trees.chooseBestFeatureToSplit(sample_set), 0)
# -*- coding: utf-8 -*- # os.chdir('D:\www\IdeaProject\MLiA_SourceCode\machinelearninginaction') # print os.getcwd() from numpy.ma import zeros, array if __name__ == '__main__': print 'hello' import trees import treePlotter data,lables = trees.createDataSet() print data print lables shannonEnt = trees.calcShannonEnt(data) print shannonEnt data[0][-1] ='maybe' print trees.calcShannonEnt(data) print data print trees.chooseBestFeatureToSplit(data) mytree = trees.createTree(data,lables) print mytree fr = open('lenses.txt') lenses = [inst.strip().split('\t') for inst in fr.readlines()] lensesLables = ['age','prescript','astigmatic','tearRate'] lensesTree = trees.createTree(lenses,lensesLables) print lensesTree treePlotter.createPlot(lensesTree)
# createPlot(thisTree)
# Entropy and splitting demos on the sample data set.
myData, labels = trees.createDataSet()
print(myData)
print(trees.calcShannonEnt(myData))
print('---------------------------------')
print(trees.splitDataSet(myData, 0, 1))
print(trees.splitDataSet(myData, 0, 0))
print(trees.splitDataSet(myData, 1, 1))
print(trees.splitDataSet(myData, 1, 0))
print('---------------------------------')

# Rebuild the data and report the best feature to split on.
myData, labels = trees.createDataSet()
print(myData)
print('第', trees.chooseBestFeatureToSplit(myData), '个特征是最好的用于划分数据集的特征')
print('---------------------------------')

# Build the full decision tree (createTree mutates labels, hence the rebuild).
myData, labels = trees.createDataSet()
myTree = trees.createTree(myData, labels)
print('myTree=', myTree)
print('---------------------------------')
# createPlot()
print('---------------------------------')

# Plotting helpers: retrieveTree / getNumLeafs / getTreeDepth / createPlot
# are assumed to be star-imported from treePlotter earlier in this file.
print(retrieveTree(1))
myTree = retrieveTree(0)
print(myTree)
print(getNumLeafs(myTree))
print(getTreeDepth(myTree))
print('---------------------------------')
myTree = retrieveTree(0)
createPlot(myTree)
# author: zhumenger
import trees

myDat, lables = trees.createDataSet()
print(myDat)
print(lables)
# Shannon entropy: the higher the value, the more mixed the class labels are.
print(trees.calcShannonEnt(myDat))
myDat[0][-1] = 'maybe'
print(trees.calcShannonEnt(myDat))

# Exercise splitDataSet().
print(trees.splitDataSet(myDat, 0, 1))
print(trees.splitDataSet(myDat, 0, 0))

# Find the best way to partition the data; feature 0 wins on this sample.
trees.chooseBestFeatureToSplit(myDat)
print(trees.chooseBestFeatureToSplit(myDat))

# 3-4: build the full decision tree.
print(trees.createTree(myDat, lables))
#! /usr/bin/env python # -*- coding: utf-8 -*- import trees if __name__ == '__main__': data = trees.createDataSet1() # print data dataSet = data[0] lables = data[1] print dataSet feature = trees.chooseBestFeatureToSplit(dataSet) feature1 = trees.chooseBestFeatureToSplit1(dataSet) print feature print feature1 mytree = trees.createTree(dataSet, lables) print mytree print trees.splitDataSet(dataSet, 0, 1) featLabels = ['outlook', 'temperature', 'humidity', 'windy'] testVec = [0, 1, 0, 0] print trees.classify(mytree, featLabels, testVec)
# NOTE(review): DT, myDat and labels are bound earlier in the original file
# (outside this view) — presumably DT is the trees module; confirm.
entropy = DT.calcShannonEnt(myDat)  # 0.9709505944546686
# Let's make the data a little messier and see how the entropy changes
# myDat[0][-1]='maybe'
# entropy = DT.calcShannonEnt(myDat)
# output: 1.3709505944546687

splittedDat = DT.splitDataSet(myDat, 0, 1)  # [[1, 'yes'], [1, 'yes'], [0, 'no']]
splittedDat = DT.splitDataSet(myDat, 0, 0)  # [[1, 'no'], [1, 'no']]
bestFeature = DT.chooseBestFeatureToSplit(myDat)  # 0
myTree = DT.createTree(myDat, labels)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

import treePlotter as TP

# TP.createPlot()
myTree = TP.retrieveTree(0)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
n = TP.getNumLeafs(myTree)   # 3
d = TP.getTreeDepth(myTree)  # 2
TP.createPlot(myTree)
# -*- coding: utf-8 -*-

# Pick the feature whose split yields the highest information gain.
import trees
ds, ls = trees.createDataSet()
trees.chooseBestFeatureToSplit(ds)

# Build the decision tree.
import trees
ds, ls = trees.createDataSet()
trees.createTree(ds, ls)

# Plot the tree.
import treePlotter
mt = treePlotter.retrieveTree(0)
treePlotter.createPlot(mt)

# Classify a sample with the decision tree.
import trees
import treePlotter
it = treePlotter.retrieveTree(0)
ds, ls = trees.createDataSet()
trees.classify(it, ls, [0, 0])

# Serialize the tree to disk, then load it back.
import trees
import treePlotter
it = treePlotter.retrieveTree(0)
trees.storeTree(it, 'classifierStorage.txt')
ot = trees.grabTree('classifierStorage.txt')
import trees import treePlotter myDat, labels = trees.createDataSet() print myDat print trees.calcShannonEnt(myDat) print trees.splitDataSet(myDat, 0, 1) print trees.splitDataSet(myDat, 0, 0) print trees.splitDataSet(myDat, 1, 1) print trees.chooseBestFeatureToSplit(myDat) print trees.createTree(myDat, labels) treePlotter.createPlot() print 'createPlot over' print treePlotter.retrieveTree(1) myTree = treePlotter.retrieveTree(0) print treePlotter.getNumLeafs(myTree) print treePlotter.getTreeDepth(myTree)
def testChooseBestFeatureToSplit(dataSet):
    """Print the index of the best feature to split *dataSet* on."""
    # print dataSet
    best_index = trees.chooseBestFeatureToSplit(dataSet)
    print(best_index)
#!/usr/bin/env python # -*- coding: UTF-8 -*- # 代码来源:http://www.cnblogs.com/hantan2008/archive/2015/07/27/4674097.html # 该代码实现了决策树算法分类(ID3算法) import trees import treePlotter if __name__ == '__main__': pass myDat, labels = trees.createDataSetFromTXT("dataset.txt") shan = trees.calcShannonEnt(myDat) print shan col = trees.chooseBestFeatureToSplit(myDat) print col Tree = trees.createTree(myDat, labels) print Tree treePlotter.createPlot(Tree)
# NOTE(review): this collapsed line fuses two things: (a) the TAIL of a
# createTree(...) definition — its `def` header and the classList
# initialization lie outside this view, so the fragment is incomplete here —
# and (b) a __main__ demo block exercising trees.createDataSet,
# calcShannonEnt, splitDataSet, chooseBestFeatureToSplit and createTree.
# Left byte-identical; reformat once the missing header is in view.
if classList.count(classList[0])==len(classList): return classList[0]#stop the splitting when class are same if len(dataSet[0])==1: return majorityCnt(classList) bestFeat=chooseBestFeatureToSplit(dataSet) bestFeatLabel=labels[bestFeat] myTree={bestFeatLabel:{}} del (labels[bestFeat]) featValues=[example[bestFeat] for example in dataSet] uniqueVals=set(featValues) for value in uniqueVals: subLabels=labels[:] myTree[bestFeatLabel][value]=createTree(splitDataSet\ (dataSet,bestFeat,value),subLabels) return myTree if __name__ == "__main__": import trees myDat,labels=trees.createDataSet() print(myDat,'\n',labels) print("The shannonEnt is :",trees.calcShannonEnt(myDat)) print("\nsplitDataSet(myDat,1,1):\n",trees.splitDataSet(myDat,1,1)) #print("\nsplitDataSet(myDat,0,0):\n",trees.splitDataSet(myDat,0,0)) print("\nchooseBestFeatureToSplit: ",trees.chooseBestFeatureToSplit(myDat)) myTree=createTree(myDat,labels) print("\nmyTree:\n",myTree)
from numpy import array import trees reload(trees) myDat,labels=trees.createDataSet() print myDat print trees.calcShannonEnt(myDat) print trees.splitDataSet(myDat,0,1) print trees.splitDataSet(myDat,0,0) print trees.chooseBestFeatureToSplit(myDat) myTree = trees.createTree(myDat,labels) print myTree
import trees

# Sample data set and its Shannon entropy.
mydata, features = trees.createDataSet()
print(mydata)
print(features)
print(trees.calcShannonEnt(mydata))
'''
mydata[0][-1] = 'maybe'
print(trees.calcShannonEnt(mydata))
'''
# print(trees.splitDataSet(mydata,0,1))
index = trees.chooseBestFeatureToSplit(mydata)
# print(index)
'''
mytree = trees.createTree(mydata,features)
print(mytree)
'''
import treePlotter
'''
mytree = treePlotter.retrieveTree(0)
treePlotter.createPlot(mytree)
mytree['no surfacing'][3] = 'maybe'
treePlotter.createPlot(mytree)
'''
# Classify two samples with a pre-stored tree, then persist the tree.
mytree = treePlotter.retrieveTree(0)
print(trees.classify(mytree, features, [0, 0]))
print(trees.classify(mytree, features, [1, 1]))
trees.storeTree(mytree, 'classifier.txt')
import trees

# Core idea: pick the feature whose split gives the best Shannon-entropy
# change (information gain).
a1, a2 = trees.createDataSet()
b1 = trees.chooseBestFeatureToSplit(a1)
# print(b1)
import treePlotter # homedir= os.getcwd()+'/machinelearninginaction/ch03/' #绝对路径 homedir = '' #相对路径 #3.1.1 信息增益 myDat, labels = trees.createDataSet() print "计算香农熵:", trees.calcShannonEnt(myDat) myDat[0][-1] = ' maybe' print "计算香农熵:", trees.calcShannonEnt(myDat) #3.1.2 划分数据集 myDat, labels = trees.createDataSet() trees.splitDataSet(myDat, 0, 1) trees.splitDataSet(myDat, 0, 0) print "选择最好的数据集划分方式:", trees.chooseBestFeatureToSplit(myDat) #3.1.3 递归构建决策树 myDat, labels = trees.createDataSet() myTree = trees.createTree(myDat, labels) print "myTree:", myTree #3.2.1 Matplotlib注解 treePlotter.createPlot() #3.2.2 构造注解树 treePlotter.retrieveTree(1) myTree = treePlotter.retrieveTree(0) print "获取叶节点的数目:", treePlotter.getNumLeafs(myTree) print "获取树的层数:", treePlotter.getTreeDepth(myTree) treePlotter.createPlot(myTree)
def test_bestChoose(self):
    """Print the sample data set and the index of its best split feature."""
    sample_set, _label = trees.createDataSet()
    print("\n dataSet == %s" % (sample_set))
    best_feature = trees.chooseBestFeatureToSplit(sample_set)
    print("\n bestFeature == %s" % (best_feature))
#内容:1.读取文档数据生成数据集,2.利用数据集生成决策树,3.绘制决策树的图形 #时间:2018年5月30日 6月6日添加备注 import trees import treePlotter #11111111111111111111111111111111111111111111111111111 #利用手动创建的数据集生成树,绘制树的图形,测试程序过程步骤 #输出手动创建的数据集,计算香农熵 myDat,labels=trees.createDataSet() print "myDat 数据集是:",myDat print "\nlabels 标签是:",labels rawCalc =trees.calcShannonEnt(myDat) print "\ncalcShannonEnt(myDat) 数据集的原始熵是:",rawCalc print "\ntrees.splitDataSet( myDat,1,1)将数据集的按 特征[1]=1(即 flippers==1) 提取出来的矩阵是:",trees.splitDataSet(myDat,1,1) # bestLabel = trees.chooseBestFeatureToSplit(myDat) print "\nchooseBestFeatureToSplit(myDat) 数据集的bestLabel最好特征的[下标]是:",bestLabel,"\tlabels[bestLabel]最好特征是:",labels[bestLabel] # myTree = trees.createTree(myDat,labels) print "\ntrees.createTree(myDat,labels) 根据数据集创建的树是:", myTree #读取预先存储的树[0] 并绘制图形 print "\n读取预先存储的树[0] 并绘制出第一个图形:" myTree0 = treePlotter.retrieveTree(0) treePlotter.createPlot(myTree0) #读取预先存储的树[1] 并绘制图形 print "\n读取预先存储的树[1] 并绘制出第二个图形:" myTree1 = treePlotter.retrieveTree(1) treePlotter.createPlot(myTree1) #change one date in "no surfacing" #and print