def tests():
    """Exercise dataset creation, entropy, tree building and classification.

    Fix: converted Python 2 `print` statements to the Python 3 function form.
    Relies on a module-level `trees` import elsewhere in the file.
    """
    dataSet, labels = trees.createDataSet()
    print(dataSet)
    print(trees.calcShannonEnt(dataSet))
    myTree = trees.createTree(dataSet, labels)
    print(myTree, labels)
    # Classify every combination of the two binary features.
    print(trees.classify(myTree, labels, [1, 0]))
    print(trees.classify(myTree, labels, [1, 1]))
    print(trees.classify(myTree, labels, [0, 0]))
    print(trees.classify(myTree, labels, [0, 1]))
def test_shannon(self):
    """Show that entropy grows as the class labels become more mixed."""
    dataSet, labels = trees.createDataSet()
    print("\n dataSet == %s" % (dataSet))
    shannon = trees.calcShannonEnt(dataSet)
    print("\n shannon == %s" % (shannon))
    # More uncertainty (an extra 'yes?no' class value) means higher entropy.
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'yes?no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    shannon = trees.calcShannonEnt(dataSet)
    print("\n shannon == %s" % (shannon))
def main():
    """Show how entropy rises when an extra class label is introduced.

    Fix: converted Python 2 `print` statements to the Python 3 function form.
    """
    myDat, labels = trees.createDataSet()
    print('create data:')
    print(myDat)
    shan = trees.calcShannonEnt(myDat)
    print('calc shan:')
    print(shan)
    # Replacing one label with a third class value increases the entropy.
    myDat[0][-1] = 'maybe'
    shan1 = trees.calcShannonEnt(myDat)
    print('change data and calc shan1:')
    print(shan1)
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain.

    The last column of each row is the class label; every other column is
    a candidate feature. Uses `tr.calcShannonEnt` / `sd.splitDataSet`.
    """
    featureCount = len(dataSet[0]) - 1          # last column is the class label
    baseEntropy = tr.calcShannonEnt(dataSet)    # H(D) of the whole set
    bestGain = 0.0
    bestIndex = -1
    for idx in range(featureCount):
        # Conditional entropy H(D|A) for feature idx: weighted sum of the
        # entropies of the subsets produced by each distinct feature value.
        distinctValues = {sample[idx] for sample in dataSet}
        condEntropy = 0.0
        for val in distinctValues:
            subset = sd.splitDataSet(dataSet, idx, val)
            weight = len(subset) / float(len(dataSet))
            condEntropy += weight * tr.calcShannonEnt(subset)
        gain = baseEntropy - condEntropy        # information gain of this feature
        print("第%d个特征的增益为%.3f" % (idx, gain))
        if gain > bestGain:
            bestGain = gain
            bestIndex = idx
    return bestIndex
# Build the lenses decision tree from the tab-separated data file.
# Fixes: the `open()` call was commented out while `fr` was still used
# (NameError), and `unicode(...)` is Python-2-only; read the file as UTF-8
# text instead, via a context manager so the handle is closed.
with open('lensesCN.txt', encoding='utf-8') as fr:
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
# lensesLabels = ["年龄组", "规定", "闪光", "泪液扫除率"]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = tr.createTree(lenses, lensesLabels)
print(lensesTree)
tp.createPlot(lensesTree)

# Entropy of the toy data set.
dataSet, labels = tr.createDataSet()
shannonEnt = tr.calcShannonEnt(dataSet)
print(shannonEnt)

# Inspect the canned trees shipped with the plotting module.
print(tp.retrieveTree(1))
myTree = tp.retrieveTree(0)
numLeafs = tp.getNumLeafs(myTree)
treeDepth = tp.getTreeDepth(myTree)
print(numLeafs)
print(treeDepth)
myTree = tp.retrieveTree(0)
tp.createPlot(myTree)
import trees

# Print the toy data set, its labels and its Shannon entropy.
sample_data, feature_names = trees.createDataSet()
entropy = trees.calcShannonEnt(sample_data)
print(sample_data)
print(feature_names)
print(entropy)
import trees

# Walk through the basic tree-building pipeline on the toy data set.
data, feature_labels = trees.createDataSet()
print(data)
print(feature_labels)
print(trees.calcShannonEnt(data))
print(trees.splitDataSet(data, 0, 0))
print(trees.createTree(data, feature_labels))
import trees

# Inspect the toy data set and its entropy.
dataset, attribute_names = trees.createDataSet()
print(dataset)
print(attribute_names)
print(trees.calcShannonEnt(dataset))

# Pick the attribute with the highest information gain.
best_index = trees.chooseBestFeatureToSplit(dataset)

import treePlotter

# Classify two samples against a canned tree, then persist it to disk.
canned_tree = treePlotter.retrieveTree(0)
print(trees.classify(canned_tree, attribute_names, [0, 0]))
print(trees.classify(canned_tree, attribute_names, [1, 1]))
trees.storeTree(canned_tree, 'classifier.txt')
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Chapter-3 driver: information gain, data-set splitting, tree building,
# and Matplotlib annotation plotting.
# Fix: converted Python 2 `print` statements to the Python 3 function form.
import os
import trees
import treePlotter

# homedir = os.getcwd()+'/machinelearninginaction/ch03/'  # absolute path
homedir = ''  # relative path

# 3.1.1 Information gain
myDat, labels = trees.createDataSet()
print("计算香农熵:", trees.calcShannonEnt(myDat))
myDat[0][-1] = ' maybe'  # introduce a third class value; entropy should rise
print("计算香农熵:", trees.calcShannonEnt(myDat))

# 3.1.2 Splitting the data set
myDat, labels = trees.createDataSet()
trees.splitDataSet(myDat, 0, 1)
trees.splitDataSet(myDat, 0, 0)
print("选择最好的数据集划分方式:", trees.chooseBestFeatureToSplit(myDat))

# 3.1.3 Recursively building the decision tree
myDat, labels = trees.createDataSet()
myTree = trees.createTree(myDat, labels)
print("myTree:", myTree)

# 3.2.1 Matplotlib annotations
treePlotter.createPlot()

# 3.2.2 Constructing the annotation tree
import trees

# Print the Shannon entropy of the toy data set.
sample_set, class_labels = trees.createDataSet()
print(trees.calcShannonEnt(sample_set))
# coding=utf-8
# Hand-computed conditional entropy / information gain walkthrough.
# Fix: converted Python 2 `print` statements to the Python 3 function form.
import trees
import treePlotter

dateset, labels = trees.createDataSet()
print(dateset)
print(labels)

shannon = trees.calcShannonEnt(dateset)
print(shannon)

print('--条件熵')
# Conditional entropy: entropies of the subsets split on feature 0,
# weighted by each subset's empirical probability.
ha = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 5))
hb = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 3))
hc = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 1))
hxy = 1.0 / 3 * ha + 1.0 / 4 * hb + 5.0 / 12 * hc
print(ha, hb, hc)
print(hxy)

print('--信息增益')
ig = shannon - hxy  # information gain = H(Y) - H(Y|X)
print(ig)

print('--找到最佳分类特征')
feature = trees.chooseBestFeatureToSplit(dateset)
print(labels[feature])

print('--创建决策树')
# Copy the labels first: createTree mutates the list it is given.
labelsCopy = labels[:]
tree = trees.createTree(dateset, labelsCopy)
print(tree)

# print('--画图')
# treePlotter.createPlot(tree)

print('--用决策树测试数据')
# mytree = treePlotter.retrieveTree(0)
testdata = [4, 4, 1, 'cha']
import trees as t

# Entropy rises when the class labels become more mixed.
samples, names = t.createDataSet()
samples[0][-1] = 'test'  # introduce a third class value
print(t.calcShannonEnt(samples))
print(t.calcShannonEnt([[1, 0, 'yes'], [1, 1, 'no']]))
import trees

# Baseline entropy of the toy data set.
records, names = trees.createDataSet()
shannon = trees.calcShannonEnt(records)
print(shannon)

# A third class value ('maybe') raises the entropy.
records[0][-1] = 'maybe'
shannon = trees.calcShannonEnt(records)
print(shannon)
# coding=utf-8
# Program: decision-tree construction and plotting.
#   1. create the data set, 2. grow the tree from it, 3. draw the tree figures.
# Dates: 2018-05-30, notes added 2018-06-06.
# Fix: converted Python 2 `print` statements to the Python 3 function form.
import trees
import treePlotter

# Build the hand-made data set and compute its raw entropy.
myDat, labels = trees.createDataSet()
print("myDat 数据集是:", myDat)
print("\nlabels 标签是:", labels)

rawCalc = trees.calcShannonEnt(myDat)
print("\ncalcShannonEnt(myDat) 数据集的原始熵是:", rawCalc)

print("\ntrees.splitDataSet( myDat,1,1)将数据集的按 特征[1]=1(即 flippers==1) 提取出来的矩阵是:", trees.splitDataSet(myDat, 1, 1))

bestLabel = trees.chooseBestFeatureToSplit(myDat)
print("\nchooseBestFeatureToSplit(myDat) 数据集的bestLabel最好特征的[下标]是:", bestLabel, "\tlabels[bestLabel]最好特征是:", labels[bestLabel])

myTree = trees.createTree(myDat, labels)
print("\ntrees.createTree(myDat,labels) 根据数据集创建的树是:", myTree)

# Load the pre-stored tree [0] and draw the first figure.
print("\n读取预先存储的树[0] 并绘制出第一个图形:")
myTree0 = treePlotter.retrieveTree(0)
treePlotter.createPlot(myTree0)

# Load the pre-stored tree [1] and draw the second figure.
print("\n读取预先存储的树[1] 并绘制出第二个图形:")
myTree1 = treePlotter.retrieveTree(1)
def testCalcShannonEnt(self):
    """Entropy of the canonical 5-sample data set rounds to 0.971."""
    sample_set, _labels = TreesTestCase.createDataSet()
    rounded = float('%0.3f' % trees.calcShannonEnt(sample_set))
    self.assertEqual(rounded, 0.971)
def testCalcShannonEnt(dataSet):
    """Print and return the Shannon entropy of *dataSet*."""
    entropy = trees.calcShannonEnt(dataSet)
    print(entropy)
    return entropy
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Source: http://www.cnblogs.com/hantan2008/archive/2015/07/27/4674097.html
# Decision-tree classification (ID3 algorithm).
# Fixes: converted Python 2 `print` statements to the Python 3 function form,
# and moved the script body under the __main__ guard (it previously held a
# bare `pass` while the work ran at import time).
import trees
import treePlotter

if __name__ == '__main__':
    # Load the data set from disk, then build and plot the tree.
    myDat, labels = trees.createDataSetFromTXT("dataset.txt")
    shan = trees.calcShannonEnt(myDat)
    print(shan)
    col = trees.chooseBestFeatureToSplit(myDat)
    print(col)
    Tree = trees.createTree(myDat, labels)
    print(Tree)
    treePlotter.createPlot(Tree)
def createTree(dataSet, labels):
    """Recursively grow an ID3 decision tree (nested dicts) from dataSet.

    NOTE(review): the original fragment was missing the `def` header and the
    `classList` initialization (a syntax error as written); both are
    reconstructed here from the recursive call and the __main__ driver below.
    """
    classList = [example[-1] for example in dataSet]
    # Stop splitting when every sample carries the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features left (only the label column remains): majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]  # this feature is consumed at this node
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subLabels = labels[:]  # copy: the recursion mutates its labels list
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


if __name__ == "__main__":
    import trees
    myDat, labels = trees.createDataSet()
    print(myDat, '\n', labels)
    print("The shannonEnt is :", trees.calcShannonEnt(myDat))
    print("\nsplitDataSet(myDat,1,1):\n", trees.splitDataSet(myDat, 1, 1))
    # print("\nsplitDataSet(myDat,0,0):\n", trees.splitDataSet(myDat, 0, 0))
    print("\nchooseBestFeatureToSplit: ", trees.chooseBestFeatureToSplit(myDat))
    myTree = createTree(myDat, labels)
    print("\nmyTree:\n", myTree)
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
import trees

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    dataset, feature_names = trees.createDataSet()
    entropy = trees.calcShannonEnt(dataset)
    print(entropy)  # higher entropy means the data is more mixed
    # Add a third class value 'maybe' and watch the entropy rise.
    dataset[0][-1] = 'maybe'
    entropy = trees.calcShannonEnt(dataset)
    print(entropy)
# autor: zhumenger
import trees

examples, names = trees.createDataSet()
print(examples)
print(names)
# Higher entropy means the class labels are more mixed.
print(trees.calcShannonEnt(examples))
examples[0][-1] = 'maybe'
print(trees.calcShannonEnt(examples))

# Exercise splitDataSet().
print(trees.splitDataSet(examples, 0, 1))
print(trees.splitDataSet(examples, 0, 0))

# Find the best way to split; feature 0 wins on this data.
trees.chooseBestFeatureToSplit(examples)
print(trees.chooseBestFeatureToSplit(examples))

# 3-4: build the full tree.
print(trees.createTree(examples, names))
# -*- coding: utf-8 -*-
# Toy-data and contact-lenses decision-tree driver.
# Fixes: converted Python 2 `print` statements to the Python 3 function form,
# and opened lenses.txt via a context manager so the handle is closed.
# os.chdir('D:\www\IdeaProject\MLiA_SourceCode\machinelearninginaction')
# print os.getcwd()
from numpy.ma import zeros, array

if __name__ == '__main__':
    print('hello')
    import trees
    import treePlotter
    data, lables = trees.createDataSet()
    print(data)
    print(lables)
    shannonEnt = trees.calcShannonEnt(data)
    print(shannonEnt)
    # A third class value raises the entropy.
    data[0][-1] = 'maybe'
    print(trees.calcShannonEnt(data))
    print(data)
    print(trees.chooseBestFeatureToSplit(data))
    mytree = trees.createTree(data, lables)
    print(mytree)
    # Build a tree from the contact-lenses data file and plot it.
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLables = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = trees.createTree(lenses, lensesLables)
    print(lensesTree)
    treePlotter.createPlot(lensesTree)
import trees

# Compute and print the entropy of the toy data set.
data_matrix, label_list = trees.createDataSet()
entropy = trees.calcShannonEnt(data_matrix)
print(entropy)
# Build and inspect a decision tree from the lenses data file.
# Fixes: converted Python 2 `print` statements to the Python 3 function form;
# `Tree.keys()[0]` fails on Python 3 (dict views are not indexable), so the
# views are materialized with list(); the file is opened via `with` so the
# handle is always closed.
import trees
import treePlotter

dataset = []
# labels: '年龄', '处方', '散光', '眼镜材质'
labels = ['age', 'prescript', 'astigmatic', 'tearRate']
with open('lenses.txt') as fr:
    for line in fr.readlines():
        dataset.append(line.strip().split('\t'))

print(dataset)
print('\n')

print('数据集类的香农熵:')
print(trees.calcShannonEnt(dataset))
print('\n')

bestFeatureColumn = trees.chooseBestFeatureToSplit(dataset)
print('数据集最佳分类的属性是:')
print(labels[bestFeatureColumn])
print('\n')

print('决策树:')
Tree = trees.createTree(dataset, labels)
print(Tree)

firstFeature = list(Tree.keys())[0]
print(firstFeature)
firstFeatureValues = list(Tree[firstFeature].keys())
print(firstFeatureValues)
print('\n')
# -*- coding: utf-8 -*-
# Entropy and data-set splitting walkthrough.
# Fix: converted Python 2 `print` statements to the Python 3 function form.
import trees

dataSet, labels = trees.createDataSet()
print(dataSet)
print(labels)

# Entropy of the original set.
print(trees.calcShannonEnt(dataSet))  # 0.970950594455
# A third class value means more mixing, hence higher entropy.
dataSet[0][-1] = 'maybe'
print(dataSet)
print(trees.calcShannonEnt(dataSet))  # 1.37095059445

# Restore the original label.
dataSet[0][-1] = 'yes'
print(dataSet)
# [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]

# Split the data set.
# Rows where column 0 equals 0:
print(trees.splitDataSet(dataSet, 0, 0))
# [[1, 'no'], [1, 'no']]
# Rows where column 0 equals 1:
print(trees.splitDataSet(dataSet, 0, 1))
# -*- coding: utf-8 -*-
import trees

# Entropy of the second toy data set, printed to two decimals.
records, names = trees.createDataSet2()
print(records)
entropy = trees.calcShannonEnt(records)
print("result is: %.2f" % entropy)
import trees as DT

# Entropy of the toy set: 0.9709505944546686.
myDat, labels = DT.createDataSet()
entropy = DT.calcShannonEnt(myDat)

# Splits on feature 0:
#   value 1 -> [[1, 'yes'], [1, 'yes'], [0, 'no']]
#   value 0 -> [[1, 'no'], [1, 'no']]
splittedDat = DT.splitDataSet(myDat, 0, 1)
splittedDat = DT.splitDataSet(myDat, 0, 0)

# Feature 0 has the highest information gain.
bestFeature = DT.chooseBestFeatureToSplit(myDat)

# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
myTree = DT.createTree(myDat, labels)

import treePlotter as TP

# Canned tree 0 has 3 leaves and depth 2.
myTree = TP.retrieveTree(0)
n = TP.getNumLeafs(myTree)
d = TP.getTreeDepth(myTree)
from imp import reload
import trees

# Build a tree from the toy data, then draw the demo plot.
demo_data, demo_labels = trees.createDataSet()
trees.calcShannonEnt(demo_data)
myTree = trees.createTree(demo_data, demo_labels)

import treePlotter
treePlotter.createPlot()
#coding=utf-8
# Entropy of the whole set and of each split subset, then the full tree.
# Fix: the script mixed Python 2 `print x` statements with `print(x)` calls;
# all prints now use the Python 3 function form.
__author__ = 'baconLIN'

import trees

# Base data set and its entropy.
myDat, labels = trees.createDataSet()
print(myDat)
shannonEnt = trees.calcShannonEnt(myDat)
print(shannonEnt)

# Entropy of each split subset.
mySplit1 = trees.splitDataSet(myDat, 0, 1)
print(mySplit1)
shannonEntSplit1 = trees.calcShannonEnt(mySplit1)
print(shannonEntSplit1)

mySplit2 = trees.splitDataSet(myDat, 0, 0)
print(mySplit2)
shannonEntSplit2 = trees.calcShannonEnt(mySplit2)
print(shannonEntSplit2)

mySplit3 = trees.splitDataSet(myDat, 1, 1)
print(mySplit3)
shannonEntSplit3 = trees.calcShannonEnt(mySplit3)
print(shannonEntSplit3)

# Best feature, the full tree, and a canned tree from the plotter.
bestFeature = trees.chooseBestFeatureToSplit(myDat)
print(bestFeature)
myTree = trees.createTree(myDat, labels)
print(myTree)

import treePlotter
myTree2 = treePlotter.retrieveTree(0)