Ejemplo n.º 1
0
def test1():
    """Smoke-test the trees module: build the toy data set, show it,
    and print the index of the best feature to split on."""
    sample_data, feature_names = trees.createDataSet()
    print(sample_data)
    print(feature_names)
    # print(trees.calcShannonEnt(sample_data))
    # print(trees.splitDataSet(sample_data, 0, 0))
    print(trees.chooseBestFeatureToSplit(sample_data))
Ejemplo n.º 2
0
dataset = []
#labels '年龄', '处方', '散光', '眼镜材质'
labels = ['age', 'prescript', 'astigmatic', 'tearRate']
for line in fr.readlines():
    d = line.strip().split('\t')
    dataset.append(d)
fr.close()

print dataset
print '\n'

print '数据集类的香农熵:'
print trees.calcShannonEnt(dataset)
print '\n'

bestFeatureColumn = trees.chooseBestFeatureToSplit(dataset)
print '数据集最佳分类的属性是:'
print labels[bestFeatureColumn]
print '\n'

print '决策树:'
Tree = trees.createTree(dataset, labels)
print Tree
firstFeature = Tree.keys()[0]
print firstFeature
firstFeatureValues = Tree[firstFeature].keys()
print firstFeatureValues
print '\n'

treePlotter.createPlot(Tree)
Ejemplo n.º 3
0
# Step-by-step walkthrough of the chapter-3 decision-tree API
# (interactive-session transcript; `reload` is the Python 2 builtin).
myDat, labels = trees.createDataSet()
trees.calcShannonEnt(myDat)

# Mutate one class label to a third value and recompute the entropy.
myDat[0][-1] = 'maybe'
trees.calcShannonEnt(myDat)

# Test splitDataSet.
reload(trees)
myDat, labels = trees.createDataSet()
trees.splitDataSet(myDat, 0, 1)
trees.splitDataSet(myDat, 0, 0)

# Test chooseBestFeatureToSplit.
reload(trees)
myDat, labels = trees.createDataSet()
trees.chooseBestFeatureToSplit(myDat)

# Test tree creation.
reload(trees)
myDat, labels = trees.createDataSet()
myTree = trees.createTree(myDat, labels)
myTree

# Test the matplotlib plotting helper.
import treePlotter
treePlotter.createPlot()

# Test the leaf-count and tree-depth helpers.
reload(treePlotter)
treePlotter.retrieveTree(1)
myTree = treePlotter.retrieveTree(0)
Ejemplo n.º 4
0
	def testChooseBestFeatureToSplit(self):
		"""chooseBestFeatureToSplit should pick feature 0 on the toy data set."""
		myDat,labels = TreesTestCase.createDataSet()
		featureIndex = trees.chooseBestFeatureToSplit(myDat)
		self.assertEqual(featureIndex, 0)
Ejemplo n.º 5
0
# -*- coding: utf-8 -*-
#  os.chdir('D:\www\IdeaProject\MLiA_SourceCode\machinelearninginaction')
# print os.getcwd()
from numpy.ma import zeros, array  # NOTE(review): imported but unused below


# Python 2 demo: exercise trees/treePlotter on the toy data set, then
# build and plot a decision tree for the lenses data file.
if __name__ == '__main__':
    print 'hello'
    import trees
    import treePlotter
    # Toy data set and its Shannon entropy.
    data,lables = trees.createDataSet()
    print data
    print lables
    shannonEnt = trees.calcShannonEnt(data)
    print shannonEnt
    # Mutate one class label and recompute the entropy.
    data[0][-1] ='maybe'
    print trees.calcShannonEnt(data)
    print data
    print trees.chooseBestFeatureToSplit(data)
    # NOTE(review): the tree below is built from the mutated data set.
    mytree = trees.createTree(data,lables)
    print mytree
    # Lenses data: tab-separated text file, one example per line.
    fr = open('lenses.txt')
    lenses = [inst.strip().split('\t')  for inst in fr.readlines()]
    lensesLables = ['age','prescript','astigmatic','tearRate']
    lensesTree = trees.createTree(lenses,lensesLables)
    print lensesTree
    treePlotter.createPlot(lensesTree)
Ejemplo n.º 6
0

# createPlot(thisTree)

# Demo of the chapter-3 API (Python 3 prints). NOTE(review): retrieveTree,
# getNumLeafs, getTreeDepth and createPlot are used unqualified below —
# presumably star-imported from treePlotter earlier in the original file.
myData, labels = trees.createDataSet()
print(myData)
print(trees.calcShannonEnt(myData))
print('---------------------------------')
print(trees.splitDataSet(myData, 0, 1))
print(trees.splitDataSet(myData, 0, 0))
print(trees.splitDataSet(myData, 1, 1))
print(trees.splitDataSet(myData, 1, 0))
print('---------------------------------')
myData, labels = trees.createDataSet()
print(myData)
print('第', trees.chooseBestFeatureToSplit(myData), '个特征是最好的用于划分数据集的特征')
print('---------------------------------')
myData, labels = trees.createDataSet()
myTree = trees.createTree(myData, labels)
print('myTree=', myTree)
print('---------------------------------')
# createPlot()
print('---------------------------------')
print(retrieveTree(1))
myTree = retrieveTree(0)
print(myTree)
print(getNumLeafs(myTree))
print(getTreeDepth(myTree))
print('---------------------------------')
myTree = retrieveTree(0)
createPlot(myTree)
# author: zhumenger
import trees

# Build the toy data set and inspect it.
data_set, label_list = trees.createDataSet()
print(data_set)
print(label_list)
# Higher entropy means the class labels are more mixed.
print(trees.calcShannonEnt(data_set))

# A third class value changes the entropy.
data_set[0][-1] = 'maybe'
print(trees.calcShannonEnt(data_set))

# Exercise splitDataSet().
print(trees.splitDataSet(data_set, 0, 1))
print(trees.splitDataSet(data_set, 0, 0))

trees.chooseBestFeatureToSplit(data_set)

# Find the best feature to split on (feature 0 for this data).
print(trees.chooseBestFeatureToSplit(data_set))

# Listing 3-4: grow the full decision tree.
print(trees.createTree(data_set, label_list))
Ejemplo n.º 8
0
#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Python 2 demo: compare two best-feature selectors on createDataSet1's
# data, build a tree, and classify one sample with it.
import trees

if __name__ == '__main__':
    data = trees.createDataSet1()
    # print data
    dataSet = data[0]
    lables = data[1]  # passed as the label list to createTree below
    print dataSet
    # Two selection strategies exposed by the trees module.
    feature = trees.chooseBestFeatureToSplit(dataSet)
    feature1 = trees.chooseBestFeatureToSplit1(dataSet)
    print feature
    print feature1
    mytree = trees.createTree(dataSet, lables)
    print mytree

    print trees.splitDataSet(dataSet, 0, 1)

    # Classify one example against the grown tree.
    featLabels = ['outlook', 'temperature', 'humidity', 'windy']
    testVec = [0, 1, 0, 0]
    print trees.classify(mytree, featLabels, testVec)
Ejemplo n.º 9
0
# Shannon entropy of the toy data set.
shannon_entropy = DT.calcShannonEnt(myDat)  # 0.9709505944546686

# Making the data messier (a third class value) changes the entropy:
# myDat[0][-1] = 'maybe'
# shannon_entropy = DT.calcShannonEnt(myDat)  # -> 1.3709505944546687

# Rows where feature 0 == 1 (feature column stripped, per expected output).
subset = DT.splitDataSet(myDat, 0, 1)  # [[1, 'yes'], [1, 'yes'], [0, 'no']]
# Rows where feature 0 == 0.
subset = DT.splitDataSet(myDat, 0, 0)  # [[1, 'no'], [1, 'no']]

best_feature_index = DT.chooseBestFeatureToSplit(myDat)  # 0

# Grow the full decision tree.
demo_tree = DT.createTree(myDat, labels)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

import treePlotter as TP

# TP.createPlot()
demo_tree = TP.retrieveTree(0)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
leaf_count = TP.getNumLeafs(demo_tree)  # 3
tree_depth = TP.getTreeDepth(demo_tree)  # 2

TP.createPlot(demo_tree)
Ejemplo n.º 10
0
# -*- coding: utf-8 -*-

# Find the best column to split the data set on.
import trees
data, labels = trees.createDataSet()
trees.chooseBestFeatureToSplit(data)

# Build the decision tree.
import trees
data, labels = trees.createDataSet()
trees.createTree(data, labels)

# Plot a stored tree.
import treePlotter
plot_tree = treePlotter.retrieveTree(0)
treePlotter.createPlot(plot_tree)

# Classify a sample with the decision tree.
import trees
import treePlotter
stored_tree = treePlotter.retrieveTree(0)
data, labels = trees.createDataSet()
trees.classify(stored_tree, labels, [0, 0])

# Serialize, then deserialize, the decision tree.
import trees
import treePlotter
stored_tree = treePlotter.retrieveTree(0)
trees.storeTree(stored_tree, 'classifierStorage.txt')
loaded_tree = trees.grabTree('classifierStorage.txt')
# Python 2 walkthrough: entropy, splits, best feature, tree creation,
# then the treePlotter helpers on a stored demo tree.
import trees
import treePlotter

myDat, labels = trees.createDataSet()
print myDat
print trees.calcShannonEnt(myDat)
print trees.splitDataSet(myDat, 0, 1)
print trees.splitDataSet(myDat, 0, 0)
print trees.splitDataSet(myDat, 1, 1)
print trees.chooseBestFeatureToSplit(myDat)
print trees.createTree(myDat, labels)

treePlotter.createPlot()
print 'createPlot over'

print treePlotter.retrieveTree(1)
myTree = treePlotter.retrieveTree(0)
print treePlotter.getNumLeafs(myTree)
print treePlotter.getTreeDepth(myTree)
Ejemplo n.º 12
0
def testChooseBestFeatureToSplit(dataSet):
    """Print the index trees.chooseBestFeatureToSplit picks for dataSet."""
    best_index = trees.chooseBestFeatureToSplit(dataSet)
    print(best_index)
Ejemplo n.º 13
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


# Source: http://www.cnblogs.com/hantan2008/archive/2015/07/27/4674097.html
# Decision-tree (ID3) classification demo: read a data set from a text
# file, print its entropy, best split column and tree, then plot the tree.



import trees
import treePlotter

if __name__ == '__main__':
    pass

# NOTE(review): the demo below runs at import time; the __main__ guard
# above is empty.
myDat, labels = trees.createDataSetFromTXT("dataset.txt")

shan = trees.calcShannonEnt(myDat)
print shan

col = trees.chooseBestFeatureToSplit(myDat)
print col

Tree = trees.createTree(myDat, labels)
print Tree

treePlotter.createPlot(Tree)
Ejemplo n.º 14
0
    # NOTE(review): fragment — the enclosing `def createTree(dataSet, labels):`
    # line and the classList construction are missing from this excerpt
    # (the recursive call below fixes the function's name as createTree).
    if classList.count(classList[0])==len(classList):
        return classList[0]#stop the splitting when class are same
    if len(dataSet[0])==1:
        # Presumably only the class column remains; fall back to majorityCnt.
        return majorityCnt(classList)
    bestFeat=chooseBestFeatureToSplit(dataSet)
    bestFeatLabel=labels[bestFeat]
    myTree={bestFeatLabel:{}}
    # Remove the chosen feature's label — note this mutates the caller's list.
    del (labels[bestFeat])
    featValues=[example[bestFeat] for example in dataSet]
    uniqueVals=set(featValues)
    for value in uniqueVals:
        # Copy labels so sibling branches don't see this branch's deletions.
        subLabels=labels[:]
        myTree[bestFeatLabel][value]=createTree(splitDataSet\
                                                    (dataSet,bestFeat,value),subLabels)
    return myTree


if __name__ == "__main__":
    import trees
    myDat,labels=trees.createDataSet()
    print(myDat,'\n',labels)
    print("The shannonEnt is :",trees.calcShannonEnt(myDat))

    print("\nsplitDataSet(myDat,1,1):\n",trees.splitDataSet(myDat,1,1))
    #print("\nsplitDataSet(myDat,0,0):\n",trees.splitDataSet(myDat,0,0))

    print("\nchooseBestFeatureToSplit: ",trees.chooseBestFeatureToSplit(myDat))

    # Uses the local createTree defined above (which deletes entries from
    # `labels` as it recurses, so labels is shorter afterwards).
    myTree=createTree(myDat,labels)
    print("\nmyTree:\n",myTree)
Ejemplo n.º 15
0
from numpy import array  # NOTE(review): unused below

# Python 2 walkthrough of the trees module
# (`reload` is the Python 2 builtin).
import trees
reload(trees)
myDat,labels=trees.createDataSet()
print myDat
print trees.calcShannonEnt(myDat)
print trees.splitDataSet(myDat,0,1)
print trees.splitDataSet(myDat,0,0)
print trees.chooseBestFeatureToSplit(myDat)
myTree = trees.createTree(myDat,labels)
print myTree
Ejemplo n.º 16
0
import trees

# Walkthrough of the decision-tree API; the bare triple-quoted string
# sections below are deliberately disabled demo steps.
mydata,features = trees.createDataSet()
print(mydata)
print(features)
print(trees.calcShannonEnt(mydata))
'''
mydata[0][-1] = 'maybe'
print(trees.calcShannonEnt(mydata))
'''
#print(trees.splitDataSet(mydata,0,1))

# Index of the best feature to split on.
index = trees.chooseBestFeatureToSplit(mydata)
#print(index)
'''
mytree = trees.createTree(mydata,features)
print(mytree)
'''
import treePlotter
'''
mytree = treePlotter.retrieveTree(0)
treePlotter.createPlot(mytree)
mytree['no surfacing'][3] = 'maybe'
treePlotter.createPlot(mytree)
'''

# Classify two samples with a stored tree, then persist it via storeTree.
mytree = treePlotter.retrieveTree(0)
print(trees.classify(mytree,features,[0,0]))
print(trees.classify(mytree,features,[1,1]))

trees.storeTree(mytree, 'classifier.txt')
Ejemplo n.º 17
0
import trees

# Build the toy data set and pick the index of the best split feature.
data_set, feature_labels = trees.createDataSet()
best_feature_index = trees.chooseBestFeatureToSplit(data_set)
# print(best_feature_index)

# Core idea: pick the feature whose split gives the best Shannon-entropy gain.
import treePlotter

# homedir= os.getcwd()+'/machinelearninginaction/ch03/'  # absolute path
homedir = ''  # relative path (NOTE(review): unused below)

# 3.1.1 Information gain
myDat, labels = trees.createDataSet()
print "计算香农熵:", trees.calcShannonEnt(myDat)
# Mutate one class label (note the leading space in ' maybe') and recompute.
myDat[0][-1] = ' maybe'
print "计算香农熵:", trees.calcShannonEnt(myDat)

# 3.1.2 Splitting the data set
myDat, labels = trees.createDataSet()
trees.splitDataSet(myDat, 0, 1)
trees.splitDataSet(myDat, 0, 0)
print "选择最好的数据集划分方式:", trees.chooseBestFeatureToSplit(myDat)

# 3.1.3 Recursively building the decision tree
myDat, labels = trees.createDataSet()
myTree = trees.createTree(myDat, labels)
print "myTree:", myTree

# 3.2.1 Matplotlib annotations
treePlotter.createPlot()

# 3.2.2 Building the annotation tree
treePlotter.retrieveTree(1)
myTree = treePlotter.retrieveTree(0)
print "获取叶节点的数目:", treePlotter.getNumLeafs(myTree)
print "获取树的层数:", treePlotter.getTreeDepth(myTree)
treePlotter.createPlot(myTree)
Ejemplo n.º 19
0
 def test_bestChoose(self):
     """Print the toy data set and the best split feature index chosen by trees."""
     dataSet, label = trees.createDataSet()
     print("\n dataSet == %s" % (dataSet))
     bestFeature = trees.chooseBestFeatureToSplit(dataSet)
     print("\n bestFeature == %s" % (bestFeature))
Ejemplo n.º 20
0
# Contents: 1) read data to build a data set, 2) grow a decision tree from
# it, 3) plot the tree. (Date: 2018-05-30; notes added 06-06.)

import trees
import treePlotter
#11111111111111111111111111111111111111111111111111111
# Build a tree from the hand-made data set, plot it, and show each step.
# Print the data set and compute its Shannon entropy.
myDat,labels=trees.createDataSet()
print "myDat 数据集是:",myDat
print "\nlabels 标签是:",labels
rawCalc =trees.calcShannonEnt(myDat)
print "\ncalcShannonEnt(myDat) 数据集的原始熵是:",rawCalc
print "\ntrees.splitDataSet( myDat,1,1)将数据集的按 特征[1]=1(即 flippers==1) 提取出来的矩阵是:",trees.splitDataSet(myDat,1,1)
# Best split feature and its label.
bestLabel = trees.chooseBestFeatureToSplit(myDat)
print "\nchooseBestFeatureToSplit(myDat) 数据集的bestLabel最好特征的[下标]是:",bestLabel,"\tlabels[bestLabel]最好特征是:",labels[bestLabel]
# Grow the tree from the data set.
myTree = trees.createTree(myDat,labels)
print "\ntrees.createTree(myDat,labels) 根据数据集创建的树是:", myTree
# Load pre-stored tree [0] and plot it.
print "\n读取预先存储的树[0] 并绘制出第一个图形:"
myTree0 = treePlotter.retrieveTree(0)
treePlotter.createPlot(myTree0)
# Load pre-stored tree [1] and plot it.
print "\n读取预先存储的树[1] 并绘制出第二个图形:"
myTree1 = treePlotter.retrieveTree(1)
treePlotter.createPlot(myTree1)

# change one datum in "no surfacing"
# and print