def tests():
    """Smoke-test the trees module: build the sample data set, compute its
    Shannon entropy, grow a decision tree, and classify all four feature
    combinations.

    Fixed: original used Python 2 print statements (syntax error in Python 3).
    """
    dataSet, labels = trees.createDataSet()
    print(dataSet)
    print(trees.calcShannonEnt(dataSet))
    myTree = trees.createTree(dataSet, labels)
    print(myTree, labels)
    # Classify every combination of the two binary features.
    for sample in ([1, 0], [1, 1], [0, 0], [0, 1]):
        print(trees.classify(myTree, labels, sample))
Exemple #2
0
    def test_shannon(self):
        """Entropy of the stock data set, then of a noisier variant.

        The more mixed the class labels are, the higher the entropy.
        """
        samples, _labels = trees.createDataSet()
        print("\n dataSet == %s" % (samples))
        print("\n shannon == %s" % (trees.calcShannonEnt(samples)))

        # Greater uncertainty (an extra, ambiguous label) -> greater entropy.
        noisy = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'yes?no'],
                 [0, 1, 'no'], [0, 1, 'no']]
        print("\n shannon == %s" % (trees.calcShannonEnt(noisy)))
Exemple #3
0
def main():
    """Demonstrate calcShannonEnt: entropy rises after a third class label
    ('maybe') is introduced into the sample data set.

    Fixed: original used Python 2 print statements (syntax error in Python 3).
    """
    myDat, labels = trees.createDataSet()
    print('create data:')
    print(myDat)

    shan = trees.calcShannonEnt(myDat)
    print('calc shan:')
    print(shan)

    # A third class value increases the label mix, hence the entropy.
    myDat[0][-1] = 'maybe'
    shan1 = trees.calcShannonEnt(myDat)
    print('change data and calc shan1:')
    print(shan1)
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split yields the largest
    information gain over *dataSet* (last column is the class label);
    -1 when no split improves on zero gain."""
    feature_count = len(dataSet[0]) - 1           # number of feature columns
    base_entropy = tr.calcShannonEnt(dataSet)     # H(D): entropy before any split
    best_gain, best_index = 0.0, -1
    for col in range(feature_count):
        # Conditional entropy H(D|A) for feature `col`: weighted sum of the
        # entropies of the subsets produced by each distinct value.
        cond_entropy = 0.0
        for value in set(sample[col] for sample in dataSet):
            subset = sd.splitDataSet(dataSet, col, value)
            weight = len(subset) / float(len(dataSet))
            cond_entropy += weight * tr.calcShannonEnt(subset)
        gain = base_entropy - cond_entropy        # information gain of this feature
        print("第%d个特征的增益为%.3f" % (col, gain))
        if gain > best_gain:
            best_gain, best_index = gain, col
    return best_index
Exemple #5
0



#
# Build a contact-lens decision tree from a tab-separated UTF-8 data file,
# then demo the plotting helpers on the toy data set.
# Fixed: file handle was never closed, `.strip()` was called twice, and
# py2-only `unicode(...)` broke under Python 3 — reading the file as UTF-8
# text yields the same str rows.
with open('lensesCN.txt', encoding='utf-8') as fr:
    lenses = [line.strip().split('\t') for line in fr]
#lensesLabels = ["年龄组" , "规定", "闪光", "泪液扫除率"]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = tr.createTree(lenses, lensesLabels)
print(lensesTree)
tp.createPlot(lensesTree)

dataSet, labels = tr.createDataSet()

shannonEnt = tr.calcShannonEnt(dataSet)

print(shannonEnt)

print(tp.retrieveTree(1))

myTree = tp.retrieveTree(0)
numLeafs = tp.getNumLeafs(myTree)
treeDepth = tp.getTreeDepth(myTree)

print(numLeafs)
print(treeDepth)


myTree = tp.retrieveTree(0)
tp.createPlot(myTree)
import trees

# Print the toy data set, its labels, and its Shannon entropy.
sample_rows, sample_labels = trees.createDataSet()
entropy = trees.calcShannonEnt(sample_rows)
print(sample_rows)
print(sample_labels)
print(entropy)
import trees

# Walk the trees API end to end: entropy, one split, full tree.
rows, feature_labels = trees.createDataSet()
print(rows)
print(feature_labels)
print(trees.calcShannonEnt(rows))

print(trees.splitDataSet(rows, 0, 0))

print(trees.createTree(rows, feature_labels))
Exemple #8
0
import trees

# Exercise the trees module on the toy data set, then classify with a
# canned tree from treePlotter and persist it to disk.
samples, feature_names = trees.createDataSet()
print(samples)
print(feature_names)
print(trees.calcShannonEnt(samples))
# Disabled experiment: a third class label raises the entropy.
# mydata[0][-1] = 'maybe'
# print(trees.calcShannonEnt(mydata))

#print(trees.splitDataSet(mydata,0,1))

# Prints per-feature gains as a side effect; return value unused here.
best_index = trees.chooseBestFeatureToSplit(samples)
#print(index)
# Disabled: build and print a tree from this data.
# mytree = trees.createTree(mydata,features)
# print(mytree)
import treePlotter
# Disabled plotting experiments:
# mytree = treePlotter.retrieveTree(0)
# treePlotter.createPlot(mytree)
# mytree['no surfacing'][3] = 'maybe'
# treePlotter.createPlot(mytree)

canned_tree = treePlotter.retrieveTree(0)
print(trees.classify(canned_tree, feature_names, [0, 0]))
print(trees.classify(canned_tree, feature_names, [1, 1]))

trees.storeTree(canned_tree, 'classifier.txt')
#!/usr/bin/python
# -*- coding:utf-8 -*-

import os
import trees
import treePlotter

# homedir= os.getcwd()+'/machinelearninginaction/ch03/'  # absolute path
homedir = ''  # relative path

# 3.1.1 Information gain.
# Fixed: original used Python 2 print statements (syntax error in Python 3).
myDat, labels = trees.createDataSet()
print("计算香农熵:", trees.calcShannonEnt(myDat))
# A third class label increases the entropy.
myDat[0][-1] = ' maybe'
print("计算香农熵:", trees.calcShannonEnt(myDat))

# 3.1.2 Splitting the data set.
myDat, labels = trees.createDataSet()
trees.splitDataSet(myDat, 0, 1)
trees.splitDataSet(myDat, 0, 0)
print("选择最好的数据集划分方式:", trees.chooseBestFeatureToSplit(myDat))

# 3.1.3 Recursively building the decision tree.
myDat, labels = trees.createDataSet()
myTree = trees.createTree(myDat, labels)
print("myTree:", myTree)

# 3.2.1 Matplotlib annotations.
treePlotter.createPlot()

# 3.2.2 Constructing the annotation tree.
Exemple #10
0
import trees

# Print the Shannon entropy of the sample data set.
toy_rows, toy_labels = trees.createDataSet()

print(trees.calcShannonEnt(toy_rows))
Exemple #11
0
# coding=utf-8
import trees
import treePlotter

# Walk through entropy, conditional entropy, information gain, best-feature
# selection, and tree construction on the sample data set.
# Fixed: original used Python 2 print statements (syntax error in Python 3).
dateset, labels = trees.createDataSet()
print(dateset)
print(labels)
shannon = trees.calcShannonEnt(dateset)
print(shannon)
print('--条件熵')
# Conditional entropy: entropies of the subsets from splitting feature 0
# on values 5, 3 and 1, combined with their empirical weights below.
ha = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 5))
hb = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 3))
hc = trees.calcShannonEnt(trees.splitDataSet(dateset, 0, 1))
hxy = 1.0 / 3 * ha + 1.0 / 4 * hb + 5.0 / 12 * hc
print(ha, hb, hc)
print(hxy)
print('--信息增益')
ig = shannon - hxy
print(ig)
print('--找到最佳分类特征')
feature = trees.chooseBestFeatureToSplit(dateset)
print(labels[feature])
print('--创建决策树')
labelsCopy = labels[:]  # createTree mutates its label list; work on a copy
tree = trees.createTree(dateset, labelsCopy)
print(tree)
# print '--画图'
# treePlotter.createPlot(tree)
print('--用决策树测试数据')
#mytree = treePlotter.retrieveTree(0)
testdata = [4, 4, 1, 'cha']
import trees as t

# Entropy grows as the class mix gets richer.
examples, example_labels = t.createDataSet()
examples[0][-1] = 'test'
print(t.calcShannonEnt(examples))
print(t.calcShannonEnt([[1, 0, 'yes'], [1, 1, 'no']]))
Exemple #13
0
import trees

# Entropy of the toy data, before and after injecting a third class label.
rows, row_labels = trees.createDataSet()
print(trees.calcShannonEnt(rows))
rows[0][-1] = 'maybe'
print(trees.calcShannonEnt(rows))

# Disabled: splitting experiments.
# mat, labels = trees.createDataSet()
# print(mat)
# split1 = trees.splitDataSet(mat, 0, 1)
# print(split1)
# split2 = trees.splitDataSet(mat, 0, 0)
# print(split2)

# Disabled: best-feature selection.
# mat, labels = trees.createDataSet()
# feat = trees.chooseBestFeatureToSplit(mat)
# print(feat)
Exemple #14
0
# coding=utf-8

# Program notes
# Title: building and plotting a decision tree
# Contents: 1. read data into a data set, 2. build the tree, 3. plot it
# Date: 2018-05-30; notes added 2018-06-06
# Fixed: original used Python 2 print statements (syntax error in Python 3).

import trees
import treePlotter
#11111111111111111111111111111111111111111111111111111
# Build a tree from the hand-made data set, plot it, and trace each step.
# Print the data set and its Shannon entropy.
myDat, labels = trees.createDataSet()
print("myDat 数据集是:", myDat)
print("\nlabels 标签是:", labels)
rawCalc = trees.calcShannonEnt(myDat)
print("\ncalcShannonEnt(myDat) 数据集的原始熵是:", rawCalc)
print("\ntrees.splitDataSet( myDat,1,1)将数据集的按 特征[1]=1(即 flippers==1) 提取出来的矩阵是:", trees.splitDataSet(myDat, 1, 1))
#
bestLabel = trees.chooseBestFeatureToSplit(myDat)
print("\nchooseBestFeatureToSplit(myDat) 数据集的bestLabel最好特征的[下标]是:", bestLabel, "\tlabels[bestLabel]最好特征是:", labels[bestLabel])
#
myTree = trees.createTree(myDat, labels)
print("\ntrees.createTree(myDat,labels) 根据数据集创建的树是:", myTree)
# Load stored tree [0] and plot it.
print("\n读取预先存储的树[0] 并绘制出第一个图形:")
myTree0 = treePlotter.retrieveTree(0)
treePlotter.createPlot(myTree0)
# Load stored tree [1] and plot it.
print("\n读取预先存储的树[1] 并绘制出第二个图形:")
myTree1 = treePlotter.retrieveTree(1)
Exemple #15
0
	def testCalcShannonEnt(self):
		"""Entropy of the sample data set rounds to 0.971."""
		samples, _labels = TreesTestCase.createDataSet()
		rounded = float('%0.3f' % trees.calcShannonEnt(samples))
		self.assertEqual(rounded, 0.971)
Exemple #16
0
def testCalcShannonEnt(dataSet):
    """Print and return the Shannon entropy of *dataSet*."""
    entropy = trees.calcShannonEnt(dataSet)
    print(entropy)
    return entropy
Exemple #17
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


# Source: http://www.cnblogs.com/hantan2008/archive/2015/07/27/4674097.html
# Implements decision-tree classification (ID3 algorithm).
# Fixed: original used Python 2 print statements (syntax error in Python 3).



import trees
import treePlotter

if __name__ == '__main__':
    pass

# Load the data set from a text file, then trace entropy, best split,
# tree construction, and plotting.
myDat, labels = trees.createDataSetFromTXT("dataset.txt")

shan = trees.calcShannonEnt(myDat)
print(shan)

col = trees.chooseBestFeatureToSplit(myDat)
print(col)

Tree = trees.createTree(myDat, labels)
print(Tree)

treePlotter.createPlot(Tree)
Exemple #18
0
    # NOTE(review): continuation of createTree — the `def` line is above this
    # view; classList/dataSet/labels are presumably its locals/parameters.
    # All samples share one class: stop splitting, return that class.
    if classList.count(classList[0])==len(classList):
        return classList[0]#stop the splitting when class are same
    # No features left to split on: fall back to a majority vote.
    if len(dataSet[0])==1:
        return majorityCnt(classList)
    bestFeat=chooseBestFeatureToSplit(dataSet)
    bestFeatLabel=labels[bestFeat]
    # Tree node: {feature label: {feature value: subtree-or-class}}.
    myTree={bestFeatLabel:{}}
    # Remove the consumed label so recursive calls index features correctly.
    del (labels[bestFeat])
    featValues=[example[bestFeat] for example in dataSet]
    uniqueVals=set(featValues)
    for value in uniqueVals:
        # Copy labels so sibling branches are not affected by deletions
        # made inside deeper recursive calls.
        subLabels=labels[:]
        myTree[bestFeatLabel][value]=createTree(splitDataSet\
                                                    (dataSet,bestFeat,value),subLabels)
    return myTree


if __name__ == "__main__":
    import trees

    # Trace the trees helpers, then build a tree with the local createTree.
    sample_rows, sample_labels = trees.createDataSet()
    print(sample_rows, '\n', sample_labels)
    print("The shannonEnt is :", trees.calcShannonEnt(sample_rows))

    print("\nsplitDataSet(myDat,1,1):\n", trees.splitDataSet(sample_rows, 1, 1))
    #print("\nsplitDataSet(myDat,0,0):\n",trees.splitDataSet(myDat,0,0))

    print("\nchooseBestFeatureToSplit: ", trees.chooseBestFeatureToSplit(sample_rows))

    built_tree = createTree(sample_rows, sample_labels)
    print("\nmyTree:\n", built_tree)
Exemple #19
0
# This is a sample Python script.

# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

import trees

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    data_rows, data_labels = trees.createDataSet()
    print(trees.calcShannonEnt(data_rows))
    # Higher entropy means a richer mix of class labels.
    # Introduce a third label, 'maybe', and watch the entropy rise.
    data_rows[0][-1] = 'maybe'
    print(trees.calcShannonEnt(data_rows))
# autor: zhumenger
import trees

# Entropy demo: the higher the entropy, the more mixed the class labels.
rows, col_labels = trees.createDataSet()
print(rows)
print(col_labels)
print(trees.calcShannonEnt(rows))

rows[0][-1] = 'maybe'
print(trees.calcShannonEnt(rows))

# Exercise splitDataSet().
print(trees.splitDataSet(rows, 0, 1))
print(trees.splitDataSet(rows, 0, 0))

trees.chooseBestFeatureToSplit(rows)

# Find the best split (feature 0 wins on this data).
print(trees.chooseBestFeatureToSplit(rows))

#3-4:
print(trees.createTree(rows, col_labels))
Exemple #21
0
# -*- coding: utf-8 -*-
#  os.chdir('D:\www\IdeaProject\MLiA_SourceCode\machinelearninginaction')
# print os.getcwd()
from numpy.ma import zeros, array


if __name__ == '__main__':
    # Fixed: original used Python 2 print statements (syntax error in
    # Python 3) and never closed the lenses.txt file handle.
    print('hello')
    import trees
    import treePlotter

    # Entropy before/after adding a third class label.
    data, lables = trees.createDataSet()
    print(data)
    print(lables)
    shannonEnt = trees.calcShannonEnt(data)
    print(shannonEnt)
    data[0][-1] = 'maybe'
    print(trees.calcShannonEnt(data))
    print(data)
    print(trees.chooseBestFeatureToSplit(data))
    mytree = trees.createTree(data, lables)
    print(mytree)

    # Build and plot the contact-lens tree; `with` closes the file
    # deterministically.
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr]
    lensesLables = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = trees.createTree(lenses, lensesLables)
    print(lensesTree)
    treePlotter.createPlot(lensesTree)
Exemple #22
0
import trees

# Print the entropy of the sample data set.
dataset_rows, dataset_labels = trees.createDataSet()
print(trees.calcShannonEnt(dataset_rows))
Exemple #23
0
import trees
import treePlotter

# Build a decision tree for the contact-lens data and inspect its root.
# Fixed: Python 2 print statements, and `Tree.keys()[0]` — dict.keys()
# is a non-indexable view in Python 3, so the views are materialized.
fr = open('lenses.txt')
dataset = []
#labels '年龄', '处方', '散光', '眼镜材质'
labels = ['age', 'prescript', 'astigmatic', 'tearRate']
for line in fr.readlines():
    d = line.strip().split('\t')
    dataset.append(d)
fr.close()

print(dataset)
print('\n')

print('数据集类的香农熵:')
print(trees.calcShannonEnt(dataset))
print('\n')

bestFeatureColumn = trees.chooseBestFeatureToSplit(dataset)
print('数据集最佳分类的属性是:')
print(labels[bestFeatureColumn])
print('\n')

print('决策树:')
Tree = trees.createTree(dataset, labels)
print(Tree)
firstFeature = list(Tree.keys())[0]
print(firstFeature)
firstFeatureValues = list(Tree[firstFeature].keys())
print(firstFeatureValues)
print('\n')
# -*- coding: utf-8 -*-
import trees

# Entropy and data-set splitting walkthrough.
# Fixed: original used Python 2 print statements (syntax error in Python 3).
dataSet, labels = trees.createDataSet()

print(dataSet)
print(labels)


# Compute the entropy.
print(trees.calcShannonEnt(dataSet))  # 0.970950594455

# A third class label means more mixing, hence higher entropy.
dataSet[0][-1] = 'maybe'

print(dataSet)
print(trees.calcShannonEnt(dataSet))  # 1.37095059445

# Restore the original label.
dataSet[0][-1] = 'yes'
print(dataSet)
# [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]

# Split the data set.

# Rows where column 0 equals 0 (the matched column is removed).
print(trees.splitDataSet(dataSet, 0, 0))
# [[1, 'no'], [1, 'no']]


# Rows where column 0 equals 1.
print(trees.splitDataSet(dataSet, 0, 1))
Exemple #25
0
# -*- coding: utf-8 -*-
import trees

# Entropy of the second sample data set, formatted to two decimals.
matrix, matrix_labels = trees.createDataSet2()
print(matrix)
entropy_value = trees.calcShannonEnt(matrix)
print("result is: %.2f" % entropy_value)
Exemple #26
0
import trees as DT

# Full tour of the decision-tree helpers on the toy data set.
toy_data, toy_labels = DT.createDataSet()
toy_entropy = DT.calcShannonEnt(toy_data)  # 0.9709505944546686

# Let’s make the data a little messier and see how the entropy changes
# myDat[0][-1]='maybe'
# entropy = DT.calcShannonEnt(myDat)
# """
# output:
# 1.3709505944546687
# """

# [[1, 'yes'], [1, 'yes'], [0, 'no']]
subset = DT.splitDataSet(toy_data, 0, 1)

subset = DT.splitDataSet(toy_data, 0, 0)  # [[1, 'no'], [1, 'no']]

best_column = DT.chooseBestFeatureToSplit(toy_data)  # 0

# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
toy_tree = DT.createTree(toy_data, toy_labels)

import treePlotter as TP

# TP.createPlot()
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
toy_tree = TP.retrieveTree(0)
leaf_count = TP.getNumLeafs(toy_tree)  # 3
tree_depth = TP.getTreeDepth(toy_tree)  # 2
Exemple #27
0
from imp import reload
import trees

# Build the sample tree, then draw the demo plot.
demo_rows, demo_labels = trees.createDataSet()
trees.calcShannonEnt(demo_rows)
demo_tree = trees.createTree(demo_rows, demo_labels)

import treePlotter
treePlotter.createPlot()
Exemple #28
0
#coding=utf-8
__author__ = 'baconLIN'

import trees

# Entropy of the toy data set and of several split subsets, then the
# best feature and the full tree.
# Fixed: original mixed Python 2 `print x` statements with Python 3
# `print(x)` calls, making the snippet invalid under Python 3.
myDat, labels = trees.createDataSet()
print(myDat)
shannonEnt = trees.calcShannonEnt(myDat)
print(shannonEnt)

# Entropy of each (feature index, value) split subset.
for feat, val in ((0, 1), (0, 0), (1, 1)):
    subset = trees.splitDataSet(myDat, feat, val)
    print(subset)
    print(trees.calcShannonEnt(subset))

bestFeature = trees.chooseBestFeatureToSplit(myDat)
print(bestFeature)

myTree = trees.createTree(myDat, labels)
print(myTree)

import treePlotter
myTree2 = treePlotter.retrieveTree(0)