def test2():
    """Fit a regression tree on 'exp.txt', prune it, and plot both results.

    Steps, in order: load 'exp.txt' and print its shape, build a tree with
    ``ops=(10, 4)``, prune that tree against the 'ex2test.txt' data, print
    the value returned by ``regTrees.prune``, then call
    ``regTrees.plot1withTree`` once with the tree object that was handed to
    ``prune`` and once with the value ``prune`` returned.

    Historical scratch note kept from the original author: on Python 3
    ``map(float, fields)`` yields a lazy ``map`` object rather than a list,
    so it must be materialised (e.g. with ``list(...)``) before printing.
    """
    # Original author's note: 200x2 floats; 'exp' resembles 'ex2' but the
    # variance of y is smaller.
    raw_rows = regTrees.loadDataSet('exp.txt')
    print(shape(raw_rows))

    # Original author's note: column/row slicing such as m[:, -1] or m[5, :]
    # only works after the list has been converted with mat().
    data_mat = mat(raw_rows)

    # Alternatives the original author tried (translated notes):
    #   ops=(1000, 10)
    #   ops=(0, 1)    -> every point gets its own split, typical overfitting
    #   ops=(0.2, 4)
    #   ops=(10000, 4) for ex2, whose y values are ~100x those of ex00
    full_tree = regTrees.createTree(data_mat, ops=(10, 4))

    # Original author's note: 'ex2test.txt' spans nearly the same range as
    # 'ex2' and serves as the real test set.
    holdout_rows = regTrees.loadDataSet('ex2test.txt')
    holdout_mat = mat(holdout_rows)

    pruned = regTrees.prune(full_tree, holdout_mat)
    print(pruned)

    # NOTE(review): full_tree is plotted after prune() has already run on it;
    # if prune() mutates its argument both plots may look the same - confirm.
    regTrees.plot1withTree(data_mat, full_tree)
    regTrees.plot1withTree(data_mat, pruned)
def reDraw(tolS, tolN):
    """Refit the tree for the given tolerances and redraw the figure.

    The figure stored on ``reDraw.f`` is cleared and given a fresh subplot.
    When ``chkBtnVar.get()`` is truthy a tree is built with
    ``regTrees.modelLeaf`` / ``regTrees.modelErr`` and forecast with
    ``regTrees.modelTreeEval``; otherwise ``regTrees.createTree`` and
    ``regTrees.createForeCast`` are called with their defaults. In both
    cases the tree is passed through ``regTrees.prune`` with
    ``reDraw.testDat`` when the module-level ``is_prune`` flag is truthy.
    Finally ``reDraw.testDat`` is scattered, the forecast over
    ``reDraw.testX`` is drawn as a line, and the canvas is refreshed.

    Args:
        tolS: First element of the ``ops`` pair handed to ``createTree``.
        tolN: Second element of the ``ops`` pair; raised to at least 2 in
            the ``chkBtnVar`` branch.
    """
    reDraw.f.clf()  # clear the figure
    reDraw.a = reDraw.f.add_subplot(111)

    if chkBtnVar.get():
        # Clamp tolN to a minimum of 2 before building with the model leaf.
        tolN = max(tolN, 2)
        myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf,
                                     regTrees.modelErr, (tolS, tolN))
        if is_prune:
            myTree = regTrees.prune(myTree, reDraw.testDat)
        yHat = regTrees.createForeCast(myTree, reDraw.testX,
                                       regTrees.modelTreeEval)
    else:
        myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN))
        if is_prune:
            myTree = regTrees.prune(myTree, reDraw.testDat)
        yHat = regTrees.createForeCast(myTree, reDraw.testX)

    # Scatter for the data set, line plot for the forecast.
    reDraw.a.scatter(reDraw.testDat[:, 0], reDraw.testDat[:, 1], s=5)
    reDraw.a.plot(reDraw.testX, yHat, linewidth=4.0)

    # draw() replaces the former show() call: assuming reDraw.canvas is a
    # Matplotlib figure canvas (e.g. FigureCanvasTkAgg), show() was
    # deprecated in Matplotlib 2.2 and removed in later releases, while
    # draw() is available on both old and new versions.
    reDraw.canvas.draw()
#print(myMat) print(regTrees.createTree(myMat)) print("ex0.txt") myDat1=regTrees.loadDataSet('ex0.txt') myMat1=mat(myDat1) print(shape(myMat1))#200 3 print(regTrees.createTree(myMat1)) #建树完成 myDat2=regTrees.loadDataSet('ex2.txt') myMat2=mat(myDat2) print(regTrees.createTree(myMat2)) myTree=regTrees.createTree(myMat2,ops=(0,1)) myDatTest=regTrees.loadDataSet('ex2test.txt') myMat2Test=mat(myDatTest) regTrees.prune(myTree,myMat2Test) print(myTree) print("分段函数表示:") myMat2=mat(regTrees.loadDataSet('exp2.txt')) print(regTrees.createTree(myMat2,regTrees.modelLeaf,regTrees.modelErr,(1,10))) trainMat=mat(regTrees.loadDataSet('bikeSpeedVsIq_train.txt')) testMat=mat(regTrees.loadDataSet('bikeSpeedVsIq_test.txt')) myTree=regTrees.createTree(trainMat,ops=(1,20)) yHat=regTrees.createForeCast(myTree,testMat[:,0]) print(corrcoef(yHat,testMat[:,1],rowvar=0)[0,1]) ws,X,Y=regTrees.linearSolve(trainMat) print(ws)
# Driver fragment (Python 2 print statements). `myMat`, `mat`, `corrcoef`
# and `regTrees` are expected to be defined/imported before this point (not
# visible in this fragment).
print regTrees.createTree(myMat)

# Tree for 'ex0.txt' with the default createTree arguments.
myDat1 = regTrees.loadDataSet('ex0.txt')
myMat1 = mat(myDat1)
print regTrees.createTree(myMat1)
#print regTrees.createTree(myMat, ops=(0,1))

# Tree for 'ex2.txt' with ops=(10000,4).
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = mat(myDat2)
#print regTrees.createTree(myMat2)
print regTrees.createTree(myMat2,ops=(10000,4))

# Build with ops=(0,1), then print what prune returns for the
# 'ex2test.txt' matrix.
myTree = regTrees.createTree(myMat2, ops=(0,1))
myDatTest = regTrees.loadDataSet('ex2test.txt')
myMat2Test = mat(myDatTest)
print regTrees.prune(myTree, myMat2Test)

# Tree for 'exp2.txt' built with modelLeaf/modelErr and ops (1,10).
# Note: myMat2 is rebound here.
myMat2 = mat(regTrees.loadDataSet('exp2.txt'))
print regTrees.createTree(myMat2, regTrees.modelLeaf, regTrees.modelErr, (1,10))

# bikeSpeedVsIq comparison: forecast column 0 of the test matrix with a
# default tree and print entry [0,1] of the correlation matrix against
# column 1.
trainMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_train.txt'))
testMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_test.txt'))
myTree = regTrees.createTree(trainMat, ops=(1,20))
yHat = regTrees.createForeCast(myTree, testMat[:,0])
print corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]

# Same comparison with modelLeaf/modelErr for building and modelTreeEval
# for forecasting.
myTree = regTrees.createTree(trainMat, regTrees.modelLeaf, regTrees.modelErr, ops=(1,20))
yHat = regTrees.createForeCast(myTree, testMat[:,0], regTrees.modelTreeEval)
print corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]

# linearSolve on the training matrix; the three results are stored but not
# used within this fragment.
ws, X, Y = regTrees.linearSolve(trainMat)
# Script: build a tree from 'ex2.txt' with ops=(0, 1), then print the value
# returned by regTrees.prune for the 'ex2test.txt' data.
import regTrees
from numpy import *

# Earlier experiments, left disabled by the original author:
# mydat = regTrees.loadDataSet('ex00.txt')
# mydat = mat(mydat)
# print(regTrees.createTree(mydat))
#
# testmat = mat(eye(4))
# mat0,mat1 = regTrees.binSplitDataSet(testmat,1,0.5)
# print(mat0,mat1)

# Load the training rows and rebind the same name to their matrix form.
mydat = regTrees.loadDataSet('ex2.txt')
mydat = mat(mydat)
mytree = regTrees.createTree(mydat, ops=(0, 1))

# Load the test rows, convert to a matrix, and print the result of prune.
mytest = regTrees.loadDataSet('ex2test.txt')
mytest = mat(mytest)
print(regTrees.prune(mytree, mytest))
# Script: load 'dataSet.csv', build and plot a tree, classify one sample,
# then prune and classify the same sample again.
import regTrees as rt
import plotRegTrees as pt

if __name__ == '__main__':
    # Build a tree with rt.gini as the evaluation function and print it.
    dataSet = rt.loadCSV("dataSet.csv")
    myTree = rt.createTree(dataSet, evaluationFunc=rt.gini)
    print(u"myTree:%s" % myTree)

    # Draw the decision tree.
    print(u"绘制决策树:")
    pt.createPlot1(myTree)

    # Build a second tree via buildDecisionTree and classify a 4-value
    # sample with it.
    decisionTree = rt.buildDecisionTree(dataSet, evaluationFunc=rt.gini)
    testData = [5.9, 3, 4.2, 1.75]
    r = rt.classify(testData, decisionTree)
    print(u"分类后测试结果:")
    print(r)
    print()

    # Call prune with 0.4 (its return value is not used), then classify the
    # same sample again with the same decisionTree object.
    rt.prune(decisionTree, 0.4)
    r1 = rt.classify(testData, decisionTree)
    print(u"剪枝后测试结果:")
    print(r1)
# Script: load the sample data sets into NumPy matrices, build a tree from
# 'ex2.txt' with ops=(0, 1), and print the value returned by regTrees.prune
# for the 'ex2test.txt' data.
import regTrees
import numpy as np

# 4x4 identity matrix; only used by the disabled binSplitDataSet experiment.
testMat = np.mat(np.eye((4)))
# print(testMat)
# mat0 ,mat1 = regTrees.binSplitDataSet(testMat,1,0.5)
# print(mat0)
# print(mat1)

# 'ex00.txt' and 'ex0.txt' are loaded, but the createTree calls that would
# use them are disabled.
myDat = regTrees.loadDataSet('ex00.txt')
myMat = np.mat(myDat)
# print(regTrees.createTree(myMat))
myDat1 = regTrees.loadDataSet('ex0.txt')
myMat1 = np.mat(myDat1)
# print(regTrees.createTree(myMat1))

# Build from 'ex2.txt' with ops=(0, 1), then print what prune returns for
# the 'ex2test.txt' matrix.
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = np.mat(myDat2)
myTree = regTrees.createTree(myMat2, ops=(0, 1))
myDatTest = regTrees.loadDataSet('ex2test.txt')
myMat2Test = np.mat(myDatTest)
print(regTrees.prune(myTree, myMat2Test))
# Driver fragment: prints labelled results of several regTrees calls.
# `tMat`, `mat` and `regTrees` are expected to be defined/imported before
# this point (not visible in this fragment).

# Results of regLeaf and regErr on tMat, followed by tMat itself.
print('-------- loadDataSet regLeaf : ')
print(regTrees.regLeaf(tMat))
print('-------- loadDataSet regErr : ')
print(regTrees.regErr(tMat))
print('-------- tMat : ')
print(tMat)

# Tree built from tMat with the default createTree arguments.
myTree = regTrees.createTree(tMat)
print('-------- regTree createTree : ')
print(myTree)

# Tree built from 'ex2.txt' with ops=(0, 1).
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = mat(myDat2)
myTree2 = regTrees.createTree(myMat2, ops=(0, 1))
print('-------- regTree createTree2 : ')
print(myTree2)

# prune is called with the 'ex2test.txt' matrix; its return value is
# neither stored nor printed.
myDat3 = regTrees.loadDataSet('ex2test.txt')
myMat2Test = mat(myDat3)
regTrees.prune(myTree2, myMat2Test)

# Tree for 'exp2.txt' built with modelLeaf/modelErr and ops (1, 10).
myMat4 = mat(regTrees.loadDataSet('exp2.txt'))
print(
    regTrees.createTree(myMat4, regTrees.modelLeaf, regTrees.modelErr, (1, 10)))
# -*- coding: utf-8 -*- import regTrees from numpy import * testmat = mat(eye(4)) mat0, mat1 = regTrees.bin_split_data_set(testmat, 1, 0.5) my_dat1 = regTrees.load_data_set('ex00.txt') tree = regTrees.create_tree(mat(my_dat1)) my_dat1 = regTrees.load_data_set('ex2.txt') tree = regTrees.create_tree(mat(my_dat1), ops=(0, 1)) print(tree) my_test1 = regTrees.load_data_set('ex2test.txt') t = regTrees.prune(tree, mat(my_test1)) print(t) my_mat = mat(regTrees.load_data_set('exp2.txt')) regTrees.create_tree(my_mat, regTrees.model_leaf, regTrees.model_err, (1,10)) train = mat(regTrees.load_data_set('bikeSpeedVsIq_train.txt')) test = mat(regTrees.load_data_set('bikeSpeedVsIq_test.txt')) tree = regTrees.create_tree(train, ops=(1, 20)) # regression tree # tree = regTrees.create_tree(train, regTrees.model_leaf, regTrees.model_err, (1,20)) # model tree yhat = regTrees.create_forecast(tree, test[:, 0], regTrees.model_tree_eval) corrcoef(yhat, test[:, 1], rowvar=0)[0, 1] ws, x, y = regTrees.linear_solve(train) for i in range(shape(test)[0]): yhat[i] = test[i, 0] * ws[1, 0] + ws[0, 0] corrcoef(yhat, test[:, 1], rowvar=0)[0, 1]
# print "myMat1:",myMat1 print "regTrees.createTree(myMat1):", regTrees.createTree(myMat1) #9.4.1 预剪枝 myDat2 = regTrees.loadDataSet(homedir + 'ex2.txt') myMat2 = mat(myDat2) print "regTrees.createTree(myMat2):", regTrees.createTree(myMat2) #9.4.2 后剪枝 myDat2 = regTrees.loadDataSet(homedir + 'ex2.txt') myMat2 = mat(myDat2) myTree = regTrees.createTree(myMat2, ops=(0, 1)) print "myTree:", myTree myDatTest = regTrees.loadDataSet(homedir + 'ex2test.txt') myMat2Test = mat(myDatTest) print "regTrees.prune(myTree,myMat2Test)", regTrees.prune(myTree, myMat2Test) #9.5 模型树 myMat2 = mat(regTrees.loadDataSet(homedir + 'exp2.txt')) print "regTrees.createTree(myMat2,regTrees.modelLeaf,regTrees.modelErr,(1,10)):", regTrees.createTree( myMat2, regTrees.modelLeaf, regTrees.modelErr, (1, 10)) #9.6 示例:树回归与标准回归的比较 trainMat = mat(regTrees.loadDataSet(homedir + 'bikeSpeedVsIq_train.txt')) testMat = mat(regTrees.loadDataSet(homedir + 'bikeSpeedVsIq_test.txt')) myTree = regTrees.createTree(trainMat, ops=(1, 20)) yHat = regTrees.createForeCast(myTree, testMat[:, 0]) corrcoef1 = corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1] myTree = regTrees.createTree(trainMat, regTrees.modelLeaf, regTrees.modelErr,
# Script: sanity-check binSplitDataSet on an identity matrix, then build a
# tree from 'ex2.txt' with ops=(0, 1) and print the value returned by
# regTrees.prune for the 'ex2test.txt' data.
__author__ = 'bacon'
import regTrees
from numpy import *

# Single-argument prints are parenthesised so the script produces the same
# output on Python 2 and also runs on Python 3.
testMat = mat(eye(4))
print(testMat)
mat0, mat1 = regTrees.binSplitDataSet(testMat, 1, 0.5)
print(mat0)
print(mat1)

# Earlier experiments, left disabled by the original author:
# myDat = regTrees.loadDataSet('ex00.txt')
# myMat = mat(myDat)
# print regTrees.createTree(myMat)
#
# myDat2=regTrees.loadDataSet('ex0.txt')
# myMat2=mat(myDat2)
# print regTrees.createTree(myMat2)
#
# print regTrees.createTree(myMat,ops=(0,1))
#
myDat3 = regTrees.loadDataSet('ex2.txt')
myMat3 = mat(myDat3)
# print regTrees.createTree(myMat3)
myTree = regTrees.createTree(myMat3, ops=(0, 1))

myDatTest = regTrees.loadDataSet('ex2test.txt')
myMatTest = mat(myDatTest)
# Fix: hand prune the matrix built on the previous line. The original call
# passed the raw list `myDatTest`, leaving `myMatTest` unused, whereas the
# training data is always converted with mat() before being given to
# regTrees.
print(regTrees.prune(myTree, myMatTest))
################### 预剪枝方法的 树回归 P164 myDat1 = regTrees.loadDataSet('ex00.txt') myMat1 = numpy.mat(myDat1) trees1 = regTrees.createTree(myMat1,ops=(1,4)) print "预剪枝方法生成树回归是:", trees1 trees1 = regTrees.createTree(myMat1,ops=(1,4)) ################## 后剪枝方法的 树回归 P169 myDat2 = regTrees.loadDataSet('ex2.txt') myMat2 = numpy.mat(myDat2) trees2 = regTrees.createTree(myMat2,ops=(10,10)) print "\n后剪枝之前的树trees2是:\n", trees2 myDat3 = regTrees.loadDataSet('ex2test.txt') myMat3 = numpy.mat(myDat3) trees3 = regTrees.prune(trees2, myMat3) print "\n后剪枝之后的树trees3 是:\n", trees3 ################## 叶子节点是模型树(线性模型)P172 myMat4 = numpy.mat(regTrees.loadDataSet('exp2.txt')) #Page170 是书上的模型树即叶子节点是线性模型, modelLeaf函数返回的是线性的权重ws,modelErr函数返回的误差的值 trees4 = regTrees.createTree(myMat4, leafType = regTrees.modelLeaf, errType = regTrees.modelErr, ops=(1,4)) print "\n叶子节点是模型树的树回归:\n", trees4 ################## 树回归与标准回归的比较 P174 ###回归树的预测情况和相关系数的计算 trainMat = numpy.mat(regTrees.loadDataSet('bikeSpeedVsIq_train.txt')) #加载训练数据 testMat = numpy.mat(regTrees.loadDataSet('bikeSpeedVsIq_test.txt')) #加载测试数据 myTree = regTrees.createTree(trainMat, ops = (1, 20)) #获得训练数据的树:树回归(叶子节点是常数项)
# Driver fragment (Python 2 print statements). `myMat`, `mat` and `regTrees`
# are expected to be defined/imported before this point (not visible in this
# fragment).

# Tree for myMat with the default createTree arguments.
tree_dict=regTrees.createTree(myMat)
print tree_dict
print ' '

# Tree for 'ex0.txt' with the default createTree arguments.
myDat1=regTrees.loadDataSet('ex0.txt')
myMat1=mat(myDat1)
tree_dict=regTrees.createTree(myMat1)
print tree_dict
print ' '

# 'ex2.txt': the createTree call that would refresh tree_dict is disabled,
# so the print below shows the tree built from 'ex0.txt' a second time.
myDat2=regTrees.loadDataSet('ex2.txt')
myMat2=mat(myDat2)
#tree_dict=regTrees.createTree(myMat2)
print tree_dict

# Build with ops=(0,1) and call prune with the 'ex2test.txt' matrix; the
# return value of prune is neither stored nor printed.
myTree=regTrees.createTree(myMat2,ops=(0,1))
myDataTest=regTrees.loadDataSet('ex2test.txt')
myMatTest=mat(myDataTest)
regTrees.prune(myTree,myMatTest)

## model tree
# Tree for 'exp2.txt' built with modelLeaf/modelErr and ops (1,10).
myMat3=mat(regTrees.loadDataSet('exp2.txt'))
modelTree=regTrees.createTree(myMat3,regTrees.modelLeaf,regTrees.modelErr,(1,10))
print modelTree
regTrees.createTree(myMat1) # 如果不画图基本上不是人看的.... # 看看其他的参数对模型的影响,隐含的就是通过参数设置来裁剪树,俗称前剪枝 regTrees.createTree(myMat, ops=(0, 1)) # ops的第二个参数是最小切分的样本数,所以基本上每个样本一个叶节点了。。。 myDat2 = regTrees.loadDataSet('ex2.txt') myMat2 = np.mat(myDat2) regTrees.createTree(myMat2) # 默认是(1,4) regTrees.createTree(myMat2, ops=(10000, 4)) # 后剪枝 reload(regTrees) myTree = regTrees.createTree(myMat2, ops=(0, 1)) myDatTest = regTrees.loadDataSet('ex2test.txt') myMat2Test = np.mat(myDatTest) regTrees.prune(myTree, myMat2Test) # 你真的剪了么。。。。 # 模型树部分了 reload(regTrees) myMat2 = np.mat(regTrees.loadDataSet('exp2.txt')) regTrees.createTree(myMat2, regTrees.modelLeaf, regTrees.modelErr, (1, 10)) # 区别就是调用方法时选择不同的生成叶节点的方法和误差计算 # 模型比较 reload(regTrees) trainMat = np.mat(regTrees.loadDataSet('bikeSpeedVsIq_train.txt')) testMat = np.mat(regTrees.loadDataSet('bikeSpeedVsIq_test.txt')) myTree = regTrees.createTree(trainMat, ops=(1, 20)) yHat = regTrees.createForeCast(myTree, testMat[:, 0]) # 创建一个回归树 np.corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]