def plantTrees(zMap, density=1, order=1):
    """Populate the terrain described by ``zMap`` with randomly placed trees.

    :param zMap: 2-D height map of the terrain.
    :param density: larger values plant trees more densely.
    :param order: kept for interface compatibility; the effective L-system
        order is derived from the global ORDER below.
    :return: a SceneGraphNode named "forest" with one child per tree.
    """
    # Graph that will contain all the trees
    forestGraph = sg.SceneGraphNode("forest")
    xSize = zMap.shape[0]
    ySize = zMap.shape[1]
    # World-space cell size; the terrain spans [-MAP_X_SIZE, MAP_X_SIZE] and
    # [-MAP_Y_SIZE, MAP_Y_SIZE].
    dx = 2 * MAP_X_SIZE / xSize
    dy = 2 * MAP_Y_SIZE / ySize
    x = -MAP_X_SIZE + dx / 2

    # Choosing where to plant trees: a cell receives a tree when its draw is 0.
    treeCoordinates = np.random.randint(0, 40 / density, zMap.shape)

    # Plants a tree in a (x, y, z) position
    for i in range(zMap.shape[0]):
        y = -MAP_Y_SIZE + dy / 2
        for j in range(zMap.shape[1]):
            if treeCoordinates[i, j] == 0:
                # Burying the tree a little to ensure it isn't floating;
                # steeper terrain (larger normal x/y components) sinks it more.
                normal = terrainNormal(xSize, ySize, zMap, i, j)
                correction = (abs(normal[0]) + abs(normal[1])) / 5

                # Randomizing the order, size and skip parameters
                np.random.seed(RANDOM + i * j)
                variance = np.random.uniform()
                realOrder = max(1, ORDER - int(variance > 0.4))
                realSize = tree.SIZE + 0.4 * int(variance > 0.6) + 0.2 * int(
                    variance > 0.8)
                realSize += (realOrder - 1) * 0.5
                realSkip = np.random.randint(0, realOrder**2 + 1)

                # Randomizing the rule of creation
                realRule = treeRule(realOrder)

                # Trees can't be planted close to each other: mark the whole
                # neighbourhood as occupied.  The index ranges are clamped to
                # the map bounds instead of relying on a bare try/except to
                # swallow IndexError element by element.
                span = int(realSize + realOrder)
                kMax = min(i + span, xSize)
                lMax = min(j + span, ySize)
                treeCoordinates[i:kMax, j:lMax] = 1

                treeGraph = tree.createTree(realRule, realOrder, realSize,
                                            realSkip)
                treeGraph.transform = tr.translate(x, y, zMap[i, j] - correction)
                forestGraph.childs += [treeGraph]
            y += dy
        x += dx
    return forestGraph
def suffix(x): if int(x.conStart2) > int(x.conStart1): cStart1 = int(x.conStart1) cEnd1 = int(x.conEnd1) cType1 = x.conType1 cStart2 = int(x.conStart2) cEnd2 = int(x.conEnd2) cType2 = x.conType2 else: cStart1 = int(x.conStart2) cEnd1 = int(x.conEnd2) cType1 = x.conType2 cStart2 = int(x.conStart1) cEnd2 = int(x.conEnd1) cType2 = x.conType1 if V: print "Suffixing: %s, line: %s" % (x.fileName, x.lineNum) return tree.createString(tree.suffix( tree.createTree(x.parse), cStart1, cEnd1, cType1, cStart2, cEnd2, cType2))
def main():
    """Authenticate against the Drive v3 API, then run whichever steps are
    enabled in config: metadata download, tree creation, file download."""
    print('script started')

    # Google API credentials and service handle.
    credentials = auth.get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    # Optionally refresh the stored file listing.
    if config.DOWNLOAD_METADATA:
        downloadMetadata.downloadFileList(service)

    # Optionally build the node tree from the metadata.
    nodeList = None
    if config.CREATE_TREE:
        nodeList = tree.createTree()

    # Optionally mirror the remote files locally.
    if config.CREATE_STRUCTURE:
        createFileStructure.createStructure(nodeList, service)

    print('')
    print('script finished')
def mineTree(FPtree, headerTable, minSup, preFix, freqItemDict): #minSup:支持度,freqItemDict:频繁项集存放的地方,preFix:该项的前缀,FPtree:构建的FP树,headerTable:FP树对应的头表, bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1]) ] #从频次出现低项开始挖掘 for basePat in bigL: newFreqSet = preFix.copy() newFreqSet.add(basePat) #preFix是basePat的前缀路径,preFix+basePat是一个频繁项集,出现次数等于basePat出现的次数.当basePat为空时,preFix就是一个单路径FP树,它的路径上所有子集 # 在构造单路径FP树过程中已经生成了 print newFreqSet #记录每个频繁项的支持度计数 if frozenset(newFreqSet) in freqItemDict: freqItemDict[frozenset(newFreqSet)] += headerTable[basePat][0] else: freqItemDict[frozenset(newFreqSet)] = headerTable[basePat][0] # print newFreqSet,freqItemDict[frozenset(newFreqSet)] condPattBases = findPrefixPath(basePat, headerTable[basePat][1]) #求条件模式基 myCondTree, myHead = createTree(condPattBases, minSup) #实际上,它是以头表判断是否到是单路径FP树,头表为空则表示该basePat的前缀路径是条件FP树,否则,以条件模式基继续构造条件FP树,直到条件FP树为空 if myHead != None: mineTree( myCondTree, myHead, minSup, newFreqSet, freqItemDict) #FP树中的递归有一个特点:没有返回值,需要记录的数据都放在参数中了,上层可以直接拿到数据
def test_contactLenses(): fr = open('lenses.txt') lenses = [inst.strip().split('\t') for inst in fr.readlines()] #四个属性名称 lensesAtts = ['age', 'prescript', 'astigmatic', 'tearRate'] lensesTree = tree.createTree(lenses, lensesAtts) print lensesTree tree.createPlot(lensesTree)
def __init__(self, dataSet, labels):
    """Build the wrapped decision tree.

    Every value and label is coerced to str first so createTree always
    compares homogeneous types.

    :param dataSet: iterable of sample rows.
    :param labels: iterable of attribute labels.
    """
    # Comprehensions replace the original manual append loops.
    newDataSet = [[str(data) for data in dataList] for dataList in dataSet]
    newLabels = [str(label) for label in labels]
    self._tree = createTree(newDataSet, newLabels)
def testClass(): myDat, labels = tree.createDataSet() myTree = tree.createTree(myDat, labels) # persistenting the decision tree tree.storeTree(myTree, 'myTree.train') myTree2 = tree.grabTree('myTree.train') testVec = [1, 0] print "Test ",testVec," result: ", tree.classify(myTree2, labels, testVec) testVec = [1, 1] print "Test ",testVec," result: ", tree.classify(myTree2, labels, testVec)
def classifyTest():
    """Grow a tree from the sample data, classify two vectors and plot it."""
    import tree as t
    import treePlotter as tp
    dataSet, labels = t.createDataSet()
    # Hand createTree a copy so the original `labels` list stays available
    # for classify() below.
    myTree = t.createTree(dataSet, labels.copy())
    print(myTree)
    print(labels)
    for vec in ([1, 0], [1, 1]):
        print(classify(myTree, labels, vec))
    tp.createPlot(myTree)
def spt(x): if int(x.conStart2) > int(x.conStart1): cStart1 = int(x.conStart1) cEnd1 = int(x.conEnd1) cStart2 = int(x.conStart2) cEnd2 = int(x.conEnd2) else: cStart1 = int(x.conStart2) cEnd1 = int(x.conEnd2) cStart2 = int(x.conStart1) cEnd2 = int(x.conEnd1) if V: print "Finding spt for: %s, line: %s" % (x.fileName, x.lineNum) return tree.createString(tree.spt( tree.createTree(x.parse), cStart1, cEnd2))
def fit(self, data, label): """ 模型拟合过程,实现原理参考对应的链接 :param data: 特征矩阵 :param label: 标签 :return: """ # 设置初始的数据分布的采样权重,此时都相等 self.data_weight = np.ones((data.shape[0], 1)) / data.shape[0] # 记录数据集的索引 index = np.arange(0, data.shape[0], 1) # 进行迭代求解 for i in range(self.n_iterates): # 根据数据权重进行采样, 注意bagging是有放回,boosting是无放回 # https://zhuanlan.zhihu.com/p/47922595 sub_samping = np.random.choice( index, int(self.data_weight.shape[0] * self.alpha), replace=False, p=self.data_weight.reshape(-1, ).tolist()) train_x = data[sub_samping] train_y = label[sub_samping] dt = createTree(train_x, train_y, self.feature_list) # 进行弱学习模型训练 self.model_list.append(dt) # 存储该弱学习模型 pred = list( map(lambda _: predict(dt, _, self.feature_list), train_x)) # 计算模型在训练集上的误差率 (即预测错误的样本权重相加,相同为0,不同为1) pred_error = np.ones((len(pred), 1)) pred_error[pred == train_y] = 0 et = pred_error.T.dot(self.data_weight[sub_samping]) # 把模型的权重加入到列表中 at = 0.5 * np.log((1 - et) / et) self.model_weight.append(at) # 更新样本的权重 self.data_weight[sub_samping] = self.data_weight[ sub_samping] * np.exp(-at * train_y * pred).reshape(-1, 1) # 权重归一化 self.data_weight = self.data_weight / self.data_weight.sum()
def fit(self, data, label):
    """Train the forest: per estimator, bootstrap the rows, draw a random
    feature subset, and grow a tree on that projection."""
    # Number of features each tree sees ('auto' -> sqrt, otherwise log2).
    if self.max_features == 'auto':
        self.feature_size = np.rint(np.sqrt(data.shape[1])).astype(int) + 1
    else:
        self.feature_size = np.rint(np.log2(data.shape[1])).astype(int) + 1
    # All feature column indices of the feature matrix.
    feature_index = np.arange(0, data.shape[1], 1)
    for _ in range(self.n_estimators):
        sub_x, sub_y = self.sub_sampling(data, label)
        # Random feature subset, drawn without replacement.
        cols = np.random.choice(feature_index, self.feature_size,
                                replace=False)
        # Remember which columns this tree was trained on.
        self.tree_feature.append(cols)
        self.tree_list.append(
            createTree(sub_x[:, cols], sub_y, self.feature_list[cols]))
def cross_validation_tree(dataSet, rate, tmpLabels):
    """K-fold cross validation of the decision tree.

    :param dataSet: full data set (list of samples).
    :param rate: fraction used as the test slice per fold; the number of
        folds is int(1/rate).
    :param tmpLabels: attribute labels handed to createTree.
    """
    count = int(1 / rate)                # number of validation rounds
    test_len = int(len(dataSet) * rate)  # length of each test slice
    tree_list = []                       # trees built per fold
    accuracy_list = []                   # accuracy per fold
    for i in range(count):
        if i == 0:
            # First fold: test slice at the front.
            train_dataSet = dataSet[test_len:]
            test_dataSet = dataSet[:test_len]
        elif i == count - 1:
            # Last fold: test slice at the back.
            train_dataSet = dataSet[:-test_len]
            test_dataSet = dataSet[-test_len:]
        else:
            # Middle folds: test slice in the interior.
            train_dataSet = dataSet[:test_len * i]
            train_dataSet.extend(dataSet[test_len * (i + 1):])
            test_dataSet = dataSet[test_len * i:test_len * (i + 1)]
        latest_tree = tree.createTree(train_dataSet, tmpLabels)
        tree_list.append(latest_tree)
        # Prune on the training data, then batch-evaluate on the held-out slice.
        tree.Pep(latest_tree, train_dataSet, tmpLabels)
        accuracy_list.append(
            tree.DataSetGetResult(latest_tree, test_dataSet, tmpLabels))
    # Aggregate with the stdlib instead of a manual accumulation loop.
    max_accuracy = max(accuracy_list)
    average_accuracy = sum(accuracy_list) / len(accuracy_list)
    print('accuracy:' + str(accuracy_list))
    print('max_accuracy:' + str(max_accuracy * 100) + '%')
    print('average_accuracy:' + str(average_accuracy * 100) + '%')
    # Persist each fold's tree, numbered from 1.
    for i, each_tree in enumerate(tree_list, start=1):
        path = 'Model/Tree/cross/' + str(i) + '.tree'
        tree.saveTree(each_tree, save_path=path)
        print("Decision tree list saved successfully!" + path)
import tree
import treePlotter

# Load the space-separated data file; the context manager closes the handle
# (the original leaked it).
with open('./data.txt') as fp:
    lenses = [line.strip().split(' ') for line in fp.readlines()]
labels = ['age', 'prescript', 'astigmatic', 'tearRate', 'other']
myTree = tree.createTree(lenses, labels)
treePlotter.createTreePlot(myTree)
import tree # 计算数据集的香农熵 #myDat, labels = tree.createDataSet() #print(myDat) #print(tree.calcShannonEnt(myDat)) #myDat[0][-1] = 'maybe' #print(myDat) #print(tree.calcShannonEnt(myDat)) print("*****************************************************************") # 在前面的简单样本数据上测试函数splitDataSet() #print(tree.splitDataSet(myDat, 0, 1)) #print(tree.splitDataSet(myDat, 0, 0)) print("*****************************************************************") #myDat, labels = tree.createDataSet() #print(myDat) #print(tree.chooseBestFeatureToSplit(myDat)) print("*****************************************************************") # 变量myTree包含了很多代表树结构信息的嵌套字典。 # 从左边开始第一个关键字no surfacing是第一个划分数据集的特征名称,该关键字的值也是另一个数据字典。 # 第二个关键字是no surfacing特征划分的数据集,这些关键字的值是no surfacing节点的子节点 # 这些值可能是类标签,也可能是另一个数据字典。如果值是类标签,则该子节点是叶子节点; # 如果值是另一个数据字典,则子节点是一个判断节点,这种格式结构不断重复就构成了整棵树,本节的例子中,这棵树包含了3个叶子节点以及2个判断节点 myDat, labels = tree.createDataSet() myTree = tree.createTree(myDat, labels) print(myTree)
def test_plot():
    """Render the tree grown from the sample data set."""
    sampleData, sampleAtts = createDataSet()
    builtTree = tree.createTree(sampleData, sampleAtts)
    tree.createPlot(builtTree)
from tree import createDataSet, createTree from treePlotter import retrieveTree, createPlot """ 决策树非常好的匹配了实验数据,然而只写匹配选项可能太多了,我们将这种问题称为过度匹配(overfitting). 为了减少过度匹配问题,我们可以裁剪决策树,去掉一些不必要的叶子节点.如果叶子节点只能增加少许信息,则可以删除该节点, 并将他加入其他叶子节点中 本章采用的算法叫做ID3, 无法直接处理数值型数据,尽管我们可以通过量化的方法将数值型数据转化为标称型数据, 但是存在提案多的特征划分, 第九章学习另一个决策树构造算法CART, C4.5 """ def classify(inputTree, featLabels, testVec): """ 在实际数据集中改属性存储在哪个位置? 是第一个属性还是第二个属性? :param inputTree: :param featLabels: :param testVec: :return: """ firstStr = list(inputTree.keys())[0] secondDict = inputTree[firstStr] # 将标签字符串转换为索引, 使用index方法查找当前列表中第一个匹配firstStr变量的元素 featIndex = featLabels.index(firstStr) for key in secondDict.keys(): # 比较testVec变量中的值与树节点的值 if testVec[featIndex] == key: if type(secondDict[key]).__name__ == 'dict': classLabel = classify(secondDict[key], featLabels, testVec) else: # 如果到达叶子节点,返回当前节点的分类标签
# If a node's value is a dictionary it is a subtree (the first element will be
# another dict); otherwise it is a leaf label.
def createPlot(inTree):
    """Lay out and draw a decision tree on a fresh matplotlib figure."""
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)  # no ticks
    # createPlot.ax1 = plt.subplot(111, frameon=False)  # ticks for demo purposes
    # Layout bookkeeping is stored on the plotTree function object.
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


def retrieveTree(i):
    """Return one of two canned trees, handy for testing the plotting code."""
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no',
                                                                 1: 'yes'}},
                                                    1: 'no'}}}},
    ]
    return listOfTrees[i]


# createPlot(thisTree)
if __name__ == "__main__":
    # Close the data file deterministically (the original leaked the handle).
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = tree.createTree(lenses, lensesLabels)
    createPlot(lensesTree)
#!/usr/bin/python #encoding:utf-8 import tree from ScrolledText import example import plottree dataSet , labels = tree.createDataSet(); print tree.createTree(dataSet, labels) plottree.createPlot()
# -*- coding: utf-8 -*-
import tree
import copy

dataset, label = tree.createDataSet()
print(label)
# labels = label would merely alias the same list object, so createTree's
# changes would show through; hand it a deep copy instead.
labels = copy.deepcopy(label)
myTree = tree.createTree(dataset, labels)
# print(myTree)
print(label)
testResult = tree.classify(myTree, label, [1, 1])
print(testResult)
# Raw strings keep the backslash literal: in a plain literal "\N" is a
# malformed \N{...} escape and a SyntaxError on Python 3.  The byte content
# of the path is unchanged.
tree.storeTree(myTree, r"F:\NatureRecognition/tree.txt")
tt = tree.grabTree(r"F:\NatureRecognition/tree.txt")
print(tt)
#encoding=utf-8
# from pudb import set_trace
# set_trace()
# import pudb
# pu.db
import tree

# Build a tree from the data shipped with the tree module.
data = tree.load_data()
builtTree = tree.createTree(data)

import visualization

# Count the tree's leaves for visualisation purposes.
leaf = visualization.calLeafNum(builtTree)
# encoding=utf8 import tree import treePlotter if __name__ == '__main__': myDat, labels = tree.createDataSet() subLabels = labels[:] # print myDat # shannonEnt = tree.calcShannonEnt(myDat) # print shannonEnt # 熵越高,混合的数据越多,也就是种类越多 # myDat[0][-1] = 'maybe' # print tree.calcShannonEnt(myDat) # print tree.splitDataSet(myDat, 1, 1) # print tree.chooseBestFeatureToSplit(myDat) myTree = tree.createTree(myDat, subLabels) print myTree # print treePlotter.getNumLeafs(myTree) # print treePlotter.getTreeDepth(myTree) # treePlotter.createPlot() # print tree.classify(myTree, labels, [1, 0]) # print tree.classify(myTree, labels, [1, 1]) # print tree.classify(myTree, labels, [0, 0]) tree.storeTree(myTree, 'tree.txt') print tree.grabTree('tree.txt')
# 子树不剪,则继续下一个子树 cutBranch_uptodown(secondDict[key], subDataSet, tempfeatLabels) if __name__ == '__main__': global num num = 0 # dataset, features = ig.createDataSet() # dataset,features = createDataSet() # dataset, features = createDataSet_iris() dataset, features = createDataSetCNDA(os.getcwd() + '/templates/tree/bloodpresure/bloodpresure.xls') # print dataset # print dataset print features features2 = features[:] # labels2=labels:这样的赋值只是引用地址的传递,当labels改变时,labels2也会改变。只有labels2=labels[:]这样的才是真正的拷贝 tree = tree.createTree(dataset, features, 'C4.5') # print tree # print classify(tree,features2,[0,1,1,1,0]) tp.createPlot(tree) count = [] # getCount(tree,dataset,features2,count) # print num # print count cutBranch_uptodown(tree, dataset, features2) # cutBranch_downtoup(tree, dataset, features2, count) tp.createPlot(tree) def cutBranch_downtoup(inputTree, dataSet, featLabels, count): # 自底向上剪枝
def createGraph(self, node):
    """Build the tree rooted at ``node`` via the tree module.

    Returns 0 unconditionally.
    """
    tree.createTree(node)
    return 0
# -*- coding: UTF-8 -*- 或者 #coding=utf-8 ''' Created on 2016-8-19 @author: XiaoYuan1 ''' import tree import classify from copy import copy '创建训练数据源' mydat,labels = tree.createDataSet() labels2 = copy(labels) '构建决策树' mytree = tree.createTree(mydat, labels) print mytree '使用决策树模型对数据进行分类' result = classify.classify(mytree, labels2, [1,0]) print result
def createGraph(self, node):
    """Delegate the actual graph construction to tree.createTree.

    The return value is always 0.
    """
    tree.createTree(node)
    return 0
myTree=tree.createTree(myDat,labels) print myTree ''' ''' import treePlotter reload(treePlotter) print treePlotter.retrieveTree(1) myTree=treePlotter.retrieveTree(0) print treePlotter.getNumLeafs(myTree) print treePlotter.getTreeDepth(myTree) print treePlotter.createPlot(myTree) ''' import tree reload(tree) ''' myDat,labels=tree.createDataSet() print labels myTree=treePlotter.retrieveTree(0) print myTree print tree.classify(myTree,labels,[1,0]) print tree.classify(myTree,labels,[1,1]) tree.storeTree(myTree,'classfierStorage.txt') print tree.grabTree('classfierStorage.txt') ''' fr=open('lenses.txt') lenses=[inst.strip().split('\t') for inst in fr.readlines()] lensesLabels=['age','prescript','astigmatic','tearRate'] lensesTree=tree.createTree(lenses,lensesLabels) print lenses import treePlotter treePlotter.createPlot(lenseTree)
def testClass2(): fr = open('lenses.txt') lenses = [inst.strip().split('\t') for inst in fr.readlines()] lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate'] lensesTree = tree.createTree(lenses, lensesLabels) print lensesTree
#-*- coding:utf-8 -*- import tree mydat,label = tree.createDataSet() #mydat #tree.calcShannonEnt(mydat)#得到数据集的熵值 #reload(tree) #tree.splitDataSet(mydat,0,1)#得到第0个特征值为1的元素list #tree.splitDataSet(mydat,0,0) #tree.chooseBestFeatureToSplit(mydat)#得到最佳特征值索引 mytree = tree.createTree(mydat,label)#得到决策树信息 mytree
numLeafs = getNumLeafs(myTree) depth = getTreeDepth(myTree) firstStr = list(myTree.keys())[0] cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff) plotMidText(cntrPt, parentPt, nodeTxt) plotNode(firstStr, cntrPt, parentPt, decisionNode) secondDict = myTree[firstStr] plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD for key in secondDict.keys(): if type(secondDict[key]).__name__ == 'dict': plotTree(secondDict[key], cntrPt, str(key)) else: plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode) plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key)) plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD if __name__ == '__main__': # myDat, labels = createDataset() # myTree = createTree(myDat, labels) # createPlot(myTree) fr = open('./dataset/lenses.txt') lenses = [inst.strip().split('\t') for inst in fr.readlines()] lensesLabels = ['age', 'prescript', ' astigmatic', 'tearRate'] lensesTree = createTree(lenses, lensesLabels) createPlot(lensesTree)
query = query.lower() condition = condition.lower() observedList.append(query) observedCondition.append(condition) query = '' condition = '' take = 0 #print(observedList) #print(observedCondition) countobserved = 0 countquery = 0 node = Node(None, None, None, None, None) for i in range(0, args.iteration): isobserved = True testtree = tree.createTree() testtree.startTree() for node in testtree.nodes: name = node.name if name in observedList: if observedCondition[observedList.index(name)] != node.status: isobserved = False if isobserved: countobserved += 1 for node in testtree.nodes: if node.name in queryList: name = node.name if queryCondition[queryList.index(name)] == node.status: countquery += 1
# labels = ['no surfacing', 'filppers'] # dataset[0][-1] = 'maybe' # shannonEnt = tree.calcShannonEnt(dataset) # print shannonEnt # print tree.splitDataSet(dataset, 0, 0) # print tree.chooseBestFeature(dataset) # print tree.createTree(dataset, labels) # treeplotter.createPlot() # myTree = treeplotter.retrieveTree(0) # print myTree # print treeplotter.getNumLeafs(myTree) # print treeplotter.getTreeDepth(myTree) # treeplotter.createPlot(myTree) # print tree.classify(myTree, labels,[1,1]) fr = open('lenses.txt') lines = fr.readlines() lensesAll = [ inst.split("\t") for inst in lines] lensesTrain = lensesAll[5:len(lines)] lensesLables = ['age', 'prescript', 'astigmatic', 'tearRate'] lensesTree = tree.createTree(lensesTrain, lensesLables[:]) # treeplotter.createPlot(lensesTree) # lensesTree = tree.grabTree( 'Decision.txt') # treeplotter.createPlot(lensesTree) for i in range(5): print "分类为%s, 正确为%s" %(tree.classify(lensesTree, lensesLables, lensesAll[i][0:-1]), lensesAll[i][-1])
import tree
import treeplot


def loaddata(filename):
    """Load a tab-separated data file.

    :param filename: path of the file to read.
    :return: (dat, label) -- rows as lists of strings, and the fixed
        attribute labels.
    """
    # Context manager closes the file even if reading fails (the original
    # leaked the handle).
    with open(filename) as f:
        dat = [line.strip().split('\t') for line in f.readlines()]
    label = ['age', 'prescript', 'astigmatic', 'tearrate']
    return dat, label


if __name__ == "__main__":
    dat, label = loaddata("lenses.txt")
    lensestree = tree.createTree(dat, label)
    print(lensestree)
    treeplot.createPlot(lensestree)
import numpy as np
'''
lenses,lensesLabels = tree.createDataSet()
lensesTree = tree.createTree(lenses,lensesLabels)
print(lensesTree)
treeplotter.createPlot(lensesTree)
'''

# Fetch the iris data set straight from the UCI repository.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    header=None)
x = df.iloc[:, [0, 1, 2, 3]].values
y = df.iloc[:, 4].values
# NOTE(review): the thresholds used below (5.5, 3.3, 2.0, 1.0) do not all
# match the numbers in these label strings (5.55, 3.35, 2.45, 0.8) -- confirm
# which set is intended.
labels = [
    "sepal length<5. 55", "setal width>3. 35", "petal length<2. 45",
    "petal width>0. 8"
]
# Binarise each feature against its threshold; one stacked row per feature.
mydat = list(np.where(x[:, 0] < 5.5, 1, 0))
mydat = np.vstack([mydat, list(np.where(x[:, 1] < 3.3, 1, 0))])
mydat = np.vstack([mydat, list(np.where(x[:, 2] < 2., 1, 0))])
mydat = np.vstack([mydat, list(np.where(x[:, 3] < 1, 1, 0))])
# Back to one row per sample, class label appended as the last column.
mydat = np.column_stack((mydat.transpose(1, 0), y))
mydat = [list(row) for row in mydat]

lensesTree = tree.createTree(mydat, labels, "C4.5")
print(lensesTree)
treeplotter.createPlot(lensesTree)
def main():
    """Train a decision tree on the lenses data set and plot it."""
    # Context manager guarantees the file is closed (the original leaked the
    # handle).
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    Tree = tree.createTree(lenses, lensesLabels)
    createPlot(Tree)
示例:使用决策树预测隐形眼镜类型 (1)收集数据:提供的文本文件。 (2)准备数据:解析tab键分隔的数据行。 (3)分析数据:快速检查数据,确保正确地解析数据内容 (4)训练算法:使用3.1节的createTree 函数。 (5)测试算法:编写测试函数验证决策树可以正确分类给定的数据实例。 (6)使用算法:存储树的数据结构,以便下次使用时无需重新构造树。 ''' import tree fr=open('lenses.txt') ''' #lenses=[inst.strip().split('\t') for inst in fr.readline()] lensesLabels =['age','prescript','astigmatic','tearRate'] #lensesTree = createTree(lenses,lensesLabels) ''' dataSet=[] while True: line = fr.readline() if not line:break dataSet.append(line.strip().split('\t')) lensesLabels =['age','prescript','astigmatic','tearRate'] lensesTree = tree.createTree(dataSet,lensesLabels) print lensesTree ''' 匹配选项可能太多了。我们将这种问题称之为过度匹配(overfitting)。 为了减少过度匹配问题,我们可以裁剪决策树,去掉一些不必要的叶子节点。 如果叶子节点只能增加少许信息,则可以删除该节点,将它并人到其他叶子节点中。 '''
def test_createTree(): data, attributes = createDataSet() print tree.createTree(data, attributes)
import arff import tree import sys arg = sys.argv m = int(arg[3]) trainData = arff.load(open(arg[1], 'r')) testData = arff.load(open(arg[2], 'r')) myTree = tree.createTree(trainData['data'], trainData['attributes'], m) tree.plotTree(myTree, trainData['attributes']) prediction = [tree.classify(myTree, testData['attributes'], obs) for obs in testData['data']] true = [obs[-1] for obs in testData['data']] print "<Predictions for the Test Set Instances>" n = 0 for i in range(len(prediction)): index = i + 1 if prediction[i] == true[i]: n += 1 print "{}: Actual: {} Predicted: {}".format(n, true[i], prediction[i]) print "Number of correctly classified: {} Total number of test instances: {}".format(n, len(testData['data']))
#-*- coding:utf-8 –*- import preprocess import tree import evaluation parties, realSim = preprocess.preprocess( filename='NO_DMV_Match.csv', col=['last_name', 'first_name', 'middle_name'], parties=3, corruption=0.4, hashnumber=100, length=1000) print parties print realSim root = tree.createTree(parties, 2) #tree.printTree(root) clusters = tree.cluster(root) evaluation.printCluster(clusters) evaluation.compareAll(clusters, realSim, 0.7)
import tree as t import treePlotter as tp import os f = open(os.path.dirname(__file__) +'/lenses.txt') lenses = [r.strip().split('\t') for r in f.readlines()] lensesLabel = ['age','prescript','astigmatic','tearRate'] lensesTree = t.createTree(lenses,lensesLabel) tp.createPlot(lensesTree) fmt = '%10s' print [fmt % x for x in lensesLabel] for lense in lenses: print [fmt % x for x in lense],t.classify(lensesTree,lensesLabel,lense[0:-1])