def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature column with the largest information gain.

    Every row of ``dataset`` is expected to hold its feature values followed
    by the class label in the last position (``createTree`` reads the label
    with ``example[-1]``).  Only the columns before the label are evaluated
    as split candidates.

    For each candidate column the dataset is partitioned once per distinct
    value via ``split_dataset.splitDataSet`` and the size-weighted sum of
    ``entropy_shannon.calcShannonEnt`` over the partitions is subtracted from
    the entropy of the whole dataset.
    NOTE(review): the exact semantics of those two helpers are not visible
    here -- presumably ``splitDataSet(dataset, i, value)`` returns the rows
    whose column ``i`` equals ``value``; confirm against their modules.

    Args:
        dataset: non-empty sequence of equally long rows (features + label).

    Returns:
        The column index with the highest strictly positive gain (the first
        such column on ties), or ``-1`` when no column improves on the base
        entropy.
    """
    # The last column is the class label, not a feature.  Scoring it as a
    # candidate would let the label itself be picked as the "best feature".
    numFeatures = len(dataset[0]) - 1
    baseEntropy = entropy_shannon.calcShannonEnt(dataset)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("baseEntropy " + str(baseEntropy))
    totalRows = float(len(dataset))  # loop-invariant, computed once
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataset)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = split_dataset.splitDataSet(dataset, i, value)
            prob = len(subDataSet) / totalRows
            newEntropy += prob * entropy_shannon.calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        print("infoGain & entropy for " + str(i) + " => " + str(infoGain)
              + " " + str(newEntropy))
        # Strict comparison: a later column must beat, not merely match,
        # the current best to replace it.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature column with the largest information gain.

    Every row of ``dataset`` is expected to hold its feature values followed
    by the class label in the last position (``createTree`` reads the label
    with ``example[-1]``).  Only the columns before the label are evaluated
    as split candidates.

    For each candidate column the dataset is partitioned once per distinct
    value via ``split_dataset.splitDataSet`` and the size-weighted sum of
    ``entropy_shannon.calcShannonEnt`` over the partitions is subtracted from
    the entropy of the whole dataset.
    NOTE(review): the exact semantics of those two helpers are not visible
    here -- presumably ``splitDataSet(dataset, i, value)`` returns the rows
    whose column ``i`` equals ``value``; confirm against their modules.

    Args:
        dataset: non-empty sequence of equally long rows (features + label).

    Returns:
        The column index with the highest strictly positive gain (the first
        such column on ties), or ``-1`` when no column improves on the base
        entropy.
    """
    # The last column is the class label, not a feature.  Scoring it as a
    # candidate would let the label itself be picked as the "best feature".
    numFeatures = len(dataset[0]) - 1
    baseEntropy = entropy_shannon.calcShannonEnt(dataset)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("baseEntropy " + str(baseEntropy))
    totalRows = float(len(dataset))  # loop-invariant, computed once
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataset)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = split_dataset.splitDataSet(dataset, i, value)
            prob = len(subDataSet) / totalRows
            newEntropy += prob * entropy_shannon.calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        print("infoGain & entropy for " + str(i) + " => " + str(infoGain)
              + " " + str(newEntropy))
        # Strict comparison: a later column must beat, not merely match,
        # the current best to replace it.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def createTree(dataset, labels):
    """Recursively build a decision tree as nested dictionaries.

    Each row of ``dataset`` holds feature values followed by the class label
    in the last position.  ``labels`` names the feature columns, so
    ``labels[i]`` is the name of column ``i``.

    Args:
        dataset: non-empty sequence of equally long rows (features + label).
        labels: list of feature names; it is NOT modified by this function.

    Returns:
        A class label when the branch is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataset]
    # Pure node: every row carries the same class, so that class is the leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column remains: nothing left to split on, so the result
    # of the majority vote must be returned as the leaf.
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataset)
    # chooseBestFeatureToSplit reports -1 when no column has positive gain.
    # Using -1 as an index would split on the class column itself, so fall
    # back to a majority-vote leaf instead.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Work on a copy so the caller's label list survives the call intact.
    remainingLabels = list(labels)
    del remainingLabels[bestFeat]
    uniqueVals = set(example[bestFeat] for example in dataset)
    for value in uniqueVals:
        # Each branch gets its own copy; the recursion must not share state
        # between sibling subtrees.
        subLabels = remainingLabels[:]
        myTree[bestFeatLabel][value] = createTree(
            split_dataset.splitDataSet(dataset, bestFeat, value), subLabels)
    return myTree
def createTree(dataset, labels):
    """Recursively build a decision tree as nested dictionaries.

    Each row of ``dataset`` holds feature values followed by the class label
    in the last position.  ``labels`` names the feature columns, so
    ``labels[i]`` is the name of column ``i``.

    Args:
        dataset: non-empty sequence of equally long rows (features + label).
        labels: list of feature names; it is NOT modified by this function.

    Returns:
        A class label when the branch is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataset]
    # Pure node: every row carries the same class, so that class is the leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column remains: nothing left to split on, so the result
    # of the majority vote must be returned as the leaf.
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataset)
    # chooseBestFeatureToSplit reports -1 when no column has positive gain.
    # Using -1 as an index would split on the class column itself, so fall
    # back to a majority-vote leaf instead.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Work on a copy so the caller's label list survives the call intact.
    remainingLabels = list(labels)
    del remainingLabels[bestFeat]
    uniqueVals = set(example[bestFeat] for example in dataset)
    for value in uniqueVals:
        # Each branch gets its own copy; the recursion must not share state
        # between sibling subtrees.
        subLabels = remainingLabels[:]
        myTree[bestFeatLabel][value] = createTree(
            split_dataset.splitDataSet(dataset, bestFeat, value), subLabels)
    return myTree