def __init__(self, inputs, labels, featureName, featureNames, featureTypes,
             subset=None, boundary=None, operator=''):
    """Build a nonterminal node that splits its data on a single feature.

    Args:
        inputs: 2-D array of samples; the column matching ``featureName``
            is the one the decision is evaluated on.
        labels: Array of labels, indexed in step with the rows of ``inputs``.
        featureName: Name of the feature this node decides on.
        featureNames: Array of all feature names, compared element-wise
            against ``featureName`` to locate the column.
        featureTypes: Per-column type tags; ``'string'`` marks a
            categorical feature, anything else is treated as numerical.
        subset: Categories passed to the categorical decision. Defaults to
            a fresh empty list.
        boundary: Threshold passed to the numerical decision.
        operator: Comparison operator passed to the numerical decision.

    Raises:
        IndexError: If ``featureName`` does not occur in ``featureNames``.
    """
    # Use a fresh list per call: a shared mutable default would be handed
    # to every decision created without an explicit subset.
    if subset is None:
        subset = []

    # Find the actual feature array based on the feature name.
    colIndex = np.where(featureName == featureNames)[0][0]
    features = inputs[:, colIndex]

    # The key feature of nonterminal nodes is that they have a decision.
    if featureTypes[colIndex] == 'string':
        # Categorical feature -> categorical decision.
        self.decision = Decision.Categorical(featureName, subset)
    else:
        # Any other type is treated as numerical.
        self.decision = Decision.Numerical(featureName, operator, boundary)

    # Create the true and false child nodes by routing every row to one
    # side according to the decision.
    trueIndices = np.vectorize(self.decision.function)(features)
    falseIndices = np.logical_not(trueIndices)
    self.trueNode = Node(inputs[trueIndices], labels[trueIndices], '')
    self.falseNode = Node(inputs[falseIndices], labels[falseIndices], '')

    # The node will never use its input field again, but the labels field
    # is needed for tree pruning.
    Node.__init__(self, [], labels, str(self.decision), 'decision')
def selectBestFeature(inputs, labels, impurityCat, impurityNum,
                      featuresToConsider, featureNames, featureTypes,
                      numIntervals, seed=0):
    """Return the candidate split with the lowest summed child impurity.

    A subset of ``featuresToConsider`` features is drawn without
    replacement, and for each drawn feature the best binary split is
    searched:

    * categorical features (type tag ``'string'``): one-vs-rest splits,
      one per distinct value;
    * all other features: thresholds taken as the maximum of each of up to
      ``numIntervals`` equally sized chunks of the sorted values.

    Args:
        inputs: 2-D array of samples (rows) by features (columns).
        labels: Array of labels, indexed in step with the rows of ``inputs``.
        impurityCat: Forwarded unchanged to ``impurityResubLabel``.
        impurityNum: Forwarded unchanged to ``impurityResubLabel``.
        featuresToConsider: Number of features to sample; must lie in
            ``[0, inputs.shape[1]]``.
        featureNames: Array of feature names, one per column.
        featureTypes: Per-column type tags; ``'string'`` means categorical.
        numIntervals: Upper limit on the number of candidate thresholds
            per numerical feature.
        seed: Seed for the feature sampling. The same seed always yields
            the same feature subset.

    Returns:
        A tuple ``(featureName, isCategorical, arguments)``. ``arguments``
        is the one-element category list for a categorical split, or
        ``('less', boundary)`` for a numerical split. ``('', '', '')`` is
        returned when no candidate split was found.

    Raises:
        ValueError: If ``featuresToConsider`` is out of range.
    """
    # Seed the global Python RNG (as the original code did) and draw the
    # feature sample from that same generator so `seed` actually controls
    # which features are considered.
    random.seed(seed)

    numFeatures = inputs.shape[1]
    if featuresToConsider > numFeatures or featuresToConsider < 0:
        raise ValueError(
            'featuresToConsider must be between 0 and {maxx}'.format(
                maxx=numFeatures))

    chosenColumns = random.sample(range(len(featureNames)),
                                  featuresToConsider)

    numSamples = len(labels)
    bestImpurity = np.inf
    # In the form (featureName, isCategorical, arguments).
    bestSplit = ('', '', '')

    for colIndex in chosenColumns:
        name = featureNames[colIndex]
        vals = inputs[:, colIndex]

        if featureTypes[colIndex] == 'string':
            # Categorical feature: try a one-vs-rest split for each distinct
            # value and keep the one with the lowest impurity. Starting from
            # bestImpurity means only splits that beat the best feature seen
            # so far are recorded.
            subset = []
            splitImpurity = bestImpurity
            for cat in np.unique(vals):
                dec = Decision.Categorical(name, [cat])
                trueIndices = np.vectorize(dec.function)(vals)
                falseIndices = np.logical_not(trueIndices)
                trueNodeImpurity = impurityResubLabel(
                    labels[trueIndices], impurityCat, impurityNum,
                    np.count_nonzero(trueIndices) / numSamples)
                falseNodeImpurity = impurityResubLabel(
                    labels[falseIndices], impurityCat, impurityNum,
                    np.count_nonzero(falseIndices) / numSamples)
                candidate = trueNodeImpurity + falseNodeImpurity
                if splitImpurity > candidate:
                    subset = [cat]
                    splitImpurity = candidate
            split = (name, True, subset)
        else:
            # Numerical feature: discretize the continuous values into
            # "numIntervals" bins, or as many as there are samples.
            def findReImpurityGivenBoundary(bound, vals=vals):
                """Summed impurity of the <= bound and > bound partitions."""
                lessBound = labels[vals <= bound]
                moreBound = labels[vals > bound]
                trueNodeReImpurity = impurityResubLabel(
                    lessBound, impurityCat, impurityNum,
                    len(lessBound) / numSamples)
                falseNodeReImpurity = impurityResubLabel(
                    moreBound, impurityCat, impurityNum,
                    len(moreBound) / numSamples)
                return trueNodeReImpurity + falseNodeReImpurity

            numIntervalsForContinuousFeat = min(numIntervals, inputs.shape[0])
            # Candidate thresholds: the largest value of each sorted chunk.
            possibleBounds = np.array([
                max(chunk)
                for chunk in np.array_split(np.sort(vals),
                                            numIntervalsForContinuousFeat)
            ])

            splitImpurity = np.inf
            bestBoundary = 0
            for bound in possibleBounds:
                imp = findReImpurityGivenBoundary(bound)
                if splitImpurity > imp:
                    splitImpurity = imp
                    bestBoundary = bound
            split = (name, False, ('less', bestBoundary))

        # Strict comparison: on ties the earlier feature is kept.
        if bestImpurity > splitImpurity:
            bestImpurity = splitImpurity
            bestSplit = split

    return bestSplit