def tree_grow(x, y, nfeat, nmin=2, minleaf=1):
    """Grow a binary classification tree using gini-impurity reduction.

    Parameters:
        x (2D array): Data matrix (rows = observations, cols = features).
        y (1D array): Binary class labels (0/1), one per row of x.
        nfeat (int): Number of feature columns to consider at each split;
            if falsy, all columns of x are considered.
        nmin (int): Min # observations a node must contain for it to be
            allowed to split.
        minleaf (int): Min # observations required for a leaf node.

    Returns:
        Node: Root of the grown tree. Leaf nodes carry a ``.prediction``
        class label; internal nodes carry ``.split_feat`` / ``.split_val``.
    """
    print("GROWING CLASSIFICATION TREE")
    # Each node has a name, a list of row indices (records) and a "leaf" flag.
    root = Node('root', indices=np.arange(0, x.shape[0]), leaf=False)
    nodelist = [root]
    split_nr = 0  # used for child-node names

    while nodelist:  # while there are still nodes to process
        split_nr += 1
        current_node = nodelist.pop(0)  # FIFO: breadth-first growth

        # Candidate feature columns for this split: a random subset of size
        # nfeat (random-forest style), or every column when nfeat is falsy.
        if nfeat:
            feat_list = random.sample(list(np.arange(0, x.shape[1])), k=nfeat)
        else:
            feat_list = list(np.arange(0, x.shape[1]))

        [feat, split_val] = best_split(x, y, current_node, feat_list, minleaf)

        if feat is None and split_val is None:
            # No admissible split found: turn this node into a leaf.
            _make_leaf(current_node, y, current_node.indices)
        else:
            # Record the winning split (column number and threshold).
            current_node.split_feat = feat
            current_node.split_val = split_val

            # Left child: rows of the current node where x[:, feat] > split_val.
            indices_left = current_node.indices[x[current_node.indices, feat] > split_val]
            left = Node(f"L{split_nr}", parent=current_node, indices=indices_left)
            # Right child: rows of the current node not sent left.
            indices_right = np.setdiff1d(current_node.indices, indices_left)
            right = Node(f"R{split_nr}", parent=current_node, indices=indices_right)

            for child, child_indices in ((left, indices_left), (right, indices_right)):
                # Too small to split further, or already pure -> leaf node.
                if len(child_indices) < nmin or impurity(y[child_indices]) == 0:
                    _make_leaf(child, y, child_indices)
                else:
                    child.leaf = False
                    nodelist.append(child)

    print(f"TREE DONE")
    return root


def _make_leaf(node, y, indices):
    """Mark node as a leaf and attach the majority class label (ties -> 0)."""
    node.leaf = True
    node.y = y[indices]
    node.prediction = 1 if sum(node.y) / len(node.y) > 0.5 else 0
def tree_grow(x, y, nfeat, nmin=2, minleaf=1):
    """Grow a binary classification tree using gini-impurity reduction.

    Parameters:
        x (2D array): Data matrix (rows = observations, cols = features).
        y (1D array): Binary class labels (0/1), one per row of x.
        nfeat (int): Number of feature columns to consider at each split;
            if falsy, all columns of x are considered.
        nmin (int): Min # observations a node must contain for it to be
            allowed to split.
        minleaf (int): Min # observations required for a leaf node.

    Returns:
        Node: Root of the grown tree. Leaf nodes carry a ``.prediction``
        class label; internal nodes carry ``.split_feat`` / ``.split_val``.
    """
    # Each node has a name, a list of row indices (records) and a "leaf" flag.
    root = Node('root', indices=np.arange(0, x.shape[0]), leaf=False,
                prediction=None, split_feat=None, split_val=None)
    nodelist = [root]
    split_nr = 0  # used for child-node names

    while nodelist:  # while nodelist not empty
        split_nr += 1
        current_node = nodelist.pop(0)  # FIFO: breadth-first growth
        print(
            f"\n PROCESSING NEW NODE {current_node} on subtree: x = \n {x[current_node.indices, :]} \n y = {y[current_node.indices]}"
        )

        # Candidate feature columns for this split: a random subset of size
        # nfeat (random-forest style), or every column when nfeat is falsy.
        if nfeat:
            feat_list = random.sample(list(np.arange(0, x.shape[1])), k=nfeat)
        else:
            feat_list = list(np.arange(0, x.shape[1]))

        # For each candidate feature: [feature index, impurity reduction,
        # split value] of that feature's best admissible split.
        poss_splits = []
        for f in feat_list:
            [reduction_val, split_val] = bestsplit_of_col(
                x[current_node.indices, f], y[current_node.indices], minleaf)
            if reduction_val != 0:  # an admissible split exists for feature f
                poss_splits.append([f, reduction_val, split_val])

        if not poss_splits:
            # No possible split found: make a leaf with the majority label.
            current_node.leaf = True
            # BUG FIX: the majority vote must be over the class labels of the
            # node's records (y[current_node.indices]); the original summed
            # the row indices themselves, yielding an arbitrary prediction.
            node_labels = y[current_node.indices]
            if sum(node_labels) / len(node_labels) > 0.5:
                current_node.prediction = 1
            else:
                current_node.prediction = 0
        else:
            # Choose the split with the highest impurity reduction: sort by
            # the reduction value (column 2) in descending order.
            # (lambda arg renamed from `x` to `s` to stop shadowing the data
            # matrix parameter.)
            poss_splits.sort(key=lambda s: s[1], reverse=True)
            # TODO: add tiebreaker for features with identical reduction.
            feat = poss_splits[0][0]
            split_val = poss_splits[0][2]
            current_node.split_feat = feat
            current_node.split_val = split_val

            # Left child: rows of the current node where x[:, feat] > split_val.
            indices_left = current_node.indices[x[current_node.indices, feat] > split_val]
            left = Node(f"L{split_nr}", parent=current_node, indices=indices_left)
            if (len(indices_left) < nmin) or (impurity(y[indices_left]) == 0):
                # Child too small to split further, or already pure -> leaf.
                left.leaf = True
                left.y = y[indices_left]
                left.prediction = 1 if sum(left.y) / len(left.y) > 0.5 else 0
            else:
                left.leaf = False
                nodelist.append(left)

            # Right child: rows of the current node not sent left.
            indices_right = np.setdiff1d(current_node.indices, indices_left)
            right = Node(f"R{split_nr}", parent=current_node, indices=indices_right)
            if (len(indices_right) < nmin) or (impurity(y[indices_right]) == 0):
                right.leaf = True
                right.y = y[indices_right]
                right.prediction = 1 if sum(right.y) / len(right.y) > 0.5 else 0
            else:
                right.leaf = False
                nodelist.append(right)

        print(
            f"Finished processing node, tree now looks as follows: \n {RenderTree(root)}"
        )

    print(f"\n TREE DONE: \n {RenderTree(root)}")
    return root