def create_immediate_children(self):
    """Make this node a leaf, or attach one child node per split partition.

    ``MathFunctions.determine_best_split`` is asked for a split of
    ``self.mydataframe`` with respect to ``self.target_attribute``.

    * When the returned split column is falsy, the node becomes a leaf:
      the single distinct value found in the target column is stored in
      ``self.leaf_class``.
    * Otherwise the returned score and column are stored in ``self.score``
      and ``self.decision_attribute``. For each entry of the returned
      split values a ``Node`` is created (one level deeper, same target
      attribute) holding the rows whose split-column value is in that
      entry. Each child is wrapped in an object produced by
      ``self.child_edge()`` and appended to ``self.child_edges``.

    Returns:
        None. The method only mutates ``self`` and the created children.

    Raises:
        AssertionError: if ``self.mydataframe`` or
            ``self.target_attribute`` is None, or if no split is available
            while the target column does not hold exactly one distinct
            value.
    """
    assert self.mydataframe is not None, "create_child_nodes called with no dataframe"
    assert self.target_attribute is not None, "create child nodes called with no target_attribute"

    # TODO: tracking which columns are still available for splitting in
    # descendant nodes would let us hand that column list to
    # determine_best_split and reduce the checks it performs (how much that
    # helps depends on how expensive pandas.unique is).
    split_column, split_values, score = MathFunctions.determine_best_split(
        self.mydataframe, self.target_attribute
    )

    # Leaf case: nothing left to split on, so record the class and stop.
    if not split_column:
        distinct_targets = self.mydataframe[self.target_attribute].unique()
        assert len(distinct_targets) == 1
        self.leaf_class = distinct_targets[0]
        return

    # Internal node: remember the decision, then build one child per partition.
    self.score = score
    self.decision_attribute = split_column

    for partition in split_values:
        child = Node(parent_node=self)
        child.depth = self.depth + 1
        child.target_attribute = self.target_attribute

        # Rows whose value in the split column belongs to this partition.
        row_mask = self.mydataframe[split_column].isin(partition)
        child.mydataframe = self.mydataframe.loc[row_mask]

        edge = self.child_edge()
        edge.node_ptr = child
        edge.split_portion = partition
        self.child_edges.append(edge)
def test_determine_split(self):
    """Return the best split column and split values for this node's data.

    Runs ``MathFunctions.determine_best_split`` on ``self.mydataframe``
    with ``self.target_attribute`` and discards the third element of its
    result. Nothing on ``self`` is modified.

    Returns:
        tuple: ``(split_column, split_values)`` as returned by
        ``MathFunctions.determine_best_split``.

    Raises:
        AssertionError: if ``self.target_attribute`` is not one of the
            dataframe's columns.
    """
    # The message is built by concatenation on purpose, matching the
    # original behaviour for any target_attribute value.
    assert self.target_attribute in self.mydataframe.columns, (
        self.target_attribute + " was not in column map"
    )

    best_column, best_values, _unused = MathFunctions.determine_best_split(
        self.mydataframe, self.target_attribute
    )
    return best_column, best_values