def build_tree(self, x_train: pd.DataFrame, y_train: np.array, tree: TreeNode) -> None:
    """
    Recursively grow a decision tree below ``tree`` using ID3 feature selection.

    :param x_train: Pandas data frame holding the training samples (features).
                    Shape is N*M, N samples by M features.
    :param y_train: Numpy array with the labels of the samples. Length is N.
    :param tree: The node under which the new sub-tree is attached.
    :return: None. The tree is modified in place through ``tree.add_child``.
    """
    # Stop splitting when a single sample is left, when the number of labels
    # is at or below the pruning limit, or when the label check succeeds.
    # The checks are evaluated lazily, in this order.
    is_leaf = (
        x_train.shape[0] == 1
        or y_train.shape[0] <= self.pruning_M
        or self.are_all_labels_equal(y_train)
    )
    if is_leaf:
        tree.add_child(TreeNode(self.find_major_class(y_train)))
        return

    # Pick the feature to split on together with its threshold.
    f_name, best_threshold = self.get_feature_with_max_info_gain(x_train, y_train)

    # The decision node stores the feature name and the threshold (as a
    # string) so that prediction can follow the same split later.
    split_node = TreeNode((f_name, str(best_threshold)))
    tree.add_child(split_node)

    samples_great, labels_great, samples_small, labels_small = self.split_by_threshold(
        data=x_train, threshold=best_threshold, feature_name=f_name)

    # Handle the "great" partition first and the "small" one second, so the
    # children are attached in the same order as before.
    partitions = (
        (samples_great, labels_great),
        (samples_small, labels_small),
    )
    for subset, subset_labels in partitions:
        if subset.shape[0] == 1:
            # A single sample needs no recursion: its label becomes the leaf.
            split_node.add_child(TreeNode(subset_labels[0]))
        else:
            self.build_tree(subset, subset_labels, split_node)
def _build_parent_node_list(directory, level, children):
    """
    Group ``children`` pairwise under freshly created parent nodes.

    ``children`` is sorted in place by ``get_path()``; adjacent children are
    then paired and each pair is attached to a new ``TreeNode``. When the
    number of children is odd, the last child is added to the final pair, so
    that parent receives three children. After its children are attached,
    ``calc_signature()`` is called on each parent.

    Parent nodes are named ``"<directory>_<level>_<n>"`` where ``n`` is the
    1-based number of the pair within this level.

    :param directory: Prefix used for the generated parent node names.
    :param level: Level identifier embedded in the parent node names.
    :param children: List of child nodes; it is reordered in place.
    :return: List of the new parent nodes. The list is empty when fewer than
             two children are given (no pair can be formed).
    """
    # In-place sort is kept deliberately: callers observed this side effect
    # in the original implementation.
    children.sort(key=lambda child_node: child_node.get_path())

    child_count = len(children)
    node_list = []

    # Odd indices mark the second element of each pair.
    index = 1
    while index < child_count:
        # The original built the name with ``str + int`` (a TypeError) and
        # used ``% 2``, which is always 0 here; use the pair number instead.
        pair_number = (index + 1) // 2
        node = TreeNode('{}_{}_{}'.format(directory, level, pair_number))

        # Attach order is unchanged: the later child first, then the earlier.
        node.add_child(children[index])
        node.add_child(children[index - 1])

        if index == child_count - 2:
            # Exactly one child is left over: fold it into this last parent.
            index += 1
            node.add_child(children[index])

        node.calc_signature()
        node_list.append(node)
        index += 2

    return node_list
def _create_dir_node(dir_path, node_list):
    """
    Create a node for ``dir_path`` and attach every node in ``node_list`` to it.

    :param dir_path: Value passed to the ``TreeNode`` constructor.
    :param node_list: Iterable of nodes added as children, in iteration order.
    :return: The new node, after ``calc_signature()`` has been called on it.
    """
    dir_node = TreeNode(dir_path)

    for child in node_list:
        dir_node.add_child(child)

    # The signature is calculated only once all children are attached.
    dir_node.calc_signature()
    return dir_node