Esempio n. 1
0
    def build_tree(self, x_train: pd.DataFrame, y_train: np.array,
                   tree: TreeNode) -> None:
        """
        Recursively grow a decision tree underneath ``tree``.

        At each step either a leaf holding the majority class is attached, or
        a decision node holding ``(feature name, threshold as string)`` is
        attached and the data is split in two around that threshold.

        :param x_train: Pandas data frame with the samples (features) reaching this point of the tree.
        :param y_train: Numpy array with the labels matching ``x_train``.
        :param tree: The node under which the new leaf / decision node is attached.
        :return: None. The tree is extended in place through ``tree.add_child``.
        """
        # Stop splitting when a single sample is left, when the number of labels
        # is at or below the pruning parameter, or when every label is the same.
        should_stop = (x_train.shape[0] == 1
                       or y_train.shape[0] <= self.pruning_M
                       or self.are_all_labels_equal(y_train))
        if should_stop:
            tree.add_child(TreeNode(self.find_major_class(y_train)))
            return

        # Pick the feature (and its threshold) with the highest information gain.
        f_name, best_threshold = self.get_feature_with_max_info_gain(
            x_train, y_train)

        # The decision node keeps the threshold as a string so it can be read
        # back at prediction time.
        decision_node = TreeNode((f_name, str(best_threshold)))
        tree.add_child(decision_node)

        samples_great, labels_great, samples_small, labels_small = self.split_by_threshold(
            data=x_train, threshold=best_threshold, feature_name=f_name)

        # Handle the "greater" partition first, then the "smaller" one, so the
        # children are attached in the same order as before.
        partitions = ((samples_great, labels_great),
                      (samples_small, labels_small))
        for subset, subset_labels in partitions:
            if subset.shape[0] == 1:
                # A lone sample becomes a leaf directly; no recursion needed.
                decision_node.add_child(TreeNode(subset_labels[0]))
            else:
                self.build_tree(subset, subset_labels, decision_node)
Esempio n. 2
0
def _build_parent_node_list(directory, level, children):
    node_list = list()
    index = 0

    children.sort(key=lambda child_node: child_node.get_path())
    while index < len(children):
        if index % 2 != 0:
            node = TreeNode(directory + '_' + level + '_' + (index + 1) % 2)
            node.add_child(children[index])
            node.add_child(children[index - 1])

            if index == len(children) - 2:
                index += 1
                node.add_child(children[index])

            node.calc_signature()
            node_list.append(node)
        index += 1
    return node_list
Esempio n. 3
0
def _create_dir_node(dir_path, node_list):
    """Create a node for ``dir_path`` that owns every node in ``node_list``.

    :param dir_path: Value passed to the ``TreeNode`` constructor for the new node.
    :param node_list: Iterable of nodes to attach, in order, as children.
    :return: The new node, after ``calc_signature()`` has been called on it.
    """
    dir_node = TreeNode(dir_path)
    for child in node_list:
        dir_node.add_child(child)
    # The signature is calculated only after all children are attached.
    dir_node.calc_signature()
    return dir_node