Esempio n. 1
0
def id3(x, y, feature_list, impurity_measure="entropy", parent_node=None):
    """
    Build a decision tree with the ID3 algorithm from labelled training data.

    The data is split on the attribute chosen by ``max_ig``; one child is
    grown recursively for every distinct value of that attribute. Recursion
    stops when the targets are pure or no features are left to split on.

    :param x: dataset of items as list of list (converted to a numpy array)
    :param y: target values for items as list, aligned with the rows of x
    :param feature_list: column indices still available for splitting as list;
        it is read only, never modified
    :param impurity_measure: impurity measure, "entropy" or "gini", as string
        (forwarded unchanged to ``max_ig``)
    :param parent_node: parent for this node iteration as Tree (node representation), default None
    :return: root node for tree as type Tree (node representation)
    """
    x = np.array(x)
    uniques, counts = np.unique(y, return_counts=True)  # Distinct target values and their frequencies

    if len(uniques) == 1:  # Set is pure, only one target value left in set
        return Tree(uniques[0], label=uniques[0])

    if len(feature_list) == 0:  # No more attributes to split on -> majority vote
        # Works for any number of classes. Ties resolve to the LAST most
        # frequent value, which matches the previous two-class behaviour.
        majority = len(counts) - 1 - np.argmax(counts[::-1])
        return Tree(uniques[majority], label=uniques[majority])

    best_attribute = max_ig(x, y, feature_list, impurity_measure)
    this_node = Tree(best_attribute, parent_node)

    # Build a fresh list instead of removing in place: the same list object
    # would otherwise be shared by the caller and by every sibling subtree,
    # so features consumed deep inside one branch would vanish for the others.
    remaining_features = [feature for feature in feature_list if feature != best_attribute]

    column = x[:, best_attribute]
    children_labels = []
    for value in np.unique(column):  # One child per distinct attribute value
        belongs = column == value  # Rows that follow this decision branch
        x_for_this_child = x[belongs]
        y_for_this_child = [target for target, keep in zip(y, belongs) if keep]

        # Continues to grow this child node
        new_child = id3(x_for_this_child, y_for_this_child, remaining_features,
                        impurity_measure, parent_node=this_node)
        new_child.set_was_split_on(value)  # Saves what attribute value this child was split on
        this_node.add_child(new_child)
        children_labels.append(new_child.label)

    # This node gets the most popular label among its children
    label_list, label_count = np.unique(children_labels, return_counts=True)
    this_node.set_label(label_list[np.argmax(label_count)])

    return this_node
Esempio n. 2
0
    def __fit_without_prune(self, data, features, target):
        '''
            Build the entire decision tree without pruning.

            Features are first classified as continuous or discrete with
            self.is_attr_continue. If none is continuous the work is
            delegated to MyID3(self.gain_ratio).fit. Otherwise the best
            split is searched among continuous features (threshold split,
            via self.find_threshold) and discrete features (info gain), and
            one subtree is built recursively per partition.

            data     -- pandas DataFrame holding the feature and target columns
            features -- names of the candidate feature columns
            target   -- name of the target column

            Returns a Tree. A leaf is returned when the target column is
            pure, or when no candidate split has a positive score.
        '''

        # Features whose column holds no rows are ignored.
        continuous_features = []
        discrete_features = []
        for feature in features:
            column = list(data[feature])
            if column:
                if self.is_attr_continue(column):
                    continuous_features.append(feature)
                else:
                    discrete_features.append(feature)

        # Also covers an empty `features`: nothing continuous -> plain ID3.
        if not continuous_features:
            return MyID3(self.gain_ratio).fit(data, features, target)

        # Continuous attribute

        entropy_data_target = Calculate.entropy(data[target])

        # Number of rows per target label, stored on every node built below.
        value_list = Calculate.get_unique_data(data, target)
        value_dict = {key: len(rows) for key, rows in value_list.items()}

        # If only one value exist
        if entropy_data_target == 0:
            return Tree(
                Node(
                    None,
                    0.0,  # Entropy must be 0 since only one value exist
                    value_dict,
                    # Positional access: a label lookup ([0]) fails when the
                    # DataFrame index does not contain the label 0.
                    result=data[target].iloc[0],
                    is_leaf=True))

        # Find best attribute and build tree recursively
        best_attr = None  # None means "no split with a positive score found"
        best_point = 0
        is_discrete = False
        best_splitter = 0
        chosen_edge = ['', '']
        for feature in continuous_features:
            # Indexed as [0] = threshold, [1] = score of that threshold.
            best_threshold = self.find_threshold(data[[feature]],
                                                 data[[target]])
            if best_threshold[1] > best_point:
                best_attr = str(feature)
                chosen_edge[0] = best_attr + ' > ' + str(best_threshold[0])
                chosen_edge[1] = best_attr + ' <= ' + str(best_threshold[0])
                best_point = best_threshold[1]
                best_splitter = best_threshold[0]
        for feature in discrete_features:
            point = Calculate.info_gain(data[feature], data[target])
            if point > best_point:
                best_point = point
                best_attr = str(feature)
                is_discrete = True

        # No feature separates the labels any further: stop with a
        # majority-label leaf instead of splitting on a non-existent column.
        if best_attr is None:
            return Tree(
                Node(None,
                     entropy_data_target,
                     value_dict,
                     result=Calculate.most_label(data[target]),
                     is_leaf=True))

        dtree = Tree(Node(best_attr, best_point, value_dict))

        # Scan all possible values to generate the subtrees
        if is_discrete:
            list_attribute = Calculate.get_unique_data(data, best_attr)
        else:
            list_attribute = Calculate.split_by_threshold(
                data, best_attr, best_splitter)

        for i, attribute in enumerate(list_attribute):
            # Separate name so the `data` parameter is not shadowed.
            subset = pd.DataFrame(data=list_attribute[attribute]).reset_index(
                drop=True)
            dtree.add_child(self.__fit_without_prune(subset, features, target))
            if is_discrete:
                dtree.children[i].value.edge = attribute
            else:
                # NOTE(review): assumes split_by_threshold yields the '>'
                # partition first and the '<=' partition second -- confirm.
                dtree.children[i].value.edge = chosen_edge[i]

        return dtree
Esempio n. 3
0
    def fit(self, data, attributes, target_name):
        '''
            Build and return a decision tree using the ID3 algorithm.

            data        -- table of training rows, indexable by column name
            attributes  -- list of candidate attribute (column) names; it is
                           read only, never modified
            target_name -- name of the target column

            Attributes are scored with Calculate.gain_ratio when
            self.gain_ratio is truthy, otherwise with Calculate.info_gain.
            Returns a Tree. A leaf is returned when the target column is
            pure, when no attributes are left, or when no attribute has a
            positive score.
        '''

        data_target = data[target_name]
        entropy_data_target = Calculate.entropy(data_target)

        # Number of rows per target label, stored on every node built below.
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = {key: len(rows) for key, rows in value_list.items()}

        # Data target contains one label
        if entropy_data_target == 0:
            # Positional access: on a pandas Series a plain [0] is a label
            # lookup and fails when the index does not contain the label 0.
            if hasattr(data_target, 'iloc'):
                first_label = data_target.iloc[0]
            else:
                first_label = data_target[0]
            return Tree(
                Node(None,
                     entropy_data_target,
                     value_dict,
                     result=first_label,
                     is_leaf=True))

        # Find best attribute to be node using either info gain or gain ratio
        score = Calculate.gain_ratio if self.gain_ratio else Calculate.info_gain
        best_attr = None  # None means "no attribute with a positive score"
        best_point = 0  # Could be Info gain or Gain ratio
        for attr in attributes:
            point = score(data[attr], data_target)
            if point > best_point:
                best_point = point
                best_attr = attr

        # No attribute left (empty `attributes`), or none that separates the
        # labels any further: stop with a majority-label leaf.
        if best_attr is None:
            return Tree(
                Node(None,
                     entropy_data_target,
                     value_dict,
                     result=Calculate.most_label(data_target),
                     is_leaf=True))

        # Build decision tree recursively
        dtree = Tree(Node(best_attr, best_point, value_dict))

        # Drop the used attribute through a fresh list: removing it in place
        # would also shrink the list seen by the caller and by every sibling
        # subtree, so later branches would lose attributes they never used.
        remaining = [attr for attr in attributes if attr != best_attr]

        # Scan all possible values to generate the subtrees
        list_attribute = Calculate.get_unique_data(data, best_attr)
        for i, attribute in enumerate(list_attribute):
            # Separate name so the `data` parameter is not shadowed.
            subset = pd.DataFrame(
                data=list_attribute[attribute]).reset_index(drop=True)
            subset.drop(best_attr, axis=1, inplace=True)
            dtree.add_child(self.fit(subset, remaining, target_name))
            dtree.children[i].value.edge = attribute
        return dtree