예제 #1
0
파일: myC45.py 프로젝트: kookka/py-dtree
    def find_threshold(self, data_attr, data_target):
        '''
            Pick a numeric split threshold for one attribute column.

            data_attr and data_target are DataFrames; only the first column
            of each is used. Rows are sorted by the attribute, and every
            position where the target label differs from the next row's
            label is scored with Calculate.gain_ratio (when self.gain_ratio
            is truthy) or Calculate.info_gain otherwise.

            Returns a two-item list [threshold, score], where threshold is
            the mean of the attribute values on both sides of the best
            position. If no position scores above 0, the position stays at
            -1, so the threshold is the mean of the largest and smallest
            attribute values and the score is 0.
        '''
        attr_col = data_attr.columns[0]
        target_col = data_target.columns[0]
        merged = pd.concat([data_attr, data_target], axis=1)
        merged = merged.sort_values(attr_col).reset_index(drop=True)

        # Candidate split positions: rows whose label differs from the next row
        labels = merged[target_col]
        boundaries = [
            pos for pos in range(len(merged) - 1)
            if labels.iloc[pos] != labels.iloc[pos + 1]
        ]

        top_score = 0
        top_pos = -1
        for pos in boundaries:
            # Scoring function is resolved per candidate, as the criterion
            # flag is only consulted when there is something to score
            if self.gain_ratio:
                score_fn = Calculate.gain_ratio
            else:
                score_fn = Calculate.info_gain
            score = score_fn(merged[attr_col],
                             merged[target_col],
                             is_continue=True,
                             split_index=pos)
            if score > top_score:
                top_score, top_pos = score, pos

        # Threshold sits halfway between the two rows around the split
        lower = merged[attr_col].iloc[top_pos]
        upper = merged[attr_col].iloc[top_pos + 1]
        midpoint = (lower + upper) / 2

        return [midpoint, top_score]
예제 #2
0
    def fit(self, data, attributes, target_name):
        '''
            Build and return a decision tree using the ID3 algorithm.

            data        : DataFrame holding the attribute columns and the
                          target column.
            attributes  : collection of attribute column names that may
                          still be used for splitting. It is not modified.
            target_name : name of the target column in data.

            Returns a Tree. A leaf is returned when the target entropy is 0
            (result is that single label), when no attributes are left, or
            when no attribute scores above 0 (result is
            Calculate.most_label of the target in both of those cases).
            Otherwise the node splits on the best-scoring attribute and one
            child subtree is built per distinct value of it.
        '''
        data_target = data[target_name]
        entropy_data_target = Calculate.entropy(data_target)

        # Row count for every target label present in this subset
        value_dict = {
            label: len(rows)
            for label, rows in
            Calculate.get_unique_data(data, target_name).items()
        }

        # Data target contains one label
        if entropy_data_target == 0:
            # Positional access: the index of data is not guaranteed to
            # contain the label 0 (e.g. a filtered top-level frame)
            return Tree(
                Node(None,
                     entropy_data_target,
                     value_dict,
                     result=data_target.iloc[0],
                     is_leaf=True))

        # Find best attribute to be node using either info gain or gain ratio
        if self.gain_ratio:
            scorer = Calculate.gain_ratio
        else:
            scorer = Calculate.info_gain

        best_attr = None
        best_point = 0  # Could be Info gain or Gain ratio
        for attr in attributes:
            point = scorer(data[attr], data_target)
            if point > best_point:
                best_point = point
                best_attr = attr

        # No attribute left, or none of them improves on a score of 0:
        # stop here with the majority label instead of failing on a
        # split attribute that was never chosen
        if best_attr is None:
            return Tree(
                Node(None,
                     entropy_data_target,
                     value_dict,
                     result=Calculate.most_label(data_target),
                     is_leaf=True))

        # Build decision tree recursively
        dtree = Tree(Node(best_attr, best_point, value_dict))

        # Each subtree gets its own reduced attribute list, so sibling
        # branches (and the caller's list) are not affected by attributes
        # consumed deeper in another branch
        remaining = [attr for attr in attributes if attr != best_attr]

        # Scan all possible values to generate one subtree per value
        partitions = Calculate.get_unique_data(data, best_attr)
        for i, (edge_value, rows) in enumerate(partitions.items()):
            subset = pd.DataFrame(data=rows).reset_index(drop=True)
            subset = subset.drop(best_attr, axis=1)
            dtree.add_child(self.fit(subset, remaining, target_name))
            dtree.children[i].value.edge = edge_value
        return dtree