Beispiel #1
0
    def Find_Best_Feature(self, df):
        """
            Return the input feature whose split gives the lowest impurity.

            For every input column the weighted impurity of the labels after
            splitting on that column is computed; the column with the smallest
            (rounded) impurity is selected.

            [Parameter]:
                df : [pandas DataFrame][N x D] : training data. The last column
                     holds the output label, all other columns are input features.

            [Reads from self]:
                self.Category_feature_idx : positions (within the input columns)
                     of the features handled as categorical. Every other
                     position is handled as numeric.
                self.criterion : forwarded unchanged to impurity_func and
                     Finding_split_point (presumably selects entropy vs. gini
                     -- confirm against those helpers).

            [Variables] :
                input_feature : column names without the last (output) column.
                Y : label values, taken positionally from the last column.
                split_value : for a numeric feature, the threshold returned by
                     Finding_split_point; rows are divided into
                     (value < split_value) and (value >= split_value).
                impurity_list : rounded impurity of every input feature, in
                     column order.

            [return]:
                Best_feature : (string) name of the selected column
                feature_type : (string) 'Category' or 'Numeric'
        """

        header = df.columns.values
        input_feature = header[:-1]

        # The labels do not depend on the feature being examined, so they are
        # extracted once instead of once per column.
        Y = df.values[:, -1]
        n_total = len(Y)

        impurity_list = []
        # for all features in DataFrame,
        for idx, h in enumerate(input_feature):
            col_data = df[h].values

            # Category Feature Case: one partition per distinct value.
            if idx in self.Category_feature_idx:
                impurity = 0
                for attr in np.unique(col_data):
                    y0 = Y[col_data == attr]
                    p0 = len(y0) / n_total
                    impurity += p0 * impurity_func(y0, self.criterion)

            # Numeric Feature Case: a single binary split at split_value.
            else:
                split_value = Finding_split_point(df, h, self.criterion)
                less_idx = col_data < split_value

                y0 = Y[less_idx]
                y1 = Y[~less_idx]

                p0 = len(y0) / n_total
                p1 = len(y1) / n_total
                # The split does not depend on the individual distinct values,
                # so it is evaluated exactly once per feature.
                impurity = np.sum([
                    p0 * impurity_func(y0, self.criterion),
                    p1 * impurity_func(y1, self.criterion)
                ])

            # Rounding makes near-identical impurities compare as ties;
            # argmin below then keeps the first such feature.
            impurity_list.append(np.round(impurity, 6))

        idx = np.argmin(impurity_list)
        Best_feature = input_feature[idx]
        feature_type = 'Category' if idx in self.Category_feature_idx else 'Numeric'

        return Best_feature, feature_type
Beispiel #2
0
    def Find_Best_Feature(self, df):
        """
            Return the input feature whose split gives the lowest impurity.

            For every input column the weighted impurity of the labels after
            splitting on that column is computed; the column with the smallest
            (rounded) impurity is selected.

            [Parameter]:
                df : [pandas DataFrame][N x D] : training data. The last column
                     holds the output label, all other columns are input features.

            [Reads from self]:
                self.Category_feature_idx : positions (within the input columns)
                     of the features handled as categorical. Every other
                     position is handled as numeric.
                self.criterion : forwarded unchanged to impurity_func and
                     Finding_split_point (presumably selects entropy vs. gini
                     -- confirm against those helpers).

            [Variables] :
                input_feature : column names without the last (output) column.
                Y_data : label values of the last column.
                split_value : for a numeric feature, the threshold returned by
                     Finding_split_point; rows are divided into
                     (value < split_value) and (value >= split_value).
                impurity_list : rounded impurity of every input feature, in
                     column order.

            [return]:
                Best_feature : (string) name of the selected column
                feature_type : (string) 'Category' or 'Numeric'
        """

        header = df.columns.values
        input_feature = header[:-1]
        output_feature = header[-1]
        Y_data = df[output_feature].values
        total_value_len = len(Y_data)  # total number of rows

        impurity_list = []

        # for all features in DataFrame,
        for idx, h in enumerate(input_feature):
            col_values = df[h].values
            impurity = 0

            # Category Feature Case: e.g. a column holding only 'high' / 'normal'.
            if idx in self.Category_feature_idx:
                # Series.unique() yields the distinct values in order of first
                # appearance, replacing the manual "append if not seen" scan.
                for value in df[h].unique():
                    # Labels of the rows whose feature equals `value`, e.g.
                    # column (h, h, n, h) with labels (yes, no, yes, yes)
                    # gives (yes, no, yes) for value == h.
                    data = Y_data[col_values == value]
                    # Weight each partition's impurity by its share of rows:
                    # in the example above (3/4)*I(2, 1) + (1/4)*I(1, 0).
                    impurity += (len(data) / total_value_len) * impurity_func(
                        data, self.criterion)

            # Numeric Feature Case: a single binary split at split_value.
            else:
                split_value = Finding_split_point(df, h, self.criterion)

                less_idx = col_values < split_value  # rows below the split

                y0 = Y_data[less_idx]   # labels of rows below the split
                y1 = Y_data[~less_idx]  # labels of the remaining rows

                p0 = len(y0) / total_value_len
                p1 = len(y1) / total_value_len

                # Weighted sum of the two partitions' impurities.
                impurity = np.sum([
                    p0 * impurity_func(y0, self.criterion),
                    p1 * impurity_func(y1, self.criterion)
                ])

            # Rounding makes near-identical impurities compare as ties;
            # argmin below then keeps the first such feature.
            impurity_list.append(np.round(impurity, 6))

        idx = np.argmin(impurity_list)
        Best_feature = input_feature[idx]
        feature_type = 'Category' if idx in self.Category_feature_idx else 'Numeric'

        return Best_feature, feature_type
Beispiel #3
0
    def make_child(self, node, Best_feature, feature_type):
        '''
        Split the data held by `node` on Best_feature and attach the resulting
        child nodes to node.child. node.split_feature is set to Best_feature.

        feature_type == 'Category':
            One child is created per distinct value of the column. Values of
            type np.float64 / np.float32 are truncated to int first. The child
            is stored under the value itself (node.child[value]) and is built
            with the label 'feature_name = value' (e.g. 'outlook = sunny').

        feature_type == 'Numeric':
            The threshold is obtained from Finding_split_point and exactly two
            children are created: rows with (data < x) and rows with (data >= x).
            They are stored under the keys '< x' and '>= x', where x is
            formatted with one decimal place, and are built with the label
            'feature_name < x' / 'feature_name >= x' (unrounded x).

        Any other feature_type creates no children.

        Progress information is printed for every branch; it reads
        self.branch_name and node.depth.

        [Parameter]
            node : current Node (class object); its df, depth and child
                   attributes are used.
            Best_feature: name of the column to split on (string)
            feature_type: 'Category' or 'Numeric' (string)
        [return]
            nothing.
        '''

        frame = node.df
        column = frame[Best_feature]
        node.split_feature = Best_feature

        print('\nBest_Feature: ', Best_feature)

        if feature_type == 'Category':
            for branch_no, level in enumerate(np.unique(column)):
                # Exact type match on purpose: only these two numpy float
                # types are converted.
                if type(level) in (np.float64, np.float32):
                    level = int(level)
                print('\t' * node.depth, 'parent: ', self.branch_name,
                      '\tDepth:', node.depth, '\t', branch_no, 'th branch: ',
                      level)
                subset = frame[(column == level)]
                label = '%s = %s' % (Best_feature, level)
                node.child[level] = Node(subset, self.criterion,
                                         node.depth + 1, label)

        elif feature_type == 'Numeric':
            threshold = Finding_split_point(frame, Best_feature,
                                            self.criterion)
            below = (column < threshold)
            # Lower partition first, then its complement.
            branches = [('<', below), ('>=', ~below)]

            print('\t' * node.depth, 'parent: ', self.branch_name, '\tDepth: ',
                  node.depth, '\tsplit point: ', threshold)
            for symbol, mask in branches:
                subset = frame[mask]
                label = '%s %s %s' % (Best_feature, symbol, threshold)
                child = Node(subset, self.criterion, node.depth + 1, label)
                node.child['%s %.1f' % (symbol, threshold)] = child
        return
Beispiel #4
0
    def Find_Best_Feature(self, df):
        """
            Return the input feature whose split gives the lowest impurity.

            For every input column the weighted impurity of the labels after
            splitting on that column is computed; the column with the smallest
            (rounded) impurity is selected.

            [Parameter]:
                df : [pandas DataFrame][N x D] : training data. The last column
                     holds the output label, all other columns are input features.

            [Reads from self]:
                self.Category_feature_idx : positions (within the input columns)
                     of the features handled as categorical. Every other
                     position is handled as numeric.
                self.criterion : forwarded unchanged to impurity_func and
                     Finding_split_point (presumably selects entropy vs. gini
                     -- confirm against those helpers).

            [Variables] :
                feature_names : column names without the last (output) column.
                labels : label values of the last column.
                threshold : for a numeric feature, the value returned by
                     Finding_split_point; rows are divided into
                     (value < threshold) and (value >= threshold).
                scores : rounded impurity of every input feature, in column
                     order.

            [return]:
                Best_feature : (string) name of the selected column
                feature_type : (string) 'Category' or 'Numeric'
        """

        columns = df.columns.values
        feature_names = columns[:-1]
        labels = df[columns[-1]].values

        scores = []

        # for all features in DataFrame,
        for position, name in enumerate(feature_names):
            column = df[name].values
            n_rows = column.shape[0]
            weighted = 0

            if position in self.Category_feature_idx:
                # One partition per distinct value, weighted by its row count.
                levels, counts = np.unique(column, return_counts=True)
                for level, count in zip(levels, counts):
                    subset = labels[column == level]
                    weighted += impurity_func(subset,
                                              self.criterion) * count / n_rows
            else:
                # Binary split: rows below the threshold, then the rest.
                threshold = Finding_split_point(df, name, self.criterion)
                below = column < threshold
                for mask in (below, ~below):
                    weighted += impurity_func(
                        labels[mask], self.criterion) * mask.sum() / n_rows

            scores.append(np.round(weighted, 6))

        best = np.argmin(scores)
        if best in self.Category_feature_idx:
            kind = 'Category'
        else:
            kind = 'Numeric'

        return feature_names[best], kind