Exemple #1
0
    def __init__(self, dataFrame, criterion, depth, split_feature=''):
        """Create a tree node for the rows in ``dataFrame``.

        The last column of ``dataFrame`` is treated as the label column; its
        impurity under ``criterion`` is computed with ``impurity_func`` and
        stored rounded to 6 decimal places.

        [Parameter]:
            dataFrame     : data held by this node (last column = labels).
            criterion     : forwarded unchanged to ``impurity_func``.
            depth         : depth value stored on the node.
            split_feature : name stored as the node's split feature
                            (defaults to an empty string).
        """
        # Impurity of the label column (last column of the underlying array).
        labels = dataFrame.values[:, -1]
        node_impurity = impurity_func(labels, criterion)

        self.df = dataFrame
        self.depth = depth
        self.split_feature = split_feature
        self.impurity = round(node_impurity, 6)

        # A fresh node has no children, is not a leaf, and has no label yet.
        self.child = {}
        self.is_leaf = False
        self.label = None
Exemple #2
0
    def Find_Best_Feature(self, df):
        """
            Return the input feature whose split gives the lowest impurity.

            Every column of ``df`` except the last one is scored by the
            weighted impurity of the label subsets it produces:

              * Category feature (its positional index is in
                ``self.Category_feature_idx``): one subset per distinct value.
              * Numeric feature: two subsets, rows with value < split_value
                and rows with value >= split_value, where split_value comes
                from ``Finding_split_point``.

            Each subset's impurity (``impurity_func`` with ``self.criterion``)
            is weighted by the fraction of rows it holds. Scores are rounded
            to 6 decimals and the first feature with the minimum score wins.

            [Parameter]:
                df : [pandas DataFrame] training data; the last column is the
                     output (label) feature.

            [return]:
                Best_feature : feature name taken from ``df.columns``
                feature_type : (string) 'Category' or 'Numeric'
        """

        header = df.columns.values
        input_feature = header[:-1]

        # The labels do not depend on the feature being scored, so extract
        # them once instead of rebuilding ``df.values`` for every column.
        Y = df.values[:, -1]
        n_samples = len(Y)

        impurity_list = []
        # for all features in DataFrame,
        for idx, h in enumerate(input_feature):
            col_data = df[h]

            # Category Feature Case
            if idx in self.Category_feature_idx:
                impurity = 0
                for attr in np.unique(col_data):
                    y0 = Y[col_data == attr]
                    p0 = len(y0) / n_samples
                    impurity += p0 * impurity_func(y0, self.criterion)

            # Numeric Feature Case
            else:
                # The split point is fixed for the feature, so the two-way
                # split is scored exactly once (no loop over distinct values).
                split_value = Finding_split_point(df, h, self.criterion)
                less_idx = (col_data < split_value)

                y0 = Y[less_idx]
                y1 = Y[~less_idx]

                p0 = len(y0) / n_samples
                p1 = len(y1) / n_samples
                impurity = (p0 * impurity_func(y0, self.criterion)
                            + p1 * impurity_func(y1, self.criterion))

            impurity_list.append(np.round(impurity, 6))

        idx = np.argmin(impurity_list)
        Best_feature = input_feature[idx]
        feature_type = ('Category' if idx in self.Category_feature_idx
                        else 'Numeric')

        return Best_feature, feature_type
Exemple #3
0
    def Find_Best_Feature(self, df):
        """
            Return the input feature whose split gives the lowest impurity.

            Every column of ``df`` except the last one is scored by the
            weighted impurity of the label subsets it produces:

              * Category feature (its positional index is in
                ``self.Category_feature_idx``): one subset per distinct value,
                e.g. a column (h, h, n, h) with labels (yes, no, yes, yes)
                scores (3/4) * I(yes, no, yes) + (1/4) * I(yes).
              * Numeric feature: two subsets, rows with value < split_value
                and rows with value >= split_value, where split_value comes
                from ``Finding_split_point``.

            Each subset's impurity (``impurity_func`` with ``self.criterion``)
            is weighted by the fraction of rows it holds. Scores are rounded
            to 6 decimals and the first feature with the minimum score wins.

            [Parameter]:
                df : [pandas DataFrame] training data; the last column is the
                     output (label) feature.

            [return]:
                Best_feature : feature name taken from ``df.columns``
                feature_type : (string) 'Category' or 'Numeric'
        """

        header = df.columns.values
        input_feature = header[:-1]
        output_feature = header[-1]
        Y_data = df[output_feature].values
        total_value_len = len(Y_data)  # total number of rows

        impurity_list = []

        # for all features in DataFrame,
        for idx, h in enumerate(input_feature):
            impurity = 0
            # Category Feature Case
            if idx in self.Category_feature_idx:
                column = df[h].values
                # Distinct values in first-appearance order; dict.fromkeys
                # de-duplicates in one pass instead of a list membership
                # test per row.
                for value in dict.fromkeys(column):
                    # Labels of the rows whose feature equals ``value``.
                    data = Y_data[column == value]
                    # Subset impurity weighted by the subset's share of rows.
                    impurity += (len(data) / total_value_len) * impurity_func(
                        data, self.criterion)

            # Numeric Feature Case
            else:
                split_value = Finding_split_point(df, h, self.criterion)

                # Rows strictly below the split point.
                less_idx = (df[h] < split_value)

                y0 = Y_data[less_idx]   # labels below the split point
                y1 = Y_data[~less_idx]  # labels at or above the split point

                p0 = len(y0) / total_value_len
                p1 = len(y1) / total_value_len

                # Weighted sum of the two subsets' impurities.
                impurity = (p0 * impurity_func(y0, self.criterion)
                            + p1 * impurity_func(y1, self.criterion))

            impurity_list.append(np.round(impurity, 6))

        idx = np.argmin(impurity_list)
        Best_feature = input_feature[idx]
        feature_type = ('Category' if idx in self.Category_feature_idx
                        else 'Numeric')

        return Best_feature, feature_type
Exemple #4
0
    def Find_Best_Feature(self, df):
        """
            Return the input feature whose split gives the lowest impurity.

            Every column of ``df`` except the last one is scored by the
            weighted impurity of the label subsets it produces:

              * Category feature (its positional index is in
                ``self.Category_feature_idx``): one subset per distinct value.
              * Numeric feature: two subsets, rows with value < split_value
                and the remaining rows, where split_value comes from
                ``Finding_split_point``.

            Each subset's impurity (``impurity_func`` with ``self.criterion``)
            is weighted by the fraction of rows it holds. Scores are rounded
            to 6 decimals and the first feature with the minimum score wins.

            [Parameter]:
                df : [pandas DataFrame] training data; the last column is the
                     output (label) feature.

            [return]:
                Best_feature : feature name taken from ``df.columns``
                feature_type : (string) 'Category' or 'Numeric'
        """

        header = df.columns.values
        input_feature = header[:-1]
        output_feature = header[-1]
        Y_data = df[output_feature].values

        impurity_list = []

        # for all features in DataFrame,
        for idx, h in enumerate(input_feature):
            column = df[h].values
            num = column.shape[0]

            if idx in self.Category_feature_idx:
                # One label subset per distinct value, weighted by its count.
                keys, counts = np.unique(column, return_counts=True)
                impurity = sum(
                    impurity_func(Y_data[column == key], self.criterion)
                    * cnt / num
                    for key, cnt in zip(keys, counts))
            else:
                # Two label subsets: below the split point, and the rest.
                split_value = Finding_split_point(df, h, self.criterion)
                below = column < split_value
                num_T = below.sum()
                num_F = (~below).sum()
                impurity = 0
                impurity += impurity_func(
                    Y_data[below], self.criterion) * num_T / num
                impurity += impurity_func(
                    Y_data[~below], self.criterion) * num_F / num

            impurity_list.append(np.round(impurity, 6))

        idx = np.argmin(impurity_list)
        Best_feature = input_feature[idx]
        if idx in self.Category_feature_idx:
            feature_type = 'Category'
        else:
            feature_type = 'Numeric'

        return Best_feature, feature_type