def Find_Best_Feature(self, df): """ This function retuns the feature that can split the data. in which, the impurity is the least. you should implement the part of calculating for impurity (entropy or gini_index) for given feature.변경하다 이 함수는 불순도가 가장 작게 데이터를 분할할 수 있는 feature를 반환하는 함수입니다. 여러분이 구현할 내용은 주어진 feature의 impurity(entropy or gini_index)를 계산하는 것입니다. [Parameter]: df : [dataFrame (got by pandas) ][N x D] : Training_data [Variables] : header : [list of string ] [1 x D] the set of attribute_name, last element is for output_feature. input_feature : [list of string] [1 x (D-1)] the set of attribute name except for last feature(output) Y_data : [column vector of label] [N x 1] label_data, To get the (vector or matrix) not dataFrame, you can write ' data = df.values ' self.Category_feature_idx : [list of integers]: the set of idx only for Category feature.(referred top of the code.) split_value : in numeric_data, You just divide the data two part. one is the less than split_value, the other is no less than split_value. [Objects]: (the part of implement) impurity : (float) entropy or gini_index got by splitting data given attribute. impurity_list [list of float] [1 x D] : the list which store the all impurity in order. [return]: Best_feature : (string) feature_name feature_type : (string) the type of feature ('Category' or 'Numeric') """ header = df.columns.values input_feature = header[:-1] output_feature = header[-1] Y_data = df[output_feature].values impurity_list = [] # for all features in DataFrame, for idx, h in enumerate(input_feature): # ============ Edit here ================== col_data = df[h] Y = df.values[:, -1] distinct_data = np.unique(col_data) # Category Feature Case if idx in self.Category_feature_idx: impurity = 0 for attr in distinct_data: attr_idx = (col_data == attr) y0 = Y[attr_idx] p0 = len(y0) / len(Y) impurity += p0 * impurity_func(y0, self.criterion) # Numeric Feature Case else: split_value = Finding_split_point(df, h, self.criterion) for val in distinct_data: less_idx = (col_data < split_value) y0 = Y[less_idx] y1 = Y[~less_idx] p0 = len(y0) / len(Y) p1 = len(y1) / len(Y) impurity = np.sum([ p0 * impurity_func(y0, self.criterion), p1 * impurity_func(y1, self.criterion) ]) #===================================================== impurity_list.append(np.round(impurity, 6)) idx = np.argmin(impurity_list) Best_feature = input_feature[idx] feature_type = idx in self.Category_feature_idx and 'Category' or 'Numeric' return Best_feature, feature_type
def Find_Best_Feature(self, df): """ This function retuns the feature that can split the data. in which, the impurity is the least. you should implement the part of calculating for impurity (entropy or gini_index) for given feature.변경하다 이 함수는 불순도가 가장 작게 데이터를 분할할 수 있는 feature를 반환하는 함수입니다. 여러분이 구현할 내용은 주어진 feature의 impurity(entropy or gini_index)를 계산하는 것입니다. [Parameter]: df : [dataFrame (got by pandas) ][N x D] : Training_data [Variables] : header : [list of string ] [1 x D] the set of attribute_name, last element is for output_feature. input_feature : [list of string] [1 x (D-1)] the set of attribute name except for last feature(output) Y_data : [column vector of label] [N x 1] label_data, To get the (vector or matrix) not dataFrame, you can write ' data = df.values ' self.Category_feature_idx : [list of integers]: the set of idx only for Category feature.(referred top of the code.) split_value : in numeric_data, You just divide the data two part. one is the less than split_value, the other is no less than split_value. [Objects]: (the part of implement) impurity : (float) entropy or gini_index got by splitting data given attribute. impurity_list [list of float] [1 x D] : the list which store the all impurity in order. [return]: Best_feature : (string) feature_name feature_type : (string) the type of feature ('Category' or 'Numeric') """ header = df.columns.values input_feature = header[:-1] output_feature = header[-1] Y_data = df[output_feature].values impurity_list = [] # for all features in DataFrame, for idx, h in enumerate(input_feature): # ============ Edit here ================== impurity = 0 # 이 위에 포문에서 피쳐(컬럼) 한줄 씩 가져옴. Humidty, temp, outlook 이런거야 # Category Feature Case if idx in self.Category_feature_idx: # 이건 카테고리 데이터일때, 예를 들어 humidity high랑 normal 뿐이니까 values = [] total_value_len = len(Y_data) # 총 데이터의 개수 for value in df[h].values: if value not in values: values.append(value) # 위에 3줄은 이제 그 특정 피쳐에서 값을 뽑아내는건데 df[h]에 이제 high, high, normal, high 이런거 들어있고 # df[h]가 판다스라 어케 뽑아내는지 몰라서 저렇게함. values에 이제 high, normal 이렇게 담기게 되는 거 for value in values: data = Y_data[ df[h].values == value] # 이제 values에서 하나 하나 뽑아서 Y_data 값을 가져오는거 예를들어, df[h] = (h, h, n ,h) Y_data = (yes, no, yes, yes) 이면 value가 h 일때 data 는 (yes, no, yes)가 되는거지 impurity += (len(data) / total_value_len) * impurity_func( data, self.criterion ) # impurity 구하는거는 이제 그 따로따로 임퓨리티 구해서 전체에서의 비율 곱하기 # 위 예제에서는 (3/4)*I(2, 1) + (1/4)*I(1,0) 이렇게 되는거 # Numeric Feature Case else: # 컨티뉴 일때는 split_value = Finding_split_point( df, h, self.criterion) # 이건 조교가 짜준코드 가장 적절한 스플릿 포인트를 찾아 less_idx = (df[h] < split_value) # 스플릿 포인트보다 작은거의 인덱스 y0 = Y_data[less_idx] # 스플릿 포인트보다 작은 놈들의 Y_data y1 = Y_data[~less_idx] # 이건 반대 p0 = len(y0) / len(Y_data) p1 = len(y1) / len(Y_data) impurity = np.sum([ p0 * impurity_func(y0, self.criterion), p1 * impurity_func(y1, self.criterion) ]) # 두개의 impurity합이 이제 impurity가 됨/ #===================================================== impurity_list.append(np.round(impurity, 6)) idx = np.argmin(impurity_list) Best_feature = input_feature[idx] feature_type = idx in self.Category_feature_idx and 'Category' or 'Numeric' return Best_feature, feature_type
def make_child(self, node, Best_feature, feature_type): ''' if attribute type is Category, For each attribute X, child_node is assigned to the data that has x-value. the child node is managed by dictionary. (dict key: 'feature_name = value')('outlook = sunny') attribute가 카테고리일 경우 각각의 attribute 값 x에 대해서 x를 value로 갖는 데이터가 자식노드에 할당됩니다. 자식노드는 dictionary에 의해 관리됩니다. (dict key: 'feature_name = value')('outlook = sunny') elif attribute type is Numeric Search for all value x which is the best split value. and the node has 2 child node. one is (data < x), the other is (data >= x). (dict key: 'feature_name < x' or 'feature_name >= x') ('Age' < 20) attribute가 수치값인 경우 모든 수치값 x에 대해서 data를 가장 잘 나누는 split value를 탐색 후 2개의 자식노드로 분할합니다. 하나는 (data < x) 다른 하나는 (data >= x). (dict key: 'feature_name < x' or 'feature_name >= x') ('Age' < 20) [Parameter] node : current Node (class object) Best_feature: 'the feature that can makes the impurity the least' (string) feature type: 'Category' or 'Numeric' (string) [return] nothing. ''' df = node.df col_data = df[Best_feature] node.split_feature = Best_feature print('\nBest_Feature: ', Best_feature) if feature_type == 'Category': distinct_data = np.unique(col_data) for i, d in enumerate(distinct_data): if type(d) == np.float64 or type(d) == np.float32: d = int(d) print('\t' * node.depth, 'parent: ', self.branch_name, '\tDepth:', node.depth, '\t', i, 'th branch: ', d) child_df = df[(col_data == d)] child_node = Node(child_df, self.criterion, node.depth + 1, '%s = %s' % (Best_feature, d)) node.child[d] = child_node elif feature_type == 'Numeric': split_value = Finding_split_point(df, Best_feature, self.criterion) less_idx = (col_data < split_value) idx_set = [less_idx, ~less_idx] print('\t' * node.depth, 'parent: ', self.branch_name, '\tDepth: ', node.depth, '\tsplit point: ', split_value) for i, idx in enumerate(idx_set): child_df = df[idx] inequal_symbol = (i == 0) and '<' or '>=' child_node = Node( child_df, self.criterion, node.depth + 1, '%s %s %s' % (Best_feature, inequal_symbol, split_value)) node.child['%s %.1f' % (inequal_symbol, split_value)] = child_node return
def Find_Best_Feature(self, df): """ This function retuns the feature that can split the data. in which, the impurity is the least. you should implement the part of calculating for impurity (entropy or gini_index) for given feature.변경하다 이 함수는 불순도가 가장 작게 데이터를 분할할 수 있는 feature를 반환하는 함수입니다. 여러분이 구현할 내용은 주어진 feature의 impurity(entropy or gini_index)를 계산하는 것입니다. [Parameter]: df : [dataFrame (got by pandas) ][N x D] : Training_data [Variables] : header : [list of string ] [1 x D] the set of attribute_name, last element is for output_feature. input_feature : [list of string] [1 x (D-1)] the set of attribute name except for last feature(output) Y_data : [column vector of label] [N x 1] label_data, To get the (vector or matrix) not dataFrame, you can write ' data = df.values ' self.Category_feature_idx : [list of integers]: the set of idx only for Category feature.(referred top of the code.) split_value : in numeric_data, You just divide the data two part. one is the less than split_value, the other is no less than split_value. [Objects]: (the part of implement) impurity : (float) entropy or gini_index got by splitting data given attribute. impurity_list [list of float] [1 x D] : the list which store the all impurity in order. [return]: Best_feature : (string) feature_name feature_type : (string) the type of feature ('Category' or 'Numeric') """ header = df.columns.values input_feature = header[:-1] output_feature = header[-1] Y_data = df[output_feature].values impurity_list = [] # for all features in DataFrame, for idx, h in enumerate(input_feature): # ============ Edit here ================== num = df[h].values.shape[0] impurity = 0 if idx in self.Category_feature_idx: for key, cnt in zip( *np.unique(df[h].values, return_counts=True)): impurity += impurity_func(Y_data[df[h].values == key], self.criterion) * cnt / num else: split_value = Finding_split_point(df, h, self.criterion) split = df[h].values < split_value num_T, num_F = split.sum(), (~split).sum() impurity += impurity_func(Y_data[split], self.criterion) * num_T / num impurity += impurity_func(Y_data[~split], self.criterion) * num_F / num #===================================================== impurity_list.append(np.round(impurity, 6)) idx = np.argmin(impurity_list) Best_feature = input_feature[idx] feature_type = idx in self.Category_feature_idx and 'Category' or 'Numeric' return Best_feature, feature_type