def __init__(self, dataFrame, criterion, depth, split_feature=''):
    """Initialise the object from a data subset and record its impurity.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data held by this object; its last column is passed to
        ``impurity_func`` (presumably the label column -- confirm with
        the caller).
    criterion :
        Forwarded unchanged to ``impurity_func``; it is not stored here.
    depth :
        Stored as ``self.depth``.
    split_feature : str, optional
        Stored as ``self.split_feature``; defaults to the empty string.
    """
    self.df, self.depth, self.split_feature = dataFrame, depth, split_feature

    # Impurity of the last column, kept to 6 decimal places.
    last_column = dataFrame.values[:, -1]
    self.impurity = round(impurity_func(last_column, criterion), 6)

    # Starts with no children, not marked as a leaf, and without a label.
    self.child, self.is_leaf, self.label = {}, False, None
def Find_Best_Feature(self, df):
    """Return the input feature whose split yields the lowest impurity.

    Every column of ``df`` except the last is an input feature; the last
    column holds the labels.  Each feature is scored by the weighted sum
    of the impurities of the partitions it induces, where a partition's
    weight is its share of the rows:

    * Category feature (its positional index is listed in
      ``self.Category_feature_idx``): one partition per distinct value.
    * Any other feature: two partitions around the threshold returned by
      ``Finding_split_point`` -- rows strictly below it, and the rest.

    Scores are rounded to 6 decimals before comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data of shape (N, D); the last column is the output.

    Returns
    -------
    Best_feature : str
        Name of the feature with the smallest score.  ``np.argmin``
        returns the first minimum, so ties go to the earliest column.
    feature_type : str
        ``'Category'`` or ``'Numeric'``.
    """
    header = df.columns.values
    input_feature = header[:-1]
    output_feature = header[-1]
    # Extract the labels once; building ``df.values`` per feature would
    # re-materialise the whole frame on every iteration.
    Y_data = df[output_feature].values
    n_rows = len(Y_data)

    impurity_list = []
    for idx, h in enumerate(input_feature):
        col_data = df[h].values

        if idx in self.Category_feature_idx:
            # Category feature: one partition per distinct value.
            impurity = 0
            for attr in np.unique(col_data):
                y0 = Y_data[col_data == attr]
                p0 = len(y0) / n_rows
                impurity += p0 * impurity_func(y0, self.criterion)
        else:
            # Numeric feature: a single binary split.  The threshold is
            # fixed for the feature, so the score is computed exactly once
            # rather than once per distinct value.
            split_value = Finding_split_point(df, h, self.criterion)
            less_idx = col_data < split_value
            y0 = Y_data[less_idx]
            y1 = Y_data[~less_idx]
            p0 = len(y0) / n_rows
            p1 = len(y1) / n_rows
            impurity = (p0 * impurity_func(y0, self.criterion)
                        + p1 * impurity_func(y1, self.criterion))

        impurity_list.append(np.round(impurity, 6))

    best_idx = np.argmin(impurity_list)
    Best_feature = input_feature[best_idx]
    if best_idx in self.Category_feature_idx:
        feature_type = 'Category'
    else:
        feature_type = 'Numeric'
    return Best_feature, feature_type
def Find_Best_Feature(self, df):
    """Return the input feature whose split yields the lowest impurity.

    Every column of ``df`` except the last is an input feature; the last
    column holds the labels.  For each feature the rows are partitioned
    and the feature's score is the sum over partitions of
    ``(partition size / N) * impurity_func(partition labels, criterion)``,
    rounded to 6 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data of shape (N, D); the last column is the output.

    Returns
    -------
    Best_feature : str
        Name of the feature with the smallest score.  ``np.argmin``
        returns the first minimum, so ties go to the earliest column.
    feature_type : str
        ``'Category'`` if the winning feature's positional index is in
        ``self.Category_feature_idx``, otherwise ``'Numeric'``.
    """
    header = df.columns.values
    input_feature = header[:-1]
    output_feature = header[-1]
    Y_data = df[output_feature].values
    total_value_len = len(Y_data)  # total number of rows

    impurity_list = []
    # Each iteration handles one feature column (e.g. humidity, temp,
    # outlook).
    for idx, h in enumerate(input_feature):
        column = df[h].values  # extracted once per feature

        if idx in self.Category_feature_idx:
            # Category feature, e.g. humidity taking only 'high'/'normal'.
            # dict.fromkeys de-duplicates in a single pass while keeping
            # first-appearance order, so the summation order is the same
            # as scanning the column and collecting unseen values.
            # Example: column = (h, h, n, h), Y_data = (yes, no, yes, yes)
            #   -> the 'h' partition is (yes, no, yes), and the score is
            #      (3/4) * I(2, 1) + (1/4) * I(1, 0).
            partitions = [Y_data[column == value]
                          for value in dict.fromkeys(column)]
        else:
            # Numeric feature: split in two around the threshold chosen by
            # Finding_split_point -- rows strictly below it, and the rest.
            split_value = Finding_split_point(df, h, self.criterion)
            less_idx = column < split_value
            partitions = [Y_data[less_idx], Y_data[~less_idx]]

        # Weighted impurity: each partition's impurity times its share of
        # the rows.
        impurity = sum(
            (len(data) / total_value_len)
            * impurity_func(data, self.criterion)
            for data in partitions
        )
        impurity_list.append(np.round(impurity, 6))

    best_idx = np.argmin(impurity_list)
    Best_feature = input_feature[best_idx]
    if best_idx in self.Category_feature_idx:
        feature_type = 'Category'
    else:
        feature_type = 'Numeric'
    return Best_feature, feature_type
def Find_Best_Feature(self, df):
    """Pick the input column that gives the smallest weighted impurity.

    All columns of ``df`` but the last are candidate features; the last
    column supplies the labels.  A candidate whose positional index is in
    ``self.Category_feature_idx`` is partitioned by distinct value; any
    other candidate is cut in two at the threshold from
    ``Finding_split_point`` (strictly below versus the rest).  The score
    of a candidate is the sum of ``impurity_func`` over its partitions,
    each weighted by partition size over the row count, rounded to 6
    decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data of shape (N, D); the last column is the output.

    Returns
    -------
    Best_feature : str
        Name of the lowest-scoring column.
    feature_type : str
        ``'Category'`` or ``'Numeric'``.
    """
    columns = df.columns.values
    feature_names = columns[:-1]
    labels = df[columns[-1]].values

    scores = []
    for position, name in enumerate(feature_names):
        feature_values = df[name].values
        total = feature_values.shape[0]
        score = 0

        if position not in self.Category_feature_idx:
            # Numeric column: one binary cut at the chosen threshold.
            threshold = Finding_split_point(df, name, self.criterion)
            below = feature_values < threshold
            for side in (below, ~below):
                score += (impurity_func(labels[side], self.criterion)
                          * side.sum() / total)
        else:
            # Category column: one partition per distinct value, visited
            # in the sorted order that np.unique returns.
            levels, counts = np.unique(feature_values, return_counts=True)
            for level, count in zip(levels, counts):
                members = labels[feature_values == level]
                score += (impurity_func(members, self.criterion)
                          * count / total)

        scores.append(np.round(score, 6))

    winner = np.argmin(scores)
    if winner in self.Category_feature_idx:
        kind = 'Category'
    else:
        kind = 'Numeric'
    return feature_names[winner], kind