def __fit_without_prune(self, data, features, target):
    '''Build the entire decision tree (no pruning) for `data`.

    Splits `features` into continuous vs. discrete columns; if no
    continuous feature remains, delegates the whole subtree to plain ID3.
    Otherwise picks the best split (threshold gain for continuous
    features, info gain for discrete ones) and recurses.

    Parameters:
        data: pandas.DataFrame holding the training rows, including the
            target column.
        features: list of candidate attribute (column) names.
        target: name of the target column in `data`.

    Returns:
        Tree rooted at the node chosen for this subset of `data`.
    '''
    # Partition candidate attributes by type, skipping empty columns.
    continuous_features = list()
    discrete_features = list()
    for feature in features:
        if len(list(data[feature])) > 0:
            is_continue = self.is_attr_continue(list(data[feature]))
            if is_continue:
                continuous_features.append(feature)
            else:
                discrete_features.append(feature)
    # No continuous attribute left: plain ID3 handles the rest.
    if not continuous_features:
        return MyID3(self.gain_ratio).fit(data, features, target)
    # Continuous attribute
    # If only one value exist
    entropy_data_target = Calculate.entropy(data[target])
    if entropy_data_target == 0:
        # Pure subset: emit a leaf carrying the class distribution.
        value_list = Calculate.get_unique_data(data, target)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        return Tree(
            Node(
                None,
                0.0,  # Entropy must be 0 since only one value exist
                value_dict,
                result=data[target][0],
                is_leaf=True))
    if (len(features) == 0):
        # No attribute left to split on: majority-label leaf.
        value_list = Calculate.get_unique_data(data, target)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        return Tree(
            Node(None,
                 entropy_data_target,
                 value_dict,
                 result=Calculate.most_label(data[target]),
                 is_leaf=True))
    # Find best attribute and build tree recursively
    best_attr = ''
    best_point = 0
    is_discrete = False
    best_splitter = 0
    # chosen_edge[0]/[1] are the labels for the "> threshold" and
    # "<= threshold" branches of a continuous split.
    chosen_edge = list(['', ''])
    for feature in continuous_features:
        best_treshold = self.find_threshold(data[[feature]], data[[target]])
        # best_treshold is (threshold_value, score) — score compared
        # against discrete info gain below on the same scale.
        if best_treshold[1] > best_point:
            best_attr = str(feature)
            chosen_edge[0] = best_attr + ' > ' + str(best_treshold[0])
            chosen_edge[1] = best_attr + ' <= ' + str(best_treshold[0])
            best_point = best_treshold[1]
            best_splitter = best_treshold[0]
    for feature in discrete_features:
        point = Calculate.info_gain(data[feature], data[target])
        if point > best_point:
            best_point = point
            best_attr = str(feature)
            is_discrete = True
    value_list = Calculate.get_unique_data(data, target)
    value_dict = dict()
    for key, value in value_list.items():
        value_dict[key] = len(value_list[key])
    dtree = Tree(Node(best_attr, best_point, value_dict))
    # Scan all posible value to be generated subtree
    if is_discrete:
        list_attribute = Calculate.get_unique_data(data, best_attr)
    else:
        list_attribute = Calculate.split_by_threshold(
            data, best_attr, best_splitter)
    i = 0
    # NOTE(review): `features` is passed unchanged into the recursion —
    # the chosen discrete attribute is not removed/dropped, unlike the
    # ID3 `fit` path; presumably safe because its gain drops to 0 in
    # pure branches, but verify it cannot recurse indefinitely.
    # NOTE(review): the continuous branch assumes split_by_threshold
    # yields exactly two groups in ">" then "<=" order (matching
    # chosen_edge); confirm against Calculate.split_by_threshold.
    for attribute in list_attribute:
        # Rebuild the subset with a fresh 0-based index so downstream
        # positional access (e.g. data[target][0]) works.
        data = pd.DataFrame(data=list_attribute[attribute]).reset_index(
            drop=True)
        dtree.add_child(self.__fit_without_prune(data, features, target))
        if is_discrete:
            dtree.children[i].value.edge = attribute
        else:
            dtree.children[i].value.edge = chosen_edge[i]
        i += 1
    return dtree
def fit(self, data, attributes, target_name):
    '''Build and return a decision tree for `data` using the ID3 algorithm.

    Parameters:
        data: pandas.DataFrame of training rows, including the target
            column.
        attributes: list of candidate attribute (column) names still
            available for splitting. This list is NOT mutated.
        target_name: name of the target column in `data`.

    Returns:
        Tree rooted at the node chosen for this subset of `data`.
    '''
    data_target = data[target_name]
    entropy_data_target = Calculate.entropy(data_target)

    def leaf(result):
        # Build a leaf Tree carrying this subset's class distribution.
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = {key: len(value_list[key]) for key in value_list}
        return Tree(
            Node(None, entropy_data_target, value_dict,
                 result=result, is_leaf=True))

    # Base case 1: every row carries the same label -> pure leaf
    # (entropy 0, result is that single label).
    if entropy_data_target == 0:
        return leaf(data_target[0])

    # Base case 2: no attributes left -> majority-label leaf.
    if len(attributes) == 0:
        return leaf(Calculate.most_label(data_target))

    # Pick the attribute with the highest score; the scoring function
    # (info gain vs. gain ratio) was fixed at construction time.
    best_attr = ''
    best_point = 0
    for attr in attributes:
        if self.gain_ratio:
            point = Calculate.gain_ratio(data[attr], data_target)
        else:
            point = Calculate.info_gain(data[attr], data_target)
        if point > best_point:
            best_point = point
            best_attr = attr

    # Robustness: if no attribute yields positive gain, best_attr stays
    # '' and splitting cannot help -- return a majority leaf instead of
    # crashing on the list lookup / column drop below.
    if not best_attr:
        return leaf(Calculate.most_label(data_target))

    value_list = Calculate.get_unique_data(data, target_name)
    value_dict = {key: len(value_list[key]) for key in value_list}
    dtree = Tree(Node(best_attr, best_point, value_dict))

    # BUGFIX: build a copy without the chosen attribute instead of
    # `attributes.remove(best_attr)`. The in-place removal mutated the
    # caller's list AND was shared across sibling recursive calls, so
    # every branch after the first was starved of attributes its
    # siblings had consumed deeper in the tree.
    remaining = [attr for attr in attributes if attr != best_attr]

    # Grow one child subtree per observed value of the chosen attribute.
    branches = Calculate.get_unique_data(data, best_attr)
    for i, attribute in enumerate(branches):
        subset = pd.DataFrame(
            data=branches[attribute]).reset_index(drop=True)
        # Drop the used column so the recursion never re-splits on it.
        subset.drop(best_attr, axis=1, inplace=True)
        dtree.add_child(self.fit(subset, remaining, target_name))
        dtree.children[i].value.edge = attribute
    return dtree