import numpy as np

# Assumes the surrounding module provides Tree (the node representation)
# and max_ig (returns the feature index with the highest information gain).


def id3(x, y, feature_list, impurity_measure="entropy", parent_node=None):
    """
    Creates a decision tree, with the ID3 algorithm, based on learning data

    :param x: dataset of items as list of lists
    :param y: target values for items as list
    :param feature_list: available features for each iteration as list
    :param impurity_measure: impurity measure, "entropy" or "gini", as string
    :param parent_node: parent for this node iteration as Tree (node representation), default None
    :return: root node for tree as type Tree (node representation)
    """
    x = np.array(x)
    uniques, counts = np.unique(y, return_counts=True)  # Find unique target values
    if len(uniques) == 1:  # Set is pure, only one target value left in set
        return Tree(uniques[0], label=uniques[0])
    if len(feature_list) == 0:  # No more attributes to split on: majority vote
        majority = uniques[np.argmax(counts)]  # Works for any number of classes, not just two
        return Tree(majority, label=majority)

    best_attribute = max_ig(x, y, feature_list, impurity_measure)
    vals, counts = np.unique(x[:, best_attribute], return_counts=True)  # Unique attribute values
    this_node = Tree(best_attribute, parent_node)
    n_children = len(vals)
    # Copy before removing, so sibling branches and the caller keep their list intact
    feature_list = [f for f in feature_list if f != best_attribute]
    children_labels = []
    for child in range(n_children):  # Creates one child per unique attribute value
        x_for_this_child = []
        y_for_this_child = []
        for item in range(len(x)):  # Pass on every item whose attribute value matches this branch
            if x[item][best_attribute] == vals[child]:
                x_for_this_child.append(x[item])
                y_for_this_child.append(y[item])
        # Continue to grow this child node
        new_child = id3(x_for_this_child, y_for_this_child, feature_list,
                        impurity_measure, parent_node=this_node)
        new_child.set_was_split_on(vals[child])  # Save which attribute value this child was split on
        this_node.add_child(new_child)
        children_labels.append(new_child.label)
    label_list, label_count = np.unique(children_labels, return_counts=True)  # Count children labels
    label = label_list[np.argmax(label_count)]
    this_node.set_label(label)  # This node gets the most common label among its children
    return this_node
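# --- Usage sketch (illustration; not part of the original module) ---
# A minimal example of how id3 above might be called on a toy binary
# dataset, with features given as column indices. Tree and max_ig are
# assumed to come from the surrounding module, so the calls are left
# commented out here.
#
# x_toy = [[0, 0], [0, 1], [1, 0], [1, 1]]  # rows: [outlook, windy]
# y_toy = [0, 0, 1, 1]                      # target: play encoded 0/1
# root = id3(x_toy, y_toy, feature_list=[0, 1], impurity_measure="entropy")
# print(root.label)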
def __fit_without_prune(self, data, features, target):
    '''
    Build the entire decision tree without pruning
    '''
    continuous_features = list()
    discrete_features = list()
    for feature in features:
        if len(list(data[feature])) > 0:
            is_continue = self.is_attr_continue(list(data[feature]))
            if is_continue:
                continuous_features.append(feature)
            else:
                discrete_features.append(feature)

    if not continuous_features:
        return MyID3(self.gain_ratio).fit(data, features, target)

    # Continuous attribute
    # If only one target value exists, return a leaf
    entropy_data_target = Calculate.entropy(data[target])
    if entropy_data_target == 0:
        value_list = Calculate.get_unique_data(data, target)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        return Tree(
            Node(
                None,
                0.0,  # Entropy must be 0 since only one value exists
                value_dict,
                result=data[target][0],
                is_leaf=True))

    # No attributes left to split on: return a leaf with the majority label
    if len(features) == 0:
        value_list = Calculate.get_unique_data(data, target)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        return Tree(
            Node(None,
                 entropy_data_target,
                 value_dict,
                 result=Calculate.most_label(data[target]),
                 is_leaf=True))

    # Find the best attribute and build the tree recursively
    best_attr = ''
    best_point = 0
    is_discrete = False
    best_splitter = 0
    chosen_edge = ['', '']
    for feature in continuous_features:
        best_threshold = self.find_threshold(data[[feature]], data[[target]])
        if best_threshold[1] > best_point:
            best_attr = str(feature)
            chosen_edge[0] = best_attr + ' > ' + str(best_threshold[0])
            chosen_edge[1] = best_attr + ' <= ' + str(best_threshold[0])
            best_point = best_threshold[1]
            best_splitter = best_threshold[0]
    for feature in discrete_features:
        point = Calculate.info_gain(data[feature], data[target])
        if point > best_point:
            best_point = point
            best_attr = str(feature)
            is_discrete = True

    value_list = Calculate.get_unique_data(data, target)
    value_dict = dict()
    for key, value in value_list.items():
        value_dict[key] = len(value_list[key])
    dtree = Tree(Node(best_attr, best_point, value_dict))

    # Scan all possible values to generate subtrees
    if is_discrete:
        list_attribute = Calculate.get_unique_data(data, best_attr)
    else:
        list_attribute = Calculate.split_by_threshold(
            data, best_attr, best_splitter)
    i = 0
    for attribute in list_attribute:
        # Use a separate name for the subset instead of shadowing data
        subset = pd.DataFrame(data=list_attribute[attribute]).reset_index(
            drop=True)
        dtree.add_child(self.__fit_without_prune(subset, features, target))
        if is_discrete:
            dtree.children[i].value.edge = attribute
        else:
            dtree.children[i].value.edge = chosen_edge[i]
        i += 1
    return dtree
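# --- Worked example (illustration; not part of the original module) ---
# A self-contained sketch of the impurity math the methods above rely on:
# Shannon entropy and information gain, which Calculate.entropy and
# Calculate.info_gain are assumed to implement. Runs with numpy alone.
import numpy as np


def entropy_sketch(labels):
    """Shannon entropy of a label sequence, in bits."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())


def info_gain_sketch(feature_values, labels):
    """Entropy of labels minus the weighted entropy after splitting on feature_values."""
    labels = np.asarray(labels)
    feature_values = np.asarray(feature_values)
    total = entropy_sketch(labels)
    values, counts = np.unique(feature_values, return_counts=True)
    weighted = sum(
        (c / len(labels)) * entropy_sketch(labels[feature_values == v])
        for v, c in zip(values, counts)
    )
    return total - weighted


# A pure set has zero entropy; a perfectly informative feature
# recovers the full entropy of the target as gain.
assert entropy_sketch([1, 1, 1]) == 0.0
print(info_gain_sketch([0, 0, 1, 1], [0, 0, 1, 1]))  # 1.0 bit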
def fit(self, data, attributes, target_name):
    '''
    Build and return a decision tree using the ID3 algorithm
    '''
    data_target = data[target_name]

    # Data target contains only one label: return a leaf
    entropy_data_target = Calculate.entropy(data_target)
    if entropy_data_target == 0:
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        # Set current_node, info_gain, values
        tree = Tree(
            Node(None,
                 entropy_data_target,
                 value_dict,
                 result=data_target[0],
                 is_leaf=True))
        return tree

    # No attribute left to choose: return a leaf with the majority label
    if len(attributes) == 0:
        # Set current_node, info_gain, values
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        tree = Tree(
            Node(None,
                 entropy_data_target,
                 value_dict,
                 result=Calculate.most_label(data_target),
                 is_leaf=True))
        return tree
    else:
        # Find the best attribute to split on, using info gain or gain ratio
        best_attr = ''
        best_point = 0  # Either info gain or gain ratio
        for attr in attributes:
            if self.gain_ratio:
                point = Calculate.gain_ratio(data[attr], data_target)
            else:
                point = Calculate.info_gain(data[attr], data_target)
            if point > best_point:
                best_point = point
                best_attr = attr

        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])

        # Build the decision tree recursively
        dtree = Tree(Node(best_attr, best_point, value_dict))

        # Remove the used attribute from a copy, so sibling branches
        # (and the caller's list) are unaffected
        remaining_attributes = [a for a in attributes if a != best_attr]

        # Scan all possible values to generate subtrees
        list_attribute = Calculate.get_unique_data(data, best_attr)
        i = 0
        for attribute in list_attribute:
            subset = pd.DataFrame(
                data=list_attribute[attribute]).reset_index(drop=True)
            subset.drop(best_attr, axis=1, inplace=True)
            dtree.add_child(self.fit(subset, remaining_attributes, target_name))
            dtree.children[i].value.edge = attribute
            i += 1
        return dtree
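# --- Usage sketch (illustration; names partly assumed) ---
# The continuous-attribute wrapper above constructs MyID3(self.gain_ratio)
# and calls fit(data, features, target), so fit is assumed to be a method
# of MyID3. Fitting on a small pandas frame might look like this; the
# model calls stay commented because MyID3 and Calculate live elsewhere.
import pandas as pd

weather = pd.DataFrame({
    "outlook": ["sunny", "sunny", "rain", "rain"],
    "windy":   ["false", "true", "false", "true"],
    "play":    ["no", "no", "yes", "no"],
})
# model = MyID3(False)  # False: rank splits by info gain rather than gain ratio
# tree = model.fit(weather, ["outlook", "windy"], "play")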