def prepare_data(filename, n_attr):
    """Load a comma-delimited file into a ``DataSet`` and preprocess it.

    The first row of the file is removed from the data rows and used as the
    list of attribute names. The last attribute name is taken as the
    classifier, and its column index is recorded on the data set.

    Args:
        filename: Path of the CSV file to read.
        n_attr: Number of entries to create in ``attribute_types``.

    Returns:
        The populated ``DataSet`` after ``preprocessing()`` has been called.

    Raises:
        IndexError: If the file contains no rows, or its header row is empty.
    """
    data_set = DataSet("")

    # Load data set. newline="" leaves newline handling to the csv module, so
    # quoted fields that contain line breaks are parsed as a single field.
    with open(filename, newline="") as f:
        data_set.rows = list(csv.reader(f, delimiter=","))

    # The header row becomes the attribute names.
    data_set.attributes = data_set.rows.pop(0)

    # This is used to generalize the code for other datasets:
    # 'true' indicates numeric data, 'false' indicates nominal data, e.g.
    # ['false', 'true', 'false', 'false', 'true', 'true', 'false'].
    # Every attribute is marked nominal here.
    data_set.attribute_types = ["false"] * n_attr

    # The classifier is the last column; record its name and its index.
    data_set.classifier = data_set.attributes[-1]
    data_set.class_col_index = data_set.attributes.index(data_set.classifier)

    # Preprocess the data set before handing it back.
    data_set.preprocessing()
    return data_set
def compute_decision_tree(self, dataset, parent_node):
    """Recursively build a binary decision (sub)tree for ``dataset``.

    A node becomes a leaf when every row is positive (classification 1),
    when no row is positive (classification 0), when the node is deeper
    than the height limit, or when no candidate split yields more than the
    minimum information gain. In the last two cases the classification is
    delegated to ``self.classify_leaf(dataset)``.

    Otherwise the attribute/value pair with the highest information gain is
    stored on the node, rows whose value is ``>=`` the split value go to the
    left child and the remaining rows go to the right child.

    Args:
        dataset: The ``DataSet`` holding the rows for this subtree.
        parent_node: The parent ``DecisionTreeNode``, or ``None`` for the root.

    Returns:
        The ``DecisionTreeNode`` that roots the subtree built from ``dataset``.
    """
    # A split must gain strictly more than this to be accepted.
    minimum_info_gain = 0.01
    # Nodes deeper than this are always turned into leaves.
    maximum_height = 20
    # Attributes with more distinct values than this are probed at deciles only.
    max_candidate_values = 100

    node = DecisionTreeNode(parent_node)
    node.height = 0 if parent_node is None else node.parent.height + 1

    # count_positives() counts the rows with classification "1".
    ones = dataset.count_positives()
    if ones == len(dataset.rows):
        # Every row is positive. NOTE(review): an empty dataset also ends up
        # here (0 == 0) and is labelled 1 -- confirm that is intended.
        node.classification = 1
        node.is_leaf_node = True
        return node
    if ones == 0:
        # No row is positive.
        node.is_leaf_node = True
        node.classification = 0
        return node

    # Depth cap: checked before the split search because a node past the
    # cap becomes a leaf no matter what the search would find.
    if node.height > maximum_height:
        node.is_leaf_node = True
        node.classification = self.classify_leaf(dataset)
        return node

    node.is_leaf_node = False

    # Best split found so far across all attributes.
    splitting_attribute = None
    split_val = None
    maximum_info_gain = 0

    entropy = dataset.calculate_entropy()

    # For each column of data except the classifier column.
    for attr_index in range(len(dataset.rows[0])):
        if dataset.attributes[attr_index] == dataset.classifier:
            continue

        # Distinct values of this attribute are the candidate split points.
        candidates = list({row[attr_index] for row in dataset.rows})
        if len(candidates) > max_candidate_values:
            # Too many candidates: keep only the nine interior decile values.
            candidates.sort()
            step = len(candidates) // 10
            candidates = [candidates[k * step] for k in range(1, 10)]

        # Best value for this attribute; ties keep the earlier candidate.
        local_max_gain = 0
        local_split_val = None
        for val in candidates:
            current_gain = dataset.calculate_information_gain(
                attr_index, val, entropy
            )
            if current_gain > local_max_gain:
                local_max_gain = current_gain
                local_split_val = val

        if local_max_gain > maximum_info_gain:
            maximum_info_gain = local_max_gain
            split_val = local_split_val
            splitting_attribute = attr_index

    # No split is informative enough: make this node a leaf.
    if maximum_info_gain <= minimum_info_gain:
        node.is_leaf_node = True
        node.classification = self.classify_leaf(dataset)
        return node

    # From here on a split always exists: the gain exceeded the (positive)
    # minimum, so splitting_attribute and split_val were assigned above.
    node.attribute_split_index = splitting_attribute
    node.attribute_split = dataset.attributes[splitting_attribute]
    node.attribute_split_value = split_val

    # Child datasets share the parent's attribute metadata.
    # NOTE(review): class_col_index is not copied to the children -- confirm
    # DataSet does not rely on it in count_positives()/calculate_entropy().
    left_dataset = DataSet(dataset.classifier)
    right_dataset = DataSet(dataset.classifier)
    for child in (left_dataset, right_dataset):
        child.attributes = dataset.attributes
        child.attribute_types = dataset.attribute_types

    # Rows with a value >= the split value go left, all others go right.
    for row in dataset.rows:
        if row[splitting_attribute] >= split_val:
            left_dataset.rows.append(row)
        else:
            right_dataset.rows.append(row)

    node.left_child = self.compute_decision_tree(left_dataset, node)
    node.right_child = self.compute_decision_tree(right_dataset, node)
    return node