Example #1
0
def prepare_data(filename, n_attr):
    """Load a comma-separated file into a preprocessed ``DataSet``.

    The first row of the file is taken as the attribute (column) names and
    the last of those names is used as the classifier column.

    Args:
        filename: Path of the CSV file to read.
        n_attr: Number of entries to create in ``attribute_types``.
            NOTE(review): presumably this should equal the number of
            columns in the file -- confirm against callers.

    Returns:
        The populated ``DataSet`` after ``preprocessing()`` has been called.

    Raises:
        IndexError: If the file contains no rows (there is no header to pop).
    """
    data_set = DataSet("")

    # Load data set.  newline="" lets the csv module do its own newline
    # handling, so line breaks inside quoted fields are parsed correctly.
    with open(filename, newline="") as f:
        data_set.rows = list(csv.reader(f, delimiter=","))

    # The header row holds the attribute names; remove it from the data rows.
    data_set.attributes = data_set.rows.pop(0)

    # This is used to generalize the code for other datasets.
    # 'true' indicates numeric data, 'false' indicates nominal data, e.g.:
    # data_set.attribute_types = ['false', 'true', 'false', 'false', 'true', 'true', 'false']
    # Here every attribute is marked as nominal.
    data_set.attribute_types = ['false'] * n_attr
    data_set.classifier = data_set.attributes[-1]

    # Find index of classifier (first column carrying the classifier's name).
    data_set.class_col_index = data_set.attributes.index(data_set.classifier)

    # Preprocess the data set.
    data_set.preprocessing()

    return data_set
    def compute_decision_tree(self, dataset, parent_node):
        """Recursively build a binary decision tree for ``dataset``.

        A node becomes a leaf when every row is classified "1", when no row
        is, when the best achievable information gain is at most 0.01, or
        when the node's height exceeds 20.  Otherwise the rows are split on
        the (attribute, value) pair with the highest information gain: rows
        whose attribute value is ``>=`` the split value go to the left
        child, all remaining rows go to the right child.

        Args:
            dataset: The rows (plus attribute metadata) to build a subtree for.
            parent_node: The parent of the node being created, or ``None``
                for the root.

        Returns:
            The newly created ``DecisionTreeNode`` for this subtree.
        """
        node = DecisionTreeNode(parent_node)
        node.height = 0 if parent_node is None else node.parent.height + 1

        # count_positives() counts the rows whose classification is "1"
        positive_count = dataset.count_positives()

        # Pure nodes need no further splitting.
        if len(dataset.rows) == positive_count:
            node.classification = 1
            node.is_leaf_node = True
            return node
        if positive_count == 0:
            node.is_leaf_node = True
            node.classification = 0
            return node
        node.is_leaf_node = False

        # Gain a split has to exceed to be considered worthwhile
        gain_threshold = 0.01

        # Best split found so far: its gain, column index and threshold value
        best_gain = 0
        best_column = None
        best_value = None

        base_entropy = dataset.calculate_entropy()

        for column in range(len(dataset.rows[0])):
            # The classifier column is never a split candidate
            if dataset.attributes[column] == dataset.classifier:
                continue

            # Distinct values of this column are the possible split points
            candidates = list({row[column] for row in dataset.rows})

            # With many distinct values, only try the nine decile boundaries
            if len(candidates) > 100:
                candidates.sort()
                step = len(candidates) // 10
                candidates = [candidates[k * step] for k in range(1, 10)]

            # Strictly-greater comparison keeps the first candidate that
            # reaches the overall maximum, across columns and values alike.
            for candidate in candidates:
                gain = dataset.calculate_information_gain(column, candidate, base_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_column = column
                    best_value = candidate

        # Stop when no split is good enough or the tree has grown too deep
        if best_gain <= gain_threshold or node.height > 20:
            node.is_leaf_node = True
            node.classification = self.classify_leaf(dataset)
            return node

        node.attribute_split_index = best_column
        node.attribute_split = dataset.attributes[best_column]
        node.attribute_split_value = best_value

        left_dataset = DataSet(dataset.classifier)
        right_dataset = DataSet(dataset.classifier)
        for child in (left_dataset, right_dataset):
            child.attributes = dataset.attributes
            child.attribute_types = dataset.attribute_types

        # best_column is always set here: a gain above the threshold can
        # only have been recorded together with its column.
        for row in dataset.rows:
            if row[best_column] >= best_value:
                left_dataset.rows.append(row)
            else:
                right_dataset.rows.append(row)

        node.left_child = self.compute_decision_tree(left_dataset, node)
        node.right_child = self.compute_decision_tree(right_dataset, node)

        return node