# Requires: from collections import Counter; import simple_ml
def _create_tree(self,
                 instances,
                 candidate_attribute_indexes,
                 target_attribute_index=0,
                 default_class=None):
    class_labels_and_counts = Counter(
        [instance[target_attribute_index] for instance in instances])
    # If the dataset or the candidate attribute list is empty, return the default class
    if not instances or not candidate_attribute_indexes:
        return default_class
    # If all the instances have the same class label, return that label
    elif len(class_labels_and_counts) == 1:
        class_label = class_labels_and_counts.most_common(1)[0][0]
        return class_label
    # Otherwise, split on the best attribute and recurse into each partition
    else:
        default_class = simple_ml.majority_value(instances,
                                                 target_attribute_index)
        best_index = simple_ml.choose_best_attribute_index(
            instances, candidate_attribute_indexes, target_attribute_index)
        tree = {best_index: {}}
        partitions = simple_ml.split_instances(instances, best_index)
        # Remove the chosen attribute from the candidates for further splits
        remaining_candidate_attribute_indexes = [
            i for i in candidate_attribute_indexes if i != best_index
        ]
        for attribute_value in partitions:
            subtree = self._create_tree(
                partitions[attribute_value],
                remaining_candidate_attribute_indexes,
                target_attribute_index, default_class)
            tree[best_index][attribute_value] = subtree
        return tree
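The tree returned above is a nested dict of the form {attribute_index: {attribute_value: subtree}} with bare class labels at the leaves. A minimal sketch of how such a tree could be walked to classify a single instance; the classify helper below is hypothetical, not part of the original listing:

def classify(tree, instance, default_class=None):
    # Hypothetical helper: leaves are bare class labels, internal nodes are
    # {attribute_index: {attribute_value: subtree}} dicts.
    if not isinstance(tree, dict):
        return tree
    attribute_index = next(iter(tree))
    subtree = tree[attribute_index].get(instance[attribute_index])
    if subtree is None:
        return default_class  # attribute value unseen during training
    return classify(subtree, instance, default_class)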
Example #2
    # Requires: from collections import Counter; import simple_ml; import node
    def _create(self,
                instances,
                candidate_attribute_indexes,
                target_attribute_index=0,
                default_class=None,
                trace=0,
                max_height=None,
                min_support=0,
                epsilon=1.0,
                parent_id=-1,
                parent_value=None,
                parent_node=None):
        '''
        Returns a new decision tree by recursively selecting and splitting instances based on 
        the highest information_gain of the candidate_attribute_indexes.
        The class label is found in target_attribute_index.
        The default class is the majority value for that branch of the tree.
        A positive trace value will generate trace information with increasing levels of indentation.

        max_height is the maximum number of levels (depth) the tree can have; trace doubles as
        the current depth, so it is assumed to be non-zero.
        min_support is the minimum number of records needed to make a split; below it, the node becomes a leaf.
        epsilon is the differential privacy budget, spent on Laplace noise for the label counts
        and on the noisy choice of split attribute.
    
        Derived from the simplified ID3 algorithm presented in Building Decision Trees in Python by Christopher Roach,
        http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3
        '''
        instances = instances[:]
        self._id_tracker += 1

        class_labels_and_counts = dict(
            Counter(
                [instance[target_attribute_index] for instance in instances]))
        #print( "class_labels_and_counts = "+str(class_labels_and_counts) )

        if epsilon is not None:
            class_labels_and_counts = simple_ml.add_laplace_noise(
                class_labels_and_counts, 1., epsilon)  # sensitivity = 1
        #print( "new class_labels_counts = "+str(class_labels_and_counts) )

        partitionSize = sum(class_labels_and_counts.values())
        #print("noisy partitionSize = "+str(partitionSize))

        # Guard against an empty counts dict (e.g. an empty partition), which would
        # otherwise raise an IndexError on most_common(1)[0]
        class_label = (Counter(class_labels_and_counts).most_common(1)[0][0]
                       if class_labels_and_counts else default_class)

        # If the (noisy) partition size is zero, return a leaf with the default class
        if partitionSize == 0:
            #if trace:
            #    print( '{}Using default class {}'.format('< ' * trace, default_class) )
            return node.Node(self._id_tracker,
                             trace,
                             None,
                             "_Leaf",
                             parent_id,
                             parent_value,
                             parent_node, {default_class: 1},
                             children=None)  #default_class
        # If the candidate attribute list is empty, return a leaf with the observed label counts
        elif not candidate_attribute_indexes:
            #if trace:
            #    print( '{}Using default class {}'.format('< ' * trace, default_class) )
            return node.Node(self._id_tracker,
                             trace,
                             None,
                             "_Leaf",
                             parent_id,
                             parent_value,
                             parent_node,
                             class_labels_and_counts,
                             children=None)  #default_class

        # If all the records have the same class label, return that class label
        elif len(Counter(class_labels_and_counts)) == 1:
            #if trace:
            #    print( '{}All {} records have label {}'.format('< ' * trace, partitionSize, class_label) )
            return node.Node(self._id_tracker,
                             trace,
                             None,
                             "_Leaf",
                             parent_id,
                             parent_value,
                             parent_node,
                             class_labels_and_counts,
                             children=None)  #class_label

        # If there aren't enough records in the node to make another split, return the majority class label
        elif partitionSize < min_support:
            #if trace:
            #    print( '{} {} records is below the minimum support required for more splits. The majority label is {}'.format('< ' * trace, partitionSize, class_label) )
            return node.Node(self._id_tracker,
                             trace,
                             None,
                             "_Leaf",
                             parent_id,
                             parent_value,
                             parent_node,
                             class_labels_and_counts,
                             children=None)  #class_label

        # If the tree has reached the maximum depth (trace doubles as the depth counter),
        # return the majority class label
        elif max_height is not None and trace >= max_height:
            #if trace:
            #    print( '{}The maximum tree depth has been reached. The {} records in this leaf have majority label {}'.format('< ' * trace, partitionSize, class_label) )
            return node.Node(self._id_tracker,
                             trace,
                             None,
                             "_Leaf",
                             parent_id,
                             parent_value,
                             parent_node,
                             class_labels_and_counts,
                             children=None)  #class_label

        #  MAKE MORE SPLITS
        default_class = class_label  #simple_ml.majority_value(instances, target_attribute_index)

        # Choose the next best attribute index to best classify the records
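        # worst_case_sens below is the Gini impurity of an otherwise-pure partition of
        # partitionSize records after one differing record is added; it serves as a bound
        # on how much a single record can change the split score (an interpretation of
        # the formula, not stated in the original)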
        worst_case_sens = 1. - ((partitionSize / (partitionSize + 1))**2 +
                                (1 / (partitionSize + 1))**2)
        #print( "worst_case_sens="+str(worst_case_sens) )
        if trace == 1:  # if root node
            candi = [
                i for i in candidate_attribute_indexes
                if i not in self._previous_roots
            ]
            best_index = simple_ml.choose_best_attribute_index(
                instances,
                candi,
                target_attribute_index,
                epsilon=epsilon,
                sensitivity=worst_case_sens)
        else:
            best_index = simple_ml.choose_best_attribute_index(
                instances,
                candidate_attribute_indexes,
                target_attribute_index,
                epsilon=epsilon,
                sensitivity=worst_case_sens)
        #if trace:
        #    print( '{}Creating tree node for attribute index {}'.format('> ' * trace, best_index) )

        # Create a new decision tree node with the best attribute index and an empty dictionary object (for now)
        #tree = {best_index:{}}
        current_node = node.Node(self._id_tracker,
                                 trace,
                                 best_index,
                                 self._attribute_names[best_index],
                                 parent_id,
                                 parent_value,
                                 parent_node,
                                 class_labels_and_counts,
                                 children=[])

        # Create a new decision tree sub-node (branch) for each of the values in the best attribute field
        partitions = simple_ml.split_instances(instances, best_index)

        # Remove that attribute from the set of candidates for further splits
        remaining_candidate_attribute_indexes = [
            i for i in candidate_attribute_indexes if i != best_index
        ]
        # For every value of the chosen attribute, make a subtree
        tracecopy = trace + 1
        curr_id = self._id_tracker
        for attribute_value in partitions:
            #if trace:
            #    print( '{}Creating subtree for value {} ({}, {}, {}, {})'.format(
            #                        '> ' * trace,
            #                        attribute_value,
            #                        len(partitions[attribute_value]),
            #                        len(remaining_candidate_attribute_indexes),
            #                        target_attribute_index,
            #                        default_class)
            #                            )

            # Create a subtree for each value of the best attribute
            subtree = self._create(partitions[attribute_value],
                                   remaining_candidate_attribute_indexes,
                                   target_attribute_index, default_class,
                                   tracecopy if trace else 0, max_height,
                                   min_support, epsilon, curr_id,
                                   attribute_value, current_node)

            # Add the new subtree to the empty dictionary object in the new tree/node we just created
            #tree[best_index][attribute_value] = subtree
            current_node.add_child(subtree)
            self._node_list.append(subtree)
            #print('.', end='')

        return current_node  #tree
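The simple_ml.add_laplace_noise helper used above (with sensitivity 1 for the label counts) is not part of this listing. A minimal sketch of what it might look like, assuming it perturbs each count with the Laplace mechanism and clamps negatives to zero; the body below is an assumption, not the original implementation:

import numpy as np

def add_laplace_noise(label_counts, sensitivity, epsilon):
    # Assumed sketch: Laplace mechanism for count queries; scale
    # b = sensitivity / epsilon gives epsilon-differential privacy per count.
    scale = sensitivity / epsilon
    return {label: max(0.0, count + np.random.laplace(0.0, scale))
            for label, count in label_counts.items()}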
    # Requires: from collections import Counter, plus the helpers majority_value,
    # choose_best_attribute_index, and split_instances (e.g. imported unqualified
    # from simple_ml)
    def _create_tree(self,
                     instances,
                     candidate_attribute_indexes,
                     target_attribute_index=0,
                     default_class=None,
                     trace=0):
        class_labels_and_counts = Counter(
            [instance[target_attribute_index] for instance in instances])
        # If the dataset is empty or the candidate attributes list is empty,
        # return the default class label
        if not instances or not candidate_attribute_indexes:
            if trace:
                print('{}Using default class {}'.format(
                    '< ' * trace, default_class))
            return default_class

        # If all the instances have the same class label, return that class label
        elif len(class_labels_and_counts) == 1:
            class_label = class_labels_and_counts.most_common(1)[0][0]
            if trace:
                print('{}All {} instances have label {}'.format(
                    '< ' * trace, len(instances), class_label))
            return class_label

        # Otherwise, create a new subtree and add it to the tree
        else:
            default_class = majority_value(instances, target_attribute_index)

            # Choose the next best attribute index to best classify the instances
            best_index = choose_best_attribute_index(
                instances, candidate_attribute_indexes, target_attribute_index)
            if trace:
                print('{}Creating tree node for attribute index {}'.format(
                    '> ' * trace, best_index))

            # Create a new decision tree node with the best attribute index
            # and an empty dictionary object (for now)
            tree = {best_index: {}}

            # Create a new decision tree sub-node (branch)
            # for each of the values in the best attribute field
            partitions = split_instances(instances, best_index)

            # Remove that attribute from the set of candidates for further splits
            remaining_candidate_attribute_indexes = [
                i for i in candidate_attribute_indexes if i != best_index
            ]

            for attribute_value in partitions:
                if trace:
                    print('{}Creating subtree for value {} ({}, {}, {}, {})'.
                          format('> ' * trace, attribute_value,
                                 len(partitions[attribute_value]),
                                 len(remaining_candidate_attribute_indexes),
                                 target_attribute_index, default_class))

                # Create a subtree for each value of the best attribute
                subtree = self._create_tree(
                    partitions[attribute_value],
                    remaining_candidate_attribute_indexes,
                    target_attribute_index, default_class,
                    trace + 1 if trace else 0)

                # Add the new subtree to the empty dictionary object
                # in the new tree/node created above
                tree[best_index][attribute_value] = subtree

            return tree
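The docstrings in this listing say splits are chosen by the highest information gain, but choose_best_attribute_index itself is not shown. A minimal self-contained sketch of a plain (non-private) version, an assumption rather than the original implementation:

import math
from collections import Counter, defaultdict

def choose_best_attribute_index(instances, candidate_attribute_indexes,
                                target_attribute_index):
    # Assumed sketch: return the attribute index whose split maximizes
    # information gain, i.e. the reduction in class-label entropy.
    def entropy(rows):
        counts = Counter(row[target_attribute_index] for row in rows)
        return -sum((c / len(rows)) * math.log2(c / len(rows))
                    for c in counts.values())

    def gain(index):
        partitions = defaultdict(list)
        for row in instances:
            partitions[row[index]].append(row)
        weighted = sum(len(part) / len(instances) * entropy(part)
                       for part in partitions.values())
        return entropy(instances) - weighted

    return max(candidate_attribute_indexes, key=gain)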
    # Requires: from collections import Counter; import simple_ml
    def _create(self,
                instances,
                candidate_attribute_indexes,
                target_attribute_index=0,
                default_class=None,
                trace=0):
        '''
        Returns a new decision tree by recursively selecting and splitting instances based on 
        the highest information_gain of the candidate_attribute_indexes.
        The class label is found in target_attribute_index.
        The default class is the majority value for that branch of the tree.
        A positive trace value will generate trace information with increasing levels of indentation.
    
        Derived from the simplified ID3 algorithm presented in Building Decision Trees in Python by Christopher Roach,
        http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3
        '''
        instances = instances[:]
        class_labels_and_counts = Counter(
            [instance[target_attribute_index] for instance in instances])

        # If the dataset is empty or the candidate attributes list is empty, return the default value.
        if not instances or not candidate_attribute_indexes:
            if trace:
                print('{}Using default class {}'.format(
                    '< ' * trace, default_class))
            return default_class

        # If all the instances have the same class label, return that class label
        elif len(class_labels_and_counts) == 1:
            class_label = class_labels_and_counts.most_common(1)[0][0]
            if trace:
                print('{}All {} instances have label {}'.format(
                    '< ' * trace, len(instances), class_label))
            return class_label
        else:
            default_class = simple_ml.majority_value(instances,
                                                     target_attribute_index)

            # Choose the next best attribute index to best classify the instances
            best_index = simple_ml.choose_best_attribute_index(
                instances, candidate_attribute_indexes, target_attribute_index)
            if trace:
                print('{}Creating tree node for attribute index {}'.format(
                    '> ' * trace, best_index))

            # Create a new decision tree node with the best attribute index and an empty dictionary object (for now)
            tree = {best_index: {}}

            # Create a new decision tree sub-node (branch) for each of the values in the best attribute field
            partitions = simple_ml.split_instances(instances, best_index)

            # Remove that attribute from the set of candidates for further splits
            remaining_candidate_attribute_indexes = [
                i for i in candidate_attribute_indexes if i != best_index
            ]

            for attribute_value in partitions:
                if trace:
                    print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                        '> ' * trace, attribute_value,
                        len(partitions[attribute_value]),
                        len(remaining_candidate_attribute_indexes),
                        target_attribute_index, default_class))

                # Create a subtree for each value of the best attribute
                subtree = self._create(partitions[attribute_value],
                                       remaining_candidate_attribute_indexes,
                                       target_attribute_index, default_class,
                                       trace + 1 if trace else 0)

                # Add the new subtree to the empty dictionary object in the new tree/node we just created
                tree[best_index][attribute_value] = subtree

        return tree
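The remaining simple_ml helpers called throughout this listing are also not shown. Minimal sketches of what majority_value and split_instances might look like, inferred from how they are called above; these are assumptions, not the original implementations:

from collections import Counter, defaultdict

def majority_value(instances, target_attribute_index):
    # Assumed sketch: the most common class label among the instances.
    return Counter(instance[target_attribute_index]
                   for instance in instances).most_common(1)[0][0]

def split_instances(instances, attribute_index):
    # Assumed sketch: partition instances by their value at attribute_index,
    # mapping each attribute value to the list of matching instances.
    partitions = defaultdict(list)
    for instance in instances:
        partitions[instance[attribute_index]].append(instance)
    return partitions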