def _create_tree(self,
                 instances,
                 candidate_attribute_indexes,
                 target_attribute_index=0,
                 default_class=None):
    """Recursively build an ID3-style decision tree.

    Returns either a class label (leaf) or a nested dict of the form
    {attribute_index: {attribute_value: subtree, ...}}.
    `default_class` is returned when no instances or candidate
    attributes remain on a branch.
    """
    label_counts = Counter(instance[target_attribute_index]
                           for instance in instances)

    # Base case: nothing left to classify or split on.
    if not instances or not candidate_attribute_indexes:
        return default_class

    # Base case: the node is pure -- every instance shares one label.
    if len(label_counts) == 1:
        return label_counts.most_common(1)[0][0]

    # Recursive case: the majority label becomes the default passed
    # down to every child branch.
    default_class = simple_ml.majority_value(instances, target_attribute_index)
    best_index = simple_ml.choose_best_attribute_index(
        instances, candidate_attribute_indexes, target_attribute_index)
    partitions = simple_ml.split_instances(instances, best_index)

    # The chosen attribute is consumed; children split on the rest.
    remaining_indexes = [index for index in candidate_attribute_indexes
                         if index != best_index]

    tree = {best_index: {}}
    for attribute_value, subset in partitions.items():
        tree[best_index][attribute_value] = self._create_tree(
            subset, remaining_indexes, target_attribute_index, default_class)
    return tree
    def _create_tree(self,
                     instances,
                     candidate_attribute_indexes,
                     target_attribute_index=0,
                     default_class=None,
                     trace=0):
        """Recursively build an ID3-style decision tree.

        Returns either a class label (leaf) or a nested dict of the form
        {attribute_index: {attribute_value: subtree, ...}}.
        A positive `trace` prints progress, indented by recursion depth.
        """
        class_labels_and_counts = Counter(
            [instance[target_attribute_index] for instance in instances])
        # If the dataset is empty or the candidate attributes list is empty,
        # return the default class label
        if not instances or not candidate_attribute_indexes:
            if trace:
                print('{}Using default class {}'.format(
                    '< ' * trace, default_class))
            return default_class

        # If all the instances have the same class label, return that class label
        elif len(class_labels_and_counts) == 1:
            class_label = class_labels_and_counts.most_common(1)[0][0]
            if trace:
                print('{}All {} instances have label {}'.format(
                    '< ' * trace, len(instances), class_label))
            return class_label

        # Otherwise, create a new subtree and add it to the tree
        else:
            # Majority label becomes the default for all child branches
            default_class = majority_value(instances, target_attribute_index)

            # Choose the next best attribute index to best classify the instances
            best_index = choose_best_attribute_index(
                instances, candidate_attribute_indexes, target_attribute_index)
            if trace:
                print('{}Creating tree node for attribute index {}'.format(
                    '> ' * trace, best_index))

            # Create a new decision tree node with the best attribute index
            # and an empty dictionary object (for now)
            tree = {best_index: {}}

            # Create a new decision tree sub-node (branch)
            # for each of the values in the best attribute field
            partitions = split_instances(instances, best_index)

            # Remove that attribute from the set of candidates for further splits
            remaining_candidate_attribute_indexes = [
                i for i in candidate_attribute_indexes if i != best_index
            ]

            for attribute_value in partitions:
                if trace:
                    print('{}Creating subtree for value {} ({}, {}, {}, {})'.
                          format('> ' * trace, attribute_value,
                                 len(partitions[attribute_value]),
                                 len(remaining_candidate_attribute_indexes),
                                 target_attribute_index, default_class))

                # Create a subtree for each value of the best attribute.
                # BUG FIX: propagate trace (incremented) so that sub-levels
                # keep printing with increasing indentation; previously it
                # was dropped and recursion always ran with trace=0.
                subtree = self._create_tree(
                    partitions[attribute_value],
                    remaining_candidate_attribute_indexes,
                    target_attribute_index, default_class,
                    trace + 1 if trace else 0)

                # Add the new subtree to the empty dictionary object
                # in the new tree/node created above
                tree[best_index][attribute_value] = subtree

            return tree
    def _create_tree(self,
                     instances,
                     candidate_attribute_indexes,
                     target_attribute_index=0,
                     default_class=None,
                     trace=0):
        """Recursively build an ID3-style decision tree.

        Returns either a class label (leaf) or a nested dict of the form
        {attribute_index: {attribute_value: subtree, ...}}.
        A positive `trace` prints progress, indented by recursion depth.
        """
        class_labels_and_counts = Counter([instance[target_attribute_index]
                                           for instance in instances])
        # If the dataset is empty or the candidate attributes list is empty,
        # return the default class label
        if not instances or not candidate_attribute_indexes:
            if trace:
                print('{}Using default class {}'.format('< ' * trace, default_class))
            return default_class

        # If all the instances have the same class label, return that class label
        elif len(class_labels_and_counts) == 1:
            class_label = class_labels_and_counts.most_common(1)[0][0]
            if trace:
                print('{}All {} instances have label {}'.format(
                    '< ' * trace, len(instances), class_label))
            return class_label

        # Otherwise, create a new subtree and add it to the tree
        # (whitespace fix: this block previously mixed literal tabs into the
        # indentation, which raises TabError under Python 3)
        else:
            # Majority label becomes the default for all child branches
            default_class = majority_value(instances, target_attribute_index)

            # Choose the next best attribute index to best classify the instances
            best_index = choose_best_attribute_index(instances,
                                                     candidate_attribute_indexes,
                                                     target_attribute_index)
            if trace:
                print('{}Creating tree node for attribute index {}'.format(
                    '> ' * trace, best_index))

            # Create a new decision tree node with the best attribute index
            # and an empty dictionary object (for now)
            tree = {best_index: {}}

            # Create a new decision tree sub-node (branch)
            # for each of the values in the best attribute field
            partitions = split_instances(instances, best_index)

            # Remove that attribute from the set of candidates for further splits
            remaining_candidate_attribute_indexes = [i
                                                     for i in candidate_attribute_indexes
                                                     if i != best_index]

            for attribute_value in partitions:
                if trace:
                    print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                        '> ' * trace,
                        attribute_value,
                        len(partitions[attribute_value]),
                        len(remaining_candidate_attribute_indexes),
                        target_attribute_index,
                        default_class))

                # Create a subtree for each value of the best attribute.
                # BUG FIX: propagate trace (incremented) so that sub-levels
                # keep printing with increasing indentation; previously it
                # was dropped and recursion always ran with trace=0.
                subtree = self._create_tree(
                    partitions[attribute_value],
                    remaining_candidate_attribute_indexes,
                    target_attribute_index,
                    default_class,
                    trace + 1 if trace else 0)

                # Add the new subtree to the empty dictionary object
                # in the new tree/node created above
                tree[best_index][attribute_value] = subtree

            return tree
# Example #4
    def _create(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0):
        '''
        Returns a new decision tree by recursively selecting and splitting instances based on 
        the highest information_gain of the candidate_attribute_indexes.
        The class label is found in target_attribute_index.
        The default class is the majority value for that branch of the tree.
        A positive trace value will generate trace information with increasing levels of indentation.
    
        Derived from the simplified ID3 algorithm presented in Building Decision Trees in Python by Christopher Roach,
        http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3
        '''
        # Work on a shallow copy so the caller's list is never mutated.
        instances = instances[:]
        label_distribution = Counter(
            instance[target_attribute_index] for instance in instances)

        # Guard: nothing left to classify or split on -> inherited default.
        if not instances or not candidate_attribute_indexes:
            if trace:
                print('{}Using default class {}'.format('< ' * trace, default_class))
            return default_class

        # Guard: pure node -- every remaining instance shares one label.
        if len(label_distribution) == 1:
            only_label = label_distribution.most_common(1)[0][0]
            if trace:
                print('{}All {} instances have label {}'.format('< ' * trace, len(instances), only_label))
            return only_label

        # The branch majority becomes the default handed to every child.
        default_class = simple_ml.majority_value(instances, target_attribute_index)

        # Split on the attribute with the highest information gain.
        best_index = simple_ml.choose_best_attribute_index(instances, candidate_attribute_indexes, target_attribute_index)
        if trace:
            print('{}Creating tree node for attribute index {}'.format('> ' * trace, best_index))

        partitions = simple_ml.split_instances(instances, best_index)

        # The chosen attribute is consumed; children split on the rest.
        surviving_indexes = [index for index in candidate_attribute_indexes if index != best_index]
        child_trace = trace + 1 if trace else 0

        # One branch per observed value of the chosen attribute.
        node = {best_index: {}}
        for branch_value, branch_instances in partitions.items():
            if trace:
                print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                    '> ' * trace,
                    branch_value, 
                    len(branch_instances), 
                    len(surviving_indexes), 
                    target_attribute_index, 
                    default_class))

            node[best_index][branch_value] = self._create(
                branch_instances,
                surviving_indexes,
                target_attribute_index,
                default_class,
                child_trace)

        return node
    def _create(self,
                instances,
                candidate_attribute_indexes,
                target_attribute_index=0,
                default_class=None,
                trace=0):
        '''
        Returns a new decision tree by recursively selecting and splitting instances based on 
        the highest information_gain of the candidate_attribute_indexes.
        The class label is found in target_attribute_index.
        The default class is the majority value for that branch of the tree.
        A positive trace value will generate trace information with increasing levels of indentation.
    
        Derived from the simplified ID3 algorithm presented in Building Decision Trees in Python by Christopher Roach,
        http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3
        '''
        # Work on a shallow copy so the caller's list is never mutated.
        instances = instances[:]
        class_labels_and_counts = Counter(
            [instance[target_attribute_index] for instance in instances])

        # If the dataset is empty or the candidate attributes list is empty, return the default value.
        # BUG FIX: the prints below were Python 2 print statements, a
        # SyntaxError under Python 3; converted to print() calls.
        if not instances or not candidate_attribute_indexes:
            if trace:
                print('{}Using default class {}'.format(
                    '< ' * trace, default_class))
            return default_class

        # If all the instances have the same class label, return that class label
        elif len(class_labels_and_counts) == 1:
            class_label = class_labels_and_counts.most_common(1)[0][0]
            if trace:
                print('{}All {} instances have label {}'.format(
                    '< ' * trace, len(instances), class_label))
            return class_label
        else:
            # Majority label becomes the default for all child branches
            default_class = simple_ml.majority_value(instances,
                                                     target_attribute_index)

            # Choose the next best attribute index to best classify the instances
            best_index = simple_ml.choose_best_attribute_index(
                instances, candidate_attribute_indexes, target_attribute_index)
            if trace:
                print('{}Creating tree node for attribute index {}'.format(
                    '> ' * trace, best_index))

            # Create a new decision tree node with the best attribute index and an empty dictionary object (for now)
            tree = {best_index: {}}

            # Create a new decision tree sub-node (branch) for each of the values in the best attribute field
            partitions = simple_ml.split_instances(instances, best_index)

            # Remove that attribute from the set of candidates for further splits
            remaining_candidate_attribute_indexes = [
                i for i in candidate_attribute_indexes if i != best_index
            ]

            for attribute_value in partitions:
                if trace:
                    print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                        '> ' * trace, attribute_value,
                        len(partitions[attribute_value]),
                        len(remaining_candidate_attribute_indexes),
                        target_attribute_index, default_class))

                # Create a subtree for each value of the best attribute
                subtree = self._create(partitions[attribute_value],
                                       remaining_candidate_attribute_indexes,
                                       target_attribute_index, default_class,
                                       trace + 1 if trace else 0)

                # Add the new subtree to the empty dictionary object in the new tree/node we just created
                tree[best_index][attribute_value] = subtree

        return tree