Example #1
    def _rec_build_random_tree(training_data_cut, rec_count):
        # increase recursion count by 1
        rec_count += 1

        # from a random sample of features, find the splitting feature that
        # provides the greatest information gain
        # returns tuple ((feature_name, feature_index), (fc_has_vote, sc_has_vote), (fc_has_not_vote, sc_has_not_vote))
        feature_and_votes = _find_best_sampled_feature(training_data_cut)

        # if the training data falls below a preset threshold or the vote is
        # unanimous, build a Leaf node; otherwise split the data on the
        # feature and build a Tree node; a recursion limit is also enforced
        fc_has_vote, sc_has_vote = feature_and_votes[1]
        fc_has_not_vote, sc_has_not_vote = feature_and_votes[2]

        # length of training data cut
        cut_length = len(training_data_cut)

        # build left (has feature) branch
        if cut_length < _leaf_threshold or fc_has_vote == 0 or sc_has_vote == 0 or rec_count > _rec_limit:
            # build Leaf based on votes
            left_branch = DecisionTree.Leaf((fc_has_vote, sc_has_vote))
        else:
            # split out and build Tree
            # add 2 to the feature index to skip the RECORD and CLASS columns
            feature_index = feature_and_votes[0][1] + 2
            has_feature_data = [
                tree_row for tree_row in training_data_cut
                if tree_row[feature_index]
            ]

            # recurse into the left branch building the tree of data that has feature
            left_branch = _rec_build_random_tree(has_feature_data, rec_count)

        # build right (has not feature) branch
        if cut_length < _leaf_threshold or fc_has_not_vote == 0 or sc_has_not_vote == 0 or rec_count > _rec_limit:
            # build Leaf based on votes
            right_branch = DecisionTree.Leaf((fc_has_not_vote, sc_has_not_vote))
        else:
            # split out and build Tree
            # add 2 to the feature index to skip the RECORD and CLASS columns
            feature_index = feature_and_votes[0][1] + 2
            has_not_feature_data = [
                tree_row for tree_row in training_data_cut
                if not tree_row[feature_index]
            ]

            # recurse into the right branch building the tree of data without feature
            right_branch = _rec_build_random_tree(has_not_feature_data, rec_count)

        # build tree with splitting feature name and index, and the left and right branches
        feature_name_index = feature_and_votes[0]
        random_tree = DecisionTree.Tree(feature_name_index, left_branch, right_branch)

        return random_tree
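The snippet leans on a DecisionTree module that is not shown. Below is a minimal sketch of the two containers it assumes; only the constructor signatures are taken from the call sites above, everything else is guessed.

class DecisionTree:
    class Leaf:
        def __init__(self, votes):
            # (first_class_votes, second_class_votes) tallied at this leaf
            self.votes = votes

    class Tree:
        def __init__(self, feature_name_index, left_branch, right_branch):
            # (feature_name, feature_index) used to split at this node
            self.feature_name_index = feature_name_index
            self.left_branch = left_branch    # rows that have the feature
            self.right_branch = right_branch  # rows that lack the feature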
Example #2
def dt_learn(dataset, attrs, parent_dist=None):
    # no examples left: fall back to the parent's most common class
    if not dataset:
        return Dt.Leaf(parent_dist.get_most_common())
    dist = Distribution(dataset)
    # pure node, or no attributes left to split on: emit a leaf
    if dist.is_leaf() or not attrs:
        return Dt.Leaf(dist.get_most_common())
    else:
        # split on the attribute with the highest information gain
        attr = max_gain(dataset, dist, attrs)
        tree = Dt.Node(attr)
        for v in attr.domain:
            # examples whose value for attr is v
            dv = [d for d in dataset if d.x[attr.index] == v]
            # the used attribute is removed before recursing
            child_attrs = [a for a in attrs if a != attr]
            subtree = dt_learn(dv, child_attrs, dist)
            tree.add_child(subtree, v)
        return tree
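dt_learn depends on a Distribution helper that is not shown. A hypothetical sketch follows: the three method names come from the call sites above, while the internals, including the assumption that each example stores its label in a .y attribute (mirroring the .x used for features), are guesses.

from collections import Counter

class Distribution:
    def __init__(self, dataset):
        # assumes each example carries its class label in a .y attribute
        self.counts = Counter(d.y for d in dataset)

    def is_leaf(self):
        # pure node: every remaining example has the same class
        return len(self.counts) == 1

    def get_most_common(self):
        return self.counts.most_common(1)[0][0]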
Example #3
def decisionTreeLearning(examples, attributes, parents_examples=()):
    if len(examples) == 0:
        # return the most frequent classification among the parent's examples
        return pluralityValue(parents_examples)
    elif allSameClass(examples):
        # all examples share one class: return the class of the first example
        return DecisionTree.Leaf(examples[0][dataset.target])
    elif len(attributes) == 0:
        # return the most frequent classification among the examples
        return pluralityValue(examples)
    else:
        # ce appears to be a global flag choosing between two
        # attribute-selection strategies
        if ce == 0:
            mostImpAtt, threshold = chooseAttribute(attributes, examples)
        else:
            mostImpAtt, threshold = chooseAttribute2(attributes, examples)
        tree = DecisionTree.DecisionTree(mostImpAtt, threshold,
                                         dataset.attrnames[mostImpAtt])
        # split the examples on the threshold
        ExampleMinor, ExampleMajor = splittingOnThreshold(
            mostImpAtt, threshold, examples)
        # recurse on each half and attach the subtrees
        branchesLeft = decisionTreeLearning(
            ExampleMinor, removeAttr(mostImpAtt, attributes), examples)
        branchesRight = decisionTreeLearning(
            ExampleMajor, removeAttr(mostImpAtt, attributes), examples)
        tree.addLeft(threshold, branchesLeft)
        tree.addRight(threshold, branchesRight)
        return tree
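chooseAttribute (and its variant chooseAttribute2) is not shown; from the call site it returns an (attribute, threshold) pair. What follows is a speculative sketch of one plausible implementation, scoring candidate midpoint thresholds by information gain; only the signature, the return shape, and dataset.target come from the snippets, the rest is an assumption.

import math
from collections import Counter

def entropy(examples):
    # Shannon entropy of the class labels in the target column
    counts = Counter(e[dataset.target] for e in examples)
    total = len(examples)
    return -sum(c / total * math.log2(c / total) for c in counts.values())

def chooseAttribute(attributes, examples):
    # try midpoints between consecutive sorted values of each attribute and
    # keep the (attribute, threshold) pair with the highest information gain
    best_attr, best_threshold, best_gain = None, None, -1.0
    base = entropy(examples)
    for attr in attributes:
        values = sorted({e[attr] for e in examples})
        for lo, hi in zip(values, values[1:]):
            threshold = (lo + hi) / 2
            minor = [e for e in examples if e[attr] <= threshold]
            major = [e for e in examples if e[attr] > threshold]
            gain = base - (len(minor) / len(examples) * entropy(minor)
                           + len(major) / len(examples) * entropy(major))
            if gain > best_gain:
                best_attr, best_threshold, best_gain = attr, threshold, gain
    return best_attr, best_threshold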
Example #4

def decisionTreeLearning(examples, attributes, parents_examples=()):
    if len(examples) == 0:
        # return the most frequent classification among the parent's examples
        return pluralityValue(parents_examples)
    elif allSameClass(examples):
        # all examples share one class: return the class of the first example
        return DecisionTree.Leaf(examples[0][dataset.target])
    elif len(attributes) == 0:
        # return the most frequent classification among the examples
        return pluralityValue(examples)
    else:
        mostImpAtt, threshold = chooseAttribute(attributes, examples)
        tree = DecisionTree.DecisionTree(mostImpAtt, threshold,
                                         dataset.attrnames[mostImpAtt])
        # split the examples on the threshold
        ExampleMinor, ExampleMajor = splittingOnThreshold(
            mostImpAtt, threshold, examples)
        # recurse on each half and attach the subtrees
        branchesLeft = decisionTreeLearning(
            ExampleMinor, removeAttr(mostImpAtt, attributes), examples)
        branchesRight = decisionTreeLearning(
            ExampleMajor, removeAttr(mostImpAtt, attributes), examples)
        tree.addLeft(threshold, branchesLeft)
        tree.addRight(threshold, branchesRight)
        return tree
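Both versions also rely on splittingOnThreshold, which is not shown. From the call site it partitions the examples around the chosen threshold; here is a minimal sketch under that assumption (the comparison direction is a guess, only the signature comes from the snippets).

def splittingOnThreshold(attribute, threshold, examples):
    # partition on the attribute's value: at or below the threshold goes to
    # the minor (left) half, above it to the major (right) half
    minor = [e for e in examples if e[attribute] <= threshold]
    major = [e for e in examples if e[attribute] > threshold]
    return minor, major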
Example #5
def pluralityValue(examples):
    # for each classification count the occurrences, then pick the most
    # popular; a local is used instead of a global so a stale value cannot
    # leak between calls
    best_count = 0
    popular = None
    for v in dataset.values:
        count = counting(dataset.target, v, examples)
        if count > best_count:
            best_count = count
            popular = v
    return DecisionTree.Leaf(popular)
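For comparison, an equivalent one-pass version built on collections.Counter (same result up to tie-breaking, assuming a non-empty examples list; dataset.target and the example indexing follow the snippets above):

from collections import Counter

def pluralityValue(examples):
    # tally the target column directly instead of calling counting() once
    # per candidate value; assumes examples is non-empty
    counts = Counter(e[dataset.target] for e in examples)
    return DecisionTree.Leaf(counts.most_common(1)[0][0])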