Esempio n. 1
0
    def learn_tree(self, input_features, data_subset):
        """returns a decision tree
        for input_features is a set of possible conditions
        data_subset is a subset of the data used to build this (sub)tree

        where a decision tree is a function that takes an example and
        makes a prediction on the target feature
        """
        if (input_features and len(data_subset) >= self.min_number_examples):
            first_target_val = self.target(data_subset[0])
            allagree = all(self.target(inst)==first_target_val for inst in data_subset)
            if not allagree:
                split, partn = self.select_split(input_features, data_subset)
                if split: # the split succeeded in splitting the data
                    false_examples, true_examples = partn
                    rem_features = [fe for fe in input_features if fe != split]
                    self.display(2,"Splitting on",split.__doc__,"with examples split",
                                   len(true_examples),":",len(false_examples))
                    true_tree = self.learn_tree(rem_features,true_examples)
                    false_tree =  self.learn_tree(rem_features,false_examples)
                    def fun(e):
                        if split(e):
                            return true_tree(e)
                        else:
                            return false_tree(e)
                    #fun = lambda e: true_tree(e) if split(e) else false_tree(e)
                    fun.__doc__ = ("if "+split.__doc__+" then ("+true_tree.__doc__+
                                   ") else ("+false_tree.__doc__+")")
                    return fun
        # don't expand the trees but return a point prediction
        return point_prediction(self.target, data_subset, selection=self.leaf_selection)
Esempio n. 2
0
def training_error(dataset, data_subset, to_optimize):
    """returns training error for dataset on to_optimize.
    This assumes that we choose the best value for the optimization
    criteria for dataset according to point_prediction
    """
    select_dict = {"sum-of-squares":"mean", "sum_absolute":"median",
                       "logloss":"Laplace"}  # arbitrary mapping. Perhaps wrong.
    selection = select_dict[to_optimize]
    predictor = point_prediction(dataset.target, data_subset, selection=selection)
    error = sum(error_example(predictor(example),
                               dataset.target(example),
                               to_optimize)
                 for example in data_subset)
    return error