Esempio n. 1
0
def find_best_partition(examples,
                        compare_fun,
                        max_splits):
    assert type(max_splits == IntType)
    best_relinfgain = -1
    best_absinfgain = -1
    best_part       = None    

    if len(examples) > 1:
        for i in range(0,examples.feature_no):        
            (relinfgain, absinfgain, part) = \
                         find_best_feature_partition(examples,
                                                     compare_fun,
                                                     i,
                                                     max_splits)
            if pylib_basics.verbose():
                print "# Evaluating feature %d: %1.6f, %1.6f "\
                      %(i,relinfgain, absinfgain),
            if part:
                if pylib_basics.verbose():
                    print part.abstracter
            else:
                if pylib_basics.verbose():
                    print "# No split possible, feature is homogenous:",
                    print examples.get_distinct_feature_values(i)
            if compare_fun((relinfgain, absinfgain),
                           (best_relinfgain,best_absinfgain)) > 0:
                best_relinfgain = relinfgain
                best_absinfgain = absinfgain
                best_part = part                
                
    return  (best_relinfgain, best_absinfgain, best_part)
Esempio n. 2
0
def find_best_partition(examples, compare_fun, max_splits):
    assert type(max_splits == IntType)
    best_relinfgain = -1
    best_absinfgain = -1
    best_part = None

    if len(examples) > 1:
        for i in range(0, examples.feature_no):
            (relinfgain, absinfgain, part) = \
                         find_best_feature_partition(examples,
                                                     compare_fun,
                                                     i,
                                                     max_splits)
            if pylib_basics.verbose():
                print "# Evaluating feature %d: %1.6f, %1.6f "\
                      %(i,relinfgain, absinfgain),
            if part:
                if pylib_basics.verbose():
                    print part.abstracter
            else:
                if pylib_basics.verbose():
                    print "# No split possible, feature is homogenous:",
                    print examples.get_distinct_feature_values(i)
            if compare_fun((relinfgain, absinfgain),
                           (best_relinfgain, best_absinfgain)) > 0:
                best_relinfgain = relinfgain
                best_absinfgain = absinfgain
                best_part = part

    return (best_relinfgain, best_absinfgain, best_part)
Esempio n. 3
0
def info_gain(a_priori_entropy, remainder_entropy):
    """
    Compute the expected information gain given a-priory entropy of a
    distribution and remainder entropy.
    """
    #assert a_priori_entropy>=remainder_entropy
    if a_priori_entropy < remainder_entropy:
        # Allow for rounding errors
        if pylib_basics.verbose():
            print "Warning: Remainder enropy > a-priori-entropy",
            print remainder_entropy, ">", a_priori_entropy
        remainder_entropy = a_priori_entropy

    return a_priori_entropy - remainder_entropy
Esempio n. 4
0
def info_gain(a_priori_entropy, remainder_entropy):
    """
    Compute the expected information gain given a-priory entropy of a
    distribution and remainder entropy.
    """
    # assert a_priori_entropy>=remainder_entropy
    if a_priori_entropy < remainder_entropy:
        # Allow for rounding errors
        if pylib_basics.verbose():
            print "Warning: Remainder enropy > a-priori-entropy",
            print remainder_entropy, ">", a_priori_entropy
        remainder_entropy = a_priori_entropy

    return a_priori_entropy - remainder_entropy
Esempio n. 5
0
    def refine(self,
               entropy_compare_fun,
               tolerance,
               maxsplit):
        """
        Expand a tree node one level. This only affects
        self.subtrees. Return True if tree could be expanded with
        entropic win, False otherwise.

        entropy_compare_fun: comparison on (relative, absolute)
                             information gain pairs, passed through to
                             find_best_partition().
        tolerance:           fraction of the previous relative
                             information gain the expanded tree must
                             reach for the split to be kept.
        maxsplit:            maximal number of splits per partition.
        """
        # A closed node was already found unprofitable to expand.
        if self.closed:
            return False

        # Inner node: recurse into all children; the call succeeds if
        # any child could be refined.
        if not self.isleaf():
            res = False
            for i in self.subtrees.values():
                if i.refine(entropy_compare_fun, tolerance, maxsplit):
                    res = True
            return res

        # Leaf node: look for the best split of this node's sample.
        old_rig = self.root.best_relinfogain
        
        if pylib_basics.verbose():
            print "# Searching test for node {%s} (%ld examples) (current RIG: %f)"\
                  %(self.node_name,self.sample_size, old_rig)
        (relinfgain, absinfgain, part) = \
                     pylib_ml_examples.find_best_partition(self.sample,
                                                           entropy_compare_fun,
                                                           maxsplit)
        if not part:
            if pylib_basics.verbose():
                print "# No possible split found"
            return False
        if pylib_basics.verbose():
            print "# Best partition: ", part.abstracter

        # Tentatively install the split, then recompute the whole
        # tree's entropies to judge its global effect.
        self.extend_leaf(part)
        entros  =  self.root.entropies()
        new_rig = entros[3]    # relative information gain of the tree

        if pylib_basics.verbose():
            print "# New RIG: ", new_rig,
        if new_rig < old_rig*tolerance:
            # Gain too small: undo the split and close this node so it
            # is never considered for expansion again.
            if pylib_basics.verbose():
                print " Node closed"
            self.subtrees = {}
            self.closed = True
            res = False
        else:
            if pylib_basics.verbose():
                print " Node expanded"
            res = True
        # Track the best relative information gain seen at the root.
        if new_rig > self.root.best_relinfogain:
            self.root.best_relinfogain = new_rig
        return res
Esempio n. 6
0
    def global_refine_search(self, maxsplit):
        """ 
        For each open leaf node in the tree find the extension that
        leads to the best tree and store it and its evaluation in
        node.best_part, node.eval. Return True if an open node was
        found.
        """        
        res = False
        if self.closed:
            return res
        if not self.isleaf():
            for i in self.subtrees.values():
                if i.global_refine_search(maxsplit):
                    res =  True
            return res

        old_rig = self.root.best_relinfogain        
        if pylib_basics.verbose():
            print "# Searching test for node {%s} (%ld examples) (current RIG: %f)"\
                  %(self.node_name,self.sample_size, old_rig)

        pg = pylib_ml_examples.partition_generator(self.sample, maxsplit)
        self.best_refine = None
        self.best_rig    = 0
        res = True
        while 1:
            try:
                part = pg.next()
                self.extend_leaf(part)
                entros  =  self.root.entropies()
                new_rig = entros[3]
                if new_rig > self.best_rig:
                    self.best_rig = new_rig
                    self.best_refine = self.subtrees
                self.subtrees = {}
            except StopIteration:
                break
        return res
Esempio n. 7
0
    def global_refine_apply(self, rig_limit):
        """
        Apply all tree refinedments that are good enough. Return True
        if any have been found.
        """
        res = False
        if self.closed:
            return res

        if not self.isleaf():
            for i in self.subtrees.values():                
                if i.global_refine_apply(rig_limit):
                    res = True
            return res

        if self.best_rig >= rig_limit and self.best_refine:
            self.subtrees = self.best_refine
            res = True
            if pylib_basics.verbose():
                print "# Extending ", self," with ", self.best_refine
        else:
            self.best_refine = None
        return res
Esempio n. 8
0
# Crossvalidation mode: build and evaluate one decision tree per fold.
if crossval:
    # Seed the RNG so the fold assignment is reproducible.
    random.seed(seed)
    # NOTE(review): "set" here is a project object (the example set),
    # not the builtin; crossval_sets() presumably yields
    # (training set, test set) pairs -- confirm against its definition.
    jobs = set.crossval_sets(crossval, stratified)
    tr_results = []    # per-fold training statistics
    te_results = []    # per-fold test statistics
    fold = 0
    for i in jobs:
        fold += 1
        # Build a decision tree from the fold's training part i[0].
        tree = dectree_constructor(i[0],
                                   entropy_compare_fun,
                                   relgain_limit,
                                   max_split)
        (tsize, tdepth, tleaves)                     = tree.characteristics()
        (apriori, remainder, absinfgain, relinfgain) = tree.entropies()
        # Resubstitution (training set) accuracy.
        (succ, count) = tree.classify_set(i[0],pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Fold %-2d RIG: %5.3f (%2d,%4d,%4d) Train: %4d out of %4d, %7.3f%% " %\
              (fold, relinfgain,tdepth, tsize, tleaves, succ, count, succ_percent),
        tr_results.append((succ,count,succ_percent,relinfgain, tdepth,
                           tsize, tleaves))
        
        # Accuracy on the held-out test part i[1].
        (succ, count) = tree.classify_set(i[1],pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Test: %4d out of %4d, %7.3f%%" % (succ, count, succ_percent)
        te_results.append((succ, count,succ_percent)) 

    # Column-wise extraction of per-fold statistics (Python 2 map
    # returns a list).
    tr_percent = (map(lambda x:x[2],tr_results))
    te_percent = (map(lambda x:x[2],te_results))
    rig        = (map(lambda x:x[3],tr_results))
    depths     = (map(lambda x:x[4],tr_results))
Esempio n. 9
0
# Crossvalidation mode: build and evaluate one decision tree per fold.
if crossval:
    # Seed the RNG so the fold assignment is reproducible.
    random.seed(seed)
    # NOTE(review): "set" here is a project object (the example set),
    # not the builtin; crossval_sets() presumably yields
    # (training set, test set) pairs -- confirm against its definition.
    jobs = set.crossval_sets(crossval, stratified)
    tr_results = []    # per-fold training statistics
    te_results = []    # per-fold test statistics
    fold = 0
    for i in jobs:
        fold += 1
        # Build a decision tree from the fold's training part i[0].
        tree = dectree_constructor(i[0],
                                   entropy_compare_fun,
                                   relgain_limit,
                                   max_split)
        (tsize, tdepth, tleaves)                     = tree.characteristics()
        (apriori, remainder, absinfgain, relinfgain) = tree.entropies()
        # Resubstitution (training set) accuracy.
        (succ, count) = tree.classify_set(i[0],pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Fold %-2d RIG: %5.3f (%2d,%4d,%4d) Train: %4d out of %4d, %7.3f%% " %\
              (fold, relinfgain,tdepth, tsize, tleaves, succ, count, succ_percent),
        tr_results.append((succ,count,succ_percent,relinfgain, tdepth,
                           tsize, tleaves))
        
        # Accuracy on the held-out test part i[1].
        (succ, count) = tree.classify_set(i[1],pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Test: %4d out of %4d, %7.3f%%" % (succ, count, succ_percent)
        te_results.append((succ, count,succ_percent)) 

    # Column-wise extraction of per-fold statistics (Python 2 map
    # returns a list).
    tr_percent = (map(lambda x:x[2],tr_results))
    te_percent = (map(lambda x:x[2],te_results))
    rig        = (map(lambda x:x[3],tr_results))
    depths     = (map(lambda x:x[4],tr_results))