from types import IntType  # Python 2; needed for the type check below

import pylib_basics


def find_best_partition(examples, compare_fun, max_splits):
    assert type(max_splits) == IntType
    best_relinfgain = -1
    best_absinfgain = -1
    best_part       = None
    if len(examples) > 1:
        for i in range(0, examples.feature_no):
            (relinfgain, absinfgain, part) = \
                find_best_feature_partition(examples, compare_fun,
                                            i, max_splits)
            if pylib_basics.verbose():
                print "# Evaluating feature %d: %1.6f, %1.6f "\
                      % (i, relinfgain, absinfgain),
            if part:
                if pylib_basics.verbose():
                    print part.abstracter
            else:
                if pylib_basics.verbose():
                    print "# No split possible, feature is homogeneous:",
                    print examples.get_distinct_feature_values(i)
            if compare_fun((relinfgain, absinfgain),
                           (best_relinfgain, best_absinfgain)) > 0:
                best_relinfgain = relinfgain
                best_absinfgain = absinfgain
                best_part       = part
    return (best_relinfgain, best_absinfgain, best_part)
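# A minimal sketch of a comparator that find_best_partition() accepts
# as compare_fun. The comparators actually used by the library are
# defined elsewhere; this hypothetical lexicographic_gain_compare()
# only illustrates the expected interface: it receives two
# (relinfgain, absinfgain) tuples and returns a positive, zero, or
# negative value, in the style of Python 2's built-in cmp().
def lexicographic_gain_compare(gains1, gains2):
    # Prefer higher relative information gain, break ties on the
    # absolute information gain.
    res = cmp(gains1[0], gains2[0])
    if res == 0:
        res = cmp(gains1[1], gains2[1])
    return res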
def info_gain(a_priori_entropy, remainder_entropy):
    """
    Compute the expected information gain, given the a priori entropy
    of a distribution and the remainder entropy.
    """
    # assert a_priori_entropy >= remainder_entropy
    if a_priori_entropy < remainder_entropy:
        # Allow for rounding errors
        if pylib_basics.verbose():
            print "Warning: Remainder entropy > a priori entropy",
            print remainder_entropy, ">", a_priori_entropy
        remainder_entropy = a_priori_entropy
    return a_priori_entropy - remainder_entropy
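# A minimal usage sketch for info_gain(). The entropy() helper is
# hypothetical (not part of this module) and only shows where the two
# arguments come from: a 50/50 class split has an a priori entropy of
# one bit, and a test that produces two pure halves leaves a remainder
# entropy of zero, so the full bit is gained.
def _info_gain_example():
    from math import log

    def entropy(probs):
        # Shannon entropy (in bits) of a discrete distribution.
        return sum([-p * log(p, 2) for p in probs if p > 0])

    apriori   = entropy([0.5, 0.5])                           # 1.0 bit
    remainder = 0.5 * entropy([1.0]) + 0.5 * entropy([1.0])   # 0.0 bits
    return info_gain(apriori, remainder)                      # -> 1.0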
    def refine(self, entropy_compare_fun, tolerance, maxsplit):
        """
        Expand a tree node one level. This only affects self.subtrees.
        Return True if the tree could be expanded with an entropic
        win, False otherwise.
        """
        if self.closed:
            return False
        if not self.isleaf():
            res = False
            for i in self.subtrees.values():
                if i.refine(entropy_compare_fun, tolerance, maxsplit):
                    res = True
            return res
        old_rig = self.root.best_relinfogain
        if pylib_basics.verbose():
            print "# Searching test for node {%s} (%ld examples) (current RIG: %f)"\
                  % (self.node_name, self.sample_size, old_rig)
        (relinfgain, absinfgain, part) = \
            pylib_ml_examples.find_best_partition(self.sample,
                                                  entropy_compare_fun,
                                                  maxsplit)
        if not part:
            if pylib_basics.verbose():
                print "# No possible split found"
            return False
        if pylib_basics.verbose():
            print "# Best partition: ", part.abstracter
        self.extend_leaf(part)
        entros = self.root.entropies()
        new_rig = entros[3]
        if pylib_basics.verbose():
            print "# New RIG: ", new_rig,
        if new_rig < old_rig * tolerance:
            if pylib_basics.verbose():
                print " Node closed"
            self.subtrees = {}
            self.closed   = True
            res = False
        else:
            if pylib_basics.verbose():
                print " Node expanded"
            res = True
        if new_rig > self.root.best_relinfogain:
            self.root.best_relinfogain = new_rig
        return res
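    # A sketch of how refine() is typically driven (the surrounding
    # tree-construction code is assumed here, not quoted): starting
    # from a one-leaf tree, refinement is repeated until no node can
    # be expanded with a sufficient entropic win, i.e. until refine()
    # returns False for the whole tree:
    #
    #   while tree.refine(entropy_compare_fun, tolerance, maxsplit):
    #       pass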
    def global_refine_search(self, maxsplit):
        """
        For each open leaf node in the tree, find the extension that
        leads to the best tree and store it and its evaluation in
        node.best_refine and node.best_rig. Return True if an open
        node was found.
        """
        res = False
        if self.closed:
            return res
        if not self.isleaf():
            for i in self.subtrees.values():
                if i.global_refine_search(maxsplit):
                    res = True
            return res
        old_rig = self.root.best_relinfogain
        if pylib_basics.verbose():
            print "# Searching test for node {%s} (%ld examples) (current RIG: %f)"\
                  % (self.node_name, self.sample_size, old_rig)
        pg = pylib_ml_examples.partition_generator(self.sample, maxsplit)
        self.best_refine = None
        self.best_rig    = 0
        res = True
        while 1:
            try:
                part = pg.next()
                # Tentatively apply the split, measure the resulting
                # relative information gain, then undo the split.
                self.extend_leaf(part)
                entros = self.root.entropies()
                new_rig = entros[3]
                if new_rig > self.best_rig:
                    self.best_rig    = new_rig
                    self.best_refine = self.subtrees
                self.subtrees = {}
            except StopIteration:
                break
        return res
    def global_refine_apply(self, rig_limit):
        """
        Apply all tree refinements that are good enough. Return True
        if any have been found.
        """
        res = False
        if self.closed:
            return res
        if not self.isleaf():
            for i in self.subtrees.values():
                if i.global_refine_apply(rig_limit):
                    res = True
            return res
        if self.best_rig >= rig_limit and self.best_refine:
            self.subtrees = self.best_refine
            res = True
            if pylib_basics.verbose():
                print "# Extending ", self, " with ", self.best_refine
        else:
            self.best_refine = None
        return res
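    # A sketch of the intended interplay of the two methods above (the
    # actual driver code is assumed, not quoted here):
    # global_refine_search() evaluates the best extension for every
    # open leaf, and global_refine_apply() then commits all extensions
    # whose relative information gain reaches rig_limit; the process
    # repeats until nothing more can be applied:
    #
    #   while tree.global_refine_search(maxsplit):
    #       if not tree.global_refine_apply(rig_limit):
    #           break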
if crossval:
    random.seed(seed)
    jobs = set.crossval_sets(crossval, stratified)
    tr_results = []
    te_results = []
    fold = 0
    for i in jobs:
        fold += 1
        tree = dectree_constructor(i[0], entropy_compare_fun,
                                   relgain_limit, max_split)
        (tsize, tdepth, tleaves) = tree.characteristics()
        (apriori, remainder, absinfgain, relinfgain) = tree.entropies()
        (succ, count) = tree.classify_set(i[0], pylib_basics.verbose())
        succ_percent = float(succ) / count * 100
        print "Fold %-2d RIG: %5.3f (%2d,%4d,%4d) Train: %4d out of %4d, %7.3f%% " %\
              (fold, relinfgain, tdepth, tsize, tleaves, succ, count, succ_percent),
        tr_results.append((succ, count, succ_percent, relinfgain,
                           tdepth, tsize, tleaves))
        (succ, count) = tree.classify_set(i[1], pylib_basics.verbose())
        succ_percent = float(succ) / count * 100
        print "Test: %4d out of %4d, %7.3f%%" % (succ, count, succ_percent)
        te_results.append((succ, count, succ_percent))
    tr_percent = map(lambda x: x[2], tr_results)
    te_percent = map(lambda x: x[2], te_results)
    rig        = map(lambda x: x[3], tr_results)
    depths     = map(lambda x: x[4], tr_results)
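    # A hedged sketch of how the per-fold lists collected above could
    # be summarized; mean() and stddev() are defined inline for
    # illustration, the library may provide its own equivalents.
    def mean(l):
        return sum(l) / float(len(l))

    def stddev(l):
        m = mean(l)
        return (sum([(x - m) ** 2 for x in l]) / float(len(l))) ** 0.5

    print "Train: %7.3f%% +/-%6.3f  RIG: %5.3f  Depth: %4.1f" % \
          (mean(tr_percent), stddev(tr_percent), mean(rig), mean(depths))
    print "Test : %7.3f%% +/-%6.3f" % \
          (mean(te_percent), stddev(te_percent))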