def __init__(self, int_dm, real_dm, cat, tree_count=128, option_count=3, minimum_size=1, weight=None, index=None, callback=None, compress=False):
    """Constructs and trains the stochastic wood - basically all it is doing is constructing lots of trees, each with a different bootstrap sample of the input, and calculating the out-of-bag error estimate. The parameters are as follows:
    int_dm & real_dm - the data matrices, one for discrete attributes and one for continuous; you can set one to None if there are no attributes of that kind.
    cat - the category vector, aligned with the data matrices, where each category is represented by an integer.
    tree_count - the number of decision trees to create.
    option_count - the number of attributes to consider at each level of the decision trees - maps to the rand parameter of the DecTree class.
    minimum_size - nodes in the trees are not split further once they reach this size or smaller.
    weight - optionally weights the training examples; aligned with the data matrices.
    index - optionally selects which examples to use from the other matrices/vectors, and/or duplicates examples.
    callback - an optional function of the form (steps done, steps overall), used to report progress during construction.
    compress - if True trees are stored pickled and compressed, in a bid to make them consume less memory - this will obviously cripple classification speed unless multi_classify is used with suitably large blocks. It allows the algorithm to be run on larger quantities of data, but should only be used as a last resort."""
    # Generate weight/index vectors if not provided, and put in a dummy callback if needed, to avoid if statements later...
    if weight is None: weight = numpy.ones(cat.shape[0], dtype=numpy.float32)
    if index is None: index = numpy.arange(cat.shape[0], dtype=numpy.int32)
    if callback is None: callback = lambda a, b: None

    # Create the data structures used to calculate the oob error rate...
    oob_success = numpy.zeros(cat.shape[0], dtype=numpy.float32)
    oob_total = numpy.zeros(cat.shape[0], dtype=numpy.int32)

    # Iterate and create all the trees...
    self.trees = []
    for itr in xrange(tree_count):
        callback(itr, tree_count)

        # Select the bootstrap sample...
        b_ind = numpy.random.randint(index.shape[0], size=index.shape[0])
        b_ind.sort() # Should improve cache coherence slightly.
        bootstrap = index[b_ind]

        # Train the classifier...
        dt = DecTree(int_dm, real_dm, cat, weight, bootstrap, option_count, minimum_size)
        if compress:
            self.trees.append(bz2.compress(pickle.dumps(dt)))
        else:
            self.trees.append(dt)

        # Get the indices of the oob set - the examples the bootstrap did not draw...
        oob_set = numpy.ones(index.shape[0], dtype=numpy.bool_)
        oob_set[b_ind] = False
        oob_set = index[oob_set]

        # Store the oob info - each oob example accumulates the probability the tree assigns to its true category...
        for ind in oob_set:
            # Either data matrix may be None, so guard the row lookups...
            int_vec = int_dm[ind, :] if int_dm is not None else None
            real_vec = real_dm[ind, :] if real_dm is not None else None
            dist = dt.classify(int_vec, real_vec)
            if cat[ind] in dist:
                oob_success[ind] += float(dist[cat[ind]]) / float(sum(dist.itervalues()))
            oob_total[ind] += 1

    # Combine the oob info to calculate the error rate, being robust to a sample never being a member of the oob set...
    oob_total[oob_total==0] = 1
    self.success = (oob_success[index] / oob_total[index]).mean()

    del callback # Should not need this, but apparently I do.
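# Usage sketch (illustrative, not from the original source): constructing the
# forest and reading its out-of-bag success estimate. The class name 'SWood'
# and the progress callback are assumptions - substitute the real name of the
# class the __init__ above belongs to.
import numpy

real_dm = numpy.random.rand(100, 4).astype(numpy.float32) # 100 examples, 4 continuous attributes.
cat = (real_dm[:, 0] > 0.5).astype(numpy.int32) # Two categories, derived from the first attribute.

def progress(done, total):
    print '%i of %i trees built' % (done, total)

forest = SWood(None, real_dm, cat, tree_count=32, callback=progress)
print 'Estimated out-of-bag success rate: %.3f' % forest.success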
# Train the model...
dt = DecTree(None, dm, cat)
print 'Generated a tree with %i nodes' % dt.size()

# Test...
politician_test = 256
politician_success = 0
politician_unsure = 0
for i in xrange(politician_test):
    t = make_politician()
    dist = dt.classify(None, t)
    if 0 in dist: politician_success += 1
    if len(dist) > 1: politician_unsure += 1

print 'Of %i politicians %i (%.1f%%) were correctly detected, with %i uncertain.' % (politician_test, politician_success, 100.0 * politician_success / float(politician_test), politician_unsure)

marketing_test = 256
marketing_success = 0
marketing_unsure = 0
for i in xrange(marketing_test):
    t = make_marketing()
    dist = dt.classify(None, t)
    if 1 in dist: marketing_success += 1
    if len(dist) > 1: marketing_unsure += 1

print 'Of %i marketers %i (%.1f%%) were correctly detected, with %i uncertain.' % (marketing_test, marketing_success, 100.0 * marketing_success / float(marketing_test), marketing_unsure)
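# Hypothetical setup for the test above - make_politician(), make_marketing()
# and make_tele_sales() are referenced but not defined in this snippet, so the
# bodies below are illustrative only. The category assignments (0 = politician,
# 1 = marketing, 2 = tele-sales) match the checks in the test code.
import numpy

def make_politician():
    # Assumed: 4 continuous attributes drawn around a class-specific mean.
    return numpy.random.normal(0.2, 0.1, 4).astype(numpy.float32)

def make_marketing():
    return numpy.random.normal(0.5, 0.1, 4).astype(numpy.float32)

def make_tele_sales():
    return numpy.random.normal(0.8, 0.1, 4).astype(numpy.float32)

count = 768
dm = numpy.empty((count, 4), dtype=numpy.float32)
cat = numpy.empty(count, dtype=numpy.int32)
for i in xrange(count):
    r = numpy.random.rand()
    if r < 1.0 / 3.0:
        dm[i, :] = make_politician()
        cat[i] = 0
    elif r < 2.0 / 3.0:
        dm[i, :] = make_marketing()
        cat[i] = 1
    else:
        dm[i, :] = make_tele_sales()
        cat[i] = 2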
# Train the model...
dt = DecTree(dm, None, cat)
print 'Generated a tree with %i nodes' % dt.size()

# Test...
zombie_test = 256
zombie_success = 0
zombie_unsure = 0
for i in xrange(zombie_test):
    z = make_zombie()
    dist = dt.classify(z, None)
    if 0 in dist: zombie_success += 1
    if len(dist) > 1: zombie_unsure += 1

print 'Of %i zombies %i (%.1f%%) were correctly detected, with %i uncertain.' % (zombie_test, zombie_success, 100.0 * zombie_success / float(zombie_test), zombie_unsure)

human_test = 256
human_success = 0
human_unsure = 0
for i in xrange(human_test):
    h = make_human()
    dist = dt.classify(h, None)
    if 1 in dist: human_success += 1
    if len(dist) > 1: human_unsure += 1

print 'Of %i humans %i (%.1f%%) were correctly detected, with %i uncertain.' % (human_test, human_success, 100.0 * human_success / float(human_test), human_unsure)
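# Hypothetical setup for the zombie/human test above - make_zombie() and
# make_human() are referenced but not defined in this snippet, so the bodies
# below are illustrative only; here dm holds discrete (integer) attributes,
# since it is passed as int_dm, with 0 = zombie and 1 = human as in the test.
import numpy

def make_zombie():
    # Assumed: 3 binary discrete attributes; zombies skew towards value 1.
    return (numpy.random.rand(3) < 0.8).astype(numpy.int32)

def make_human():
    # Humans skew towards value 0.
    return (numpy.random.rand(3) < 0.2).astype(numpy.int32)

count = 512
dm = numpy.empty((count, 3), dtype=numpy.int32)
cat = numpy.empty(count, dtype=numpy.int32)
for i in xrange(count):
    if numpy.random.rand() < 0.5:
        dm[i, :] = make_zombie()
        cat[i] = 0
    else:
        dm[i, :] = make_human()
        cat[i] = 1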