Ejemplo n.º 1
0
  def __init__(self, int_dm, real_dm, cat, tree_count = 128, option_count = 3, minimum_size = 1, weight = None, index = None, callback = None, compress = False):
    """Constructs and trains the stochastic wood (a random forest): builds tree_count decision trees, each on a different bootstrap sample of the input, and accumulates the out-of-bag (oob) error estimate as it goes.

    int_dm & real_dm - the data matrices, one for discrete attributes and one for continuous; either may be None if there are no attributes of that kind.
    cat - category vector, aligned with the data matrices; each category is an integer.
    tree_count - number of decision trees to create.
    option_count - number of attributes considered at each level of each tree; maps to the rand parameter of DecTree.
    minimum_size - nodes this size or smaller are not split further.
    weight - optional per-example training weights, aligned with the data matrices.
    index - optional selection (and/or duplication) of which examples to use.
    callback - optional function (steps done, steps overall) for progress reporting.
    compress - if True trees are stored pickled+bz2-compressed to save memory; this destroys classification speed unless multi_classify is used with suitably large blocks. Last resort only."""

    # Default the weight/index vectors and install a no-op callback. Use
    # `is None` - weight/index are numpy arrays, for which == is elementwise.
    if weight is None: weight = numpy.ones(cat.shape[0], dtype=numpy.float32)
    if index is None: index = numpy.arange(cat.shape[0], dtype=numpy.int32)
    if callback is None: callback = lambda a, b: None

    # Accumulators for the out-of-bag error rate, indexed by example...
    oob_success = numpy.zeros(cat.shape[0], dtype=numpy.float32)
    oob_total = numpy.zeros(cat.shape[0], dtype=numpy.int32)

    # Iterate and create all the trees...
    self.trees = []
    for itr in xrange(tree_count):
      callback(itr, tree_count)

      # Draw the bootstrap sample (with replacement)...
      b_ind = numpy.random.randint(index.shape[0], size=index.shape[0])
      b_ind.sort() # Should improve cache coherence slightly.
      bootstrap = index[b_ind]

      # Train the classifier...
      dt = DecTree(int_dm, real_dm, cat, weight, bootstrap, option_count, minimum_size)
      if compress: self.trees.append(bz2.compress(pickle.dumps(dt)))
      else: self.trees.append(dt)

      # The examples NOT drawn into the bootstrap form the oob set...
      oob_set = numpy.ones(index.shape[0], dtype=numpy.bool_)
      oob_set[b_ind] = False
      oob_set = index[oob_set]

      # Record this tree's performance on its oob set. Either data matrix
      # may be None (as documented above), so guard the row extraction...
      for ind in oob_set:
        int_fv = int_dm[ind,:] if int_dm is not None else None
        real_fv = real_dm[ind,:] if real_dm is not None else None
        dist = dt.classify(int_fv, real_fv)
        if cat[ind] in dist:
          oob_success[ind] += float(dist[cat[ind]]) / float(sum(dist.values()))
        oob_total[ind] += 1

    # Combine the oob info to calculate the success rate; clamp totals so a
    # sample that was never in any oob set does not cause division by zero...
    oob_total[oob_total==0] = 1
    self.success = (oob_success[index] / oob_total[index]).mean()

    del callback # Should not need this, but apparently I do.
Ejemplo n.º 2
0


# Train the model...
dt = DecTree(None, dm, cat)
print 'Generated a tree with %i nodes'%dt.size()



# Test...
politician_test = 256
politician_success = 0
politician_unsure = 0
for i in xrange(politician_test):
  t = make_politician()
  dist = dt.classify(None,t)
  if 0 in dist: politician_success += 1
  if len(dist)>1: politician_unsure += 1

print 'Of %i politicians %i (%.1f%%) were correctly detected, with %i uncertain.'%(politician_test, politician_success, 100.0*politician_success/float(politician_test), politician_unsure)

marketing_test = 256
marketing_success = 0
marketing_unsure = 0
for i in xrange(marketing_test):
  t = make_marketing()
  dist = dt.classify(None,t)
  if 1 in dist: marketing_success += 1
  if len(dist)>1: marketing_unsure += 1

print 'Of %i marketers %i (%.1f%%) were correctly detected, with %i uncertain.'%(marketing_test, marketing_success, 100.0*marketing_success/float(marketing_test), marketing_unsure)
Ejemplo n.º 3
0


# Train the model...
dt = DecTree(dm, None, cat)
print 'Generated a tree with %i nodes'%dt.size()



# Test...
zombie_test = 256
zombie_success = 0
zombie_unsure = 0
for i in xrange(zombie_test):
  z = make_zombie()
  dist = dt.classify(z,None)
  if 0 in dist: zombie_success += 1
  if len(dist)>1: zombie_unsure += 1

print 'Of %i zombies %i (%.1f%%) were correctly detected, with %i uncertain.'%(zombie_test, zombie_success, 100.0*zombie_success/float(zombie_test), zombie_unsure)
  
human_test = 256
human_success = 0
human_unsure = 0
for i in xrange(human_test):
  h = make_human()
  dist = dt.classify(h,None)
  if 1 in dist: human_success += 1
  if len(dist)>1: human_unsure += 1

print 'Of %i humans %i (%.1f%%) were correctly detected, with %i uncertain.'%(human_test, human_success, 100.0*human_success/float(human_test), human_unsure)
Ejemplo n.º 4
0
        cat[i] = 1
    else:
        dm[i, :] = make_tele_sales()
        cat[i] = 2

# Train the model...
dt = DecTree(None, dm, cat)
print 'Generated a tree with %i nodes' % dt.size()

# Test...
politician_test = 256
politician_success = 0
politician_unsure = 0
for i in xrange(politician_test):
    t = make_politician()
    dist = dt.classify(None, t)
    if 0 in dist: politician_success += 1
    if len(dist) > 1: politician_unsure += 1

print 'Of %i politicians %i (%.1f%%) were correctly detected, with %i uncertain.' % (
    politician_test, politician_success,
    100.0 * politician_success / float(politician_test), politician_unsure)

marketing_test = 256
marketing_success = 0
marketing_unsure = 0
for i in xrange(marketing_test):
    t = make_marketing()
    dist = dt.classify(None, t)
    if 1 in dist: marketing_success += 1
    if len(dist) > 1: marketing_unsure += 1
Ejemplo n.º 5
0
        cat[i] = 0
    else:
        dm[i, :] = make_human()
        cat[i] = 1

# Train the model...
dt = DecTree(dm, None, cat)
print 'Generated a tree with %i nodes' % dt.size()

# Test...
zombie_test = 256
zombie_success = 0
zombie_unsure = 0
for i in xrange(zombie_test):
    z = make_zombie()
    dist = dt.classify(z, None)
    if 0 in dist: zombie_success += 1
    if len(dist) > 1: zombie_unsure += 1

print 'Of %i zombies %i (%.1f%%) were correctly detected, with %i uncertain.' % (
    zombie_test, zombie_success, 100.0 * zombie_success / float(zombie_test),
    zombie_unsure)

human_test = 256
human_success = 0
human_unsure = 0
for i in xrange(human_test):
    h = make_human()
    dist = dt.classify(h, None)
    if 1 in dist: human_success += 1
    if len(dist) > 1: human_unsure += 1