Code Example #1
  def __init__(self, int_dm, real_dm, cat, tree_count = 128, option_count = 3, minimum_size = 1, weight = None, index = None, callback = None, compress = False):
    """Constructs and trains the stochastic wood - basically all its doing is constructing lots of trees, each with a different bootstrap sample of the input and calculating the out-of-bound error estimates. The parameters are as follows: int_dm & real_dm - the data matrices, one for discrete attributes and one for continuous; you can set one to None if there are none of that kind. cat - The category vector, aligned with the data matrices, where each category is represented by an integer. tree_count - The number of decision trees to create. option_count - The number of attributes to consider at each level of the decision trees - maps to the rand parameter of the DecTree class. minimum_size - Nodes in the trees do not suffer further splits once they are this size or smaller. weight - Optionally allows you to weight the trainning examples, aligned with data matrices. index - Using this you can optionally tell it which examples to use from the other matrices/vectors, and/or duplicate examples. callback - An optional function of the form (steps done,steps overall) used to report progress during construction. compress - if True trees are stored pickled and compressed, in a bid to make them consume less memory - this will obviously destroy classification performance unless multi_classify is used with suitably large blocks. Allows the algorithm to be run with larger quantities of data, but only use as a last resort."""
    
    # Generate weight/index vectors if not provided, and also put in a dummy callback if needed to avoid if statements...
    if weight is None: weight = numpy.ones(cat.shape[0], dtype=numpy.float32)
    if index is None: index = numpy.arange(cat.shape[0], dtype=numpy.int32)
    if callback is None: callback = lambda a, b: None

    # Create data structure to calculate the oob error rate...
    oob_success = numpy.zeros(cat.shape[0], dtype=numpy.float32)
    oob_total = numpy.zeros(cat.shape[0], dtype=numpy.int32)

    # Iterate and create all the trees...
    self.trees = []
    for itr in xrange(tree_count):
      callback(itr, tree_count)

      # Select the bootstrap sample...
      b_ind = numpy.random.randint(index.shape[0], size=index.shape[0])
      b_ind.sort() # Should improve cache coherence slightly.
      bootstrap = index[b_ind]

      # Train the classifier...
      dt = DecTree(int_dm, real_dm, cat, weight, bootstrap, option_count, minimum_size)
      if compress: self.trees.append(bz2.compress(pickle.dumps(dt)))
      else: self.trees.append(dt)

      # Get the indices of the oob set...
      oob_set = numpy.ones(index.shape[0], dtype=numpy.bool_)
      oob_set[b_ind] = False
      oob_set = index[oob_set]

      # Store the oob info...
      for ind in oob_set:
        dist = dt.classify(int_dm[ind,:], real_dm[ind,:])
        if cat[ind] in dist:
          oob_success[ind] += float(dist[cat[ind]]) / float(sum(dist.itervalues()))
        oob_total[ind] += 1

    # Combine the oob info to calculate the error rate, including being robust to a sample never being a member of the oob set...
    oob_total[oob_total==0] = 1
    self.success = (oob_success[index] / oob_total[index]).mean()

    del callback # Should not need this, but apparently I do.
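
The constructor above is the entire training entry point, so a minimal usage sketch follows. It is only a sketch: the enclosing class name SWood and module swood are assumptions (the snippet does not show the class statement), and the progress callback is purely illustrative - only the argument order, the keyword names and the self.success attribute come from the code above. The training data follows the generate_train pattern of Code Example #5.

from swood import SWood  # Assumed module/class name - the snippet above omits the class statement.
import test_model as mod

# Training data, as produced in Code Example #5...
int_dm, real_dm, cats, weights = mod.generate_train()

# Illustrative progress callback, matching the (steps done, steps overall) form...
def report(done, total):
  print 'Trained %i of %i trees'%(done, total)

# Train a forest of 64 trees, considering 3 attributes per decision;
# compress is left False - only set it if memory is genuinely tight...
sw = SWood(int_dm, real_dm, cats, tree_count=64, option_count=3, minimum_size=1, weight=weights, callback=report)

# The out-of-bag estimate of the classification success rate...
print 'Out-of-bag success rate = %.3f'%sw.success
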
Code Example #2
for i in xrange(total_count):
  if i<politician_count:
    dm[i,:] = make_politician()
    cat[i] = 0
  elif i<(politician_count+marketing_count):
    dm[i,:] = make_marketing()
    cat[i] = 1
  else:
    dm[i,:] = make_tele_sales()
    cat[i] = 2



# Train the model...
dt = DecTree(None, dm, cat)
print 'Generated a tree with %i nodes'%dt.size()



# Test...
politician_test = 256
politician_success = 0
politician_unsure = 0
for i in xrange(politician_test):
  t = make_politician()
  dist = dt.classify(None,t)
  if 0 in dist: politician_success += 1
  if len(dist)>1: politician_unsure += 1

print 'Of %i politicians %i (%.1f%%) were correctly detected, with %i uncertain.'%(politician_test, politician_success, 100.0*politician_success/float(politician_test), politician_unsure)
Code Example #3
dm = numpy.empty((total_count, len(attributes)), dtype=numpy.int32)
cat = numpy.empty(total_count, dtype=numpy.int32)

for i in xrange(total_count):
  if i<zombie_count:
    dm[i,:] = make_zombie()
    cat[i] = 0
  else:
    dm[i,:] = make_human()
    cat[i] = 1



# Train the model...
dt = DecTree(dm, None, cat)
print 'Generated a tree with %i nodes'%dt.size()



# Test...
zombie_test = 256
zombie_success = 0
zombie_unsure = 0
for i in xrange(zombie_test):
  z = make_zombie()
  dist = dt.classify(z,None)
  if 0 in dist: zombie_success += 1
  if len(dist)>1: zombie_unsure += 1

print 'Of %i zombies %i (%.1f%%) were correctly detected, with %i uncertain.'%(zombie_test, zombie_success, 100.0*zombie_success/float(zombie_test), zombie_unsure)
Code Example #4
File: test_tree_continuous.py  Project: zoginni/helit
dm = numpy.empty((total_count, feat_length), dtype=numpy.float32)
cat = numpy.empty(total_count, dtype=numpy.int32)

for i in xrange(total_count):
    if i < politician_count:
        dm[i, :] = make_politician()
        cat[i] = 0
    elif i < (politician_count + marketing_count):
        dm[i, :] = make_marketing()
        cat[i] = 1
    else:
        dm[i, :] = make_tele_sales()
        cat[i] = 2

# Train the model...
dt = DecTree(None, dm, cat)
print 'Generated a tree with %i nodes' % dt.size()

# Test...
politician_test = 256
politician_success = 0
politician_unsure = 0
for i in xrange(politician_test):
    t = make_politician()
    dist = dt.classify(None, t)
    if 0 in dist: politician_success += 1
    if len(dist) > 1: politician_unsure += 1

print 'Of %i politicians %i (%.1f%%) were correctly detected, with %i uncertain.' % (
    politician_test, politician_success,
    100.0 * politician_success / float(politician_test), politician_unsure)
Code Example #5
#! /usr/bin/env python

# Copyright 2011 Tom SF Haines

# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

#   http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

from dec_tree import DecTree
import test_model as mod

# Same as test_tree_model, but includes randomisation of attribute selection...

# Get training data...
int_dm, real_dm, cats, weights = mod.generate_train()

# Train...
dt = DecTree(int_dm, real_dm, cats, weights, rand=3)

# Test...
mod.test(dt.classify)
Code Example #6
File: test_tree_discrete.py  Project: zoginni/helit
human_count = 16
total_count = zombie_count + human_count

dm = numpy.empty((total_count, len(attributes)), dtype=numpy.int32)
cat = numpy.empty(total_count, dtype=numpy.int32)

for i in xrange(total_count):
    if i < zombie_count:
        dm[i, :] = make_zombie()
        cat[i] = 0
    else:
        dm[i, :] = make_human()
        cat[i] = 1

# Train the model...
dt = DecTree(dm, None, cat)
print 'Generated a tree with %i nodes' % dt.size()

# Test...
zombie_test = 256
zombie_success = 0
zombie_unsure = 0
for i in xrange(zombie_test):
    z = make_zombie()
    dist = dt.classify(z, None)
    if 0 in dist: zombie_success += 1
    if len(dist) > 1: zombie_unsure += 1

print 'Of %i zombies %i (%.1f%%) were correctly detected, with %i uncertain.' % (
    zombie_test, zombie_success, 100.0 * zombie_success / float(zombie_test),
    zombie_unsure)