def test_DecisionForest(): """Train a decision forest against an arbitrary formula, to see if it can approximate it to an arbitrary low error, given enough examples.""" def formula(x,y,z): return (x ** 2) + (x * y * z) + (10 * z) + (y / z) + 25 def random_input_output(): x = random.random() + 0.1 y = random.random() + 0.1 z = random.random() + 0.1 output = formula(x,y,z) return ({'x':x, 'y':y, 'z':z}, output) te = TrainingExamples() for i in xrange(1, 5000): (input, output) = random_input_output() te.add_example(input, output) if i % 500: continue print "Testing after", i, "training examples" forest = DecisionForest() forest.train(te, train_on_subset=True, num_trees=10, features_considered_per_node=3) # Measure the true out-of-sample error rate for the entire forest. predict_err = SummaryStats() for j in xrange(10000): (input, output) = random_input_output() predicted_output = forest.predict(input) predict_err.add((output - predicted_output) ** 2) print "avg squared error = ", predict_err.avg()
def test_DecisionTree_medium():
    """Check that a single decision tree can fit a simple linear formula."""
    # The target relationship the tree should learn.
    def formula(x, y, z):
        return x + y + z

    # Produce one random (x, y, z, output) tuple with inputs in [0.1, 1.1).
    def random_example():
        a = random.random() + 0.1
        b = random.random() + 0.1
        c = random.random() + 0.1
        return (a, b, c, formula(a, b, c))

    # Build a small training set of 20 examples.
    te = TrainingExamples()
    for _ in xrange(20):
        (x, y, z, output) = random_example()
        te.add_example({'x': x, 'y': y, 'z': z}, output)

    # Grow a single tree over every training example.
    tree = DecisionTree()
    for example in te.examples:
        tree.add_example(example)
    tree.grow_tree(features_considered_per_node=1)

    # With so few examples the tree should fit the training set nearly
    # perfectly (this checks in-sample error only).
    tree.print_tree()
    assert tree.avg_squared_error(te.examples) < 0.001
def test_DecisionTree_simple():
    """Smoke-test a decision tree on a one-feature data set."""
    # fat=1 always maps to output 1; fat=0 maps to 1 once and 0 three times.
    te = TrainingExamples()
    for output in (1, 1, 1, 1):
        te.add_example({'fat': 1}, output)
    for output in (1, 0, 0, 0):
        te.add_example({'fat': 0}, output)

    tree = DecisionTree()
    for example in te.examples:
        tree.add_example(example)
    tree.grow_tree(features_considered_per_node=1)
    tree.print_tree()

    # fat=1 should predict 1 exactly; fat=0 should predict roughly 1/4.
    assert tree.predict({'fat': 1}) >= 1.0
    assert tree.predict({'fat': 0}) <= 0.3
print "Prob output: mean=%1.3f std=%1.3f" % (prob_stats.avg(), prob_stats.std()) print "Histogram of prob output:", prob_hist return rank_accuracy import random class RandomBinaryClassifier(BinaryClassifier): def prob_output1(self, example): return random.random() if __name__ == "__main__": # data set where: x > 0 <==> output is 1 te = TrainingExamples() for x in xrange(1, 1000): te.add_example({"x": x}, 1) te.add_example({"x": -x}, 0) # A random classifier should have 50% area under the curve. random_binary_classifier = RandomBinaryClassifier() area_under_curve = random_binary_classifier.test(te) assert 0.45 <= area_under_curve <= 0.55 # A perfectly correct classifier should have 100% area under curve. class PerfectBinaryClassifier(BinaryClassifier): def prob_output1(self, example): if example["x"] > 0: return 1 else:
def test_NaiveBayesClassifier(): # Setup a training set where "fat" is the only input feature, and # the output value is whether they died of diabetes. # This is an easy training set where the input is almost always equal # to the output. te = TrainingExamples() te.add_example({ 'fat': 1}, 1) te.add_example({ 'fat': 1}, 1) te.add_example({ 'fat': 1}, 1) te.add_example({ 'fat': 1}, 1) te.add_example({ 'fat': 1}, 1) te.add_example({ 'fat': 1}, 1) te.add_example({ 'fat': 0}, 1) # almost always te.add_example({ 'fat': 0}, 0) te.add_example({ 'fat': 0}, 0) te.add_example({ 'fat': 0}, 0) te.add_example({ 'fat': 0}, 0) te.add_example({ 'fat': 0}, 0) te.add_example({ 'fat': 0}, 0) te.add_example({ 'fat': 1}, 0) # almost always classifier = NaiveBayesClassifier() classifier.train(te) assert classifier.prob_output1({ 'fat': 1}) > classifier.prob_output1({ 'fat': 0}) # let's see the underlying weights print classifier.prob_output1({ 'fat': 1}, explain=True) print classifier.prob_output1({ 'fat': 0}, explain=True)