import random

# TrainingExamples, DecisionForest, DecisionTree, SummaryStats,
# NaiveBayesClassifier and BinaryClassifier are assumed to be importable from
# the surrounding project; only the standard-library import is shown here.


def test_DecisionForest():
    """Train a decision forest against an arbitrary formula, to see if it can
    approximate it to an arbitrary low error, given enough examples."""
    def formula(x,y,z):
        return (x ** 2) + (x * y * z) + (10 * z) + (y / z) + 25

    def random_input_output():
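        # Offset each input away from zero so the y/z term in the formula can
        # never divide by zero.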
        x = random.random() + 0.1
        y = random.random() + 0.1
        z = random.random() + 0.1
        output = formula(x,y,z)
        return ({'x':x, 'y':y, 'z':z}, output)


    te = TrainingExamples()
    for i in xrange(1, 5001):
        (input, output) = random_input_output()
        te.add_example(input, output)

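        # Only retrain and evaluate the forest once every 500 examples.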
        if i % 500: continue

        print "Testing after", i, "training examples"
        forest = DecisionForest()
        forest.train(te, train_on_subset=True, num_trees=10, features_considered_per_node=3)

        # Estimate the forest's true out-of-sample error as mean squared error
        # over freshly generated examples.
        predict_err = SummaryStats()
        for j in xrange(10000):
            (input, output) = random_input_output()
            predicted_output = forest.predict(input)
            predict_err.add((output - predicted_output) ** 2)
        print "avg squared error = ", predict_err.avg()
def test_DecisionTree_medium():
    # come up with some input->output formula to see if we can learn it.
    def formula(x,y,z):
        return x + y + z

    # helper function to generate input->output examples
    def random_example():
        x = random.random() + 0.1
        y = random.random() + 0.1
        z = random.random() + 0.1
        output = formula(x,y,z)
        return (x,y,z,output)

    # generate a training set
    te = TrainingExamples()
    for i in xrange(20):
        (x,y,z,output) = random_example()
        te.add_example({'x':x, 'y':y, 'z':z}, output)

    # learn a decision tree based on training examples
    tree = DecisionTree()
    for example in te.examples:
        tree.add_example(example)
    tree.grow_tree(features_considered_per_node=1)

    # check that we learned the training set with enough accuracy
    tree.print_tree()
    assert tree.avg_squared_error(te.examples) < 0.001


def test_DecisionTree_simple():
    te = TrainingExamples()
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 0}, 1)
    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 0}, 0)

    tree = DecisionTree()
    for example in te.examples:
        tree.add_example(example)
    tree.grow_tree(features_considered_per_node=1)
    tree.print_tree()

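    # With a single split on 'fat', the fat=1 leaf averages 1.0 and the fat=0
    # leaf averages 1/4 = 0.25, so the predictions should land near those values.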
    assert tree.predict({'fat': 1}) >= 1.0
    assert tree.predict({'fat': 0}) <= 0.3


# Example #4

            print "Prob output: mean=%1.3f std=%1.3f" % (prob_stats.avg(), prob_stats.std())
            print "Histogram of prob output:", prob_hist
        return rank_accuracy


import random


class RandomBinaryClassifier(BinaryClassifier):
    def prob_output1(self, example):
        return random.random()
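

# BinaryClassifier.test() is expected to report the area under the ROC curve
# (AUC) for a data set, which is what the assertions in the block below check.
# Its real implementation lives elsewhere in the project; the function here is
# only a sketch of one standard way to get AUC from scored examples, via the
# rank-sum (Mann-Whitney) formulation.  The name and argument format are made
# up for illustration.  For example,
# rank_auc_sketch([(0.9, 1), (0.2, 0), (0.7, 1), (0.4, 0)]) comes out to 1.0.
def rank_auc_sketch(scored_examples):
    """AUC from a list of (predicted_probability, true_label) pairs.

    Ties between scores get no special handling, which is fine for the
    continuous random scores used here.
    """
    ranked = sorted(scored_examples)  # ascending by predicted probability
    num_pos = sum(1 for (_, label) in ranked if label == 1)
    num_neg = len(ranked) - num_pos
    # Sum of the 1-based ranks held by the positive examples.
    pos_rank_sum = sum(rank for (rank, (_, label)) in enumerate(ranked, 1)
                       if label == 1)
    # Mann-Whitney U statistic, normalized into [0, 1].
    return (pos_rank_sum - num_pos * (num_pos + 1) / 2.0) / float(num_pos * num_neg)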


if __name__ == "__main__":
    # data set where:  x > 0 <==> output is 1
    te = TrainingExamples()
    for x in xrange(1, 1000):
        te.add_example({"x": x}, 1)
        te.add_example({"x": -x}, 0)

    # A random classifier should have 50% area under the curve.
    random_binary_classifier = RandomBinaryClassifier()
    area_under_curve = random_binary_classifier.test(te)
    assert 0.45 <= area_under_curve <= 0.55

    # A perfectly correct classifier should have 100% area under curve.
    class PerfectBinaryClassifier(BinaryClassifier):
        def prob_output1(self, example):
            if example["x"] > 0:
                return 1
            else:
                return 0

    perfect_binary_classifier = PerfectBinaryClassifier()
    area_under_curve = perfect_binary_classifier.test(te)
    assert area_under_curve >= 0.99


# Example #5


def test_NaiveBayesClassifier():
    # Set up a training set where "fat" is the only input feature and the
    # output is whether the person died of diabetes.  This is an easy training
    # set: the input is almost always equal to the output.
    te = TrainingExamples()
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 1}, 1)
    te.add_example({ 'fat': 0}, 1)  # the one exception among the positives

    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 0}, 0)
    te.add_example({ 'fat': 1}, 0)  # the one exception among the negatives

    classifier = NaiveBayesClassifier()
    classifier.train(te)

    assert classifier.prob_output1({ 'fat': 1}) > classifier.prob_output1({ 'fat': 0})
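
    # By Bayes' rule on these counts (ignoring any smoothing the classifier may
    # apply): P(output=1 | fat=1) = (6/7 * 1/2) / (6/7 * 1/2 + 1/7 * 1/2) = 6/7,
    # roughly 0.86, while the same calculation for fat=0 gives 1/7, roughly 0.14.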

    # let's see the underlying weights
    print classifier.prob_output1({ 'fat': 1}, explain=True)
    print classifier.prob_output1({ 'fat': 0}, explain=True)