Example #1
def test_DecisionForest():
    """Train a decision forest against an arbitrary formula, to see if it can
    approximate it to an arbitrary low error, given enough examples."""
    def formula(x,y,z):
        return (x ** 2) + (x * y * z) + (10 * z) + (y / z) + 25

    def random_input_output():
        x = random.random() + 0.1
        y = random.random() + 0.1
        z = random.random() + 0.1
        output = formula(x,y,z)
        return ({'x':x, 'y':y, 'z':z}, output)


    te = TrainingExamples()
    for i in xrange(1, 5001):
        (input, output) = random_input_output()
        te.add_example(input, output)

        if i % 500: continue

        print "Testing after", i, "training examples"
        forest = DecisionForest()
        forest.train(te, train_on_subset=True, num_trees=10, features_considered_per_node=3)

        # Measure the true out-of-sample error rate for the entire forest.
        predict_err = SummaryStats()
        for j in xrange(10000):
            (input, output) = random_input_output()
            predicted_output = forest.predict(input)
            predict_err.add((output - predicted_output) ** 2)
        print "avg squared error = ", predict_err.avg()
Example #2
    def train(self, training_examples, train_on_subset=True, num_trees=100,
              features_considered_per_node=2, **kwds):
        print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % (
                num_trees, len(training_examples), features_considered_per_node)
        self.trees = []
        total_test_output_stats = SummaryStats()

        binary_classification = all(example["_OUTPUT"] in [0,1] for example in training_examples)

        for tree_i in xrange(1, num_trees+1):
            tree = DecisionTree()
            self.trees.append(tree)

            test_set_ids = set(xrange(len(training_examples)))
            for i in xrange(len(training_examples)):
                if train_on_subset:  # N samples with replacement ("bootstrap")
                    index = random.randint(0, len(training_examples)-1)
                else:
                    index = i

                tree.add_example(training_examples[index])
                test_set_ids.discard(index)

            print "Growing tree %d/%d ..." % (tree_i, num_trees),
            tree.grow_tree(features_considered_per_node=features_considered_per_node)

            # Report the in-sample training error
            if binary_classification:
                print "area-under-curve for %d training examples is %2.2f" % (
                        len(tree.examples), tree.test(tree.examples, print_level=0))
            else:
                print "%2.2f avg err^2 on %d training examples" % (
                        tree.avg_squared_error(), len(tree.examples)),


            # Report the out-of-sample testing error, if we have any out-of-sample
            # examples to test on.
            if train_on_subset:
                print "; ",
                test_set = [training_examples[i] for i in test_set_ids]

                if binary_classification:
                    # Do a true out-of-sample test just on this one tree
                    # Temporarily make this a forest-of-one-tree...
                    save_trees = self.trees
                    self.trees = [tree]
                    self.test(test_set)
                    self.trees = save_trees
                else:
                    avg_squared_error = tree.avg_squared_error(test_set)
                    total_test_output_stats.add(avg_squared_error)

                    print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. for all %d trees so far]" % (len(test_set), avg_squared_error, total_test_output_stats.avg(), tree_i),

            print
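
Sampling N times with replacement leaves any given example out of a tree's training set with probability (1 - 1/N)^N, which approaches 1/e (about 36.8%) for large N; that is why train_on_subset reliably produces a non-empty out-of-sample test set for every tree. A quick standalone check of that figure:

import random

n = 10000
drawn = set(random.randint(0, n - 1) for _ in xrange(n))
print "out-of-bag fraction:", 1.0 - len(drawn) / float(n)  # ~0.368 = 1/e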
Example #3
    def avg_squared_error(self, examples=None):
        "Returns the average squared error of the predicted output."
        if examples is None:
            examples = self.examples  # by default, use the training examples
        output_stats = SummaryStats()
        for example in examples:
            prediction = self.predict(example)
            output_stats.add((prediction - float(example["_OUTPUT"])) ** 2)
        return output_stats.avg()
Example #4
    def avg_squared_error(self, examples):
        """Returns the average squared error of the predicted output.

        This is useful for determining whether the DecisionForest was able to
        learn the training set completely. If the training error is very high,
        the forest was not powerful enough to learn the training set (and
        perhaps features_considered_per_node should be increased).

        In theory, a DecisionForest should never overfit the training data,
        because each tree was trained on a random bootstrap sample of the
        training examples.
        """
        errors = SummaryStats()
        for example in examples:
            prediction = self.predict(example)
            squared_error = (prediction - float(example["_OUTPUT"])) ** 2
            errors.add(squared_error)
        return errors.avg()
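
As a usage sketch, assuming the forest and helpers from Example #1 are in scope (and that TrainingExamples iterates over example dicts, as DecisionForest.train above assumes), comparing in-sample and out-of-sample error looks like:

# In-sample error on the training set itself.
print "train err^2:", forest.avg_squared_error(te)

# Out-of-sample error on freshly generated examples.
test_examples = []
for _ in xrange(1000):
    (input, output) = random_input_output()
    input["_OUTPUT"] = output
    test_examples.append(input)
print "test err^2:", forest.avg_squared_error(test_examples)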
Example #5
    def get(self):

        user = users.get_current_user()

        # We use app.yaml to configure overall authentication
        if not validate_user(user.email()):
            self.redirect(users.create_login_url(self.request.uri))
            return

        start = time.time()

        key = self.request.get("key")

        gc_data = gc_datastore.get_data(key)
        #need to sort results

        # We regenerate summary stats with each invocation
        summary_stats = SummaryStats(gc_data).stats

        q = db.GqlQuery("SELECT * FROM GraphModel " +
                "WHERE ANCESTOR IS :1 ", key)

        results = q.fetch(6)

        for entry in results:
            if entry.graph_type == graph.RAW_CSV_DATA:
                results_csv_key = str(entry.blob_key.key())
            elif entry.graph_type == graph.YG_GC_MEMORY:
                yg_memory_key = str(entry.blob_key.key())
            elif entry.graph_type == graph.GC_DURATION:
                gc_duration_key = str(entry.blob_key.key())
            elif entry.graph_type == graph.MEMORY_RECLAIMED:
                gc_reclaimed_key = str(entry.blob_key.key())
            elif entry.graph_type == graph.FULL_GC_MEMORY:
                full_memory_key = str(entry.blob_key.key())
            elif entry.graph_type == graph.MEMORY_UTIL_POST:
                memory_util_post_key = str(entry.blob_key.key())

        duration = time.time() - start

        # Pass the key to our results, as the data will be obtained via a 
        template_values = {
            'user': user,
            'logout': users.create_logout_url("/"),
            'name': '/uploads',
            'duration': duration,
            'results_key': results_csv_key,
            'summary_stats': summary_stats,
            'gc_results': gc_data,
            'yg_memory_key': yg_memory_key,
            'full_memory_key': full_memory_key,
            'gc_duration_key': gc_duration_key,
            'gc_reclaimed_key': gc_reclaimed_key,
            'memory_util_post_key': memory_util_post_key
        }

        template = jinja_environment.get_template(
            'static/templates/results.html')
        self.response.out.write(template.render(template_values))
Example #6
    def __init__(self):
        self.examples = []
        self.example_output_stats = SummaryStats()
        # For each feature, store a list of examples sorted by that feature's value
        self.examples_sorted_by_feature = collections.defaultdict(list)

        self.decision_feature = None
        self.decision_threshold = None

        # the two subtrees induced by the above decision function
        self.subtrees = [None, None]
Example #7
    def test(self, examples, print_level=1):
        """Computes the "area under the ROC curve". This is a way to measure the
        precision/recall WITHOUT choosing a cutoff-threshold.  It is mathematically
        equivalent to:
           "the probability that a random positive example has a higher
            prob_output1 than a random negative case"
        (This equivalence is non-obvious).

        The algorithm below computes this average probability by effectively trying
        all combinations of positive-vs-negative examples, but does this in O(NlgN)
        instead of O(N^2)"""
        if isinstance(examples, TrainingExamples):
            examples = examples.examples

        prob_stats = SummaryStats()
        prob_hist = Histogram()
        output1_scores = list()
        output0_scores = list()
        for example in examples:
            assert example["_OUTPUT"] in [0, 1]
            prob = self.prob_output1(example)
            prob_stats.add(prob)
            prob_key = "%1.1f-%1.1f" % (int(prob * 10) / 10.0, (int(prob * 10) + 1) / 10.0)
            if prob == 1:
                prob_key = "0.9-1.0"  # don't create a 1.0-1.1 bucket
            prob_hist.add(prob_key)
            real_output = example["_OUTPUT"] == 1
            if real_output:
                output1_scores.append(prob)
            else:
                output0_scores.append(prob)

        output1_scores.sort()
        output0_scores.sort()

        if print_level >= 2:
            print "%d output1 scores:" % len(output1_scores),
            print ["%2.2f" % i for i in output1_scores[0:5]],
            print " ... ",
Example #8
    def test_generate_stats(self):
        """Test the generation of stats result set"""

        yg_gc1 = parsegc.generate_yg_gc_entry("50.0", "50.0", "ParNew", "2",
                                              "1", "4", "0.12345", "2048",
                                              "1024", "4096", "2.12345", "1.0",
                                              "1.50", "2.1")

        yg_gc2 = parsegc.generate_yg_gc_entry("100.25", "100.25", "ParNew",
                                              "3", "2", "8", "0.12345", "8192",
                                              "5120", "16384", "3.12345",
                                              "1.5", "2.0", "3.1")

        full_gc = parsegc.generate_full_gc_entry("200.5", "200.5", "Tenured",
                                                 "20", "10", "40", "0.23456",
                                                 "8192", "5120", "16384",
                                                 "200", "100", "400", "3.1234",
                                                 "1.9", "0.05", "3.11")

        system_gc = parsegc.generate_full_gc_entry("250.75", "250.75",
                                                   "Tenured", "30", "20", "80",
                                                   "0.23456", "8192", "4096",
                                                   "8192", "300", "200", "800",
                                                   "4.0912", "1.98", "2.1",
                                                   "4.09", "System")

        gc_data = [yg_gc1, yg_gc2, full_gc, system_gc]
        results = SummaryStats(gc_data).stats

        expected = {}
        expected['Total Events'] = len(gc_data)
        expected['Elapsed Time'] = '200.750  secs'
        expected['Time spent in Full GC'] = '7.215  secs'
        expected['Time spent in YG GC'] = '5.247  secs'
        expected['Heap Start / End (Peak)'] = '4 MB / 8 MB (16 MB)'
        expected['YG Start / End (Peak)'] = '4 KB / 8 KB (8 KB)'
        expected['Tenured Start / End (Peak)'] = '40 KB / 80 KB (80 KB)'
        expected['Perm Start / End (Peak)'] = '400 KB / 800 KB (800 KB)'
        expected['Heap Growth'] = '4 MB'
        expected['YG Growth'] = '4 KB'
        expected['Tenured Growth'] = '40 KB'
        expected['Perm Growth'] = '400 KB'
        expected['Avg YG Reclaimed'] = '1 KB'
        expected['Avg Tenured Reclaimed'] = '10 KB'

        self.assertEqual(results, expected)
Example #9
    def post(self):
        user = users.get_current_user()

        # We use app.yaml to configure overall authentication
        if not validate_user(user.email()):
            self.redirect(users.create_login_url(self.request.uri))
            return

        start = time.time()

        #file = self.request.body_file
        file = self.request.params["gclog"].file

        log_key = LogData(
            filename=self.request.POST["gclog"].filename,
            notes=self.request.get("notes")).put()

        parser = ParseGCLog()
        gc_results = parser.parse_data(file)

        if len(gc_results) > 0:

            # Generate summary stats for results page
            summary_stats = SummaryStats(gc_results).stats

            # persist gc data - too slow at present with large datasets
            gc_datastore.store_data(log_key, gc_results)

            # persist all CSV data we generate to the store so we 
            # won't have to regenerate later
            blob_writer = BlobResultWriter()

            results_csv_key = graph.generate_cached_graph(log_key, 
                graph.RAW_CSV_DATA,
                gc_results, 
                blob_writer)

            yg_memory_blob_key = graph.generate_cached_graph(log_key, 
                graph.YG_GC_MEMORY, 
                gc_results, 
                blob_writer)

            full_memory_blob_key = graph.generate_cached_graph(log_key, 
                graph.FULL_GC_MEMORY, 
                gc_results, 
                blob_writer)

            gc_duration_blob_key = graph.generate_cached_graph(log_key, 
                graph.GC_DURATION, 
                gc_results, 
                blob_writer)
            
            gc_reclaimed_blob_key = graph.generate_cached_graph(log_key, 
                graph.MEMORY_RECLAIMED, 
                gc_results, 
                blob_writer)

            memory_util_post_blob_key = graph.generate_cached_graph(log_key, 
                graph.MEMORY_UTIL_POST, 
                gc_results, 
                blob_writer)
            
            duration = time.time() - start

            # Pass the key to our results, as the data will be obtained via a 
            template_values = {
                'user': user,
                'logout': users.create_logout_url("/"),
                'duration': duration,
                'name': '/uploads',
                'results_key': str(results_csv_key),
                'summary_stats': summary_stats,
                'gc_results': gc_results,
                'yg_memory_key': str(yg_memory_blob_key),
                'full_memory_key': str(full_memory_blob_key),
                'gc_duration_key': str(gc_duration_blob_key),
                'gc_reclaimed_key': str(gc_reclaimed_blob_key),
                'memory_util_post_key': str(memory_util_post_blob_key)
            }

            template = jinja_environment.get_template(
                'static/templates/results.html')

        else:
            template_values = {
                'user': user,
                'logout': users.create_logout_url("/")
            }

            template = jinja_environment.get_template(
                'static/templates/error.html')

        self.response.out.write(template.render(template_values))
Example #10
    def predict(self, example):
        """Returns the average predicted output over all our trees."""
        output_stats = SummaryStats()
        for tree in self.trees:
            output_stats.add(tree.predict(example))
        return output_stats.avg()
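
Averaging is what makes the ensemble work: the mean of T weakly correlated predictions has far lower variance than any single prediction. A toy illustration of the principle (not part of the forest code):

import random

def sample_var(xs):
    m = sum(xs) / len(xs)
    return sum((x - m) ** 2 for x in xs) / (len(xs) - 1)

single = [random.gauss(0, 1) for _ in xrange(10000)]
avg10 = [sum(random.gauss(0, 1) for _ in xrange(10)) / 10.0
         for _ in xrange(10000)]
print sample_var(single), sample_var(avg10)  # the second is ~10x smaller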
Example #11
    def _find_best_split(self, feature):
        """Find the best threshold value to split this node on, using @feature.
        Returns (less_than_threshold, split_err).
        The @less_than_threshold is what you use to "decide", i.e.:
            if example[feature] < less_than_threshold:
                decide_left ...
            else:
                decide_right ...
        Note that this method doesn't actually split anything: it just figures out
        which threshold value would be best to split at.
        """
        self._sort_by_features(feature)
        left_output_stats = SummaryStats()
        right_output_stats = SummaryStats()
        assert len(self.examples) == len(self.examples_sorted_by_feature[feature])

        # To begin, let's assume we push all examples into the right child node.
        for example in self.examples:
            right_output_stats.add(float(example["_OUTPUT"]))

        # Now, move the examples one by one to the left child node.
        # (Note the examples are sorted by feature value -- it's as if we were
        # gradually raising the less_than_threshold.)
        # After each example, calculate the goodness-of-split, and track the best.
        best_threshold = None
        best_err = None
        last_feature_value = None
        for example in self.examples_sorted_by_feature[feature]:
            feature_value = example[feature]
            output_value = float(example["_OUTPUT"])

            # Speed optimization: skip over examples with same feature value.
            if feature_value == last_feature_value:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue

            last_feature_value = feature_value  # remember for next iteration

            left_count = left_output_stats.count()
            right_count = right_output_stats.count()

            # Edge-case: left or right child is empty
            if left_count == 0 or right_count == 0:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue  # not a true split

            # Compute goodness-of-split: the total within-child squared error
            # ((count - 1) * variance is the sum of squared deviations, summed
            # over both children).
            if left_count <= 1: left_err = 0
            else: left_err = (left_count - 1) * left_output_stats.var()

            if right_count <= 1: right_err = 0
            else: right_err = (right_count - 1) * right_output_stats.var()

            err = left_err + right_err
            if best_err is None or err < best_err:
                best_threshold = feature_value
                best_err = err

            left_output_stats.add(output_value)
            right_output_stats.remove(output_value)

        # to save memory, delete this sorted array (we'll never use it again anyway)
        del self.examples_sorted_by_feature[feature]
        return (best_threshold, best_err)
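
The err being minimized is the total within-child squared error: (count - 1) * var() is the sum of squared deviations from the mean (assuming var() is the sample variance), so left_err + right_err is exactly the squared error you would get if each child predicted its own mean. A quick check of that identity:

vals = [1.0, 2.0, 4.0, 8.0]
mean = sum(vals) / len(vals)
sse = sum((v - mean) ** 2 for v in vals)

s = SummaryStats()
for v in vals:
    s.add(v)
assert abs((s.count() - 1) * s.var() - sse) < 1e-9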
Example #12
class DecisionTree:
    """http://www-users.cs.umn.edu/~kumar/dmbook/ch4.pdf
    A real-output-valued decision tree, whose nodes split on real-valued
    inputs. You can still use this for binary classification by having 0/1 outputs.
    """
    def __init__(self):
        self.examples = []
        self.example_output_stats = SummaryStats()
        # For each feature, store a list of examples sorted by that feature's value
        self.examples_sorted_by_feature = collections.defaultdict(list)

        self.decision_feature = None
        self.decision_threshold = None

        # the two subtrees induced by the above decision function
        self.subtrees = [None, None]

    def add_example(self, example):
        self.examples.append(example)
        self.example_output_stats.add(float(example["_OUTPUT"]))

    def _examples_features(self):
        """Return the set of useable features from the examples.
        (Internally assumes examples[0] has all features.)"""
        return set(f for f in self.examples[0].keys() if not f.startswith("_"))

    def _sort_by_features(self, feature=None):
        """Called after final example is added. Only needs to be called once,
        for the root node, since sorting is preserved when splitting."""
        if feature is None:
            features = self._examples_features()
        else:
            assert type(feature) is str
            features = [feature]

        for feature in features:
            if self.examples_sorted_by_feature[feature]: continue  #already done
            self.examples_sorted_by_feature[feature] = list(self.examples)
            self.examples_sorted_by_feature[feature].sort(key=lambda e: e[feature])

    def avg_squared_error(self, examples=None):
        "returns avg squared error of output"
        examples = examples or self.examples  # by default, use training examples
        output_stats = SummaryStats()
        for example in examples:
            prediction = self.predict(example)
            output_stats.add((prediction - float(example["_OUTPUT"])) ** 2)
        return output_stats.avg()

    def predict(self, example):
        "recursively goes down decision tree, and returns avg output value at leaf."
        subtree = self._decision_subtree(example)
        if subtree is None:
            return self.example_output_stats.avg()
        else:
            return subtree.predict(example)

    def decision_str(self):
        return "%s < %s" % (self.decision_feature, self.decision_threshold)

    def print_tree(self, prefix=""):
        if self.decision_feature is None:
            print "Leaf(", len(self.examples), "examples,",
            print "avg_output=", self.example_output_stats.avg(),
            print "var_output=", self.example_output_stats.var(),
            print ")"
        else:
            print "Node(", len(self.examples), "examples,",
            print "decision = [", self.decision_str(),
            print "] )"

            prefix += "    "
            print prefix, "false =>",
            self.subtrees[0].print_tree(prefix)

            print prefix, "true  =>",
            self.subtrees[1].print_tree(prefix)

    def _find_best_split(self, feature):
        """Find the best threshold value to split this node on, using @feature.
        Returns (less_than_threshold, split_err).
        The @less_than_threshold is what you use to "decide", i.e.:
            if example[feature] < less_than_threshold:
                decide_left ...
            else:
                decide_right ...
        Note that this method doesn't actually split anything: it just figures out
        which threshold value would be best to split at.
        """
        self._sort_by_features(feature)
        left_output_stats = SummaryStats()
        right_output_stats = SummaryStats()
        assert len(self.examples) == len(self.examples_sorted_by_feature[feature])

        # To begin, let's assume we push all examples into the right child node.
        for example in self.examples:
            right_output_stats.add(float(example["_OUTPUT"]))

        # Now, move the examples one by one to the left child node.
        # (Note the examples are sorted by feature value -- it's as if we were
        # gradually raising the less_than_threshold.)
        # After each example, calculate the goodness-of-split, and track the best.
        best_threshold = None
        best_err = None
        last_feature_value = None
        for example in self.examples_sorted_by_feature[feature]:
            feature_value = example[feature]
            output_value = float(example["_OUTPUT"])

            # Speed optimization: skip over examples with same feature value.
            if feature_value == last_feature_value:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue

            last_feature_value = feature_value  # remember for next iteration

            left_count = left_output_stats.count()
            right_count = right_output_stats.count()

            # Edge-case: left or right child is empty
            if left_count == 0 or right_count == 0:
                left_output_stats.add(output_value)
                right_output_stats.remove(output_value)
                continue  # not a true split

            # Compute goodness-of-split: the total within-child squared error
            # ((count - 1) * variance is the sum of squared deviations, summed
            # over both children).
            if left_count <= 1: left_err = 0
            else: left_err = (left_count - 1) * left_output_stats.var()

            if right_count <= 1: right_err = 0
            else: right_err = (right_count - 1) * right_output_stats.var()

            err = left_err + right_err
            if best_err is None or err < best_err:
                best_threshold = feature_value
                best_err = err

            left_output_stats.add(output_value)
            right_output_stats.remove(output_value)

        # to save memory, delete this sorted array (we'll never use it again anyway)
        del self.examples_sorted_by_feature[feature]
        return (best_threshold, best_err)

    def _split_subtrees(self, feature, threshold):
        """Reset the two subtrees based on the current decision function."""
        assert feature is not None
        assert threshold is not None
        assert len(self.examples) >= 2

        self.decision_feature = feature
        self.decision_threshold = threshold

        self.subtrees = [DecisionTree(), DecisionTree()]
        for example in self.examples:
            decision = int(example[self.decision_feature] < self.decision_threshold)
            if decision == 0:
                self.subtrees[0].add_example(example)
            else:
                self.subtrees[1].add_example(example)

    def _decision_subtree(self, example):
        """returns one of the two child subtrees, or None if we are a leaf."""
        if self.decision_feature is None: return None
        decision = example[self.decision_feature] < self.decision_threshold
        return self.subtrees[int(decision)]

    def grow_tree(self, features_considered_per_node=2):
        # Stop growing based on termination criteria:
        if len(self.examples) <= 1: return
        if self.example_output_stats.var() < 0.001: return

        # assume all examples have all features
        feature_names = self._examples_features()
        feature_subset = random.sample(list(feature_names),
                                       min(features_considered_per_node,
                                           len(feature_names)))

        best_feature = None
        best_threshold = None
        best_avg_error = None
        for feature in feature_subset:
            (threshold, avg_error) = self._find_best_split(feature)
            if avg_error is None: continue  # no split possible (all same value?)

            if best_avg_error is None or avg_error < best_avg_error:
                best_feature = feature
                best_threshold = threshold
                best_avg_error = avg_error

        # Did we fail to find a good decision?
        if best_feature is None: return

        # Accept the best decision
        self._split_subtrees(best_feature, best_threshold)

        # Grow recursively at each branch.
        self.subtrees[0].grow_tree(features_considered_per_node)
        self.subtrees[1].grow_tree(features_considered_per_node)
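
A minimal usage sketch for the class above, on made-up data (features_considered_per_node=1 because there is only one feature):

tree = DecisionTree()
for x, out in [(1.0, 0.0), (2.0, 0.0), (8.0, 1.0), (9.0, 1.0)]:
    tree.add_example({"x": x, "_OUTPUT": out})
tree.grow_tree(features_considered_per_node=1)
tree.print_tree()
print tree.predict({"x": 1.5})  # falls in the x < 8 subtree: prints 0.0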