def test_DecisionForest():
  """Train a decision forest against an arbitrary formula, to see if it can
  approximate it to an arbitrarily low error, given enough examples."""
  def formula(x, y, z):
    return (x ** 2) + (x * y * z) + (10 * z) + (y / z) + 25

  def random_input_output():
    x = random.random() + 0.1
    y = random.random() + 0.1
    z = random.random() + 0.1
    output = formula(x, y, z)
    return ({'x': x, 'y': y, 'z': z}, output)

  te = TrainingExamples()
  for i in xrange(1, 5000):
    (input, output) = random_input_output()
    te.add_example(input, output)
    if i % 500:
      continue
    print "Testing after", i, "training examples"
    forest = DecisionForest()
    forest.train(te, train_on_subset=True, num_trees=10,
                 features_considered_per_node=3)

    # Measure the true out-of-sample error rate for the entire forest.
    predict_err = SummaryStats()
    for j in xrange(10000):
      (input, output) = random_input_output()
      predicted_output = forest.predict(input)
      predict_err.add((output - predicted_output) ** 2)
    print "avg squared error = ", predict_err.avg()
def train(self, training_examples, train_on_subset=True, num_trees=100,
          features_considered_per_node=2, **kwds):
  print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % (
      num_trees, len(training_examples), features_considered_per_node)
  self.trees = []
  total_test_output_stats = SummaryStats()
  binary_classification = all(example["_OUTPUT"] in [0, 1]
                              for example in training_examples)
  for tree_i in xrange(1, num_trees + 1):
    tree = DecisionTree()
    self.trees.append(tree)
    test_set_ids = set(xrange(len(training_examples)))
    for i in xrange(len(training_examples)):
      if train_on_subset:
        # N samples with replacement ("bootstrap")
        index = random.randint(0, len(training_examples) - 1)
      else:
        index = i
      tree.add_example(training_examples[index])
      test_set_ids.discard(index)
    print "Growing tree %d/%d ..." % (tree_i, num_trees),
    tree.grow_tree(features_considered_per_node=features_considered_per_node)

    # Report the in-sample training error
    if binary_classification:
      print "area-under-curve for %d training examples is %2.2f" % (
          len(tree.examples), tree.test(tree.examples, print_level=0))
    else:
      print "%2.2f avg err^2 on %d training examples" % (
          tree.avg_squared_error(), len(tree.examples)),

    # Report the out-of-sample testing error, if we have any out-of-sample
    # examples to test on.
    if train_on_subset:
      print "; ",
      test_set = [training_examples[i] for i in test_set_ids]
      if binary_classification:
        # Do a true out-of-sample test just on this one tree.
        # Temporarily make this a forest-of-one-tree...
        save_trees = self.trees
        self.trees = [tree]
        self.test(test_set)
        self.trees = save_trees
      else:
        avg_squared_error = tree.avg_squared_error(test_set)
        total_test_output_stats.add(avg_squared_error)
        print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. for all %d trees so far]" % (
            len(test_set), avg_squared_error,
            total_test_output_stats.avg(), tree_i),
    print
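# The train() method above builds each tree from a bootstrap sample (N draws
# with replacement) and keeps the never-drawn examples as an out-of-bag test
# set.  Below is a minimal standalone sketch of just that sampling step,
# assuming only the standard library; the helper name bootstrap_sample() is
# illustrative and not part of the original code.
import random

def bootstrap_sample(examples):
  """Returns (in_bag, out_of_bag) lists drawn from @examples."""
  n = len(examples)
  out_of_bag_ids = set(xrange(n))
  in_bag = []
  for _ in xrange(n):
    index = random.randint(0, n - 1)  # sample with replacement
    in_bag.append(examples[index])
    out_of_bag_ids.discard(index)     # anything never drawn stays out-of-bag
  out_of_bag = [examples[i] for i in out_of_bag_ids]
  return in_bag, out_of_bag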
def avg_squared_error(self, examples=None):
  "returns avg squared error of output"
  if examples is None:
    examples = self.examples  # by default, use the training examples
  output_stats = SummaryStats()
  for example in examples:
    prediction = self.predict(example)
    output_stats.add((prediction - float(example["_OUTPUT"])) ** 2)
  return output_stats.avg()
def avg_squared_error(self, examples):
  """Returns the average squared error of the predicted output.

  This is useful to determine if the DecisionForest was able to learn the
  training set completely.  If the training error is very high, then the
  forest was not powerful enough to learn the training set (and perhaps
  features_considered_per_node should be increased).

  In theory, a DecisionForest should never overfit the training data,
  because each tree was trained on a random bootstrap sample of the
  training examples.
  """
  errors = SummaryStats()
  for example in examples:
    prediction = self.predict(example)
    squared_error = (prediction - float(example["_OUTPUT"])) ** 2
    errors.add(squared_error)
  return errors.avg()
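# As the docstring above suggests, avg_squared_error() can be run on the
# training examples themselves to check whether the forest was expressive
# enough to fit them.  A hedged usage sketch: the TrainingExamples and
# DecisionForest calls mirror those used elsewhere in this file, while
# my_labeled_data and the 1.0 threshold are placeholders for illustration.
te = TrainingExamples()
for (input, output) in my_labeled_data:  # my_labeled_data is a placeholder
  te.add_example(input, output)

forest = DecisionForest()
forest.train(te, num_trees=50, features_considered_per_node=3)

training_error = forest.avg_squared_error(te.examples)
if training_error > 1.0:  # arbitrary threshold, for illustration only
  print "High training error %.2f: consider raising features_considered_per_node" % training_error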
def get(self):
  user = users.get_current_user()
  # We use app.yaml to configure overall authentication
  if not validate_user(user.email()):
    self.redirect(users.create_login_url(self.request.uri))
    return  # stop processing for unauthenticated users

  start = time.time()
  key = self.request.get("key")
  gc_data = gc_datastore.get_data(key)
  #need to sort results

  # We regenerate summary stats with each invocation
  summary_stats = SummaryStats(gc_data).stats

  q = db.GqlQuery("SELECT * FROM GraphModel " +
                  "WHERE ANCESTOR IS :1 ", key)
  results = q.fetch(6)

  for entry in results:
    if entry.graph_type == graph.RAW_CSV_DATA:
      results_csv_key = str(entry.blob_key.key())
    elif entry.graph_type == graph.YG_GC_MEMORY:
      yg_memory_key = str(entry.blob_key.key())
    elif entry.graph_type == graph.GC_DURATION:
      gc_duration_key = str(entry.blob_key.key())
    elif entry.graph_type == graph.MEMORY_RECLAIMED:
      gc_reclaimed_key = str(entry.blob_key.key())
    elif entry.graph_type == graph.FULL_GC_MEMORY:
      full_memory_key = str(entry.blob_key.key())
    elif entry.graph_type == graph.MEMORY_UTIL_POST:
      memory_util_post_key = str(entry.blob_key.key())

  duration = time.time() - start

  # Pass the key to our results, as the data will be obtained via a
  template_values = {
      'user': user,
      'logout': users.create_logout_url("/"),
      'name': '/uploads',
      'duration': duration,
      'results_key': results_csv_key,
      'summary_stats': summary_stats,
      'gc_results': gc_data,
      'yg_memory_key': yg_memory_key,
      'full_memory_key': full_memory_key,
      'gc_duration_key': gc_duration_key,
      'gc_reclaimed_key': gc_reclaimed_key,
      'memory_util_post_key': memory_util_post_key
  }
  template = jinja_environment.get_template(
      'static/templates/results.html')
  self.response.out.write(template.render(template_values))
def __init__(self):
  self.examples = []
  self.example_output_stats = SummaryStats()
  # For each feature, store a list of examples sorted by that feature's value
  self.examples_sorted_by_feature = collections.defaultdict(list)
  self.decision_feature = None
  self.decision_threshold = None
  # the two subtrees induced by the above decision function
  self.subtrees = [None, None]
def test(self, examples, print_level=1):
  """Computes the "area under the ROC curve".

  This is a way to measure the precision/recall WITHOUT choosing a
  cutoff-threshold.  It is mathematically equivalent to:
  "the probability that a random positive example has a higher prob_output1
  than a random negative case" (This equivalence is non-obvious).

  The algorithm below computes this average probability by effectively
  trying all combinations of positive-vs-negative examples, but does this
  in O(NlgN) instead of O(N^2)."""
  if type(examples) is TrainingExamples:
    examples = examples.examples
  prob_stats = SummaryStats()
  prob_hist = Histogram()
  output1_scores = list()
  output0_scores = list()
  for example in examples:
    assert example["_OUTPUT"] in [0, 1]
    prob = self.prob_output1(example)
    prob_stats.add(prob)
    prob_key = "%1.1f-%1.1f" % (int(prob * 10) / 10.0,
                                (int(prob * 10) + 1) / 10.0)
    if prob == 1:
      prob_key = "0.9-1.0"  # don't create a 1.0-1.1 bucket
    prob_hist.add(prob_key)
    real_output = example["_OUTPUT"] == 1
    if real_output:
      output1_scores.append(prob)
    else:
      output0_scores.append(prob)
  output1_scores.sort()
  output0_scores.sort()
  if print_level >= 2:
    print "%d output1 scores:" % len(output1_scores),
    print ["%2.2f" % i for i in output1_scores[0:5]],
    print " ... ",
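# The body of test() is truncated above.  As a hedged illustration of the
# O(NlgN) claim in its docstring, here is a minimal rank-based (Mann-Whitney)
# AUC computation over the same kind of sorted score lists, counting ties as
# half a pair; the function name and tie handling are illustrative and are
# not the original continuation of test().
def area_under_roc(output1_scores, output0_scores):
  """AUC = P(random positive score > random negative score)."""
  scored = ([(s, 1) for s in output1_scores] +
            [(s, 0) for s in output0_scores])
  scored.sort()  # O(N lg N) -- the only super-linear step
  auc_pairs = 0.0
  negatives_seen = 0
  i = 0
  while i < len(scored):
    # Handle a run of tied scores together, crediting half a pair per tie.
    j = i
    while j < len(scored) and scored[j][0] == scored[i][0]:
      j += 1
    tied_negatives = sum(1 for k in xrange(i, j) if scored[k][1] == 0)
    tied_positives = (j - i) - tied_negatives
    auc_pairs += tied_positives * (negatives_seen + 0.5 * tied_negatives)
    negatives_seen += tied_negatives
    i = j
  total_pairs = len(output1_scores) * len(output0_scores)
  return auc_pairs / total_pairs if total_pairs else 0.0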
def test_generate_stats(self):
  """Test the generation of stats result set"""
  yg_gc1 = parsegc.generate_yg_gc_entry("50.0", "50.0", "ParNew",
      "2", "1", "4", "0.12345",
      "2048", "1024", "4096", "2.12345",
      "1.0", "1.50", "2.1")
  yg_gc2 = parsegc.generate_yg_gc_entry("100.25", "100.25", "ParNew",
      "3", "2", "8", "0.12345",
      "8192", "5120", "16384", "3.12345",
      "1.5", "2.0", "3.1")
  full_gc = parsegc.generate_full_gc_entry("200.5", "200.5", "Tenured",
      "20", "10", "40", "0.23456",
      "8192", "5120", "16384",
      "200", "100", "400", "3.1234",
      "1.9", "0.05", "3.11")
  system_gc = parsegc.generate_full_gc_entry("250.75", "250.75", "Tenured",
      "30", "20", "80", "0.23456",
      "8192", "4096", "8192",
      "300", "200", "800", "4.0912",
      "1.98", "2.1", "4.09", "System")
  gc_data = [yg_gc1, yg_gc2, full_gc, system_gc]

  results = SummaryStats(gc_data).stats

  expected = {}
  expected['Total Events'] = len(gc_data)
  expected['Elapsed Time'] = '200.750 secs'
  expected['Time spent in Full GC'] = '7.215 secs'
  expected['Time spent in YG GC'] = '5.247 secs'
  expected['Heap Start / End (Peak)'] = '4 MB / 8 MB (16 MB)'
  expected['YG Start / End (Peak)'] = '4 KB / 8 KB (8 KB)'
  expected['Tenured Start / End (Peak)'] = '40 KB / 80 KB (80 KB)'
  expected['Perm Start / End (Peak)'] = '400 KB / 800 KB (800 KB)'
  expected['Heap Growth'] = '4 MB'
  expected['YG Growth'] = '4 KB'
  expected['Tenured Growth'] = '40 KB'
  expected['Perm Growth'] = '400 KB'
  expected['Avg YG Reclaimed'] = '1 KB'
  expected['Avg Tenured Reclaimed'] = '10 KB'

  self.assertEqual(results, expected)
def post(self):
  user = users.get_current_user()
  # We use app.yaml to configure overall authentication
  if not validate_user(user.email()):
    self.redirect(users.create_login_url(self.request.uri))
    return  # stop processing for unauthenticated users

  start = time.time()
  #file = self.request.body_file
  file = self.request.params["gclog"].file
  log_key = LogData(
      filename=self.request.POST["gclog"].filename,
      notes=self.request.get("notes")).put()

  parser = ParseGCLog()
  gc_results = parser.parse_data(file)

  if len(gc_results) > 0:
    # Generate summary stats for results page
    summary_stats = SummaryStats(gc_results).stats

    # persist gc data - too slow at present with large datasets
    gc_datastore.store_data(log_key, gc_results)

    # persist all CSV data we generate to the store so we
    # won't have to regenerate later
    blob_writer = BlobResultWriter()
    results_csv_key = graph.generate_cached_graph(
        log_key, graph.RAW_CSV_DATA, gc_results, blob_writer)
    yg_memory_blob_key = graph.generate_cached_graph(
        log_key, graph.YG_GC_MEMORY, gc_results, blob_writer)
    full_memory_blob_key = graph.generate_cached_graph(
        log_key, graph.FULL_GC_MEMORY, gc_results, blob_writer)
    gc_duration_blob_key = graph.generate_cached_graph(
        log_key, graph.GC_DURATION, gc_results, blob_writer)
    gc_reclaimed_blob_key = graph.generate_cached_graph(
        log_key, graph.MEMORY_RECLAIMED, gc_results, blob_writer)
    memory_util_post_blob_key = graph.generate_cached_graph(
        log_key, graph.MEMORY_UTIL_POST, gc_results, blob_writer)

    duration = time.time() - start

    # Pass the key to our results, as the data will be obtained via a
    template_values = {
        'user': user,
        'logout': users.create_logout_url("/"),
        'duration': duration,
        'name': '/uploads',
        'results_key': str(results_csv_key),
        'summary_stats': summary_stats,
        'gc_results': gc_results,
        'yg_memory_key': str(yg_memory_blob_key),
        'full_memory_key': str(full_memory_blob_key),
        'gc_duration_key': str(gc_duration_blob_key),
        'gc_reclaimed_key': str(gc_reclaimed_blob_key),
        'memory_util_post_key': str(memory_util_post_blob_key)
    }
    template = jinja_environment.get_template(
        'static/templates/results.html')
  else:
    template_values = {
        'user': user,
        'logout': users.create_logout_url("/")
    }
    template = jinja_environment.get_template(
        'static/templates/error.html')
  self.response.out.write(template.render(template_values))
def predict(self, example):
  """Returns the average predicted output over all our trees."""
  output_stats = SummaryStats()
  for tree in self.trees:
    output_stats.add(tree.predict(example))
  return output_stats.avg()
def _find_best_split(self, feature):
  """Find the best threshold value to split this node on, using @feature.

  Returns (less_than_threshold, split_err).  The @less_than_threshold is
  what you use to "decide", i.e.:
      if example[feature] < less_than_threshold:
        decide_left ...
      else:
        decide_right ...

  Note that this method doesn't actually split anything: it just figures
  out which threshold value would be best to split at.
  """
  self._sort_by_features(feature)
  left_output_stats = SummaryStats()
  right_output_stats = SummaryStats()
  assert len(self.examples) == len(self.examples_sorted_by_feature[feature])

  # To begin, let's assume we push all examples into the right child node.
  for example in self.examples:
    right_output_stats.add(float(example["_OUTPUT"]))

  # Now, move the examples one by one to the left child node.
  # (Note the examples are sorted by value -- it's as if we're adjusting the
  # less_than_threshold.)
  # After each example, calculate the goodness-of-split, and track the best.
  best_threshold = None
  best_err = None
  last_feature_value = None
  for example in self.examples_sorted_by_feature[feature]:
    feature_value = example[feature]
    output_value = float(example["_OUTPUT"])

    # Speed optimization: skip over examples with same feature value.
    if feature_value == last_feature_value:
      left_output_stats.add(output_value)
      right_output_stats.remove(output_value)
      continue
    last_feature_value = feature_value  # remember for next iteration

    left_count = left_output_stats.count()
    right_count = right_output_stats.count()

    # Edge-case: left or right child is empty
    if left_count == 0 or right_count == 0:
      left_output_stats.add(output_value)
      right_output_stats.remove(output_value)
      continue  # not a true split

    # Compute goodness-of-split: weighted average of the 2 output variances.
    if left_count <= 1:
      left_err = 0
    else:
      left_err = (left_count - 1) * left_output_stats.var()
    if right_count <= 1:
      right_err = 0
    else:
      right_err = (right_count - 1) * right_output_stats.var()
    err = left_err + right_err

    if best_err is None or err < best_err:
      best_threshold = feature_value
      best_err = err

    left_output_stats.add(output_value)
    right_output_stats.remove(output_value)

  # to save memory, delete this sorted array (we'll never use it again anyway)
  del self.examples_sorted_by_feature[feature]
  return (best_threshold, best_err)
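# The goodness-of-split above scores each side as (count - 1) * var.  Assuming
# SummaryStats.var() is the sample variance (n - 1 denominator), that product
# is exactly the sum of squared deviations from the side's mean, so the loop
# picks the threshold that minimizes total within-child squared error.  A tiny
# self-contained check of that identity (pure stdlib, illustrative only):
def sum_squared_deviations(values):
  mean = sum(values) / float(len(values))
  return sum((v - mean) ** 2 for v in values)

def sample_variance(values):
  return sum_squared_deviations(values) / (len(values) - 1)

left = [1.0, 2.0, 4.0]
assert abs((len(left) - 1) * sample_variance(left) -
           sum_squared_deviations(left)) < 1e-9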
class DecisionTree:
  """http://www-users.cs.umn.edu/~kumar/dmbook/ch4.pdf

  A real-output-valued decision tree, whose nodes split on real-valued inputs.
  You can still use this for binary classification by having 0/1 outputs.
  """

  def __init__(self):
    self.examples = []
    self.example_output_stats = SummaryStats()
    # For each feature, store a list of examples sorted by that feature's value
    self.examples_sorted_by_feature = collections.defaultdict(list)
    self.decision_feature = None
    self.decision_threshold = None
    # the two subtrees induced by the above decision function
    self.subtrees = [None, None]

  def add_example(self, example):
    self.examples.append(example)
    self.example_output_stats.add(float(example["_OUTPUT"]))

  def _examples_features(self):
    """Return the set of usable features from the examples.
    (Internally assumes examples[0] has all features.)"""
    return set(f for f in self.examples[0].keys() if not f.startswith("_"))

  def _sort_by_features(self, feature=None):
    """Called after the final example is added.  Only needs to be called once,
    for the root node, since sorting is preserved when splitting."""
    if feature is None:
      features = self._examples_features()
    else:
      assert type(feature) is str
      features = [feature]
    for feature in features:
      if self.examples_sorted_by_feature[feature]:
        continue  # already done
      self.examples_sorted_by_feature[feature] = list(self.examples)
      self.examples_sorted_by_feature[feature].sort(key=lambda e: e[feature])

  def avg_squared_error(self, examples=None):
    "returns avg squared error of output"
    if examples is None:
      examples = self.examples  # by default, use the training examples
    output_stats = SummaryStats()
    for example in examples:
      prediction = self.predict(example)
      output_stats.add((prediction - float(example["_OUTPUT"])) ** 2)
    return output_stats.avg()

  def predict(self, example):
    "recursively goes down decision tree, and returns avg output value at leaf."
    subtree = self._decision_subtree(example)
    if subtree is None:
      return self.example_output_stats.avg()
    else:
      return subtree.predict(example)

  def decision_str(self):
    return "%s < %s" % (self.decision_feature, self.decision_threshold)

  def print_tree(self, prefix=""):
    if self.decision_feature is None:
      print "Leaf(", len(self.examples), "examples,",
      print "avg_output=", self.example_output_stats.avg(),
      print "var_output=", self.example_output_stats.var(),
      print ")"
    else:
      print "Node(", len(self.examples), "examples,",
      print "decision = [", self.decision_str(),
      print "] )"
      prefix += " "
      print prefix, "false =>",
      self.subtrees[0].print_tree(prefix)
      print prefix, "true =>",
      self.subtrees[1].print_tree(prefix)

  def _find_best_split(self, feature):
    """Find the best threshold value to split this node on, using @feature.

    Returns (less_than_threshold, split_err).  The @less_than_threshold is
    what you use to "decide", i.e.:
        if example[feature] < less_than_threshold:
          decide_left ...
        else:
          decide_right ...

    Note that this method doesn't actually split anything: it just figures
    out which threshold value would be best to split at.
    """
    self._sort_by_features(feature)
    left_output_stats = SummaryStats()
    right_output_stats = SummaryStats()
    assert len(self.examples) == len(self.examples_sorted_by_feature[feature])

    # To begin, let's assume we push all examples into the right child node.
    for example in self.examples:
      right_output_stats.add(float(example["_OUTPUT"]))

    # Now, move the examples one by one to the left child node.
    # (Note the examples are sorted by value -- it's as if we're adjusting the
    # less_than_threshold.)
    # After each example, calculate the goodness-of-split, and track the best.
    best_threshold = None
    best_err = None
    last_feature_value = None
    for example in self.examples_sorted_by_feature[feature]:
      feature_value = example[feature]
      output_value = float(example["_OUTPUT"])

      # Speed optimization: skip over examples with same feature value.
      if feature_value == last_feature_value:
        left_output_stats.add(output_value)
        right_output_stats.remove(output_value)
        continue
      last_feature_value = feature_value  # remember for next iteration

      left_count = left_output_stats.count()
      right_count = right_output_stats.count()

      # Edge-case: left or right child is empty
      if left_count == 0 or right_count == 0:
        left_output_stats.add(output_value)
        right_output_stats.remove(output_value)
        continue  # not a true split

      # Compute goodness-of-split: weighted average of the 2 output variances.
      if left_count <= 1:
        left_err = 0
      else:
        left_err = (left_count - 1) * left_output_stats.var()
      if right_count <= 1:
        right_err = 0
      else:
        right_err = (right_count - 1) * right_output_stats.var()
      err = left_err + right_err

      if best_err is None or err < best_err:
        best_threshold = feature_value
        best_err = err

      left_output_stats.add(output_value)
      right_output_stats.remove(output_value)

    # to save memory, delete this sorted array (we'll never use it again anyway)
    del self.examples_sorted_by_feature[feature]
    return (best_threshold, best_err)

  def _split_subtrees(self, feature, threshold):
    """Reset the two subtrees based on the current decision function."""
    assert feature is not None
    assert threshold is not None
    assert len(self.examples) >= 2
    self.decision_feature = feature
    self.decision_threshold = threshold
    self.subtrees = [DecisionTree(), DecisionTree()]
    for example in self.examples:
      decision = int(example[self.decision_feature] < self.decision_threshold)
      if decision == 0:
        self.subtrees[0].add_example(example)
      else:
        self.subtrees[1].add_example(example)

  def _decision_subtree(self, example):
    """returns one of the two child subtrees, or None if we are a leaf."""
    if self.decision_feature is None:
      return None
    decision = example[self.decision_feature] < self.decision_threshold
    return self.subtrees[int(decision)]

  def grow_tree(self, features_considered_per_node=2):
    # Stop growing based on termination criteria:
    if len(self.examples) <= 1:
      return
    if self.example_output_stats.var() < 0.001:
      return

    # assume all examples have all features
    feature_names = self._examples_features()
    feature_subset = random.sample(feature_names, features_considered_per_node)

    best_feature = None
    best_threshold = None
    best_avg_error = None
    for feature in feature_subset:
      (threshold, avg_error) = self._find_best_split(feature)
      if avg_error is None:
        continue  # no split possible (all same value?)
      if best_avg_error is None or avg_error < best_avg_error:
        best_feature = feature
        best_threshold = threshold
        best_avg_error = avg_error

    # Did we fail to find a good decision?
    if best_feature is None:
      return

    # Accept the best decision
    self._split_subtrees(best_feature, best_threshold)

    # Grow recursively at each branch.
    self.subtrees[0].grow_tree(features_considered_per_node)
    self.subtrees[1].grow_tree(features_considered_per_node)
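# A minimal usage sketch of the DecisionTree class above.  Examples are plain
# dicts: feature names are the non-underscore keys and the regression target
# is stored under "_OUTPUT".  The data values here are illustrative only.
import random

tree = DecisionTree()
for x in xrange(20):
  tree.add_example({'x': float(x), 'noise': random.random(),
                    '_OUTPUT': 2.0 * x + 1.0})

# Consider both features at each node (2 is also the default).
tree.grow_tree(features_considered_per_node=2)

print tree.predict({'x': 7.0, 'noise': 0.5})  # should be close to 15.0
print "training avg err^2 =", tree.avg_squared_error()
tree.print_tree()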