def find_regressions(kernels, test_runs, metric):
    # A test is regressed on some platform if its latest results are
    # definitely lower than on the reference kernel.
    # Runs for the latest kernel may be underway and incomplete.
    # In that case, selectively use the next-latest kernel.
    # TODO: the next-latest method hurts if the latest run is not sorted last,
    #       or if there are several dev threads
    ref    = kernels[0]
    latest = kernels[-1]
    prev   = kernels[-2:][0]   # next-latest kernel, or the only kernel if just one
    scores = {}   # kernel --> (platform --> list of perf scores)
    for k in [ref, prev, latest]:
        if k in test_runs:
            scores[k] = collect_raw_scores(test_runs[k], metric)
    regressed_platforms = []
    for platform in scores[ref]:
        if latest in scores and platform in scores[latest]:
            k = latest
        elif prev in scores and platform in scores[prev]:
            k = prev
        else:        # perhaps due to decay of test machines
            k = ref  # no regression info available
        ref_avg, ref_std = plotgraph.avg_dev(scores[ref][platform])
        avg, std = plotgraph.avg_dev(scores[k][platform])
        if avg + std < ref_avg - ref_std:
            regressed_platforms.append(platform)
    return sorted(regressed_platforms)
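# Illustrative sketch (not part of the original module) of the band test used
# in find_regressions(), assuming plotgraph.avg_dev() returns (mean, std_dev)
# for a list of scores: a platform counts as regressed only when the candidate
# kernel's one-sigma band lies entirely below the reference kernel's band.
def _bands_show_regression(ref_scores, new_scores):
    def _avg_dev(vals):
        mean = sum(vals) / float(len(vals))
        var = sum((v - mean) ** 2 for v in vals) / float(len(vals))
        return mean, var ** 0.5
    ref_avg, ref_std = _avg_dev(ref_scores)
    avg, std = _avg_dev(new_scores)
    return avg + std < ref_avg - ref_std

# e.g. _bands_show_regression([100, 102, 98], [90, 91, 89]) --> True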
def find_regressions(kernels, test, metric):
    # A test is regressed on some platform if its latest results are
    # definitely lower than on the reference kernel.
    # Runs for the latest kernel may be underway and incomplete.
    # In that case, selectively use the next-latest kernel.
    # This variant reads the module-level regress flag and job_table.
    if not regress:
        return None
    ref    = kernels[0]
    latest = kernels[-1]
    prev   = kernels[-2:][0]   # next-latest kernel, or the only kernel if just one
    scores = {}   # kernel --> (platform --> list of perf scores)
    for k in [ref, prev, latest]:
        runs = collect_testruns(job_table[k], None, test)
        scores[k] = collect_raw_scores(runs, metric)
    regressed_platforms = []
    for platform in scores[ref]:
        k = latest
        if platform not in scores[k]:
            k = prev
        if platform not in scores[k]:
            continue   # perhaps due to decay of test machines
        ref_avg, ref_std = plotgraph.avg_dev(scores[ref][platform])
        avg, std = plotgraph.avg_dev(scores[k][platform])
        if avg + std < ref_avg - ref_std:
            regressed_platforms.append(platform)
    return sorted(regressed_platforms)
def analyze_variants_all_tests_1_platform(self, platform, vary):
    # generate one graph image for results of all benchmarks
    # on one platform and one kernel, comparing effects of
    # two or more combos of kernel options (test run attributes)
    # (numa_fake, stale_page, kswapd_merge, sched_idle, etc)
    kernel = self.cgiform.getvalue('kernel', 'some_kernel')
    self.passthru.append('kernel=%s' % kernel)
    # two or more vary_groups, one for each plotted line;
    # each group begins with vary= and ends with the next &,
    # and holds a comma-separated list of test attribute key=val pairs,
    # e.g. vary=keyval1,keyval2&vary=keyval3,keyval4
    vary_groups = [dict(pair.split('=', 1) for pair in vary_group.split(','))
                   for vary_group in vary]
    test = self.benchmarks[0]   # pick any test present in all jobs
    kernels, test_tag = self.jobs_selector(test, self.job_table,
                                           self.kernel_dates)
    linekeys = {}
    plot_data = {}
    baselines = {}
    for i, vary_group in enumerate(vary_groups):
        group_attributes = self.test_attributes.copy()
        group_attributes.update(vary_group)
        linekey = ','.join('%s=%s' % (attr, vary_group[attr])
                           for attr in vary_group)
        linekeys[i] = linekey
        data = {}
        for benchmark in self.benchmarks:
            metric = perf.benchmark_main_metric(benchmark)
            runs = collect_testruns(self.job_table[kernel],
                                    benchmark + test_tag,
                                    group_attributes,
                                    self.platforms_filter,
                                    'by_hosts' in self.toggles,
                                    self.no_antag)
            vals = []
            for testrunx in runs[platform]:
                vals += perf.get_metric_at_point([testrunx], metric)
            if vals:
                if benchmark not in baselines:
                    baselines[benchmark], stddev = plotgraph.avg_dev(vals)
                vals = [val / baselines[benchmark] for val in vals]
                data[benchmark] = vals
        plot_data[i] = data
    title = "%s on %s" % (kernel, platform)
    for attr in self.test_attributes:
        title += ', %s=%s' % (attr, self.test_attributes[attr])
    if 'table' in self.cgiform:
        self.table_for_variants_all_tests(title, plot_data, linekeys,
                                          range(len(linekeys)),
                                          filtered_passthru=self.passthru,
                                          test_tag=test_tag)
    else:
        graph_variants_all_tests(title, plot_data, linekeys,
                                 self.size, 'dark' in self.toggles)
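# Example of the vary-group parsing above (attribute values invented for
# illustration):
#   vary = ['numa_fake=on,sched_idle=1', 'numa_fake=off,sched_idle=0']
# parses into one dict per plotted line:
#   vary_groups = [{'numa_fake': 'on', 'sched_idle': '1'},
#                  {'numa_fake': 'off', 'sched_idle': '0'}]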
def table_variants_all_tests(plot_data, columns, colkeys, benchmarks,
                             myurl, filtered_passthru):
    # generate html table of the graph's numbers
    # for the primary metric over all benchmarks (rows),
    # on one platform and one kernel,
    # over various combos of test run attribute constraints (cols).
    ref_thresholds = {}
    print "<table border=1 cellpadding=3 cellspacing=0>"
    print "<tr> <td><b> Benchmark </b></td>",
    for col in columns:
        print "<td><b>", colkeys[col].replace(',', ',<br>'), "</b></td>"
    print "</tr>"
    for benchmark in benchmarks:
        print "<tr> <td><b>", benchmark, "</b></td>"
        for col in columns:
            print "<td>",
            vals = plot_data[col].get(benchmark, [])
            if not vals:
                print "?",
            else:
                (avg, std_dev) = plotgraph.avg_dev(vals)
                args = filtered_passthru[:]
                perf.append_cgi_args(args, {'test': benchmark})
                for keyval in colkeys[col].split(','):
                    key, val = keyval.split('=', 1)
                    perf.append_cgi_args(args, {key: val})
                print "<a href='%s?%s&runs&attrs'>" % (myurl, '&'.join(args))
                print "<b>%.4g</b>" % avg, "</a><br>",
                print " <small> %dr </small>" % len(vals),
                print " <small> %.3g </small>" % std_dev,
            print "</td>"
        print "</tr>\n"
    print "</table>"
    print "<p> <b>Bold value:</b> Average of this metric, then <br>"
    print "number of good test runs, then standard deviation of those runs"
def table_for_1_test(test, metric):
    # generate detailed html page with graph plus numeric table for 1 benchmark
    print "Content-Type: text/html\n\n<html><body>"
    heading = "%s %s:  %s%s" % (test_group, kernel_legend,
                                test.capitalize(), suite_notes)
    if regress:
        heading += ", Regressions Only"
    print "<h2> %s </h2>" % heading
    print "<img src='%s?%s'>" % (myself, '&'.join(passthru))
    heading = "%s %s metric" % (test.capitalize(), metric)
    if relative:
        heading += ", relative"
    print "<p><p> <h3> %s: </h3>" % heading
    ref_thresholds = {}
    print "<table border=1 cellpadding=3>"
    print "<tr> <td><b> Kernel </b></td>",
    for platform in platforms:
        p = platform.replace("_", "_<br>").replace(".", "<br>")
        print "<td><b>", p, "</b></td>"
    print "</tr>"
    for kernel in kernels:
        print "<tr> <td><b>", kernel, "</b><br><small>",
        print kernel_dates[kernel], "</small></td>"
        for platform in platforms:
            print "<td",
            vals = plot_data[platform].get(kernel, [])
            if vals:
                (avg, std_dev) = plotgraph.avg_dev(vals)
                if platform not in ref_thresholds:
                    ref_thresholds[platform] = avg - std_dev
                if avg + std_dev < ref_thresholds[platform]:
                    print "bgcolor=pink",
                print ("> <a href='%s?test=%s&metric=%s"
                       "&platforms=%s&runs&kernel=%s'>"
                       % (myself, test, metric, platform, kernel))
                print "<b>%.4g</b>" % avg, "</a><br>",
                print " <small> %dr </small>" % len(vals),
                print " <small> %.3g </small>" % std_dev,
            else:
                print "> ?",
            print "</td>"
        print "</tr>\n"
    print "</table>"
    print "<p> <b>Bold value:</b> Average of this metric, then <br>"
    print "number of good test runs, then standard deviation of those runs"
    print "<br> Pink if regressed from reference kernel"
    print "</body></html>"
def collect_scaled_scores(metric):
    # get scores of test runs for 1 test on some kernels and platforms
    # optionally make relative to the first kernel on that platform
    # arrange by plotline (i.e. platform) for gnuplot
    # This variant reads the module-level test_runs and relative settings.
    plot_data = {}   # platform --> (kernel --> list of perf scores)
    baseline = {}
    for kernel in sorted(test_runs.keys()):
        for platform in test_runs[kernel]:
            vals = perf.get_metric_at_point(test_runs[kernel][platform],
                                            metric)
            if vals:
                if relative:
                    if platform not in baseline:
                        baseline[platform], std = plotgraph.avg_dev(vals)
                    vals = [v / baseline[platform] for v in vals]
                pdp = plot_data.setdefault(platform, {})
                pdp.setdefault(kernel, []).extend(vals)
    return plot_data
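# Hypothetical shape of the result (platform and kernel names invented):
# with relative set, each platform's scores are divided by that platform's
# first-kernel average, so the baseline kernel plots near 1.0:
#   plot_data = {'platformA': {'2.6.18-ref': [1.0, 0.99, 1.01],
#                              '2.6.19-new': [0.93, 0.95]}}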
def table_1_metric_all_kernels(plot_data, columns, column_argname,
                               kernels, kernel_dates,
                               myurl, filtered_passthru):
    # generate html table of the graph's numbers
    # for 1 benchmark metric over all kernels (rows),
    # over various platforms or various antagonists etc (cols).
    ref_thresholds = {}
    print "<table border=1 cellpadding=3 cellspacing=0>"
    print "<tr> <td><b> Kernel </b></td>",
    for label in columns:
        if not label and column_argname == 'antag':
            label = 'no antag'
        print "<td><b>", label.replace('_', '<br>_'), "</b></td>"
    print "</tr>"
    for kernel in kernels:
        print "<tr> <td><b>", kernel, "</b>",
        if kernel in kernel_dates:
            print "<br><small>", kernel_dates[kernel], "</small>"
        print "</td>"
        for col in columns:
            print "<td",
            vals = plot_data[col].get(kernel, [])
            if not vals:
                print "> ?",
            else:
                (avg, std_dev) = plotgraph.avg_dev(vals)
                if col not in ref_thresholds:
                    ref_thresholds[col] = avg - std_dev
                if avg + std_dev < ref_thresholds[col]:
                    print "bgcolor=pink",
                print "> ",
                args = filtered_passthru[:]
                perf.append_cgi_args(args, {column_argname: col,
                                            'kernel': kernel})
                print "<a href='%s?%s&runs&attrs'>" % (myurl, '&'.join(args))
                print "<b>%.4g</b>" % avg, "</a><br>",
                print " <small> %dr </small>" % len(vals),
                print " <small> %.3g </small>" % std_dev,
            print "</td>"
        print "</tr>\n"
    print "</table>"
    print "<p> <b>Bold value:</b> Average of this metric, then <br>"
    print "number of good test runs, then standard deviation of those runs"
    print "<br> Pink if regressed from reference kernel"
def table_all_metrics_1_platform(test_runs, platform, relative):
    # TODO: show std dev in cells
    #       can't mark regressions, since some metrics improve downwards
    kernels = perf.sort_kernels(test_runs.keys())
    scores = {}
    attrs = set()
    for kernel in kernels:
        testruns = test_runs[kernel].get(platform, [])
        if testruns:
            d = perf.collect_all_metrics_scores(testruns)
            scores[kernel] = d
            attrs.update(set(d.keys()))
        else:
            print "No runs completed on", kernel, "<br>"
    attrs = sorted(list(attrs))[:100]   # limit table to at most 100 metrics
    print "<table border=1 cellpadding=4 cellspacing=0>"
    print "<tr><td> Metric </td>"
    for kernel in kernels:
        kernel = kernel.replace("_", "_<br>")
        print "<td>", kernel, "</td>"
    print "</tr>"
    for attr in attrs:
        print "<tr>"
        print "<td>", attr, "</td>"
        baseline = None
        for kernel in kernels:
            print "<td>",
            if kernel in scores and attr in scores[kernel]:
                (avg, dev) = plotgraph.avg_dev(scores[kernel][attr])
                if baseline and relative:
                    percent = (avg / baseline - 1) * 100
                    print "%+.1f%%" % percent,
                else:
                    baseline = avg
                    print "%.4g" % avg,
            else:
                print "?"
            print "</td>"
        print "</tr>"
    print "</table>"
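# Worked example of the relative formatting above: the first kernel's average
# becomes the baseline and is printed as-is; later kernels (when relative is
# set) print a signed percent change, e.g. baseline = 200.0 and avg = 190.0
# gives (190.0 / 200.0 - 1) * 100 = -5.0%.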
def collect_scaled_scores(metric, test_runs, regressed_platforms, relative):
    # get scores of test runs for 1 test on some kernels and platforms
    # optionally make relative to the oldest (?) kernel on that platform
    # arrange by plotline (i.e. platform) for gnuplot
    plot_data = {}   # platform --> (kernel --> list of perf scores)
    baseline = {}
    for kernel in sorted(test_runs.keys()):
        for platform in test_runs[kernel]:
            if not (regressed_platforms is None or
                    platform in regressed_platforms):
                continue   # skip results for uninteresting platforms
            vals = perf.get_metric_at_point(test_runs[kernel][platform],
                                            metric)
            if vals:
                if relative:
                    if platform not in baseline:
                        baseline[platform], std = plotgraph.avg_dev(vals)
                    vals = [v / baseline[platform] for v in vals]
                pdp = plot_data.setdefault(platform, {})
                pdp.setdefault(kernel, []).extend(vals)
    return plot_data