def run_perf_tests(self):
        self.return_code |= run_performance_tests.main(self.args)
        self.interpret_run_benchmark_results(False)

        if len(self.result_recorder[False].failed_stories) > 0:
            # For failed stories we run the tests again to make sure they are
            # not false positives.
            print('============ Re-run the failed tests ============')
            all_failed_stories = '(' + '|'.join(
                self.result_recorder[False].failed_stories) + ')'
            # TODO(crbug.com/1055893): Remove the extra chrome categories after
            # investigation of flakes in representative perf tests.
            self.re_run_args.extend([
                '--story-filter', all_failed_stories, '--pageset-repeat=3',
                '--extra-chrome-categories=blink,blink_gc,gpu,v8,viz'
            ])
            self.return_code |= run_performance_tests.main(self.re_run_args)
            self.interpret_run_benchmark_results(True)

            for story_name in self.result_recorder[False].failed_stories.copy(
            ):
                if story_name not in self.result_recorder[True].failed_stories:
                    self.result_recorder[False].remove_failure(
                        story_name, self.benchmark,
                        self.is_control_story(story_name))

        if self.result_recorder[False].is_control_stories_noisy:
            # In this case all failures are reported as expected, so the number
            # of failed stories in output.json will be zero.
            self.result_recorder[False].invalidate_failures(self.benchmark)

        (finalOut, self.return_code) = self.result_recorder[False].get_output(
            self.return_code)

        with open(self.output_path[False], 'r+') as resultsFile:
            # Clear the result of run_benchmark and write the gated perf results.
            resultsFile.seek(0)
            resultsFile.truncate(0)
            json.dump(finalOut, resultsFile, indent=4)
        with open(self.options.isolated_script_test_output, 'w') as outputFile:
            json.dump(finalOut, outputFile, indent=4)

        if self.result_recorder[False].is_control_stories_noisy:
            assert self.return_code == 0
            print('Control story has high noise. These runs are not reliable!')

        return self.return_code
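

# run_perf_tests() above relies on helpers that are not shown in this snippet
# (interpret_run_benchmark_results, the ResultRecorder instances kept in
# self.result_recorder, and is_control_story). As an illustration only, here is
# a minimal sketch of what an is_control_story() check might look like. It
# assumes, hypothetically, that a story's entry in
# representatives_frame_times_upper_limit.json carries a "control" flag; the
# real data file may mark control stories differently.
def is_control_story_sketch(story_upper_limits):
    """Returns True if the upper-limit entry flags the story as a control.

    Control stories are only used to detect noisy runs: when a control story
    is noisy, the whole run is treated as unreliable instead of as a
    regression (see is_control_stories_noisy above).
    """
    return bool(story_upper_limits.get('control', False))
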
def main():
    overall_return_code = 0

    # Linux does not have its own specific representatives
    # and uses the representatives chosen for windows.
    if sys.platform == 'win32' or sys.platform.startswith('linux'):
        platform = 'win'
        story_tag = 'representative_win_desktop'
    elif sys.platform == 'darwin':
        platform = 'mac'
        story_tag = 'representative_mac_desktop'
    else:
        return 1

    options = parse_arguments()
    args = sys.argv
    re_run_args = sys.argv
    args.extend(['--story-tag-filter', story_tag])

    overall_return_code = run_performance_tests.main(args)

    # The values used as the upper limit are the 99th percentile of the
    # avg and ci_095 frame_times recorded by the dashboard in the past 200
    # revisions. If the value measured here is higher than this upper limit
    # by at least 2ms [AVG_ERROR_MARGIN], it is considered a failure.
    # crbug.com/953895
    with open(
            os.path.join(
                os.path.dirname(__file__), 'representative_perf_test_data',
                'representatives_frame_times_upper_limit.json')) as bound_data:
        upper_limit_data = json.load(bound_data)

    out_dir_path = os.path.dirname(options.isolated_script_test_output)
    output_path = os.path.join(out_dir_path, BENCHMARK, 'test_results.json')
    result_recorder = interpret_run_benchmark_results(
        upper_limit_data[platform], options.isolated_script_test_output)

    with open(output_path, 'r+') as resultsFile:
        if len(result_recorder.failed_stories) > 0:
            # For failed stories we run the tests again to make sure they are
            # not false positives.
            print('============ Re-run the failed tests ============')
            all_failed_stories = '(' + '|'.join(
                result_recorder.failed_stories) + ')'
            re_run_args.extend(
                ['--story-filter', all_failed_stories, '--pageset-repeat=3'])

            re_run_isolated_script_test_dir = os.path.join(
                out_dir_path, 're_run_failures')
            re_run_isolated_script_test_output = os.path.join(
                re_run_isolated_script_test_dir,
                os.path.basename(options.isolated_script_test_output))
            re_run_isolated_script_test_perf_output = os.path.join(
                re_run_isolated_script_test_dir,
                os.path.basename(options.isolated_script_test_perf_output))

            re_run_args = replace_arg_values(
                re_run_args, [('--isolated-script-test-output',
                               re_run_isolated_script_test_output),
                              ('--isolated-script-test-perf-output',
                               re_run_isolated_script_test_perf_output)])

            overall_return_code |= run_performance_tests.main(re_run_args)
            re_run_result_recorder = interpret_run_benchmark_results(
                upper_limit_data[platform], re_run_isolated_script_test_output)

            for story_name in result_recorder.failed_stories.copy():
                if story_name not in re_run_result_recorder.failed_stories:
                    result_recorder.remove_failure(story_name)

        (finalOut,
         overall_return_code) = result_recorder.get_output(overall_return_code)

        # Clear the result of run_benchmark and write the gated perf results.
        resultsFile.seek(0)
        resultsFile.truncate(0)
        json.dump(finalOut, resultsFile, indent=4)

        with open(options.isolated_script_test_output, 'w') as outputFile:
            json.dump(finalOut, outputFile, indent=4)

    return overall_return_code
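

# replace_arg_values() used in the example above is not defined in this
# snippet. Below is a minimal sketch of the behavior the call site needs:
# swap the value that follows each given flag. Whether the real helper also
# handles the '--flag=value' form is an assumption made here.
def replace_arg_values_sketch(args, flag_value_pairs):
    """Returns a copy of args with the value of each given flag replaced."""
    args = list(args)
    for flag, new_value in flag_value_pairs:
        for index, arg in enumerate(args):
            if arg == flag and index + 1 < len(args):
                # '--flag value' form: replace the following argument.
                args[index + 1] = new_value
            elif arg.startswith(flag + '='):
                # '--flag=value' form: rebuild the single argument.
                args[index] = flag + '=' + new_value
    return args
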
def main():
    overall_return_code = 0

    # Linux does not have its own specific representatives
    # and uses the representatives chosen for windows.
    if sys.platform == 'win32':
        platform = 'win'
        story_tag = 'representative_win_desktop'
    elif sys.platform == 'darwin':
        platform = 'mac'
        story_tag = 'representative_mac_desktop'
    else:
        return 1

    options = parse_arguments()
    args = sys.argv
    args.extend(['--story-tag-filter', story_tag])

    overall_return_code = run_performance_tests.main(args)
    result_recorder = ResultRecorder()

    # The values used as the upper limit are the 99th percentile of the
    # avg and ci_095 frame_times recorded by the dashboard in the past 200
    # revisions. If the value measured here is higher than this upper limit
    # by at least 2ms [AVG_ERROR_MARGIN], it is considered a failure.
    # crbug.com/953895
    with open(
            os.path.join(
                os.path.dirname(__file__), 'representative_perf_test_data',
                'representatives_frame_times_upper_limit.json')) as bound_data:
        upper_limit_data = json.load(bound_data)

    out_dir_path = os.path.dirname(options.isolated_script_test_output)
    test_count = len(upper_limit_data[platform])

    output_path = os.path.join(out_dir_path, BENCHMARK, 'test_results.json')

    with open(output_path, 'r+') as resultsFile:
        initialOut = json.load(resultsFile)
        result_recorder.setTests(initialOut, test_count)

        results_path = os.path.join(out_dir_path, BENCHMARK,
                                    'perf_results.csv')
        marked_stories = set()
        with open(results_path) as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                # For now only frame_times is used for testing representatives'
                # performance.
                if row['name'] != 'frame_times':
                    continue
                story_name = row['stories']
                if (story_name in marked_stories
                        or story_name not in upper_limit_data[platform]):
                    continue
                marked_stories.add(story_name)

                # csv.DictReader yields strings, so compare 'count' to '0'.
                if row['avg'] == '' or row['count'] == '0':
                    print('No values for ' + story_name)
                    result_recorder.addFailure(story_name)
                elif (float(row['ci_095']) >
                      upper_limit_data[platform][story_name]['ci_095'] *
                      CI_ERROR_MARGIN):
                    print "Noisy data on frame_times for " + story_name + ".\n"
                    result_recorder.addFailure(story_name)
                elif (float(row['avg']) >
                      upper_limit_data[platform][story_name]['avg'] +
                      AVG_ERROR_MARGIN):
                    print(story_name +
                          ": average frame_times is higher than 99th " +
                          "percentile of the past 200 recorded frame_times(" +
                          row['avg'] + ")" + ".\n")
                    result_recorder.addFailure(story_name)

        (finalOut,
         overall_return_code) = result_recorder.getOutput(overall_return_code)

        # Clear the result of run_benchmark and write the gated perf results.
        resultsFile.seek(0)
        resultsFile.truncate(0)
        json.dump(finalOut, resultsFile, indent=4)

        with open(options.isolated_script_test_output, 'w') as outputFile:
            json.dump(finalOut, outputFile, indent=4)

    return overall_return_code
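

# The ResultRecorder class used in the example above is defined elsewhere in
# the original file and not shown here. The sketch below only mirrors the
# interface the example relies on (setTests, addFailure, failed_stories,
# getOutput); the exact shape of the output dict is an assumption based on how
# test_results.json is rewritten above, not the actual implementation.
class ResultRecorderSketch(object):
    def __init__(self):
        self.failed_stories = set()
        self._output = {}
        self._test_count = 0

    def setTests(self, initial_out, test_count):
        # Keep run_benchmark's output as the base result and remember how
        # many representative stories are expected.
        self._output = initial_out
        self._test_count = test_count

    def addFailure(self, story_name):
        self.failed_stories.add(story_name)

    def getOutput(self, return_code):
        # Report the gated pass/fail counts and fail the run if any story
        # exceeded its upper limit.
        self._output['num_failures_by_type'] = {
            'FAIL': len(self.failed_stories),
            'PASS': max(self._test_count - len(self.failed_stories), 0),
        }
        if self.failed_stories:
            return_code |= 1
        return self._output, return_code
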
def main():
    overall_return_code = 0
    options = parse_arguments()

    print(options)

    if options.benchmarks == 'rendering.desktop':
        # Linux does not have its own specific representatives
        # and uses the representatives chosen for windows.
        if sys.platform == 'win32' or sys.platform.startswith('linux'):
            platform = 'win'
            story_tag = 'representative_win_desktop'
        elif sys.platform == 'darwin':
            platform = 'mac'
            story_tag = 'representative_mac_desktop'
        else:
            return 1
    elif options.benchmarks == 'rendering.mobile':
        platform = 'android'
        story_tag = 'representative_mobile'
    else:
        return 1

    benchmark = options.benchmarks
    args = sys.argv
    re_run_args = sys.argv
    args.extend(['--story-tag-filter', story_tag])

    overall_return_code = run_performance_tests.main(args)

    # The values used as the upper limit are the 99th percentile of the
    # avg and ci_095 frame_times recorded by the dashboard in the past 200
    # revisions. If the value measured here is higher than this upper limit
    # by at least 10 percent of the upper limit [AVG_ERROR_MARGIN], it is
    # considered a failure. crbug.com/953895
    with open(
            os.path.join(
                os.path.dirname(__file__), 'representative_perf_test_data',
                'representatives_frame_times_upper_limit.json')) as bound_data:
        upper_limit_data = json.load(bound_data)

    out_dir_path = os.path.dirname(options.isolated_script_test_output)
    output_path = os.path.join(out_dir_path, benchmark, 'test_results.json')
    result_recorder = interpret_run_benchmark_results(
        upper_limit_data[platform], options.isolated_script_test_output,
        benchmark)

    with open(output_path, 'r+') as resultsFile:
        if len(result_recorder.failed_stories) > 0:
            # For failed stories we run the tests again to make sure they are
            # not false positives.
            print('============ Re-run the failed tests ============')
            all_failed_stories = '(' + '|'.join(
                result_recorder.failed_stories) + ')'
            re_run_args.extend(
                ['--story-filter', all_failed_stories, '--pageset-repeat=3'])

            re_run_isolated_script_test_dir = os.path.join(
                out_dir_path, 're_run_failures')
            re_run_isolated_script_test_output = os.path.join(
                re_run_isolated_script_test_dir,
                os.path.basename(options.isolated_script_test_output))
            re_run_isolated_script_test_perf_output = os.path.join(
                re_run_isolated_script_test_dir,
                os.path.basename(options.isolated_script_test_perf_output))

            re_run_args = replace_arg_values(
                re_run_args, [('--isolated-script-test-output',
                               re_run_isolated_script_test_output),
                              ('--isolated-script-test-perf-output',
                               re_run_isolated_script_test_perf_output)])

            overall_return_code |= run_performance_tests.main(re_run_args)
            re_run_result_recorder = interpret_run_benchmark_results(
                upper_limit_data[platform], re_run_isolated_script_test_output,
                benchmark)

            for story_name in result_recorder.failed_stories.copy():
                if story_name not in re_run_result_recorder.failed_stories:
                    result_recorder.remove_failure(
                        story_name, benchmark,
                        is_control_story(
                            upper_limit_data[platform][story_name]))

        if result_recorder.is_control_stories_noisy:
            # In this case all failures are reported as expected, so the number
            # of failed stories in output.json will be zero.
            result_recorder.invalidate_failures(benchmark)

        (finalOut,
         overall_return_code) = result_recorder.get_output(overall_return_code)

        # Clear the result of run_benchmark and write the gated perf results.
        resultsFile.seek(0)
        resultsFile.truncate(0)
        json.dump(finalOut, resultsFile, indent=4)

        with open(options.isolated_script_test_output, 'w') as outputFile:
            json.dump(finalOut, outputFile, indent=4)

        if result_recorder.is_control_stories_noisy:
            assert overall_return_code == 0
            print('Control story has high noise. These runs are not reliable!')

    return overall_return_code
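

# For reference, a self-contained sketch of the gating rule described in the
# comments above: a story fails when its measured avg frame_times exceeds the
# recorded 99th-percentile upper limit by more than AVG_ERROR_MARGIN (10
# percent of the upper limit in the last example), and its data is treated as
# noisy when ci_095 exceeds its upper limit times CI_ERROR_MARGIN. The numeric
# values below, including the CI multiplier, are illustrative assumptions, not
# the values used by the real harness.
SKETCH_AVG_ERROR_MARGIN = 0.1  # 10 percent of the avg upper limit.
SKETCH_CI_ERROR_MARGIN = 1.5   # Hypothetical multiplier on the ci_095 limit.


def frame_times_verdict_sketch(measured_avg, measured_ci_095, limits):
    """Returns 'noisy', 'fail', or 'pass' for one story's frame_times row."""
    if measured_ci_095 > limits['ci_095'] * SKETCH_CI_ERROR_MARGIN:
        return 'noisy'
    if measured_avg > limits['avg'] * (1 + SKETCH_AVG_ERROR_MARGIN):
        return 'fail'
    return 'pass'


# Example: with upper limits avg=16.0ms and ci_095=4.0ms, a measured avg of
# 18.2ms (above 16.0 * 1.1 = 17.6ms) with quiet data is a failure.
assert frame_times_verdict_sketch(18.2, 3.0, {'avg': 16.0, 'ci_095': 4.0}) == 'fail'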