if not jobs: print('No jobs to run.') for image in docker_images.itervalues(): dockerjob.remove_image(image, skip_nonexistent=True) sys.exit(1) num_failures, resultset = jobset.run(jobs, newline_on_success=True, maxjobs=args.jobs) if num_failures: jobset.message('FAILED', 'Some tests failed', do_newline=True) else: jobset.message('SUCCESS', 'All tests passed', do_newline=True) report_utils.render_junit_xml_report(resultset, 'report.xml') for name, job in resultset.items(): if "http2" in name: job[0].http2results = aggregate_http2_results(job[0].message) report_utils.render_interop_html_report( set([str(l) for l in languages]), servers, _TEST_CASES, _AUTH_TEST_CASES, _HTTP2_TEST_CASES, resultset, num_failures, args.cloud_to_prod_auth or args.cloud_to_prod, args.prod_servers, args.http2_interop) finally: # Check if servers are still running. for server, job in server_jobs.items(): if not job.is_running():
six.iteritems(resultset))) finally: # Consider qps workers that need to be killed as failures qps_workers_killed += finish_qps_workers(scenario.workers) if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs: workers_and_base_names = {} for worker in scenario.workers: if not worker.perf_file_base_name: raise Exception('using perf buf perf report filename is unspecified') workers_and_base_names[worker.host_and_port] = worker.perf_file_base_name perf_report_failures += run_collect_perf_profile_jobs(workers_and_base_names, scenario.name) # Still write the index.html even if some scenarios failed. # 'profile_output_files' will only have names for scenarios that passed if perf_cmd and not args.skip_generate_flamegraphs: # write the index fil to the output dir, with all profiles from all scenarios/workers report_utils.render_perf_profiling_results('%s/index.html' % args.flame_graph_reports, profile_output_files) report_utils.render_junit_xml_report(merged_resultset, args.xml_report, suite_name='benchmarks') if total_scenario_failures > 0 or qps_workers_killed > 0: print('%s scenarios failed and %s qps worker jobs killed' % (total_scenario_failures, qps_workers_killed)) sys.exit(1) if perf_report_failures > 0: print('%s perf profile collection jobs failed' % perf_report_failures) sys.exit(1)
def main():
    """Entry point: parse CLI args, build/prepare hosts, run benchmark scenarios.

    Runs each selected scenario against QPS workers (optionally wrapped in
    `perf` for flame-graph profiling), merges per-scenario resultsets, renders
    an XML report, and exits non-zero on any scenario/worker/profiling failure.
    """
    argp = argparse.ArgumentParser(description='Run performance tests.')
    argp.add_argument('-l', '--language',
                      choices=['all'] + sorted(scenario_config.LANGUAGES.keys()),
                      nargs='+',
                      required=True,
                      help='Languages to benchmark.')
    argp.add_argument('--remote_driver_host',
                      default=None,
                      help='Run QPS driver on given host. By default, QPS driver is run locally.')
    argp.add_argument('--remote_worker_host',
                      nargs='+',
                      default=[],
                      help='Worker hosts where to start QPS workers.')
    argp.add_argument('--dry_run',
                      default=False,
                      action='store_const',
                      const=True,
                      help='Just list scenarios to be run, but don\'t run them.')
    argp.add_argument('-r', '--regex', default='.*', type=str,
                      help='Regex to select scenarios to run.')
    argp.add_argument('--bq_result_table', default=None, type=str,
                      help='Bigquery "dataset.table" to upload results to.')
    argp.add_argument('--category',
                      choices=['smoketest', 'all', 'scalable', 'sweep'],
                      default='all',
                      help='Select a category of tests to run.')
    argp.add_argument('--netperf',
                      default=False,
                      action='store_const',
                      const=True,
                      help='Run netperf benchmark as one of the scenarios.')
    argp.add_argument('--server_cpu_load',
                      default=0, type=int,
                      help='Select a targeted server cpu load to run. 0 means ignore this flag')
    argp.add_argument('-x', '--xml_report', default='report.xml', type=str,
                      help='Name of XML report file to generate.')
    argp.add_argument('--perf_args',
                      help=('Example usage: "--perf_args=record -F 99 -g". '
                            'Wrap QPS workers in a perf command '
                            'with the arguments to perf specified here. '
                            '".svg" flame graph profiles will be '
                            'created for each Qps Worker on each scenario. '
                            'Files will output to "<repo_root>/<args.flame_graph_reports>" '
                            'directory. Output files from running the worker '
                            'under perf are saved in the repo root where its ran. '
                            'Note that the perf "-g" flag is necessary for '
                            'flame graphs generation to work (assuming the binary '
                            'being profiled uses frame pointers, check out '
                            '"--call-graph dwarf" option using libunwind otherwise.) '
                            'Also note that the entire "--perf_args=<arg(s)>" must '
                            'be wrapped in quotes as in the example usage. '
                            # Fixed typo: the flag is --perf_args, not --perg_args.
                            'If the "--perf_args" is unspecified, "perf" will '
                            'not be used at all. '
                            'See http://www.brendangregg.com/perf.html '
                            'for more general perf examples.'))
    argp.add_argument('--skip_generate_flamegraphs',
                      default=False,
                      action='store_const',
                      const=True,
                      help=('Turn flame graph generation off. '
                            'May be useful if "perf_args" arguments do not make sense for '
                            'generating flamegraphs (e.g., "--perf_args=stat ...")'))
    argp.add_argument('-f', '--flame_graph_reports', default='perf_reports', type=str,
                      help='Name of directory to output flame graph profiles to, if any are created.')

    args = argp.parse_args()

    # Expand 'all' into every known language; de-dup via set.
    languages = set(
        scenario_config.LANGUAGES[l]
        for l in itertools.chain.from_iterable(
            six.iterkeys(scenario_config.LANGUAGES) if x == 'all' else [x]
            for x in args.language))

    # Put together set of remote hosts where to run and build
    remote_hosts = set()
    if args.remote_worker_host:
        for host in args.remote_worker_host:
            remote_hosts.add(host)
    if args.remote_driver_host:
        remote_hosts.add(args.remote_driver_host)

    if not args.dry_run:
        if remote_hosts:
            archive_repo(languages=[str(l) for l in languages])
            prepare_remote_hosts(remote_hosts, prepare_local=True)
        else:
            prepare_remote_hosts([], prepare_local=True)

    # Driver runs locally unless a remote driver host was given.
    build_local = False
    if not args.remote_driver_host:
        build_local = True
    if not args.dry_run:
        build_on_remote_hosts(remote_hosts,
                              languages=[str(l) for l in languages],
                              build_local=build_local)

    perf_cmd = None
    if args.perf_args:
        print('Running workers under perf profiler')
        # Expect /usr/bin/perf to be installed here, as is usual
        perf_cmd = ['/usr/bin/perf']
        # Raw string: '\s' is an invalid escape in a plain string literal.
        perf_cmd.extend(re.split(r'\s+', args.perf_args))

    qpsworker_jobs = create_qpsworkers(languages,
                                       args.remote_worker_host,
                                       perf_cmd=perf_cmd)

    # get list of worker addresses for each language.
    workers_by_lang = dict([(str(language), []) for language in languages])
    for job in qpsworker_jobs:
        workers_by_lang[str(job.language)].append(job)

    scenarios = create_scenarios(languages,
                                 workers_by_lang=workers_by_lang,
                                 remote_host=args.remote_driver_host,
                                 regex=args.regex,
                                 category=args.category,
                                 bq_result_table=args.bq_result_table,
                                 netperf=args.netperf,
                                 netperf_hosts=args.remote_worker_host,
                                 server_cpu_load=args.server_cpu_load)

    if not scenarios:
        raise Exception('No scenarios to run')

    total_scenario_failures = 0
    qps_workers_killed = 0
    merged_resultset = {}
    perf_report_failures = 0

    for scenario in scenarios:
        if args.dry_run:
            print(scenario.name)
        else:
            scenario_failures = 0
            try:
                for worker in scenario.workers:
                    worker.start()
                jobs = [scenario.jobspec]
                if scenario.workers:
                    jobs.append(create_quit_jobspec(
                        scenario.workers, remote_host=args.remote_driver_host))
                scenario_failures, resultset = jobset.run(
                    jobs, newline_on_success=True, maxjobs=1)
                total_scenario_failures += scenario_failures
                merged_resultset = dict(
                    itertools.chain(six.iteritems(merged_resultset),
                                    six.iteritems(resultset)))
            finally:
                # Consider qps workers that need to be killed as failures
                qps_workers_killed += finish_qps_workers(scenario.workers,
                                                         qpsworker_jobs)

            if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs:
                workers_and_base_names = {}
                for worker in scenario.workers:
                    if not worker.perf_file_base_name:
                        # Fixed garbled message ("perf buf" -> "perf but").
                        raise Exception(
                            'using perf but perf report filename is unspecified')
                    workers_and_base_names[worker.host_and_port] = worker.perf_file_base_name
                perf_report_failures += run_collect_perf_profile_jobs(
                    workers_and_base_names, scenario.name,
                    args.flame_graph_reports)

    # Still write the index.html even if some scenarios failed.
    # 'profile_output_files' will only have names for scenarios that passed
    if perf_cmd and not args.skip_generate_flamegraphs:
        # write the index file to the output dir, with all profiles from all scenarios/workers
        report_utils.render_perf_profiling_results(
            '%s/index.html' % args.flame_graph_reports, profile_output_files)

    report_utils.render_junit_xml_report(merged_resultset, args.xml_report,
                                         suite_name='benchmarks')

    if total_scenario_failures > 0 or qps_workers_killed > 0:
        print('%s scenarios failed and %s qps worker jobs killed' %
              (total_scenario_failures, qps_workers_killed))
        sys.exit(1)

    if perf_report_failures > 0:
        print('%s perf profile collection jobs failed' % perf_report_failures)
        sys.exit(1)
num_failures, resultset = jobset.run(jobs, newline_on_success=True, maxjobs=args.jobs, skip_jobs=args.manual_run) if num_failures: jobset.message('FAILED', 'Some tests failed', do_newline=True) else: jobset.message('SUCCESS', 'All tests passed', do_newline=True) write_cmdlog_maybe(server_manual_cmd_log, 'interop_server_cmds.sh') write_cmdlog_maybe(client_manual_cmd_log, 'interop_client_cmds.sh') xml_report_name = _XML_REPORT if args.internal_ci: xml_report_name = _INTERNAL_CL_XML_REPORT report_utils.render_junit_xml_report(resultset, xml_report_name) for name, job in resultset.items(): if "http2" in name: job[0].http2results = aggregate_http2_results(job[0].message) http2_server_test_cases = ( _HTTP2_SERVER_TEST_CASES if args.http2_server_interop else []) report_utils.render_interop_html_report( set([str(l) for l in languages]), servers, _TEST_CASES, _AUTH_TEST_CASES, _HTTP2_TEST_CASES, http2_server_test_cases, resultset, num_failures, args.cloud_to_prod_auth or args.cloud_to_prod, args.prod_servers, args.http2_interop) if num_failures:
if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs: workers_and_base_names = {} for worker in scenario.workers: if not worker.perf_file_base_name: raise Exception( 'using perf buf perf report filename is unspecified') workers_and_base_names[ worker.host_and_port] = worker.perf_file_base_name perf_report_failures += run_collect_perf_profile_jobs( workers_and_base_names, scenario.name) # Still write the index.html even if some scenarios failed. # 'profile_output_files' will only have names for scenarios that passed if perf_cmd and not args.skip_generate_flamegraphs: # write the index fil to the output dir, with all profiles from all scenarios/workers report_utils.render_perf_profiling_results( '%s/index.html' % args.flame_graph_reports, profile_output_files) report_utils.render_junit_xml_report(merged_resultset, args.xml_report, suite_name='benchmarks') if total_scenario_failures > 0 or qps_workers_killed > 0: print('%s scenarios failed and %s qps worker jobs killed' % (total_scenario_failures, qps_workers_killed)) sys.exit(1) if perf_report_failures > 0: print('%s perf profile collection jobs failed' % perf_report_failures) sys.exit(1)
else: print(' %s' % job.shortname) print if args.dry_run: print('--dry_run was used, exiting') sys.exit(1) jobset.message('START', 'Running test matrix.', do_newline=True) num_failures, resultset = jobset.run(jobs, newline_on_success=True, travis=True, maxjobs=args.jobs) # Merge skipped tests into results to show skipped tests on report.xml if skipped_jobs: skipped_results = jobset.run(skipped_jobs, skip_jobs=True) resultset.update(skipped_results) report_utils.render_junit_xml_report(resultset, 'report.xml', suite_name='aggregate_tests') if num_failures == 0: jobset.message('SUCCESS', 'All run_tests.py instance finished successfully.', do_newline=True) else: jobset.message('FAILED', 'Some run_tests.py instance have failed.', do_newline=True) sys.exit(1)
def run_one_scenario(scenario_config):
    """Run a single grpclb interop scenario end to end.

    Starts backend, fallback, and balancer servers plus a controlled DNS
    server (all as docker jobs), then runs one client job per language and
    returns the number of client-job failures. Server containers are always
    torn down in the `finally` block; their logs are suppressed unless a
    test failed or --verbose was given.
    """
    jobset.message('START', 'Run scenario: %s' % scenario_config['name'])
    server_jobs = {}
    suppress_server_logs = True
    try:
        backend_addrs = []
        fallback_ips = []
        grpclb_ips = []
        shortname_prefix = scenario_config['name']
        # Start backends
        # (enumerate instead of Python-2-only `for i in xrange(len(...))`)
        for i, backend_config in enumerate(scenario_config['backend_configs']):
            backend_shortname = shortname(shortname_prefix, 'backend_server', i)
            backend_spec = backend_server_jobspec(
                backend_config['transport_sec'], backend_shortname)
            backend_job = dockerjob.DockerJob(backend_spec)
            server_jobs[backend_shortname] = backend_job
            backend_addrs.append(
                '%s:%d' % (backend_job.ip_address(), _BACKEND_SERVER_PORT))
        # Start fallbacks
        for i, fallback_config in enumerate(scenario_config['fallback_configs']):
            fallback_shortname = shortname(shortname_prefix, 'fallback_server', i)
            fallback_spec = fallback_server_jobspec(
                fallback_config['transport_sec'], fallback_shortname)
            fallback_job = dockerjob.DockerJob(fallback_spec)
            server_jobs[fallback_shortname] = fallback_job
            fallback_ips.append(fallback_job.ip_address())
        # Start balancers
        for i, balancer_config in enumerate(scenario_config['balancer_configs']):
            grpclb_shortname = shortname(shortname_prefix, 'grpclb_server', i)
            grpclb_spec = grpclb_jobspec(balancer_config['transport_sec'],
                                         balancer_config['short_stream'],
                                         backend_addrs, grpclb_shortname)
            grpclb_job = dockerjob.DockerJob(grpclb_spec)
            server_jobs[grpclb_shortname] = grpclb_job
            grpclb_ips.append(grpclb_job.ip_address())
        # Start DNS server
        dns_server_shortname = shortname(shortname_prefix, 'dns_server', 0)
        dns_server_spec = dns_server_in_docker_jobspec(
            grpclb_ips, fallback_ips, dns_server_shortname,
            scenario_config['cause_no_error_no_data_for_balancer_a_record'])
        dns_server_job = dockerjob.DockerJob(dns_server_spec)
        server_jobs[dns_server_shortname] = dns_server_job
        # Get the IP address of the docker container running the DNS server.
        # The DNS server is running on port 53 of that IP address. Note we will
        # point the DNS resolvers of grpc clients under test to our controlled
        # DNS server by effectively modifying the /etc/resolv.conf "nameserver"
        # lists of their docker containers.
        dns_server_ip = dns_server_job.ip_address()
        wait_until_dns_server_is_up(dns_server_ip)
        # Run clients
        jobs = []
        for lang_name in languages:
            # Skip languages that are known to not currently
            # work for this test.
            if not args.no_skips and lang_name in scenario_config.get(
                    'skip_langs', []):
                jobset.message(
                    'IDLE', 'Skipping scenario: %s for language: %s\n' %
                    (scenario_config['name'], lang_name))
                continue
            lang = _LANGUAGES[lang_name]
            test_job = lb_client_interop_jobspec(
                lang,
                dns_server_ip,
                docker_image=docker_images.get(lang.safename),
                transport_security=scenario_config['transport_sec'])
            jobs.append(test_job)
        jobset.message(
            'IDLE', 'Jobs to run: \n%s\n' % '\n'.join(str(job) for job in jobs))
        num_failures, resultset = jobset.run(jobs,
                                             newline_on_success=True,
                                             maxjobs=args.jobs)
        report_utils.render_junit_xml_report(resultset, 'sponge_log.xml')
        if num_failures:
            # Keep server logs visible so failures can be diagnosed.
            suppress_server_logs = False
            jobset.message('FAILED',
                           'Scenario: %s. Some tests failed' %
                           scenario_config['name'],
                           do_newline=True)
        else:
            jobset.message('SUCCESS',
                           'Scenario: %s. All tests passed' %
                           scenario_config['name'],
                           do_newline=True)
        return num_failures
    finally:
        # Check if servers are still running.
        for server, job in server_jobs.items():
            if not job.is_running():
                print('Server "%s" has exited prematurely.' % server)
        suppress_failure = suppress_server_logs and not args.verbose
        dockerjob.finish_jobs(list(server_jobs.values()),
                              suppress_failure=suppress_failure)
print('') if args.dry_run: print('--dry_run was used, exiting') sys.exit(1) jobset.message('START', 'Running test matrix.', do_newline=True) num_failures, resultset = jobset.run(jobs, newline_on_success=True, travis=True, maxjobs=args.jobs) # Merge skipped tests into results to show skipped tests on report.xml if skipped_jobs: ignored_num_skipped_failures, skipped_results = jobset.run( skipped_jobs, skip_jobs=True) resultset.update(skipped_results) report_utils.render_junit_xml_report(resultset, _report_filename(_MATRIX_REPORT_NAME), suite_name=_MATRIX_REPORT_NAME, multi_target=True) if num_failures == 0: jobset.message('SUCCESS', 'All run_tests.py instances finished successfully.', do_newline=True) else: jobset.message('FAILED', 'Some run_tests.py instances have failed.', do_newline=True) sys.exit(1)
if args.verbose: print('Jobs to run: \n%s\n' % '\n'.join(str(job) for job in jobs)) num_failures, resultset = jobset.run(jobs, newline_on_success=True, maxjobs=args.jobs, skip_jobs=args.manual_run) if num_failures: jobset.message('FAILED', 'Some tests failed', do_newline=True) else: jobset.message('SUCCESS', 'All tests passed', do_newline=True) write_cmdlog_maybe(server_manual_cmd_log, 'interop_server_cmds.sh') write_cmdlog_maybe(client_manual_cmd_log, 'interop_client_cmds.sh') report_utils.render_junit_xml_report(resultset, 'report.xml') for name, job in resultset.items(): if "http2" in name: job[0].http2results = aggregate_http2_results(job[0].message) http2_server_test_cases = ( _HTTP2_SERVER_TEST_CASES if args.http2_server_interop else []) report_utils.render_interop_html_report( set([str(l) for l in languages]), servers, _TEST_CASES, _AUTH_TEST_CASES, _HTTP2_TEST_CASES, http2_server_test_cases, resultset, num_failures, args.cloud_to_prod_auth or args.cloud_to_prod, args.prod_servers, args.http2_interop) except Exception as e: print('exception occurred:')
prebuild_jobs += target.pre_build_jobspecs() if prebuild_jobs: num_failures, _ = jobset.run(prebuild_jobs, newline_on_success=True, maxjobs=args.jobs) if num_failures != 0: jobset.message('FAILED', 'Pre-build phase failed.', do_newline=True) sys.exit(1) build_jobs = [] for target in targets: build_jobs.append(target.build_jobspec()) if not build_jobs: print('Nothing to build.') sys.exit(1) jobset.message('START', 'Building targets.', do_newline=True) num_failures, resultset = jobset.run(build_jobs, newline_on_success=True, maxjobs=args.jobs) report_utils.render_junit_xml_report(resultset, 'report_taskrunner_sponge_log.xml', suite_name='tasks') if num_failures == 0: jobset.message('SUCCESS', 'All targets built successfully.', do_newline=True) else: jobset.message('FAILED', 'Failed to build targets.', do_newline=True) sys.exit(1)
if args.dry_run: print(' %s: "%s"' % (job.shortname, ' '.join(job.cmdline))) else: print(' %s' % job.shortname) print if args.dry_run: print('--dry_run was used, exiting') sys.exit(1) jobset.message('START', 'Running test matrix.', do_newline=True) num_failures, resultset = jobset.run(jobs, newline_on_success=True, travis=True, maxjobs=args.jobs) # Merge skipped tests into results to show skipped tests on report.xml if skipped_jobs: ignored_num_skipped_failures, skipped_results = jobset.run( skipped_jobs, skip_jobs=True) resultset.update(skipped_results) report_utils.render_junit_xml_report(resultset, _report_filename('aggregate_tests'), suite_name='aggregate_tests') if num_failures == 0: jobset.message('SUCCESS', 'All run_tests.py instance finished successfully.', do_newline=True) else: jobset.message('FAILED', 'Some run_tests.py instance have failed.', do_newline=True) sys.exit(1)
elif test_case == 'secondary_locality_gets_requests_on_primary_failure': test_secondary_locality_gets_requests_on_primary_failure( gcp, backend_service, instance_group, secondary_zone_instance_group) else: logger.error('Unknown test case: %s', test_case) sys.exit(1) result.state = 'PASSED' result.returncode = 0 except Exception as e: result.state = 'FAILED' result.message = str(e) finally: if client_process: client_process.terminate() # Workaround for Python 3, as report_utils will invoke decode() on # result.message, which has a default value of ''. result.message = result.message.encode('UTF-8') test_results[test_case] = [result] if not os.path.exists(_TEST_LOG_BASE_DIR): os.makedirs(_TEST_LOG_BASE_DIR) report_utils.render_junit_xml_report(test_results, os.path.join(_TEST_LOG_BASE_DIR, _SPONGE_XML_NAME), suite_name='xds_tests', multi_target=True) finally: if not args.keep_gcp_resources: logger.info('Cleaning up GCP resources. This may take some time.') clean_up(gcp)
if args.dry_run: print(' %s: "%s"' % (job.shortname, ' '.join(job.cmdline))) else: print(' %s' % job.shortname) print if args.dry_run: print('--dry_run was used, exiting') sys.exit(1) jobset.message('START', 'Running test matrix.', do_newline=True) num_failures, resultset = jobset.run(jobs, newline_on_success=True, travis=True, maxjobs=args.jobs) # Merge skipped tests into results to show skipped tests on report.xml if skipped_jobs: skipped_results = jobset.run(skipped_jobs, skip_jobs=True) resultset.update(skipped_results) report_utils.render_junit_xml_report(resultset, 'report.xml', suite_name='aggregate_tests') if num_failures == 0: jobset.message('SUCCESS', 'All run_tests.py instance finished successfully.', do_newline=True) else: jobset.message('FAILED', 'Some run_tests.py instance have failed.', do_newline=True) sys.exit(1)
prebuild_jobs += target.pre_build_jobspecs() if prebuild_jobs: num_failures, _ = jobset.run(prebuild_jobs, newline_on_success=True, maxjobs=args.jobs) if num_failures != 0: jobset.message('FAILED', 'Pre-build phase failed.', do_newline=True) sys.exit(1) build_jobs = [] for target in targets: build_jobs.append(target.build_jobspec()) if not build_jobs: print('Nothing to build.') sys.exit(1) jobset.message('START', 'Building targets.', do_newline=True) num_failures, resultset = jobset.run(build_jobs, newline_on_success=True, maxjobs=args.jobs) report_utils.render_junit_xml_report(resultset, args.xml_report, suite_name='tasks') if num_failures == 0: jobset.message('SUCCESS', 'All targets built successfully.', do_newline=True) else: jobset.message('FAILED', 'Failed to build targets.', do_newline=True) sys.exit(1)
maxjobs=args.jobs, skip_jobs=args.manual_run) if args.bq_result_table and resultset: upload_interop_results_to_bq(resultset, args.bq_result_table, args) if num_failures: jobset.message('FAILED', 'Some tests failed', do_newline=True) else: jobset.message('SUCCESS', 'All tests passed', do_newline=True) write_cmdlog_maybe(server_manual_cmd_log, 'interop_server_cmds.sh') write_cmdlog_maybe(client_manual_cmd_log, 'interop_client_cmds.sh') xml_report_name = _XML_REPORT if args.internal_ci: xml_report_name = _INTERNAL_CL_XML_REPORT report_utils.render_junit_xml_report(resultset, xml_report_name) for name, job in resultset.items(): if "http2" in name: job[0].http2results = aggregate_http2_results(job[0].message) http2_server_test_cases = (_HTTP2_SERVER_TEST_CASES if args.http2_server_interop else []) report_utils.render_interop_html_report( set([str(l) for l in languages]), servers, _TEST_CASES, _AUTH_TEST_CASES, _HTTP2_TEST_CASES, http2_server_test_cases, resultset, num_failures, args.cloud_to_prod_auth or args.cloud_to_prod, args.prod_servers, args.http2_interop) if num_failures:
# Execute pre-build phase prebuild_jobs = [] for target in targets: prebuild_jobs += target.pre_build_jobspecs() if prebuild_jobs: num_failures, _ = jobset.run( prebuild_jobs, newline_on_success=True, maxjobs=args.jobs) if num_failures != 0: jobset.message('FAILED', 'Pre-build phase failed.', do_newline=True) sys.exit(1) build_jobs = [] for target in targets: build_jobs.append(target.build_jobspec()) if not build_jobs: print('Nothing to build.') sys.exit(1) jobset.message('START', 'Building targets.', do_newline=True) num_failures, resultset = jobset.run( build_jobs, newline_on_success=True, maxjobs=args.jobs) report_utils.render_junit_xml_report(resultset, 'report_taskrunner_sponge_log.xml', suite_name='tasks') if num_failures == 0: jobset.message('SUCCESS', 'All targets built successfully.', do_newline=True) else: jobset.message('FAILED', 'Failed to build targets.', do_newline=True) sys.exit(1)
def main():
    """Entry point (formatted variant): parse args and run benchmark scenarios.

    Identical flow to the unformatted runner: expand languages, prepare and
    build on local/remote hosts, run each scenario against QPS workers
    (optionally under `perf`), merge results, render the XML report, and
    exit non-zero on any scenario/worker/profiling failure.
    """
    argp = argparse.ArgumentParser(description='Run performance tests.')
    argp.add_argument(
        '-l',
        '--language',
        choices=['all'] + sorted(scenario_config.LANGUAGES.keys()),
        nargs='+',
        required=True,
        help='Languages to benchmark.')
    argp.add_argument(
        '--remote_driver_host',
        default=None,
        help=
        'Run QPS driver on given host. By default, QPS driver is run locally.')
    argp.add_argument(
        '--remote_worker_host',
        nargs='+',
        default=[],
        help='Worker hosts where to start QPS workers.')
    argp.add_argument(
        '--dry_run',
        default=False,
        action='store_const',
        const=True,
        help='Just list scenarios to be run, but don\'t run them.')
    argp.add_argument(
        '-r',
        '--regex',
        default='.*',
        type=str,
        help='Regex to select scenarios to run.')
    argp.add_argument(
        '--bq_result_table',
        default=None,
        type=str,
        help='Bigquery "dataset.table" to upload results to.')
    argp.add_argument(
        '--category',
        choices=['smoketest', 'all', 'scalable', 'sweep'],
        default='all',
        help='Select a category of tests to run.')
    argp.add_argument(
        '--netperf',
        default=False,
        action='store_const',
        const=True,
        help='Run netperf benchmark as one of the scenarios.')
    argp.add_argument(
        '--server_cpu_load',
        default=0,
        type=int,
        help='Select a targeted server cpu load to run. 0 means ignore this flag'
    )
    argp.add_argument(
        '-x',
        '--xml_report',
        default='report.xml',
        type=str,
        help='Name of XML report file to generate.')
    argp.add_argument(
        '--perf_args',
        help=('Example usage: "--perf_args=record -F 99 -g". '
              'Wrap QPS workers in a perf command '
              'with the arguments to perf specified here. '
              '".svg" flame graph profiles will be '
              'created for each Qps Worker on each scenario. '
              'Files will output to "<repo_root>/<args.flame_graph_reports>" '
              'directory. Output files from running the worker '
              'under perf are saved in the repo root where its ran. '
              'Note that the perf "-g" flag is necessary for '
              'flame graphs generation to work (assuming the binary '
              'being profiled uses frame pointers, check out '
              '"--call-graph dwarf" option using libunwind otherwise.) '
              'Also note that the entire "--perf_args=<arg(s)>" must '
              'be wrapped in quotes as in the example usage. '
              # Fixed typo: the flag is --perf_args, not --perg_args.
              'If the "--perf_args" is unspecified, "perf" will '
              'not be used at all. '
              'See http://www.brendangregg.com/perf.html '
              'for more general perf examples.'))
    argp.add_argument(
        '--skip_generate_flamegraphs',
        default=False,
        action='store_const',
        const=True,
        help=('Turn flame graph generation off. '
              'May be useful if "perf_args" arguments do not make sense for '
              'generating flamegraphs (e.g., "--perf_args=stat ...")'))
    argp.add_argument(
        '-f',
        '--flame_graph_reports',
        default='perf_reports',
        type=str,
        help=
        'Name of directory to output flame graph profiles to, if any are created.'
    )
    argp.add_argument(
        '-u',
        '--remote_host_username',
        default='',
        type=str,
        help='Use a username that isn\'t "Jenkins" to SSH into remote workers.')

    args = argp.parse_args()

    global _REMOTE_HOST_USERNAME
    if args.remote_host_username:
        _REMOTE_HOST_USERNAME = args.remote_host_username

    # Expand 'all' into every known language; de-dup via set.
    languages = set(
        scenario_config.LANGUAGES[l] for l in itertools.chain.from_iterable(
            six.iterkeys(scenario_config.LANGUAGES) if x == 'all' else [x]
            for x in args.language))

    # Put together set of remote hosts where to run and build
    remote_hosts = set()
    if args.remote_worker_host:
        for host in args.remote_worker_host:
            remote_hosts.add(host)
    if args.remote_driver_host:
        remote_hosts.add(args.remote_driver_host)

    if not args.dry_run:
        if remote_hosts:
            archive_repo(languages=[str(l) for l in languages])
            prepare_remote_hosts(remote_hosts, prepare_local=True)
        else:
            prepare_remote_hosts([], prepare_local=True)

    # Driver runs locally unless a remote driver host was given.
    build_local = False
    if not args.remote_driver_host:
        build_local = True
    if not args.dry_run:
        build_on_remote_hosts(
            remote_hosts,
            languages=[str(l) for l in languages],
            build_local=build_local)

    perf_cmd = None
    if args.perf_args:
        print('Running workers under perf profiler')
        # Expect /usr/bin/perf to be installed here, as is usual
        perf_cmd = ['/usr/bin/perf']
        # Raw string: '\s' is an invalid escape in a plain string literal.
        perf_cmd.extend(re.split(r'\s+', args.perf_args))

    qpsworker_jobs = create_qpsworkers(
        languages, args.remote_worker_host, perf_cmd=perf_cmd)

    # get list of worker addresses for each language.
    workers_by_lang = dict([(str(language), []) for language in languages])
    for job in qpsworker_jobs:
        workers_by_lang[str(job.language)].append(job)

    scenarios = create_scenarios(
        languages,
        workers_by_lang=workers_by_lang,
        remote_host=args.remote_driver_host,
        regex=args.regex,
        category=args.category,
        bq_result_table=args.bq_result_table,
        netperf=args.netperf,
        netperf_hosts=args.remote_worker_host,
        server_cpu_load=args.server_cpu_load)

    if not scenarios:
        raise Exception('No scenarios to run')

    total_scenario_failures = 0
    qps_workers_killed = 0
    merged_resultset = {}
    perf_report_failures = 0

    for scenario in scenarios:
        if args.dry_run:
            print(scenario.name)
        else:
            scenario_failures = 0
            try:
                for worker in scenario.workers:
                    worker.start()
                jobs = [scenario.jobspec]
                if scenario.workers:
                    # TODO(jtattermusch): ideally the "quit" job won't show up
                    # in the report
                    jobs.append(
                        create_quit_jobspec(
                            scenario.workers,
                            remote_host=args.remote_driver_host))
                scenario_failures, resultset = jobset.run(
                    jobs, newline_on_success=True, maxjobs=1)
                total_scenario_failures += scenario_failures
                merged_resultset = dict(
                    itertools.chain(
                        six.iteritems(merged_resultset),
                        six.iteritems(resultset)))
            finally:
                # Consider qps workers that need to be killed as failures
                qps_workers_killed += finish_qps_workers(
                    scenario.workers, qpsworker_jobs)

            if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs:
                workers_and_base_names = {}
                for worker in scenario.workers:
                    if not worker.perf_file_base_name:
                        # Fixed garbled message ("perf buf" -> "perf but").
                        raise Exception(
                            'using perf but perf report filename is unspecified'
                        )
                    workers_and_base_names[
                        worker.host_and_port] = worker.perf_file_base_name
                perf_report_failures += run_collect_perf_profile_jobs(
                    workers_and_base_names, scenario.name,
                    args.flame_graph_reports)

    # Still write the index.html even if some scenarios failed.
    # 'profile_output_files' will only have names for scenarios that passed
    if perf_cmd and not args.skip_generate_flamegraphs:
        # write the index file to the output dir, with all profiles from all scenarios/workers
        report_utils.render_perf_profiling_results(
            '%s/index.html' % args.flame_graph_reports, profile_output_files)

    report_utils.render_junit_xml_report(
        merged_resultset,
        args.xml_report,
        suite_name='benchmarks',
        multi_target=True)

    if total_scenario_failures > 0 or qps_workers_killed > 0:
        print('%s scenarios failed and %s qps worker jobs killed' %
              (total_scenario_failures, qps_workers_killed))
        sys.exit(1)

    if perf_report_failures > 0:
        print('%s perf profile collection jobs failed' % perf_report_failures)
        sys.exit(1)
print(' %s' % job.shortname) print if args.dry_run: print('--dry_run was used, exiting') sys.exit(1) jobset.message('START', 'Running test matrix.', do_newline=True) num_failures, resultset = jobset.run(jobs, newline_on_success=True, travis=True, maxjobs=args.jobs) # Merge skipped tests into results to show skipped tests on report.xml if skipped_jobs: ignored_num_skipped_failures, skipped_results = jobset.run( skipped_jobs, skip_jobs=True) resultset.update(skipped_results) report_utils.render_junit_xml_report(resultset, _report_filename('aggregate_tests'), suite_name='aggregate_tests') if num_failures == 0: jobset.message('SUCCESS', 'All run_tests.py instance finished successfully.', do_newline=True) else: jobset.message('FAILED', 'Some run_tests.py instance have failed.', do_newline=True) sys.exit(1)
if args.dry_run: print(' %s: "%s"' % (job.shortname, ' '.join(job.cmdline))) else: print(' %s' % job.shortname) print if args.dry_run: print('--dry_run was used, exiting') sys.exit(1) jobset.message('START', 'Running test matrix.', do_newline=True) num_failures, resultset = jobset.run(jobs, newline_on_success=True, travis=True, maxjobs=args.jobs) # Merge skipped tests into results to show skipped tests on report.xml if skipped_jobs: ignored_num_skipped_failures, skipped_results = jobset.run( skipped_jobs, skip_jobs=True) resultset.update(skipped_results) report_utils.render_junit_xml_report(resultset, 'report_%s' % _REPORT_SUFFIX, suite_name='aggregate_tests') if num_failures == 0: jobset.message('SUCCESS', 'All run_tests.py instance finished successfully.', do_newline=True) else: jobset.message('FAILED', 'Some run_tests.py instance have failed.', do_newline=True) sys.exit(1)
print(' %s' % job.shortname) print if args.dry_run: print('--dry_run was used, exiting') sys.exit(1) jobset.message('START', 'Running test matrix.', do_newline=True) num_failures, resultset = jobset.run(jobs, newline_on_success=True, travis=True, maxjobs=args.jobs) # Merge skipped tests into results to show skipped tests on report.xml if skipped_jobs: ignored_num_skipped_failures, skipped_results = jobset.run( skipped_jobs, skip_jobs=True) resultset.update(skipped_results) report_utils.render_junit_xml_report(resultset, 'report_%s' % _REPORT_SUFFIX, suite_name='aggregate_tests') if num_failures == 0: jobset.message('SUCCESS', 'All run_tests.py instance finished successfully.', do_newline=True) else: jobset.message('FAILED', 'Some run_tests.py instance have failed.', do_newline=True) sys.exit(1)
def run_one_scenario(scenario_config):
  """Run one grpclb interop scenario end to end inside docker.

  Brings up the server side of the scenario in docker containers (backends,
  fallback servers, grpclb balancers, then a DNS server that advertises
  them), waits for DNS to come up, runs one interop client job per
  non-skipped language, and renders a junit report.

  Args:
    scenario_config: dict describing the scenario; keys used here:
      'name', 'backend_configs', 'fallback_configs', 'balancer_configs',
      'cause_no_error_no_data_for_balancer_a_record', 'transport_sec',
      and optionally 'skip_langs'.

  Returns:
    The number of failed client test jobs (0 on full success).
  """
  jobset.message('START', 'Run scenario: %s' % scenario_config['name'])
  server_jobs = {}
  # Server logs are noise unless something failed; flipped off below on failure.
  suppress_server_logs = True
  try:
    backend_addrs = []
    fallback_ips = []
    grpclb_ips = []
    shortname_prefix = scenario_config['name']
    # Start backends. They must be up first so the balancers (started below)
    # can be configured with their addresses.
    for i, backend_config in enumerate(scenario_config['backend_configs']):
      backend_shortname = shortname(shortname_prefix, 'backend_server', i)
      backend_spec = backend_server_jobspec(backend_config['transport_sec'],
                                            backend_shortname)
      backend_job = dockerjob.DockerJob(backend_spec)
      server_jobs[backend_shortname] = backend_job
      backend_addrs.append('%s:%d' % (backend_job.ip_address(),
                                      _BACKEND_SERVER_PORT))
    # Start fallbacks
    for i, fallback_config in enumerate(scenario_config['fallback_configs']):
      fallback_shortname = shortname(shortname_prefix, 'fallback_server', i)
      fallback_spec = fallback_server_jobspec(fallback_config['transport_sec'],
                                              fallback_shortname)
      fallback_job = dockerjob.DockerJob(fallback_spec)
      server_jobs[fallback_shortname] = fallback_job
      fallback_ips.append(fallback_job.ip_address())
    # Start balancers
    for i, balancer_config in enumerate(scenario_config['balancer_configs']):
      grpclb_shortname = shortname(shortname_prefix, 'grpclb_server', i)
      grpclb_spec = grpclb_jobspec(balancer_config['transport_sec'],
                                   balancer_config['short_stream'],
                                   backend_addrs, grpclb_shortname)
      grpclb_job = dockerjob.DockerJob(grpclb_spec)
      server_jobs[grpclb_shortname] = grpclb_job
      grpclb_ips.append(grpclb_job.ip_address())
    # Start DNS server advertising the balancer and fallback addresses.
    dns_server_shortname = shortname(shortname_prefix, 'dns_server', 0)
    dns_server_spec = dns_server_in_docker_jobspec(
        grpclb_ips, fallback_ips, dns_server_shortname,
        scenario_config['cause_no_error_no_data_for_balancer_a_record'])
    dns_server_job = dockerjob.DockerJob(dns_server_spec)
    server_jobs[dns_server_shortname] = dns_server_job
    # Get the IP address of the docker container running the DNS server.
    # The DNS server is running on port 53 of that IP address. Note we will
    # point the DNS resolvers of grpc clients under test to our controlled
    # DNS server by effectively modifying the /etc/resolv.conf "nameserver"
    # lists of their docker containers.
    dns_server_ip = dns_server_job.ip_address()
    wait_until_dns_server_is_up(dns_server_ip)
    # Run clients
    jobs = []
    for lang_name in languages:
      # Skip languages that are known to not currently
      # work for this test (unless --no_skips was passed).
      if not args.no_skips and lang_name in scenario_config.get(
          'skip_langs', []):
        jobset.message('IDLE',
                       'Skipping scenario: %s for language: %s\n' %
                       (scenario_config['name'], lang_name))
        continue
      lang = _LANGUAGES[lang_name]
      test_job = lb_client_interop_jobspec(
          lang,
          dns_server_ip,
          docker_image=docker_images.get(lang.safename),
          transport_security=scenario_config['transport_sec'])
      jobs.append(test_job)
    jobset.message('IDLE', 'Jobs to run: \n%s\n' % '\n'.join(
        str(job) for job in jobs))
    num_failures, resultset = jobset.run(
        jobs, newline_on_success=True, maxjobs=args.jobs)
    report_utils.render_junit_xml_report(resultset, 'sponge_log.xml')
    if num_failures:
      # Keep server logs visible so the failure can be diagnosed.
      suppress_server_logs = False
      jobset.message(
          'FAILED',
          'Scenario: %s. Some tests failed' % scenario_config['name'],
          do_newline=True)
    else:
      jobset.message(
          'SUCCESS',
          'Scenario: %s. All tests passed' % scenario_config['name'],
          do_newline=True)
    return num_failures
  finally:
    # Check if servers are still running; a server that died mid-scenario is
    # a likely root cause for client failures.
    for server, job in server_jobs.items():
      if not job.is_running():
        print('Server "%s" has exited prematurely.' % server)
    suppress_failure = suppress_server_logs and not args.verbose
    dockerjob.finish_jobs(
        list(six.itervalues(server_jobs)), suppress_failure=suppress_failure)