def run_tests(config):
  """ The main function that launches the stress tests """
  # Build docker images and push to GKE registry
  if config.global_settings.build_docker_images:
    for name, docker_image in config.docker_images_dict.iteritems():
      if not (docker_image.build_image() and
              docker_image.push_to_gke_registry()):
        return False

  # Create a unique id for this run (Note: Using timestamp instead of UUID to
  # make it easier to deduce the date/time of the run just by looking at the
  # run id. This is useful in debugging when looking at records in BigQuery)
  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
  dataset_id = '%s_%s' % (config.global_settings.dataset_id_prefix, run_id)

  bq_helper = BigQueryHelper(run_id, '', '',
                             config.global_settings.gcp_project_id, dataset_id,
                             config.global_settings.summary_table_id,
                             config.global_settings.qps_table_id)
  bq_helper.initialize()

  gke = Gke(config.global_settings.gcp_project_id, run_id, dataset_id,
            config.global_settings.summary_table_id,
            config.global_settings.qps_table_id,
            config.global_settings.kubernetes_proxy_port)

  is_success = True
  try:
    print 'Launching servers..'
    for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
      if not gke.launch_servers(server_pod_spec):
        is_success = False  # is_success is checked in the 'finally' block
        return False

    print('Launched servers. Waiting for %d seconds for the server pods to be '
          'fully online') % config.global_settings.pod_warmup_secs
    time.sleep(config.global_settings.pod_warmup_secs)

    for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
      if not gke.launch_clients(client_pod_spec):
        is_success = False  # is_success is checked in the 'finally' block
        return False

    print('Launched all clients. Waiting for %d seconds for the client pods '
          'to be fully online') % config.global_settings.pod_warmup_secs
    time.sleep(config.global_settings.pod_warmup_secs)

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(
        seconds=config.global_settings.test_duration_secs)
    print 'Running the test until %s' % end_time.isoformat()

    while True:
      if datetime.datetime.now() > end_time:
        print 'Test was run for %d seconds' % config.global_settings.test_duration_secs
        break

      # Check if either stress server or clients have failed (the bq_helper
      # monitors all the rows in the summary table and checks if any of them
      # have a failure status)
      if bq_helper.check_if_any_tests_failed():
        is_success = False
        print 'Some tests failed.'
        # Don't 'return' here. We still want bq_helper to print the qps and
        # summary tables
        break

      # Tests are running fine. Wait until the next poll time to check the
      # status
      print 'Sleeping for %d seconds..' % config.global_settings.test_poll_interval_secs
      time.sleep(config.global_settings.test_poll_interval_secs)

    # Print BigQuery tables
    bq_helper.print_qps_records()
    bq_helper.print_summary_records()

  finally:
    # If there was a test failure, we should not delete the pods since they
    # would contain useful debug information (logs, core dumps etc)
    if is_success:
      for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
        gke.delete_servers(server_pod_spec)
      for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
        gke.delete_clients(client_pod_spec)

  return is_success
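
# run_tests() above drives a Gke helper class that is not shown in this
# section. The stub below is only a sketch of the interface implied by the
# calls made above (launch/delete of server and client pods); the real class
# talks to the Kubernetes API (through a local proxy port) and is hypothetical
# in its details here.
class GkeStub(object):
  """Hypothetical stand-in illustrating the interface run_tests() relies on."""

  def __init__(self, project_id, run_id, dataset_id, summary_table_id,
               qps_table_id, kubernetes_proxy_port):
    self.project_id = project_id
    self.run_id = run_id
    self.dataset_id = dataset_id
    self.summary_table_id = summary_table_id
    self.qps_table_id = qps_table_id
    self.kubernetes_proxy_port = kubernetes_proxy_port

  def launch_servers(self, server_pod_spec):
    # Create the server pods/services described by server_pod_spec.
    # Returns True on success, False otherwise.
    return True

  def launch_clients(self, client_pod_spec):
    # Create the client pods described by client_pod_spec.
    return True

  def delete_servers(self, server_pod_spec):
    # Tear down the server pods/services.
    return True

  def delete_clients(self, client_pod_spec):
    # Tear down the client pods.
    return True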

def run_client():
  """This is a wrapper around the stress test client and performs the following:
      1) Create the following two tables in Big Query:
         (i) Summary table: To record events like the test started, completed
             successfully or failed
        (ii) Qps table: To periodically record the QPS sent by this client
      2) Start the stress test client and add a row in the Big Query summary
         table
      3) Once every few seconds (as specified by poll_interval_secs) poll the
         status of the stress test client process and perform the following:
          3.1) If the process is still running, get the current qps by invoking
               the metrics client program and add a row in the Big Query
               Qps table. Sleep for a duration specified by poll_interval_secs
          3.2) If the process exited successfully, add a row in the Big Query
               Summary table and exit
          3.3) If the process failed, add a row in the Big Query summary table
               and wait forever.
               NOTE: This script typically runs inside a GKE pod which means
               that the pod gets destroyed when the script exits. However, in
               case the stress test client fails, we would not want the pod to
               be destroyed (since we might want to connect to the pod for
               examining logs). This is the reason why the script waits forever
               in case of failures
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the client crashes (Note: This is not relevant for Java and Go clients)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  env = dict(os.environ)
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_client_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  metrics_client_cmd = env['METRICS_CLIENT_CMD'].split()
  metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
  run_id = env['RUN_ID']
  pod_name = env['POD_NAME']
  logfile_name = env.get('LOGFILE_NAME')
  poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  # The following parameter tells us whether the stress client runs forever
  # until forcefully stopped or whether it naturally stops after some time.
  # If it is expected to run forever, we know the client process should not
  # terminate (even with a success exit code) and we flag any termination as
  # a failure.
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening logfile: %s ...' % logfile_name
    details = 'Logfile: %s' % logfile_name
    logfile = open(logfile_name, 'w')

  metrics_cmd = metrics_client_cmd + [x for x in metrics_client_args_str.split()]
  stress_cmd = stress_client_cmd + [x for x in args_str.split()]

  details = '%s, Metrics command: %s, Stress client command: %s' % (
      details, str(metrics_cmd), str(stress_cmd))
  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  qps_history = [1, 1, 1]  # Maintain the last 3 qps readings
  qps_history_idx = 0  # Index into the qps_history list
  is_running_status_written = False
  is_error = False
  while True:
    # Check if stress_client is still running. If so, collect metrics and
    # upload to the BigQuery status table.
    # If stress_p.poll() is not None, the stress client has terminated.
    if stress_p.poll() is not None:
      end_time = datetime.datetime.now().isoformat()
      event_type = EventType.SUCCESS
      details = 'End time: %s' % end_time
      if will_run_forever == '1' or stress_p.returncode != 0:
        event_type = EventType.FAILURE
        details = 'Return code = %d. End time: %s' % (stress_p.returncode,
                                                      end_time)
        is_error = True
      bq_helper.insert_summary_row(event_type, details)
      print details
      break

    if not is_running_status_written:
      bq_helper.insert_summary_row(EventType.RUNNING, '')
      is_running_status_written = True

    # Stress client still running. Get metrics
    qps = _get_qps(metrics_cmd)
    qps_recorded_at = datetime.datetime.now().isoformat()
    print 'qps: %d at %s' % (qps, qps_recorded_at)

    # If QPS has been zero for the last 3 iterations, flag it as error and exit
    qps_history[qps_history_idx] = qps
    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
    if sum(qps_history) == 0:
      details = 'QPS has been zero for the last %d seconds - as of : %s' % (
          poll_interval_secs * 3, qps_recorded_at)
      is_error = True
      bq_helper.insert_summary_row(EventType.FAILURE, details)
      print details
      break

    # Upload qps metrics to BigQuery
    bq_helper.insert_qps_row(qps, qps_recorded_at)
    time.sleep(poll_interval_secs)

  if is_error:
    print 'Waiting indefinitely..'
    select.select([], [], [])

  print 'Completed'
  return
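
# run_client() above relies on a _get_qps() helper that is not shown in this
# section. The sketch below is an assumption about its shape: it runs the
# metrics client and scrapes a 'qps: <number>' gauge from its stdout. The real
# metrics client's output format may differ.
import re
import subprocess


def _get_qps_sketch(metrics_cmd):
  """Hypothetical helper: run the metrics client and extract a 'qps' gauge."""
  try:
    output = subprocess.check_output(metrics_cmd, stderr=subprocess.STDOUT)
  except (OSError, subprocess.CalledProcessError) as e:
    # Treat a failed metrics invocation as zero qps for this poll interval
    print 'Error invoking metrics client: %s' % e
    return 0
  for line in output.splitlines():
    match = re.search(r'qps:\s*(\d+)', line)
    if match:
      return int(match.group(1))
  return 0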

import argparse
import os
import sys

stress_test_utils_dir = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '../../gcp/stress_test'))
sys.path.append(stress_test_utils_dir)
from stress_test_utils import BigQueryHelper

argp = argparse.ArgumentParser(
    description='Print summary tables',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
argp.add_argument('--gcp_project_id',
                  required=True,
                  help='The Google Cloud Platform Project Id')
argp.add_argument('--dataset_id', type=str, required=True)
argp.add_argument('--run_id', type=str, required=True)
argp.add_argument('--summary_table_id', type=str, default='summary')
argp.add_argument('--qps_table_id', type=str, default='qps')
argp.add_argument('--summary_only', action='store_true', default=True)

if __name__ == '__main__':
  args = argp.parse_args()
  bq_helper = BigQueryHelper(args.run_id, '', '', args.gcp_project_id,
                             args.dataset_id, args.summary_table_id,
                             args.qps_table_id)
  bq_helper.initialize()
  if not args.summary_only:
    bq_helper.print_qps_records()
  bq_helper.print_summary_records()
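
# Example invocation of the script above (the file name and all argument
# values are hypothetical):
#
#   python print_summary.py --gcp_project_id=my-gcp-project \
#       --dataset_id=stress_test_2016_01_01_12_00_00 \
#       --run_id=2016_01_01_12_00_00
#
# Note that --summary_only is a store_true flag whose default is already True,
# so as written the script prints only the summary table.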

def run_tests(config):
  """ The main function that launches the stress tests """
  # Build docker images and push to GKE registry
  if config.global_settings.build_docker_images:
    for name, docker_image in config.docker_images_dict.iteritems():
      if not (docker_image.build_image() and
              docker_image.push_to_gke_registry()):
        return False

  # Create a unique id for this run (Note: Using timestamp instead of UUID to
  # make it easier to deduce the date/time of the run just by looking at the
  # run id. This is useful in debugging when looking at records in BigQuery)
  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
  dataset_id = '%s_%s' % (config.global_settings.dataset_id_prefix, run_id)
  print 'Run id:', run_id
  print 'Dataset id:', dataset_id

  bq_helper = BigQueryHelper(run_id, '', '',
                             config.global_settings.gcp_project_id, dataset_id,
                             config.global_settings.summary_table_id,
                             config.global_settings.qps_table_id)
  bq_helper.initialize()

  gke = Gke(config.global_settings.gcp_project_id, run_id, dataset_id,
            config.global_settings.summary_table_id,
            config.global_settings.qps_table_id,
            config.global_settings.kubernetes_proxy_port)

  is_success = True
  try:
    print 'Launching servers..'
    for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
      if not gke.launch_servers(server_pod_spec):
        is_success = False  # is_success is checked in the 'finally' block
        return False

    print('Launched servers. Waiting for %d seconds for the server pods to be '
          'fully online') % config.global_settings.pod_warmup_secs
    time.sleep(config.global_settings.pod_warmup_secs)

    for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
      if not gke.launch_clients(client_pod_spec):
        is_success = False  # is_success is checked in the 'finally' block
        return False

    print('Launched all clients. Waiting for %d seconds for the client pods '
          'to be fully online') % config.global_settings.pod_warmup_secs
    time.sleep(config.global_settings.pod_warmup_secs)

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(
        seconds=config.global_settings.test_duration_secs)
    print 'Running the test until %s' % end_time.isoformat()

    while True:
      if datetime.datetime.now() > end_time:
        print 'Test was run for %d seconds' % config.global_settings.test_duration_secs
        break

      # Check if either stress server or clients have failed (the bq_helper
      # monitors all the rows in the summary table and checks if any of them
      # have a failure status)
      if bq_helper.check_if_any_tests_failed():
        is_success = False
        print 'Some tests failed.'
        # Don't 'return' here. We still want bq_helper to print the qps and
        # summary tables
        break

      # Tests are running fine. Wait until the next poll time to check the
      # status
      print 'Sleeping for %d seconds..' % config.global_settings.test_poll_interval_secs
      time.sleep(config.global_settings.test_poll_interval_secs)

    # Print BigQuery tables
    bq_helper.print_qps_records()
    bq_helper.print_summary_records()

  finally:
    # If there was a test failure, we should not delete the pods since they
    # would contain useful debug information (logs, core dumps etc)
    if is_success:
      for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
        gke.delete_servers(server_pod_spec)
      for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
        gke.delete_clients(client_pod_spec)

  return is_success

def run_test_main(test_settings, gke_settings, stress_server_settings,
                  stress_client_settings):
  is_success = True

  if test_settings.build_docker_image:
    is_success = _build_docker_image(gke_settings.docker_image_name,
                                     gke_settings.tag_name)
    if not is_success:
      return False

    is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
    if not is_success:
      return False

  # Create a unique id for this run (Note: Using timestamp instead of UUID to
  # make it easier to deduce the date/time of the run just by looking at the
  # run id. This is useful in debugging when looking at records in BigQuery)
  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
  dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)

  # Big Query settings (common for both Stress Server and Client)
  bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
                                 _QPS_TABLE_ID)

  bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id,
                             _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
  bq_helper.initialize()
  try:
    is_success = _launch_server_and_client(gke_settings, stress_server_settings,
                                           stress_client_settings, bq_settings,
                                           test_settings.kubernetes_proxy_port)
    if not is_success:
      return False

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(
        seconds=test_settings.test_duration_secs)
    print 'Running the test until %s' % end_time.isoformat()

    while True:
      if datetime.datetime.now() > end_time:
        print 'Test was run for %d seconds' % test_settings.test_duration_secs
        break

      # Check if either stress server or clients have failed
      if bq_helper.check_if_any_tests_failed():
        is_success = False
        print 'Some tests failed.'
        break

      # Things seem to be running fine. Wait until the next poll time to check
      # the status
      print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
      time.sleep(test_settings.test_poll_interval_secs)

    # Print BigQuery tables
    bq_helper.print_summary_records()
    bq_helper.print_qps_records()
  finally:
    # If is_success is False at this point, it means that the stress tests were
    # started successfully but failed while running. In this case we should not
    # delete the pods (since they contain all the failure information)
    if is_success:
      _delete_server_and_client(stress_server_settings, stress_client_settings,
                                test_settings.kubernetes_proxy_port)

  return is_success
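
# run_test_main() above calls _build_docker_image() and
# _push_docker_image_to_gke_registry(), which are not shown in this section.
# The sketch below is one plausible shape for these helpers, assuming they
# shell out to the docker and gcloud CLIs; the real helpers may instead drive
# a project-specific build script.
import subprocess


def _build_docker_image_sketch(docker_image_name, tag_name):
  """Hypothetical: build a local docker image and tag it for the GKE registry."""
  print 'Building docker image: %s' % docker_image_name
  cmd = ['docker', 'build', '-t', tag_name, '.']
  return subprocess.call(cmd) == 0


def _push_docker_image_to_gke_registry_sketch(tag_name):
  """Hypothetical: push the tagged image to the Google Container Registry."""
  print 'Pushing docker image: %s' % tag_name
  cmd = ['gcloud', 'docker', '--', 'push', tag_name]
  return subprocess.call(cmd) == 0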

def run_server():
  """This is a wrapper around the interop server and performs the following:
      1) Create a 'Summary table' in Big Query to record events like the server
         started, completed successfully or failed. NOTE: This also creates
         another table called the QPS table which is currently NOT needed on
         the server (it is needed on the stress test clients)
      2) Start the server process and add a row in the Big Query summary table
      3) Wait for the server process to terminate. The server process does not
         terminate unless there is an error.
         If the server process terminated with a failure, add a row in Big
         Query and wait forever.
         NOTE: This script typically runs inside a GKE pod which means that the
         pod gets destroyed when the script exits. However, in case the server
         process fails, we would not want the pod to be destroyed (since we
         might want to connect to the pod for examining logs). This is the
         reason why the script waits forever in case of failures.
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the server crashes (Note: This is not relevant for Java and Go servers)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  # Read the parameters from environment variables
  env = dict(os.environ)

  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  # The following parameter tells us whether the server runs forever until
  # forcefully stopped or whether it naturally stops after some time. If it is
  # expected to run forever, we know the process should not terminate (even
  # with a success exit code) and we flag any termination as a failure.
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  logfile_name = env.get('LOGFILE_NAME')

  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s') % (pod_name, project_id,
                                                     run_id, dataset_id,
                                                     summary_table_id,
                                                     qps_table_id)

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  stress_cmd = stress_server_cmd + [x for x in args_str.split()]

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  returncode = stress_p.wait()

  if will_run_forever == '1' or returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])

  return returncode
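
# The wrapper scripts in this section record events through an EventType
# enumeration defined elsewhere in the stress test utilities (not shown here).
# A minimal sketch of what such an enumeration might look like, assuming plain
# string constants and covering only the values referenced by these wrappers:
class EventTypeSketch(object):
  STARTING = 'STARTING'
  RUNNING = 'RUNNING'
  SUCCESS = 'SUCCESS'
  FAILURE = 'FAILURE'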

def run_client():
  """This is a wrapper around the stress test client and performs the following:
      1) Create the following two tables in Big Query:
         (i) Summary table: To record events like the test started, completed
             successfully or failed
        (ii) Qps table: To periodically record the QPS sent by this client
      2) Start the stress test client and add a row in the Big Query summary
         table
      3) Once every few seconds (as specified by poll_interval_secs) poll the
         status of the stress test client process and perform the following:
          3.1) If the process is still running, get the current qps by invoking
               the metrics client program and add a row in the Big Query
               Qps table. Sleep for a duration specified by poll_interval_secs
          3.2) If the process exited successfully, add a row in the Big Query
               Summary table and exit
          3.3) If the process failed, add a row in the Big Query summary table
               and wait forever.
               NOTE: This script typically runs inside a GKE pod which means
               that the pod gets destroyed when the script exits. However, in
               case the stress test client fails, we would not want the pod to
               be destroyed (since we might want to connect to the pod for
               examining logs). This is the reason why the script waits forever
               in case of failures
  """
  env = dict(os.environ)
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  image_name = env['STRESS_TEST_IMAGE']
  args_str = env['STRESS_TEST_ARGS_STR']
  metrics_client_image = env['METRICS_CLIENT_IMAGE']
  metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
  run_id = env['RUN_ID']
  pod_name = env['POD_NAME']
  logfile_name = env.get('LOGFILE_NAME')
  poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening logfile: %s ...' % logfile_name
    details = 'Logfile: %s' % logfile_name
    logfile = open(logfile_name, 'w')

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  metrics_cmd = [metrics_client_image
                ] + [x for x in metrics_client_args_str.split()]
  stress_cmd = [image_name] + [x for x in args_str.split()]

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  qps_history = [1, 1, 1]  # Maintain the last 3 qps readings
  qps_history_idx = 0  # Index into the qps_history list
  is_error = False
  while True:
    # Check if stress_client is still running. If so, collect metrics and
    # upload to the BigQuery status table
    if stress_p.poll() is not None:
      end_time = datetime.datetime.now().isoformat()
      event_type = EventType.SUCCESS
      details = 'End time: %s' % end_time
      if stress_p.returncode != 0:
        event_type = EventType.FAILURE
        details = 'Return code = %d. End time: %s' % (stress_p.returncode,
                                                      end_time)
        is_error = True
      bq_helper.insert_summary_row(event_type, details)
      print details
      break

    # Stress client still running. Get metrics
    qps = _get_qps(metrics_cmd)
    qps_recorded_at = datetime.datetime.now().isoformat()
    print 'qps: %d at %s' % (qps, qps_recorded_at)

    # If QPS has been zero for the last 3 iterations, flag it as error and exit
    qps_history[qps_history_idx] = qps
    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
    if sum(qps_history) == 0:
      details = 'QPS has been zero for the last %d seconds - as of : %s' % (
          poll_interval_secs * 3, qps_recorded_at)
      is_error = True
      bq_helper.insert_summary_row(EventType.FAILURE, details)
      print details
      break

    # Upload qps metrics to BigQuery
    bq_helper.insert_qps_row(qps, qps_recorded_at)
    time.sleep(poll_interval_secs)

  if is_error:
    print 'Waiting indefinitely..'
    select.select([], [], [])

  print 'Completed'
  return

def run_server():
  """This is a wrapper around the interop server and performs the following:
      1) Create a 'Summary table' in Big Query to record events like the server
         started, completed successfully or failed. NOTE: This also creates
         another table called the QPS table which is currently NOT needed on
         the server (it is needed on the stress test clients)
      2) Start the server process and add a row in the Big Query summary table
      3) Wait for the server process to terminate. The server process does not
         terminate unless there is an error.
         If the server process terminated with a failure, add a row in Big
         Query and wait forever.
         NOTE: This script typically runs inside a GKE pod which means that the
         pod gets destroyed when the script exits. However, in case the server
         process fails, we would not want the pod to be destroyed (since we
         might want to connect to the pod for examining logs). This is the
         reason why the script waits forever in case of failures.
  """
  # Read the parameters from environment variables
  env = dict(os.environ)

  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']
  logfile_name = env.get('LOGFILE_NAME')

  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s') % (pod_name, project_id,
                                                     run_id, dataset_id,
                                                     summary_table_id,
                                                     qps_table_id)

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  stress_cmd = stress_server_cmd + [x for x in args_str.split()]

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  returncode = stress_p.wait()
  if returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])

  return returncode

def run_server():
  """This is a wrapper around the interop server and performs the following:
      1) Create a 'Summary table' in Big Query to record events like the server
         started, completed successfully or failed. NOTE: This also creates
         another table called the QPS table which is currently NOT needed on
         the server (it is needed on the stress test clients)
      2) Start the server process and add a row in the Big Query summary table
      3) Wait for the server process to terminate. The server process does not
         terminate unless there is an error.
         If the server process terminated with a failure, add a row in Big
         Query and wait forever.
         NOTE: This script typically runs inside a GKE pod which means that the
         pod gets destroyed when the script exits. However, in case the server
         process fails, we would not want the pod to be destroyed (since we
         might want to connect to the pod for examining logs). This is the
         reason why the script waits forever in case of failures.
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the server crashes (Note: This is not relevant for Java and Go servers)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  # Read the parameters from environment variables
  env = dict(os.environ)

  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  # The following parameter tells us whether the server runs forever until
  # forcefully stopped or whether it naturally stops after some time. If it is
  # expected to run forever, we know the process should not terminate (even
  # with a success exit code) and we flag any termination as a failure.
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  logfile_name = env.get('LOGFILE_NAME')

  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s') % (pod_name, project_id,
                                                     run_id, dataset_id,
                                                     summary_table_id,
                                                     qps_table_id)

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  stress_cmd = stress_server_cmd + [x for x in args_str.split()]
  details = '%s, Stress server command: %s' % (details, str(stress_cmd))

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  # Update the status to RUNNING if subprocess.Popen launched the server
  if stress_p.poll() is None:
    bq_helper.insert_summary_row(EventType.RUNNING, '')

  # Wait for the server process to terminate
  returncode = stress_p.wait()

  if will_run_forever == '1' or returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])

  return returncode
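
# Every script in this section drives BigQuery through the BigQueryHelper
# class imported from stress_test_utils. The stub below only sketches the
# interface these scripts depend on (constructor arguments and method names as
# used above); it is a hypothetical stand-in and performs no real BigQuery
# calls.
class BigQueryHelperStub(object):
  """Hypothetical stand-in illustrating the interface used by these scripts."""

  def __init__(self, run_id, image_type, pod_name, project_id, dataset_id,
               summary_table_id, qps_table_id):
    self.run_id = run_id
    self.image_type = image_type
    self.pod_name = pod_name
    self.project_id = project_id
    self.dataset_id = dataset_id
    self.summary_table_id = summary_table_id
    self.qps_table_id = qps_table_id

  def initialize(self):
    # Authenticate and create the BigQuery client.
    pass

  def setup_tables(self):
    # Create the dataset and the summary/qps tables. Returns True on success.
    return True

  def insert_summary_row(self, event_type, details):
    # Add one event row (e.g. STARTING, RUNNING, FAILURE) to the summary table.
    pass

  def insert_qps_row(self, qps, recorded_at):
    # Add one qps reading to the qps table.
    pass

  def check_if_any_tests_failed(self):
    # Scan the summary table for rows with a failure status.
    return False

  def print_summary_records(self):
    pass

  def print_qps_records(self):
    pass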