def run_client():
  """Wrapper around the stress test client; reports status to BigQuery.

  Performs the following:
  1) Create the following two tables in Big Query:
     (i) Summary table: To record events like the test started, completed
         successfully or failed
     (ii) Qps table: To periodically record the QPS sent by this client
  2) Start the stress test client and add a row in the Big Query summary
     table
  3) Once every few seconds (as specified by the poll_interval_secs) poll
     the status of the stress test client process and perform the following:
     3.1) If the process is still running, get the current qps by invoking
          the metrics client program and add a row in the Big Query Qps
          table. Sleep for a duration specified by poll_interval_secs
     3.2) If the process exited successfully, add a row in the Big Query
          Summary table and exit
     3.3) If the process failed, add a row in Big Query summary table and
          wait forever.
          NOTE: This script typically runs inside a GKE pod which means
          that the pod gets destroyed when the script exits. However, in
          case the stress test client fails, we would not want the pod to
          be destroyed (since we might want to connect to the pod for
          examining logs). This is the reason why the script waits forever
          in case of failures.

  Returns:
    None. On stress-client failure this function never returns (it blocks
    forever on select.select so the enclosing pod stays alive for debugging).
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the client crashes (Note: This is not relevant for Java and Go clients)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  # All configuration comes from the pod's environment. Required keys raise
  # KeyError if missing; LOGFILE_NAME and WILL_RUN_FOREVER are optional.
  env = dict(os.environ)
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_client_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  metrics_client_cmd = env['METRICS_CLIENT_CMD'].split()
  metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
  run_id = env['RUN_ID']
  pod_name = env['POD_NAME']
  logfile_name = env.get('LOGFILE_NAME')
  poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  # The following parameter is to inform us whether the stress client runs
  # forever until forcefully stopped or will it naturally stop after sometime.
  # This way, we know that the stress client process should not terminate (even
  # if it does with a success exit code) and flag the termination as a failure
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  # NOTE(review): start_time is captured but never used below — candidate for
  # removal or for inclusion in the summary row; confirm before deleting.
  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening logfile: %s ...' % logfile_name
    details = 'Logfile: %s' % logfile_name
    # NOTE(review): the file handle is intentionally left open for the
    # lifetime of the child process (it becomes the child's stdout/stderr);
    # it is never explicitly closed.
    logfile = open(logfile_name, 'w')

  metrics_cmd = metrics_client_cmd + [x for x in metrics_client_args_str.split()]
  stress_cmd = stress_client_cmd + [x for x in args_str.split()]
  details = '%s, Metrics command: %s, Stress client command: %s' % (
      details, str(metrics_cmd), str(stress_cmd))
  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  print 'Launching process %s ...' % stress_cmd
  # Child's stderr is merged into its stdout (the logfile, or our stdout).
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  qps_history = [1, 1, 1]  # Maintain the last 3 qps readings
  qps_history_idx = 0  # Index into the qps_history list

  is_running_status_written = False
  is_error = False
  while True:
    # Check if stress_client is still running. If so, collect metrics and
    # upload to BigQuery status table.
    # If stress_p.poll() is not None, it means that the stress client
    # terminated.
    if stress_p.poll() is not None:
      end_time = datetime.datetime.now().isoformat()
      event_type = EventType.SUCCESS
      details = 'End time: %s' % end_time
      # A client that was expected to run forever must not terminate at all;
      # any exit (even with return code 0) is then treated as a failure.
      if will_run_forever == '1' or stress_p.returncode != 0:
        event_type = EventType.FAILURE
        details = 'Return code = %d. End time: %s' % (stress_p.returncode,
                                                      end_time)
        is_error = True
      bq_helper.insert_summary_row(event_type, details)
      print details
      break

    # Record the RUNNING event exactly once, on the first successful poll.
    if not is_running_status_written:
      bq_helper.insert_summary_row(EventType.RUNNING, '')
      is_running_status_written = True

    # Stress client still running. Get metrics
    qps = _get_qps(metrics_cmd)
    qps_recorded_at = datetime.datetime.now().isoformat()
    print 'qps: %d at %s' % (qps, qps_recorded_at)

    # If QPS has been zero for the last 3 iterations, flag it as error and exit
    qps_history[qps_history_idx] = qps
    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
    if sum(qps_history) == 0:
      details = 'QPS has been zero for the last %d seconds - as of : %s' % (
          poll_interval_secs * 3, qps_recorded_at)
      is_error = True
      bq_helper.insert_summary_row(EventType.FAILURE, details)
      print details
      break

    # Upload qps metrics to BiqQuery
    bq_helper.insert_qps_row(qps, qps_recorded_at)
    time.sleep(poll_interval_secs)

  if is_error:
    # Block forever with no file descriptors to watch, so the GKE pod is kept
    # alive and its logs remain available for post-mortem inspection.
    print 'Waiting indefinitely..'
    select.select([], [], [])

  print 'Completed'
  return
def run_server():
  """Wrapper around the interop server; reports status to BigQuery.

  Performs the following:
  1) Create a 'Summary table' in Big Query to record events like the server
     started, completed successfully or failed. NOTE: This also creates
     another table called the QPS table which is currently NOT needed on the
     server (it is needed on the stress test clients)
  2) Start the server process and add a row in Big Query summary table
  3) Wait for the server process to terminate. The server process does not
     terminate unless there is an error. If the server process terminated
     with a failure, add a row in Big Query and wait forever.
     NOTE: This script typically runs inside a GKE pod which means that the
     pod gets destroyed when the script exits. However, in case the server
     process fails, we would not want the pod to be destroyed (since we
     might want to connect to the pod for examining logs). This is the
     reason why the script waits forever in case of failures.

  Returns:
    The server process's return code, or None if BigQuery table setup
    failed. On failure (or unexpected exit when WILL_RUN_FOREVER=='1') this
    function never returns.
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the server crashes (Note: This is not relevant for Java and Go servers)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  # Read the parameters from environment variables
  env = dict(os.environ)
  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  # The following parameter is to inform us whether the server runs forever
  # until forcefully stopped or will it naturally stop after sometime.
  # This way, we know that the process should not terminate (even if it does
  # with a success exit code) and flag any termination as a failure.
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  logfile_name = env.get('LOGFILE_NAME')

  # Python 2 print statement: the parenthesized string is formatted by the
  # trailing % before printing.
  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s') % (
            pod_name, project_id, run_id, dataset_id, summary_table_id,
            qps_table_id)

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  # NOTE(review): start_time is captured but never used below.
  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    # NOTE(review): handle is left open for the child's lifetime; never closed.
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  stress_cmd = stress_server_cmd + [x for x in args_str.split()]

  print 'Launching process %s ...' % stress_cmd
  # Child's stderr is merged into its stdout (the logfile, or our stdout).
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  # A server expected to run forever should never exit; treat any exit (even
  # return code 0) as a failure in that mode and keep the pod alive for
  # log inspection by blocking on select with no descriptors.
  returncode = stress_p.wait()
  if will_run_forever == '1' or returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])
  return returncode
def run_server():
  """Wrapper around the interop server; reports status to BigQuery.

  Performs the following:
  1) Create a 'Summary table' in Big Query to record events like the server
     started, completed successfully or failed. NOTE: This also creates
     another table called the QPS table which is currently NOT needed on the
     server (it is needed on the stress test clients)
  2) Start the server process and add a row in Big Query summary table
  3) Wait for the server process to terminate. The server process does not
     terminate unless there is an error. If the server process terminated
     with a failure, add a row in Big Query and wait forever.
     NOTE: This script typically runs inside a GKE pod which means that the
     pod gets destroyed when the script exits. However, in case the server
     process fails, we would not want the pod to be destroyed (since we
     might want to connect to the pod for examining logs). This is the
     reason why the script waits forever in case of failures.

  Returns:
    The server process's return code, or None if BigQuery table setup
    failed. On a non-zero return code this function never returns.
  """
  # Read the parameters from environment variables
  env = dict(os.environ)
  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']
  logfile_name = env.get('LOGFILE_NAME')

  # Python 2 print statement: the parenthesized string is formatted by the
  # trailing % before printing.
  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s') % (
            pod_name, project_id, run_id, dataset_id, summary_table_id,
            qps_table_id)

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  # NOTE(review): start_time is captured but never used below.
  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    # NOTE(review): handle is left open for the child's lifetime; never closed.
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  stress_cmd = stress_server_cmd + [x for x in args_str.split()]

  print 'Launching process %s ...' % stress_cmd
  # Child's stderr is merged into its stdout (the logfile, or our stdout).
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  # Block until the server exits. Any non-zero exit is recorded as a failure
  # and the script then blocks forever (select with no descriptors) so the
  # GKE pod stays alive for post-mortem log inspection.
  returncode = stress_p.wait()
  if returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])
  return returncode
def run_client():
  """Wrapper around the stress test client; reports status to BigQuery.

  Performs the following:
  1) Create the following two tables in Big Query:
     (i) Summary table: To record events like the test started, completed
         successfully or failed
     (ii) Qps table: To periodically record the QPS sent by this client
  2) Start the stress test client and add a row in the Big Query summary
     table
  3) Once every few seconds (as specified by the poll_interval_secs) poll
     the status of the stress test client process and perform the following:
     3.1) If the process is still running, get the current qps by invoking
          the metrics client program and add a row in the Big Query Qps
          table. Sleep for a duration specified by poll_interval_secs
     3.2) If the process exited successfully, add a row in the Big Query
          Summary table and exit
     3.3) If the process failed, add a row in Big Query summary table and
          wait forever.
          NOTE: This script typically runs inside a GKE pod which means
          that the pod gets destroyed when the script exits. However, in
          case the stress test client fails, we would not want the pod to
          be destroyed (since we might want to connect to the pod for
          examining logs). This is the reason why the script waits forever
          in case of failures.

  Returns:
    None. On stress-client failure this function never returns (it blocks
    forever on select.select so the enclosing pod stays alive for debugging).
  """
  # All configuration comes from the pod's environment. Required keys raise
  # KeyError if missing; LOGFILE_NAME is optional.
  env = dict(os.environ)
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  image_name = env['STRESS_TEST_IMAGE']
  args_str = env['STRESS_TEST_ARGS_STR']
  metrics_client_image = env['METRICS_CLIENT_IMAGE']
  metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
  run_id = env['RUN_ID']
  pod_name = env['POD_NAME']
  logfile_name = env.get('LOGFILE_NAME')
  poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  # NOTE(review): start_time is captured but never used below.
  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening logfile: %s ...' % logfile_name
    details = 'Logfile: %s' % logfile_name
    # NOTE(review): handle is left open for the child's lifetime; never closed.
    logfile = open(logfile_name, 'w')

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  metrics_cmd = [metrics_client_image
                ] + [x for x in metrics_client_args_str.split()]
  stress_cmd = [image_name] + [x for x in args_str.split()]
  print 'Launching process %s ...' % stress_cmd
  # Child's stderr is merged into its stdout (the logfile, or our stdout).
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  qps_history = [1, 1, 1]  # Maintain the last 3 qps readings
  qps_history_idx = 0  # Index into the qps_history list

  is_error = False
  while True:
    # Check if stress_client is still running. If so, collect metrics and
    # upload to BigQuery status table.
    if stress_p.poll() is not None:
      end_time = datetime.datetime.now().isoformat()
      event_type = EventType.SUCCESS
      details = 'End time: %s' % end_time
      if stress_p.returncode != 0:
        event_type = EventType.FAILURE
        details = 'Return code = %d. End time: %s' % (stress_p.returncode,
                                                      end_time)
        is_error = True
      bq_helper.insert_summary_row(event_type, details)
      print details
      break

    # Stress client still running. Get metrics
    qps = _get_qps(metrics_cmd)
    qps_recorded_at = datetime.datetime.now().isoformat()
    print 'qps: %d at %s' % (qps, qps_recorded_at)

    # If QPS has been zero for the last 3 iterations, flag it as error and exit
    qps_history[qps_history_idx] = qps
    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
    if sum(qps_history) == 0:
      details = 'QPS has been zero for the last %d seconds - as of : %s' % (
          poll_interval_secs * 3, qps_recorded_at)
      is_error = True
      bq_helper.insert_summary_row(EventType.FAILURE, details)
      print details
      break

    # Upload qps metrics to BiqQuery
    bq_helper.insert_qps_row(qps, qps_recorded_at)
    time.sleep(poll_interval_secs)

  if is_error:
    # Block forever with no file descriptors to watch, so the GKE pod is kept
    # alive and its logs remain available for post-mortem inspection.
    print 'Waiting indefinitely..'
    select.select([], [], [])

  print 'Completed'
  return
def run_server():
  """Wrapper around the interop server; reports status to BigQuery.

  Performs the following:
  1) Create a 'Summary table' in Big Query to record events like the server
     started, completed successfully or failed. NOTE: This also creates
     another table called the QPS table which is currently NOT needed on the
     server (it is needed on the stress test clients)
  2) Start the server process and add a row in Big Query summary table
  3) Wait for the server process to terminate. The server process does not
     terminate unless there is an error. If the server process terminated
     with a failure, add a row in Big Query and wait forever.
     NOTE: This script typically runs inside a GKE pod which means that the
     pod gets destroyed when the script exits. However, in case the server
     process fails, we would not want the pod to be destroyed (since we
     might want to connect to the pod for examining logs). This is the
     reason why the script waits forever in case of failures.

  Returns:
    The server process's return code, or None if BigQuery table setup
    failed. On failure (or unexpected exit when WILL_RUN_FOREVER=='1') this
    function never returns.
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the server crashes (Note: This is not relevant for Java and Go servers)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  # Read the parameters from environment variables
  env = dict(os.environ)
  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  # The following parameter is to inform us whether the server runs forever
  # until forcefully stopped or will it naturally stop after sometime.
  # This way, we know that the process should not terminate (even if it does
  # with a success exit code) and flag any termination as a failure.
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  logfile_name = env.get('LOGFILE_NAME')

  # Python 2 print statement: the parenthesized string is formatted by the
  # trailing % before printing.
  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s') % (pod_name, project_id,
                                                     run_id, dataset_id,
                                                     summary_table_id,
                                                     qps_table_id)

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  # NOTE(review): start_time is captured but never used below.
  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    # NOTE(review): handle is left open for the child's lifetime; never closed.
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  stress_cmd = stress_server_cmd + [x for x in args_str.split()]
  details = '%s, Stress server command: %s' % (details, str(stress_cmd))

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  print 'Launching process %s ...' % stress_cmd
  # Child's stderr is merged into its stdout (the logfile, or our stdout).
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  # Update the status to running if subprocess.Popen launched the server
  if stress_p.poll() is None:
    bq_helper.insert_summary_row(EventType.RUNNING, '')

  # Wait for the server process to terminate. A server expected to run
  # forever should never exit, so any exit (even return code 0) is recorded
  # as a failure in that mode; the script then blocks forever (select with
  # no descriptors) to keep the GKE pod alive for log inspection.
  returncode = stress_p.wait()
  if will_run_forever == '1' or returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])
  return returncode