def calculate_time_stats(grouped):
    """Add statistics to the nested dictionary.

    Each query name is supplemented with the average, standard deviation,
    number of clients, iterations, and a sorted list of the time taken to
    complete each run. ``grouped`` is modified in place and nothing is
    returned.

    Side effect: when a query has more than one run, the run with the
    smallest 'start_time' is removed from its RESULT_LIST before the
    statistics are computed, so the stored list ends up one entry shorter.
    """

    def remove_first_run(result_list):
        """Drop the earliest-started run from ``result_list``, in place.

        The first run is removed because its performance is much worse than
        that of later runs. A list holding a single result is left untouched.
        """
        if len(result_list) > 1:
            result_list.remove(
                min(result_list, key=lambda result: result['start_time']))

    for workload in grouped.values():
        for queries in workload.values():
            for results in queries.values():
                result_list = results[RESULT_LIST]

                # Count the runs *before* discarding the first one. Deriving
                # the count afterwards as len + 1 over-counts whenever nothing
                # was removed (a single-run list).
                total_runs = len(result_list)
                remove_first_run(result_list)

                # One pass over the runs feeds the average, the deviation and
                # the sorted listing.
                times = [query_results[TIME_TAKEN] for query_results in result_list]
                avg = calculate_avg(times)
                dev = calculate_stddev(times)
                num_clients = max(
                    int(query_results[CLIENT_NAME]) for query_results in result_list)

                results[AVG] = avg
                results[STDDEV] = dev
                results[NUM_CLIENTS] = num_clients
                results[ITERATIONS] = total_runs // num_clients
                results[SORTED] = sorted(times)
def calculate_time_stats(grouped):
    """Adds statistics to the nested dictionary.

    For each query entry the average runtime and standard deviation are
    calculated, together with the number of clients and the number of
    iterations. ``grouped`` is modified in place and nothing is returned.

    Side effect: when a query has more than one run, the run with the
    smallest 'start_time' is removed from its RESULT_LIST before the
    statistics are computed, so the stored list ends up one entry shorter.
    """

    def remove_first_run(result_list):
        """Drop the earliest-started run from ``result_list``, in place.

        The first run is removed because its performance is much worse than
        that of later runs. A list holding a single result is left untouched.
        """
        if len(result_list) > 1:
            result_list.remove(
                min(result_list, key=lambda result: result['start_time']))

    for workload in grouped.values():
        for queries in workload.values():
            for results in queries.values():
                result_list = results[RESULT_LIST]

                # Count the runs *before* discarding the first one. Deriving
                # the count afterwards as len + 1 over-counts whenever nothing
                # was removed (a single-run list).
                total_runs = len(result_list)
                remove_first_run(result_list)

                # Build the timing list once and reuse it for both statistics.
                times = [query_results[TIME_TAKEN] for query_results in result_list]
                avg = calculate_avg(times)
                dev = calculate_stddev(times)
                num_clients = max(
                    int(query_results[CLIENT_NAME]) for query_results in result_list)

                results[AVG] = avg
                results[STDDEV] = dev
                results[NUM_CLIENTS] = num_clients
                results[ITERATIONS] = total_runs // num_clients
def create_exec_result(execution_times, iterations, result_data):
    """Build a QueryExecResult from the timings of a finished execution.

    The result starts out marked unsuccessful. If any result data was
    returned, the first entry is logged at debug level and stored, split into
    lines, on the result. Only when the number of recorded execution times
    equals ``iterations`` is the average filled in (plus the standard
    deviation when ``iterations`` is greater than one) and the result marked
    successful.
    """
    outcome = QueryExecResult()
    outcome.success = False

    if result_data:
        # Only the first result is reported; additional entries may exist
        # when several iterations were executed.
        first_output = result_data[0]
        LOG.debug('Data:\n%s\n' % first_output)
        outcome.data = first_output.split('\n')

    # Not every iteration produced a timing: leave the result unsuccessful.
    if len(execution_times) != iterations:
        return outcome

    outcome.avg_time = calculate_avg(execution_times)
    if iterations > 1:
        outcome.std_dev = calculate_stddev(execution_times)
    outcome.success = True
    return outcome
def calculate_time_stats(grouped):
    """Annotate every query entry of ``grouped`` with timing statistics.

    ``grouped`` is a dictionary nested three levels deep. Each innermost
    entry keeps its runs under RESULT_LIST and is updated in place with:

    * AVG -- average of the runs' TIME_TAKEN values
    * STDDEV -- standard deviation of the same values
    * NUM_CLIENTS -- largest CLIENT_NAME value, converted to int
    * ITERATIONS -- number of runs in the list

    Nothing is returned.
    """

    def time_taken_values(runs):
        """Return a new list with the TIME_TAKEN value of every run."""
        return [run[TIME_TAKEN] for run in runs]

    for workload in grouped.values():
        for queries in workload.values():
            for results in queries.values():
                runs = results[RESULT_LIST]
                # All four values are computed before anything is stored.
                results.update({
                    AVG: calculate_avg(time_taken_values(runs)),
                    STDDEV: calculate_stddev(time_taken_values(runs)),
                    NUM_CLIENTS: max(int(run[CLIENT_NAME]) for run in runs),
                    ITERATIONS: len(runs),
                })
def calculate_time_stats(grouped):
    """Store timing statistics on each innermost entry of ``grouped``.

    The dictionary is walked by key through its three nesting levels. For
    every innermost entry, the runs under RESULT_LIST are summarised into the
    average and standard deviation of their TIME_TAKEN values, the largest
    CLIENT_NAME (as an int) and the number of runs. These are written back
    under AVG, STDDEV, NUM_CLIENTS and ITERATIONS. Nothing is returned.
    """
    for scale_key in grouped:
        second_level = grouped[scale_key]
        for second_key in second_level:
            third_level = second_level[second_key]
            for third_key in third_level:
                # Resolve the entry once instead of re-indexing for each field.
                entry = third_level[third_key]
                runs = entry[RESULT_LIST]

                mean_time = calculate_avg([run[TIME_TAKEN] for run in runs])
                deviation = calculate_stddev([run[TIME_TAKEN] for run in runs])
                client_count = max(int(run[CLIENT_NAME]) for run in runs)
                run_count = len(runs)

                entry[AVG] = mean_time
                entry[STDDEV] = deviation
                entry[NUM_CLIENTS] = client_count
                entry[ITERATIONS] = run_count
def __build_rows(self, exec_summaries):
    """Merge several exec summaries into combined rows on ``self.rows``.

    The first exec summary determines how many rows there are and supplies
    the values that are copied unchanged. For each row position the
    per-summary values are combined: AVG_TIME becomes the average of the
    individual average times and STDDEV_TIME their standard deviation, while
    MAX_TIME, PEAK_MEM and EST_PEAK_MEM become the maximum across all
    summaries. One combined row is appended to ``self.rows`` per position.
    """
    fixed_keys = (PREFIX, OPERATOR, NUM_HOSTS, NUM_ROWS, EST_NUM_ROWS, DETAIL)
    reference_summary = exec_summaries[0]

    for position, reference_row in enumerate(reference_summary):

        def across_summaries(field, position=position):
            """Collect ``field`` at this row position from every summary."""
            return [summary[position][field] for summary in exec_summaries]

        # Values taken verbatim from the first exec summary.
        merged = {key: reference_row[key] for key in fixed_keys}

        avg_samples = across_summaries(AVG_TIME)
        max_samples = across_summaries(MAX_TIME)
        peak_samples = across_summaries(PEAK_MEM)
        est_peak_samples = across_summaries(EST_PEAK_MEM)

        # Values aggregated over all exec summaries.
        merged[AVG_TIME] = calculate_avg(avg_samples)
        merged[STDDEV_TIME] = calculate_stddev(avg_samples)
        merged[MAX_TIME] = max(max_samples)
        merged[PEAK_MEM] = max(peak_samples)
        merged[EST_PEAK_MEM] = max(est_peak_samples)
        self.rows.append(merged)
def calculate_time_stats(grouped):
    """Compute per-query timing statistics and store them in ``grouped``.

    Every innermost entry of the three-level nested dictionary is updated in
    place: the runs found under RESULT_LIST yield the average and standard
    deviation of TIME_TAKEN, the largest CLIENT_NAME (as an int) and the run
    count, saved under AVG, STDDEV, NUM_CLIENTS and ITERATIONS respectively.
    Nothing is returned.
    """
    # Flatten the three nesting levels into a stream of innermost entries.
    entries = (
        results
        for workload in grouped.values()
        for queries in workload.values()
        for results in queries.values()
    )

    for results in entries:
        samples = results[RESULT_LIST]
        stats = (
            calculate_avg([sample[TIME_TAKEN] for sample in samples]),
            calculate_stddev([sample[TIME_TAKEN] for sample in samples]),
            max(int(sample[CLIENT_NAME]) for sample in samples),
            len(samples),
        )
        (results[AVG], results[STDDEV],
         results[NUM_CLIENTS], results[ITERATIONS]) = stats
def construct_exec_result(iterations, query, results):
    """Summarise the results of all iterations into one QueryExecResult.

    The data, summary note and runtime profile are taken from the first
    result, which is also stored as ``beeswax_result``. The average running
    time is computed over the results' ``time_taken`` values, and the
    standard deviation too when ``iterations`` is greater than one.
    """
    exec_result = QueryExecResult()
    exec_result.query = query

    # The first result stands in for the whole execution.
    first = results[0]
    exec_result.data = first.data
    exec_result.beeswax_result = first
    exec_result.set_result_note(first.summary)
    exec_result.runtime_profile = first.runtime_profile

    # With more than 2 iterations the first result is thrown out. With
    # exactly 2 it is kept, to preserve the stddev calculation.
    timed_results = results[1:] if iterations > 2 else results
    runtimes = [result.time_taken for result in timed_results]

    exec_result.success = True
    exec_result.avg_time = calculate_avg(runtimes)
    if iterations > 1:
        exec_result.std_dev = calculate_stddev(runtimes)
    return exec_result