Example #1
    def __init__(self, workload_scale, file_format, queries, ref_queries):

      time_list = []
      ref_time_list = []
      for query_name, results in queries.items():
        if query_name in ref_queries:
          # We calculate the average and geomean of a query only if it appears in
          # both the results and the reference results
          for query_results in results[RESULT_LIST]:
            time_list.append(query_results[TIME_TAKEN])
          ref_results = ref_queries[query_name]
          for ref_query_results in ref_results[RESULT_LIST]:
            ref_time_list.append(ref_query_results[TIME_TAKEN])


      self.workload_name = '{0}({1})'.format(
          workload_scale[0][1].upper(), workload_scale[1][1])

      self.file_format = '{0} / {1} / {2}'.format(
          file_format[0][1], file_format[1][1], file_format[2][1])

      self.avg = calculate_avg(time_list)
      ref_avg = calculate_avg(ref_time_list)

      self.delta_avg = calculate_change(self.avg, ref_avg)

      self.geomean = calculate_geomean(time_list)
      ref_geomean = calculate_geomean(ref_time_list)

      self.delta_geomean = calculate_change(self.geomean, ref_geomean)
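
All of these examples lean on a handful of helpers (calculate_avg, calculate_stddev, calculate_geomean, calculate_change) that the snippets never define. A minimal sketch of what they might look like, assuming empty input yields 'N/A' and that calculate_change reports the delta relative to the reference value:

import math

def calculate_avg(values):
  # Arithmetic mean; the 'N/A' guard for empty input is an assumption.
  if not values:
    return 'N/A'
  return sum(values) / float(len(values))

def calculate_stddev(values):
  # Sample standard deviation; undefined for fewer than two samples.
  if len(values) < 2:
    return 'N/A'
  mean = sum(values) / float(len(values))
  return math.sqrt(sum((v - mean) ** 2 for v in values) / (len(values) - 1))

def calculate_geomean(values):
  # Geometric mean of positive runtimes.
  if not values:
    return 'N/A'
  return math.exp(sum(math.log(v) for v in values) / len(values))

def calculate_change(val, ref):
  # Relative change of val against the reference value.
  return (val - ref) / ref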
Example #2
def calculate_time_stats(grouped):
  """Adds statistics to the nested dictionary. We are calculating the average runtime
     and standard deviation for each query type.
  """

  def remove_first_run(result_list):
    """We want to remove the first result because the performance is much worse on the
    first run.
    """
    if len(result_list) > 1:
      # We want to remove the first result only if there is more than one result
      result_list.remove(min(result_list, key=lambda result: result['start_time']))

  for workload_scale, workload in grouped.items():
    for file_format, queries in workload.items():
      for query_name, results in queries.items():
        result_list = results[RESULT_LIST]
        remove_first_run(result_list)
        avg = calculate_avg(
            [query_results[TIME_TAKEN] for query_results in result_list])
        dev = calculate_stddev(
            [query_results[TIME_TAKEN] for query_results in result_list])
        num_clients = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)

        iterations = int((len(result_list) + 1) / num_clients)
        results[AVG] = avg
        results[STDDEV] = dev
        results[NUM_CLIENTS] = num_clients
        results[ITERATIONS] = iterations
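
A note on the iterations arithmetic above: remove_first_run discards exactly one result, so adding 1 back to len(result_list) recovers the original result count before dividing by the number of clients. For example, 3 clients running 4 iterations each produce 12 results; after the warm-up run is removed, (11 + 1) / 3 still yields 4.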
Example #3
def calculate_workload_file_format_runtimes(grouped):
  """Calculate average time for each workload and scale factor, for each file format and
  compression.

  This returns a new dictionary with average times.

  Here's an example of how this dictionary is structured:
  dictionary->
  (('workload', 'tpch'), ('scale', '300gb'))->
  (('file_format','parquet'), ('compression_codec','zip'), ('compression_type','block'))->
  'avg'

  We also have access to the list of QueryResult objects associated with each
  file_format.

  The difference between this dictionary and grouped_queries is that the query-name
  level is omitted after the workload.
  """
  new_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

  # First populate the dictionary with query results
  for workload_scale, workload in grouped.items():
    for query_name, file_formats in workload.items():
      for file_format, results in file_formats.items():
        new_dict[workload_scale][file_format][RESULT_LIST].extend(results[RESULT_LIST])

  # Do the average calculation. Standard deviation could also be calculated here
  for workload_scale in new_dict:
    for file_format in new_dict[workload_scale]:
      avg = calculate_avg([query_results[TIME_TAKEN]
        for query_results in new_dict[workload_scale][file_format][RESULT_LIST]])
      new_dict[workload_scale][file_format][AVG] = avg
  return new_dict
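
To make the tuple-keyed structure from the docstring concrete, here is a hypothetical lookup against the returned dictionary (key values copied from the docstring example; AVG and RESULT_LIST are the module's key constants):

runtimes = calculate_workload_file_format_runtimes(grouped_queries)

ws_key = (('workload', 'tpch'), ('scale', '300gb'))
ff_key = (('file_format', 'parquet'), ('compression_codec', 'zip'),
          ('compression_type', 'block'))

print(runtimes[ws_key][ff_key][AVG])                # average runtime in seconds
print(len(runtimes[ws_key][ff_key][RESULT_LIST]))   # underlying QueryResult count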
Example #4
def calculate_time_stats(grouped):
  """
  Add statistics to the nested dictionary.

  Each query name is supplemented with the average, standard deviation, number of clients,
  iterations, and a sorted list of the time taken to complete each run.
  """

  def remove_first_run(result_list):
    """We want to remove the first result because the performance is much worse on the
    first run.
    """
    if len(result_list) > 1:
      # We want to remove the first result only if there is more than one result
      result_list.remove(min(result_list, key=lambda result: result['start_time']))

  for workload_scale, workload in grouped.items():
    for file_format, queries in workload.items():
      for query_name, results in queries.items():
        result_list = results[RESULT_LIST]
        remove_first_run(result_list)
        avg = calculate_avg(
            [query_results[TIME_TAKEN] for query_results in result_list])
        dev = calculate_stddev(
            [query_results[TIME_TAKEN] for query_results in result_list])
        num_clients = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)

        iterations = int((len(result_list) + 1) / num_clients)
        results[AVG] = avg
        results[STDDEV] = dev
        results[NUM_CLIENTS] = num_clients
        results[ITERATIONS] = iterations
        results[SORTED] = [query_results[TIME_TAKEN] for query_results in result_list]
        results[SORTED].sort()
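
The only difference from Example #2 is the extra SORTED entry, which keeps the per-run times in ascending order. One thing that makes cheap is a median; a hypothetical helper, not part of the original module:

def median_from_sorted(sorted_times):
  # Median of an ascending list of runtimes, e.g. results[SORTED] above.
  n = len(sorted_times)
  mid = n // 2
  if n % 2:
    return sorted_times[mid]
  return (sorted_times[mid - 1] + sorted_times[mid]) / 2.0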
Example #5
def create_exec_result(execution_times, iterations, result_data):
  exec_result = QueryExecResult()
  exec_result.success = False

  if result_data:
    # Just print the first result returned. There may be additional results if
    # multiple iterations were executed.
    LOG.debug('Data:\n%s\n' % result_data[0])
    exec_result.data = result_data[0].split('\n')

  if len(execution_times) == iterations:
    exec_result.avg_time = calculate_avg(execution_times)
    if iterations > 1:
      exec_result.std_dev = calculate_stddev(execution_times)
    exec_result.success = True
  return exec_result
Example #6
def get_summary_str(grouped):
  summary_str = str()

  for workload_scale, workload in grouped.items():
    summary_str += "{0} / {1} \n".format(workload_scale[0][1], workload_scale[1][1])
    table = prettytable.PrettyTable(["File Format", "Compression", "Avg (s)"])
    table.align = 'l'
    table.float_format = '.2'
    for file_format, queries in workload.items():
      # Calculate the average time for each file format and compression
      ff = file_format[0][1]
      compression = file_format[1][1] + " / " + file_format[2][1]
      avg = calculate_avg([query_results[TIME_TAKEN] for results in queries.values() for
        query_results in results[RESULT_LIST]])
      table.add_row([ff, compression, avg])
    summary_str += str(table) + '\n'
  return summary_str
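
For reference, the summary string rendered by prettytable looks roughly like this per workload (labels follow the docstring example above; the runtime is invented for illustration):

tpch / 300gb 
+-------------+-------------+---------+
| File Format | Compression | Avg (s) |
+-------------+-------------+---------+
| parquet     | zip / block | 12.34   |
+-------------+-------------+---------+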
Example #7
def calculate_time_stats(grouped):
  """Adds statistics to the nested dictionary. We are calculating the average runtime
     and standard deviation for each query type.
  """

  for workload_scale, workload in grouped.items():
    for file_format, queries in workload.items():
      for query_name, results in queries.items():
        result_list = results[RESULT_LIST]
        avg = calculate_avg(
            [query_results[TIME_TAKEN] for query_results in result_list])
        dev = calculate_stddev(
            [query_results[TIME_TAKEN] for query_results in result_list])
        num_clients = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)
        iterations = len(result_list)
        results[AVG] = avg
        results[STDDEV] = dev
        results[NUM_CLIENTS] = num_clients
        results[ITERATIONS] = iterations
Example #8
def get_summary_str(grouped):
    summary_str = str()

    for workload_scale, workload in grouped.items():
        summary_str += "{0} / {1} \n".format(workload_scale[0][1],
                                             workload_scale[1][1])
        table = prettytable.PrettyTable(
            ["File Format", "Compression", "Avg (s)"])
        table.align = 'l'
        table.float_format = '.2'
        for file_format, queries in workload.items():
            # Calculate the average time for each file format and compression
            ff = file_format[0][1]
            compression = file_format[1][1] + " / " + file_format[2][1]
            avg = calculate_avg([
                query_results[TIME_TAKEN] for results in queries.values()
                for query_results in results[RESULT_LIST]
            ])
            table.add_row([ff, compression, avg])
        summary_str += str(table) + '\n'
    return summary_str
Example #9
def calculate_time_stats(grouped):
  """Adds statistics to the nested dictionary. We are calculating the average runtime
     and standard deviation for each query type.
  """

  for workload_scale in grouped:
    for query_name in grouped[workload_scale]:
      for file_format in grouped[workload_scale][query_name]:
        result_list = grouped[workload_scale][query_name][file_format][RESULT_LIST]
        avg = calculate_avg(
            [query_results[TIME_TAKEN] for query_results in result_list])
        dev = calculate_stddev(
            [query_results[TIME_TAKEN] for query_results in result_list])
        num_clients = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)
        iterations = len(result_list)

        grouped[workload_scale][query_name][file_format][AVG] = avg
        grouped[workload_scale][query_name][file_format][STDDEV] = dev
        grouped[workload_scale][query_name][file_format][NUM_CLIENTS] = num_clients
        grouped[workload_scale][query_name][file_format][ITERATIONS] = iterations
Example #10
  def __build_rows(self, exec_summaries):

    first_exec_summary = exec_summaries[0]

    for row_num, row in enumerate(first_exec_summary):
      combined_row = {}
      # Copy fixed values from the first exec summary
      for key in [PREFIX, OPERATOR, NUM_HOSTS, NUM_ROWS, EST_NUM_ROWS, DETAIL]:
        combined_row[key] = row[key]

      avg_times = [exec_summary[row_num][AVG_TIME] for exec_summary in exec_summaries]
      max_times = [exec_summary[row_num][MAX_TIME] for exec_summary in exec_summaries]
      peak_mems = [exec_summary[row_num][PEAK_MEM] for exec_summary in exec_summaries]
      est_peak_mems = [exec_summary[row_num][EST_PEAK_MEM]
          for exec_summary in exec_summaries]

      # Set the calculated values
      combined_row[AVG_TIME] = calculate_avg(avg_times)
      combined_row[STDDEV_TIME] = calculate_stddev(avg_times)
      combined_row[MAX_TIME] = max(max_times)
      combined_row[PEAK_MEM] = max(peak_mems)
      combined_row[EST_PEAK_MEM] = max(est_peak_mems)
      self.rows.append(combined_row)
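
The exec summaries consumed by __build_rows are parallel lists of row dictionaries, one row per plan operator. Assuming the uppercase names are plain string key constants, a minimal two-run input might look like this (all values invented):

PREFIX, OPERATOR, NUM_HOSTS, NUM_ROWS, EST_NUM_ROWS, DETAIL = (
    'prefix', 'operator', 'num_hosts', 'num_rows', 'est_num_rows', 'detail')
AVG_TIME, MAX_TIME, PEAK_MEM, EST_PEAK_MEM = (
    'avg_time', 'max_time', 'peak_mem', 'est_peak_mem')

exec_summaries = [
    # First run: a single-operator plan.
    [{PREFIX: '', OPERATOR: '00:SCAN HDFS', NUM_HOSTS: 3, NUM_ROWS: 1000,
      EST_NUM_ROWS: 900, DETAIL: 'tpch.lineitem', AVG_TIME: 1.2, MAX_TIME: 1.5,
      PEAK_MEM: 64, EST_PEAK_MEM: 80}],
    # Second run of the same plan; only the measured values differ.
    [{PREFIX: '', OPERATOR: '00:SCAN HDFS', NUM_HOSTS: 3, NUM_ROWS: 1000,
      EST_NUM_ROWS: 900, DETAIL: 'tpch.lineitem', AVG_TIME: 1.4, MAX_TIME: 1.6,
      PEAK_MEM: 72, EST_PEAK_MEM: 80}],
]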
Example #11
def calculate_time_stats(grouped):
    """Adds statistics to the nested dictionary. We are calculating the average runtime
     and Standard Deviation for each query type.
  """

    for workload_scale, workload in grouped.items():
        for file_format, queries in workload.items():
            for query_name, results in queries.items():
                result_list = results[RESULT_LIST]
                avg = calculate_avg([
                    query_results[TIME_TAKEN] for query_results in result_list
                ])
                dev = calculate_stddev([
                    query_results[TIME_TAKEN] for query_results in result_list
                ])
                num_clients = max(
                    int(query_results[CLIENT_NAME])
                    for query_results in result_list)
                iterations = len(result_list)
                results[AVG] = avg
                results[STDDEV] = dev
                results[NUM_CLIENTS] = num_clients
                results[ITERATIONS] = iterations
Example #12
def construct_exec_result(iterations, query, results):
  """
  Calculate average running time and standard deviation.

  The summary of the first result is used as the summary for the entire execution.
  """
  # Use the output from the first result.
  exec_result = QueryExecResult()
  exec_result.query = query
  exec_result.data = results[0].data
  exec_result.beeswax_result = results[0]
  exec_result.set_result_note(results[0].summary)
  exec_result.runtime_profile = results[0].runtime_profile
  # If running more than 2 iterations, throw the first result out. Don't throw away
  # the first result if iterations = 2 to preserve the stddev calculation.
  if iterations > 2:
    results = results[1:]

  runtimes = [r.time_taken for r in results]
  exec_result.success = True
  exec_result.avg_time = calculate_avg(runtimes)
  if iterations > 1:
    exec_result.std_dev = calculate_stddev(runtimes)
  return exec_result
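
Note the contrast with create_exec_result in Example #5: that variant only marks the result successful once it has received one execution time per requested iteration, whereas construct_exec_result assumes success and instead decides whether the warm-up run should be excluded from the averages.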