Beispiel #1
0
        '%(known_error_count)s queries were excluded from the mismatch count because '
        'they are known errors.\n'
        '%(query_timeout_count)s queries timed out and were excluded from all counts.') \
            % summary_params


if __name__ == '__main__':
  import sys
  from optparse import OptionParser

  import tests.comparison.cli_options as cli_options
  from tests.comparison.query_profile import PROFILES

  parser = OptionParser()
  cli_options.add_logging_options(parser)
  cli_options.add_db_name_option(parser)
  cli_options.add_connection_option_groups(parser)

  parser.add_option('--test-db-type', default=IMPALA,
      choices=(IMPALA, MYSQL, ORACLE, POSTGRESQL),
      help='The type of the test database to use. Ex: IMPALA.')
  parser.add_option('--ref-db-type', default=POSTGRESQL,
      choices=(MYSQL, ORACLE, POSTGRESQL),
      help='The type of the ref database to use. Ex: POSTGRESQL.')
  parser.add_option('--stop-on-mismatch', default=False, action='store_true',
      help='Exit immediately upon find a discrepancy in a query result.')
  parser.add_option('--stop-on-crash', default=False, action='store_true',
      help='Exit immediately if Impala crashes.')
  parser.add_option('--query-count', default=1000000, type=int,
      help='Exit after running the given number of queries.')
  parser.add_option('--exclude-types', default='',
Beispiel #2
0
    from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

    from tests.comparison import cli_options

    parser = ArgumentParser(
        usage='usage: \n'
        '  %(prog)s [options] [populate]\n\n'
        '     Create and populate database(s). The Impala database will always be \n'
        '     included. Postgres is optional. The other databases are not supported.\n\n'
        '  %(prog)s [options] migrate\n\n'
        '     Migrate an Impala database to another database type. The destination \n'
        '     database will be dropped and recreated.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    cli_options.add_logging_options(parser)
    cli_options.add_cluster_options(parser)
    cli_options.add_db_name_option(parser)
    cli_options.add_connection_option_groups(parser)

    group = parser.add_argument_group('Database Population Options')
    group.add_argument(
        '--randomization-seed',
        default=1,
        type=int,
        help=
        'The randomization will be initialized with this seed. Using the same seed '
        'will produce the same results across runs.')
    cli_options.add_storage_format_options(group)
    group.add_argument(
        '--create-data-files',
        default=False,
        action='store_true',
def main():
  """Command-line entry point: prepare the query set and start the stress run.

  Steps, in order:
    1. Parse the command-line options (at least one of --tpcds-db-name or
       --tpch-db-name must be given).
    2. Configure logging and obtain the Impala handle via find_impala_in_cm().
    3. Optionally cancel queries already running; refuse to continue if any remain.
    4. Load the TPC-DS and/or TPC-H queries and, for each one, either reuse the
       runtime info previously saved at --runtime-info-path or measure it now and
       save it.
    5. Pad the measured mem limits, scale the measured runtimes, and drop queries
       whose mem requirement exceeds --filter-query-mem-ratio of
       impala.min_impalad_mem_mb.
    6. Hand the remaining queries to a StressRunner.

  Raises:
    Exception: if neither TPC database name was given, if queries are still running
        on the cluster, or if every query was filtered out.
  """
  from optparse import OptionParser
  import tests.comparison.cli_options as cli_options

  parser = OptionParser(epilog=dedent(
      """Before running this script a CM cluster must be setup and any needed data
         such as TPC-H/DS must be loaded. The first time this script is run it will
         find memory limits and runtimes for each query and save the data to disk (since
         collecting the data is slow) at --runtime-info-path then run the stress test.
         Later runs will reuse the saved memory limits and timings. If the cluster changes
         significantly the memory limits should be re-measured (deleting the file at
         --runtime-info-path will cause re-measuring to happen)."""))
  cli_options.add_logging_options(parser)
  cli_options.add_cm_options(parser)
  cli_options.add_db_name_option(parser)
  parser.add_option("--runtime-info-path",
      default=os.path.join(gettempdir(), "{cm_host}_query_runtime_info.json"),
      help="The path to store query runtime info at. '{cm_host}' will be replaced with"
      " the actual host name from --cm-host.")
  parser.add_option("--no-status", action="store_true",
      help="Do not print the status table.")
  parser.add_option("--cancel-current-queries", action="store_true",
      help="Cancel any queries running on the cluster before beginning.")
  parser.add_option("--filter-query-mem-ratio", type=float, default=0.333,
      help="Queries that require this ratio of total available memory will be filtered.")
  parser.add_option("--mem-limit-padding-pct", type=int, default=25,
      help="Pad query mem limits found by solo execution with this percentage when"
      " running concurrently. After padding queries will not be expected to fail"
      " due to mem limit exceeded.")
  parser.add_option("--timeout-multiplier", type=float, default=1.0,
      help="Query timeouts will be multiplied by this value.")
  parser.add_option("--max-queries", type=int, default=100)
  parser.add_option("--tpcds-db-name")
  parser.add_option("--tpch-db-name")
  parser.add_option("--mem-overcommit-pct", type=float, default=0)
  parser.add_option("--mem-spill-probability", type=float, default=0.33,
      dest="spill_probability",
      help="The probability that a mem limit will be set low enough to induce spilling.")
  parser.add_option("--cancel-probability", type=float, default=0.1,
      help="The probability a query will be cancelled.")
  cli_options.add_default_values_to_help(parser)
  opts, args = parser.parse_args()

  if not opts.tpcds_db_name and not opts.tpch_db_name:
    raise Exception("At least one of --tpcds-db-name --tpch-db-name is required")

  cli_options.configure_logging(opts.log_level, debug_log_file=opts.debug_log_file,
      log_thread_id=True, log_process_id=True)
  LOG.debug("CLI opts: %s" % (opts, ))
  LOG.debug("CLI args: %s" % (args, ))

  impala = find_impala_in_cm(
      opts.cm_host, opts.cm_user, opts.cm_password, opts.cm_cluster_name)
  if opts.cancel_current_queries:
    impala.cancel_queries()
  if impala.queries_are_running():
    raise Exception("Queries are currently running on the cluster")

  # Only call format() when the placeholder is present so that a user-supplied path
  # containing unrelated braces is left untouched.
  runtime_info_path = opts.runtime_info_path
  if "{cm_host}" in runtime_info_path:
    runtime_info_path = runtime_info_path.format(cm_host=opts.cm_host)
  queries_with_runtime_info_by_db_and_sql = load_runtime_info(runtime_info_path, impala)

  queries = list()
  if opts.tpcds_db_name:
    tpcds_queries = load_tpc_queries("tpcds")
    for query in tpcds_queries:
      query.db_name = opts.tpcds_db_name
    queries.extend(tpcds_queries)
  if opts.tpch_db_name:
    tpch_queries = load_tpc_queries("tpch")
    for query in tpch_queries:
      query.db_name = opts.tpch_db_name
    queries.extend(tpch_queries)

  # Walk the list back to front: entries may be deleted in place below, and going in
  # reverse keeps the indexes of the not-yet-visited entries valid.
  for idx in xrange(len(queries) - 1, -1, -1):
    query = queries[idx]
    if query.sql in queries_with_runtime_info_by_db_and_sql[query.db_name]:
      # Swap in the saved query object, which already carries the measured values.
      query = queries_with_runtime_info_by_db_and_sql[query.db_name][query.sql]
      LOG.debug("Reusing previous runtime data for query: " + query.sql)
      queries[idx] = query
    else:
      populate_runtime_info(query, impala)
      save_runtime_info(runtime_info_path, query, impala)

    # Padding and timeout scaling are applied after saving, so the file on disk keeps
    # the raw measurements and the adjustments are redone from the CLI on every run.
    if query.required_mem_mb_with_spilling:
      query.required_mem_mb_with_spilling += int(query.required_mem_mb_with_spilling
          * opts.mem_limit_padding_pct / 100.0)
    if query.required_mem_mb_without_spilling:
      query.required_mem_mb_without_spilling += int(query.required_mem_mb_without_spilling
          * opts.mem_limit_padding_pct / 100.0)
    if query.solo_runtime_secs_with_spilling:
      query.solo_runtime_secs_with_spilling *= opts.timeout_multiplier
    if query.solo_runtime_secs_without_spilling:
      query.solo_runtime_secs_without_spilling *= opts.timeout_multiplier

    # Remove any queries that would use "too many" resources. This way a larger number
    # of queries will run concurrently.
    # float() forces true division: with two ints under Python 2 the quotient would be
    # floored (usually to 0) and the ratio check below would almost never trigger.
    if query.required_mem_mb_with_spilling is None \
        or query.required_mem_mb_with_spilling / float(impala.min_impalad_mem_mb) \
            > opts.filter_query_mem_ratio:
      LOG.debug("Filtered query due to mem ratio option: " + query.sql)
      del queries[idx]
  if not queries:
    raise Exception("All queries were filtered")

  stress_runner = StressRunner()
  stress_runner.cancel_probability = opts.cancel_probability
  stress_runner.spill_probability = opts.spill_probability
  stress_runner.run_queries(queries, impala, opts.max_queries, opts.mem_overcommit_pct,
      not opts.no_status)