'%(test_crash_count)s crashes occurred.\n'
'%(known_error_count)s queries were excluded from the mismatch count because '
'they are known errors.\n'
'%(query_timeout_count)s queries timed out and were excluded from all counts.') \
  % summary_params  # NOTE(review): tail of a summary-message expression started above this view


if __name__ == '__main__':
  # Script entry point: build the CLI for a test-vs-reference database comparison run.
  # NOTE(review): this chunk is truncated below — options after --query-count (and the
  # parse_args call) are outside this view.
  import sys
  from optparse import OptionParser
  import tests.comparison.cli_options as cli_options
  from tests.comparison.query_profile import PROFILES

  parser = OptionParser()
  # Shared option groups (logging, db name, connection settings) come from cli_options.
  cli_options.add_logging_options(parser)
  cli_options.add_db_name_option(parser)
  cli_options.add_connection_option_groups(parser)
  # IMPALA/MYSQL/ORACLE/POSTGRESQL are presumably module-level constants defined
  # outside this view — TODO confirm.
  parser.add_option('--test-db-type', default=IMPALA,
      choices=(IMPALA, MYSQL, ORACLE, POSTGRESQL),
      help='The type of the test database to use. Ex: IMPALA.')
  parser.add_option('--ref-db-type', default=POSTGRESQL,
      choices=(MYSQL, ORACLE, POSTGRESQL),
      help='The type of the ref database to use. Ex: POSTGRESQL.')
  parser.add_option('--stop-on-mismatch', default=False, action='store_true',
      help='Exit immediately upon find a discrepancy in a query result.')
  parser.add_option('--stop-on-crash', default=False, action='store_true',
      help='Exit immediately if Impala crashes.')
  parser.add_option('--query-count', default=1000000, type=int,
      help='Exit after running the given number of queries.')
if __name__ == '__main__':
  # Script entry point: CLI for creating/populating test databases (Impala always,
  # Postgres optionally) or migrating an Impala database to another database type.
  from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

  from tests.comparison import cli_options

  parser = ArgumentParser(
      usage='usage: \n'
      ' %(prog)s [options] [populate]\n\n'
      ' Create and populate database(s). The Impala database will always be \n'
      ' included. Postgres is optional. The other databases are not supported.\n\n'
      ' %(prog)s [options] migrate\n\n'
      ' Migrate an Impala database to another database type. The destination \n'
      ' database will be dropped and recreated.',
      formatter_class=ArgumentDefaultsHelpFormatter)
  # Shared option groups (logging, cluster, db name, connections) come from cli_options.
  cli_options.add_logging_options(parser)
  cli_options.add_cluster_options(parser)
  cli_options.add_db_name_option(parser)
  cli_options.add_connection_option_groups(parser)

  group = parser.add_argument_group('Database Population Options')
  group.add_argument(
      '--randomization-seed', default=1, type=int,
      help='The randomization will be initialized with this seed. Using the same seed '
      'will produce the same results across runs.')
  cli_options.add_storage_format_options(group)
  group.add_argument(
      '--create-data-files',
      # NOTE(review): chunk truncated here — the remainder of this add_argument call
      # (and the rest of the script) is outside this view.
def main():
  """Entry point for the concurrent-query stress test against a CM-managed cluster.

  Parses CLI options, loads TPC-DS/TPC-H queries, determines (or reuses cached)
  per-query memory limits and solo runtimes, filters out queries that would use
  too large a fraction of cluster memory, then hands the surviving queries to a
  StressRunner. Raises Exception on invalid options or if queries are already
  running on the cluster.
  """
  from optparse import OptionParser
  import tests.comparison.cli_options as cli_options

  parser = OptionParser(epilog=dedent(
      """Before running this script a CM cluster must be setup and any needed data
      such as TPC-H/DS must be loaded. The first time this script is run it will
      find memory limits and runtimes for each query and save the data to disk
      (since collecting the data is slow) at --runtime-info-path then run the
      stress test. Later runs will reuse the saved memory limits and timings. If
      the cluster changes significantly the memory limits should be re-measured
      (deleting the file at --runtime-info-path will cause re-measuring to
      happen)."""))
  cli_options.add_logging_options(parser)
  cli_options.add_cm_options(parser)
  cli_options.add_db_name_option(parser)
  parser.add_option("--runtime-info-path",
      default=os.path.join(gettempdir(), "{cm_host}_query_runtime_info.json"),
      help="The path to store query runtime info at. '{cm_host}' will be replaced with"
      " the actual host name from --cm-host.")
  parser.add_option("--no-status", action="store_true",
      help="Do not print the status table.")
  parser.add_option("--cancel-current-queries", action="store_true",
      help="Cancel any queries running on the cluster before beginning.")
  parser.add_option("--filter-query-mem-ratio", type=float, default=0.333,
      help="Queries that require this ratio of total available memory will be filtered.")
  parser.add_option("--mem-limit-padding-pct", type=int, default=25,
      help="Pad query mem limits found by solo execution with this percentage when"
      " running concurrently. After padding queries will not be expected to fail"
      " due to mem limit exceeded.")
  parser.add_option("--timeout-multiplier", type=float, default=1.0,
      help="Query timeouts will be multiplied by this value.")
  parser.add_option("--max-queries", type=int, default=100)
  parser.add_option("--tpcds-db-name")
  parser.add_option("--tpch-db-name")
  parser.add_option("--mem-overcommit-pct", type=float, default=0)
  parser.add_option("--mem-spill-probability", type=float, default=0.33,
      dest="spill_probability",
      help="The probability that a mem limit will be set low enough to induce spilling.")
  parser.add_option("--cancel-probability", type=float, default=0.1,
      help="The probability a query will be cancelled.")
  cli_options.add_default_values_to_help(parser)
  opts, args = parser.parse_args()

  # At least one benchmark database must be named or there is nothing to run.
  if not opts.tpcds_db_name and not opts.tpch_db_name:
    raise Exception("At least one of --tpcds-db-name --tpch-db-name is required")

  cli_options.configure_logging(opts.log_level, debug_log_file=opts.debug_log_file,
      log_thread_id=True, log_process_id=True)
  LOG.debug("CLI opts: %s" % (opts, ))
  LOG.debug("CLI args: %s" % (args, ))

  # Locate the Impala service through Cloudera Manager and make sure the cluster
  # is idle before the stress run starts.
  impala = find_impala_in_cm(
      opts.cm_host, opts.cm_user, opts.cm_password, opts.cm_cluster_name)
  if opts.cancel_current_queries:
    impala.cancel_queries()
  if impala.queries_are_running():
    raise Exception("Queries are currently running on the cluster")

  # Expand the {cm_host} placeholder so each cluster gets its own cached
  # runtime-info file.
  runtime_info_path = opts.runtime_info_path
  if "{cm_host}" in runtime_info_path:
    runtime_info_path = runtime_info_path.format(cm_host=opts.cm_host)
  # Cached mem-limit/runtime data keyed by db name then by query SQL text.
  queries_with_runtime_info_by_db_and_sql = load_runtime_info(runtime_info_path, impala)

  # Load the requested TPC workloads and point each query at its database.
  queries = list()
  if opts.tpcds_db_name:
    tpcds_queries = load_tpc_queries("tpcds")
    for query in tpcds_queries:
      query.db_name = opts.tpcds_db_name
    queries.extend(tpcds_queries)
  if opts.tpch_db_name:
    tpch_queries = load_tpc_queries("tpch")
    for query in tpch_queries:
      query.db_name = opts.tpch_db_name
    queries.extend(tpch_queries)
  # Iterate backwards so `del queries[idx]` below cannot skip elements.
  # (xrange: this file is Python 2.)
  for idx in xrange(len(queries) - 1, -1, -1):
    query = queries[idx]
    if query.sql in queries_with_runtime_info_by_db_and_sql[query.db_name]:
      # Reuse previously measured limits/timings instead of re-measuring (slow).
      query = queries_with_runtime_info_by_db_and_sql[query.db_name][query.sql]
      LOG.debug("Reusing previous runtime data for query: " + query.sql)
      queries[idx] = query
    else:
      # Measure mem limits/runtimes by solo execution and persist them for later runs.
      populate_runtime_info(query, impala)
      save_runtime_info(runtime_info_path, query, impala)
    # Pad measured mem limits: concurrent execution needs more headroom than the
    # solo runs the limits were measured with.
    if query.required_mem_mb_with_spilling:
      query.required_mem_mb_with_spilling += int(query.required_mem_mb_with_spilling
          * opts.mem_limit_padding_pct / 100.0)
    if query.required_mem_mb_without_spilling:
      query.required_mem_mb_without_spilling += int(
          query.required_mem_mb_without_spilling * opts.mem_limit_padding_pct / 100.0)
    # Scale expected runtimes; these presumably drive query timeouts downstream.
    if query.solo_runtime_secs_with_spilling:
      query.solo_runtime_secs_with_spilling *= opts.timeout_multiplier
    if query.solo_runtime_secs_without_spilling:
      query.solo_runtime_secs_without_spilling *= opts.timeout_multiplier
    # Remove any queries that would use "too many" resources. This way a larger number
    # of queries will run concurrently.
    if query.required_mem_mb_with_spilling is None \
        or query.required_mem_mb_with_spilling / impala.min_impalad_mem_mb \
        > opts.filter_query_mem_ratio:
      LOG.debug("Filtered query due to mem ratio option: " + query.sql)
      del queries[idx]
  if len(queries) == 0:
    raise Exception("All queries were filtered")

  stress_runner = StressRunner()
  stress_runner.cancel_probability = opts.cancel_probability
  stress_runner.spill_probability = opts.spill_probability
  stress_runner.run_queries(queries, impala, opts.max_queries, opts.mem_overcommit_pct,
      not opts.no_status)