def populate_db_with_random_data(self, db_name, db_connectors,
                                 min_number_of_tables, max_number_of_tables,
                                 min_number_of_cols, max_number_of_cols,
                                 min_number_of_rows, max_number_of_rows,
                                 allowed_storage_formats, create_files):
    '''Create tables with a random number of cols and bulk load random rows.

    The given db_name must have already been created.

    One connection is created per entry in db_connectors. A random number of
    tables (between min_number_of_tables and max_number_of_tables) is created
    through self.create_random_table, and each table receives a random number
    of rows (between min_number_of_rows and max_number_of_rows) generated in
    batches of at most 1000 rows. Every batch is handed to every connection.
    Finally self.index_tables_in_database is called with all connections.

    All connections opened here, including the Hive helper connections, are
    closed before returning, also when an error interrupts the load.

    create_files is accepted for interface compatibility; this function does
    not read it.
    '''
    connections = list()
    # Every Hive helper connection is tracked so that all of them get closed,
    # not only the one created for the last Impala connector.
    hive_connections = list()
    try:
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage
                # formats. Eventually Impala should be able to write in all
                # formats and this can be removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                hive_connections.append(hive_connection)
                connection.hive_connection = hive_connection

        for table_idx in xrange(
                randint(min_number_of_tables, max_number_of_tables)):
            table = self.create_random_table(
                'table_%s' % (table_idx + 1),
                min_number_of_cols,
                max_number_of_cols,
                allowed_storage_formats)

            for connection in connections:
                # NOTE(review): the data file is handed over to the connection;
                # presumably end_bulk_load_table() closes it -- confirm.
                connection.bulk_load_data_file = open(
                    "/tmp/%s_%s.data" % (table.name, connection.db_type.lower()),
                    "w")
                connection.begin_bulk_load_table(table)

            row_count = randint(min_number_of_rows, max_number_of_rows)
            LOG.info('Inserting %s rows into %s', row_count, table.name)
            while row_count:
                # Generate the data once per batch and feed the very same rows
                # to every connection.
                batch_size = min(1000, row_count)
                rows = self.generate_table_data(table, number_of_rows=batch_size)
                row_count -= batch_size
                for connection in connections:
                    connection.handle_bulk_load_table_data(rows)

            for connection in connections:
                connection.end_bulk_load_table()

        self.index_tables_in_database(connections)
    finally:
        # Release the connections even if table creation or loading failed.
        for connection in connections + hive_connections:
            connection.close()
def populate_db_with_random_data(self, db_name, db_connectors,
                                 min_number_of_tables, max_number_of_tables,
                                 min_number_of_cols, max_number_of_cols,
                                 min_number_of_rows, max_number_of_rows,
                                 allowed_storage_formats, create_files):
    '''Create tables with a random number of cols and bulk load random rows.

    The given db_name must have already been created.

    A connection is created for each connector in db_connectors. Between
    min_number_of_tables and max_number_of_tables tables are created with
    self.create_random_table; each one is filled with between
    min_number_of_rows and max_number_of_rows rows, generated in batches of
    at most 1000 and passed to every connection. Afterwards
    self.index_tables_in_database is called with all connections.

    Every connection opened here (the Hive helper connections included) is
    closed before the function returns or propagates an error.

    create_files is part of the signature but is not read by this function.
    '''
    connections = list()
    # Keep track of all Hive helper connections; with several Impala
    # connectors a single variable would only remember the last one.
    hive_connections = list()
    try:
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage
                # formats. Eventually Impala should be able to write in all
                # formats and this can be removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                hive_connections.append(hive_connection)
                connection.hive_connection = hive_connection

        number_of_tables = randint(min_number_of_tables, max_number_of_tables)
        for table_idx in xrange(number_of_tables):
            table = self.create_random_table('table_%s' % (table_idx + 1),
                                             min_number_of_cols,
                                             max_number_of_cols,
                                             allowed_storage_formats)

            for connection in connections:
                # NOTE(review): ownership of the file passes to the connection;
                # presumably end_bulk_load_table() closes it -- confirm.
                connection.bulk_load_data_file = open(
                    "/tmp/%s_%s.data" % (table.name, connection.db_type.lower()),
                    "w")
                connection.begin_bulk_load_table(table)

            row_count = randint(min_number_of_rows, max_number_of_rows)
            LOG.info('Inserting %s rows into %s', row_count, table.name)
            while row_count:
                # The same generated batch is loaded through every connection.
                batch_size = min(1000, row_count)
                rows = self.generate_table_data(table, number_of_rows=batch_size)
                row_count -= batch_size
                for connection in connections:
                    connection.handle_bulk_load_table_data(rows)

            for connection in connections:
                connection.end_bulk_load_table()

        self.index_tables_in_database(connections)
    finally:
        # Always release the connections, even after a failed load.
        for connection in connections + hive_connections:
            connection.close()
def start_impala(self):
    '''Start Impala through the environment and open a fresh test connection.

    Stores the new connection in self.test_connection, reconnects it, and
    rebuilds self.query_result_comparator from self.ref_connection and the
    new test connection.
    '''
    env = self.impala_env
    env.start_impala()

    impala_connector = DbConnector(
        IMPALA,
        user_name=None,
        password=None,
        host_name=env.host,
        port=env.impala_port)
    test_connection = impala_connector.create_connection(DATABASE_NAME)
    self.test_connection = test_connection
    test_connection.reconnect()

    comparator = QueryResultComparator(self.ref_connection, test_connection)
    self.query_result_comparator = comparator
    LOG.info('Created query result comparator')
    # Dump the comparator's attributes into the log.
    LOG.info(str(comparator.__dict__))
def populate_db_with_random_data(self, db_name, db_connectors,
                                 min_number_of_tables, max_number_of_tables,
                                 min_number_of_cols, max_number_of_cols,
                                 min_number_of_rows, max_number_of_rows,
                                 allowed_storage_formats, create_files):
    '''Create tables with a random number of cols and populate them.

    The given db_name must have already been created.

    One connection is created per entry in db_connectors. A random number of
    tables (between min_number_of_tables and max_number_of_tables) is built
    with self.create_random_table, then the tables, the connections and the
    row-count bounds are handed to self.populate_tables_with_random_data with
    create_tables=True.

    All connections opened here, including the Hive helper connections, are
    closed before returning, also when an error interrupts the work.

    create_files is accepted for interface compatibility; this function does
    not read it.
    '''
    connections = list()
    # Every Hive helper connection is tracked so that all of them get closed,
    # not only the one created for the last Impala connector.
    hive_connections = list()
    try:
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage
                # formats. Eventually Impala should be able to write in all
                # formats and this can be removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                hive_connections.append(hive_connection)
                connection.hive_connection = hive_connection

        tables = list()
        for table_idx in xrange(
                randint(min_number_of_tables, max_number_of_tables)):
            table = self.create_random_table(
                'table_%s' % (table_idx + 1),
                min_number_of_cols,
                max_number_of_cols,
                allowed_storage_formats)
            tables.append(table)

        self.populate_tables_with_random_data(
            tables,
            connections,
            min_number_of_rows,
            max_number_of_rows,
            create_tables=True)
    finally:
        # Release the connections even if table creation or loading failed.
        for connection in connections + hive_connections:
            connection.close()
def populate_db_with_random_data(self, db_name, db_connectors,
                                 min_number_of_tables, max_number_of_tables,
                                 min_number_of_cols, max_number_of_cols,
                                 min_number_of_rows, max_number_of_rows,
                                 allowed_storage_formats, create_files):
    '''Create tables with a random number of cols and populate them.

    The given db_name must have already been created.

    A connection is created for each connector in db_connectors. Between
    min_number_of_tables and max_number_of_tables tables are created with
    self.create_random_table and then passed, together with the connections
    and the row-count bounds, to self.populate_tables_with_random_data
    (create_tables=True).

    Every connection opened here (the Hive helper connections included) is
    closed before the function returns or propagates an error.

    create_files is part of the signature but is not read by this function.
    '''
    connections = list()
    # Keep track of all Hive helper connections; with several Impala
    # connectors a single variable would only remember the last one.
    hive_connections = list()
    try:
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage
                # formats. Eventually Impala should be able to write in all
                # formats and this can be removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                hive_connections.append(hive_connection)
                connection.hive_connection = hive_connection

        number_of_tables = randint(min_number_of_tables, max_number_of_tables)
        tables = list()
        for table_idx in xrange(number_of_tables):
            tables.append(
                self.create_random_table('table_%s' % (table_idx + 1),
                                         min_number_of_cols,
                                         max_number_of_cols,
                                         allowed_storage_formats))

        self.populate_tables_with_random_data(tables,
                                              connections,
                                              min_number_of_rows,
                                              max_number_of_rows,
                                              create_tables=True)
    finally:
        # Always release the connections, even after a failed load.
        for connection in connections + hive_connections:
            connection.close()
def start_impala(self):
    '''Bring up Impala and connect to it.

    After the environment has started Impala, a new connection to
    DATABASE_NAME is stored in self.test_connection and reconnected, and a
    new QueryResultComparator over the reference and test connections is
    stored in self.query_result_comparator.
    '''
    self.impala_env.start_impala()

    connector_kwargs = dict(
        user_name=None,
        password=None,
        host_name=self.impala_env.host,
        port=self.impala_env.impala_port)
    connector = DbConnector(IMPALA, **connector_kwargs)
    self.test_connection = connector.create_connection(DATABASE_NAME)
    self.test_connection.reconnect()

    self.query_result_comparator = QueryResultComparator(
        self.ref_connection, self.test_connection)
    LOG.info('Created query result comparator')
    # Log the comparator's attribute dictionary.
    comparator_state = str(self.query_result_comparator.__dict__)
    LOG.info(comparator_state)
def prepare(self):
    '''Prepare the environment and connect to the reference and test databases.

    Prepares self.impala_env, opens the Postgres reference connection to
    DATABASE_NAME on the environment's host, starts Impala (which also sets
    up the test connection) and records the environment's git hash.
    '''
    env = self.impala_env

    LOG.info('Starting Job Preparation')
    env.prepare()
    LOG.info('Job Preparation Complete')

    postgres_connector = DbConnector(
        POSTGRESQL,
        user_name=POSTGRES_USER_NAME,
        password=None,
        host_name=env.host,
        port=env.postgres_port)
    self.ref_connection = postgres_connector.create_connection(DATABASE_NAME)
    LOG.info('Create Ref Connection')

    self.start_impala()
    self.git_hash = env.get_git_hash()
) group.add_option( '--postgresql-password', help='The password to use when connecting to the Postgresql database.') parser.add_option_group(group) for group in parser.option_groups + [parser]: for option in group.option_list: if option.default != NO_DEFAULT: option.help += " [default: %default]" options, args = parser.parse_args() basicConfig(level=options.log_level) impala_connection = DbConnector(IMPALA).create_connection(options.db_name) db_connector_param_key = options.reference_db_type.lower() reference_connection = DbConnector(options.reference_db_type, user_name=getattr(options, db_connector_param_key + '_user'), password=getattr(options, db_connector_param_key + '_password'), host_name=getattr(options, db_connector_param_key + '_host'), port=getattr(options, db_connector_param_key + '_port')) \ .create_connection(options.db_name) if options.exclude_types: exclude_types = set(type_name.lower() for type_name in options.exclude_types.split(',')) filter_col_types = [ type_ for type_ in TYPES if type_.__name__.lower() in exclude_types ] else: filter_col_types = []
def connect(self):
    """Open a connection to the impalad in self.impalad.

    The connection is created from the impalad's host_name and port and is
    stored in self.impalad_conn.
    """
    impalad = self.impalad
    connector = DbConnector(
        IMPALA, host_name=impalad.host_name, port=impalad.port)
    self.impalad_conn = connector.create_connection()
class QueryRunner(object):
    """Encapsulates functionality to run a query and provide a runtime report.

    Set `impalad` (an object exposing `host_name` and `port`) and call
    connect() before run_query().
    """

    # Matched against the query profile to detect that memory was spilled.
    SPILLED_PATTERN = re.compile("ExecOption:.*Spilled")

    # Number of rows fetched per round trip while hashing the result set.
    BATCH_SIZE = 1024

    def __init__(self):
        self.impalad = None
        self.impalad_conn = None

    def connect(self):
        """Open a connection to self.impalad and store it in self.impalad_conn."""
        self.impalad_conn = DbConnector(
            IMPALA,
            host_name=self.impalad.host_name,
            port=self.impalad.port
        ).create_connection()

    def disconnect(self):
        """Close the current connection, if any, and forget it."""
        if self.impalad_conn:
            self.impalad_conn.close()
            self.impalad_conn = None

    def run_query(self, query, timeout_secs, mem_limit_mb):
        """Run a query and return an execution report.

        `query` must expose `db_name` and `sql`. The query is run with
        MEM_LIMIT set to `mem_limit_mb` megabytes. If it does not finish
        executing and fetching within `timeout_secs`, it is cancelled and the
        report is returned with `timed_out` set. Failures are recorded on the
        report (`mem_limit_exceeded` or `non_mem_limit_error`) rather than
        raised.

        Raises:
            Exception: if connect() has not been called first.
        """
        if not self.impalad_conn:
            raise Exception("connect() must first be called")

        timeout_unix_time = time() + timeout_secs
        report = QueryReport()
        try:
            with self.impalad_conn.open_cursor() as cursor:
                start_time = time()
                LOG.debug("Setting mem limit to %s MB", mem_limit_mb)
                cursor.execute("SET MEM_LIMIT=%sM" % mem_limit_mb)
                LOG.debug("Using %s database", query.db_name)
                cursor.execute("USE %s" % query.db_name)
                LOG.debug(
                    "Running query with %s MB mem limit at %s with timeout secs %s:\n%s",
                    mem_limit_mb, self.impalad.host_name, timeout_secs, query.sql)
                try:
                    # The leading SQL comment records the mem limit and the
                    # coordinator host in the statement text itself.
                    cursor.execute_async(
                        "/* Mem: %s MB. Coordinator: %s. */\n"
                        % (mem_limit_mb, self.impalad.host_name) + query.sql)
                    LOG.debug("Query id is %s", cursor._last_operation_handle)
                    # Poll until execution finishes or the deadline passes.
                    while cursor.is_executing():
                        if time() > timeout_unix_time:
                            self._cancel(cursor, report)
                            return report
                        sleep(0.1)
                    try:
                        report.result_hash = self._hash_result(
                            cursor, timeout_unix_time)
                    except QueryTimeout:
                        self._cancel(cursor, report)
                        return report
                except Exception as error:
                    LOG.debug("Error running query with id %s: %s",
                              cursor._last_operation_handle, error)
                    self._check_for_mem_limit_exceeded(report, cursor, error)
                if report.non_mem_limit_error or report.mem_limit_exceeded:
                    return report
                report.runtime_secs = time() - start_time
                report.profile = cursor.get_profile()
                report.mem_was_spilled = \
                    QueryRunner.SPILLED_PATTERN.search(report.profile) is not None
        except Exception as error:
            # A mem limit error would have been caught above, no need to check
            # for that here.
            report.non_mem_limit_error = error
        return report

    def _cancel(self, cursor, report):
        """Mark the report as timed out and cancel the query if one was started."""
        report.timed_out = True
        if cursor._last_operation_handle:
            LOG.debug("Attempting cancellation of query with id %s",
                      cursor._last_operation_handle)
            cursor.cancel_operation()

    def _check_for_mem_limit_exceeded(self, report, cursor, caught_exception):
        """To be called after a query failure to check for signs of failed due to a
        mem limit. The report will be updated accordingly.
        """
        if cursor._last_operation_handle:
            # Fetching the profile is best effort; a failure here is only logged.
            try:
                report.profile = cursor.get_profile()
            except Exception as e:
                LOG.debug("Error getting profile for query with id %s: %s",
                          cursor._last_operation_handle, e)
        if "memory limit exceeded" in str(caught_exception).lower():
            report.mem_limit_exceeded = True
            return
        LOG.error("Non-mem limit error for query with id %s: %s",
                  cursor._last_operation_handle, caught_exception, exc_info=True)
        report.non_mem_limit_error = caught_exception

    def _hash_result(self, cursor, timeout_unix_time):
        """Returns a hash that is independent of row order.

        The rows are fetched and hashed in a separate daemon thread so that the
        caller can give up once `timeout_unix_time` has passed.

        Raises:
            QueryTimeout: if hashing did not finish before the deadline.
            Exception: any error raised while fetching or hashing is re-raised
                in the calling thread.
        """
        # A value of 1 indicates that the hash thread should continue to work.
        should_continue = Value("i", 1)

        def hash_result_impl():
            # The running hash and any error are stored as attributes on the
            # worker thread object so the caller can read them after join().
            try:
                current_thread().result = 1
                while should_continue.value:
                    LOG.debug("Fetching result for query with id %s",
                              cursor._last_operation_handle)
                    rows = cursor.fetchmany(self.BATCH_SIZE)
                    if not rows:
                        return
                    for row in rows:
                        for idx, val in enumerate(row):
                            # Floats returned by Impala may not be deterministic,
                            # the ending insignificant digits may differ. Only
                            # the first 6 digits will be used after rounding.
                            if isinstance(val, float):
                                sval = "%f" % val
                                dot_idx = sval.find(".")
                                val = round(val, 6 - dot_idx)
                            current_thread().result += (idx + 1) * hash(val)
                            # Modulo the result to Keep it "small" otherwise the
                            # math ops can be slow since python does infinite
                            # precision math.
                            current_thread().result %= maxint
            except Exception as e:
                current_thread().error = e

        hash_thread = create_and_start_daemon_thread(hash_result_impl)
        hash_thread.join(max(timeout_unix_time - time(), 0))
        if hash_thread.is_alive():
            # Ask the worker to stop after its current batch.
            should_continue.value = 0
            raise QueryTimeout()
        # Within this class `error` is only assigned when the worker fails, so
        # read it defensively in case the thread factory did not pre-set it.
        hash_error = getattr(hash_thread, "error", None)
        if hash_error:
            raise hash_error
        return hash_thread.result
# The first positional argument selects the command; default to 'populate'.
if args:
    command = args[0]
else:
    command = 'populate'

if len(args) > 1 or command not in ('populate', 'migrate'):
    raise Exception(
        'Command must either be "populate" or "migrate" but was "%s"'
        % ' '.join(args))

# Migrating requires at least one destination database to be selected.
if command == 'migrate' \
        and not (options.use_mysql or options.use_postgresql):
    raise Exception(
        'At least one destination database must be chosen with '
        '--use-<database type>')

basicConfig(level=options.log_level)
seed(options.randomization_seed)

impala_connector = DbConnector(IMPALA)

# Build one connector per selected destination database.
db_connectors = []
if options.use_postgresql:
    postgresql_connector = DbConnector(
        POSTGRESQL,
        user_name=options.postgresql_user,
        password=options.postgresql_password,
        host_name=options.postgresql_host,
        port=options.postgresql_port)
    db_connectors.append(postgresql_connector)
if options.use_mysql:
    mysql_connector = DbConnector(
        MYSQL,
        user_name=options.mysql_user,
        password=options.mysql_password,
        host_name=options.mysql_host,
        port=options.mysql_port)
    db_connectors.append(mysql_connector)
class Job(object):
    '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it.
    Able to execute queries by either generating them based on a provided query
    profile or by extracting queries from an existing report. A report is
    generated when it finishes running.
    '''

    def __init__(self,
                 query_profile,
                 job_id,
                 run_name='default',
                 time_limit_sec=24 * 3600,
                 git_command=None,
                 parent_job=None):
        '''Create a job.

        query_profile is passed to QueryGenerator when queries are generated.
        job_id names the pickle and schedule files of this job. run_name is
        stored as job_name. time_limit_sec is the time budget measured from
        construction. git_command is passed to ImpalaDockerEnv. parent_job, if
        set, names an existing report whose queries are re-run instead of
        generating new ones.
        '''
        self.git_hash = ''
        self.impala_env = ImpalaDockerEnv(git_command)
        self.job_id = job_id
        self.job_name = run_name
        self.parent_job = parent_job
        self.query_profile = query_profile
        self.ref_connection = None
        self.result_list = []
        self.start_time = time()
        self.stop_time = None
        self.target_stop_time = time() + time_limit_sec
        self.test_connection = None
        self.num_queries_executed = 0

    def __getstate__(self):
        '''For pickling. Only plain result data is kept; connections and the
        environment are left out.
        '''
        result = {}
        result['job_id'] = self.job_id
        result['job_name'] = self.job_name
        result['parent_job'] = self.parent_job
        result['result_list'] = self.result_list
        result['git_hash'] = self.git_hash
        result['start_time'] = self.start_time
        result['stop_time'] = self.stop_time
        result['num_queries_executed'] = self.num_queries_executed
        return result

    def prepare(self):
        '''Prepares the environment and connects to Postgres and Impala running
        inside the Docker container.
        '''
        LOG.info('Starting Job Preparation')
        self.impala_env.prepare()
        LOG.info('Job Preparation Complete')
        self.ref_connection = DbConnector(
            POSTGRESQL,
            user_name=POSTGRES_USER_NAME,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.postgres_port).create_connection(DATABASE_NAME)
        LOG.info('Create Ref Connection')
        self.start_impala()
        self.git_hash = self.impala_env.get_git_hash()

    def get_stack(self):
        '''Fetch the stack trace from the environment, log it and return it.'''
        stack_trace = self.impala_env.get_stack()
        LOG.info('Stack Trace: {0}'.format(stack_trace))
        return stack_trace

    def start_impala(self):
        '''Starts impala and creates a connection to it. Also rebuilds the
        query result comparator on top of the new connection.
        '''
        self.impala_env.start_impala()
        self.test_connection = DbConnector(
            IMPALA,
            user_name=None,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.impala_port).create_connection(DATABASE_NAME)
        self.test_connection.reconnect()
        self.query_result_comparator = QueryResultComparator(
            self.ref_connection, self.test_connection)
        LOG.info('Created query result comparator')
        LOG.info(str(self.query_result_comparator.__dict__))

    def is_impala_running(self):
        '''Return whether the environment reports Impala as running.'''
        return self.impala_env.is_impala_running()

    def save_pickle(self):
        '''Saves self as pickle. This is normally done when the job finishes
        running.
        '''
        # Pickle data is binary, so the file must be opened in binary mode.
        with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'wb') as f:
            pickle.dump(self, f)
        LOG.info('Saved Completed Job Pickle')

    def queries_to_be_executed(self):
        '''Generator that outputs query models. They are either generated based
        on the query profile, or they are extracted from an existing report.
        '''
        if self.parent_job:
            # If parent job is specified, get the queries from the parent job
            # report. NOTE: unpickling is only safe for trusted report files.
            with open(join_path(PATH_TO_REPORTS, self.parent_job), 'rb') as f:
                parent_report = pickle.load(f)
            # for error_type in ['stack', 'row_counts', 'mismatch']:
            for error_type in ['stack']:
                for query in parent_report.grouped_results[error_type]:
                    yield query['model']
        else:
            # If parent job is not specified, generate queries with
            # QueryGenerator
            num_unexpected_errors = 0
            while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
                query = None
                try:
                    query = self.query_generator.create_query(self.common_tables)
                except IndexError as e:
                    # This is a query generator bug that happens extremely rarely
                    LOG.info('Query Generator Choice Problem, {0}'.format(e))
                    continue
                except Exception as e:
                    LOG.info(
                        'Unexpected error in queries_to_be_executed, {0}'.format(e))
                    # Start over with a fresh generator after an unexpected
                    # failure.
                    self.query_generator = QueryGenerator(self.query_profile)
                    num_unexpected_errors += 1
                    # NOTE(review): the counter can reach but never exceed the
                    # threshold here, so this branch does not fire; the loop
                    # condition ends the generator instead. Kept as is.
                    if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
                        LOG.error('Num Unexpected Errors above threshold')
                        raise
                    else:
                        continue
                query.execution = 'RAW'
                yield query

    def generate_report(self):
        '''Generate report and save it into the reports directory'''
        from report import Report
        rep = Report(self.job_id)
        rep.save_pickle()

    def start(self):
        '''Run the job: prepare the environment, execute queries until they run
        out or the time limit passes, then save the job pickle and the report.
        The Docker environment is stopped and the schedule file removed in any
        case.
        '''
        try:
            self.prepare()
            self.query_generator = QueryGenerator(self.query_profile)
            self.common_tables = DbConnection.describe_common_tables(
                [self.ref_connection, self.test_connection])

            for query_model in self.queries_to_be_executed():
                LOG.info('About to execute query.')
                result_dict = self.run_query(query_model)
                LOG.info('Query Executed successfully.')
                if result_dict:
                    self.num_queries_executed += 1
                    self.result_list.append(result_dict)
                LOG.info('Time Left: {0}'.format(self.target_stop_time - time()))
                if time() > self.target_stop_time:
                    break
            self.stop_time = time()
            self.save_pickle()
            self.generate_report()
            LOG.info('Generated Report')
        except:
            # Log everything, including interrupts, and let it propagate.
            LOG.exception('Unexpected Exception in start')
            raise
        finally:
            self.impala_env.stop_docker()
            LOG.info('Docker Stopped')
            try:
                os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
                LOG.info('Schedule file removed')
            except OSError:
                LOG.info('Unable to remove schedule file.')

    def reproduce_crash(self, query_model):
        '''Check if the given query_model causes a crash. Returns the number of
        times the query had to be run to cause a crash, or None if Impala was
        still running after all tries.
        '''
        NUM_TRIES = 5
        self.start_impala()
        for try_num in range(1, NUM_TRIES + 1):
            self.query_result_comparator.compare_query_results(query_model)
            if not self.is_impala_running():
                return try_num

    def run_query(self, query_model):
        '''Runs a single query.

        Returns a result dict when the comparison reported an error or Impala
        crashed, an empty dict when nothing noteworthy happened, and None when
        the comparison did not finish in time and the environment had to be
        restarted.
        '''
        if not self.is_impala_running():
            LOG.info('Impala is not running, starting Impala.')
            self.start_impala()

        def run_query_internal():
            self.comparison_result = \
                self.query_result_comparator.compare_query_results(query_model)

        # 10 minute time out to avoid cursor close problem?
        self.comparison_result = None
        internal_thread = Thread(
            target=run_query_internal,
            name='run_query_internal_{0}'.format(self.job_id))
        internal_thread.daemon = True
        internal_thread.start()
        internal_thread.join(timeout=600)
        if internal_thread.is_alive():
            LOG.info('run_query_internal is alive, restarting Impala Environment')
            self.impala_env.stop_docker()
            self.prepare()
            return None
        else:
            LOG.info('run_query_internal is dead as expected')

        # NOTE(review): this is still None if the comparison raised inside the
        # thread; the attribute access below would then fail -- confirm that
        # compare_query_results reports errors on its result instead of raising.
        comparison_result = self.comparison_result

        if comparison_result.query_timed_out:
            LOG.info('Query Timeout Exception')
            restart_impala = True
        else:
            restart_impala = False

        result_dict = {}

        if self.is_impala_running():
            if comparison_result.error:
                result_dict = self.comparison_result_analysis(comparison_result)
                result_dict['model'] = query_model
        else:
            # Impala went down while running the query: record the stack and
            # how many runs it takes to reproduce the crash.
            LOG.info('CRASH OCCURED')
            result_dict = self.comparison_result_analysis(comparison_result)
            result_dict['model'] = query_model
            result_dict['stack'] = self.get_stack()
            result_dict['num_tries_to_reproduce'] = \
                self.reproduce_crash(query_model)

        if restart_impala:
            self.start_impala()

        return result_dict

    def comparison_result_analysis(self, comparison_result):
        '''Get useful information from the comparison_result.'''
        result_dict = {}
        result_dict['error'] = comparison_result.error
        result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number
        result_dict['mismatch_ref_row'] = comparison_result.ref_row
        result_dict['mismatch_test_row'] = comparison_result.test_row
        result_dict['ref_row_count'] = comparison_result.ref_row_count
        result_dict['ref_sql'] = comparison_result.ref_sql
        result_dict['test_row_count'] = comparison_result.test_row_count
        result_dict['test_sql'] = comparison_result.test_sql
        return result_dict
'--profile', default='default', choices=(sorted(profiles.keys())), help= 'Determines the mix of SQL features to use during query generation.') # TODO: Seed the random query generator for repeatable queries? cli_options.add_default_values_to_help(parser) options, args = parser.parse_args() cli_options.configure_logging(options.log_level) db_connector_param_key = options.ref_db_type.lower() ref_connection = DbConnector(options.ref_db_type, user_name=getattr(options, db_connector_param_key + '_user'), password=getattr(options, db_connector_param_key + '_password'), host_name=getattr(options, db_connector_param_key + '_host'), port=getattr(options, db_connector_param_key + '_port')) \ .create_connection(options.db_name) db_connector_param_key = options.test_db_type.lower() test_connection = DbConnector(options.test_db_type, user_name=getattr(options, db_connector_param_key + '_user', None), password=getattr(options, db_connector_param_key + '_password', None), host_name=getattr(options, db_connector_param_key + '_host', None), port=getattr(options, db_connector_param_key + '_port', None)) \ .create_connection(options.db_name) # Create an instance of profile class (e.g. DefaultProfile) query_profile = profiles[options.profile]() diff_searcher = QueryResultDiffSearcher(query_profile, ref_connection, test_connection) query_timeout_seconds = options.timeout search_results = diff_searcher.search(options.query_count,
class Job(object):
    '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it.
    Able to execute queries by either generating them based on a provided query
    profile or by extracting queries from an existing report. A report is
    generated when it finishes running.
    '''

    def __init__(self,
                 query_profile,
                 job_id,
                 run_name='default',
                 time_limit_sec=24 * 3600,
                 git_command=None,
                 parent_job=None):
        '''Set up the job state.

        query_profile is handed to QueryGenerator when queries are generated.
        job_id is used as the file name of the job pickle and schedule file.
        run_name is stored as job_name. time_limit_sec is added to the current
        time to get the target stop time. git_command is handed to
        ImpalaDockerEnv. parent_job, when given, names a report whose queries
        are replayed instead of generating new ones.
        '''
        self.git_hash = ''
        self.impala_env = ImpalaDockerEnv(git_command)
        self.job_id = job_id
        self.job_name = run_name
        self.parent_job = parent_job
        self.query_profile = query_profile
        self.ref_connection = None
        self.result_list = []
        self.start_time = time()
        self.stop_time = None
        self.target_stop_time = time() + time_limit_sec
        self.test_connection = None
        self.num_queries_executed = 0

    def __getstate__(self):
        '''For pickling. Returns only the plain result fields; the environment
        and the database connections are not included.
        '''
        result = {}
        result['job_id'] = self.job_id
        result['job_name'] = self.job_name
        result['parent_job'] = self.parent_job
        result['result_list'] = self.result_list
        result['git_hash'] = self.git_hash
        result['start_time'] = self.start_time
        result['stop_time'] = self.stop_time
        result['num_queries_executed'] = self.num_queries_executed
        return result

    def prepare(self):
        '''Prepares the environment and connects to Postgres and Impala running
        inside the Docker container.
        '''
        LOG.info('Starting Job Preparation')
        self.impala_env.prepare()
        LOG.info('Job Preparation Complete')
        self.ref_connection = DbConnector(
            POSTGRESQL,
            user_name=POSTGRES_USER_NAME,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.postgres_port).create_connection(
                DATABASE_NAME)
        LOG.info('Create Ref Connection')
        self.start_impala()
        self.git_hash = self.impala_env.get_git_hash()

    def get_stack(self):
        '''Return the stack trace reported by the environment, logging it too.'''
        stack_trace = self.impala_env.get_stack()
        LOG.info('Stack Trace: {0}'.format(stack_trace))
        return stack_trace

    def start_impala(self):
        '''Starts impala and creates a connection to it. The query result
        comparator is recreated for the new connection.
        '''
        self.impala_env.start_impala()
        self.test_connection = DbConnector(
            IMPALA,
            user_name=None,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.impala_port).create_connection(DATABASE_NAME)
        self.test_connection.reconnect()
        self.query_result_comparator = QueryResultComparator(
            self.ref_connection, self.test_connection)
        LOG.info('Created query result comparator')
        LOG.info(str(self.query_result_comparator.__dict__))

    def is_impala_running(self):
        '''Return whether the environment reports Impala as running.'''
        return self.impala_env.is_impala_running()

    def save_pickle(self):
        '''Saves self as pickle. This is normally done when the job finishes
        running.
        '''
        # Binary mode is required for pickle data.
        with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'wb') as f:
            pickle.dump(self, f)
        LOG.info('Saved Completed Job Pickle')

    def queries_to_be_executed(self):
        '''Generator that outputs query models. They are either generated based
        on the query profile, or they are extracted from an existing report.
        '''
        if self.parent_job:
            # If parent job is specified, get the queries from the parent job
            # report. NOTE: only unpickle report files from a trusted source.
            with open(join_path(PATH_TO_REPORTS, self.parent_job), 'rb') as f:
                parent_report = pickle.load(f)
            # for error_type in ['stack', 'row_counts', 'mismatch']:
            for error_type in ['stack']:
                for query in parent_report.grouped_results[error_type]:
                    yield query['model']
        else:
            # If parent job is not specified, generate queries with
            # QueryGenerator
            num_unexpected_errors = 0
            while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
                query = None
                try:
                    query = self.query_generator.create_query(
                        self.common_tables)
                except IndexError as e:
                    # This is a query generator bug that happens extremely rarely
                    LOG.info('Query Generator Choice Problem, {0}'.format(e))
                    continue
                except Exception as e:
                    LOG.info('Unexpected error in queries_to_be_executed, {0}'.
                             format(e))
                    # Replace the generator before trying again.
                    self.query_generator = QueryGenerator(self.query_profile)
                    num_unexpected_errors += 1
                    # NOTE(review): since the loop stops once the counter equals
                    # the threshold, this comparison is never true and the
                    # generator simply ends. Behaviour intentionally unchanged.
                    if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
                        LOG.error('Num Unexpected Errors above threshold')
                        raise
                    else:
                        continue
                query.execution = 'RAW'
                yield query

    def generate_report(self):
        '''Generate report and save it into the reports directory'''
        from report import Report
        rep = Report(self.job_id)
        rep.save_pickle()

    def start(self):
        '''Run the whole job. Prepares the environment, runs queries until none
        are left or the target stop time has passed, then saves the pickle and
        generates the report. Docker is stopped and the schedule file removed
        whether or not the run succeeded.
        '''
        try:
            self.prepare()
            self.query_generator = QueryGenerator(self.query_profile)
            self.common_tables = DbConnection.describe_common_tables(
                [self.ref_connection, self.test_connection])

            for query_model in self.queries_to_be_executed():
                LOG.info('About to execute query.')
                result_dict = self.run_query(query_model)
                LOG.info('Query Executed successfully.')
                if result_dict:
                    self.num_queries_executed += 1
                    self.result_list.append(result_dict)
                LOG.info('Time Left: {0}'.format(self.target_stop_time -
                                                 time()))
                if time() > self.target_stop_time:
                    break
            self.stop_time = time()
            self.save_pickle()
            self.generate_report()
            LOG.info('Generated Report')
        except:
            # Everything is logged and re-raised, interrupts included.
            LOG.exception('Unexpected Exception in start')
            raise
        finally:
            self.impala_env.stop_docker()
            LOG.info('Docker Stopped')
            try:
                os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
                LOG.info('Schedule file removed')
            except OSError:
                LOG.info('Unable to remove schedule file.')

    def reproduce_crash(self, query_model):
        '''Check if the given query_model causes a crash. Returns the number of
        times the query had to be run to cause a crash. Returns None when
        Impala is still running after the last try.
        '''
        NUM_TRIES = 5
        self.start_impala()
        for try_num in range(1, NUM_TRIES + 1):
            self.query_result_comparator.compare_query_results(query_model)
            if not self.is_impala_running():
                return try_num

    def run_query(self, query_model):
        '''Runs a single query.

        The comparison runs in a daemon thread that is given 600 seconds. If it
        is still running afterwards, the environment is restarted and None is
        returned. Otherwise a dict is returned: filled in when the comparison
        reported an error or Impala crashed, empty when neither happened.
        '''
        if not self.is_impala_running():
            LOG.info('Impala is not running, starting Impala.')
            self.start_impala()

        def run_query_internal():
            self.comparison_result = \
                self.query_result_comparator.compare_query_results(query_model)

        # 10 minute time out to avoid cursor close problem?
        self.comparison_result = None
        internal_thread = Thread(target=run_query_internal,
                                 name='run_query_internal_{0}'.format(
                                     self.job_id))
        internal_thread.daemon = True
        internal_thread.start()
        internal_thread.join(timeout=600)
        if internal_thread.is_alive():
            LOG.info(
                'run_query_internal is alive, restarting Impala Environment')
            self.impala_env.stop_docker()
            self.prepare()
            return None
        else:
            LOG.info('run_query_internal is dead as expected')

        # NOTE(review): remains None when the comparison raised in the thread,
        # in which case the attribute access below fails -- confirm that
        # compare_query_results records errors on its result instead of raising.
        comparison_result = self.comparison_result

        if comparison_result.query_timed_out:
            LOG.info('Query Timeout Exception')
            restart_impala = True
        else:
            restart_impala = False

        result_dict = {}

        if self.is_impala_running():
            if comparison_result.error:
                result_dict = self.comparison_result_analysis(
                    comparison_result)
                result_dict['model'] = query_model
        else:
            # Impala is no longer running: treat it as a crash and collect the
            # stack plus the number of runs needed to reproduce it.
            LOG.info('CRASH OCCURED')
            result_dict = self.comparison_result_analysis(comparison_result)
            result_dict['model'] = query_model
            result_dict['stack'] = self.get_stack()
            result_dict['num_tries_to_reproduce'] = self.reproduce_crash(
                query_model)

        if restart_impala:
            self.start_impala()

        return result_dict

    def comparison_result_analysis(self, comparison_result):
        '''Get useful information from the comparison_result.'''
        result_dict = {}
        result_dict['error'] = comparison_result.error
        result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number
        result_dict['mismatch_ref_row'] = comparison_result.ref_row
        result_dict['mismatch_test_row'] = comparison_result.test_row
        result_dict['ref_row_count'] = comparison_result.ref_row_count
        result_dict['ref_sql'] = comparison_result.ref_sql
        result_dict['test_row_count'] = comparison_result.test_row_count
        result_dict['test_sql'] = comparison_result.test_sql
        return result_dict
'populate', 'migrate', 'populate_existing' ]: raise Exception( 'Command must either be "populate", "populate_existing" or "migrate" but was "%s"' % ' '.join(args)) if command == 'migrate' and \ not any((options.use_mysql, options.use_postgresql, options.use_oracle)): raise Exception( 'At least one destination database must be chosen with ' '--use-<database type>') basicConfig(level=getattr(logging, options.log_level)) seed(options.randomization_seed) impala_connector = DbConnector(IMPALA) db_connectors = [] if options.use_postgresql: db_connectors.append( DbConnector(POSTGRESQL, user_name=options.postgresql_user, password=options.postgresql_password, host_name=options.postgresql_host, port=options.postgresql_port)) if options.use_oracle: db_connectors.append( DbConnector(ORACLE, user_name=options.oracle_user, password=options.oracle_password, host_name=options.oracle_host, port=options.oracle_port))