def __init__(self, impala_connection, reference_connection): self.reference_db_type = reference_connection.db_type self.impala_cursor = impala_connection.create_cursor() self.reference_cursor = reference_connection.create_cursor() self.impala_sql_writer = SqlWriter.create(dialect=impala_connection.db_type) self.reference_sql_writer = SqlWriter.create(dialect=reference_connection.db_type) # At this time the connection will be killed and ther comparison result will be # timeout. self.query_timeout_seconds = 3 * 60
def __init__(self, query_profile, ref_conn, test_conn): '''query_profile should be an instance of one of the profiles in query_profile.py''' self.query_profile = query_profile self.ref_conn = ref_conn self.test_conn = test_conn self.ref_sql_writer = SqlWriter.create(dialect=ref_conn.db_type) self.test_sql_writer = SqlWriter.create(dialect=test_conn.db_type) with ref_conn.cursor() as ref_cursor: with test_conn.cursor() as test_cursor: self.common_tables = DbCursor.describe_common_tables([ref_cursor, test_cursor]) if not self.common_tables: raise Exception("Unable to find a common set of tables in both databases")
def __init__(self, ref_connection, test_connection): '''test/ref_connection arguments should be an instance of DbConnection''' ref_cursor = ref_connection.create_cursor() test_cursor = test_connection.create_cursor() self.ref_connection = ref_connection self.ref_sql_writer = SqlWriter.create(dialect=ref_connection.db_type) self.test_connection = test_connection self.test_sql_writer = SqlWriter.create(dialect=test_connection.db_type) self.query_executor = QueryExecutor( [ref_cursor, test_cursor], [self.ref_sql_writer, self.test_sql_writer], query_timeout_seconds=(3 * 60))
def __init__(self, impala_connection, reference_connection): self.reference_db_type = reference_connection.db_type self.impala_cursor = impala_connection.create_cursor() self.reference_cursor = reference_connection.create_cursor() self.impala_sql_writer = SqlWriter.create( dialect=impala_connection.db_type) self.reference_sql_writer = SqlWriter.create( dialect=reference_connection.db_type) # At this time the connection will be killed and ther comparison result will be # timeout. self.query_timeout_seconds = 3 * 60
def __init__(self, ref_connection, test_connection): '''test/ref_connection arguments should be an instance of DbConnection''' ref_cursor = ref_connection.create_cursor() test_cursor = test_connection.create_cursor() self.ref_connection = ref_connection self.ref_sql_writer = SqlWriter.create(dialect=ref_connection.db_type) self.test_connection = test_connection self.test_sql_writer = SqlWriter.create( dialect=test_connection.db_type) self.query_executor = QueryExecutor( [ref_cursor, test_cursor], [self.ref_sql_writer, self.test_sql_writer], query_timeout_seconds=(3 * 60))
def __init__(self, query_profile, ref_conn, test_conn, query_timeout_seconds, flatten_dialect=None): '''test/ref_conn arguments should be an instance of DbConnection''' ref_cursor = ref_conn.cursor() test_cursor = test_conn.cursor() self.ref_conn = ref_conn self.ref_sql_writer = SqlWriter.create( dialect=ref_conn.db_type, nulls_order_asc=query_profile.nulls_order_asc()) self.test_conn = test_conn self.test_sql_writer = SqlWriter.create(dialect=test_conn.db_type) self.query_executor = QueryExecutor( [ref_cursor, test_cursor], [self.ref_sql_writer, self.test_sql_writer], query_timeout_seconds=query_timeout_seconds, flatten_dialect=flatten_dialect)
def generate_random_queries(impala, random_db): """Generator function to produce random queries. 'impala' is the Impala service object. random_db is the name of the database that queries should be generated for.""" with impala.cursor(db_name=random_db) as cursor: tables = [cursor.describe_table(t) for t in cursor.list_table_names()] query_generator = QueryGenerator(DefaultProfile()) model_translator = SqlWriter.create() while True: query_model = query_generator.generate_statement(tables) sql = model_translator.write_query(query_model) query = Query() query.sql = sql query.db_name = random_db yield query
def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash): if exists(self.query_shelve_path): # Ensure a clean shelve will be created remove(self.query_shelve_path) start_time = time() impala_sql_writer = SqlWriter.create(dialect=IMPALA) reference_sql_writer = SqlWriter.create( dialect=self.reference_connection.db_type) query_result_comparator = QueryResultComparator( self.impala_connection, self.reference_connection) query_generator = QueryGenerator() query_count = 0 queries_resulted_in_data_count = 0 mismatch_count = 0 query_timeout_count = 0 known_error_count = 0 impala_crash_count = 0 last_error = None repeat_error_count = 0 with open(self.query_log_path, 'w') as impala_query_log: impala_query_log.write('--\n' '-- Stating new run\n' '--\n') while number_of_test_queries > query_count: query = query_generator.create_query(self.common_tables) impala_sql = impala_sql_writer.write_query(query) if 'FULL OUTER JOIN' in impala_sql and self.reference_connection.db_type == MYSQL: # Not supported by MySQL continue query_count += 1 LOG.info('Running query #%s', query_count) impala_query_log.write(impala_sql + ';\n') result = query_result_comparator.compare_query_results(query) if result.query_resulted_in_data: queries_resulted_in_data_count += 1 if result.error: # TODO: These first two come from psycopg2, the postgres driver. Maybe we should # try a different driver? Or maybe the usage of the driver isn't correct. # Anyhow ignore these failures. if 'division by zero' in result.error \ or 'out of range' in result.error \ or 'Too much data' in result.error: LOG.debug('Ignoring error: %s', result.error) query_count -= 1 continue if result.is_known_error: known_error_count += 1 elif result.query_timed_out: query_timeout_count += 1 else: mismatch_count += 1 with closing(open_shelve( self.query_shelve_path)) as query_shelve: query_shelve[str(query_count)] = query print('---Impala Query---\n') print( impala_sql_writer.write_query(query, pretty=True) + '\n') print('---Reference Query---\n') print( reference_sql_writer.write_query(query, pretty=True) + '\n') print('---Error---\n') print(result.error + '\n') print('------\n') if 'Could not connect' in result.error \ or "Couldn't open transport for" in result.error: # if stop_on_crash: # break # Assume Impala crashed and try restarting impala_crash_count += 1 LOG.info('Restarting Impala') call([ join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/") ]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor( ) result = query_result_comparator.compare_query_results( query) if result.error: LOG.info('Restarting Impala') call([ join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/") ]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor( ) else: break if stop_on_result_mismatch and \ not (result.is_known_error or result.query_timed_out): break if last_error == result.error \ and not (result.is_known_error or result.query_timed_out): repeat_error_count += 1 if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT: break else: last_error = result.error repeat_error_count = 0 else: if result.query_resulted_in_data: LOG.info('Results matched (%s rows)', result.impala_row_count) else: LOG.info('Query did not produce meaningful data') last_error = None repeat_error_count = 0 return SearchResults(query_count, queries_resulted_in_data_count, mismatch_count, query_timeout_count, known_error_count, impala_crash_count, time() - start_time)
def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash): if exists(self.query_shelve_path): # Ensure a clean shelve will be created remove(self.query_shelve_path) start_time = time() impala_sql_writer = SqlWriter.create(dialect=IMPALA) reference_sql_writer = SqlWriter.create( dialect=self.reference_connection.db_type) query_result_comparator = QueryResultComparator( self.impala_connection, self.reference_connection) query_generator = QueryGenerator() query_count = 0 queries_resulted_in_data_count = 0 mismatch_count = 0 query_timeout_count = 0 known_error_count = 0 impala_crash_count = 0 last_error = None repeat_error_count = 0 with open(self.query_log_path, 'w') as impala_query_log: impala_query_log.write( '--\n' '-- Stating new run\n' '--\n') while number_of_test_queries > query_count: query = query_generator.create_query(self.common_tables) impala_sql = impala_sql_writer.write_query(query) if 'FULL OUTER JOIN' in impala_sql and self.reference_connection.db_type == MYSQL: # Not supported by MySQL continue query_count += 1 LOG.info('Running query #%s', query_count) impala_query_log.write(impala_sql + ';\n') result = query_result_comparator.compare_query_results(query) if result.query_resulted_in_data: queries_resulted_in_data_count += 1 if result.error: # TODO: These first two come from psycopg2, the postgres driver. Maybe we should # try a different driver? Or maybe the usage of the driver isn't correct. # Anyhow ignore these failures. if 'division by zero' in result.error \ or 'out of range' in result.error \ or 'Too much data' in result.error: LOG.debug('Ignoring error: %s', result.error) query_count -= 1 continue if result.is_known_error: known_error_count += 1 elif result.query_timed_out: query_timeout_count += 1 else: mismatch_count += 1 with closing(open_shelve(self.query_shelve_path)) as query_shelve: query_shelve[str(query_count)] = query print('---Impala Query---\n') print(impala_sql_writer.write_query(query, pretty=True) + '\n') print('---Reference Query---\n') print(reference_sql_writer.write_query(query, pretty=True) + '\n') print('---Error---\n') print(result.error + '\n') print('------\n') if 'Could not connect' in result.error \ or "Couldn't open transport for" in result.error: # if stop_on_crash: # break # Assume Impala crashed and try restarting impala_crash_count += 1 LOG.info('Restarting Impala') call([join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor() result = query_result_comparator.compare_query_results(query) if result.error: LOG.info('Restarting Impala') call([join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor() else: break if stop_on_result_mismatch and \ not (result.is_known_error or result.query_timed_out): break if last_error == result.error \ and not (result.is_known_error or result.query_timed_out): repeat_error_count += 1 if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT: break else: last_error = result.error repeat_error_count = 0 else: if result.query_resulted_in_data: LOG.info('Results matched (%s rows)', result.impala_row_count) else: LOG.info('Query did not produce meaningful data') last_error = None repeat_error_count = 0 return SearchResults( query_count, queries_resulted_in_data_count, mismatch_count, query_timeout_count, known_error_count, impala_crash_count, time() - start_time)
def sql_writer(request): """ Return a SqlWriter object that is torn down at the end of each test. """ yield SqlWriter.create(dialect=request.param)
def sql_writer(): """ Return a SqlWriter object that is torn down at the end of each test. """ # TODO: Later, we can parametrize on dialect, but for now, this is just PoC. yield SqlWriter.create(dialect='IMPALA')