def queries_to_be_executed(self): '''Generator that outputs query models. They are either generated based on the query profile, or they are extracted from an existing report. ''' if self.parent_job: # If parent job is specified, get the queries from the parent job report with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f: parent_report = pickle.load(f) for error_type in ['stack', 'row_counts', 'mismatch']: for query in parent_report.grouped_results[error_type]: yield query['model'] else: # If parent job is not specified, generate queries with QueryGenerator num_unexpected_errors = 0 while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD: query = None try: self.query_generator = QueryGenerator(self.query_profile) query = self.query_generator.create_query( self.common_tables) except IndexError as e: # This is a query generator bug that happens extremely rarely LOG.info('Query Generator Choice Problem, {0}'.format(e)) continue except Exception as e: LOG.info('Unexpected error in queries_to_be_executed, {0}'. format(e)) num_unexpected_errors += 1 if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD: LOG.error('Num Unexpected Errors above threshold') raise else: continue yield query
def test_hive_analytics_cannot_contain_aggs(analytic_func): """ Tests that the HiveProfile does not allow nested aggs inside specific analytic functions. The list of analytic functions that cannot contain aggs is defined by QueryProfile.test_hive_analytics_cannot_contain_aggs() """ class FakeDefaultQueryProfile(DefaultProfile): """ A DefaultProfile that forces only nested expression in any expression trees """ def __init__(self): super(FakeDefaultQueryProfile, self).__init__() self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)}) class FakeHiveQueryProfile(HiveProfile): """ A HiveProfile that forces only nested expression in any expression trees """ def __init__(self): super(FakeHiveQueryProfile, self).__init__() self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)}) # Aggregate functions can only return specific types, such as Int, Number, or Float; # while certain AnalyticFuncs can return non-numeric types such as FirstValue or # LastValue. So for simplicity, these tests are only run against the Int return_type. if Int in [signature.return_type for signature in analytic_func.signatures()]: # Generate an agg + analytic func tree using the FakeDefaultQueryProfile and ensure # the root func is a analytic_func and that the tree contains an aggregate func. An # empty list of funcs is passed into the _create_agg_or_analytic_tree so that no # basic funcs are created. We can be sure that the root_func is an analytic_func # because agg_funcs cannot contain analytic_funcs. qgen = QueryGenerator(FakeDefaultQueryProfile()) func_tree = qgen._create_agg_or_analytic_tree(Int, agg_funcs=AGG_FUNCS, analytic_funcs=[analytic_func], basic_funcs=[]) assert isinstance(func_tree, analytic_func) assert func_tree.contains_agg # Do the same for the FakeHiveQueryProfile, but now check that the func_tree has no # aggregates. qgen = QueryGenerator(FakeHiveQueryProfile()) func_tree = qgen._create_agg_or_analytic_tree(Int, agg_funcs=AGG_FUNCS, analytic_funcs=[analytic_func], basic_funcs=[]) assert isinstance(func_tree, analytic_func) assert not func_tree.contains_agg
def generate_random_queries(impala, random_db): """Generator function to produce random queries. 'impala' is the Impala service object. random_db is the name of the database that queries should be generated for.""" with impala.cursor(db_name=random_db) as cursor: tables = [cursor.describe_table(t) for t in cursor.list_table_names()] query_generator = QueryGenerator(DefaultProfile()) model_translator = SqlWriter.create() while True: query_model = query_generator.generate_statement(tables) sql = model_translator.write_query(query_model) query = Query() query.sql = sql query.db_name = random_db yield query
def queries_to_be_executed(self): '''Generator that outputs query models. They are either generated based on the query profile, or they are extracted from an existing report. ''' if self.parent_job: # If parent job is specified, get the queries from the parent job report with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f: parent_report = pickle.load(f) for error_type in ['stack', 'row_counts', 'mismatch']: for query in parent_report.grouped_results[error_type]: yield query['model'] else: # If parent job is not specified, generate queries with QueryGenerator num_unexpected_errors = 0 while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD: query = None try: self.query_generator = QueryGenerator(self.query_profile) query = self.query_generator.create_query(self.common_tables) except IndexError as e: # This is a query generator bug that happens extremely rarely LOG.info('Query Generator Choice Problem, {0}'.format(e)) continue except Exception as e: LOG.info('Unexpected error in queries_to_be_executed, {0}'.format(e)) num_unexpected_errors += 1 if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD: LOG.error('Num Unexpected Errors above threshold') raise else: continue yield query
def start(self): try: self.prepare() self.query_generator = QueryGenerator(self.query_profile) self.common_tables = DbConnection.describe_common_tables( [self.ref_connection, self.test_connection]) for query_model in self.queries_to_be_executed(): LOG.info('About to execute query.') result_dict = self.run_query(query_model) LOG.info('Query Executed successfully.') if result_dict: self.num_queries_executed += 1 self.result_list.append(result_dict) LOG.info('Time Left: {0}'.format(self.target_stop_time - time())) if time() > self.target_stop_time: break self.stop_time = time() self.save_pickle() self.generate_report() LOG.info('Generated Report') except: LOG.exception('Unexpected Exception in start') raise finally: self.impala_env.stop_docker() LOG.info('Docker Stopped') try: os.remove(join_path(PATH_TO_SCHEDULE, self.job_id)) LOG.info('Schedule file removed') except OSError: LOG.info('Unable to remove schedule file.')
def test_hive_create_equality_only_joins(): """ Tests that QueryGenerator produces a join condition with only equality functions if the HiveProfile is used. """ class FakeHiveQueryProfile(HiveProfile): """ A fake QueryProfile that extends the HiveProfile, various weights are modified in order to ensure that this test is deterministic. """ def choose_join_condition_count(self): """ There should be only one operator in the JOIN condition """ return 1 def choose_conjunct_disjunct_fill_ratio(self): """ There should be no AND or OR operators """ return 0 def choose_relational_func_fill_ratio(self): """ Force all operators to be relational """ return 1 query_generator = QueryGenerator(FakeHiveQueryProfile()) # Create two tables that have one joinable Column right_table_expr_list = TableExprList() right_table = Table("right_table") right_table.add_col(Column("right_table", "right_col", Int)) right_table_expr_list.append(right_table) left_table_expr_list = TableExprList() left_table = Table("left_table") left_table.add_col(Column("left_table", "left_col", Int)) left_table_expr_list.append(left_table) # Validate the root predicate is an Equals funcs assert isinstance(query_generator._create_relational_join_condition( right_table_expr_list, left_table_expr_list), Equals)
def test_use_nested_width_subquery(): """ Tests that setting DefaultProfile.use_nested_with to False works properly. Setting this method to return False should prevent a WITH clause from being used inside a sub-query. """ class MockQueryProfile(DefaultProfile): """ A mock QueryProfile that sets use_nested_with to False and forces the QueryGenerator to created nested queries. """ def __init__(self): super(MockQueryProfile, self).__init__() # Force the QueryGenerator to create nested queries self._bounds['MAX_NESTED_QUERY_COUNT'] = (4, 4) # Force the QueryGenerator to use WITH clauses whenever possible self._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = 1 # Force the QueryGenerator to create inline views whenever possible self._probabilities['MISC']['INLINE_VIEW'] = 1 def use_nested_with(self): return False mock_query_gen = QueryGenerator(MockQueryProfile()) # Create two tables table_expr_list = TableExprList() right_table = Table("right_table") right_table.add_col(Column("right_table", "right_col", Int)) table_expr_list.append(right_table) left_table = Table("left_table") left_table.add_col(Column("left_table", "left_col", Int)) table_expr_list.append(left_table) # Check that each nested_query doesn't have a with clause for nested_query in mock_query_gen.generate_statement(table_expr_list).nested_queries: assert nested_query.with_clause is None
def test_use_nested_width_subquery(): """ Tests that setting DefaultProfile.use_nested_with to False works properly. Setting this method to return False should prevent a WITH clause from being used inside a sub-query. """ class MockQueryProfile(DefaultProfile): """ A mock QueryProfile that sets use_nested_with to False and forces the QueryGenerator to created nested queries. """ def __init__(self): super(MockQueryProfile, self).__init__() # Force the QueryGenerator to create nested queries self._bounds['MAX_NESTED_QUERY_COUNT'] = (4, 4) # Force the QueryGenerator to use WITH clauses whenever possible self._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = 1 # Force the QueryGenerator to create inline views whenever possible self._probabilities['MISC']['INLINE_VIEW'] = 1 def use_nested_with(self): return False mock_query_gen = QueryGenerator(MockQueryProfile()) # Create two tables table_expr_list = TableExprList() right_table = Table("right_table") right_table.add_col(Column("right_table", "right_col", Int)) table_expr_list.append(right_table) left_table = Table("left_table") left_table.add_col(Column("left_table", "left_col", Int)) table_expr_list.append(left_table) # Check that each nested_query doesn't have a with clause for nested_query in mock_query_gen.create_query(table_expr_list).nested_queries: assert nested_query.with_clause is None
def test_hive_analytics_cannot_contain_aggs(analytic_func): """ Tests that the HiveProfile does not allow nested aggs inside specific analytic functions. The list of analytic functions that cannot contain aggs is defined by QueryProfile.test_hive_analytics_cannot_contain_aggs() """ class FakeDefaultQueryProfile(DefaultProfile): """ A DefaultProfile that forces only nested expression in any expression trees """ def __init__(self): super(FakeDefaultQueryProfile, self).__init__() self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)}) class FakeHiveQueryProfile(HiveProfile): """ A HiveProfile that forces only nested expression in any expression trees """ def __init__(self): super(FakeHiveQueryProfile, self).__init__() self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)}) # Aggregate functions can only return specific types, such as Int, Number, or Float; # while certain AnalyticFuncs can return non-numeric types such as FirstValue or # LastValue. So for simplicity, these tests are only run against the Int return_type. if Int in [ signature.return_type for signature in analytic_func.signatures() ]: # Generate an agg + analytic func tree using the FakeDefaultQueryProfile and ensure # the root func is a analytic_func and that the tree contains an aggregate func. An # empty list of funcs is passed into the _create_agg_or_analytic_tree so that no # basic funcs are created. We can be sure that the root_func is an analytic_func # because agg_funcs cannot contain analytic_funcs. qgen = QueryGenerator(FakeDefaultQueryProfile()) func_tree = qgen._create_agg_or_analytic_tree( Int, agg_funcs=AGG_FUNCS, analytic_funcs=[analytic_func], basic_funcs=[]) assert isinstance(func_tree, analytic_func) assert func_tree.contains_agg # Do the same for the FakeHiveQueryProfile, but now check that the func_tree has no # aggregates. qgen = QueryGenerator(FakeHiveQueryProfile()) func_tree = qgen._create_agg_or_analytic_tree( Int, agg_funcs=AGG_FUNCS, analytic_funcs=[analytic_func], basic_funcs=[]) assert isinstance(func_tree, analytic_func) assert not func_tree.contains_agg
def test_func_tree_contains_funcs(): """ Tests the QueryGenerator.func_tree_contains_funcs() method """ qgen = QueryGenerator(DefaultProfile()) # Create a simple func_tree with only one function and_func = And.create_from_args(Boolean(True), Boolean(True)) and_func.parent = None assert qgen._func_tree_contains_funcs(and_func, [And]) assert not qgen._func_tree_contains_funcs(and_func, [Or]) # Create a func_tree that contains one parent, and two children equals_func = Equals.create_from_args(Boolean(True), Boolean(True)) and_func = And.create_from_args(equals_func, equals_func) equals_func.parent = and_func and_func.parent = None assert qgen._func_tree_contains_funcs(equals_func, [And]) assert qgen._func_tree_contains_funcs(equals_func, [Equals]) assert not qgen._func_tree_contains_funcs(equals_func, [Or])
def start(self): try: self.prepare() self.query_generator = QueryGenerator(self.query_profile) if NESTED_TYPES_MODE: self.common_tables = DbCursor.describe_common_tables( [self.test_connection.cursor()]) else: self.common_tables = DbCursor.describe_common_tables( [self.test_connection.cursor(), self.ref_connection.cursor()]) for query_model in self.queries_to_be_executed(): LOG.info('About to execute query.') result_dict = self.run_query(query_model) LOG.info('Query Executed successfully.') self.num_queries_executed += 1 if result_dict: self.result_list.append(result_dict) LOG.info('Time Left: {0}'.format(self.target_stop_time - time())) if time() > self.target_stop_time: break self.stop_time = time() self.save_pickle() self.generate_report() LOG.info('Generated Report') except: LOG.exception('Unexpected Exception in start') raise finally: self.impala_env.stop_docker() LOG.info('Docker Stopped') try: os.remove(join_path(PATH_TO_SCHEDULE, self.job_id)) LOG.info('Schedule file removed') except OSError: LOG.info('Unable to remove schedule file.')
class Job(object): '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it. Able to execute queries by either generaing them based on a provided query profile or by extracting queries from an existing report. A report is generated when it finishes running. ''' def __init__(self, query_profile, job_id, run_name = 'default', time_limit_sec = 24 * 3600, git_command = None, parent_job = None): self.git_hash = '' self.job_id = job_id self.job_name = run_name self.parent_job = parent_job self.query_profile = query_profile or ( ImpalaNestedTypesProfile() if NESTED_TYPES_MODE else DefaultProfile()) self.ref_connection = None self.result_list = [] self.start_time = time() self.stop_time = None self.target_stop_time = time() + time_limit_sec self.test_connection = None self.num_queries_executed = 0 self.num_queries_returned_correct_data = 0 self.flatten_dialect = 'POSTGRESQL' if NESTED_TYPES_MODE else None self.impala_env = ImpalaDockerEnv(git_command) def __getstate__(self): '''For pickling''' result = {} result['job_id'] = self.job_id result['job_name'] = self.job_name result['parent_job'] = self.parent_job result['result_list'] = self.result_list result['git_hash'] = self.git_hash result['start_time'] = self.start_time result['stop_time'] = self.stop_time result['num_queries_executed'] = self.num_queries_executed result['num_queries_returned_correct_data'] = self.num_queries_returned_correct_data return result def prepare(self): '''Prepares the environment and connects to Postgres and Impala running inside the Docker container. ''' LOG.info('Starting Job Preparation') self.impala_env.prepare() LOG.info('Job Preparation Complete') self.ref_connection = PostgresqlConnection( user_name=POSTGRES_USER_NAME, password=None, host_name=self.impala_env.host, port=self.impala_env.postgres_port, db_name=POSTGRES_DATABASE_NAME) LOG.info('Created ref_connection') self.start_impala() self.git_hash = self.impala_env.get_git_hash() def get_stack(self): stack_trace = self.impala_env.get_stack() LOG.info('Stack Trace: {0}'.format(stack_trace)) return stack_trace def start_impala(self): '''Starts impala and creates a connection to it. ''' self.impala_env.start_impala() self.test_connection = ImpalaConnection( host_name=self.impala_env.host, port=self.impala_env.impala_port, user_name=None, db_name=DATABASE_NAME) self.test_connection.reconnect() self.query_result_comparator = QueryResultComparator( self.query_profile, self.ref_connection, self.test_connection, query_timeout_seconds=4*60, flatten_dialect='POSTGRESQL') LOG.info('Created query result comparator') LOG.info(str(self.query_result_comparator.__dict__)) def is_impala_running(self): return self.impala_env.is_impala_running() def save_pickle(self): '''Saves self as pickle. This is normally done when the job finishes running. ''' with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'w') as f: pickle.dump(self, f) LOG.info('Saved Completed Job Pickle') def queries_to_be_executed(self): '''Generator that outputs query models. They are either generated based on the query profile, or they are extracted from an existing report. ''' if self.parent_job: # If parent job is specified, get the queries from the parent job report with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f: parent_report = pickle.load(f) for error_type in ['stack', 'row_counts', 'mismatch']: for query in parent_report.grouped_results[error_type]: yield query['model'] else: # If parent job is not specified, generate queries with QueryGenerator num_unexpected_errors = 0 while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD: query = None try: self.query_generator = QueryGenerator(self.query_profile) query = self.query_generator.create_query(self.common_tables) except IndexError as e: # This is a query generator bug that happens extremely rarely LOG.info('Query Generator Choice Problem, {0}'.format(e)) continue except Exception as e: LOG.info('Unexpected error in queries_to_be_executed, {0}'.format(e)) num_unexpected_errors += 1 if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD: LOG.error('Num Unexpected Errors above threshold') raise else: continue yield query def generate_report(self): '''Generate report and save it into the reports directory. ''' from report import Report rep = Report(self.job_id) rep.save_pickle() def start(self): try: self.prepare() self.query_generator = QueryGenerator(self.query_profile) if NESTED_TYPES_MODE: self.common_tables = DbCursor.describe_common_tables( [self.test_connection.cursor()]) else: self.common_tables = DbCursor.describe_common_tables( [self.test_connection.cursor(), self.ref_connection.cursor()]) for query_model in self.queries_to_be_executed(): LOG.info('About to execute query.') result_dict = self.run_query(query_model) LOG.info('Query Executed successfully.') self.num_queries_executed += 1 if result_dict: self.result_list.append(result_dict) LOG.info('Time Left: {0}'.format(self.target_stop_time - time())) if time() > self.target_stop_time: break self.stop_time = time() self.save_pickle() self.generate_report() LOG.info('Generated Report') except: LOG.exception('Unexpected Exception in start') raise finally: self.impala_env.stop_docker() LOG.info('Docker Stopped') try: os.remove(join_path(PATH_TO_SCHEDULE, self.job_id)) LOG.info('Schedule file removed') except OSError: LOG.info('Unable to remove schedule file.') def reproduce_crash(self, query_model): '''Check if the given query_model causes a crash. Returns the number of times the query had to be run to cause a crash. ''' NUM_TRIES = 5 self.start_impala() for try_num in range(1, NUM_TRIES + 1): self.query_result_comparator.compare_query_results(query_model) if not self.is_impala_running(): return try_num def run_query(self, query_model): '''Runs a single query. ''' if not self.is_impala_running(): LOG.info('Impala is not running, starting Impala.') self.start_impala() def run_query_internal(): self.comparison_result = self.query_result_comparator.compare_query_results( query_model) self.comparison_result = None internal_thread = Thread( target=run_query_internal, name='run_query_internal_{0}'.format(self.job_id)) internal_thread.daemon = True internal_thread.start() internal_thread.join(timeout=600) if internal_thread.is_alive(): LOG.info('run_query_internal is alive, restarting Impala Environment') self.impala_env.stop_docker() self.prepare() return None else: LOG.info('run_query_internal is dead as expected') comparison_result = self.comparison_result if comparison_result.query_timed_out: LOG.info('Query Timeout Exception') restart_impala = True else: restart_impala = False result_dict = {} if self.is_impala_running(): if comparison_result.error: result_dict = self.comparison_result_analysis(comparison_result) result_dict['model'] = query_model elif comparison_result.query_resulted_in_data: self.num_queries_returned_correct_data += 1 else: LOG.info('CRASH OCCURED') result_dict = self.comparison_result_analysis(comparison_result) result_dict['model'] = query_model result_dict['stack'] = self.get_stack() result_dict['num_tries_to_reproduce'] = self.reproduce_crash(query_model) if restart_impala: self.start_impala() return result_dict def comparison_result_analysis(self, comparison_result): '''Get useful information from the comparison_result. ''' result_dict = {} result_dict['error'] = comparison_result.error result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number result_dict['mismatch_ref_row'] = comparison_result.ref_row result_dict['mismatch_test_row'] = comparison_result.test_row result_dict['ref_row_count'] = comparison_result.ref_row_count result_dict['ref_sql'] = comparison_result.ref_sql result_dict['test_row_count'] = comparison_result.test_row_count result_dict['test_sql'] = comparison_result.test_sql return result_dict
def generate_statement(self, tables, dml_table): """ Return a randomly generated INSERT or UPSERT statement. Note that UPSERTs are very similar to INSERTs, which is why this generator handles both. tables should be a list of Table objects. A typical source of such a list comes from db_connection.DbCursor.describe_common_tables(). This list describes the possible "sources" of the INSERT/UPSERT's WITH and FROM/WHERE clauses. dml_table is a required Table object. The INSERT/UPSERT will be into this table. """ if not (isinstance(tables, list) and len(tables) > 0 and all((isinstance(t, Table) for t in tables))): raise Exception('tables must be a not-empty list of Table objects') if not isinstance(dml_table, Table): raise Exception('dml_table must be a Table') self.select_stmt_generator = QueryGenerator(self.profile) insert_statement = InsertStatement(execution=StatementExecutionMode.DML_TEST) # Choose whether this is a # INSERT/UPSERT INTO table SELECT/VALUES # or # INSERT/UPSERT INTO table (col1, col2, ...) SELECT/VALUES # If the method returns None, it's the former. insert_column_list = self.profile.choose_insert_column_list(dml_table) if dml_table.primary_keys: # Having primary keys implies the table is a Kudu table, which makes it subject to # both INSERTs (with automatic ignoring of primary key duplicates) and UPSERTs. conflict_action = self.profile.choose_insert_vs_upsert() else: conflict_action = InsertClause.CONFLICT_ACTION_DEFAULT insert_statement.insert_clause = InsertClause( dml_table, column_list=insert_column_list, conflict_action=conflict_action) # We still need to internally track the columns we're inserting. Keep in mind None # means "all" without an explicit column list. Since we've already created the # InsertClause object though, we can fill this in for ourselves. if insert_column_list is None: insert_column_list = dml_table.cols insert_item_data_types = [col.type for col in insert_column_list] # Decide whether this is INSERT/UPSERT VALUES or INSERT/UPSERT SELECT insert_source_clause = self.profile.choose_insert_source_clause() if issubclass(insert_source_clause, Query): # Use QueryGenerator()'s public interface to generate the SELECT. select_query = self.select_stmt_generator.generate_statement( tables, select_item_data_types=insert_item_data_types) # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The # generator's type system is not near sophisticated enough to know how random # expressions will be implicitly casted in the databases. This requires less work # to implement. IMPALA-4693 considers alternative approaches. self._cast_select_items(select_query, insert_column_list) insert_statement.with_clause = deepcopy(select_query.with_clause) select_query.with_clause = None insert_statement.select_query = select_query elif issubclass(insert_source_clause, ValuesClause): insert_statement.values_clause = self._generate_values_clause(insert_column_list) else: raise Exception('unsupported INSERT/UPSERT source clause: {0}'.format( insert_source_clause)) return insert_statement
class InsertStatementGenerator(object): def __init__(self, profile): # QueryProfile-like object self.profile = profile # used to generate SELECT queries for INSERT/UPSERT ... SELECT statements; # to ensure state is completely reset, this is created anew with each call to # generate_statement() self.select_stmt_generator = None def generate_statement(self, tables, dml_table): """ Return a randomly generated INSERT or UPSERT statement. Note that UPSERTs are very similar to INSERTs, which is why this generator handles both. tables should be a list of Table objects. A typical source of such a list comes from db_connection.DbCursor.describe_common_tables(). This list describes the possible "sources" of the INSERT/UPSERT's WITH and FROM/WHERE clauses. dml_table is a required Table object. The INSERT/UPSERT will be into this table. """ if not (isinstance(tables, list) and len(tables) > 0 and all((isinstance(t, Table) for t in tables))): raise Exception('tables must be a not-empty list of Table objects') if not isinstance(dml_table, Table): raise Exception('dml_table must be a Table') self.select_stmt_generator = QueryGenerator(self.profile) insert_statement = InsertStatement(execution=StatementExecutionMode.DML_TEST) # Choose whether this is a # INSERT/UPSERT INTO table SELECT/VALUES # or # INSERT/UPSERT INTO table (col1, col2, ...) SELECT/VALUES # If the method returns None, it's the former. insert_column_list = self.profile.choose_insert_column_list(dml_table) if dml_table.primary_keys: # Having primary keys implies the table is a Kudu table, which makes it subject to # both INSERTs (with automatic ignoring of primary key duplicates) and UPSERTs. conflict_action = self.profile.choose_insert_vs_upsert() else: conflict_action = InsertClause.CONFLICT_ACTION_DEFAULT insert_statement.insert_clause = InsertClause( dml_table, column_list=insert_column_list, conflict_action=conflict_action) # We still need to internally track the columns we're inserting. Keep in mind None # means "all" without an explicit column list. Since we've already created the # InsertClause object though, we can fill this in for ourselves. if insert_column_list is None: insert_column_list = dml_table.cols insert_item_data_types = [col.type for col in insert_column_list] # Decide whether this is INSERT/UPSERT VALUES or INSERT/UPSERT SELECT insert_source_clause = self.profile.choose_insert_source_clause() if issubclass(insert_source_clause, Query): # Use QueryGenerator()'s public interface to generate the SELECT. select_query = self.select_stmt_generator.generate_statement( tables, select_item_data_types=insert_item_data_types) # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The # generator's type system is not near sophisticated enough to know how random # expressions will be implicitly casted in the databases. This requires less work # to implement. IMPALA-4693 considers alternative approaches. self._cast_select_items(select_query, insert_column_list) insert_statement.with_clause = deepcopy(select_query.with_clause) select_query.with_clause = None insert_statement.select_query = select_query elif issubclass(insert_source_clause, ValuesClause): insert_statement.values_clause = self._generate_values_clause(insert_column_list) else: raise Exception('unsupported INSERT/UPSERT source clause: {0}'.format( insert_source_clause)) return insert_statement def _generate_values_clause(self, columns): """ Return a VALUES clause containing a variable number of rows. The values corresponding to primary keys will be non-null constants. Any other columns could be null, constants, or function trees that may or may not evaluate to null. """ values_rows = [] for _ in xrange(self.profile.choose_insert_values_row_count()): values_row = [] for col in columns: if col.is_primary_key: val = self.profile.choose_constant(return_type=col.exact_type, allow_null=False) elif 'constant' == self.profile.choose_values_item_expr(): val = self.profile.choose_constant(return_type=col.exact_type, allow_null=True) else: func_tree = self.select_stmt_generator.create_func_tree( col.type, allow_subquery=False) val = self.select_stmt_generator.populate_func_with_vals(func_tree) # Only the generic type, not the exact type, of the value will be known. To # avoid a lot of failed queries due to precision errors, we cast the val to # the exact type of the column. This will still not prevent "out of range" # conditions, as we don't try to evaluate the random expressions. val = CastFunc(val, col.exact_type) values_row.append(val) values_rows.append(ValuesRow(values_row)) return ValuesClause(values_rows) def _cast_select_items(self, select_query, column_list): """ For a given Query select_query and a column_list (list of Columns), cast each select item in select_query to the exact type of the column. A Query may have a UNION, recursively do this down the line. """ for col_idx, select_item in enumerate(select_query.select_clause.items): cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type) select_item.val_expr = cast_val_expr if select_query.union_clause: self._cast_select_items(select_query.union_clause.query, column_list)
def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash): if exists(self.query_shelve_path): # Ensure a clean shelve will be created remove(self.query_shelve_path) start_time = time() impala_sql_writer = SqlWriter.create(dialect=IMPALA) reference_sql_writer = SqlWriter.create( dialect=self.reference_connection.db_type) query_result_comparator = QueryResultComparator( self.impala_connection, self.reference_connection) query_generator = QueryGenerator() query_count = 0 queries_resulted_in_data_count = 0 mismatch_count = 0 query_timeout_count = 0 known_error_count = 0 impala_crash_count = 0 last_error = None repeat_error_count = 0 with open(self.query_log_path, 'w') as impala_query_log: impala_query_log.write('--\n' '-- Stating new run\n' '--\n') while number_of_test_queries > query_count: query = query_generator.create_query(self.common_tables) impala_sql = impala_sql_writer.write_query(query) if 'FULL OUTER JOIN' in impala_sql and self.reference_connection.db_type == MYSQL: # Not supported by MySQL continue query_count += 1 LOG.info('Running query #%s', query_count) impala_query_log.write(impala_sql + ';\n') result = query_result_comparator.compare_query_results(query) if result.query_resulted_in_data: queries_resulted_in_data_count += 1 if result.error: # TODO: These first two come from psycopg2, the postgres driver. Maybe we should # try a different driver? Or maybe the usage of the driver isn't correct. # Anyhow ignore these failures. if 'division by zero' in result.error \ or 'out of range' in result.error \ or 'Too much data' in result.error: LOG.debug('Ignoring error: %s', result.error) query_count -= 1 continue if result.is_known_error: known_error_count += 1 elif result.query_timed_out: query_timeout_count += 1 else: mismatch_count += 1 with closing(open_shelve( self.query_shelve_path)) as query_shelve: query_shelve[str(query_count)] = query print('---Impala Query---\n') print( impala_sql_writer.write_query(query, pretty=True) + '\n') print('---Reference Query---\n') print( reference_sql_writer.write_query(query, pretty=True) + '\n') print('---Error---\n') print(result.error + '\n') print('------\n') if 'Could not connect' in result.error \ or "Couldn't open transport for" in result.error: # if stop_on_crash: # break # Assume Impala crashed and try restarting impala_crash_count += 1 LOG.info('Restarting Impala') call([ join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/") ]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor( ) result = query_result_comparator.compare_query_results( query) if result.error: LOG.info('Restarting Impala') call([ join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/") ]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor( ) else: break if stop_on_result_mismatch and \ not (result.is_known_error or result.query_timed_out): break if last_error == result.error \ and not (result.is_known_error or result.query_timed_out): repeat_error_count += 1 if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT: break else: last_error = result.error repeat_error_count = 0 else: if result.query_resulted_in_data: LOG.info('Results matched (%s rows)', result.impala_row_count) else: LOG.info('Query did not produce meaningful data') last_error = None repeat_error_count = 0 return SearchResults(query_count, queries_resulted_in_data_count, mismatch_count, query_timeout_count, known_error_count, impala_crash_count, time() - start_time)
def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash): '''Returns an instance of SearchResults, which is a summary report. This method oversees the generation, execution, and comparison of queries. number_of_test_queries should an integer indicating the maximum number of queries to generate and execute. ''' start_time = time() query_result_comparator = QueryResultComparator( self.ref_connection, self.test_connection) query_generator = QueryGenerator(self.query_profile) query_count = 0 queries_resulted_in_data_count = 0 mismatch_count = 0 query_timeout_count = 0 known_error_count = 0 test_crash_count = 0 last_error = None repeat_error_count = 0 while number_of_test_queries > query_count: query = query_generator.create_query(self.common_tables) query.execution = self.query_profile.get_query_execution() query_count += 1 LOG.info('Running query #%s', query_count) result = query_result_comparator.compare_query_results(query) if result.query_resulted_in_data: queries_resulted_in_data_count += 1 if isinstance(result.exception, DataLimitExceeded) \ or isinstance(result.exception, TypeOverflow): continue if result.error: # TODO: These first two come from psycopg2, the postgres driver. Maybe we should # try a different driver? Or maybe the usage of the driver isn't correct. # Anyhow ignore these failures. if 'division by zero' in result.error \ or 'out of range' in result.error: LOG.debug('Ignoring error: %s', result.error) query_count -= 1 continue if result.is_known_error: known_error_count += 1 elif result.query_timed_out: query_timeout_count += 1 else: mismatch_count += 1 print('---Test Query---\n') print(result.test_sql + '\n') print('---Reference Query---\n') print(result.ref_sql + '\n') print('---Error---\n') print(result.error + '\n') print('------\n') if 'Could not connect' in result.error \ or "Couldn't open transport for" in result.error: if stop_on_crash: break # Assume Impala crashed and try restarting test_crash_count += 1 LOG.info('Restarting Impala') call([join_path(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")]) self.test_connection.reconnect() query_result_comparator.test_cursor = self.test_connection.create_cursor() result = query_result_comparator.compare_query_results(query) if result.error: LOG.info('Restarting Impala') call([join_path(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")]) self.test_connection.reconnect() query_result_comparator.test_cursor = self.test_connection.create_cursor() else: break if stop_on_result_mismatch and \ not (result.is_known_error or result.query_timed_out): break if last_error == result.error \ and not (result.is_known_error or result.query_timed_out): repeat_error_count += 1 if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT: break else: last_error = result.error repeat_error_count = 0 else: if result.query_resulted_in_data: LOG.info('Results matched (%s rows)', result.test_row_count) else: LOG.info('Query did not produce meaningful data') last_error = None repeat_error_count = 0 return SearchResults( query_count, queries_resulted_in_data_count, mismatch_count, query_timeout_count, known_error_count, test_crash_count, time() - start_time)
class Job(object): '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it. Able to execute queries by either generaing them based on a provided query profile or by extracting queries from an existing report. A report is generated when it finishes running. ''' def __init__(self, query_profile, job_id, run_name='default', time_limit_sec=24 * 3600, git_command=None, parent_job=None): self.git_hash = '' self.job_id = job_id self.job_name = run_name self.parent_job = parent_job self.query_profile = query_profile or (ImpalaNestedTypesProfile() if NESTED_TYPES_MODE else DefaultProfile()) self.ref_connection = None self.result_list = [] self.start_time = time() self.stop_time = None self.target_stop_time = time() + time_limit_sec self.test_connection = None self.num_queries_executed = 0 self.num_queries_returned_correct_data = 0 self.flatten_dialect = 'POSTGRESQL' if NESTED_TYPES_MODE else None self.impala_env = ImpalaDockerEnv(git_command) def __getstate__(self): '''For pickling''' result = {} result['job_id'] = self.job_id result['job_name'] = self.job_name result['parent_job'] = self.parent_job result['result_list'] = self.result_list result['git_hash'] = self.git_hash result['start_time'] = self.start_time result['stop_time'] = self.stop_time result['num_queries_executed'] = self.num_queries_executed result[ 'num_queries_returned_correct_data'] = self.num_queries_returned_correct_data return result def prepare(self): '''Prepares the environment and connects to Postgres and Impala running inside the Docker container. ''' LOG.info('Starting Job Preparation') self.impala_env.prepare() LOG.info('Job Preparation Complete') self.ref_connection = PostgresqlConnection( user_name=POSTGRES_USER_NAME, password=None, host_name=self.impala_env.host, port=self.impala_env.postgres_port, db_name=POSTGRES_DATABASE_NAME) LOG.info('Created ref_connection') self.start_impala() self.git_hash = self.impala_env.get_git_hash() def get_stack(self): stack_trace = self.impala_env.get_stack() LOG.info('Stack Trace: {0}'.format(stack_trace)) return stack_trace def start_impala(self): '''Starts impala and creates a connection to it. ''' self.impala_env.start_impala() self.test_connection = ImpalaConnection( host_name=self.impala_env.host, port=self.impala_env.impala_port, user_name=None, db_name=DATABASE_NAME) self.test_connection.reconnect() self.query_result_comparator = QueryResultComparator( self.query_profile, self.ref_connection, self.test_connection, query_timeout_seconds=4 * 60, flatten_dialect='POSTGRESQL') LOG.info('Created query result comparator') LOG.info(str(self.query_result_comparator.__dict__)) def is_impala_running(self): return self.impala_env.is_impala_running() def save_pickle(self): '''Saves self as pickle. This is normally done when the job finishes running. ''' with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'w') as f: pickle.dump(self, f) LOG.info('Saved Completed Job Pickle') def queries_to_be_executed(self): '''Generator that outputs query models. They are either generated based on the query profile, or they are extracted from an existing report. ''' if self.parent_job: # If parent job is specified, get the queries from the parent job report with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f: parent_report = pickle.load(f) for error_type in ['stack', 'row_counts', 'mismatch']: for query in parent_report.grouped_results[error_type]: yield query['model'] else: # If parent job is not specified, generate queries with QueryGenerator num_unexpected_errors = 0 while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD: query = None try: self.query_generator = QueryGenerator(self.query_profile) query = self.query_generator.create_query( self.common_tables) except IndexError as e: # This is a query generator bug that happens extremely rarely LOG.info('Query Generator Choice Problem, {0}'.format(e)) continue except Exception as e: LOG.info('Unexpected error in queries_to_be_executed, {0}'. format(e)) num_unexpected_errors += 1 if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD: LOG.error('Num Unexpected Errors above threshold') raise else: continue yield query def generate_report(self): '''Generate report and save it into the reports directory. ''' from report import Report rep = Report(self.job_id) rep.save_pickle() def start(self): try: self.prepare() self.query_generator = QueryGenerator(self.query_profile) if NESTED_TYPES_MODE: self.common_tables = DbCursor.describe_common_tables( [self.test_connection.cursor()]) else: self.common_tables = DbCursor.describe_common_tables([ self.test_connection.cursor(), self.ref_connection.cursor() ]) for query_model in self.queries_to_be_executed(): LOG.info('About to execute query.') result_dict = self.run_query(query_model) LOG.info('Query Executed successfully.') self.num_queries_executed += 1 if result_dict: self.result_list.append(result_dict) LOG.info('Time Left: {0}'.format(self.target_stop_time - time())) if time() > self.target_stop_time: break self.stop_time = time() self.save_pickle() self.generate_report() LOG.info('Generated Report') except: LOG.exception('Unexpected Exception in start') raise finally: self.impala_env.stop_docker() LOG.info('Docker Stopped') try: os.remove(join_path(PATH_TO_SCHEDULE, self.job_id)) LOG.info('Schedule file removed') except OSError: LOG.info('Unable to remove schedule file.') def reproduce_crash(self, query_model): '''Check if the given query_model causes a crash. Returns the number of times the query had to be run to cause a crash. ''' NUM_TRIES = 5 self.start_impala() for try_num in range(1, NUM_TRIES + 1): self.query_result_comparator.compare_query_results(query_model) if not self.is_impala_running(): return try_num def run_query(self, query_model): '''Runs a single query. ''' if not self.is_impala_running(): LOG.info('Impala is not running, starting Impala.') self.start_impala() def run_query_internal(): self.comparison_result = self.query_result_comparator.compare_query_results( query_model) self.comparison_result = None internal_thread = Thread(target=run_query_internal, name='run_query_internal_{0}'.format( self.job_id)) internal_thread.daemon = True internal_thread.start() internal_thread.join(timeout=600) if internal_thread.is_alive(): LOG.info( 'run_query_internal is alive, restarting Impala Environment') self.impala_env.stop_docker() self.prepare() return None else: LOG.info('run_query_internal is dead as expected') comparison_result = self.comparison_result if comparison_result.query_timed_out: LOG.info('Query Timeout Exception') restart_impala = True else: restart_impala = False result_dict = {} if self.is_impala_running(): if comparison_result.error: result_dict = self.comparison_result_analysis( comparison_result) result_dict['model'] = query_model elif comparison_result.query_resulted_in_data: self.num_queries_returned_correct_data += 1 else: LOG.info('CRASH OCCURED') result_dict = self.comparison_result_analysis(comparison_result) result_dict['model'] = query_model result_dict['stack'] = self.get_stack() result_dict['num_tries_to_reproduce'] = self.reproduce_crash( query_model) if restart_impala: self.start_impala() return result_dict def comparison_result_analysis(self, comparison_result): '''Get useful information from the comparison_result. ''' result_dict = {} result_dict['error'] = comparison_result.error result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number result_dict['mismatch_ref_row'] = comparison_result.ref_row result_dict['mismatch_test_row'] = comparison_result.test_row result_dict['ref_row_count'] = comparison_result.ref_row_count result_dict['ref_sql'] = comparison_result.ref_sql result_dict['test_row_count'] = comparison_result.test_row_count result_dict['test_sql'] = comparison_result.test_sql return result_dict
def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash, query_timeout_seconds): '''Returns an instance of SearchResults, which is a summary report. This method oversees the generation, execution, and comparison of queries. number_of_test_queries should an integer indicating the maximum number of queries to generate and execute. ''' start_time = time() query_result_comparator = QueryResultComparator( self.query_profile, self.ref_connection, self.test_connection, query_timeout_seconds) query_generator = QueryGenerator(self.query_profile) query_count = 0 queries_resulted_in_data_count = 0 mismatch_count = 0 query_timeout_count = 0 known_error_count = 0 test_crash_count = 0 last_error = None repeat_error_count = 0 while number_of_test_queries > query_count: query = query_generator.create_query(self.common_tables) query.execution = self.query_profile.get_query_execution() query_count += 1 LOG.info('Running query #%s', query_count) result = query_result_comparator.compare_query_results(query) if result.query_resulted_in_data: queries_resulted_in_data_count += 1 if isinstance(result.exception, DataLimitExceeded) \ or isinstance(result.exception, TypeOverflow): continue if result.error: # TODO: These first two come from psycopg2, the postgres driver. Maybe we should # try a different driver? Or maybe the usage of the driver isn't correct. # Anyhow ignore these failures. if 'division by zero' in result.error \ or 'out of range' in result.error: LOG.debug('Ignoring error: %s', result.error) query_count -= 1 continue if result.is_known_error: known_error_count += 1 elif result.query_timed_out: query_timeout_count += 1 else: mismatch_count += 1 print('---Test Query---\n') print(result.test_sql + '\n') print('---Reference Query---\n') print(result.ref_sql + '\n') print('---Error---\n') print(result.error + '\n') print('------\n') if 'Could not connect' in result.error \ or "Couldn't open transport for" in result.error: if stop_on_crash: break # Assume Impala crashed and try restarting test_crash_count += 1 LOG.info('Restarting Impala') call([ join_path(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/") ]) self.test_connection.reconnect() query_result_comparator.test_cursor = self.test_connection.create_cursor( ) result = query_result_comparator.compare_query_results( query) if result.error: LOG.info('Restarting Impala') call([ join_path(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/") ]) self.test_connection.reconnect() query_result_comparator.test_cursor = self.test_connection.create_cursor( ) else: break if stop_on_result_mismatch and \ not (result.is_known_error or result.query_timed_out): break if last_error == result.error \ and not (result.is_known_error or result.query_timed_out): repeat_error_count += 1 if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT: break else: last_error = result.error repeat_error_count = 0 else: if result.query_resulted_in_data: LOG.info('Results matched (%s rows)', result.test_row_count) else: LOG.info('Query did not produce meaningful data') last_error = None repeat_error_count = 0 return SearchResults(query_count, queries_resulted_in_data_count, mismatch_count, query_timeout_count, known_error_count, test_crash_count, time() - start_time)
class InsertStatementGenerator(object): def __init__(self, profile): # QueryProfile-like object self.profile = profile # used to generate SELECT queries for INSERT ... SELECT statements; # to ensure state is completely reset, this is created anew with each call to # generate_statement() self.select_stmt_generator = None def generate_statement(self, tables, dml_table): """ Return a randomly generated INSERT statement. tables should be a list of Table objects. A typical source of such a list comes from db_connection.DbCursor.describe_common_tables(). This list describes the possible "sources" of the INSERT's WITH and FROM/WHERE clauses. dml_table is a required Table object. The INSERT will be into this table. """ if not (isinstance(tables, list) and len(tables) > 0 and all( (isinstance(t, Table) for t in tables))): raise Exception('tables must be a not-empty list of Table objects') if not isinstance(dml_table, Table): raise Exception('dml_table must be a Table') self.select_stmt_generator = QueryGenerator(self.profile) if dml_table.primary_keys: insert_statement = InsertStatement( conflict_action=InsertStatement.CONFLICT_ACTION_IGNORE) else: insert_statement = InsertStatement( conflict_action=InsertStatement.CONFLICT_ACTION_DEFAULT) insert_statement.execution = StatementExecutionMode.DML_TEST # Choose whether this is a # INSERT INTO table SELECT/VALUES # or # INSERT INTO table (col1, col2, ...) SELECT/VALUES # If the method returns None, it's the former. insert_column_list = self.profile.choose_insert_column_list(dml_table) insert_statement.insert_clause = InsertClause( dml_table, column_list=insert_column_list) # We still need to internally track the columns we're inserting. Keep in mind None # means "all" without an explicit column list. Since we've already created the # InsertClause object though, we can fill this in for ourselves. if insert_column_list is None: insert_column_list = dml_table.cols insert_item_data_types = [col.type for col in insert_column_list] # Decide whether this is INSERT VALUES or INSERT SELECT insert_source_clause = self.profile.choose_insert_source_clause() if issubclass(insert_source_clause, Query): # Use QueryGenerator()'s public interface to generate the SELECT. select_query = self.select_stmt_generator.generate_statement( tables, select_item_data_types=insert_item_data_types) # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The # generator's type system is not near sophisticated enough to know how random # expressions will be implicitly casted in the databases. This requires less work # to implement. IMPALA-4693 considers alternative approaches. self._cast_select_items(select_query, insert_column_list) insert_statement.with_clause = deepcopy(select_query.with_clause) select_query.with_clause = None insert_statement.select_query = select_query elif issubclass(insert_source_clause, ValuesClause): insert_statement.values_clause = self._generate_values_clause( insert_column_list) else: raise Exception('unsupported INSERT source clause: {0}'.format( insert_source_clause)) return insert_statement def _generate_values_clause(self, columns): """ Return a VALUES clause containing a variable number of rows. The values corresponding to primary keys will be non-null constants. Any other columns could be null, constants, or function trees that may or may not evaluate to null. """ values_rows = [] for _ in xrange(self.profile.choose_insert_values_row_count()): values_row = [] for col in columns: if col.is_primary_key: val = self.profile.choose_constant( return_type=col.exact_type, allow_null=False) elif 'constant' == self.profile.choose_values_item_expr(): val = self.profile.choose_constant( return_type=col.exact_type, allow_null=True) else: func_tree = self.select_stmt_generator.create_func_tree( col.type, allow_subquery=False) val = self.select_stmt_generator.populate_func_with_vals( func_tree) # Only the generic type, not the exact type, of the value will be known. To # avoid a lot of failed queries due to precision errors, we cast the val to # the exact type of the column. This will still not prevent "out of range" # conditions, as we don't try to evaluate the random expressions. val = CastFunc(val, col.exact_type) values_row.append(val) values_rows.append(ValuesRow(values_row)) return ValuesClause(values_rows) def _cast_select_items(self, select_query, column_list): """ For a given Query select_query and a column_list (list of Columns), cast each select item in select_query to the exact type of the column. A Query may have a UNION, recursively do this down the line. """ for col_idx, select_item in enumerate( select_query.select_clause.items): cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type) select_item.val_expr = cast_val_expr if select_query.union_clause: self._cast_select_items(select_query.union_clause.query, column_list)
def generate_statement(self, tables, dml_table): """ Return a randomly generated INSERT statement. tables should be a list of Table objects. A typical source of such a list comes from db_connection.DbCursor.describe_common_tables(). This list describes the possible "sources" of the INSERT's WITH and FROM/WHERE clauses. dml_table is a required Table object. The INSERT will be into this table. """ if not (isinstance(tables, list) and len(tables) > 0 and all( (isinstance(t, Table) for t in tables))): raise Exception('tables must be a not-empty list of Table objects') if not isinstance(dml_table, Table): raise Exception('dml_table must be a Table') self.select_stmt_generator = QueryGenerator(self.profile) if dml_table.primary_keys: insert_statement = InsertStatement( conflict_action=InsertStatement.CONFLICT_ACTION_IGNORE) else: insert_statement = InsertStatement( conflict_action=InsertStatement.CONFLICT_ACTION_DEFAULT) insert_statement.execution = StatementExecutionMode.DML_TEST # Choose whether this is a # INSERT INTO table SELECT/VALUES # or # INSERT INTO table (col1, col2, ...) SELECT/VALUES # If the method returns None, it's the former. insert_column_list = self.profile.choose_insert_column_list(dml_table) insert_statement.insert_clause = InsertClause( dml_table, column_list=insert_column_list) # We still need to internally track the columns we're inserting. Keep in mind None # means "all" without an explicit column list. Since we've already created the # InsertClause object though, we can fill this in for ourselves. if insert_column_list is None: insert_column_list = dml_table.cols insert_item_data_types = [col.type for col in insert_column_list] # Decide whether this is INSERT VALUES or INSERT SELECT insert_source_clause = self.profile.choose_insert_source_clause() if issubclass(insert_source_clause, Query): # Use QueryGenerator()'s public interface to generate the SELECT. select_query = self.select_stmt_generator.generate_statement( tables, select_item_data_types=insert_item_data_types) # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The # generator's type system is not near sophisticated enough to know how random # expressions will be implicitly casted in the databases. This requires less work # to implement. IMPALA-4693 considers alternative approaches. self._cast_select_items(select_query, insert_column_list) insert_statement.with_clause = deepcopy(select_query.with_clause) select_query.with_clause = None insert_statement.select_query = select_query elif issubclass(insert_source_clause, ValuesClause): insert_statement.values_clause = self._generate_values_clause( insert_column_list) else: raise Exception('unsupported INSERT source clause: {0}'.format( insert_source_clause)) return insert_statement
def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash): if exists(self.query_shelve_path): # Ensure a clean shelve will be created remove(self.query_shelve_path) start_time = time() impala_sql_writer = SqlWriter.create(dialect=IMPALA) reference_sql_writer = SqlWriter.create( dialect=self.reference_connection.db_type) query_result_comparator = QueryResultComparator( self.impala_connection, self.reference_connection) query_generator = QueryGenerator() query_count = 0 queries_resulted_in_data_count = 0 mismatch_count = 0 query_timeout_count = 0 known_error_count = 0 impala_crash_count = 0 last_error = None repeat_error_count = 0 with open(self.query_log_path, 'w') as impala_query_log: impala_query_log.write( '--\n' '-- Stating new run\n' '--\n') while number_of_test_queries > query_count: query = query_generator.create_query(self.common_tables) impala_sql = impala_sql_writer.write_query(query) if 'FULL OUTER JOIN' in impala_sql and self.reference_connection.db_type == MYSQL: # Not supported by MySQL continue query_count += 1 LOG.info('Running query #%s', query_count) impala_query_log.write(impala_sql + ';\n') result = query_result_comparator.compare_query_results(query) if result.query_resulted_in_data: queries_resulted_in_data_count += 1 if result.error: # TODO: These first two come from psycopg2, the postgres driver. Maybe we should # try a different driver? Or maybe the usage of the driver isn't correct. # Anyhow ignore these failures. if 'division by zero' in result.error \ or 'out of range' in result.error \ or 'Too much data' in result.error: LOG.debug('Ignoring error: %s', result.error) query_count -= 1 continue if result.is_known_error: known_error_count += 1 elif result.query_timed_out: query_timeout_count += 1 else: mismatch_count += 1 with closing(open_shelve(self.query_shelve_path)) as query_shelve: query_shelve[str(query_count)] = query print('---Impala Query---\n') print(impala_sql_writer.write_query(query, pretty=True) + '\n') print('---Reference Query---\n') print(reference_sql_writer.write_query(query, pretty=True) + '\n') print('---Error---\n') print(result.error + '\n') print('------\n') if 'Could not connect' in result.error \ or "Couldn't open transport for" in result.error: # if stop_on_crash: # break # Assume Impala crashed and try restarting impala_crash_count += 1 LOG.info('Restarting Impala') call([join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor() result = query_result_comparator.compare_query_results(query) if result.error: LOG.info('Restarting Impala') call([join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'), '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")]) self.impala_connection.reconnect() query_result_comparator.impala_cursor = self.impala_connection.create_cursor() else: break if stop_on_result_mismatch and \ not (result.is_known_error or result.query_timed_out): break if last_error == result.error \ and not (result.is_known_error or result.query_timed_out): repeat_error_count += 1 if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT: break else: last_error = result.error repeat_error_count = 0 else: if result.query_resulted_in_data: LOG.info('Results matched (%s rows)', result.impala_row_count) else: LOG.info('Query did not produce meaningful data') last_error = None repeat_error_count = 0 return SearchResults( query_count, queries_resulted_in_data_count, mismatch_count, query_timeout_count, known_error_count, impala_crash_count, time() - start_time)