def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
    encoding=None, test_file_vars=None):
  """
  Runs every section of the given '.test' file using the settings in 'vector'.

  The 'table_format' value of the vector selects the database the clients are switched
  to (unless 'use_db' names one explicitly) and the 'exec_option' value is applied as
  the client configuration.

  test_file_name: name of the test file, resolved by load_query_test_file() for the
    current workload.
  vector: test vector; must provide 'table_format' and 'exec_option' values.
  use_db: optional database name. It is passed to change_database() and substituted
    for '$DATABASE' in shell commands and queries.
  multiple_impalad: if True, one client is created per entry of
    IMPALAD_HOST_PORT_LIST and a random one is picked for each test section.
    Otherwise self.client is used.
  encoding: optional encoding, used to load the test file and to decode the result
    rows that are read back (useful when data was ingested in e.g. latin).
  test_file_vars: optional dict; every key found in a query is replaced with its
    value before the query is executed. Keys must not be one of 'reserved_keywords'
    below, otherwise a RuntimeError is raised.

  A section containing 'SHELL' only runs that shell command. Every other section must
  contain 'QUERY'. The query text may hold several ';' separated statements; all of
  them are executed but only the result of the last one is verified.
  """
  table_format_info = vector.get_value('table_format')
  exec_options = vector.get_value('exec_option')
  update_results = pytest.config.option.update_results

  # Resolve the current user's primary group name.
  group_id = pwd.getpwnam(getuser()).pw_gid
  group_name = grp.getgrgid(group_id).gr_name

  if multiple_impalad:
    # Build a real list (not a lazy map object): it is iterated below and also handed
    # to choice() once per test section.
    target_impalad_clients = [ImpalaTestSuite.create_impala_client(host_port)
                              for host_port in IMPALAD_HOST_PORT_LIST]
  else:
    target_impalad_clients = [self.client]

  # Change the database to reflect the file_format, compression codec etc, or the
  # user specified database for all targeted impalad.
  for impalad_client in target_impalad_clients:
    ImpalaTestSuite.change_database(impalad_client,
        table_format_info, use_db, pytest.config.option.scale_factor)
    impalad_client.set_configuration(exec_options)

  # Placeholders substituted by this method itself; they may not be used as keys of
  # 'test_file_vars'. Constant, so it is built once rather than once per section.
  reserved_keywords = ("$DATABASE", "$FILESYSTEM_PREFIX", "$GROUP_NAME",
                       "$IMPALA_HOME", "$NAMENODE", "$QUERY", "$SECONDARY_FILESYSTEM")

  sections = self.load_query_test_file(self.get_workload(), test_file_name,
      encoding=encoding)
  for test_section in sections:
    if 'SHELL' in test_section:
      assert len(test_section) == 1, \
          "SHELL test sections can't contain other sections"
      cmd = test_section['SHELL']\
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)\
          .replace('$IMPALA_HOME', IMPALA_HOME)
      if use_db:
        cmd = cmd.replace('$DATABASE', use_db)
      LOG.info("Shell command: " + cmd)
      check_call(cmd, shell=True)
      continue

    if 'QUERY' not in test_section:
      assert 0, 'Error in test file %s. Test cases require a -- QUERY section.\n%s' %\
          (test_file_name, pprint.pformat(test_section))

    if 'SETUP' in test_section:
      self.execute_test_case_setup(test_section['SETUP'], table_format_info)

    # TODO: support running query tests against different scale factors
    query = QueryTestSectionReader.build_query(test_section['QUERY']
        .replace('$GROUP_NAME', group_name)
        .replace('$IMPALA_HOME', IMPALA_HOME)
        .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)
        .replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or ""))
    if use_db:
      query = query.replace('$DATABASE', use_db)

    if test_file_vars:
      for key, value in test_file_vars.items():
        if key in reserved_keywords:
          raise RuntimeError("Key {0} is reserved".format(key))
        query = query.replace(key, value)

    if 'QUERY_NAME' in test_section:
      LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])

    # Support running multiple queries within the same test section, only verifying the
    # result of the final query. The main use case is to allow for 'USE database'
    # statements before a query executes, but it is not limited to that.
    # TODO: consider supporting result verification of all queries in the future
    result = None
    target_impalad_client = choice(target_impalad_clients)
    query_options_changed = []
    try:
      user = None
      if 'USER' in test_section:
        # Create a new client so the session will use the new username.
        user = test_section['USER'].strip()
        target_impalad_client = self.create_impala_client()
      for statement in query.split(';'):
        set_pattern_match = SET_PATTERN.match(statement)
        if set_pattern_match is not None:
          # Remember which option a SET statement touches so it can be restored below.
          query_options_changed.append(set_pattern_match.groups()[0])
        result = self.__execute_query(target_impalad_client, statement, user=user)
    except Exception as e:
      if 'CATCH' in test_section:
        # The failure was expected: check the message and move on to the next section.
        self.__verify_exceptions(test_section['CATCH'], str(e), use_db)
        continue
      raise
    finally:
      # Runs on success, on an expected failure and on an unexpected failure alike.
      if query_options_changed:
        self.__restore_query_options(query_options_changed, target_impalad_client)

    if 'CATCH' in test_section and '__NO_ERROR__' not in test_section['CATCH']:
      # An exception was expected but every statement succeeded.
      expected_str = " or ".join(test_section['CATCH']).strip() \
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX) \
          .replace('$NAMENODE', NAMENODE) \
          .replace('$IMPALA_HOME', IMPALA_HOME)
      assert False, "Expected exception: %s" % expected_str

    assert result is not None
    assert result.success

    # Decode the results read back if the data is stored with a specific encoding.
    if encoding:
      result.data = [row.decode(encoding) for row in result.data]

    if 'RESULTS' in test_section:
      # Combining 'RESULTS' with 'DML_RESULTS" is currently unsupported because
      # __verify_results_and_errors calls verify_raw_results which always checks
      # ERRORS, TYPES, LABELS, etc. which doesn't make sense if there are two
      # different result sets to consider (IMPALA-4471).
      assert 'DML_RESULTS' not in test_section
      self.__verify_results_and_errors(vector, test_section, result, use_db)
    else:
      # TODO: Can't validate errors without expected results for now.
      assert 'ERRORS' not in test_section,\
          "'ERRORS' sections must have accompanying 'RESULTS' sections"

    # If --update_results, then replace references to the namenode URI with $NAMENODE.
    # NOTE(review): the $IMPALA_HOME replacement goes placeholder -> value, i.e. the
    # opposite direction of the $NAMENODE one. Left as is; confirm this is intended.
    if update_results and 'RESULTS' in test_section:
      test_section['RESULTS'] = test_section['RESULTS'] \
          .replace(NAMENODE, '$NAMENODE') \
          .replace('$IMPALA_HOME', IMPALA_HOME)

    if 'RUNTIME_PROFILE' in test_section:
      verify_runtime_profile(test_section['RUNTIME_PROFILE'], result.runtime_profile)

    if 'DML_RESULTS' in test_section:
      assert 'ERRORS' not in test_section
      # The limit is specified to ensure the queries aren't unbounded. We shouldn't have
      # test files that are checking the contents of tables larger than that anyways.
      dml_results_query = "select * from %s limit 1000" % \
          test_section['DML_RESULTS_TABLE']
      dml_result = self.__execute_query(target_impalad_client, dml_results_query)
      verify_raw_results(test_section, dml_result,
          table_format_info.file_format, update_results,
          result_section='DML_RESULTS')

  if update_results:
    # Write the (possibly updated) sections out so they can replace the test file.
    output_file = os.path.join(EE_TEST_LOGS_DIR,
        test_file_name.replace('/', '_') + ".test")
    write_test_file(output_file, sections, encoding=encoding)
def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
    encoding=None, test_file_vars=None):
  """
  Runs every section of the given '.test' file using the settings in 'vector'.

  The 'table_format' value of the vector selects the database the clients are switched
  to (unless 'use_db' names one explicitly), the 'exec_option' value is applied as the
  client configuration and the 'protocol' value selects the client protocol.

  test_file_name: name of the test file, resolved by load_query_test_file() for the
    current workload.
  vector: test vector; must provide 'table_format', 'exec_option' and 'protocol'
    values.
  use_db: optional database name. It is passed to change_database() and substituted
    for '$DATABASE' in shell commands and queries.
  multiple_impalad: if True, one client is created per host/port returned by
    __get_cluster_host_ports() for the protocol and a random one is picked for each
    test section. Otherwise self.client is used for protocol 'beeswax' and
    self.hs2_client for protocol 'hs2'; any other protocol value fails an assertion.
  encoding: optional encoding, used to load the test file and to decode the result
    rows that are read back (useful when data was ingested in e.g. latin).
  test_file_vars: optional dict; every key found in a query is replaced with its
    value before the query is executed. Keys must not be one of 'reserved_keywords'
    below, otherwise a RuntimeError is raised.

  A section containing 'SHELL' only runs that shell command. Every other section must
  contain 'QUERY'. The query text may hold several ';' separated statements; all of
  them are executed but only the result of the last one is verified. A SET statement
  for an option that is already part of the vector's 'exec_option' fails an assertion.
  """
  table_format_info = vector.get_value('table_format')
  exec_options = vector.get_value('exec_option')
  protocol = vector.get_value('protocol')
  update_results = pytest.config.option.update_results

  # Resolve the current user's primary group name.
  group_id = pwd.getpwnam(getuser()).pw_gid
  group_name = grp.getgrgid(group_id).gr_name

  if multiple_impalad:
    target_impalad_clients = [
        ImpalaTestSuite.create_impala_client(host_port, protocol=protocol)
        for host_port in self.__get_cluster_host_ports(protocol)]
  elif protocol == 'beeswax':
    target_impalad_clients = [self.client]
  else:
    assert protocol == 'hs2'
    target_impalad_clients = [self.hs2_client]

  # Change the database to reflect the file_format, compression codec etc, or the
  # user specified database for all targeted impalad.
  for impalad_client in target_impalad_clients:
    ImpalaTestSuite.change_database(impalad_client,
        table_format_info, use_db, pytest.config.option.scale_factor)
    impalad_client.set_configuration(exec_options)

  # Placeholders substituted by this method itself; they may not be used as keys of
  # 'test_file_vars'. Constant, so it is built once rather than once per section.
  # NOTE(review): '$INTERNAL_LISTEN_HOST' and '$INTERNAL_LISTEN_IP' are substituted in
  # queries below but are not listed here - confirm whether they should be reserved.
  reserved_keywords = ("$DATABASE", "$FILESYSTEM_PREFIX", "$FILESYSTEM_NAME",
                       "$GROUP_NAME", "$IMPALA_HOME", "$NAMENODE", "$QUERY",
                       "$SECONDARY_FILESYSTEM", "$USER")

  sections = self.load_query_test_file(self.get_workload(), test_file_name,
      encoding=encoding)
  for test_section in sections:
    if 'SHELL' in test_section:
      assert len(test_section) == 1, \
          "SHELL test sections can't contain other sections"
      cmd = test_section['SHELL']\
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)\
          .replace('$FILESYSTEM_NAME', FILESYSTEM_NAME)\
          .replace('$IMPALA_HOME', IMPALA_HOME)
      if use_db:
        cmd = cmd.replace('$DATABASE', use_db)
      LOG.info("Shell command: " + cmd)
      check_call(cmd, shell=True)
      continue

    if 'QUERY' not in test_section:
      assert 0, 'Error in test file %s. Test cases require a -- QUERY section.\n%s' %\
          (test_file_name, pprint.pformat(test_section))

    if 'SETUP' in test_section:
      self.execute_test_case_setup(test_section['SETUP'], table_format_info)

    # TODO: support running query tests against different scale factors
    query = QueryTestSectionReader.build_query(test_section['QUERY']
        .replace('$GROUP_NAME', group_name)
        .replace('$IMPALA_HOME', IMPALA_HOME)
        .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)
        .replace('$FILESYSTEM_NAME', FILESYSTEM_NAME)
        .replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or "")
        .replace('$USER', getuser())
        .replace('$INTERNAL_LISTEN_HOST', INTERNAL_LISTEN_HOST)
        .replace('$INTERNAL_LISTEN_IP', INTERNAL_LISTEN_IP))
    if use_db:
      query = query.replace('$DATABASE', use_db)

    if test_file_vars:
      for key, value in test_file_vars.items():
        if key in reserved_keywords:
          raise RuntimeError("Key {0} is reserved".format(key))
        query = query.replace(key, value)

    if 'QUERY_NAME' in test_section:
      LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])

    # Support running multiple queries within the same test section, only verifying the
    # result of the final query. The main use case is to allow for 'USE database'
    # statements before a query executes, but it is not limited to that.
    # TODO: consider supporting result verification of all queries in the future
    result = None
    target_impalad_client = choice(target_impalad_clients)
    query_options_changed = []
    try:
      user = None
      if 'USER' in test_section:
        # Create a new client so the session will use the new username.
        user = test_section['USER'].strip()
        target_impalad_client = self.create_impala_client(protocol=protocol)
      for statement in query.split(';'):
        set_pattern_match = SET_PATTERN.match(statement)
        if set_pattern_match is not None:
          option_name = set_pattern_match.groups()[0]
          # Reject the option before recording it: the statement is never executed
          # when this assertion fails, so the option must not be "restored" in the
          # finally block below.
          assert option_name not in exec_options, \
              "%s cannot be set in the '.test' file since it is in the test vector. " \
              "Consider deepcopy()-ing the vector and removing this option in the " \
              "python test." % option_name
          query_options_changed.append(option_name)
        result = self.__execute_query(target_impalad_client, statement, user=user)
    except Exception as e:
      if 'CATCH' in test_section:
        # The failure was expected: check the message and move on to the next section.
        self.__verify_exceptions(test_section['CATCH'], str(e), use_db)
        continue
      raise
    finally:
      # Runs on success, on an expected failure and on an unexpected failure alike.
      if query_options_changed:
        self.__restore_query_options(query_options_changed, target_impalad_client)

    if 'CATCH' in test_section and '__NO_ERROR__' not in test_section['CATCH']:
      # An exception was expected but every statement succeeded.
      expected_str = " or ".join(test_section['CATCH']).strip() \
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX) \
          .replace('$FILESYSTEM_NAME', FILESYSTEM_NAME) \
          .replace('$NAMENODE', NAMENODE) \
          .replace('$IMPALA_HOME', IMPALA_HOME)
      assert False, "Expected exception: %s" % expected_str

    assert result is not None
    assert result.success

    # Decode the results read back if the data is stored with a specific encoding.
    if encoding:
      result.data = [row.decode(encoding) for row in result.data]

    if 'RESULTS' in test_section:
      # Combining 'RESULTS' with 'DML_RESULTS" is currently unsupported because
      # __verify_results_and_errors calls verify_raw_results which always checks
      # ERRORS, TYPES, LABELS, etc. which doesn't make sense if there are two
      # different result sets to consider (IMPALA-4471).
      assert 'DML_RESULTS' not in test_section
      self.__verify_results_and_errors(vector, test_section, result, use_db)
    else:
      # TODO: Can't validate errors without expected results for now.
      assert 'ERRORS' not in test_section,\
          "'ERRORS' sections must have accompanying 'RESULTS' sections"

    # If --update_results, then replace references to the namenode URI (and the
    # internal listen host/IP) with their placeholders.
    # NOTE(review): the $IMPALA_HOME replacement goes placeholder -> value, i.e. the
    # opposite direction of the other three. Left as is; confirm this is intended.
    if update_results and 'RESULTS' in test_section:
      test_section['RESULTS'] = test_section['RESULTS'] \
          .replace(NAMENODE, '$NAMENODE') \
          .replace('$IMPALA_HOME', IMPALA_HOME) \
          .replace(INTERNAL_LISTEN_HOST, '$INTERNAL_LISTEN_HOST') \
          .replace(INTERNAL_LISTEN_IP, '$INTERNAL_LISTEN_IP')

    # If this table format has a RUNTIME_PROFILE section specifically for it, evaluate
    # that section and ignore any general RUNTIME_PROFILE sections.
    format_profile_section = 'RUNTIME_PROFILE_%s' % table_format_info.file_format
    if format_profile_section in test_section:
      rt_profile_info = format_profile_section
    elif 'RUNTIME_PROFILE' in test_section:
      rt_profile_info = 'RUNTIME_PROFILE'
    else:
      rt_profile_info = None

    if rt_profile_info is not None:
      rt_profile = verify_runtime_profile(test_section[rt_profile_info],
          result.runtime_profile, update_section=update_results)
      if update_results:
        test_section[rt_profile_info] = "".join(rt_profile)

    if 'DML_RESULTS' in test_section:
      assert 'ERRORS' not in test_section
      # The limit is specified to ensure the queries aren't unbounded. We shouldn't have
      # test files that are checking the contents of tables larger than that anyways.
      dml_results_query = "select * from %s limit 1000" % \
          test_section['DML_RESULTS_TABLE']
      dml_result = self.__execute_query(target_impalad_client, dml_results_query)
      verify_raw_results(test_section, dml_result,
          table_format_info.file_format, result_section='DML_RESULTS',
          update_section=update_results)

  if update_results:
    # Write the (possibly updated) sections out so they can replace the test file.
    output_file = os.path.join(EE_TEST_LOGS_DIR,
        test_file_name.replace('/', '_') + ".test")
    write_test_file(output_file, sections, encoding=encoding)
def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
    encoding=None):
  """
  Runs every section of the given '.test' file using the settings in 'vector'.

  The 'table_format' value of the vector selects the database the clients are switched
  to (unless 'use_db' names one explicitly) and the 'exec_option' value is applied as
  the client configuration.

  test_file_name: name of the test file, resolved by load_query_test_file() for the
    current workload.
  vector: test vector; must provide 'table_format' and 'exec_option' values.
  use_db: optional database name. It is passed to change_database() and substituted
    for '$DATABASE' in shell commands and queries.
  multiple_impalad: if True, one client is created per entry of
    IMPALAD_HOST_PORT_LIST and a random one is picked for each test section.
    Otherwise self.client is used.
  encoding: optional encoding, used to load the test file and to decode the result
    rows that are read back (useful when data was ingested in e.g. latin).

  A section containing 'SHELL' only runs that shell command. Every other section must
  contain 'QUERY'. The query text may hold several ';' separated statements; all of
  them are executed but only the result of the last one is verified.
  """
  table_format_info = vector.get_value('table_format')
  exec_options = vector.get_value('exec_option')
  update_results = pytest.config.option.update_results

  # Resolve the current user's primary group name.
  group_id = pwd.getpwnam(getuser()).pw_gid
  group_name = grp.getgrgid(group_id).gr_name

  if multiple_impalad:
    # Build a real list (not a lazy map object): it is iterated below and also handed
    # to choice() once per test section.
    target_impalad_clients = [ImpalaTestSuite.create_impala_client(host_port)
                              for host_port in IMPALAD_HOST_PORT_LIST]
  else:
    target_impalad_clients = [self.client]

  # Change the database to reflect the file_format, compression codec etc, or the
  # user specified database for all targeted impalad.
  for impalad_client in target_impalad_clients:
    ImpalaTestSuite.change_database(impalad_client,
        table_format_info, use_db, pytest.config.option.scale_factor)
    impalad_client.set_configuration(exec_options)

  sections = self.load_query_test_file(self.get_workload(), test_file_name,
      encoding=encoding)
  for test_section in sections:
    if 'SHELL' in test_section:
      assert len(test_section) == 1, \
          "SHELL test sections can't contain other sections"
      cmd = test_section['SHELL']\
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)\
          .replace('$IMPALA_HOME', IMPALA_HOME)
      if use_db:
        cmd = cmd.replace('$DATABASE', use_db)
      LOG.info("Shell command: " + cmd)
      check_call(cmd, shell=True)
      continue

    if 'QUERY' not in test_section:
      assert 0, 'Error in test file %s. Test cases require a -- QUERY section.\n%s' %\
          (test_file_name, pprint.pformat(test_section))

    if 'SETUP' in test_section:
      self.execute_test_case_setup(test_section['SETUP'], table_format_info)

    # TODO: support running query tests against different scale factors
    query = QueryTestSectionReader.build_query(test_section['QUERY']
        .replace('$GROUP_NAME', group_name)
        .replace('$IMPALA_HOME', IMPALA_HOME)
        .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)
        .replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or ""))
    if use_db:
      query = query.replace('$DATABASE', use_db)

    if 'QUERY_NAME' in test_section:
      LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])

    # Support running multiple queries within the same test section, only verifying the
    # result of the final query. The main use case is to allow for 'USE database'
    # statements before a query executes, but it is not limited to that.
    # TODO: consider supporting result verification of all queries in the future
    result = None
    target_impalad_client = choice(target_impalad_clients)
    query_options_changed = []
    try:
      user = None
      if 'USER' in test_section:
        # Create a new client so the session will use the new username.
        user = test_section['USER'].strip()
        target_impalad_client = self.create_impala_client()
      for statement in query.split(';'):
        set_pattern_match = SET_PATTERN.match(statement)
        if set_pattern_match is not None:
          # Remember which option a SET statement touches so it can be restored below.
          query_options_changed.append(set_pattern_match.groups()[0])
        result = self.__execute_query(target_impalad_client, statement, user=user)
    except Exception as e:
      if 'CATCH' in test_section:
        # The failure was expected: check the message and move on to the next section.
        self.__verify_exceptions(test_section['CATCH'], str(e), use_db)
        continue
      raise
    finally:
      # Runs on success, on an expected failure and on an unexpected failure alike.
      if query_options_changed:
        self.__restore_query_options(query_options_changed, target_impalad_client)

    if 'CATCH' in test_section:
      # Every statement succeeded, so only an empty CATCH section is acceptable.
      expected_str = test_section['CATCH'].strip()
      assert expected_str == '', "Expected exception: %s" % expected_str

    assert result is not None
    assert result.success

    # Decode the results read back if the data is stored with a specific encoding.
    if encoding:
      result.data = [row.decode(encoding) for row in result.data]

    if 'RESULTS' in test_section:
      self.__verify_results_and_errors(vector, test_section, result, use_db)
    else:
      # TODO: Can't validate errors without expected results for now.
      assert 'ERRORS' not in test_section,\
          "'ERRORS' sections must have accompanying 'RESULTS' sections"

    # If --update_results, then replace references to the namenode URI with $NAMENODE.
    # NOTE(review): the $IMPALA_HOME replacement goes placeholder -> value, i.e. the
    # opposite direction of the $NAMENODE one. Left as is; confirm this is intended.
    if update_results and 'RESULTS' in test_section:
      test_section['RESULTS'] = test_section['RESULTS'] \
          .replace(NAMENODE, '$NAMENODE') \
          .replace('$IMPALA_HOME', IMPALA_HOME)

    if 'RUNTIME_PROFILE' in test_section:
      verify_runtime_profile(test_section['RUNTIME_PROFILE'], result.runtime_profile)

  if update_results:
    # Write the (possibly updated) sections out so they can replace the test file.
    # NOTE(review): the output directory is hard-coded to '/tmp' - confirm that is
    # acceptable for every environment the tests run in.
    output_file = os.path.join('/tmp', test_file_name.replace('/', '_') + ".test")
    write_test_file(output_file, sections, encoding=encoding)