def __init__(self, test_section, test_file_name, test_db_name):
  """Build a show-create-table test case from a parsed test-file section.

  The section must provide exactly one of QUERY (run against an existing
  table), CREATE_TABLE, or CREATE_VIEW; anything else is a test-file error.
  The expected output is read from RESULTS-HIVE, or from RESULTS-HIVE-3 when
  running against Hive major version 3+ and that section is present, with the
  database-name token replaced by the actual test database.
  """
  if 'QUERY' in test_section:
    # Direct SHOW CREATE statement against a pre-existing table.
    self.existing_table = True
    self.show_create_table_sql = remove_comments(test_section['QUERY']).strip()
  elif 'CREATE_TABLE' in test_section:
    self.__process_create_section(
        test_section['CREATE_TABLE'], test_file_name, test_db_name, 'table')
  elif 'CREATE_VIEW' in test_section:
    self.__process_create_section(
        test_section['CREATE_VIEW'], test_file_name, test_db_name, 'view')
  else:
    assert 0, 'Error in test file %s. Test cases require a '\
        'CREATE_TABLE section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))

  # Prefer the Hive-3 specific expected results when they exist and we are
  # actually running against Hive 3 or later.
  results_key = 'RESULTS-HIVE'
  if HIVE_MAJOR_VERSION > 2 and 'RESULTS-HIVE-3' in test_section:
    # If the hive version is greater than 2 use the RESULTS-HIVE-3 available
    results_key = 'RESULTS-HIVE-3'
  raw_expected = remove_comments(test_section[results_key])
  self.expected_result = raw_expected.replace(
      ShowCreateTableTestCase.RESULTS_DB_NAME_TOKEN, test_db_name)
def __init__(self, test_section, test_file_name, test_db_name):
  """Build a show-create-table test case from a parsed test-file section.

  The section must provide exactly one of QUERY (run against an existing
  table), CREATE_TABLE, or CREATE_VIEW; anything else is a test-file error.
  The expected output is read verbatim from the RESULTS section (with
  comments stripped).
  """
  if 'QUERY' in test_section:
    # Direct SHOW CREATE statement against a pre-existing table.
    self.existing_table = True
    self.show_create_table_sql = remove_comments(test_section['QUERY']).strip()
  else:
    # Dispatch the two create flavours through the shared helper.
    for section_key, object_type in (('CREATE_TABLE', 'table'),
                                     ('CREATE_VIEW', 'view')):
      if section_key in test_section:
        self.__process_create_section(
            test_section[section_key], test_file_name, test_db_name, object_type)
        break
    else:
      assert 0, 'Error in test file %s. Test cases require a '\
          'CREATE_TABLE section.\n%s' %\
          (test_file_name, pprint.pformat(test_section))
  self.expected_result = remove_comments(test_section['RESULTS'])
def verify_runtime_profile(expected, actual):
  """Check that every expected runtime-profile entry matches a line of the
  actual text runtime profile.

  The check passes if, for each expected row, at least one row in the actual
  profile matches it. Rows with the "row_regex:" prefix are matched as
  regular expressions; all other rows are compared after stripping
  surrounding whitespace. Raises an AssertionError listing all expected rows
  that were never matched.
  """
  expected_lines = remove_comments(expected).splitlines()
  # Pre-compile the row_regex: entries; the list holds None for plain rows.
  expected_regexes = [try_compile_regex(line) for line in expected_lines]
  matched = [False] * len(expected_lines)

  # Scan the actual profile once; each actual line may satisfy at most one
  # still-unmatched expected line.
  for actual_line in actual.splitlines():
    for idx, expected_line in enumerate(expected_lines):
      if matched[idx]:
        continue
      regex = expected_regexes[idx]
      if regex is not None:
        found = regex.match(actual_line)
      else:
        found = expected_line.strip() == actual_line.strip()
      if found:
        matched[idx] = True
        break

  unmatched_lines = [line for idx, line in enumerate(expected_lines)
                     if not matched[idx]]
  assert len(unmatched_lines) == 0, (
      "Did not find matches for lines in runtime profile:"
      "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" %
      ('\n'.join(unmatched_lines), actual))
def verify_runtime_profile(expected, actual):
  """Check that every expected runtime-profile entry matches a line of the
  actual text runtime profile.

  The check passes if, for each expected row, at least one row in the actual
  profile matches it. Rows with the "row_regex:" prefix are matched as
  regular expressions; all other rows are compared after stripping
  surrounding whitespace. Raises an AssertionError listing all expected rows
  that were never matched.
  """
  expected_lines = remove_comments(expected).splitlines()
  # Compiled patterns for row_regex: entries; None for plain-text rows.
  expected_regexes = [try_compile_regex(line) for line in expected_lines]
  # Indices of expected lines not yet matched, kept in original order.
  unmatched = list(range(len(expected_lines)))

  for actual_line in actual.splitlines():
    hit_pos = None
    for pos, idx in enumerate(unmatched):
      regex = expected_regexes[idx]
      if regex is not None:
        hit = regex.match(actual_line)
      else:
        hit = expected_lines[idx].strip() == actual_line.strip()
      if hit:
        hit_pos = pos
        break
    if hit_pos is not None:
      # Each actual line can satisfy at most one expected line.
      unmatched.pop(hit_pos)

  unmatched_lines = [expected_lines[idx] for idx in unmatched]
  assert len(unmatched_lines) == 0, (
      "Did not find matches for lines in runtime profile:"
      "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" %
      ('\n'.join(unmatched_lines), actual))
def __init__(self, test_section, test_file_name, test_db_name):
  """Build a show-create-table test case from a parsed test-file section.

  The section must provide exactly one of QUERY (run against an existing
  table), CREATE_TABLE, or CREATE_VIEW; anything else is a test-file error.
  The expected output comes from the RESULTS section with the database-name
  token replaced by the actual test database name.
  """
  if 'QUERY' in test_section:
    # Direct SHOW CREATE statement against a pre-existing table.
    self.existing_table = True
    self.show_create_table_sql = remove_comments(test_section['QUERY']).strip()
  elif 'CREATE_VIEW' in test_section:
    self.__process_create_section(
        test_section['CREATE_VIEW'], test_file_name, test_db_name, 'view')
  elif 'CREATE_TABLE' in test_section:
    self.__process_create_section(
        test_section['CREATE_TABLE'], test_file_name, test_db_name, 'table')
  else:
    assert 0, 'Error in test file %s. Test cases require a '\
        'CREATE_TABLE section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))
  raw_expected = remove_comments(test_section['RESULTS'])
  self.expected_result = raw_expected.replace(
      ShowCreateTableTestCase.RESULTS_DB_NAME_TOKEN, test_db_name)
def verify_runtime_profile(expected, actual, update_section=False):
  """
  Check that lines matching all of the expected runtime profile entries are present
  in the actual text runtime profile. The check passes if, for each of the expected
  rows, at least one matching row is present in the actual runtime profile. Rows
  with the "row_regex:" prefix are treated as regular expressions. Rows with
  the "aggregation(function,field): value" syntax specifies an aggregation over
  the runtime profile.

  Args:
    expected: the expected profile section text (may contain comments, which
      are stripped before matching).
    actual: the full actual runtime profile as text.
    update_section: if True, mismatching aggregation values do not raise;
      instead the recomputed "aggregation(...)" lines are collected and
      returned so the caller can rewrite the test section.

  Returns:
    A list of updated "aggregation(function, field): value" strings (empty
    unless update_section is True and aggregations are present).

  Raises:
    AssertionError: if an expected line has no match in the profile, or (when
      update_section is False) an aggregation value differs.
  """
  expected_lines = remove_comments(expected).splitlines()
  matched = [False] * len(expected_lines)
  expected_regexes = []
  expected_aggregations = []
  # Pre-parse each expected line: a compiled regex for "row_regex:" lines and
  # a parsed (function, field, value) triple for "aggregation(...)" lines;
  # both lists hold None for plain-text lines.
  for expected_line in expected_lines:
    expected_regexes.append(try_compile_regex(expected_line))
    expected_aggregations.append(try_compile_aggregation(expected_line))

  # Check the expected and actual rows pairwise.
  for line in actual.splitlines():
    for i in xrange(len(expected_lines)):
      if matched[i]: continue
      if expected_regexes[i] is not None:
        match = expected_regexes[i].match(line)
      elif expected_aggregations[i] is not None:
        # Aggregations are enforced separately
        # (line-matching trivially succeeds so they don't show as unmatched).
        match = True
      else:
        match = expected_lines[i].strip() == line.strip()
      if match:
        matched[i] = True
        break

  unmatched_lines = []
  for i in xrange(len(expected_lines)):
    if not matched[i]:
      unmatched_lines.append(expected_lines[i])
  assert len(unmatched_lines) == 0, (
      "Did not find matches for lines in runtime profile:"
      "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" %
      ('\n'.join(unmatched_lines), actual))

  updated_aggregations = []
  # Compute the aggregations and check against values
  for i in xrange(len(expected_aggregations)):
    if (expected_aggregations[i] is None):
      continue
    function, field, expected_value = expected_aggregations[i]
    # Recompute the aggregation over the actual profile text.
    actual_value = compute_aggregation(function, field, actual)
    if update_section:
      updated_aggregations.append("aggregation(%s, %s): %d" %
          (function, field, actual_value))
    else:
      assert actual_value == expected_value, (
          "Aggregation of %s over %s did not match "
          "expected results.\nEXPECTED VALUE:\n%d\n\nACTUAL VALUE:\n%d"
          "\n\nPROFILE:\n%s\n" %
          (function, field, expected_value, actual_value, actual))
  return updated_aggregations
def __process_create_section(self, section, test_file_name, test_db_name, table_type):
  """Prepare SQL statements for a CREATE_TABLE/CREATE_VIEW test section.

  Builds the create statement from the section text, qualifies the table or
  view name with the test database, and derives the matching
  'show create ...' and 'drop ...' statements.

  Args:
    section: raw text of the CREATE_TABLE or CREATE_VIEW section.
    test_file_name: name of the test file (used in error messages only).
    test_db_name: database in which the object will be created.
    table_type: either 'table' or 'view'.
  """
  self.existing_table = False
  self.create_table_sql = QueryTestSectionReader.build_query(remove_comments(section))
  name = self.__get_table_name(self.create_table_sql, table_type)
  # Bug fix: the format arguments were previously
  # (table_type, test_file_name, name), which scrambled the assertion message
  # ("Error in test file table. Found unexpected ... name ...").
  assert name.find(".") == -1, 'Error in test file %s. Found unexpected %s '\
      'name %s that is qualified with a database' % (test_file_name, table_type, name)
  self.table_name = test_db_name + '.' + name
  # Replace only the first occurrence so later uses of the same identifier in
  # the statement body are left untouched.
  self.create_table_sql = self.create_table_sql.replace(name, self.table_name, 1)
  self.show_create_table_sql = 'show create %s %s' % (table_type, self.table_name)
  self.drop_table_sql = "drop %s %s" % (table_type, self.table_name)
def load_queries_from_test_file(file_path, db_name=None):
  """Parse the .test file at 'file_path' and return a list of Query objects.

  Each QUERY section becomes one Query with comments stripped from its SQL;
  'db_name' (may be None) is attached to every query.
  """
  LOG.debug("Loading queries from %s", file_path)
  loaded = []
  for test_case in test_file_parser.parse_query_test_file(file_path):
    query = Query()
    query.sql = test_file_parser.remove_comments(test_case["QUERY"])
    query.db_name = db_name
    loaded.append(query)
  return loaded
def verify_runtime_profile(expected, actual, update_section=False):
  """
  Check that lines matching all of the expected runtime profile entries are present
  in the actual text runtime profile. The check passes if, for each of the expected
  rows, at least one matching row is present in the actual runtime profile. Rows
  with the "row_regex:" prefix are treated as regular expressions. Rows with
  the "aggregation(function,field): value" syntax specifies an aggregation over
  the runtime profile.

  Args:
    expected: the expected profile section text (may contain comments, which
      are stripped before matching).
    actual: the full actual runtime profile as text.
    update_section: if True, mismatching aggregation values do not raise;
      instead the recomputed "aggregation(...)" lines are collected and
      returned so the caller can rewrite the test section.

  Returns:
    A list of updated "aggregation(function, field): value" strings (empty
    unless update_section is True and aggregations are present).

  Raises:
    AssertionError: if an expected line has no match in the profile, or (when
      update_section is False) an aggregation value differs.
  """
  expected_lines = remove_comments(expected).splitlines()
  matched = [False] * len(expected_lines)
  expected_regexes = []
  expected_aggregations = []
  # Pre-parse each expected line: a compiled regex for "row_regex:" lines and
  # a parsed (function, field, value) triple for "aggregation(...)" lines;
  # both lists hold None for plain-text lines.
  for expected_line in expected_lines:
    expected_regexes.append(try_compile_regex(expected_line))
    expected_aggregations.append(try_compile_aggregation(expected_line))

  # Check the expected and actual rows pairwise.
  for line in actual.splitlines():
    for i in xrange(len(expected_lines)):
      if matched[i]: continue
      if expected_regexes[i] is not None:
        match = expected_regexes[i].match(line)
      elif expected_aggregations[i] is not None:
        # Aggregations are enforced separately
        # (line-matching trivially succeeds so they don't show as unmatched).
        match = True
      else:
        match = expected_lines[i].strip() == line.strip()
      if match:
        matched[i] = True
        break

  unmatched_lines = []
  for i in xrange(len(expected_lines)):
    if not matched[i]:
      unmatched_lines.append(expected_lines[i])
  assert len(unmatched_lines) == 0, (
      "Did not find matches for lines in runtime profile:"
      "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" %
      ('\n'.join(unmatched_lines), actual))

  updated_aggregations = []
  # Compute the aggregations and check against values
  for i in xrange(len(expected_aggregations)):
    if (expected_aggregations[i] is None):
      continue
    function, field, expected_value = expected_aggregations[i]
    # Recompute the aggregation over the actual profile text.
    actual_value = compute_aggregation(function, field, actual)
    if update_section:
      updated_aggregations.append("aggregation(%s, %s): %d" %
          (function, field, actual_value))
    else:
      assert actual_value == expected_value, (
          "Aggregation of %s over %s did not match "
          "expected results.\nEXPECTED VALUE:\n%d\n\nACTUAL VALUE:\n%d"
          "\n\nPROFILE:\n%s\n" %
          (function, field, expected_value, actual_value, actual))
  return updated_aggregations
def load_tpc_queries(workload):
  """Returns a list of tpc queries. 'workload' should either be 'tpch' or
  'tpcds'.

  Scans the workload's queries directory for files whose name contains
  '<workload>-q' and collects one Query per QUERY section found.
  """
  query_dir = os.path.join(os.path.dirname(__file__), "..", "..", "testdata",
      "workloads", workload, "queries")
  # Only files named like "<workload>-q..." hold the numbered TPC queries.
  name_marker = workload + "-q"
  queries = []
  for query_file in os.listdir(query_dir):
    if name_marker not in query_file:
      continue
    file_path = os.path.join(query_dir, query_file)
    for test_case in test_file_parser.parse_query_test_file(file_path):
      query = Query()
      query.sql = test_file_parser.remove_comments(test_case["QUERY"])
      queries.append(query)
  return queries
def verify_raw_results(test_section, exec_result, file_format, update_section=False,
                       replace_filenames=True, result_section='RESULTS'):
  """
  Accepts a raw exec_result object and verifies it matches the expected results,
  including checking the ERRORS, TYPES, and LABELS test sections.

  If update_section is true, updates test_section with the actual results if they
  don't match the expected results. If update_section is false, failed verifications
  result in assertion failures, otherwise they are ignored.

  This process includes the parsing/transformation of the raw data results into the
  result format used in the tests.

  The result_section parameter can be used to make this function check the results in
  a DML_RESULTS section instead of the regular RESULTS section.

  TODO: separate out the handling of sections like ERRORS from checking of query
  results to allow regular RESULTS/ERRORS sections in tests with DML_RESULTS
  (IMPALA-4471).
  """
  expected_results = None
  if result_section in test_section:
    expected_results = remove_comments(test_section[result_section])
  else:
    # ERRORS without RESULTS would silently skip verification, so forbid it.
    assert 'ERRORS' not in test_section,\
        "'ERRORS' section must have accompanying 'RESULTS' section"
    LOG.info("No results found. Skipping verification");
    return
  if 'ERRORS' in test_section:
    expected_errors = split_section_lines(remove_comments(test_section['ERRORS']))
    actual_errors = apply_error_match_filter(exec_result.log.split('\n'),
                                             replace_filenames)
    try:
      verify_errors(expected_errors, actual_errors)
    except AssertionError:
      if update_section:
        test_section['ERRORS'] = join_section_lines(actual_errors)
      else:
        raise

  if 'TYPES' in test_section:
    # Distinguish between an empty list and a list with an empty string.
    expected_types = list()
    if test_section.get('TYPES'):
      expected_types = [c.strip().upper()
                        for c in test_section['TYPES'].rstrip('\n').split(',')]

    # Avro and Kudu represent TIMESTAMP columns as strings, so tests using TIMESTAMP
    # are skipped because results will be wrong.
    if file_format in ('avro', 'kudu') and 'TIMESTAMP' in expected_types:
      LOG.info("TIMESTAMP columns unsupported in %s, skipping verification." %\
          file_format)
      return

    # Avro does not support as many types as Hive, so the Avro test tables may
    # have different column types than we expect (e.g., INT instead of
    # TINYINT). Bypass the type checking by ignoring the actual types of the Avro
    # table.
    if file_format == 'avro':
      LOG.info("Skipping type verification of Avro-format table.")
      actual_types = expected_types
    else:
      actual_types = parse_column_types(exec_result.schema)

    try:
      verify_results(expected_types, actual_types, order_matters=True)
    except AssertionError:
      if update_section:
        test_section['TYPES'] = join_section_lines([', '.join(actual_types)])
      else:
        raise
  else:
    # This is an insert, so we are comparing the number of rows inserted
    expected_types = ['BIGINT']
    actual_types = ['BIGINT']

  # DUMMY_LABEL is a placeholder used when the result has no real schema
  # (e.g. DML statements).
  actual_labels = ['DUMMY_LABEL']
  if exec_result and exec_result.schema:
    actual_labels = parse_column_labels(exec_result.schema)

  if 'LABELS' in test_section:
    assert actual_labels is not None
    # Distinguish between an empty list and a list with an empty string.
    expected_labels = list()
    if test_section.get('LABELS'):
      expected_labels = [c.strip().upper() for c in test_section['LABELS'].split(',')]
    try:
      verify_results(expected_labels, actual_labels, order_matters=True)
    except AssertionError:
      if update_section:
        test_section['LABELS'] = join_section_lines([', '.join(actual_labels)])
      else:
        raise

  # Get the verifier if specified. In the absence of an explicit
  # verifier, defaults to verifying equality.
  verifier = test_section.get('VERIFIER')

  order_matters = contains_order_by(exec_result.query)

  # If the test section is explicitly annotated to specify the order matters,
  # then do not sort the actual and expected results.
  if verifier and verifier.upper() == 'VERIFY_IS_EQUAL':
    order_matters = True

  # If the test result section is explicitly annotated to specify order does not
  # matter, then sort the actual and expected results before verification.
  if verifier and verifier.upper() == 'VERIFY_IS_EQUAL_SORTED':
    order_matters = False
  expected_results_list = []
  if 'MULTI_LINE' in test_section:
    # MULTI_LINE results are bracket-delimited; embedded newlines are escaped
    # so each expected row stays on one line.
    expected_results_list = map(lambda s: s.replace('\n', '\\n'),
        re.findall(r'\[(.*?)\]', expected_results, flags=re.DOTALL))
  else:
    expected_results_list = split_section_lines(expected_results)
  expected = QueryTestResult(expected_results_list, expected_types,
      actual_labels, order_matters)
  actual = QueryTestResult(parse_result_rows(exec_result), actual_types,
      actual_labels, order_matters)
  assert verifier in VERIFIER_MAP.keys(), "Unknown verifier: " + verifier
  try:
    VERIFIER_MAP[verifier](expected, actual)
  except AssertionError:
    if update_section:
      test_section[result_section] = join_section_lines(actual.result_list)
    else:
      raise
def verify_raw_results(test_section, exec_result, file_format, update_section=False):
  """
  Accepts a raw exec_result object and verifies it matches the expected results.
  If update_section is true, updates test_section with the actual results if they
  don't match the expected results. If update_section is false, failed verifications
  result in assertion failures, otherwise they are ignored.

  This process includes the parsing/transformation of the raw data results into the
  result format used in the tests. Also checks the ERRORS, TYPES, and LABELS
  sections when present.
  """
  expected_results = None
  if 'RESULTS' in test_section:
    expected_results = remove_comments(test_section['RESULTS'])
  else:
    LOG.info("No results found. Skipping verification");
    return
  if 'ERRORS' in test_section:
    expected_errors = test_section['ERRORS'].split('\n')
    actual_errors = apply_error_match_filter(exec_result.log.split('\n'))
    try:
      verify_errors(expected_errors, actual_errors)
    except AssertionError:
      if update_section:
        test_section['ERRORS'] = '\n'.join(actual_errors)
      else:
        raise
  if 'TYPES' in test_section:
    # Distinguish between an empty list and a list with an empty string.
    expected_types = list()
    if test_section.get('TYPES'):
      expected_types = [c.strip().upper() for c in test_section['TYPES'].split(',')]

    # Avro does not support as many types as Hive, so the Avro test tables may
    # have different column types than we expect (e.g., INT instead of
    # TINYINT). We represent TIMESTAMP columns as strings in Avro, so we bail in
    # this case since the results will be wrong. Otherwise we bypass the type
    # checking by ignoring the actual types of the Avro table.
    if file_format == 'avro':
      if 'TIMESTAMP' in expected_types:
        LOG.info("TIMESTAMP columns unsupported in Avro, skipping verification.")
        return
      LOG.info("Skipping type verification of Avro-format table.")
      actual_types = expected_types
    else:
      actual_types = parse_column_types(exec_result.schema)

    try:
      verify_results(expected_types, actual_types, order_matters=True)
    except AssertionError:
      if update_section:
        test_section['TYPES'] = ', '.join(actual_types)
      else:
        raise
  else:
    # This is an insert, so we are comparing the number of rows inserted
    expected_types = ['BIGINT']
    actual_types = ['BIGINT']

  if 'LABELS' in test_section:
    # Distinguish between an empty list and a list with an empty string.
    expected_labels = list()
    if test_section.get('LABELS'):
      expected_labels = [c.strip().upper() for c in test_section['LABELS'].split(',')]
    actual_labels = parse_column_labels(exec_result.schema)
    try:
      verify_results(expected_labels, actual_labels, order_matters=True)
    except AssertionError:
      if update_section:
        test_section['LABELS'] = ', '.join(actual_labels)
      else:
        raise

  # Get the verifier if specified. In the absence of an explicit
  # verifier, defaults to verifying equality.
  verifier = test_section.get('VERIFIER')

  order_matters = contains_order_by(exec_result.query)

  # If the test section is explicitly annotated to specify the order matters,
  # then do not sort the actual and expected results.
  if verifier and verifier.upper() == 'VERIFY_IS_EQUAL':
    order_matters = True

  # If the test result section is explicitly annotated to specify order does not
  # matter, then sort the actual and expected results before verification.
  if verifier and verifier.upper() == 'VERIFY_IS_EQUAL_SORTED':
    order_matters = False
  expected = QueryTestResult(expected_results.split('\n'), expected_types,
      order_matters)
  actual = QueryTestResult(parse_result_rows(exec_result), actual_types,
      order_matters)
  assert verifier in VERIFIER_MAP.keys(), "Unknown verifier: " + verifier
  try:
    VERIFIER_MAP[verifier](expected, actual)
  except AssertionError:
    if update_section:
      test_section['RESULTS'] = '\n'.join(actual.result_list)
    else:
      raise
def verify_raw_results(test_section, exec_result, file_format, result_section,
                       type_section='TYPES', update_section=False,
                       replace_filenames=True):
  """
  Accepts a raw exec_result object and verifies it matches the expected results,
  including checking the ERRORS, TYPES, and LABELS test sections.

  If update_section is true, updates test_section with the actual results if they
  don't match the expected results. If update_section is false, failed verifications
  result in assertion failures, otherwise they are ignored.

  This process includes the parsing/transformation of the raw data results into the
  result format used in the tests.

  The result_section parameter can be used to make this function check the results
  in a DML_RESULTS section instead of the regular RESULTS section.

  The 'type_section' parameter can be used to make this function check the types
  against an alternative section from the default TYPES.

  TODO: separate out the handling of sections like ERRORS from checking of query
  results to allow regular RESULTS/ERRORS sections in tests with DML_RESULTS
  (IMPALA-4471).
  """
  expected_results = None
  if result_section in test_section:
    expected_results = remove_comments(test_section[result_section])
  else:
    # ERRORS without RESULTS would silently skip verification, so forbid it.
    assert 'ERRORS' not in test_section,\
        "'ERRORS' section must have accompanying 'RESULTS' section"
    LOG.info("No results found. Skipping verification")
    return
  if 'ERRORS' in test_section:
    expected_errors = split_section_lines(remove_comments(test_section['ERRORS']))
    actual_errors = apply_error_match_filter(exec_result.log.split('\n'),
                                             replace_filenames)
    try:
      verify_errors(expected_errors, actual_errors)
    except AssertionError:
      if update_section:
        test_section['ERRORS'] = join_section_lines(actual_errors)
      else:
        raise

  if type_section in test_section:
    # Distinguish between an empty list and a list with an empty string.
    section = test_section[type_section]
    expected_types = [c.strip().upper()
                      for c in remove_comments(section).rstrip('\n').split(',')]

    # Avro and Kudu represent TIMESTAMP columns as strings, so tests using TIMESTAMP
    # are skipped because results will be wrong.
    if file_format in ('avro', 'kudu') and 'TIMESTAMP' in expected_types:
      LOG.info("TIMESTAMP columns unsupported in %s, skipping verification." %\
          file_format)
      return

    # Avro does not support as many types as Hive, so the Avro test tables may
    # have different column types than we expect (e.g., INT instead of
    # TINYINT). Bypass the type checking by ignoring the actual types of the Avro
    # table.
    if file_format == 'avro':
      LOG.info("Skipping type verification of Avro-format table.")
      actual_types = expected_types
    else:
      actual_types = exec_result.column_types

    try:
      verify_results(expected_types, actual_types, order_matters=True)
    except AssertionError:
      if update_section:
        test_section['TYPES'] = join_section_lines([', '.join(actual_types)])
      else:
        raise
  else:
    # This is an insert, so we are comparing the number of rows inserted
    expected_types = ['BIGINT']
    actual_types = ['BIGINT']

  # DUMMY_LABEL is a placeholder used when the result carries no column labels
  # (e.g. DML statements).
  actual_labels = ['DUMMY_LABEL']
  if exec_result and exec_result.column_labels:
    actual_labels = exec_result.column_labels

  if 'LABELS' in test_section:
    assert actual_labels is not None
    # Distinguish between an empty list and a list with an empty string.
    expected_labels = list()
    if test_section.get('LABELS'):
      expected_labels = [c.strip().upper() for c in test_section['LABELS'].split(',')]
    try:
      verify_results(expected_labels, actual_labels, order_matters=True)
    except AssertionError:
      if update_section:
        test_section['LABELS'] = join_section_lines([', '.join(actual_labels)])
      else:
        raise

  # Get the verifier if specified. In the absence of an explicit
  # verifier, defaults to verifying equality.
  verifier = test_section.get('VERIFIER')

  order_matters = contains_order_by(exec_result.query)

  # If the test section is explicitly annotated to specify the order matters,
  # then do not sort the actual and expected results.
  if verifier and verifier.upper() == 'VERIFY_IS_EQUAL':
    order_matters = True

  # If the test result section is explicitly annotated to specify order does not
  # matter, then sort the actual and expected results before verification.
  if verifier and verifier.upper() == 'VERIFY_IS_EQUAL_SORTED':
    order_matters = False
  expected_results_list = []
  if 'MULTI_LINE' in test_section:
    # MULTI_LINE results are bracket-delimited; embedded newlines are escaped
    # so each expected row stays on one line.
    expected_results_list = map(lambda s: s.replace('\n', '\\n'),
        re.findall(r'\[(.*?)\]', expected_results, flags=re.DOTALL))
  else:
    expected_results_list = split_section_lines(expected_results)
  expected = QueryTestResult(expected_results_list, expected_types, actual_labels,
      order_matters)
  actual = QueryTestResult(parse_result_rows(exec_result), actual_types,
      actual_labels, order_matters)
  assert verifier in VERIFIER_MAP.keys(), "Unknown verifier: " + verifier
  try:
    VERIFIER_MAP[verifier](expected, actual)
  except AssertionError:
    if update_section:
      test_section[result_section] = join_section_lines(actual.result_list)
    else:
      raise