def check(self, project, time_range_start, time_range_end): """Perform a table equivalence check for a project in a time window. Given an M-Lab project and a time window, checks that the per-month tables and the per-project table contain equivalent data for that project within the time window. Args: project: Numerical ID of M-Lab project in BigQuery (e.g. NDT = 0). time_range_start: Start of window (inclusive) for which to generate query (as datetime). time_range_end: End of time window (not inclusive) for which to generate query (as datetime). Returns: A CheckResult object representing the result of the check. """ query = self._query_generator_factory.create( project, time_range_start, time_range_end).generate_query() logger.debug('Performing table equivalence check. BigQuery SQL:%s', formatting.indent(query)) query_result = self._query_executor.execute_query(query) if query_result: # Non-empty results of the query indicates that the check failed. per_month_ids, per_project_ids = _parse_query_result(query_result) message = _format_check_failure_message(per_month_ids, per_project_ids, query) return CheckResult(success=False, message=message) else: return CheckResult(success=True)
def _construct_equivalence_query(per_month_query, per_project_query): """Constructs BigQuery SQL to be used in a table equivalence check. Constructs a query composed of two subqueries that retrieve test_id values from per-month table(s) and a per-project table. The query yields all of the test_id values that appear in only one of the two tables (indicating an inconsistency between the tables) Args: per_month_query: A BigQuery SQL query that selects test_id values from the per-month tables. per_project_query: A BigQuery SQL query that selects test_id values from a per-project table. Returns: A BigQuery SQL query to test the equivalence of the two subqueries. """ return """ SELECT per_month.test_id, per_project.test_id FROM ( {per_month_query} ) AS per_month FULL OUTER JOIN EACH ( {per_project_query} ) AS per_project ON per_month.test_id=per_project.test_id WHERE per_month.test_id IS NULL OR per_project.test_id IS NULL""".format( per_month_query=formatting.indent(per_month_query, 8), per_project_query=formatting.indent(per_project_query, 8))
def test_check_fails_when_extra_ids_are_in_per_project_table_only(self): """If per-project table contains extra test_ids, check fails.""" self.query_executor.execute_query.return_value = ( 'per_month_test_id,per_project_test_id\n' ',mock_id_3') check_result = self.checker.check(constants.PROJECT_ID_NDT, START_TIME, END_TIME) self.assertFalse(check_result.success) self.assertMultiLineEqual( check_result.message, ('Check failed: TABLE EQUIVALENCE\n' 'test_id values present in per-project table, but NOT present in ' 'per-month table:\n' ' mock_id_3\n' 'BigQuery SQL:\n' + formatting.indent(MOCK_QUERY)))
def test_check_trims_list_of_extra_ids_when_the_list_is_large(self): """When the check finds many extra test_ids, we should trim the list. If the list of extra test_id values is very long, we should trim it down, remove duplicates, and sort it. """ mock_query_result = 'per_month_test_id,per_project_test_id\n' # Add the first 5 values out of order. mock_query_result += ',mock_id_03\n' mock_query_result += ',mock_id_00\n' mock_query_result += ',mock_id_02\n' mock_query_result += ',mock_id_04\n' mock_query_result += ',mock_id_01\n' # Add 100 more rows with duplicates. for i in range(0, 100): mock_query_result += ',mock_id_%02d\n' % i self.query_executor.execute_query.return_value = mock_query_result check_result = self.checker.check(constants.PROJECT_ID_NDT, START_TIME, END_TIME) self.assertFalse(check_result.success) self.assertMultiLineEqual( check_result.message, ('Check failed: TABLE EQUIVALENCE\n' 'test_id values present in per-project table, but NOT present in ' 'per-month table:\n' ' mock_id_00\n' ' mock_id_01\n' ' mock_id_02\n' ' mock_id_03\n' ' mock_id_04\n' ' mock_id_05\n' ' mock_id_06\n' ' mock_id_07\n' ' mock_id_08\n' ' mock_id_09\n' ' (95 additional or duplicate test_id values omitted)\n' 'BigQuery SQL:\n' + formatting.indent(MOCK_QUERY)))
def _format_check_failure_message(per_month_ids, per_project_ids, query): """Creates a user-friendly message explaining an equivalence check failure. Args: per_month_ids: A list of test_id values that appeared only in the per-month tables. per_project_ids: A list of test_id values that appeared only in the per-project tables. query: The SQL query used to compare the two tables. Returns: A user-friendly message explaining the sanity check failure. """ message = 'Check failed: TABLE EQUIVALENCE\n' if per_month_ids: message += ('test_id values present in per-month table, but NOT present' ' in per-project table:\n') message += '%s\n' % _format_test_ids(per_month_ids) if per_project_ids: message += ('test_id values present in per-project table, but NOT ' 'present in per-month table:\n') message += '%s\n' % _format_test_ids(per_project_ids) message += 'BigQuery SQL:\n%s' % formatting.indent(query, 2) return message
def _format_test_ids(test_ids): """Formats a list of test_id values to be printed to the console. Formats a list of test_id values so that they can be printed to the console. This involves: * Removing duplicates * Sorting the values in lexicographic order * Reducing the list to size _MAX_DISPLAYED_TEST_IDS * Adding a message to indicate when test_id values were removed. Args: test_ids: A list of test_id values to format. Returns: A formatted list of test_id values that can be included in a check failure message. """ unique_test_ids = sorted(list(set(test_ids)))[:_MAX_DISPLAYED_TEST_IDS] number_omitted_ids = max(0, len(test_ids) - _MAX_DISPLAYED_TEST_IDS) if number_omitted_ids: unique_test_ids.append( '(%s additional or duplicate test_id values omitted)' % number_omitted_ids) return formatting.indent('\n'.join(unique_test_ids), 2)
def test_indent_negative_spacing_raises_exception(self): """indent should reject a negative value for spacing.""" with self.assertRaises(ValueError): formatting.indent('a', -1)
def assertIndent(self, expected, original, spaces): self.assertEqual(expected, formatting.indent(original, spaces))