def check(self, project, time_range_start, time_range_end):
        """Perform a table equivalence check for a project in a time window.

        Given an M-Lab project and a time window, checks that the per-month
        tables and the per-project table contain equivalent data for that
        project within the time window.

        Args:
            project: Numerical ID of M-Lab project in BigQuery (e.g. NDT = 0).
            time_range_start: Start of window (inclusive) for which to generate
                query (as datetime).
            time_range_end: End of time window (not inclusive) for which to
                generate query (as datetime).

        Returns:
            A CheckResult object representing the result of the check.
        """
        query = self._query_generator_factory.create(
            project, time_range_start, time_range_end).generate_query()
        logger.debug('Performing table equivalence check. BigQuery SQL:%s',
                     formatting.indent(query))
        query_result = self._query_executor.execute_query(query)
        if query_result:
            # Non-empty results of the query indicates that the check failed.
            per_month_ids, per_project_ids = _parse_query_result(query_result)
            message = _format_check_failure_message(per_month_ids,
                                                    per_project_ids, query)
            return CheckResult(success=False, message=message)
        else:
            return CheckResult(success=True)
Example #2
0
def _construct_equivalence_query(per_month_query, per_project_query):
    """Constructs BigQuery SQL to be used in a table equivalence check.

    Constructs a query composed of two subqueries that retrieve test_id values
    from per-month table(s) and a per-project table. The query yields all of the
    test_id values that appear in only one of the two tables (indicating an
    inconsistency between the tables)

    Args:
        per_month_query: A BigQuery SQL query that selects test_id values from
            the per-month tables.
        per_project_query: A BigQuery SQL query that selects test_id values from
            a per-project table.

    Returns:
        A BigQuery SQL query to test the equivalence of the two subqueries.
    """
    return """
SELECT
    per_month.test_id,
    per_project.test_id
FROM
    (
{per_month_query}
    ) AS per_month
    FULL OUTER JOIN EACH
    (
{per_project_query}
    ) AS per_project
ON
    per_month.test_id=per_project.test_id
WHERE
    per_month.test_id IS NULL
    OR per_project.test_id IS NULL""".format(
        per_month_query=formatting.indent(per_month_query, 8),
        per_project_query=formatting.indent(per_project_query, 8))
    def test_check_fails_when_extra_ids_are_in_per_project_table_only(self):
        """If per-project table contains extra test_ids, check fails."""
        self.query_executor.execute_query.return_value = (
            'per_month_test_id,per_project_test_id\n'
            ',mock_id_3')

        check_result = self.checker.check(constants.PROJECT_ID_NDT, START_TIME,
                                          END_TIME)
        self.assertFalse(check_result.success)
        self.assertMultiLineEqual(
            check_result.message,
            ('Check failed: TABLE EQUIVALENCE\n'
             'test_id values present in per-project table, but NOT present in '
             'per-month table:\n'
             '  mock_id_3\n'
             'BigQuery SQL:\n' + formatting.indent(MOCK_QUERY)))
    def test_check_trims_list_of_extra_ids_when_the_list_is_large(self):
        """When the check finds many extra test_ids, we should trim the list.

        If the list of extra test_id values is very long, we should trim it
        down, remove duplicates, and sort it.
        """
        mock_query_result = 'per_month_test_id,per_project_test_id\n'

        # Add the first 5 values out of order.
        mock_query_result += ',mock_id_03\n'
        mock_query_result += ',mock_id_00\n'
        mock_query_result += ',mock_id_02\n'
        mock_query_result += ',mock_id_04\n'
        mock_query_result += ',mock_id_01\n'

        # Add 100 more rows with duplicates.
        for i in range(0, 100):
            mock_query_result += ',mock_id_%02d\n' % i

        self.query_executor.execute_query.return_value = mock_query_result

        check_result = self.checker.check(constants.PROJECT_ID_NDT, START_TIME,
                                          END_TIME)
        self.assertFalse(check_result.success)
        self.assertMultiLineEqual(
            check_result.message,
            ('Check failed: TABLE EQUIVALENCE\n'
             'test_id values present in per-project table, but NOT present in '
             'per-month table:\n'
             '  mock_id_00\n'
             '  mock_id_01\n'
             '  mock_id_02\n'
             '  mock_id_03\n'
             '  mock_id_04\n'
             '  mock_id_05\n'
             '  mock_id_06\n'
             '  mock_id_07\n'
             '  mock_id_08\n'
             '  mock_id_09\n'
             '  (95 additional or duplicate test_id values omitted)\n'
             'BigQuery SQL:\n' + formatting.indent(MOCK_QUERY)))
def _format_check_failure_message(per_month_ids, per_project_ids, query):
    """Creates a user-friendly message explaining an equivalence check failure.

    Args:
        per_month_ids: A list of test_id values that appeared only in the
            per-month tables.
        per_project_ids: A list of test_id values that appeared only in the
            per-project tables.
        query: The SQL query used to compare the two tables.

    Returns:
        A user-friendly message explaining the sanity check failure.
    """
    message = 'Check failed: TABLE EQUIVALENCE\n'
    if per_month_ids:
        message += ('test_id values present in per-month table, but NOT present'
                    ' in per-project table:\n')
        message += '%s\n' % _format_test_ids(per_month_ids)
    if per_project_ids:
        message += ('test_id values present in per-project table, but NOT '
                    'present in per-month table:\n')
        message += '%s\n' % _format_test_ids(per_project_ids)
    message += 'BigQuery SQL:\n%s' % formatting.indent(query, 2)
    return message
def _format_test_ids(test_ids):
    """Formats a list of test_id values to be printed to the console.

    Formats a list of test_id values so that they can be printed to the console.
    This involves:
        * Removing duplicates
        * Sorting the values in lexicographic order
        * Reducing the list to size _MAX_DISPLAYED_TEST_IDS
        * Adding a message to indicate when test_id values were removed.

    Args:
        test_ids: A list of test_id values to format.

    Returns:
        A formatted list of test_id values that can be included in a check
        failure message.
    """
    unique_test_ids = sorted(list(set(test_ids)))[:_MAX_DISPLAYED_TEST_IDS]
    number_omitted_ids = max(0, len(test_ids) - _MAX_DISPLAYED_TEST_IDS)
    if number_omitted_ids:
        unique_test_ids.append(
            '(%s additional or duplicate test_id values omitted)' %
            number_omitted_ids)
    return formatting.indent('\n'.join(unique_test_ids), 2)
Example #7
0
 def test_indent_negative_spacing_raises_exception(self):
     """indent should reject a negative value for spacing."""
     with self.assertRaises(ValueError):
         formatting.indent('a', -1)
Example #8
0
 def assertIndent(self, expected, original, spaces):
     self.assertEqual(expected, formatting.indent(original, spaces))