Example #1
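A consistency check over a billing database: per-attempt resource usage (the finest-grained record) is re-aggregated in Python with fold and compared against the pre-aggregated job, batch, and billing-project rollup tables, all read inside one read-only transaction under shared locks. Any failure is recorded on the app object and logged.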
# Assumed imports for this snippet; `transaction` and `serialization` come
# from the surrounding application and are not shown here.
import copy
import json
import logging

import dictdiffer

log = logging.getLogger(__name__)


async def check_resource_aggregation(app, db):
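    # Decode a JSON-encoded column value, passing SQL NULLs (None) through.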
    def json_to_value(x):
        if x is None:
            return x
        return json.loads(x)

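    # Merge two per-resource usage dicts, summing quantities for shared
    # resources; None is treated as an empty dict.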
    def merge(r1, r2):
        if r1 is None:
            r1 = {}
        if r2 is None:
            r2 = {}

        result = {}

        def add_items(d):
            for k, v in d.items():
                if k not in result:
                    result[k] = v
                else:
                    result[k] += v

        add_items(r1)
        add_items(r2)
        return result

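    # Fold one (key, usage-dict) pair into the running result.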
    def seqop(result, k, v):
        if k not in result:
            result[k] = v
        else:
            result[k] = merge(result[k], v)

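    # Re-aggregate a dict keyed by tuples under the coarser key computed by
    # key_f, merging usage dicts that collide. For example (hypothetical data):
    #   fold({(1, 'a'): {'cpu': 2}, (1, 'b'): {'cpu': 3}}, lambda k: k[0])
    #   == {1: {'cpu': 5}}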
    def fold(d, key_f):
        if d is None:
            d = {}
        d = copy.deepcopy(d)
        result = {}
        for k, v in d.items():
            seqop(result, key_f(k), v)
        return result

    @transaction(db, read_only=True)
    async def check(tx):
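        # Ground truth: per-attempt usage, computed as resource quantity times
        # the attempt's non-negative duration.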
        attempt_resources = tx.execute_and_fetchall('''
SELECT attempt_resources.batch_id, attempt_resources.job_id, attempt_resources.attempt_id,
  JSON_OBJECTAGG(resource, quantity * GREATEST(COALESCE(end_time - start_time, 0), 0)) as resources
FROM attempt_resources
INNER JOIN attempts
ON attempts.batch_id = attempt_resources.batch_id AND
  attempts.job_id = attempt_resources.job_id AND
  attempts.attempt_id = attempt_resources.attempt_id
GROUP BY batch_id, job_id, attempt_id
LOCK IN SHARE MODE;
''')

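        # Pre-aggregated usage rolled up per job.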
        agg_job_resources = tx.execute_and_fetchall('''
SELECT batch_id, job_id, JSON_OBJECTAGG(resource, `usage`) as resources
FROM aggregated_job_resources
GROUP BY batch_id, job_id
LOCK IN SHARE MODE;
''')

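        # Pre-aggregated usage rolled up per batch, joined to batches for the
        # billing project.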
        agg_batch_resources = tx.execute_and_fetchall('''
SELECT batch_id, billing_project, JSON_OBJECTAGG(resource, `usage`) as resources
FROM (
  SELECT batch_id, resource, SUM(`usage`) AS `usage`
  FROM aggregated_batch_resources
  GROUP BY batch_id, resource) AS t
JOIN batches ON batches.id = t.batch_id
GROUP BY t.batch_id, billing_project
LOCK IN SHARE MODE;
''')

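        # Pre-aggregated usage rolled up per billing project.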
        agg_billing_project_resources = tx.execute_and_fetchall('''
SELECT billing_project, JSON_OBJECTAGG(resource, `usage`) as resources
FROM (
  SELECT billing_project, resource, SUM(`usage`) AS `usage`
  FROM aggregated_billing_project_resources
  GROUP BY billing_project, resource) AS t
GROUP BY t.billing_project
LOCK IN SHARE MODE;
''')

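        # Materialize the async record streams into plain dicts keyed by id
        # tuples.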
        attempt_resources = {(record['batch_id'], record['job_id'],
                              record['attempt_id']):
                             json_to_value(record['resources'])
                             async for record in attempt_resources}

        agg_job_resources = {(record['batch_id'], record['job_id']):
                             json_to_value(record['resources'])
                             async for record in agg_job_resources}

        agg_batch_resources = {(record['batch_id'], record['billing_project']):
                               json_to_value(record['resources'])
                               async for record in agg_batch_resources}

        agg_billing_project_resources = {
            record['billing_project']: json_to_value(record['resources'])
            async for record in agg_billing_project_resources
        }

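        # Re-fold the finer-grained data up to each coarser level so it can be
        # compared against the pre-aggregated rollups.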
        attempt_by_batch_resources = fold(attempt_resources, lambda k: k[0])
        attempt_by_job_resources = fold(attempt_resources,
                                        lambda k: (k[0], k[1]))
        job_by_batch_resources = fold(agg_job_resources, lambda k: k[0])
        batch_by_billing_project_resources = fold(agg_batch_resources,
                                                  lambda k: k[1])

        agg_batch_resources_2 = {
            batch_id: resources
            for (batch_id, _), resources in agg_batch_resources.items()
        }

        # list(...) so the dictdiffer generator renders usefully in the
        # assertion message
        assert attempt_by_batch_resources == agg_batch_resources_2, (
            list(dictdiffer.diff(attempt_by_batch_resources, agg_batch_resources_2)),
            attempt_by_batch_resources,
            agg_batch_resources_2,
        )
        assert attempt_by_job_resources == agg_job_resources, (
            list(dictdiffer.diff(attempt_by_job_resources, agg_job_resources)),
            attempt_by_job_resources,
            agg_job_resources,
        )
        assert job_by_batch_resources == agg_batch_resources_2, (
            list(dictdiffer.diff(job_by_batch_resources, agg_batch_resources_2)),
            job_by_batch_resources,
            agg_batch_resources_2,
        )
        assert batch_by_billing_project_resources == agg_billing_project_resources, (
            list(dictdiffer.diff(batch_by_billing_project_resources,
                                 agg_billing_project_resources)),
            batch_by_billing_project_resources,
            agg_billing_project_resources,
        )

    try:
        await check()  # pylint: disable=no-value-for-parameter
    except Exception as e:
        app['check_resource_aggregation_error'] = serialization.exception_to_dict(e)
        log.exception('while checking resource aggregation')
Example #2
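A companion check for incrementally maintained scheduler counters, assuming the same imports as Example #1: it recomputes per-(user, inst_coll) job counts and core totals (ready, running, creating, and their cancelled variants) directly from the jobs table and joins them against the user_inst_coll_resources rollup; any row where the "actual" and "expected" columns disagree is reported by raising a ValueError with the offending records.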
async def check_incremental(app, db):
    @transaction(db, read_only=True)
    async def check(tx):
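        # Recompute per-(user, inst_coll) job counts and core totals from the
        # jobs table ("actual") and join them against the incrementally
        # maintained user_inst_coll_resources rollup ("expected"); only
        # mismatching rows survive the WHERE clause.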
        user_inst_coll_with_broken_resources = tx.execute_and_fetchall('''
SELECT
  t.*,
  u.*
FROM
(
  SELECT user, inst_coll,
    CAST(COALESCE(SUM(state = 'Ready' AND runnable), 0) AS SIGNED) AS actual_n_ready_jobs,
    CAST(COALESCE(SUM(cores_mcpu * (state = 'Ready' AND runnable)), 0) AS SIGNED) AS actual_ready_cores_mcpu,
    CAST(COALESCE(SUM(state = 'Running' AND (NOT cancelled)), 0) AS SIGNED) AS actual_n_running_jobs,
    CAST(COALESCE(SUM(cores_mcpu * (state = 'Running' AND (NOT cancelled))), 0) AS SIGNED) AS actual_running_cores_mcpu,
    CAST(COALESCE(SUM(state = 'Creating' AND (NOT cancelled)), 0) AS SIGNED) AS actual_n_creating_jobs,
    CAST(COALESCE(SUM(state = 'Ready' AND cancelled), 0) AS SIGNED) AS actual_n_cancelled_ready_jobs,
    CAST(COALESCE(SUM(state = 'Running' AND cancelled), 0) AS SIGNED) AS actual_n_cancelled_running_jobs,
    CAST(COALESCE(SUM(state = 'Creating' AND cancelled), 0) AS SIGNED) AS actual_n_cancelled_creating_jobs
  FROM
  (
    SELECT batches.user, jobs.state, jobs.cores_mcpu, jobs.inst_coll,
      (jobs.always_run OR NOT (jobs.cancelled OR batches_cancelled.id IS NOT NULL)) AS runnable,
      (NOT jobs.always_run AND (jobs.cancelled OR batches_cancelled.id IS NOT NULL)) AS cancelled
    FROM batches
    INNER JOIN jobs ON batches.id = jobs.batch_id
    LEFT JOIN batches_cancelled ON batches.id = batches_cancelled.id
    WHERE batches.`state` = 'running'
  ) as v
  GROUP BY user, inst_coll
) as t
INNER JOIN
(
  SELECT user, inst_coll,
    CAST(COALESCE(SUM(n_ready_jobs), 0) AS SIGNED) AS expected_n_ready_jobs,
    CAST(COALESCE(SUM(ready_cores_mcpu), 0) AS SIGNED) AS expected_ready_cores_mcpu,
    CAST(COALESCE(SUM(n_running_jobs), 0) AS SIGNED) AS expected_n_running_jobs,
    CAST(COALESCE(SUM(running_cores_mcpu), 0) AS SIGNED) AS expected_running_cores_mcpu,
    CAST(COALESCE(SUM(n_creating_jobs), 0) AS SIGNED) AS expected_n_creating_jobs,
    CAST(COALESCE(SUM(n_cancelled_ready_jobs), 0) AS SIGNED) AS expected_n_cancelled_ready_jobs,
    CAST(COALESCE(SUM(n_cancelled_running_jobs), 0) AS SIGNED) AS expected_n_cancelled_running_jobs,
    CAST(COALESCE(SUM(n_cancelled_creating_jobs), 0) AS SIGNED) AS expected_n_cancelled_creating_jobs
  FROM user_inst_coll_resources
  GROUP BY user, inst_coll
) AS u
ON t.user = u.user AND t.inst_coll = u.inst_coll
WHERE actual_n_ready_jobs != expected_n_ready_jobs
   OR actual_ready_cores_mcpu != expected_ready_cores_mcpu
   OR actual_n_running_jobs != expected_n_running_jobs
   OR actual_running_cores_mcpu != expected_running_cores_mcpu
   OR actual_n_creating_jobs != expected_n_creating_jobs
   OR actual_n_cancelled_ready_jobs != expected_n_cancelled_ready_jobs
   OR actual_n_cancelled_running_jobs != expected_n_cancelled_running_jobs
   OR actual_n_cancelled_creating_jobs != expected_n_cancelled_creating_jobs
LOCK IN SHARE MODE;
''')

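        # Any surviving rows are inconsistencies.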
        failures = [
            record async for record in user_inst_coll_with_broken_resources
        ]
        if failures:
            raise ValueError(json.dumps(failures))

    try:
        await check()  # pylint: disable=no-value-for-parameter
    except Exception as e:
        app['check_incremental_error'] = serialization.exception_to_dict(e)
        log.exception('while checking incremental')