Ejemplo n.º 1
0
async def unschedule_job(app, record):
    """Unschedule a job from its instance and ask the instance to delete it.

    Steps, in order:

    1. Look up the instance by name in ``inst_pool.name_instance``; if it is
       unknown, log a warning and return without touching the database.
    2. Call the ``unschedule_job`` stored procedure.
    3. Hand ``record['cores_mcpu']`` to
       ``instance.adjust_free_cores_in_memory`` and signal
       ``scheduler_state_changed``.
    4. Send an HTTP DELETE for the job to the instance, retrying while the
       failure is classified as transient by ``is_transient_error``.

    Args:
        app: Application mapping; the ``'scheduler_state_changed'``, ``'db'``
            and ``'inst_pool'`` entries are read.
        record: Job record; the ``'batch_id'``, ``'job_id'``,
            ``'instance_name'`` and ``'cores_mcpu'`` entries are read.

    Raises:
        AssertionError: If ``record['instance_name']`` is ``None``.
        Exception: Any non-404 error from the delete request for which
            ``is_transient_error`` returns a falsy value is re-raised.
            Errors from the stored-procedure call are not caught here.
    """
    scheduler_state_changed = app['scheduler_state_changed']
    db = app['db']
    inst_pool = app['inst_pool']

    batch_id = record['batch_id']
    job_id = record['job_id']
    # Named job_key rather than `id` so the builtin is not shadowed.
    job_key = (batch_id, job_id)

    instance_name = record['instance_name']
    assert instance_name is not None

    log.info(f'unscheduling job {job_key} from instance {instance_name}')

    instance = inst_pool.name_instance.get(instance_name)
    if not instance:
        log.warning(
            f'unschedule job {job_key}: unknown instance {instance_name}')
        return

    await check_call_procedure(db, 'CALL unschedule_job(%s, %s, %s);',
                               (batch_id, job_id, instance_name))

    log.info(f'unschedule job {job_key}: updated database')

    instance.adjust_free_cores_in_memory(record['cores_mcpu'])
    scheduler_state_changed.set()

    log.info(f'unschedule job {job_key}: updated {instance} free cores')

    url = (f'http://{instance.ip_address}:5000'
           f'/api/v1alpha/batches/{batch_id}/jobs/{job_id}/delete')
    delay = 0.1
    while True:
        # Nothing to delete on an instance that is already gone.
        if instance.state in ('inactive', 'deleted'):
            break
        try:
            async with aiohttp.ClientSession(
                    raise_for_status=True,
                    timeout=aiohttp.ClientTimeout(total=60)) as session:
                await session.delete(url)
                await instance.mark_healthy()
                break
        except Exception as e:
            # A 404 is handled like a successful delete: the instance
            # answered, so it is marked healthy and the loop ends.
            not_found = (isinstance(e, aiohttp.ClientResponseError)
                         and e.status == 404)  # pylint: disable=no-member
            if not_found:
                await instance.mark_healthy()
                break
            await instance.incr_failed_request_count()
            if not is_transient_error(e):
                raise
        # Only reached after a transient failure; the helper returns the
        # delay to use for the next attempt.
        delay = await sleep_and_backoff(delay)

    log.info(f'unschedule job {job_key}: called delete job')
Ejemplo n.º 2
0
async def unschedule_job(app, record):
    """Unschedule a job attempt and ask its instance to delete the job.

    Steps, in order:

    1. Call the ``unschedule_job`` stored procedure with the current time
       (``time_msecs()``) as the end time and ``'cancelled'`` as the final
       argument; failures are logged and re-raised.
    2. Signal ``cancel_ready_state_changed``.
    3. Look up the instance by name in ``inst_pool.name_instance``; if it is
       unknown, log a warning and return.
    4. If the procedure reported a truthy ``delta_cores_mcpu`` and the
       instance state is ``'active'``, pass that delta to
       ``instance.adjust_free_cores_in_memory`` and signal
       ``scheduler_state_changed``.
    5. Send an HTTP DELETE for the job to the instance, retrying while the
       failure is classified as transient by ``is_transient_error``.

    Args:
        app: Application mapping; the ``'cancel_ready_state_changed'``,
            ``'scheduler_state_changed'``, ``'db'`` and ``'inst_pool'``
            entries are read.
        record: Job record; the ``'batch_id'``, ``'job_id'``,
            ``'attempt_id'`` and ``'instance_name'`` entries are read.

    Raises:
        AssertionError: If ``record['instance_name']`` is ``None``.
        Exception: Any error from the stored-procedure call, and any non-404
            error from the delete request for which ``is_transient_error``
            returns a falsy value.
    """
    cancel_ready_state_changed = app['cancel_ready_state_changed']
    scheduler_state_changed = app['scheduler_state_changed']
    db = app['db']
    inst_pool = app['inst_pool']

    batch_id = record['batch_id']
    job_id = record['job_id']
    attempt_id = record['attempt_id']
    # Named job_key rather than `id` so the builtin is not shadowed.
    job_key = (batch_id, job_id)

    instance_name = record['instance_name']
    assert instance_name is not None

    log.info(
        f'unscheduling job {job_key}, attempt {attempt_id} from instance {instance_name}'
    )

    end_time = time_msecs()

    try:
        rv = await db.execute_and_fetchone(
            'CALL unschedule_job(%s, %s, %s, %s, %s, %s);',
            (batch_id, job_id, attempt_id, instance_name, end_time,
             'cancelled'))
    except Exception:
        log.exception(
            f'error while unscheduling job {job_key} on instance {instance_name}')
        raise

    log.info(f'unschedule job {job_key}: updated database {rv}')

    # job that was running is now ready to be cancelled
    cancel_ready_state_changed.set()

    instance = inst_pool.name_instance.get(instance_name)
    if not instance:
        log.warning(
            f'unschedule job {job_key}, attempt {attempt_id}: unknown instance {instance_name}'
        )
        return

    # In-memory free cores are only adjusted for an active instance and
    # only when the procedure reported a non-zero core delta.
    if rv['delta_cores_mcpu'] and instance.state == 'active':
        instance.adjust_free_cores_in_memory(rv['delta_cores_mcpu'])
        scheduler_state_changed.set()
        log.info(
            f'unschedule job {job_key}, attempt {attempt_id}: updated {instance} free cores'
        )

    url = (f'http://{instance.ip_address}:5000'
           f'/api/v1alpha/batches/{batch_id}/jobs/{job_id}/delete')
    delay = 0.1
    while True:
        # Nothing to delete on an instance that is already gone.
        if instance.state in ('inactive', 'deleted'):
            break
        try:
            async with aiohttp.ClientSession(
                    raise_for_status=True,
                    timeout=aiohttp.ClientTimeout(total=60)) as session:
                await session.delete(url)
                await instance.mark_healthy()
                break
        except Exception as e:
            # A 404 is handled like a successful delete: the instance
            # answered, so it is marked healthy and the loop ends.
            not_found = (isinstance(e, aiohttp.ClientResponseError)
                         and e.status == 404)  # pylint: disable=no-member
            if not_found:
                await instance.mark_healthy()
                break
            await instance.incr_failed_request_count()
            if not is_transient_error(e):
                raise
        # Only reached after a transient failure; the helper returns the
        # delay to use for the next attempt.
        delay = await sleep_and_backoff(delay)

    log.info(f'unschedule job {job_key}, attempt {attempt_id}: called delete job')