Example 1
    async def schedule_jobs_loop_body(self):
        if self.app['frozen']:
            log.info(f'not scheduling any jobs for {self}; batch is frozen')
            return True

        log.info(f'starting scheduling jobs for {self}')
        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True

        n_scheduled = 0

        async for record in self.db.select_and_fetchall(
                '''
SELECT jobs.*, batches.format_version, batches.userdata, batches.user, attempts.instance_name
FROM batches
INNER JOIN jobs ON batches.id = jobs.batch_id
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE batches.state = 'running'
  AND jobs.state = 'Creating'
  AND (jobs.always_run OR NOT jobs.cancelled)
  AND jobs.inst_coll = %s
  AND instances.`state` = 'active'
ORDER BY instances.time_activated ASC
LIMIT 300;
''',
                (self.name, ),
                timer_description=f'in schedule_jobs for {self}: get ready jobs with active instances',
        ):
            batch_id = record['batch_id']
            job_id = record['job_id']
            instance_name = record['instance_name']
            id = (batch_id, job_id)
            log.info(f'scheduling job {id}')

            instance = self.name_instance[instance_name]
            n_scheduled += 1
            should_wait = False

            async def schedule_with_error_handling(app, record, id, instance):
                try:
                    await schedule_job(app, record, instance)
                except Exception:
                    log.info(f'scheduling job {id} on {instance} for {self}',
                             exc_info=True)

            await waitable_pool.call(schedule_with_error_handling, self.app,
                                     record, id, instance)

        await waitable_pool.wait()

        log.info(f'scheduled {n_scheduled} jobs for {self}')

        return should_wait
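
Every example on this page follows the same fan-out shape: wrap a shared `AsyncWorkerPool` in a `WaitableSharedPool`, submit one small coroutine per unit of work with `call`, then `wait` for everything submitted to finish. A minimal, self-contained sketch of that pattern, assuming `WaitableSharedPool` and `AsyncWorkerPool` come from `hailtop.utils` with the interface used above (`do_work` is a hypothetical per-item coroutine):

    import logging

    from hailtop.utils import AsyncWorkerPool, WaitableSharedPool  # assumed import path

    log = logging.getLogger(__name__)

    async def fan_out(items, do_work):
        worker_pool = AsyncWorkerPool(100)  # bounded concurrency, shareable across loops

        waitable_pool = WaitableSharedPool(worker_pool)

        async def work_with_error_handling(item):
            # isolate failures: one bad item must not abort the other submissions
            try:
                await do_work(item)
            except Exception:
                log.info(f'working on {item}', exc_info=True)

        for item in items:
            await waitable_pool.call(work_with_error_handling, item)

        # block until every call submitted above has completed
        await waitable_pool.wait()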
Example 2
    async def copy(self, worker_pool: AsyncWorkerPool, copy_report: CopyReport,
                   transfer: Union[Transfer, List[Transfer]]):
        try:
            if isinstance(transfer, Transfer):
                await self._copy_one_transfer(worker_pool,
                                              copy_report._transfer_report,
                                              transfer)
                return

            async with WaitableSharedPool(worker_pool) as pool:
                for r, t in zip(copy_report._transfer_report, transfer):
                    await pool.call(self._copy_one_transfer, worker_pool, r, t)
        except Exception as e:
            copy_report.set_exception(e)
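
Here `WaitableSharedPool` is used as an async context manager instead: leaving the `async with` block is presumably equivalent to calling `wait()` yourself. A sketch of the two forms side by side, under that assumption:

    # explicit form (Examples 1 and 3)
    waitable_pool = WaitableSharedPool(worker_pool)
    await waitable_pool.call(fn, arg)
    await waitable_pool.wait()

    # context-manager form (this example): __aexit__ is assumed to await
    # completion of every submitted call before control leaves the block
    async with WaitableSharedPool(worker_pool) as pool:
        await pool.call(fn, arg)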
Example 3
    async def cancel_orphaned_attempts_loop_body(self):
        log.info('cancelling orphaned attempts')
        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        n_unscheduled = 0

        async for record in self.db.select_and_fetchall(
                '''
SELECT attempts.*
FROM attempts
INNER JOIN jobs ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE attempts.start_time IS NOT NULL
  AND attempts.end_time IS NULL
  AND (jobs.state != 'Running' OR jobs.attempt_id != attempts.attempt_id)
  AND instances.`state` = 'active'
ORDER BY attempts.start_time ASC
LIMIT 300;
''',
                timer_description='in cancel_orphaned_attempts',
        ):
            batch_id = record['batch_id']
            job_id = record['job_id']
            attempt_id = record['attempt_id']
            instance_name = record['instance_name']
            id = (batch_id, job_id)

            n_unscheduled += 1

            async def unschedule_with_error_handling(app, record,
                                                     instance_name, id,
                                                     attempt_id):
                try:
                    await unschedule_job(app, record)
                except Exception:
                    log.info(
                        f'unscheduling job {id} with orphaned attempt {attempt_id} on instance {instance_name}',
                        exc_info=True,
                    )

            await waitable_pool.call(unschedule_with_error_handling, self.app,
                                     record, instance_name, id, attempt_id)

        await waitable_pool.wait()

        log.info(f'cancelled {n_unscheduled} orphaned attempts')
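
Note that `unschedule_with_error_handling` receives `record`, `instance_name`, `id`, and `attempt_id` as parameters rather than closing over the loop variables. Python closures capture variables by reference, so a coroutine defined in a loop and run later by the pool would otherwise see every variable at its last-iteration value. A minimal illustration of the pitfall this argument-passing avoids:

    import asyncio

    async def main():
        tasks = []
        for i in range(3):
            async def late():      # closes over i by reference
                print(i)
            async def early(i=i):  # binds the current value, like passing it to pool.call
                print(i)
            tasks.append(asyncio.create_task(late()))
            tasks.append(asyncio.create_task(early()))
        await asyncio.gather(*tasks)  # late() prints 2 three times; early() prints 0, 1, 2

    asyncio.run(main())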
Example 4
    async def copy_as_dir(self, worker_pool: AsyncWorkerPool,
                          source_report: SourceReport):
        src = self.src
        if not src.endswith('/'):
            src = src + '/'

        try:
            srcentries = await self.router_fs.listfiles(src, recursive=True)
        except (NotADirectoryError, FileNotFoundError):
            self.src_is_dir = False
            await self.release_barrier()
            return

        self.src_is_dir = True
        await self.release_barrier_and_wait()

        if self.src_is_file:
            raise FileAndDirectoryError(self.src)

        source_report._source_type = AsyncFS.DIR

        full_dest, full_dest_type = await self._full_dest()
        if full_dest_type == AsyncFS.FILE:
            raise NotADirectoryError(full_dest)

        async with WaitableSharedPool(worker_pool) as pool:
            async for srcentry in srcentries:
                srcfile = srcentry.url_maybe_trailing_slash()
                assert srcfile.startswith(src)

                # skip files with empty names
                if srcfile.endswith('/'):
                    continue

                relsrcfile = srcfile[len(src):]
                assert not relsrcfile.startswith('/')

                await pool.call(self._copy_file, source_report, srcfile,
                                url_join(full_dest, relsrcfile))
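
Each copied file keeps its path relative to the listed directory: the slash-terminated source prefix is stripped and the remainder is joined onto the destination. A worked sketch of that path arithmetic (the URLs are hypothetical; `url_join` is assumed to be a slash-aware join):

    src = 'gs://bucket/data'           # hypothetical source directory URL
    if not src.endswith('/'):
        src = src + '/'                # 'gs://bucket/data/'

    srcfile = 'gs://bucket/data/a/b.txt'
    assert srcfile.startswith(src)

    relsrcfile = srcfile[len(src):]    # 'a/b.txt'
    assert not relsrcfile.startswith('/')

    full_dest = 'gs://other-bucket/dest'
    # url_join(full_dest, relsrcfile) -> 'gs://other-bucket/dest/a/b.txt'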
Example 5
    async def _copy_one_transfer(self, worker_pool: AsyncWorkerPool,
                                 transfer_report: TransferReport,
                                 transfer: Transfer):
        try:
            dest_type_task = asyncio.create_task(self._dest_type(transfer))
            dest_type_task_awaited = False

            try:
                src = transfer.src
                if isinstance(src, str):
                    await self.copy_source(worker_pool, transfer,
                                           transfer_report._source_report, src,
                                           dest_type_task)
                else:
                    if transfer.treat_dest_as == Transfer.TARGET_FILE:
                        raise NotADirectoryError(transfer.dest)

                    async with WaitableSharedPool(worker_pool) as pool:
                        for r, s in zip(transfer_report._source_report, src):
                            await pool.call(self.copy_source, worker_pool,
                                            transfer, r, s, dest_type_task)

                # raise potential exception
                dest_type_task_awaited = True
                await dest_type_task
            finally:
                if not dest_type_task_awaited:
                    # retrieve dest_type_task exception to avoid
                    # "Task exception was never retrieved" errors
                    try:
                        dest_type_task_awaited = True
                        await dest_type_task
                    except:
                        pass
        except Exception as e:
            transfer_report.set_exception(e)
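
The `finally` block exists because `dest_type_task` runs concurrently with the copies. If a copy raises first, the method would exit without ever awaiting the task, and asyncio would later log "Task exception was never retrieved". A generic sketch of the idiom:

    import asyncio

    async def with_background(main_coro, background_coro):
        task = asyncio.create_task(background_coro)
        awaited = False
        try:
            await main_coro
            awaited = True
            await task          # surface the background result or exception
        finally:
            if not awaited:
                try:
                    await task  # retrieve the exception so asyncio doesn't warn
                except:
                    pass        # the exception from main_coro is already propagating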
Example 6
File: pool.py  Project: saponas/hail
    async def schedule_loop_body(self):
        log.info(f'schedule {self.pool}: starting')
        start = time_msecs()
        n_scheduled = 0

        user_resources = await self.compute_fair_share()

        total = sum(resources['allocated_cores_mcpu']
                    for resources in user_resources.values())
        if not total:
            log.info(f'schedule {self.pool}: no allocated cores')
            should_wait = True
            return should_wait
        user_share = {
            user:
            max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
            for user, resources in user_resources.items()
        }

        async def user_runnable_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
                    (user, ),
                    timer_description=f'in schedule {self.pool}: get {user} running batches',
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                        (batch['id'], self.pool.name, remaining.value),
                        timer_description=f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (1)',
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record
                if not batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                            (batch['id'], self.pool.name, remaining.value),
                            timer_description=f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (2)',
                    ):
                        record['batch_id'] = batch['id']
                        record['userdata'] = batch['userdata']
                        record['user'] = batch['user']
                        record['format_version'] = batch['format_version']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        def get_instance(user, cores_mcpu):
            i = self.pool.healthy_instances_by_free_cores.bisect_key_left(
                cores_mcpu)
            if i < len(self.pool.healthy_instances_by_free_cores):
                # the first instance at or after i has enough free cores
                instance = self.pool.healthy_instances_by_free_cores[i]
                assert cores_mcpu <= instance.free_cores_mcpu
                return instance
            histogram = collections.defaultdict(int)
            for instance in self.pool.healthy_instances_by_free_cores:
                histogram[instance.free_cores_mcpu] += 1
            log.info(
                f'schedule {self.pool}: no viable instances for {cores_mcpu}: {histogram}'
            )
            return None

        should_wait = True
        for user, resources in user_resources.items():
            allocated_cores_mcpu = resources['allocated_cores_mcpu']
            if allocated_cores_mcpu == 0:
                continue

            scheduled_cores_mcpu = 0
            share = user_share[user]

            log.info(
                f'schedule {self.pool}: user-share: {user}: {allocated_cores_mcpu} {share}'
            )

            remaining = Box(share)
            async for record in user_runnable_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                attempt_id = secret_alnum_string(6)
                record['attempt_id'] = attempt_id

                if scheduled_cores_mcpu + record['cores_mcpu'] > allocated_cores_mcpu:
                    if random.random() > self.exceeded_shares_counter.rate():
                        self.exceeded_shares_counter.push(True)
                        self.scheduler_state_changed.set()
                        break
                    self.exceeded_shares_counter.push(False)

                instance = get_instance(user, record['cores_mcpu'])
                if instance:
                    instance.adjust_free_cores_in_memory(-record['cores_mcpu'])
                    scheduled_cores_mcpu += record['cores_mcpu']
                    n_scheduled += 1
                    should_wait = False

                    async def schedule_with_error_handling(
                            app, record, id, instance):
                        try:
                            await schedule_job(app, record, instance)
                        except Exception:
                            log.info(
                                f'scheduling job {id} on {instance} for {self.pool}',
                                exc_info=True)

                    await waitable_pool.call(schedule_with_error_handling,
                                             self.app, record, id, instance)

                remaining.value -= 1
                if remaining.value <= 0:
                    break

        await waitable_pool.wait()

        end = time_msecs()
        log.info(
            f'schedule: scheduled {n_scheduled} jobs in {end - start}ms for {self.pool}'
        )

        return should_wait
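
The share formula `max(int(300 * allocated_cores_mcpu / total + 0.5), 20)` divides a window of roughly 300 schedulable jobs among users in proportion to their allocated cores, rounding to nearest, while guaranteeing every active user a floor of 20 (so the shares can sum to more than 300). A quick worked example with made-up numbers:

    user_resources = {
        'alice': {'allocated_cores_mcpu': 7500},
        'bob':   {'allocated_cores_mcpu': 2000},
        'carol': {'allocated_cores_mcpu': 500},
    }
    total = 10000

    user_share = {
        user: max(int(300 * r['allocated_cores_mcpu'] / total + 0.5), 20)
        for user, r in user_resources.items()
    }
    # {'alice': 225, 'bob': 60, 'carol': 20} -- carol is lifted from 15 to the floor of 20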
Example 7
    async def create_instances_loop_body(self):
        log.info(f'create_instances for {self}: starting')
        start = time_msecs()
        n_instances_created = 0

        user_resources = await self.compute_fair_share()

        total = sum(resources['n_allocated_jobs']
                    for resources in user_resources.values())
        if not total:
            log.info(f'create_instances {self}: no allocated jobs')
            should_wait = True
            return should_wait
        user_share = {
            user: max(int(300 * resources['n_allocated_jobs'] / total + 0.5),
                      20)
            for user, resources in user_resources.items()
        }

        async def user_runnable_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
                    (user, ),
                    timer_description=f'in create_instances {self}: get {user} running batches',
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu, COALESCE(SUM(instances.state IS NOT NULL AND
  (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 1 AND jobs.inst_coll = %s
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                        (batch['id'], self.name, remaining.value),
                        timer_description=f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (1)',
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record
                if not batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu, COALESCE(SUM(instances.state IS NOT NULL AND
  (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 0 AND jobs.inst_coll = %s AND cancelled = 0
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                            (batch['id'], self.name, remaining.value),
                            timer_description=f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (2)',
                    ):
                        record['batch_id'] = batch['id']
                        record['userdata'] = batch['userdata']
                        record['user'] = batch['user']
                        record['format_version'] = batch['format_version']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True
        for user, resources in user_resources.items():
            n_allocated_instances = resources['n_allocated_jobs']
            if n_allocated_instances == 0:
                continue

            n_user_instances_created = 0

            share = user_share[user]

            log.info(f'create_instances {self}: user-share: {user}: {share}')

            remaining = Box(share)
            async for record in user_runnable_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                attempt_id = secret_alnum_string(6)
                record['attempt_id'] = attempt_id

                if n_user_instances_created >= n_allocated_instances:
                    if random.random() > self.exceeded_shares_counter.rate():
                        self.exceeded_shares_counter.push(True)
                        self.scheduler_state_changed.set()
                        break
                    self.exceeded_shares_counter.push(False)

                n_instances_created += 1
                n_user_instances_created += 1
                should_wait = False

                log.info(f'creating job private instance for job {id}')

                async def create_instance_with_error_handling(
                        batch_id, job_id, attempt_id, record, id):
                    try:
                        batch_format_version = BatchFormatVersion(
                            record['format_version'])
                        spec = json.loads(record['spec'])
                        machine_spec = batch_format_version.get_spec_machine_spec(
                            spec)
                        instance, resources = await self.create_instance(
                            batch_id, job_id, machine_spec)
                        await mark_job_creating(self.app, batch_id, job_id,
                                                attempt_id, instance,
                                                time_msecs(), resources)
                    except Exception:
                        log.info(f'creating job private instance for job {id}',
                                 exc_info=True)

                await waitable_pool.call(create_instance_with_error_handling,
                                         batch_id, job_id, attempt_id, record,
                                         id)

                remaining.value -= 1
                if remaining.value <= 0:
                    break

        await waitable_pool.wait()

        end = time_msecs()
        log.info(
            f'create_instances: created instances for {n_instances_created} jobs in {end - start}ms for {self}'
        )

        await asyncio.sleep(15)  # ensure we don't create instances faster than the GCE limit allows

        return should_wait
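
`remaining` is a `Box`: a one-field mutable holder shared between the consuming loop, which decrements it, and the `user_runnable_jobs` generator, which reads the live value to use as a SQL `LIMIT`. A bare int would not work, since rebinding an int in the loop would be invisible inside the generator. A minimal sketch of what `Box` presumably looks like (assumed to match `hailtop.utils.Box`):

    class Box:
        def __init__(self, value):
            self.value = value

    remaining = Box(75)
    remaining.value -= 1   # the change is visible to any code holding the same Box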
Example 8
    async def schedule_loop_body(self):
        log.info('schedule: starting')
        start = time_msecs()
        n_scheduled = 0

        user_resources = await self.compute_fair_share()

        total = sum(resources['allocated_cores_mcpu']
                    for resources in user_resources.values())
        if not total:
            log.info('schedule: no allocated cores')
            should_wait = True
            return should_wait
        user_share = {
            user:
            max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
            for user, resources in user_resources.items()
        }

        async def user_runnable_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''', (user, ),
                    timer_description=f'in schedule: get {user} running batches'
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1
LIMIT %s;
''', (batch['id'], remaining.value),
                        timer_description=f'in schedule: get {user} batch {batch["id"]} runnable jobs (1)'
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record
                if not batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND cancelled = 0
LIMIT %s;
''', (batch['id'], remaining.value),
                            timer_description=f'in schedule: get {user} batch {batch["id"]} runnable jobs (2)'
                    ):
                        record['batch_id'] = batch['id']
                        record['userdata'] = batch['userdata']
                        record['user'] = batch['user']
                        record['format_version'] = batch['format_version']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        def get_instance(user, cores_mcpu):
            i = self.inst_pool.healthy_instances_by_free_cores.bisect_key_left(
                cores_mcpu)
            while i < len(self.inst_pool.healthy_instances_by_free_cores):
                instance = self.inst_pool.healthy_instances_by_free_cores[i]
                assert cores_mcpu <= instance.free_cores_mcpu
                if user != 'ci' or instance.zone.startswith('us-central1'):
                    return instance
                i += 1
            return None

        should_wait = True
        for user, resources in user_resources.items():
            allocated_cores_mcpu = resources['allocated_cores_mcpu']
            if allocated_cores_mcpu == 0:
                continue

            scheduled_cores_mcpu = 0
            share = user_share[user]

            remaining = Box(share)
            async for record in user_runnable_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                attempt_id = ''.join([
                    secrets.choice('abcdefghijklmnopqrstuvwxyz0123456789')
                    for _ in range(6)
                ])
                record['attempt_id'] = attempt_id

                if scheduled_cores_mcpu + record['cores_mcpu'] > allocated_cores_mcpu:
                    break

                instance = get_instance(user, record['cores_mcpu'])
                if instance:
                    instance.adjust_free_cores_in_memory(-record['cores_mcpu'])
                    scheduled_cores_mcpu += record['cores_mcpu']
                    n_scheduled += 1
                    should_wait = False

                    async def schedule_with_error_handling(
                            app, record, id, instance):
                        try:
                            await schedule_job(app, record, instance)
                        except Exception:
                            log.info(f'scheduling job {id} on {instance}',
                                     exc_info=True)

                    await waitable_pool.call(schedule_with_error_handling,
                                             self.app, record, id, instance)

                remaining.value -= 1
                if remaining.value <= 0:
                    break

        await waitable_pool.wait()

        end = time_msecs()
        log.info(f'schedule: scheduled {n_scheduled} jobs in {end - start}ms')

        return should_wait
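
`healthy_instances_by_free_cores.bisect_key_left(cores_mcpu)` suggests a `sortedcontainers.SortedKeyList` keyed on free cores: it returns the first index whose key is at least `cores_mcpu`, so every instance from that index onward has room for the job (hence the assert). A sketch under that assumption:

    from sortedcontainers import SortedKeyList

    class Instance:
        def __init__(self, name, free_cores_mcpu):
            self.name = name
            self.free_cores_mcpu = free_cores_mcpu

    healthy = SortedKeyList(
        [Instance('a', 500), Instance('b', 2000), Instance('c', 16000)],
        key=lambda inst: inst.free_cores_mcpu)

    i = healthy.bisect_key_left(1000)  # first instance with >= 1000 free mcpu
    if i < len(healthy):
        print(healthy[i].name)         # 'b'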
Example 9
    async def cancel_cancelled_running_jobs_loop_body(self):
        records = self.db.select_and_fetchall(
            '''
SELECT user, n_cancelled_running_jobs
FROM (SELECT user,
    CAST(COALESCE(SUM(n_cancelled_running_jobs), 0) AS SIGNED) AS n_cancelled_running_jobs
  FROM user_resources
  GROUP BY user) AS t
WHERE n_cancelled_running_jobs > 0;
''',
            timer_description='in cancel_cancelled_running_jobs: aggregate n_cancelled_running_jobs'
        )
        user_n_cancelled_running_jobs = {
            record['user']: record['n_cancelled_running_jobs']
            async for record in records
        }

        total = sum(user_n_cancelled_running_jobs.values())
        if not total:
            should_wait = True
            return should_wait
        user_share = {
            user: max(int(300 * user_n_jobs / total + 0.5), 20)
            for user, user_n_jobs in user_n_cancelled_running_jobs.items()
        }

        async def user_cancelled_running_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT id
FROM batches
WHERE user = %s AND `state` = 'running' AND cancelled = 1;
''', (user, ),
                    timer_description=f'in cancel_cancelled_running_jobs: get {user} cancelled batches'
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT jobs.job_id, attempts.attempt_id, attempts.instance_name
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
STRAIGHT_JOIN attempts
  ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
WHERE jobs.batch_id = %s AND state = 'Running' AND always_run = 0 AND cancelled = 0
LIMIT %s;
''', (batch['id'], remaining.value),
                        timer_description=f'in cancel_cancelled_running_jobs: get {user} batch {batch["id"]} running cancelled jobs'
                ):
                    record['batch_id'] = batch['id']
                    yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True
        for user, share in user_share.items():
            remaining = Box(share)
            async for record in user_cancelled_running_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)

                async def unschedule_with_error_handling(
                        app, record, instance_name, id):
                    try:
                        await unschedule_job(app, record)
                    except Exception:
                        log.info(
                            f'unscheduling job {id} on instance {instance_name}',
                            exc_info=True)

                await waitable_pool.call(unschedule_with_error_handling,
                                         self.app, record,
                                         record['instance_name'], id)

                remaining.value -= 1
                if remaining.value <= 0:
                    should_wait = False
                    break

        await waitable_pool.wait()

        return should_wait
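
Each `*_loop_body` returns `should_wait`: `True` means the pass ran out of work and the caller can sleep until something changes; `False` means the pass was cut short by a share limit, so another pass should run immediately. A sketch of the driver loop this contract implies (the names here are assumptions, not shown in these examples):

    import asyncio

    async def run_loop(loop_body, state_changed: asyncio.Event):
        while True:
            should_wait = await loop_body()
            if should_wait:
                await state_changed.wait()  # sleep until new work is signalled
                state_changed.clear()
            # otherwise run again right away: the previous pass stopped early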
Example 10
    async def cancel_cancelled_ready_jobs_loop_body(self):
        records = self.db.select_and_fetchall(
            '''
SELECT user, n_cancelled_ready_jobs
FROM (SELECT user,
    CAST(COALESCE(SUM(n_cancelled_ready_jobs), 0) AS SIGNED) AS n_cancelled_ready_jobs
  FROM user_resources
  GROUP BY user) AS t
WHERE n_cancelled_ready_jobs > 0;
''',
            timer_description='in cancel_cancelled_ready_jobs: aggregate n_cancelled_ready_jobs')
        user_n_cancelled_ready_jobs = {
            record['user']: record['n_cancelled_ready_jobs']
            async for record in records
        }

        total = sum(user_n_cancelled_ready_jobs.values())
        if not total:
            should_wait = True
            return should_wait
        user_share = {
            user: max(int(300 * user_n_jobs / total + 0.5), 20)
            for user, user_n_jobs in user_n_cancelled_ready_jobs.items()
        }

        async def user_cancelled_ready_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT id, cancelled
FROM batches
WHERE user = %s AND `state` = 'running';
''', (user, ),
                    timer_description=f'in cancel_cancelled_ready_jobs: get {user} running batches'
            ):
                if batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT jobs.job_id
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0
LIMIT %s;
''', (batch['id'], remaining.value),
                            timer_description=f'in cancel_cancelled_ready_jobs: get {user} batch {batch["id"]} ready cancelled jobs (1)'
                    ):
                        record['batch_id'] = batch['id']
                        yield record
                else:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT jobs.job_id
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND cancelled = 1
LIMIT %s;
''', (batch['id'], remaining.value),
                            timer_description=f'in cancel_cancelled_ready_jobs: get {user} batch {batch["id"]} ready cancelled jobs (2)'
                    ):
                        record['batch_id'] = batch['id']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True
        for user, share in user_share.items():
            remaining = Box(share)
            async for record in user_cancelled_ready_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                log.info(f'cancelling job {id}')

                async def cancel_with_error_handling(app, batch_id, job_id,
                                                     id):
                    try:
                        await mark_job_complete(app, batch_id, job_id, None,
                                                None, 'Cancelled', None, None,
                                                None, 'cancelled')
                    except Exception:
                        log.info(f'error while cancelling job {id}',
                                 exc_info=True)

                await waitable_pool.call(cancel_with_error_handling, self.app,
                                         batch_id, job_id, id)

                remaining.value -= 1
                if remaining.value <= 0:
                    should_wait = False
                    break

        await waitable_pool.wait()

        return should_wait
Example 11
    async def cancel_cancelled_creating_jobs_loop_body(self):
        records = self.db.select_and_fetchall(
            '''
SELECT user, CAST(COALESCE(SUM(n_cancelled_creating_jobs), 0) AS SIGNED) AS n_cancelled_creating_jobs
FROM user_inst_coll_resources
GROUP BY user
HAVING n_cancelled_creating_jobs > 0;
''')
        user_n_cancelled_creating_jobs = {
            record['user']: record['n_cancelled_creating_jobs']
            async for record in records
        }

        total = sum(user_n_cancelled_creating_jobs.values())
        if total == 0:
            should_wait = True
            return should_wait
        user_share = {
            user: max(int(300 * user_n_jobs / total + 0.5), 20)
            for user, user_n_jobs in user_n_cancelled_creating_jobs.items()
        }

        async def user_cancelled_creating_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT batches.id
FROM batches
INNER JOIN batches_cancelled
        ON batches.id = batches_cancelled.id
WHERE user = %s AND `state` = 'running';
''',
                (user, ),
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT jobs.job_id, attempts.attempt_id, attempts.instance_name
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
STRAIGHT_JOIN attempts
  ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
WHERE jobs.batch_id = %s AND state = 'Creating' AND always_run = 0 AND cancelled = 0
LIMIT %s;
''',
                    (batch['id'], remaining.value),
                ):
                    record['batch_id'] = batch['id']
                    yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True
        for user, share in user_share.items():
            remaining = Box(share)
            async for record in user_cancelled_creating_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                attempt_id = record['attempt_id']
                instance_name = record['instance_name']
                id = (batch_id, job_id)

                async def cancel_with_error_handling(app, batch_id, job_id,
                                                     attempt_id, instance_name,
                                                     id):
                    try:
                        resources = []
                        end_time = time_msecs()
                        await mark_job_complete(
                            app,
                            batch_id,
                            job_id,
                            attempt_id,
                            instance_name,
                            'Cancelled',
                            None,
                            None,
                            end_time,
                            'cancelled',
                            resources,
                        )

                        instance = self.inst_coll_manager.get_instance(
                            instance_name)
                        if instance is None:
                            log.warning(
                                f'in cancel_cancelled_creating_jobs: unknown instance {instance_name}'
                            )
                            return

                        await instance.inst_coll.call_delete_instance(
                            instance, 'cancelled')

                    except Exception:
                        log.info(
                            f'cancelling creating job {id} on instance {instance_name}',
                            exc_info=True)

                await waitable_pool.call(cancel_with_error_handling, self.app,
                                         batch_id, job_id, attempt_id,
                                         instance_name, id)

                remaining.value -= 1
                if remaining.value <= 0:
                    should_wait = False
                    break

        await waitable_pool.wait()

        return should_wait
Example 12
    async def schedule_loop_body(self):
        if self.app['frozen']:
            log.info(
                f'not scheduling any jobs for {self.pool}; batch is frozen')
            return True

        log.info(f'schedule {self.pool}: starting')
        start = time_msecs()
        n_scheduled = 0

        user_resources = await self.compute_fair_share()

        total = sum(resources['allocated_cores_mcpu']
                    for resources in user_resources.values())
        if not total:
            log.info(f'schedule {self.pool}: no allocated cores')
            should_wait = True
            return should_wait
        user_share = {
            user:
            max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
            for user, resources in user_resources.items()
        }

        async def user_runnable_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT batches.id, batches_cancelled.id IS NOT NULL AS cancelled, userdata, user, format_version
FROM batches
LEFT JOIN batches_cancelled
       ON batches.id = batches_cancelled.id
WHERE user = %s AND `state` = 'running';
''',
                (user, ),
                    "user_runnable_jobs__select_running_batches",
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                    (batch['id'], self.pool.name, remaining.value),
                        "user_runnable_jobs__select_ready_always_run_jobs",
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record
                if not batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                        (batch['id'], self.pool.name, remaining.value),
                            "user_runnable_jobs__select_ready_jobs_batch_not_cancelled",
                    ):
                        record['batch_id'] = batch['id']
                        record['userdata'] = batch['userdata']
                        record['user'] = batch['user']
                        record['format_version'] = batch['format_version']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True
        for user, resources in user_resources.items():
            allocated_cores_mcpu = resources['allocated_cores_mcpu']
            if allocated_cores_mcpu == 0:
                continue

            scheduled_cores_mcpu = 0
            share = user_share[user]

            remaining = Box(share)
            async for record in user_runnable_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                attempt_id = secret_alnum_string(6)
                record['attempt_id'] = attempt_id

                if scheduled_cores_mcpu + record['cores_mcpu'] > allocated_cores_mcpu:
                    if random.random() > self.exceeded_shares_counter.rate():
                        self.exceeded_shares_counter.push(True)
                        self.scheduler_state_changed.set()
                        break
                    self.exceeded_shares_counter.push(False)

                instance = self.pool.get_instance(user, record['cores_mcpu'])
                if instance:
                    instance.adjust_free_cores_in_memory(-record['cores_mcpu'])
                    scheduled_cores_mcpu += record['cores_mcpu']
                    n_scheduled += 1

                    async def schedule_with_error_handling(
                            app, record, id, instance):
                        try:
                            await schedule_job(app, record, instance)
                        except Exception:
                            log.info(
                                f'scheduling job {id} on {instance} for {self.pool}',
                                exc_info=True)

                    await waitable_pool.call(schedule_with_error_handling,
                                             self.app, record, id, instance)

                remaining.value -= 1
                if remaining.value <= 0:
                    should_wait = False
                    break

        await waitable_pool.wait()

        end = time_msecs()
        log.info(
            f'schedule: attempted to schedule {n_scheduled} jobs in {end - start}ms for {self.pool}'
        )

        return should_wait
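
When a job would push a user past their allocated cores, the scheduler breaks out only with probability tied to `self.exceeded_shares_counter.rate()`, recording each outcome with `push(...)` and re-arming `scheduler_state_changed` so another pass follows. The counter's implementation isn't shown on this page; a plausible sketch is an exponentially weighted moving average over recent outcomes:

    class ExceededSharesCounter:
        """Hypothetical EWMA over recent 'share exceeded' outcomes."""

        def __init__(self, alpha=0.1):
            self._alpha = alpha
            self._rate = 0.0

        def push(self, exceeded: bool):
            sample = 1.0 if exceeded else 0.0
            self._rate = self._alpha * sample + (1.0 - self._alpha) * self._rate

        def rate(self) -> float:
            return self._rate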