Example #1
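Attach a persistent disk to a Compute Engine instance as an auto-delete device, timing the attachDisk call with LoggingTimer.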
    async def _attach(self):
        async with LoggingTimer(
                f'attaching disk {self.name} to {self.instance_name}'):
            config = {
                'source':
                f'/compute/v1/projects/{self.project}/zones/{self.zone}/disks/{self.name}',
                'autoDelete': True,
                'deviceName': self.name,
            }

            await self.compute_client.attach_disk(
                f'/zones/{self.zone}/instances/{self.instance_name}/attachDisk',
                json=config)
            self._attached = True
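
Every example on this page times its work with LoggingTimer, which is not itself shown here. As a rough guide to how these snippets behave, here is a minimal sketch of an async context manager with the same interface (including the step() sub-timers used in Example #6 below); the names and log format are assumptions, not hail's actual implementation:

import logging
import time
from contextlib import asynccontextmanager

log = logging.getLogger(__name__)


class LoggingTimer:
    # Hypothetical sketch: log how long the body of the `async with` block took.
    def __init__(self, description):
        self.description = description
        self.start_time = None

    @asynccontextmanager
    async def step(self, name):
        # Time a named sub-step inside the outer timer.
        start = time.time()
        try:
            yield
        finally:
            log.info(
                f'{self.description}: step {name} took {time.time() - start:.3f}s')

    async def __aenter__(self):
        self.start_time = time.time()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        log.info(f'{self.description} took {time.time() - self.start_time:.3f}s')
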
Example #2
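An async generator that executes a SQL statement and streams the result rows back, fetching them 100 at a time; when timer_description is given, the execute and each fetchmany are wrapped in their own LoggingTimer.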
    async def execute_and_fetchall(self,
                                    sql,
                                    args=None,
                                    timer_description=None):
        assert self.conn
        async with self.conn.cursor() as cursor:
            # Execute the statement, optionally timing it.
            if timer_description is None:
                await cursor.execute(sql, args)
            else:
                async with LoggingTimer(
                        f'{timer_description}: execute_and_fetchall: execute'):
                    await cursor.execute(sql, args)
            # Stream the result set back in batches of 100 rows.
            while True:
                if timer_description is None:
                    rows = await cursor.fetchmany(100)
                else:
                    async with LoggingTimer(
                            f'{timer_description}: execute_and_fetchall: fetchmany'
                    ):
                        rows = await cursor.fetchmany(100)
                if not rows:
                    break
                for row in rows:
                    yield row
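
Since execute_and_fetchall is an async generator, callers consume it with async for. A hypothetical call site on the same class (the table, columns and timer description here are invented for illustration) might look like:

    async def count_open_batches(self, user):
        # Hypothetical helper: stream rows out of execute_and_fetchall and
        # count them without materializing the whole result set.
        n = 0
        async for _ in self.execute_and_fetchall(
                'SELECT id FROM batches WHERE user = %s AND `state` = %s;',
                (user, 'open'),
                timer_description=f'count open batches for {user}'):
            n += 1
        return n
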
Example #3
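Create a pd-ssd disk of the requested size, with optional labels, in the disk's zone, timing the create_disk call.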
    async def _create(self, labels=None):
        async with LoggingTimer(f'creating disk {self.name}'):
            if labels is None:
                labels = {}

            config = {
                'name': self.name,
                'sizeGb': f'{self.size_in_gb}',
                'type': f'zones/{self.zone}/diskTypes/pd-ssd',
                'labels': labels,
            }

            await self.compute_client.create_disk(f'/zones/{self.zone}/disks',
                                                  json=config)
            self._created = True
Example #4
File: disk.py Project: saponas/hail
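Delete the disk, timing the delete_disk call.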
    async def _delete(self):
        async with LoggingTimer(f'deleting disk {self.name}'):
            await self.compute_client.delete_disk(
                f'/zones/{self.zone}/disks/{self.name}')
Example #5
File: disk.py Project: saponas/hail
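Detach the disk from its instance by device name, timing the detachDisk call.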
    async def _detach(self):
        async with LoggingTimer(
                f'detaching disk {self.name} from {self.instance_name}'):
            await self.compute_client.detach_disk(
                f'/zones/{self.zone}/instances/{self.instance_name}/detachDisk',
                params={'deviceName': self.name})
Example #6
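A batch front-end request handler that validates submitted job specs, fills in default resources, computes each job's scheduling state and core request, and inserts the jobs plus their parent, attribute and staging rows in one transaction. The top-level LoggingTimer and its named steps time each phase: fetching the batch record, reading and validating the request JSON, building the database arguments, writing the full specs to GCS when the format version calls for it, and inserting the rows.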
async def create_jobs(request, userdata):
    app = request.app
    db = app['db']
    log_store = app['log_store']

    worker_type = app['worker_type']
    worker_cores = app['worker_cores']

    batch_id = int(request.match_info['batch_id'])

    user = userdata['username']
    # restrict to what's necessary; in particular, drop the session
    # which is sensitive
    userdata = {
        'username': user,
        'bucket_name': userdata['bucket_name'],
        'gsa_key_secret_name': userdata['gsa_key_secret_name'],
        'tokens_secret_name': userdata['tokens_secret_name']
    }

    async with LoggingTimer(f'batch {batch_id} create jobs') as timer:
        async with timer.step('fetch batch'):
            record = await db.select_and_fetchone(
                '''
SELECT `state`, format_version FROM batches
WHERE user = %s AND id = %s AND NOT deleted;
''', (user, batch_id))

        if not record:
            raise web.HTTPNotFound()
        if record['state'] != 'open':
            raise web.HTTPBadRequest(reason=f'batch {batch_id} is not open')
        batch_format_version = BatchFormatVersion(record['format_version'])

        async with timer.step('get request json'):
            job_specs = await request.json()

        async with timer.step('validate job_specs'):
            try:
                validate_jobs(job_specs)
            except ValidationError as e:
                raise web.HTTPBadRequest(reason=e.reason)

        async with timer.step('build db args'):
            spec_writer = SpecWriter(log_store, batch_id)

            jobs_args = []
            job_parents_args = []
            job_attributes_args = []

            n_ready_jobs = 0
            ready_cores_mcpu = 0
            n_ready_cancellable_jobs = 0
            ready_cancellable_cores_mcpu = 0

            prev_job_idx = None
            start_job_id = None

            for spec in job_specs:
                job_id = spec['job_id']
                parent_ids = spec.pop('parent_ids', [])
                always_run = spec.pop('always_run', False)

                if batch_format_version.has_full_spec_in_gcs():
                    attributes = spec.pop('attributes', None)
                else:
                    attributes = spec.get('attributes')

                id = (batch_id, job_id)

                if start_job_id is None:
                    start_job_id = job_id

                if batch_format_version.has_full_spec_in_gcs() and prev_job_idx:
                    if job_id != prev_job_idx + 1:
                        raise web.HTTPBadRequest(
                            reason=
                            f'noncontiguous job ids found in the spec: {prev_job_idx} -> {job_id}'
                        )
                prev_job_idx = job_id

                resources = spec.get('resources')
                if not resources:
                    resources = {}
                    spec['resources'] = resources
                if 'cpu' not in resources:
                    resources['cpu'] = BATCH_JOB_DEFAULT_CPU
                if 'memory' not in resources:
                    resources['memory'] = BATCH_JOB_DEFAULT_MEMORY

                req_cores_mcpu = parse_cpu_in_mcpu(resources['cpu'])
                req_memory_bytes = parse_memory_in_bytes(resources['memory'])

                if req_cores_mcpu == 0:
                    raise web.HTTPBadRequest(
                        reason=f'bad resource request for job {id}: '
                        f'cpu cannot be 0')

                cores_mcpu = adjust_cores_for_memory_request(
                    req_cores_mcpu, req_memory_bytes, worker_type)

                if cores_mcpu > worker_cores * 1000:
                    total_memory_available = worker_memory_per_core_gb(
                        worker_type) * worker_cores
                    raise web.HTTPBadRequest(
                        reason=
                        f'resource requests for job {id} are unsatisfiable: '
                        f'requested: cpu={resources["cpu"]}, memory={resources["memory"]} '
                        f'maximum: cpu={worker_cores}, memory={total_memory_available}G'
                    )

                secrets = spec.get('secrets')
                if not secrets:
                    secrets = []
                    spec['secrets'] = secrets
                secrets.append({
                    'namespace': BATCH_PODS_NAMESPACE,
                    'name': userdata['gsa_key_secret_name'],
                    'mount_path': '/gsa-key',
                    'mount_in_copy': True
                })

                env = spec.get('env')
                if not env:
                    env = []
                    spec['env'] = env

                if len(parent_ids) == 0:
                    state = 'Ready'
                    n_ready_jobs += 1
                    ready_cores_mcpu += cores_mcpu
                    if not always_run:
                        n_ready_cancellable_jobs += 1
                        ready_cancellable_cores_mcpu += cores_mcpu
                else:
                    state = 'Pending'

                spec_writer.add(json.dumps(spec))
                db_spec = batch_format_version.db_spec(spec)

                jobs_args.append((batch_id, job_id, state, json.dumps(db_spec),
                                  always_run, cores_mcpu, len(parent_ids)))

                for parent_id in parent_ids:
                    job_parents_args.append((batch_id, job_id, parent_id))

                if attributes:
                    for k, v in attributes.items():
                        job_attributes_args.append((batch_id, job_id, k, v))

        if batch_format_version.has_full_spec_in_gcs():
            async with timer.step('write spec to gcs'):
                await spec_writer.write()

        rand_token = random.randint(0, app['n_tokens'] - 1)
        n_jobs = len(job_specs)

        async with timer.step('insert jobs'):

            @transaction(db)
            async def insert(tx):
                try:
                    await tx.execute_many(
                        '''
INSERT INTO jobs (batch_id, job_id, state, spec, always_run, cores_mcpu, n_pending_parents)
VALUES (%s, %s, %s, %s, %s, %s, %s);
''', jobs_args)
                except pymysql.err.IntegrityError as err:
                    # 1062 ER_DUP_ENTRY https://dev.mysql.com/doc/refman/5.7/en/server-error-reference.html#error_er_dup_entry
                    if err.args[0] == 1062:
                        log.info(
                            f'bunch containing job {(batch_id, jobs_args[0][1])} already inserted ({err})'
                        )
                        raise web.Response()
                    raise
                await tx.execute_many(
                    '''
INSERT INTO `job_parents` (batch_id, job_id, parent_id)
VALUES (%s, %s, %s);
''', job_parents_args)
                await tx.execute_many(
                    '''
INSERT INTO `job_attributes` (batch_id, job_id, `key`, `value`)
VALUES (%s, %s, %s, %s);
''', job_attributes_args)
                await tx.execute_update(
                    '''
INSERT INTO batches_staging (batch_id, token, n_jobs, n_ready_jobs, ready_cores_mcpu)
VALUES (%s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
  n_jobs = n_jobs + %s,
  n_ready_jobs = n_ready_jobs + %s,
  ready_cores_mcpu = ready_cores_mcpu + %s;
''', (batch_id, rand_token, n_jobs, n_ready_jobs, ready_cores_mcpu, n_jobs,
                n_ready_jobs, ready_cores_mcpu))
                await tx.execute_update(
                    '''
INSERT INTO batch_cancellable_resources (batch_id, token, n_ready_cancellable_jobs, ready_cancellable_cores_mcpu)
VALUES (%s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
  n_ready_cancellable_jobs = n_ready_cancellable_jobs + %s,
  ready_cancellable_cores_mcpu = ready_cancellable_cores_mcpu + %s;
''', (batch_id, rand_token, n_ready_cancellable_jobs,
                ready_cancellable_cores_mcpu, n_ready_cancellable_jobs,
                ready_cancellable_cores_mcpu))

                if batch_format_version.has_full_spec_in_gcs():
                    await tx.execute_update(
                        '''
INSERT INTO batch_bunches (batch_id, token, start_job_id)
VALUES (%s, %s, %s);
''', (batch_id, spec_writer.token, start_job_id))

            await insert()  # pylint: disable=no-value-for-parameter
    return web.Response()