Esempio n. 1
0
 async def __aenter__(self):
     """Create the HTTP session for this object and return ``self``.

     The session is built by ``ssl_client_session`` with
     ``raise_for_status=True``, a 60 second total timeout, and the headers
     returned by ``service_auth_headers`` for the ``'ci'`` service.
     """
     auth_headers = service_auth_headers(self._deploy_config, 'ci')
     total_timeout = aiohttp.ClientTimeout(total=60)
     self._session = ssl_client_session(raise_for_status=True,
                                        timeout=total_timeout,
                                        headers=auth_headers)
     return self
Esempio n. 2
0
    async def activate(self):
        """Activate this instance with the batch driver and set up log storage.

        POSTs the ``IP_ADDRESS`` environment variable to the driver's
        ``/api/v1alpha/instances/activate`` endpoint, authenticating with the
        ``ACTIVATION_TOKEN`` environment variable.  From the JSON reply:

        * ``token`` becomes the bearer token stored in ``self.headers``;
        * ``key`` is written to ``key.json`` and loaded back as the service
          account credentials handed to ``LogStore``.

        Sets ``self.headers`` and ``self.log_store``.
        """
        async with ssl_client_session(
                raise_for_status=True,
                timeout=aiohttp.ClientTimeout(total=60)) as http:
            activate_url = deploy_config.url(
                'batch-driver', '/api/v1alpha/instances/activate')
            payload = {'ip_address': os.environ['IP_ADDRESS']}
            activation_headers = {
                'X-Hail-Instance-Name': NAME,
                'Authorization': f'Bearer {os.environ["ACTIVATION_TOKEN"]}',
            }
            response = await request_retry_transient_errors(
                http,
                'POST',
                activate_url,
                json=payload,
                headers=activation_headers)
            activation = await response.json()

            # Every later request to the driver uses the token it just issued.
            self.headers = {
                'X-Hail-Instance-Name': NAME,
                'Authorization': f'Bearer {activation["token"]}',
            }

            # The credentials loader below reads from a file, so the key is
            # persisted to disk first.
            key_path = 'key.json'
            with open(key_path, 'w') as key_file:
                key_file.write(json.dumps(activation['key']))

            sa_credentials = google.oauth2.service_account.Credentials.from_service_account_file(
                key_path)
            self.log_store = LogStore(BATCH_LOGS_BUCKET_NAME,
                                      WORKER_LOGS_BUCKET_NAME,
                                      INSTANCE_ID,
                                      self.pool,
                                      project=PROJECT,
                                      credentials=sa_credentials)
Esempio n. 3
0
    async def __init__(self,
                       billing_project,
                       deploy_config=None,
                       session=None,
                       headers=None,
                       _token=None):
        """Initialise the client for ``billing_project``.

        Args:
            billing_project: stored as ``self.billing_project``.
            deploy_config: deploy configuration; when falsy,
                ``get_deploy_config()`` is used.
            session: HTTP session to use; when ``None`` a new one is created
                with ``raise_for_status=True`` and a 60 second total timeout.
            headers: optional extra headers copied into the request headers.
            _token: optional bearer token.  When falsy, the headers from
                ``service_auth_headers(deploy_config, 'batch')`` are used
                instead.

        NOTE(review): ``__init__`` is declared ``async``, so calling it
        yields a coroutine that has to be awaited.  The enclosing class is
        not visible here -- confirm how instances are meant to be built.
        """
        self.billing_project = billing_project

        deploy_config = deploy_config or get_deploy_config()
        self.url = deploy_config.base_url('batch')

        if session is not None:
            self._session = session
        else:
            self._session = ssl_client_session(
                raise_for_status=True, timeout=aiohttp.ClientTimeout(total=60))

        user = await async_get_userinfo(deploy_config)
        self.bucket = user['bucket_name']

        # Caller-supplied headers go in first, so the auth entries added
        # below win on any key collision.
        request_headers = dict(headers) if headers else {}
        if _token:
            request_headers['Authorization'] = f'Bearer {_token}'
        else:
            request_headers.update(service_auth_headers(deploy_config, 'batch'))
        self._headers = request_headers
Esempio n. 4
0
async def test_deploy():
    """Poll CI's deploy status until a state is reported, then assert success.

    The status endpoint is queried every 5 seconds.  Each reply must contain
    exactly one entry.  Polling stops once that entry's ``deploy_state`` is
    not ``None``.  The whole wait is bounded to 30 minutes by
    ``asyncio.wait_for``.
    """
    deploy_config = get_deploy_config()
    status_url = deploy_config.url('ci', '/api/v1alpha/deploy_status')
    auth_headers = service_auth_headers(deploy_config, 'ci')

    async with ssl_client_session(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60)) as session:

        async def poll_until_state_known():
            # Unbounded on its own; the caller below applies the time limit.
            state = None
            failure = None
            while state is None:
                response = await utils.request_retry_transient_errors(
                    session, 'GET', f'{status_url}', headers=auth_headers)
                statuses = await response.json()
                pretty = json.dumps(statuses, indent=2)
                log.info(f'deploy_statuses:\n{pretty}')
                assert len(statuses) == 1, statuses
                current = statuses[0]
                state = current['deploy_state']
                failure = current.get('failure_information')
                await asyncio.sleep(5)
            log.info(f'returning {current} {failure}')
            return state, failure

        thirty_minutes = 30 * 60
        deploy_state, failure_information = await asyncio.wait_for(
            poll_until_state_known(), timeout=thirty_minutes)
        assert deploy_state == 'success', str(failure_information)
Esempio n. 5
0
async def async_main(args):
    """Run ``auth_flow`` against the auth service in the selected namespace.

    When ``args.namespace`` is truthy, the deploy config is rebound so the
    ``'auth'`` service points at that namespace.  Otherwise the namespace the
    deploy config already reports for ``'auth'`` is used.
    """
    deploy_config = get_deploy_config()
    auth_ns = args.namespace
    if auth_ns:
        deploy_config = deploy_config.with_service('auth', auth_ns)
    else:
        auth_ns = deploy_config.service_ns('auth')

    ns_headers = namespace_auth_headers(deploy_config, auth_ns, authorize_target=False)
    client_timeout = aiohttp.ClientTimeout(total=60)
    async with ssl_client_session(raise_for_status=True,
                                  timeout=client_timeout,
                                  headers=ns_headers) as session:
        await auth_flow(deploy_config, auth_ns, session)
Esempio n. 6
0
    async def run(self):
        """Serve the worker HTTP API, activate, wait until idle, then deactivate.

        Steps:
          1. Start an aiohttp site on ``0.0.0.0:5000`` exposing the job and
             healthcheck routes.
          2. Call ``self.activate()``.
          3. Loop, logging every 15 seconds, while there are jobs or the time
             since ``self.last_updated`` is below ``MAX_IDLE_TIME_MSECS``.
          4. POST a single, unretried deactivate request to the batch driver.

        The site and app runner are always stopped and cleaned up in the
        ``finally`` clause if they were created.
        """
        runner = None
        tcp_site = None
        try:
            app = web.Application(client_max_size=HTTP_CLIENT_MAX_SIZE)
            job_prefix = '/api/v1alpha/batches/{batch_id}/jobs/{job_id}'
            routes = [
                web.post('/api/v1alpha/batches/jobs/create', self.create_job),
                web.delete(f'{job_prefix}/delete', self.delete_job),
                web.get(f'{job_prefix}/log', self.get_job_log),
                web.get(f'{job_prefix}/status', self.get_job_status),
                web.get('/healthcheck', self.healthcheck),
            ]
            app.add_routes(routes)

            runner = web.AppRunner(app)
            await runner.setup()
            tcp_site = web.TCPSite(runner, '0.0.0.0', 5000)
            await tcp_site.start()

            await self.activate()

            while True:
                idle_duration = time_msecs() - self.last_updated
                if not (self.jobs or idle_duration < MAX_IDLE_TIME_MSECS):
                    break
                log.info(
                    f'n_jobs {len(self.jobs)} free_cores {self.cpu_sem.value / 1000} idle {idle_duration}'
                )
                await asyncio.sleep(15)

            # NOTE(review): idle_duration comes from time_msecs(), so the
            # "seconds" wording below looks mislabelled -- confirm before
            # changing the message.
            log.info(f'idle {idle_duration} seconds, exiting')

            async with ssl_client_session(
                    raise_for_status=True,
                    timeout=aiohttp.ClientTimeout(total=60)) as session:
                # Don't retry.  If it doesn't go through, the driver
                # monitoring loops will recover.  If the driver is
                # gone (e.g. testing a PR), this would go into an
                # infinite loop and the instance won't be deleted.
                deactivate_url = deploy_config.url(
                    'batch-driver', '/api/v1alpha/instances/deactivate')
                await session.post(deactivate_url, headers=self.headers)
            log.info('deactivated')
        finally:
            log.info('shutting down')
            if tcp_site:
                await tcp_site.stop()
                log.info('stopped site')
            if runner:
                await runner.cleanup()
                log.info('cleaned up app runner')
Esempio n. 7
0
async def async_get_userinfo(deploy_config=None, headers=None):
    """Fetch ``/api/v1alpha/userinfo`` from the auth service as parsed JSON.

    Args:
        deploy_config: deploy configuration; defaults to
            ``get_deploy_config()`` when ``None``.
        headers: request headers; defaults to
            ``service_auth_headers(deploy_config, 'auth')`` when ``None``.

    Returns:
        The decoded JSON body of the response.
    """
    deploy_config = get_deploy_config() if deploy_config is None else deploy_config
    if headers is None:
        headers = service_auth_headers(deploy_config, 'auth')

    endpoint = deploy_config.url('auth', '/api/v1alpha/userinfo')
    short_timeout = aiohttp.ClientTimeout(total=5)
    async with ssl_client_session(raise_for_status=True,
                                  timeout=short_timeout) as http:
        response = await request_retry_transient_errors(
            http, 'GET', endpoint, headers=headers)
        return await response.json()
Esempio n. 8
0
async def _userdata_from_session_id(session_id):
    """Look up the user info belonging to ``session_id`` via the auth service.

    Returns:
        The decoded JSON userinfo, or ``None`` when the auth service answers
        with HTTP 401.

    Raises:
        web.HTTPInternalServerError: for any other ``Exception`` raised
            during the lookup (the original error is logged first).
    """
    bearer = {'Authorization': f'Bearer {session_id}'}
    try:
        async with ssl_client_session(
                raise_for_status=True, timeout=aiohttp.ClientTimeout(total=5)) as http:
            userinfo_url = deploy_config.url('auth', '/api/v1alpha/userinfo')
            response = await request_retry_transient_errors(
                http, 'GET', userinfo_url, headers=bearer)
            assert response.status == 200
            return await response.json()
    except Exception as err:  # pylint: disable=broad-except
        # A 401 only means the session id is not valid; every other failure
        # is reported to the caller as an internal server error.
        if isinstance(err, aiohttp.ClientResponseError) and err.status == 401:
            return None

        log.exception('unknown exception getting userinfo')
        raise web.HTTPInternalServerError()
Esempio n. 9
0
    async def post_job_started_1(self, job):
        """Tell the batch driver that ``job`` has started.

        Reads the job's full status, keeps only the fields listed below, and
        POSTs them under the ``'status'`` key to the driver's ``job_started``
        endpoint using ``self.headers``.
        """
        full_status = await job.status()

        reported_fields = (
            'version',
            'batch_id',
            'job_id',
            'attempt_id',
            'start_time',
            'resources',
        )
        body = {
            'status': {field: full_status[field] for field in reported_fields}
        }

        short_timeout = aiohttp.ClientTimeout(total=5)
        async with ssl_client_session(raise_for_status=True,
                                      timeout=short_timeout) as http:
            job_started_url = deploy_config.url(
                'batch-driver', '/api/v1alpha/instances/job_started')
            await request_retry_transient_errors(
                http, 'POST', job_started_url,
                json=body, headers=self.headers)
Esempio n. 10
0
async def notebook_status_from_notebook(k8s, service, headers, cookies,
                                        notebook):
    """Return the notebook's status, promoting 'Initializing' to 'Ready'.

    The base status comes from ``k8s_notebook_status_from_notebook``.  If it
    reports ``'Initializing'``, the state becomes ``'Ready'`` when either:

    * the ``notebook`` record already says ``'Ready'``, or
    * a GET on the notebook's external instance URL returns a 2xx status
      within a 1 second total timeout.

    Args:
        k8s: passed through to ``k8s_notebook_status_from_notebook``.
        service: service name used to build the external readiness URL.
        headers: headers sent with the readiness probe.
        cookies: cookies sent with the readiness probe.
        notebook: mapping with at least ``state``, ``pod_name``,
            ``notebook_token`` and ``jupyter_token``.

    Returns:
        The (possibly updated) status dict, or ``None`` when no status is
        available.
    """
    status = await k8s_notebook_status_from_notebook(k8s, notebook)
    if not status:
        return None

    if status['state'] != 'Initializing':
        return status

    if notebook['state'] == 'Ready':
        status['state'] = 'Ready'
        return status

    pod_name = notebook['pod_name']

    # don't have dev credentials to connect through internal.hail.is
    ready_url = deploy_config.external_url(
        service,
        f'/instance/{notebook["notebook_token"]}/?token={notebook["jupyter_token"]}'
    )
    try:
        async with ssl_client_session(
                timeout=aiohttp.ClientTimeout(total=1),
                headers=headers,
                cookies=cookies) as session:
            async with session.get(ready_url) as resp:
                if 200 <= resp.status < 300:
                    log.info(
                        f'GET on jupyter pod {pod_name} succeeded: {resp}'
                    )
                    status['state'] = 'Ready'
                else:
                    log.info(
                        f'GET on jupyter pod {pod_name} failed: {resp}'
                    )
    except aiohttp.ServerTimeoutError:
        # `resp` is only bound once session.get() has produced a response.
        # On a timeout it does not exist, so it must not appear in the
        # message; referencing it here raised UnboundLocalError.
        # NOTE(review): aiohttp may report an expired *total* timeout as
        # asyncio.TimeoutError rather than ServerTimeoutError -- confirm
        # whether that case should be handled here as well.
        log.exception(f'GET on jupyter pod {pod_name} timed out')

    return status
Esempio n. 11
0
 def __init__(self, should_fail):
     """Store ``should_fail`` and create the real underlying HTTP session.

     The session is built with ``raise_for_status=True`` and a 60 second
     total timeout.
     """
     self.should_fail = should_fail
     one_minute = aiohttp.ClientTimeout(total=60)
     self.real_session = ssl_client_session(raise_for_status=True,
                                            timeout=one_minute)
Esempio n. 12
0
    async def post_job_complete_1(self, job):
        """Report ``job`` as complete to the batch driver, retrying until it lands.

        Steps:
          1. Fetch the job's full status through ``retry_all_errors``.
          2. If the job's format version keeps the full status in GCS, write
             the JSON status file through ``self.log_store``.
          3. POST a summary to the driver's ``job_complete`` endpoint in a
             loop, sleeping with a jittered, doubling delay between attempts.

        Returns:
            ``None`` once a POST succeeds.

        Raises:
            asyncio.CancelledError: re-raised immediately, never retried.
            aiohttp.ClientResponseError: re-raised when the driver responds
                with HTTP 404; every other exception is logged and retried.

        Side effects:
            Once more than 180 * 1000 (per ``time_msecs``) and more than half
            of ``run_duration`` have elapsed, the job is removed from
            ``self.jobs`` and ``self.last_updated`` is refreshed.  Retrying
            continues afterwards.
        """
        # presumably in the same units as time_msecs(), since it is compared
        # with `elapsed` below -- TODO confirm
        run_duration = job.end_time - job.start_time

        full_status = await retry_all_errors(
            f'error while getting status for {job}')(job.status)

        if job.format_version.has_full_status_in_gcs():
            await retry_all_errors(
                f'error while writing status file to gcs for {job}')(
                    self.log_store.write_status_file, job.batch_id, job.job_id,
                    job.attempt_id, json.dumps(full_status))

        # The format version decides which form of the status is sent to the
        # driver under the 'status' key.
        db_status = job.format_version.db_status(full_status)

        status = {
            'version': full_status['version'],
            'batch_id': full_status['batch_id'],
            'job_id': full_status['job_id'],
            'attempt_id': full_status['attempt_id'],
            'state': full_status['state'],
            'start_time': full_status['start_time'],
            'end_time': full_status['end_time'],
            'status': db_status
        }

        body = {'status': status}

        start_time = time_msecs()
        delay_secs = 0.1
        while True:
            try:
                # A fresh session with a 5 second total timeout per attempt.
                async with ssl_client_session(
                        raise_for_status=True,
                        timeout=aiohttp.ClientTimeout(total=5)) as session:
                    await session.post(deploy_config.url(
                        'batch-driver', '/api/v1alpha/instances/job_complete'),
                                       json=body,
                                       headers=self.headers)
                    return
            except asyncio.CancelledError:  # pylint: disable=try-except-raise
                # Cancellation must propagate instead of being retried.
                raise
            except Exception as e:
                # A 404 from the driver is not retried; it is re-raised to
                # the caller.
                if isinstance(e,
                              aiohttp.ClientResponseError) and e.status == 404:  # pylint: disable=no-member
                    raise
                log.exception(f'failed to mark {job} complete, retrying')

            # unlist the job once BOTH 3 minutes and half the run duration
            # have elapsed
            now = time_msecs()
            elapsed = now - start_time
            if (job.id in self.jobs and elapsed > 180 * 1000
                    and elapsed > run_duration / 2):
                log.info(
                    f'too much time elapsed marking {job} complete, removing from jobs, will keep retrying'
                )
                del self.jobs[job.id]
                self.last_updated = time_msecs()

            # Jitter the sleep by +/-30% around the current delay.
            await asyncio.sleep(delay_secs * random.uniform(0.7, 1.3))
            # exponentially back off, up to (expected) max of 2m
            delay_secs = min(delay_secs * 2, 2 * 60.0)
Esempio n. 13
0
async def run(args, i):
    """Drive one simulated workshop user through a full notebook lifecycle.

    Loads the workshop home page, logs in with ``args.workshop`` and
    ``args.password``, creates a notebook, waits on the ``/notebook/wait``
    websocket (at most 5 connections) for a ``'1'`` message, then deletes
    the notebook.

    Args:
        args: object providing ``workshop`` and ``password``.
        i: identifier of this run, used only in log messages.

    Returns:
        ``(duration, ready)``: seconds spent waiting for readiness and
        whether a ready message was seen.
    """
    auth_headers = service_auth_headers(deploy_config,
                                        'workshop',
                                        authorize_target=False)

    async with ssl_client_session(raise_for_status=True) as session:

        async def visit(path):
            # GET a page and discard the body; used to get the csrf token
            # before the form post that follows.
            async with session.get(deploy_config.url('workshop', path),
                                   headers=auth_headers):
                pass

        async def submit_form(path, fields=()):
            # POST a form made of `fields` followed by the current csrf token.
            form = aiohttp.FormData()
            for field_name, field_value in fields:
                form.add_field(name=field_name, value=field_value)
            form.add_field(name='_csrf', value=get_cookie(session, '_csrf'))
            async with session.post(deploy_config.url('workshop', path),
                                    data=form,
                                    headers=auth_headers):
                pass

        # make sure notebook is up
        async with session.get(deploy_config.url('workshop', ''),
                               headers=auth_headers) as home:
            await home.text()

        log.info(f'{i} loaded notebook home page')

        # log in as workshop guest
        await visit('/login')
        await submit_form('/login', (('name', args.workshop),
                                     ('password', args.password)))

        log.info(f'{i} logged in')

        # create notebook
        await visit('/notebook')
        await submit_form('/notebook')

        log.info(f'{i} created notebook')

        start = time.time()

        # wait for notebook ready
        ready = False
        # 5 attempts overkill, should only take 2: Scheduling => Running => Ready
        for _ in range(5):
            if ready:
                break
            wait_url = deploy_config.url('workshop',
                                         '/notebook/wait',
                                         base_scheme='ws')
            async with session.ws_connect(wait_url,
                                          headers=auth_headers) as ws:
                async for msg in ws:
                    if msg.data == '1':
                        ready = True

        duration = time.time() - start

        log.info(f'{i} notebook state {ready} duration {duration}')

        # delete notebook
        await visit('/notebook')
        await submit_form('/notebook/delete')

        log.info(f'{i} notebook delete, done.')

    return duration, ready