async def __aenter__(self):
    """Create the HTTP session this object uses to talk to the ``ci`` service.

    Auth headers for ``ci`` are derived from ``self._deploy_config`` and
    attached to a new session (60s total timeout, ``raise_for_status=True``),
    which is stored on ``self._session``.

    Returns:
        This object, so it can be bound with ``async with ... as``.
    """
    ci_auth_headers = service_auth_headers(self._deploy_config, 'ci')
    request_timeout = aiohttp.ClientTimeout(total=60)
    self._session = ssl_client_session(
        raise_for_status=True,
        timeout=request_timeout,
        headers=ci_auth_headers)
    return self
async def activate(self):
    """Activate this instance with the batch driver and set up log storage.

    POSTs the ``IP_ADDRESS`` environment variable to the driver's
    ``/api/v1alpha/instances/activate`` endpoint, authenticating with the
    ``ACTIVATION_TOKEN`` environment variable. From the JSON response it then:

    * stores headers carrying the returned ``token`` on ``self.headers``;
    * writes the returned ``key`` as JSON to ``key.json`` in the working
      directory;
    * loads Google service-account credentials from that file; and
    * builds ``self.log_store`` with those credentials.

    Raises:
        KeyError: if ``IP_ADDRESS`` or ``ACTIVATION_TOKEN`` is not set in the
            environment.
    """
    key_path = 'key.json'

    async with ssl_client_session(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60)) as session:
        activation_response = await request_retry_transient_errors(
            session, 'POST',
            deploy_config.url('batch-driver', '/api/v1alpha/instances/activate'),
            json={'ip_address': os.environ['IP_ADDRESS']},
            headers={
                'X-Hail-Instance-Name': NAME,
                'Authorization': f'Bearer {os.environ["ACTIVATION_TOKEN"]}'
            })
        payload = await activation_response.json()

        # From here on, requests are authorised with the token handed back
        # by the driver rather than the one-off activation token.
        self.headers = {
            'X-Hail-Instance-Name': NAME,
            'Authorization': f'Bearer {payload["token"]}'
        }

        # The credentials loader below reads from a file, so the key is
        # persisted to disk first.
        with open(key_path, 'w') as key_file:
            key_file.write(json.dumps(payload['key']))

        gcs_credentials = google.oauth2.service_account.Credentials.from_service_account_file(
            key_path)
        self.log_store = LogStore(
            BATCH_LOGS_BUCKET_NAME,
            WORKER_LOGS_BUCKET_NAME,
            INSTANCE_ID,
            self.pool,
            project=PROJECT,
            credentials=gcs_credentials)
async def __init__(self, billing_project, deploy_config=None, session=None, headers=None, _token=None):
    """Set up a batch client bound to ``billing_project``.

    NOTE(review): this ``__init__`` is a coroutine function. A plain class
    cannot be instantiated with one, so presumably the enclosing class is
    wrapped by something that awaits it -- confirm against the class
    definition.

    Args:
        billing_project: stored on ``self.billing_project``.
        deploy_config: deploy configuration; when falsy, the result of
            ``get_deploy_config()`` is used.
        session: an existing client session to reuse; when ``None`` a new one
            is created (60s total timeout, ``raise_for_status=True``).
        headers: optional extra request headers, copied into the client's
            own header dict.
        _token: when truthy, used as the bearer token; otherwise the service
            auth headers for ``batch`` are used.
    """
    self.billing_project = billing_project

    if not deploy_config:
        deploy_config = get_deploy_config()
    self.url = deploy_config.base_url('batch')

    if session is not None:
        self._session = session
    else:
        self._session = ssl_client_session(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60))

    user_info = await async_get_userinfo(deploy_config)
    self.bucket = user_info['bucket_name']

    # Start from a private copy of the caller's headers, then layer the
    # authorisation on top so the caller's mapping is never mutated.
    request_headers = dict(headers) if headers else {}
    if _token:
        request_headers['Authorization'] = f'Bearer {_token}'
    else:
        request_headers.update(service_auth_headers(deploy_config, 'batch'))
    self._headers = request_headers
async def test_deploy():
    """Poll the ``ci`` deploy-status endpoint until a deploy state is reported.

    The endpoint is queried every 5 seconds (bounded overall by a 30 minute
    ``asyncio.wait_for`` timeout). The test passes only if exactly one deploy
    status is returned and its ``deploy_state`` is ``'success'``; otherwise the
    reported ``failure_information`` is used as the assertion message.
    """
    deploy_config = get_deploy_config()
    ci_deploy_status_url = deploy_config.url('ci', '/api/v1alpha/deploy_status')
    headers = service_auth_headers(deploy_config, 'ci')

    async with ssl_client_session(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60)) as session:

        async def wait_forever():
            # Loops until the single reported deploy status carries a
            # non-None state; the caller is responsible for the time limit.
            state = None
            failure = None
            while state is None:
                response = await utils.request_retry_transient_errors(
                    session, 'GET', f'{ci_deploy_status_url}', headers=headers)
                deploy_statuses = await response.json()
                log.info(
                    f'deploy_statuses:\n{json.dumps(deploy_statuses, indent=2)}'
                )
                assert len(deploy_statuses) == 1, deploy_statuses
                deploy_status = deploy_statuses[0]
                state = deploy_status['deploy_state']
                failure = deploy_status.get('failure_information')
                await asyncio.sleep(5)
            log.info(f'returning {deploy_status} {failure}')
            return state, failure

        deploy_state, failure_information = await asyncio.wait_for(
            wait_forever(), timeout=30 * 60)
        assert deploy_state == 'success', str(failure_information)
async def async_main(args):
    """Run the auth flow against the selected ``auth`` namespace.

    Args:
        args: parsed arguments; ``args.namespace``, when truthy, overrides the
            namespace of the ``auth`` service in the deploy config. Otherwise
            the namespace already configured for ``auth`` is used.
    """
    deploy_config = get_deploy_config()

    auth_ns = args.namespace
    if auth_ns:
        deploy_config = deploy_config.with_service('auth', auth_ns)
    else:
        auth_ns = deploy_config.service_ns('auth')

    ns_headers = namespace_auth_headers(deploy_config, auth_ns, authorize_target=False)
    session_options = dict(
        raise_for_status=True,
        timeout=aiohttp.ClientTimeout(total=60),
        headers=ns_headers)
    async with ssl_client_session(**session_options) as session:
        await auth_flow(deploy_config, auth_ns, session)
async def run(self):
    """Serve the worker HTTP API until the worker has been idle too long.

    Steps, in order:

    1. Start a ``web.Application`` on ``0.0.0.0:5000`` exposing the job
       create / delete / log / status routes and ``/healthcheck``.
    2. Activate with the driver via ``self.activate()``.
    3. Every 15 seconds, log the current load; keep going while there are
       jobs or the time since ``self.last_updated`` is below
       ``MAX_IDLE_TIME_MSECS``.
    4. POST a single, un-retried deactivate request to the driver.

    The site and app runner are always stopped and cleaned up on the way out,
    including when an exception propagates.
    """
    app_runner = None
    site = None
    try:
        app = web.Application(client_max_size=HTTP_CLIENT_MAX_SIZE)
        app.add_routes([
            web.post('/api/v1alpha/batches/jobs/create', self.create_job),
            web.delete(
                '/api/v1alpha/batches/{batch_id}/jobs/{job_id}/delete',
                self.delete_job),
            web.get('/api/v1alpha/batches/{batch_id}/jobs/{job_id}/log',
                    self.get_job_log),
            web.get('/api/v1alpha/batches/{batch_id}/jobs/{job_id}/status',
                    self.get_job_status),
            web.get('/healthcheck', self.healthcheck)
        ])

        app_runner = web.AppRunner(app)
        await app_runner.setup()
        site = web.TCPSite(app_runner, '0.0.0.0', 5000)
        await site.start()

        await self.activate()

        # idle_duration is compared against MAX_IDLE_TIME_MSECS, so it is a
        # millisecond count.
        idle_duration = time_msecs() - self.last_updated
        while self.jobs or idle_duration < MAX_IDLE_TIME_MSECS:
            log.info(
                f'n_jobs {len(self.jobs)} free_cores {self.cpu_sem.value / 1000} idle {idle_duration}'
            )
            await asyncio.sleep(15)
            idle_duration = time_msecs() - self.last_updated

        # The value is in milliseconds; the message previously mislabelled
        # it as seconds.
        log.info(f'idle {idle_duration} ms, exiting')

        async with ssl_client_session(
                raise_for_status=True,
                timeout=aiohttp.ClientTimeout(total=60)) as session:
            # Don't retry.  If it doesn't go through, the driver
            # monitoring loops will recover.  If the driver is
            # gone (e.g. testing a PR), this would go into an
            # infinite loop and the instance won't be deleted.
            await session.post(
                deploy_config.url('batch-driver',
                                  '/api/v1alpha/instances/deactivate'),
                headers=self.headers)
        log.info('deactivated')
    finally:
        log.info('shutting down')
        if site:
            await site.stop()
            log.info('stopped site')
        if app_runner:
            await app_runner.cleanup()
            log.info('cleaned up app runner')
async def async_get_userinfo(deploy_config=None, headers=None):
    """Fetch the caller's user info from the ``auth`` service.

    Args:
        deploy_config: deploy configuration; defaults to
            ``get_deploy_config()`` when ``None``.
        headers: request headers; defaults to the service auth headers for
            ``auth`` when ``None``.

    Returns:
        The decoded JSON body of ``GET /api/v1alpha/userinfo``.
    """
    deploy_config = get_deploy_config() if deploy_config is None else deploy_config
    if headers is None:
        headers = service_auth_headers(deploy_config, 'auth')

    userinfo_url = deploy_config.url('auth', '/api/v1alpha/userinfo')
    short_timeout = aiohttp.ClientTimeout(total=5)
    async with ssl_client_session(raise_for_status=True, timeout=short_timeout) as session:
        response = await request_retry_transient_errors(
            session, 'GET', userinfo_url, headers=headers)
        return await response.json()
async def _userdata_from_session_id(session_id):
    """Look up user data for ``session_id`` via the ``auth`` service.

    The session id is sent as a bearer token to ``/api/v1alpha/userinfo``.

    Returns:
        The decoded JSON user info, or ``None`` if the auth service answers
        with HTTP 401.

    Raises:
        web.HTTPInternalServerError: on any other failure (the underlying
            exception is logged).
    """
    bearer_headers = {'Authorization': f'Bearer {session_id}'}
    try:
        async with ssl_client_session(
                raise_for_status=True,
                timeout=aiohttp.ClientTimeout(total=5)) as session:
            response = await request_retry_transient_errors(
                session, 'GET',
                deploy_config.url('auth', '/api/v1alpha/userinfo'),
                headers=bearer_headers)
            assert response.status == 200
            return await response.json()
    except Exception as e:  # pylint: disable=broad-except
        # A 401 simply means the session id is not (or no longer) valid;
        # everything else is reported to the client as a server error.
        unauthorized = isinstance(e, aiohttp.ClientResponseError) and e.status == 401
        if unauthorized:
            return None
        log.exception('unknown exception getting userinfo')
        raise web.HTTPInternalServerError()
async def post_job_started_1(self, job):
    """Tell the batch driver that ``job`` has started.

    A trimmed copy of ``job.status()`` (version, ids, attempt, start time and
    resources) is POSTed to ``/api/v1alpha/instances/job_started`` using
    ``self.headers``.
    """
    full_status = await job.status()

    reported_fields = (
        'version',
        'batch_id',
        'job_id',
        'attempt_id',
        'start_time',
        'resources',
    )
    body = {'status': {field: full_status[field] for field in reported_fields}}

    job_started_timeout = aiohttp.ClientTimeout(total=5)
    async with ssl_client_session(
            raise_for_status=True, timeout=job_started_timeout) as session:
        await request_retry_transient_errors(
            session, 'POST',
            deploy_config.url('batch-driver', '/api/v1alpha/instances/job_started'),
            json=body,
            headers=self.headers)
async def notebook_status_from_notebook(k8s, service, headers, cookies, notebook):
    """Compute the current status of ``notebook``.

    The status comes from ``k8s_notebook_status_from_notebook``. While that
    status is ``'Initializing'`` it is promoted to ``'Ready'`` when either
    the notebook record already says ``'Ready'``, or a GET on the notebook's
    external URL returns a 2xx response.

    Args:
        k8s: passed through to ``k8s_notebook_status_from_notebook``.
        service: service name used to build the external readiness URL.
        headers: headers sent with the readiness probe.
        cookies: cookies sent with the readiness probe.
        notebook: mapping with at least ``state``, ``pod_name``,
            ``notebook_token`` and ``jupyter_token``.

    Returns:
        The (possibly updated) status mapping, or ``None`` if no status was
        found.
    """
    status = await k8s_notebook_status_from_notebook(k8s, notebook)
    if not status:
        return None

    if status['state'] == 'Initializing':
        if notebook['state'] == 'Ready':
            status['state'] = 'Ready'
        else:
            pod_name = notebook['pod_name']

            # don't have dev credentials to connect through internal.hail.is
            ready_url = deploy_config.external_url(
                service,
                f'/instance/{notebook["notebook_token"]}/?token={notebook["jupyter_token"]}'
            )
            try:
                async with ssl_client_session(
                        timeout=aiohttp.ClientTimeout(total=1),
                        headers=headers,
                        cookies=cookies) as session:
                    async with session.get(ready_url) as resp:
                        if 200 <= resp.status < 300:
                            log.info(
                                f'GET on jupyter pod {pod_name} succeeded: {resp}'
                            )
                            status['state'] = 'Ready'
                        else:
                            log.info(
                                f'GET on jupyter pod {pod_name} failed: {resp}'
                            )
            except aiohttp.ServerTimeoutError:
                # No response object exists when the request itself timed
                # out, so the message must not reference `resp` (doing so
                # raised UnboundLocalError from inside this handler).
                # NOTE(review): a `total=` timeout may surface as
                # asyncio.TimeoutError rather than ServerTimeoutError --
                # confirm against the aiohttp version in use.
                log.exception(f'GET on jupyter pod {pod_name} timed out')

    return status
def __init__(self, should_fail):
    """Store ``should_fail`` and open the underlying real client session.

    Args:
        should_fail: stored as-is on ``self.should_fail``.
    """
    self.should_fail = should_fail

    session_timeout = aiohttp.ClientTimeout(total=60)
    self.real_session = ssl_client_session(
        raise_for_status=True, timeout=session_timeout)
async def post_job_complete_1(self, job):
    """Report ``job`` as complete to the batch driver, retrying until it sticks.

    The job's full status is fetched (and, when the job's format version
    keeps it in GCS, written to the status file) before a summary is POSTed
    to ``/api/v1alpha/instances/job_complete``.

    The POST is retried forever with jittered exponential backoff (starting
    at 0.1s, capped at 2 minutes), except that cancellation and HTTP 404
    responses are re-raised immediately. If reporting drags on, the job is
    dropped from ``self.jobs`` while the retries continue.
    """
    run_duration = job.end_time - job.start_time

    fetch_status = retry_all_errors(f'error while getting status for {job}')
    full_status = await fetch_status(job.status)

    if job.format_version.has_full_status_in_gcs():
        write_status = retry_all_errors(
            f'error while writing status file to gcs for {job}')
        await write_status(
            self.log_store.write_status_file,
            job.batch_id,
            job.job_id,
            job.attempt_id,
            json.dumps(full_status))

    db_status = job.format_version.db_status(full_status)

    copied_fields = (
        'version',
        'batch_id',
        'job_id',
        'attempt_id',
        'state',
        'start_time',
        'end_time',
    )
    status = {field: full_status[field] for field in copied_fields}
    status['status'] = db_status
    body = {'status': status}

    first_attempt_at = time_msecs()
    delay_secs = 0.1
    while True:
        try:
            async with ssl_client_session(
                    raise_for_status=True,
                    timeout=aiohttp.ClientTimeout(total=5)) as session:
                await session.post(
                    deploy_config.url('batch-driver',
                                      '/api/v1alpha/instances/job_complete'),
                    json=body,
                    headers=self.headers)
                return
        except asyncio.CancelledError:  # pylint: disable=try-except-raise
            raise
        except Exception as e:
            not_found = isinstance(e, aiohttp.ClientResponseError) and e.status == 404  # pylint: disable=no-member
            if not_found:
                raise
            log.exception(f'failed to mark {job} complete, retrying')

        # Unlist the job once more than 3 minutes AND more than half of its
        # run duration have been spent trying to report it.
        elapsed = time_msecs() - first_attempt_at
        still_listed = job.id in self.jobs
        if still_listed and elapsed > 180 * 1000 and elapsed > run_duration / 2:
            log.info(
                f'too much time elapsed marking {job} complete, removing from jobs, will keep retrying'
            )
            del self.jobs[job.id]
            self.last_updated = time_msecs()

        await asyncio.sleep(delay_secs * random.uniform(0.7, 1.3))
        # exponentially back off, up to (expected) max of 2m
        delay_secs = min(delay_secs * 2, 2 * 60.0)
async def run(args, i):
    """Drive one simulated workshop user through a full notebook lifecycle.

    The user loads the home page, logs in as a workshop guest with
    ``args.workshop`` / ``args.password``, creates a notebook, waits on the
    ``/notebook/wait`` websocket for it to become ready, and finally deletes
    it.

    Args:
        args: parsed arguments providing ``workshop`` and ``password``.
        i: identifier of this simulated user, used only in log messages.

    Returns:
        ``(duration, ready)``: seconds spent waiting for the notebook, and
        whether a ready message was seen.
    """
    headers = service_auth_headers(deploy_config, 'workshop', authorize_target=False)

    def workshop_url(path, **url_options):
        return deploy_config.url('workshop', path, **url_options)

    async with ssl_client_session(raise_for_status=True) as session:

        async def fetch_csrf_cookie(path):
            # The response body is irrelevant; the request is made so the
            # session picks up a fresh `_csrf` cookie.
            async with session.get(workshop_url(path), headers=headers):
                pass

        async def submit(path, form):
            async with session.post(workshop_url(path), data=form, headers=headers):
                pass

        # make sure notebook is up
        async with session.get(workshop_url(''), headers=headers) as home:
            await home.text()

        log.info(f'{i} loaded notebook home page')

        # log in as workshop guest
        await fetch_csrf_cookie('/login')

        login_form = aiohttp.FormData()
        login_form.add_field(name='name', value=args.workshop)
        login_form.add_field(name='password', value=args.password)
        login_form.add_field(name='_csrf', value=get_cookie(session, '_csrf'))
        await submit('/login', login_form)

        log.info(f'{i} logged in')

        # create notebook
        await fetch_csrf_cookie('/notebook')

        create_form = aiohttp.FormData()
        create_form.add_field(name='_csrf', value=get_cookie(session, '_csrf'))
        await submit('/notebook', create_form)

        log.info(f'{i} created notebook')

        start = time.time()

        # wait for notebook ready
        ready = False
        # 5 attempts overkill, should only take 2: Scheduling => Running => Ready
        for _ in range(5):
            async with session.ws_connect(
                    workshop_url('/notebook/wait', base_scheme='ws'),
                    headers=headers) as ws:
                async for msg in ws:
                    if msg.data == '1':
                        ready = True
            if ready:
                break

        duration = time.time() - start

        log.info(f'{i} notebook state {ready} duration {duration}')

        # delete notebook
        await fetch_csrf_cookie('/notebook')

        delete_form = aiohttp.FormData()
        delete_form.add_field(name='_csrf', value=get_cookie(session, '_csrf'))
        await submit('/notebook/delete', delete_form)

        log.info(f'{i} notebook delete, done.')

    return duration, ready