async def log_job(request, ws, job, pod_id, namespace, container):
    """Push a job's status changes to a websocket, then stream its pod logs.

    The socket is registered with the per-job socket manager kept in
    ``request.app.job_logs_ws_managers`` (one is created on first use).
    ``job.last_status`` is then polled every ``SOCKET_SLEEP`` seconds and each
    change is sent to ``ws`` until the job is running or done. A finished job
    gets its status sent once more and the coroutine returns; otherwise the
    in-cluster Kubernetes client is configured and ``log_job_pod`` takes over.
    """
    managers = request.app.job_logs_ws_managers
    manager_key = job.uuid.hex
    if manager_key not in managers:
        managers[manager_key] = SocketManager()
    ws_manager = managers[manager_key]
    ws_manager.add_socket(ws)

    # Relay every status transition until the job is running or finished.
    current = None
    while not (current == JobLifeCycle.RUNNING or JobLifeCycle.is_done(current)):
        job.refresh_from_db()
        latest = job.last_status
        if current != latest:
            current = latest
            await notify_ws(ws=ws, message=get_status_message(current))
            # Stop early once the helper reports the client should be dropped.
            if should_disconnect(ws=ws, ws_manager=ws_manager):
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(current):
        # No logs to follow for a finished job: report its status and stop.
        await notify_ws(ws=ws, message=get_status_message(current))
        return

    config.load_incluster_config()
    await log_job_pod(
        k8s_api=client.CoreV1Api(),
        ws=ws,
        ws_manager=ws_manager,
        pod_id=pod_id,
        container=container,
        namespace=namespace,
    )
async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Stream the latest resource readings of one experiment job over a websocket.

    The project, experiment and job are resolved from the URL parts, the view
    is recorded with the auditor, and resource monitoring for the job is
    switched on in ``RedisToStream`` if it is not already. The socket is added
    to the per-job socket manager kept in
    ``request.app.job_resources_ws_mangers`` and then fed in a loop until the
    job is done or the connection goes away.

    Raises:
        exceptions.Forbidden: if the requesting user has no ``GET`` permission
            on the project.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # The exception must be raised: merely constructing it lets
        # unauthorised users fall through to the stream.
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        _logger.info('Job resources with uuid `%s` is now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # One socket manager per job, shared by every client watching that job.
    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        """Drop ``ws``; stop monitoring the job once no sockets remain."""
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            _logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        _logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After enough polls, re-read the job from the database to see
        # whether it has finished.
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                _logger.info('removing all socket because the job `%s` is done', job_name)
                # Clearing the set makes the handler below stop the monitor.
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            else:
                # Still running: push the next database check back.
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Stream the latest resource readings of one experiment job over a websocket.

    The project, experiment and job are resolved from the URL parts, the view
    is recorded with the auditor, and resource monitoring for the job is
    switched on in ``RedisToStream`` if it is not already. The socket is added
    to the per-job socket manager kept in
    ``request.app.job_resources_ws_mangers`` and then fed in a loop until the
    job is done or the connection goes away.

    Raises:
        exceptions.Forbidden: if the requesting user has no ``GET`` permission
            on the project.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # The exception must be raised: merely constructing it lets
        # unauthorised users fall through to the stream.
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        logger.info('Job resources with uuid `%s` is now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # One socket manager per job, shared by every client watching that job.
    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        """Drop ``ws``; stop monitoring the job once no sockets remain."""
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After enough polls, re-read the job from the database to see
        # whether it has finished.
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done', job_name)
                # Clearing the set makes the handler below stop the monitor.
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            else:
                # Still running: push the next database check back.
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
async def log_experiment(request, ws, experiment, namespace, container):
    """Push an experiment's status changes to a websocket, then stream its logs.

    The socket is registered with the per-experiment socket manager kept in
    ``request.app.experiment_logs_ws_managers`` (one is created on first use).
    ``experiment.last_status`` is polled every ``SOCKET_SLEEP`` seconds and
    each change is sent to ``ws`` until the experiment is running or done. A
    finished experiment gets its status sent once more and the coroutine
    returns; otherwise one ``log_job_pod`` stream is started per job of the
    experiment and they are awaited together.
    """
    experiment_uuid = experiment.uuid.hex
    if experiment_uuid in request.app.experiment_logs_ws_managers:
        ws_manager = request.app.experiment_logs_ws_managers[experiment_uuid]
    else:
        ws_manager = SocketManager()
        request.app.experiment_logs_ws_managers[experiment_uuid] = ws_manager

    ws_manager.add_socket(ws)

    # Stream phase changes
    status = None
    while status != ExperimentLifeCycle.RUNNING and not ExperimentLifeCycle.is_done(status):
        experiment.refresh_from_db()
        if status != experiment.last_status:
            status = experiment.last_status
            await notify_ws(ws=ws, message=get_status_message(status))
            if should_disconnect(ws=ws, ws_manager=ws_manager):
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if ExperimentLifeCycle.is_done(status):
        await notify_ws(ws=ws, message=get_status_message(status))
        return

    config.load_incluster_config()
    k8s_api = client.CoreV1Api()
    # asyncio.wait no longer accepts bare coroutine objects (Python 3.11+),
    # so each log stream is scheduled as a future first.
    log_requests = [
        asyncio.ensure_future(
            log_job_pod(k8s_api=k8s_api,
                        ws=ws,
                        ws_manager=ws_manager,
                        pod_id=job.pod_id,
                        container=container,
                        namespace=namespace,
                        task_type=job.role,
                        task_idx=job.sequence))
        for job in experiment.jobs.all()
    ]
    # asyncio.wait raises ValueError on an empty collection; an experiment
    # without jobs simply has nothing to stream.
    if log_requests:
        await asyncio.wait(log_requests)
async def experiment_resources(request, ws, username, project_name, experiment_id):
    """Stream the latest resource readings of an experiment's jobs over a websocket.

    When ``validate_experiment`` yields no experiment, its message is sent to
    the client as an error and the coroutine ends. Otherwise the view is
    recorded with the auditor, monitoring is switched on in ``RedisToStream``
    if needed, and the socket joins the per-experiment socket manager kept in
    ``request.app.experiment_resources_ws_managers``. Readings are then pushed
    in a loop until the experiment is done or the connection goes away.
    """
    experiment, message = validate_experiment(
        request=request,
        username=username,
        project_name=project_name,
        experiment_id=experiment_id,
    )
    if experiment is None:
        await ws.send(get_error_message(message))
        return

    experiment_uuid = experiment.uuid.hex
    auditor.record(
        event_type=EXPERIMENT_RESOURCES_VIEWED,
        instance=experiment,
        actor_id=request.app.user.id,
        actor_name=request.app.user.username,
    )

    already_monitored = RedisToStream.is_monitored_experiment_resources(
        experiment_uuid=experiment_uuid)
    if not already_monitored:
        logger.info('Experiment resource with uuid `%s` is now being monitored', experiment_uuid)
        RedisToStream.monitor_experiment_resources(experiment_uuid=experiment_uuid)

    # One socket manager per experiment, shared by all of its watchers.
    managers = request.app.experiment_resources_ws_managers
    if experiment_uuid not in managers:
        managers[experiment_uuid] = SocketManager()
    ws_manager = managers[experiment_uuid]

    def _drop_socket(socket):
        """Remove ``socket``; stop monitoring once no sockets remain."""
        ws_manager.remove_sockets(socket)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for uuid %s', experiment_uuid)
            RedisToStream.remove_experiment_resources(experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_managers.pop(experiment_uuid, None)

        logger.info('Quitting resources socket for uuid %s', experiment_uuid)

    # Reduce every job row to the hex uuid and a "<role>.<id>" display name.
    jobs = [
        {'uuid': row['uuid'].hex, 'name': '{}.{}'.format(row['role'], row['id'])}
        for row in experiment.jobs.values('uuid', 'role', 'id')
    ]

    ws_manager.add_socket(ws)
    polls = 0
    while True:
        resources = RedisToStream.get_latest_experiment_resources(jobs)
        polls += 1

        # Every so often, re-read the experiment to see whether it finished.
        if polls > RESOURCES_CHECK:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all socket because the experiment `%s` is done', experiment_uuid)
                # Clearing the set makes the helper below stop the monitor.
                ws_manager.ws = set()
                _drop_socket(ws)
                return
            # Still running: push the next database check back.
            polls -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                _drop_socket(ws)
                return

        # Catch a connection that closed without a failed send.
        if ws._connection_lost:  # pylint:disable=protected-access
            _drop_socket(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)