def update_job_containers(event: Mapping, status: str, job_container_name: str) -> None:
    """Update the `RedisJobContainers` store from a job pod event.

    When `status` is a done status (per `JobLifeCycle.is_done`), the job is
    removed from the store. Afterwards every container status in the event
    whose name equals `job_container_name` is inspected: a container with a
    non-None `running` state is registered for monitoring, any other
    container is removed.

    Args:
        event: pod event; read at `event['metadata']['labels']['job_uuid']`
            and `event['status']['container_statuses']`.
        status: job status, checked with `JobLifeCycle.is_done`.
        job_container_name: only container statuses carrying this name are
            considered.
    """
    docker_prefix = 'docker://'
    store = RedisJobContainers()

    if JobLifeCycle.is_done(status):
        # The job has finished: drop it from the monitoring store.
        finished_job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', finished_job_uuid)
        store.remove_job(finished_job_uuid)

    statuses = event['status']['container_statuses']
    if statuses is None:
        return

    for entry in statuses:
        if entry['name'] != job_container_name:
            continue

        raw_id = entry['container_id']
        if not raw_id:
            continue

        # Strip the runtime prefix so only the bare container id is stored.
        if raw_id.startswith(docker_prefix):
            container_id = raw_id[len(docker_prefix):]
        else:
            container_id = raw_id
        if not container_id:
            continue

        job_uuid = event['metadata']['labels']['job_uuid']
        if entry['state']['running'] is None:
            store.remove_container(container_id=container_id)
            continue

        logger.info('Monitoring (container_id, job_uuid): (%s, %s)', container_id, job_uuid)
        store.monitor(container_id=container_id, job_uuid=job_uuid)
def test_update_job_containers(self):
    """A container is only monitored once a job with the event's uuid exists."""
    pod_event = status_experiment_job_event_with_conditions['object']

    def send_building_event():
        # Same call both times; only the database state differs in between.
        update_job_containers(
            event=pod_event,
            status=JobLifeCycle.BUILDING,
            job_container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))

    def monitored_containers():
        return RedisJobContainers().get_containers()

    send_building_event()
    # Nothing is monitored yet: no job exists with the event's job_uuid.
    assert len(monitored_containers()) == 0  # pylint:disable=len-as-condition

    # Create the job the event refers to, then replay the same event.
    expected_uuid = pod_event['metadata']['labels']['job_uuid']
    ExperimentJobFactory(uuid=expected_uuid)
    job = ExperimentJob.objects.get(uuid=expected_uuid)
    send_building_event()

    # The container from the event is now being monitored.
    assert len(monitored_containers()) == 1
    container_id = '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75'
    assert monitored_containers() == [container_id]

    # The stored mapping points back at the job and its experiment.
    job_uuid, experiment_uuid = RedisJobContainers().get_job(container_id)
    assert job.uuid.hex == job_uuid
    assert job.experiment.uuid.hex == experiment_uuid
def monitor(containers: Dict, node: 'ClusterNode', persist: bool) -> None:
    """Collect resources for every tracked container and cache the latest ones.

    For each container id known to `RedisJobContainers`, the container's
    resources are gathered with `get_container_resources`; the resulting
    payload is stored through `RedisToStream.set_latest_job_resources` when
    either the job or its experiment is flagged as monitored.

    Args:
        containers: mapping indexed by container id.
        node: cluster node, forwarded to `get_container_resources`.
        persist: currently unused; it was only consumed by the disabled
            publishing task noted below.
    """
    registry = RedisJobContainers()
    tracked_ids = registry.get_containers()

    gpus = get_gpu_resources()
    if gpus:
        # Re-key the GPU entries by their index.
        gpus = {gpu['index']: gpu for gpu in gpus}
    update_cluster_node(gpus)

    for cid in tracked_ids:
        if not get_container(containers, cid):
            continue

        try:
            resources = get_container_resources(node, containers[cid], gpus, registry)
        except KeyError:
            resources = None
        if not resources:
            continue

        payload = resources.to_dict()
        # todo: Re-enable publishing
        # logger.debug("Publishing resources event")
        # celery_app.send_task(
        #     K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_RESOURCES,
        #     kwargs={'payload': payload, 'persist': persist})

        job_uuid = payload['job_uuid']
        experiment_uuid = registry.get_experiment_for_job(job_uuid)

        # Only keep the latest resources when the job or its experiment is
        # being watched.
        should_store = (
            RedisToStream.is_monitored_job_resources(job_uuid) or
            RedisToStream.is_monitored_experiment_resources(experiment_uuid))
        if should_store:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def test_update_job_containers_with_no_container_statuses(self):
    """Updating from the `status_experiment_job_event` fixture monitors nothing."""
    pod_event = status_experiment_job_event['object']
    building = JobLifeCycle.BUILDING
    container_name = conf.get(CONTAINER_NAME_EXPERIMENT_JOBS)

    update_job_containers(
        event=pod_event,
        status=building,
        job_container_name=container_name)

    monitored = RedisJobContainers().get_containers()
    assert len(monitored) == 0  # pylint:disable=len-as-condition