Esempio n. 1
0
def update_job_containers(event: Mapping, status: str,
                          job_container_name: str) -> None:
    """Update the `RedisJobContainers` store from a pod status event.

    When `status` is a done status, the job referenced by the event's
    `job_uuid` label is removed from the store. Afterwards, every container
    status in the event whose name equals `job_container_name` is inspected:
    a container with a non-None `running` state is registered for monitoring,
    any other container is removed from the store.

    Args:
        event: Event object; read under `metadata.labels.job_uuid` and
            `status.container_statuses`.
        status: Job lifecycle status, checked with `JobLifeCycle.is_done`.
        job_container_name: Name of the container to track; statuses for
            other containers are ignored.
    """
    tracker = RedisJobContainers()
    if JobLifeCycle.is_done(status):
        # The job is finished: stop monitoring it
        finished_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', finished_uuid)
        tracker.remove_job(finished_uuid)

    statuses = event['status']['container_statuses']
    if statuses is None:
        return

    docker_prefix = 'docker://'

    for entry in statuses:
        if entry['name'] != job_container_name:
            continue

        raw_id = entry['container_id']
        if not raw_id:
            continue

        # Drop the runtime scheme so only the bare container id is stored
        if raw_id.startswith(docker_prefix):
            container_id = raw_id[len(docker_prefix):]
        else:
            container_id = raw_id
        if not container_id:
            continue

        job_uuid = event['metadata']['labels']['job_uuid']
        if entry['state']['running'] is None:
            # Container is not running: make sure it is no longer tracked
            tracker.remove_container(container_id=container_id)
            continue

        logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                    container_id, job_uuid)
        tracker.monitor(container_id=container_id, job_uuid=job_uuid)
Esempio n. 2
0
    def test_update_job_containers(self):
        """Containers are tracked only once the event's job exists."""
        event = status_experiment_job_event_with_conditions['object']

        def push_event():
            # Replays the same event with a BUILDING status
            update_job_containers(
                event=event,
                status=JobLifeCycle.BUILDING,
                job_container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))

        push_event()
        # Nothing is tracked yet: no job was created with that job_uuid
        assert len(RedisJobContainers().get_containers()) == 0  # pylint:disable=len-as-condition

        # Create the job the event refers to, then replay the event
        expected_uuid = event['metadata']['labels']['job_uuid']
        ExperimentJobFactory(uuid=expected_uuid)
        job = ExperimentJob.objects.get(uuid=expected_uuid)
        push_event()

        # The container from the event is now being monitored
        assert len(RedisJobContainers().get_containers()) == 1
        container_id = '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75'
        assert RedisJobContainers().get_containers() == [container_id]

        # And it maps back to the created job and its experiment
        job_uuid, experiment_uuid = RedisJobContainers().get_job(container_id)
        assert job.uuid.hex == job_uuid
        assert job.experiment.uuid.hex == experiment_uuid
Esempio n. 3
0
def monitor(containers: Dict, node: 'ClusterNode', persist: bool) -> None:
    """Collect resources for every tracked container and store the latest.

    The GPU resources are fetched once, indexed by their `index` key, and
    passed to `update_cluster_node`. Then, for each container id known to
    `RedisJobContainers`, the container resources are gathered and, when the
    job or its experiment is flagged as monitored in `RedisToStream`, saved
    as the latest resources of that job.

    Args:
        containers: Mapping of container id to container object; looked up
            through `get_container` and by direct indexing.
        node: Cluster node forwarded to `get_container_resources`.
        persist: Currently unused; only referenced by the disabled
            publishing code below.
    """
    tracker = RedisJobContainers()
    tracked_ids = tracker.get_containers()

    gpus = get_gpu_resources()
    if gpus:
        gpus = {gpu['index']: gpu for gpu in gpus}
    update_cluster_node(gpus)

    for container_id in tracked_ids:
        if not get_container(containers, container_id):
            continue

        try:
            resources = get_container_resources(node,
                                                containers[container_id],
                                                gpus,
                                                tracker)
        except KeyError:
            # Treated the same as "no resources available"
            continue
        if not resources:
            continue

        payload = resources.to_dict()
        # todo: Re-enable publishing
        # logger.debug("Publishing resources event")
        # celery_app.send_task(
        #     K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_RESOURCES,
        #     kwargs={'payload': payload, 'persist': persist})

        job_uuid = payload['job_uuid']
        experiment_uuid = tracker.get_experiment_for_job(job_uuid)
        # Only keep the payload when the job, or else its experiment,
        # is currently being monitored
        if not (RedisToStream.is_monitored_job_resources(job_uuid)
                or RedisToStream.is_monitored_experiment_resources(
                    experiment_uuid)):
            continue
        RedisToStream.set_latest_job_resources(job_uuid, payload)
Esempio n. 4
0
 def test_update_job_containers_with_no_container_statuses(self):
     """No container is tracked after updating from this event."""
     event = status_experiment_job_event['object']
     building = JobLifeCycle.BUILDING
     container_name = conf.get(CONTAINER_NAME_EXPERIMENT_JOBS)

     update_job_containers(event=event,
                           status=building,
                           job_container_name=container_name)

     tracked = RedisJobContainers().get_containers()
     assert len(tracked) == 0  # pylint:disable=len-as-condition