def new_experiment_job_status(sender, **kwargs):
    """React to an experiment-job status record being saved.

    Reads ``instance`` (the status record) and ``created`` (defaults to
    ``False``) from ``kwargs``; presumably these are delivered by a Django
    ``post_save`` signal -- confirm at the registration site.

    Steps, in order:
      1. For a newly created record, store it as ``job.status`` and save
         the job.
      2. If the job reports ``is_done``, remove it from
         ``RedisJobContainers``.
      3. For a newly created record whose experiment is not done, send the
         ``EXPERIMENTS_CHECK_STATUS`` task with a one second countdown.
    """
    status_record = kwargs['instance']
    is_new = kwargs.get('created', False)
    experiment_job = status_record.job

    if is_new:
        # The new record becomes the job's last status.
        experiment_job.status = status_record
        experiment_job.save()

    if experiment_job.is_done:
        # A finished job must no longer have its containers monitored.
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(experiment_job.uuid.hex)

    if not is_new:
        # Only freshly created records can trigger an experiment check.
        return

    experiment = status_record.job.experiment
    if not experiment.is_done:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
            kwargs={'experiment_id': experiment.id},
            countdown=1)
def new_experiment_job_status(sender, **kwargs):
    """Propagate a saved experiment-job status to the job and experiment.

    ``kwargs['instance']`` is the status record and ``kwargs['created']``
    (default ``False``) says whether it was just created; presumably both
    come from a Django ``post_save`` signal -- confirm at the registration
    site.

    A newly created record is stored on ``job.job_status`` and the job is
    saved.  Whenever the job reports ``is_done`` it is removed from
    ``RedisJobContainers``.  Finally, for a newly created record whose
    experiment is not done, ``check_experiment_status`` is queued with the
    experiment's uuid hex.
    """
    latest_status = kwargs['instance']
    was_created = kwargs.get('created', False)
    owner_job = latest_status.job

    if was_created:
        # Keep the job pointing at its most recent status record.
        owner_job.job_status = latest_status
        owner_job.save()

    # Done jobs are dropped from the container monitors.
    if owner_job.is_done:
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(owner_job.uuid.hex)

    if was_created:
        experiment = latest_status.job.experiment
        if not experiment.is_done:
            # The experiment may need its own status re-evaluated.
            check_experiment_status.delay(experiment_uuid=experiment.uuid.hex)
def run(containers, node, persist):
    """Publish resource usage for every container tracked in Redis.

    Args:
        containers: mapping indexed by container id; it is passed to
            ``get_container`` and then read with ``containers[container_id]``.
        node: forwarded to ``get_container_resources``.
        persist: forwarded as the ``persist`` kwarg of the resources task.

    GPU resources, when available, are re-keyed by their ``'index'`` entry
    and handed to ``update_cluster_node``.  For each container that yields
    a payload, the payload dict is sent to ``EVENTS_HANDLE_RESOURCES`` and,
    if the job or its experiment is flagged as monitored in
    ``RedisToStream``, stored as the job's latest resources.
    """
    monitored_ids = RedisJobContainers.get_containers()

    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {gpu['index']: gpu for gpu in gpu_resources}
    update_cluster_node(gpu_resources)

    for container_id in monitored_ids:
        if not get_container(containers, container_id):
            continue

        resources = get_container_resources(node, containers[container_id], gpu_resources)
        if not resources:
            continue

        payload = resources.to_dict()
        logger.debug("Publishing resources event")
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
            kwargs={'payload': payload, 'persist': persist})

        # Stream the payload only when someone is watching this job or its
        # experiment.
        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        if (RedisToStream.is_monitored_job_resources(job_uuid) or
                RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def run(containers, node, persist):
    """Gather and dispatch resource payloads for the monitored containers.

    Args:
        containers: mapping indexed by container id; it is passed to
            ``get_container`` and then read with ``containers[container_id]``.
        node: forwarded to ``get_container_resources``.
        persist: forwarded to ``handle_events_resources.delay``.

    GPU resources (if any) are indexed by their ``'index'`` entry before
    ``update_cluster`` is called with them.  Each container that produces
    a payload has it queued through ``handle_events_resources`` and, when
    the job or its experiment is monitored according to ``RedisToStream``,
    recorded as the job's latest resources.
    """
    tracked_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {entry['index']: entry for entry in gpu_resources}

    # update cluster and current node
    update_cluster(gpu_resources)

    for container_id in tracked_ids:
        found = get_container(containers, container_id)
        if not found:
            continue

        payload = get_container_resources(node, containers[container_id], gpu_resources)
        if not payload:
            continue

        payload = payload.to_dict()
        logger.info("Publishing resources event")
        handle_events_resources.delay(payload=payload, persist=persist)

        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        # The experiment check runs only if the job itself is not monitored.
        should_stream = (
            RedisToStream.is_monitored_job_resources(job_uuid) or
            RedisToStream.is_monitored_experiment_resources(experiment_uuid))
        if should_stream:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def experiment_job_status_post_save(sender, **kwargs):
    """Apply a saved experiment-job status to its job and experiment.

    ``kwargs['instance']`` is the status record (presumably delivered by a
    Django ``post_save`` signal -- confirm at the registration site).

    The record becomes ``job.status``; ``set_job_started_at`` and
    ``set_job_finished_at`` are called with the job and the record's
    ``status`` value, and the job is saved.  A job reporting ``is_done`` is
    removed from ``RedisJobContainers``.  Unless the experiment is done,
    the ``EXPERIMENTS_CHECK_STATUS`` task is sent with a one second
    countdown.
    """
    status_record = kwargs['instance']
    experiment_job = status_record.job

    # Record the latest status on the job before persisting it.
    experiment_job.status = status_record
    set_job_started_at(instance=experiment_job, status=status_record.status)
    set_job_finished_at(instance=experiment_job, status=status_record.status)
    experiment_job.save()

    if experiment_job.is_done:
        # Finished jobs are removed from the container monitors.
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(experiment_job.uuid.hex)

    experiment = status_record.job.experiment
    if not experiment.is_done:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
            kwargs={'experiment_id': experiment.id},
            countdown=1)
def run(containers, node, persist):
    """Send a resources event for each container currently tracked in Redis.

    Args:
        containers: mapping indexed by container id; it is passed to
            ``get_container`` and then read with ``containers[container_id]``.
        node: forwarded to ``get_container_resources``.
        persist: forwarded as the ``persist`` kwarg of the resources task.

    GPU resources are keyed by their ``'index'`` entry when present, then
    passed to ``update_cluster``.  Every payload produced is sent to the
    ``EVENTS_HANDLE_RESOURCES`` task and, when ``RedisToStream`` reports the
    job or its experiment as monitored, saved as the job's latest resources.
    """
    ids_to_check = RedisJobContainers.get_containers()

    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {item['index']: item for item in gpu_resources}

    # update cluster and current node
    update_cluster(gpu_resources)

    for container_id in ids_to_check:
        if not get_container(containers, container_id):
            continue

        resources = get_container_resources(node, containers[container_id], gpu_resources)
        if not resources:
            continue

        payload = resources.to_dict()
        logger.info("Publishing resources event")
        task_kwargs = {'payload': payload, 'persist': persist}
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES, kwargs=task_kwargs)

        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)

        # Check the job first; fall back to its experiment only if needed.
        watched = RedisToStream.is_monitored_job_resources(job_uuid)
        if not watched:
            watched = RedisToStream.is_monitored_experiment_resources(experiment_uuid)
        if watched:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def new_experiment_job_status(sender, **kwargs):
    """Handle the save of an experiment-job status record.

    Expects ``instance`` (the status record) in ``kwargs`` and an optional
    ``created`` flag that defaults to ``False``; presumably both come from
    a Django ``post_save`` signal -- confirm at the registration site.

    Only newly created records update ``job.status`` (followed by a job
    save) and may trigger the ``EXPERIMENTS_CHECK_STATUS`` task; the task
    is skipped when the experiment is already done.  Independently of
    ``created``, a job reporting ``is_done`` is removed from
    ``RedisJobContainers``.
    """
    saved_status = kwargs['instance']
    newly_created = kwargs.get('created', False)
    target_job = saved_status.job

    if newly_created:
        # update job last_status
        target_job.status = saved_status
        target_job.save()

    # Stop monitoring the containers of a job that has finished.
    if target_job.is_done:
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(target_job.uuid.hex)

    if newly_created:
        experiment = saved_status.job.experiment
        if experiment.is_done:
            return

        task_kwargs = {'experiment_id': experiment.id}
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
            kwargs=task_kwargs,
            countdown=1)
def test_update_job_containers(self):
    """Containers are only monitored once a job with the event's uuid exists."""
    event = status_raw_event_with_conditions['object']

    update_job_containers(event=event,
                          status=JobLifeCycle.BUILDING,
                          job_container_name=settings.JOB_CONTAINER_NAME)
    # No job carries that job_uuid yet, so nothing may be monitored.
    assert len(RedisJobContainers.get_containers()) == 0

    # Create the job the event refers to and replay the same event.
    expected_uuid = status_raw_event_with_conditions['object']['metadata']['labels']['job_uuid']
    ExperimentJobFactory(uuid=expected_uuid)
    job = ExperimentJob.objects.get(uuid=expected_uuid)
    update_job_containers(event=event,
                          status=JobLifeCycle.BUILDING,
                          job_container_name=settings.JOB_CONTAINER_NAME)

    # The container from the event is now monitored and mapped to the job.
    assert len(RedisJobContainers.get_containers()) == 1
    container_id = '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75'
    assert RedisJobContainers.get_containers() == [container_id]

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container_id)
    assert job.uuid.hex == job_uuid
    assert job.experiment.uuid.hex == experiment_uuid
def test_update_job_containers(self):
    """Monitoring starts only after a job matching the event's uuid exists."""
    event = status_experiment_job_event_with_conditions['object']
    container_name = settings.CONTAINER_NAME_EXPERIMENT_JOB

    update_job_containers(event=event,
                          status=JobLifeCycle.BUILDING,
                          job_container_name=container_name)
    # Still empty: no job was created with that job_uuid.
    assert len(RedisJobContainers.get_containers()) == 0  # pylint:disable=len-as-condition

    # Create a job with the uuid carried by the event labels.
    labels = event['metadata']['labels']
    ExperimentJobFactory(uuid=labels['job_uuid'])
    job = ExperimentJob.objects.get(uuid=labels['job_uuid'])

    update_job_containers(event=event,
                          status=JobLifeCycle.BUILDING,
                          job_container_name=container_name)

    # The container is now monitored and resolves to the job and experiment.
    assert len(RedisJobContainers.get_containers()) == 1
    container_id = '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75'
    assert RedisJobContainers.get_containers() == [container_id]

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container_id)
    assert job.uuid.hex == job_uuid
    assert job.experiment.uuid.hex == experiment_uuid
def update_job_containers(event, status, job_container_name):
    """Sync the monitored containers in Redis with a job status event.

    Args:
        event: mapping read at ``['metadata']['labels']['job_uuid']`` and
            ``['status']['container_statuses']``.
        status: job status checked with ``JobLifeCycle.is_done``.
        job_container_name: only container statuses with this ``name`` are
            considered.

    A done status removes the job from ``RedisJobContainers``.  Afterwards,
    every matching container status with a usable container id is either
    monitored (its ``state['running']`` is not ``None``) or removed.
    """
    docker_scheme = 'docker://'

    if JobLifeCycle.is_done(status):
        # Remove the job monitoring
        finished_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', finished_uuid)
        RedisJobContainers.remove_job(finished_uuid)

    container_statuses = event['status']['container_statuses']
    if container_statuses is None:
        return

    for entry in container_statuses:
        if entry['name'] != job_container_name:
            continue

        # Normalise the id: drop the 'docker://' scheme when present and
        # skip entries that end up without an id.
        container_id = entry['container_id']
        if container_id and container_id.startswith(docker_scheme):
            container_id = container_id[len(docker_scheme):]
        if not container_id:
            continue

        job_uuid = event['metadata']['labels']['job_uuid']
        if entry['state']['running'] is None:
            RedisJobContainers.remove_container(container_id=container_id)
        else:
            logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                        container_id, job_uuid)
            RedisJobContainers.monitor(container_id=container_id, job_uuid=job_uuid)
def get_container_resources(node, container, gpu_resources):
    """Build the resources config for a running, recognised container.

    Args:
        node: object whose ``cpu`` attribute caps the CPU count reported
            by the container stats.
        container: object exposing ``status``, ``name``, ``id`` and
            ``stats(decode=True, stream=False)`` (presumably a docker SDK
            container -- confirm against the caller).
        gpu_resources: mapping indexed by the values returned from
            ``get_container_gpu_indices``, or a falsy value when there is
            no GPU data.

    Returns:
        The result of ``ContainerResourcesConfig.from_dict`` for the
        container, or ``None`` when the container is not running, is not
        known to ``RedisJobContainers``, was not found, or the stats
        request timed out.
    """
    # Check if the container is running
    if container.status != ContainerStatuses.RUNNING:
        # Log arguments are passed lazily so the message is only rendered
        # when the record is actually emitted.
        logger.info("`%s` container is not running", container.name)
        RedisJobContainers.remove_container(container.id)
        return

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container.id)

    if not job_uuid:
        logger.info("`%s` container is not recognised", container.name)
        return

    logger.info(
        "Streaming resources for container %s in (job, experiment) (`%s`, `%s`) ",
        container.id, job_uuid, experiment_uuid)

    try:
        stats = container.stats(decode=True, stream=False)
    except NotFound:
        logger.info("`%s` was not found", container.name)
        RedisJobContainers.remove_container(container.id)
        return
    except requests.ReadTimeout:
        return

    precpu_stats = stats['precpu_stats']
    cpu_stats = stats['cpu_stats']

    # Usage deltas between the previous and the current sample.
    pre_total_usage = float(precpu_stats['cpu_usage']['total_usage'])
    total_usage = float(cpu_stats['cpu_usage']['total_usage'])
    delta_total_usage = total_usage - pre_total_usage

    pre_system_cpu_usage = float(precpu_stats['system_cpu_usage'])
    system_cpu_usage = float(cpu_stats['system_cpu_usage'])
    delta_system_cpu_usage = system_cpu_usage - pre_system_cpu_usage

    percpu_usage = cpu_stats['cpu_usage']['percpu_usage']
    num_cpu_cores = len(percpu_usage)
    if num_cpu_cores >= node.cpu * 1.5:
        # The stats report noticeably more cores than the node declares;
        # use the node's value instead.
        logger.warning('Docker reporting num cpus `%s` and kubernetes reporting `%s`',
                       num_cpu_cores, node.cpu)
        num_cpu_cores = node.cpu

    cpu_percentage = 0.
    percpu_percentage = [0.] * num_cpu_cores
    if delta_total_usage > 0 and delta_system_cpu_usage > 0:
        cpu_percentage = (delta_total_usage / delta_system_cpu_usage) * num_cpu_cores * 100.0
        percpu_percentage = [cpu_usage / total_usage * cpu_percentage
                             for cpu_usage in percpu_usage]

    memory_used = int(stats['memory_stats']['usage'])
    memory_limit = int(stats['memory_stats']['limit'])

    container_gpu_resources = None
    if gpu_resources:
        gpu_indices = get_container_gpu_indices(container)
        container_gpu_resources = [gpu_resources[gpu_indice] for gpu_indice in gpu_indices]

    return ContainerResourcesConfig.from_dict({
        'job_uuid': job_uuid,
        'job_name': job_uuid,  # it will be updated during the streaming
        'experiment_uuid': experiment_uuid,
        'container_id': container.id,
        'cpu_percentage': cpu_percentage,
        'n_cpus': num_cpu_cores,
        'percpu_percentage': percpu_percentage,
        'memory_used': memory_used,
        'memory_limit': memory_limit,
        'gpu_resources': container_gpu_resources
    })
def test_update_job_containers_with_no_container_statuses(self):
    """Replaying ``status_experiment_job_event`` must not start any monitoring."""
    event = status_experiment_job_event['object']
    update_job_containers(event=event,
                          status=JobLifeCycle.BUILDING,
                          job_container_name=settings.CONTAINER_NAME_JOB)

    monitored = RedisJobContainers.get_containers()
    assert len(monitored) == 0
def get_container_resources(node, container, gpu_resources):
    """Compute CPU, memory and GPU usage for a monitored container.

    Args:
        node: object whose ``cpu`` attribute caps the CPU count reported
            by the container stats.
        container: object exposing ``status``, ``name``, ``id`` and
            ``stats(decode=True, stream=False)`` (presumably a docker SDK
            container -- confirm against the caller).
        gpu_resources: mapping indexed by the values returned from
            ``get_container_gpu_indices``, or a falsy value when there is
            no GPU data.

    Returns:
        The result of ``ContainerResourcesConfig.from_dict``, or ``None``
        when the container is not running, is unknown to
        ``RedisJobContainers``, was not found, or the stats call timed out.
    """
    if container.status != ContainerStatuses.RUNNING:
        # Containers that stopped running are dropped from monitoring.
        logger.info("`%s` container is not running", container.name)
        RedisJobContainers.remove_container(container.id)
        return None

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container.id)
    if not job_uuid:
        logger.info("`%s` container is not recognised", container.name)
        return None

    logger.info(
        "Streaming resources for container %s in (job, experiment) (`%s`, `%s`) ",
        container.id, job_uuid, experiment_uuid)

    try:
        stats = container.stats(decode=True, stream=False)
    except NotFound:
        logger.info("`%s` was not found", container.name)
        RedisJobContainers.remove_container(container.id)
        return None
    except requests.ReadTimeout:
        return None

    previous_cpu = stats['precpu_stats']
    current_cpu = stats['cpu_stats']

    # Container CPU time consumed between the two samples.
    previous_total = float(previous_cpu['cpu_usage']['total_usage'])
    total_usage = float(current_cpu['cpu_usage']['total_usage'])
    container_delta = total_usage - previous_total

    # System CPU time elapsed between the two samples.
    previous_system = float(previous_cpu['system_cpu_usage'])
    current_system = float(current_cpu['system_cpu_usage'])
    system_delta = current_system - previous_system

    percpu_usage = current_cpu['cpu_usage']['percpu_usage']
    num_cpu_cores = len(percpu_usage)
    if num_cpu_cores >= node.cpu * 1.5:
        # The stats report noticeably more cores than the node declares;
        # use the node's value instead.
        logger.warning('Docker reporting num cpus `%s` and kubernetes reporting `%s`',
                       num_cpu_cores, node.cpu)
        num_cpu_cores = node.cpu

    # Defaults stay in place unless both deltas are positive.
    cpu_percentage = 0.
    percpu_percentage = [0.] * num_cpu_cores
    if container_delta > 0 and system_delta > 0:
        cpu_percentage = (container_delta / system_delta) * num_cpu_cores * 100.0
        percpu_percentage = [
            core_usage / total_usage * cpu_percentage for core_usage in percpu_usage
        ]

    memory_stats = stats['memory_stats']
    memory_used = int(memory_stats['usage'])
    memory_limit = int(memory_stats['limit'])

    if gpu_resources:
        container_gpu_resources = [
            gpu_resources[gpu_index]
            for gpu_index in get_container_gpu_indices(container)
        ]
    else:
        container_gpu_resources = None

    resources = {
        'job_uuid': job_uuid,
        'job_name': job_uuid,  # it will be updated during the streaming
        'experiment_uuid': experiment_uuid,
        'container_id': container.id,
        'cpu_percentage': cpu_percentage,
        'n_cpus': num_cpu_cores,
        'percpu_percentage': percpu_percentage,
        'memory_used': memory_used,
        'memory_limit': memory_limit,
        'gpu_resources': container_gpu_resources
    }
    return ContainerResourcesConfig.from_dict(resources)
def test_update_job_containers_with_no_container_statuses(self):
    """Replaying ``status_experiment_job_event`` leaves no container monitored."""
    event = status_experiment_job_event['object']
    container_name = settings.CONTAINER_NAME_EXPERIMENT_JOB

    update_job_containers(event=event,
                          status=JobLifeCycle.BUILDING,
                          job_container_name=container_name)

    monitored = RedisJobContainers.get_containers()
    assert len(monitored) == 0  # pylint:disable=len-as-condition