def new_experiment_job_status(sender, **kwargs):
    """Sync a job with a newly saved status record and request an experiment check.

    Reads the status record from ``kwargs['instance']`` (it must expose
    ``.job``) and the optional ``kwargs['created']`` flag, which defaults
    to ``False``. ``sender`` is accepted but never used.
    """
    status_record = kwargs['instance']
    is_new = kwargs.get('created', False)
    job = status_record.job

    # Only a freshly created record becomes the job's latest status.
    if is_new:
        job.status = status_record
        job.save()

    # A finished job no longer needs its containers tracked by the monitors.
    if job.is_done:
        # Function-level import kept as in the original; presumably it avoids
        # a circular import -- confirm before hoisting it to module level.
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(job.uuid.hex)

    # Re-saving an existing record never triggers the experiment check.
    if not is_new:
        return

    experiment = status_record.job.experiment
    if not experiment.is_done:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
            kwargs={'experiment_id': experiment.id},
            countdown=1,
        )
Beispiel #2
0
def new_experiment_job_status(sender, **kwargs):
    """Record a new job status on its job and schedule an experiment status check.

    ``kwargs['instance']`` is the saved status record (must expose ``.job``);
    ``kwargs['created']`` is optional and treated as ``False`` when absent.
    ``sender`` is not used.
    """
    status_record = kwargs['instance']
    was_created = kwargs.get('created', False)
    job = status_record.job

    # Point the job at its latest status, but only for brand-new records.
    if was_created:
        job.job_status = status_record
        job.save()

    # Done jobs are dropped from container monitoring.
    if job.is_done:
        # Kept as a local import like the original; presumably to dodge a
        # circular import -- confirm.
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(job.uuid.hex)

    # Nothing more to do when an existing record was merely updated.
    if not was_created:
        return

    experiment = status_record.job.experiment
    if not experiment.is_done:
        check_experiment_status.delay(experiment_uuid=experiment.uuid.hex)
Beispiel #3
0
def run(containers, node, persist):
    """Collect resources for every monitored container and publish them.

    For each container id returned by ``RedisJobContainers.get_containers()``
    a resources payload is built, sent to the events task together with
    ``persist``, and additionally stored as the latest job resources when the
    job or its experiment is flagged as monitored in ``RedisToStream``.
    """
    monitored_ids = RedisJobContainers.get_containers()
    detected_gpus = get_gpu_resources()
    # Re-key GPU entries by their 'index'; a falsy result is passed on untouched.
    gpus_by_index = (
        {gpu['index']: gpu for gpu in detected_gpus}
        if detected_gpus else detected_gpus
    )
    update_cluster_node(gpus_by_index)

    for container_id in monitored_ids:
        # Skip ids for which no container could be obtained.
        if not get_container(containers, container_id):
            continue

        resources = get_container_resources(
            node, containers[container_id], gpus_by_index)
        if not resources:
            continue

        payload = resources.to_dict()
        logger.debug("Publishing resources event")
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
            kwargs={'payload': payload, 'persist': persist},
        )

        # Keep the latest payload around only if someone is watching this
        # job or the experiment it belongs to.
        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        if (RedisToStream.is_monitored_job_resources(job_uuid)
                or RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
            RedisToStream.set_latest_job_resources(job_uuid, payload)
Beispiel #4
0
def run(containers, node, persist):
    """Gather and publish resource usage for all monitored containers.

    Container ids come from ``RedisJobContainers.get_containers()``. Each
    payload is handed to ``handle_events_resources`` with ``persist`` and,
    when the job or its experiment is monitored according to
    ``RedisToStream``, saved as the job's latest resources.
    """
    monitored_ids = RedisJobContainers.get_containers()
    detected_gpus = get_gpu_resources()
    # Index GPU entries by their 'index' field; falsy results stay as they are.
    gpus_by_index = (
        {gpu['index']: gpu for gpu in detected_gpus}
        if detected_gpus else detected_gpus
    )
    # Refresh cluster and current node information.
    update_cluster(gpus_by_index)

    for container_id in monitored_ids:
        # Ignore ids for which no container could be obtained.
        if not get_container(containers, container_id):
            continue

        resources = get_container_resources(
            node, containers[container_id], gpus_by_index)
        if not resources:
            continue

        payload = resources.to_dict()
        logger.info("Publishing resources event")
        handle_events_resources.delay(payload=payload, persist=persist)

        # Store the payload for streaming only when the job or its
        # experiment is being watched.
        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        should_stream = (
            RedisToStream.is_monitored_job_resources(job_uuid)
            or RedisToStream.is_monitored_experiment_resources(experiment_uuid)
        )
        if should_stream:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
Beispiel #5
0
def experiment_job_status_post_save(sender, **kwargs):
    """Apply a saved job status to its job and request an experiment check.

    ``kwargs['instance']`` is the status record; its ``.job`` is updated and
    saved on every call. ``sender`` is not used.
    """
    status_record = kwargs['instance']
    job = status_record.job

    # Make this record the job's latest status and refresh its timestamps.
    job.status = status_record
    set_job_started_at(instance=job, status=status_record.status)
    set_job_finished_at(instance=job, status=status_record.status)
    job.save()

    # Finished jobs are removed from container monitoring.
    if job.is_done:
        # Local import retained from the original; presumably it avoids a
        # circular import -- confirm.
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(job.uuid.hex)

    # Only experiments that are still running need their status re-checked.
    experiment = status_record.job.experiment
    if not experiment.is_done:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
            kwargs={'experiment_id': experiment.id},
            countdown=1,
        )
Beispiel #6
0
def run(containers, node, persist):
    """Publish a resources event for each monitored container.

    Iterates the ids from ``RedisJobContainers.get_containers()``, sends each
    resources payload to the events task with ``persist`` and, if the job or
    its experiment is marked as monitored in ``RedisToStream``, records the
    payload as the job's latest resources.
    """
    monitored_ids = RedisJobContainers.get_containers()
    detected_gpus = get_gpu_resources()
    # Map GPU entries by their 'index'; a falsy result is forwarded unchanged.
    gpus_by_index = (
        {gpu['index']: gpu for gpu in detected_gpus}
        if detected_gpus else detected_gpus
    )
    # Refresh cluster and current node information.
    update_cluster(gpus_by_index)

    for container_id in monitored_ids:
        # Skip ids for which no container could be obtained.
        if not get_container(containers, container_id):
            continue

        resources = get_container_resources(
            node, containers[container_id], gpus_by_index)
        if not resources:
            continue

        payload = resources.to_dict()
        logger.info("Publishing resources event")
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
            kwargs={'payload': payload, 'persist': persist},
        )

        # Remember the payload for streaming only if the job or its
        # experiment is currently watched.
        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        should_stream = (
            RedisToStream.is_monitored_job_resources(job_uuid)
            or RedisToStream.is_monitored_experiment_resources(experiment_uuid)
        )
        if should_stream:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
Beispiel #7
0
def new_experiment_job_status(sender, **kwargs):
    """Update a job from a new status record and queue an experiment check.

    The status record is taken from ``kwargs['instance']`` (must expose
    ``.job``); ``kwargs['created']`` is optional and defaults to ``False``.
    ``sender`` is unused.
    """
    status_record = kwargs['instance']
    freshly_created = kwargs.get('created', False)
    job = status_record.job

    # The job's latest status only changes for newly created records.
    if freshly_created:
        job.status = status_record
        job.save()

    # Stop monitoring the containers of a job that has finished.
    if job.is_done:
        # Import stays local as in the original; presumably to avoid a
        # circular import -- confirm.
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(job.uuid.hex)

    # Updates of existing records do not schedule an experiment check.
    if not freshly_created:
        return

    experiment = status_record.job.experiment
    if not experiment.is_done:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
            kwargs={'experiment_id': experiment.id},
            countdown=1,
        )
Beispiel #8
0
    def test_update_job_containers(self):
        """Containers are monitored only once a job with the event's uuid exists."""
        event = status_raw_event_with_conditions['object']
        call_kwargs = dict(
            event=event,
            status=JobLifeCycle.BUILDING,
            job_container_name=settings.JOB_CONTAINER_NAME,
        )

        # No job carries the event's job_uuid yet, so nothing gets monitored.
        update_job_containers(**call_kwargs)
        assert len(RedisJobContainers.get_containers()) == 0

        # Create the matching job and replay the same event.
        event_job_uuid = event['metadata']['labels']['job_uuid']
        ExperimentJobFactory(uuid=event_job_uuid)
        job = ExperimentJob.objects.get(uuid=event_job_uuid)
        update_job_containers(**call_kwargs)

        # The container from the event is now tracked and mapped to the job.
        expected_container_id = (
            '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75')
        assert len(RedisJobContainers.get_containers()) == 1
        assert RedisJobContainers.get_containers() == [expected_container_id]
        monitored_job_uuid, monitored_experiment_uuid = RedisJobContainers.get_job(
            expected_container_id)
        assert job.uuid.hex == monitored_job_uuid
        assert job.experiment.uuid.hex == monitored_experiment_uuid
Beispiel #9
0
    def test_update_job_containers(self):
        """Containers are monitored only once a job with the event's uuid exists."""
        event = status_experiment_job_event_with_conditions['object']
        call_kwargs = dict(
            event=event,
            status=JobLifeCycle.BUILDING,
            job_container_name=settings.CONTAINER_NAME_EXPERIMENT_JOB,
        )

        # No job carries the event's job_uuid yet, so nothing gets monitored.
        update_job_containers(**call_kwargs)
        assert len(RedisJobContainers.get_containers()) == 0  # pylint:disable=len-as-condition

        # Create the matching job and replay the same event.
        event_job_uuid = event['metadata']['labels']['job_uuid']
        ExperimentJobFactory(uuid=event_job_uuid)
        job = ExperimentJob.objects.get(uuid=event_job_uuid)
        update_job_containers(**call_kwargs)

        # The container from the event is now tracked and mapped to the job.
        expected_container_id = (
            '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75')
        assert len(RedisJobContainers.get_containers()) == 1
        assert RedisJobContainers.get_containers() == [expected_container_id]
        monitored_job_uuid, monitored_experiment_uuid = RedisJobContainers.get_job(
            expected_container_id)
        assert job.uuid.hex == monitored_job_uuid
        assert job.experiment.uuid.hex == monitored_experiment_uuid
Beispiel #10
0
def update_job_containers(event, status, job_container_name):
    """Start or stop container monitoring according to a job status event.

    When ``status`` is a done status the job (``job_uuid`` label of the
    event) is removed from monitoring. Afterwards every container status in
    the event whose name equals ``job_container_name`` is inspected: a
    container with a non-``None`` running state is monitored, any other one
    is removed. Events whose ``container_statuses`` is ``None`` are ignored.
    """
    if JobLifeCycle.is_done(status):
        finished_job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', finished_job_uuid)
        RedisJobContainers.remove_job(finished_job_uuid)

    container_statuses = event['status']['container_statuses']
    if container_statuses is None:
        return

    docker_prefix = 'docker://'
    for container_status in container_statuses:
        if container_status['name'] != job_container_name:
            continue

        # Drop the runtime scheme so only the bare container id remains.
        container_id = container_status['container_id']
        if container_id and container_id.startswith(docker_prefix):
            container_id = container_id[len(docker_prefix):]
        if not container_id:
            continue

        job_uuid = event['metadata']['labels']['job_uuid']
        if container_status['state']['running'] is None:
            # Not running: make sure the container is no longer tracked.
            RedisJobContainers.remove_container(container_id=container_id)
            continue

        logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                    container_id, job_uuid)
        RedisJobContainers.monitor(container_id=container_id, job_uuid=job_uuid)
Beispiel #11
0
def update_job_containers(event, status, job_container_name):
    """Keep the set of monitored containers in step with a job status event.

    A done ``status`` removes the event's job (``job_uuid`` label) from
    monitoring. Then, for each container status named ``job_container_name``,
    the container is monitored if its running state is not ``None`` and
    removed otherwise. Nothing else happens when the event's
    ``container_statuses`` is ``None``.
    """
    if JobLifeCycle.is_done(status):
        done_job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', done_job_uuid)
        RedisJobContainers.remove_job(done_job_uuid)

    statuses = event['status']['container_statuses']
    if statuses is None:
        return

    scheme = 'docker://'
    for container_status in statuses:
        if container_status['name'] != job_container_name:
            continue

        # Reduce the reported id to the bare container id (no scheme prefix).
        raw_id = container_status['container_id']
        if not raw_id:
            continue
        container_id = raw_id[len(scheme):] if raw_id.startswith(scheme) else raw_id
        if not container_id:
            continue

        job_uuid = event['metadata']['labels']['job_uuid']
        is_running = container_status['state']['running'] is not None
        if is_running:
            logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                        container_id, job_uuid)
            RedisJobContainers.monitor(container_id=container_id, job_uuid=job_uuid)
        else:
            # Not running: stop tracking this container.
            RedisJobContainers.remove_container(container_id=container_id)
Beispiel #12
0
def get_container_resources(node, container, gpu_resources):
    """Build the resources config (CPU, memory, GPU) for a monitored container.

    Args:
        node: object exposing ``cpu``, used as an upper reference for the
            number of cores reported by Docker.
        container: container object exposing ``status``, ``name``, ``id``
            and ``stats()``.
        gpu_resources: mapping from GPU index to GPU info, or a falsy value
            when no GPU information is available.

    Returns:
        The result of ``ContainerResourcesConfig.from_dict`` for the
        container, or ``None`` when the container is not running, is not
        associated with a job, is not found, or the stats request times out.
        Containers that are not running or not found are also removed from
        monitoring.
    """
    # Check if the container is running
    if container.status != ContainerStatuses.RUNNING:
        logger.info("`%s` container is not running", container.name)
        RedisJobContainers.remove_container(container.id)
        return

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container.id)

    if not job_uuid:
        logger.info("`%s` container is not recognised", container.name)
        return

    # Logger arguments are passed lazily so the message is only formatted
    # when the record is actually emitted.
    logger.info(
        "Streaming resources for container %s in (job, experiment) (`%s`, `%s`) ",
        container.id, job_uuid, experiment_uuid)

    try:
        stats = container.stats(decode=True, stream=False)
    except NotFound:
        logger.info("`%s` was not found", container.name)
        RedisJobContainers.remove_container(container.id)
        return
    except requests.ReadTimeout:
        # Best effort: skip this sample and keep the container monitored.
        return

    precpu_stats = stats['precpu_stats']
    cpu_stats = stats['cpu_stats']

    # CPU time consumed by the container between the two samples.
    pre_total_usage = float(precpu_stats['cpu_usage']['total_usage'])
    total_usage = float(cpu_stats['cpu_usage']['total_usage'])
    delta_total_usage = total_usage - pre_total_usage

    # CPU time consumed by the whole system between the two samples.
    pre_system_cpu_usage = float(precpu_stats['system_cpu_usage'])
    system_cpu_usage = float(cpu_stats['system_cpu_usage'])
    delta_system_cpu_usage = system_cpu_usage - pre_system_cpu_usage

    percpu_usage = cpu_stats['cpu_usage']['percpu_usage']
    num_cpu_cores = len(percpu_usage)
    # Fall back to the node's CPU count when Docker reports at least 1.5x
    # as many cores as the node claims to have.
    if num_cpu_cores >= node.cpu * 1.5:
        logger.warning('Docker reporting num cpus `%s` and kubernetes reporting `%s`',
                       num_cpu_cores, node.cpu)
        num_cpu_cores = node.cpu
    cpu_percentage = 0.
    percpu_percentage = [0.] * num_cpu_cores
    if delta_total_usage > 0 and delta_system_cpu_usage > 0:
        cpu_percentage = (delta_total_usage / delta_system_cpu_usage) * num_cpu_cores * 100.0
        # NOTE(review): this list has one entry per Docker-reported core, so
        # its length can differ from `n_cpus` after the clamp above -- confirm
        # that consumers tolerate the mismatch.
        percpu_percentage = [cpu_usage / total_usage * cpu_percentage for cpu_usage in percpu_usage]

    memory_used = int(stats['memory_stats']['usage'])
    memory_limit = int(stats['memory_stats']['limit'])

    container_gpu_resources = None
    if gpu_resources:
        gpu_indices = get_container_gpu_indices(container)
        container_gpu_resources = [gpu_resources[gpu_indice] for gpu_indice in gpu_indices]

    return ContainerResourcesConfig.from_dict({
        'job_uuid': job_uuid,
        'job_name': job_uuid,  # it will be updated during the streaming
        'experiment_uuid': experiment_uuid,
        'container_id': container.id,
        'cpu_percentage': cpu_percentage,
        'n_cpus': num_cpu_cores,
        'percpu_percentage': percpu_percentage,
        'memory_used': memory_used,
        'memory_limit': memory_limit,
        'gpu_resources': container_gpu_resources
    })
Beispiel #13
0
 def test_update_job_containers_with_no_container_statuses(self):
     """No container is monitored after processing the plain job event."""
     event = status_experiment_job_event['object']
     update_job_containers(
         event=event,
         status=JobLifeCycle.BUILDING,
         job_container_name=settings.CONTAINER_NAME_JOB,
     )
     monitored = RedisJobContainers.get_containers()
     assert len(monitored) == 0
Beispiel #14
0
def get_container_resources(node, container, gpu_resources):
    """Assemble the resource usage config of a monitored container.

    Args:
        node: object exposing ``cpu``; used to cap the core count reported
            by Docker.
        container: container object exposing ``status``, ``name``, ``id``
            and ``stats()``.
        gpu_resources: mapping from GPU index to GPU info, or a falsy value.

    Returns:
        ``ContainerResourcesConfig.from_dict(...)`` for the container, or
        ``None`` if the container is not running, has no known job, cannot
        be found, or its stats request times out. Not-running and not-found
        containers are removed from monitoring as a side effect.
    """
    if container.status != ContainerStatuses.RUNNING:
        logger.info("`%s` container is not running", container.name)
        RedisJobContainers.remove_container(container.id)
        return None

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container.id)
    if not job_uuid:
        logger.info("`%s` container is not recognised", container.name)
        return None

    logger.info(
        "Streaming resources for container %s in (job, experiment) (`%s`, `%s`) ",
        container.id, job_uuid, experiment_uuid)

    try:
        stats = container.stats(decode=True, stream=False)
    except NotFound:
        logger.info("`%s` was not found", container.name)
        RedisJobContainers.remove_container(container.id)
        return None
    except requests.ReadTimeout:
        # Skip this sample; the container stays monitored.
        return None

    previous_cpu = stats['precpu_stats']
    current_cpu = stats['cpu_stats']

    # Container CPU time, previous vs. current sample.
    usage_before = float(previous_cpu['cpu_usage']['total_usage'])
    usage_now = float(current_cpu['cpu_usage']['total_usage'])
    usage_delta = usage_now - usage_before

    # System-wide CPU time, previous vs. current sample.
    system_before = float(previous_cpu['system_cpu_usage'])
    system_now = float(current_cpu['system_cpu_usage'])
    system_delta = system_now - system_before

    per_core_usage = current_cpu['cpu_usage']['percpu_usage']
    core_count = len(per_core_usage)
    # Trust the node's CPU count when Docker reports at least 1.5x as many.
    if core_count >= node.cpu * 1.5:
        logger.warning('Docker reporting num cpus `%s` and kubernetes reporting `%s`',
                       core_count, node.cpu)
        core_count = node.cpu

    # Defaults cover the case where no CPU time elapsed between samples.
    cpu_percentage = 0.
    percpu_percentage = [0.] * core_count
    if usage_delta > 0 and system_delta > 0:
        cpu_percentage = (usage_delta / system_delta) * core_count * 100.0
        percpu_percentage = [
            core_usage / usage_now * cpu_percentage
            for core_usage in per_core_usage
        ]

    memory_stats = stats['memory_stats']
    memory_used = int(memory_stats['usage'])
    memory_limit = int(memory_stats['limit'])

    # GPU details are attached only when GPU information was supplied.
    container_gpu_resources = (
        [gpu_resources[index] for index in get_container_gpu_indices(container)]
        if gpu_resources else None
    )

    resources = {
        'job_uuid': job_uuid,
        'job_name': job_uuid,  # replaced with the real name during streaming
        'experiment_uuid': experiment_uuid,
        'container_id': container.id,
        'cpu_percentage': cpu_percentage,
        'n_cpus': core_count,
        'percpu_percentage': percpu_percentage,
        'memory_used': memory_used,
        'memory_limit': memory_limit,
        'gpu_resources': container_gpu_resources,
    }
    return ContainerResourcesConfig.from_dict(resources)
Beispiel #15
0
 def test_update_job_containers_with_no_container_statuses(self):
     """No container is monitored after processing the plain job event."""
     event = status_experiment_job_event['object']
     update_job_containers(
         event=event,
         status=JobLifeCycle.BUILDING,
         job_container_name=settings.CONTAINER_NAME_EXPERIMENT_JOB,
     )
     monitored = RedisJobContainers.get_containers()
     assert len(monitored) == 0  # pylint:disable=len-as-condition