コード例 #1
0
ファイル: resources.py プロジェクト: rohansaphal97/polyaxon
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {
            gpu_resource['index']: gpu_resource
            for gpu_resource in gpu_resources
        }
    # update cluster and current node
    update_cluster(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id],
                                          gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.info("Publishing resources event")
            handle_events_resources.delay(payload=payload, persist=persist)

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(
                job_uuid)
            if (RedisToStream.is_monitored_job_resources(job_uuid)
                    or RedisToStream.is_monitored_experiment_resources(
                        experiment_uuid)):
                RedisToStream.set_latest_job_resources(job_uuid, payload)
コード例 #2
0
ファイル: monitor.py プロジェクト: whmnoe4j/polyaxon
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {
            gpu_resource['index']: gpu_resource
            for gpu_resource in gpu_resources
        }
    update_cluster_node(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id],
                                          gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.debug("Publishing resources event")
            celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                                 kwargs={
                                     'payload': payload,
                                     'persist': persist
                                 })

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(
                job_uuid)
            set_last_resources_cond = (
                RedisToStream.is_monitored_job_resources(job_uuid)
                or RedisToStream.is_monitored_experiment_resources(
                    experiment_uuid))
            if set_last_resources_cond:
                RedisToStream.set_latest_job_resources(job_uuid, payload)
コード例 #3
0
ファイル: monitor.py プロジェクト: ttsvetanov/polyaxon
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {gpu_resource['index']: gpu_resource for gpu_resource in gpu_resources}
    # update cluster and current node
    update_cluster(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id], gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.info("Publishing resources event")
            celery_app.send_task(
                EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                kwargs={'payload': payload, 'persist': persist})

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
            if (RedisToStream.is_monitored_job_resources(job_uuid) or
                    RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
                RedisToStream.set_latest_job_resources(job_uuid, payload)
コード例 #4
0
ファイル: test_utils.py プロジェクト: vhcg77/polyaxon
    def test_update_job_containers(self):
        update_job_containers(event=status_raw_event_with_conditions['object'],
                              status=JobLifeCycle.BUILDING,
                              job_container_name=settings.JOB_CONTAINER_NAME)
        # Assert it's still 0 because no job was created with that job_uuid
        assert len(RedisJobContainers.get_containers()) == 0

        # Create a job with a specific uuid
        labels = status_raw_event_with_conditions['object']['metadata']['labels']
        ExperimentJobFactory(uuid=labels['job_uuid'])
        job = ExperimentJob.objects.get(uuid=labels['job_uuid'])
        update_job_containers(event=status_raw_event_with_conditions['object'],
                              status=JobLifeCycle.BUILDING,
                              job_container_name=settings.JOB_CONTAINER_NAME)
        # Assert now it has started monitoring the container
        assert len(RedisJobContainers.get_containers()) == 1
        container_id = '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75'
        assert RedisJobContainers.get_containers() == [container_id]
        job_uuid, experiment_uuid = RedisJobContainers.get_job(container_id)
        assert job.uuid.hex == job_uuid
        assert job.experiment.uuid.hex == experiment_uuid
コード例 #5
0
ファイル: test_utils.py プロジェクト: ttsvetanov/polyaxon
    def test_update_job_containers(self):
        update_job_containers(event=status_experiment_job_event_with_conditions['object'],
                              status=JobLifeCycle.BUILDING,
                              job_container_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)
        # Assert it's still 0 because no job was created with that job_uuid
        assert len(RedisJobContainers.get_containers()) == 0  # pylint:disable=len-as-condition

        # Create a job with a specific uuid
        labels = status_experiment_job_event_with_conditions['object']['metadata']['labels']
        ExperimentJobFactory(uuid=labels['job_uuid'])
        job = ExperimentJob.objects.get(uuid=labels['job_uuid'])
        update_job_containers(event=status_experiment_job_event_with_conditions['object'],
                              status=JobLifeCycle.BUILDING,
                              job_container_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)
        # Assert now it has started monitoring the container
        assert len(RedisJobContainers.get_containers()) == 1
        container_id = '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75'
        assert RedisJobContainers.get_containers() == [container_id]
        job_uuid, experiment_uuid = RedisJobContainers.get_job(container_id)
        assert job.uuid.hex == job_uuid
        assert job.experiment.uuid.hex == experiment_uuid
コード例 #6
0
 def test_update_job_containers_with_no_container_statuses(self):
     update_job_containers(event=status_experiment_job_event['object'],
                           status=JobLifeCycle.BUILDING,
                           job_container_name=settings.CONTAINER_NAME_JOB)
     assert len(RedisJobContainers.get_containers()) == 0
コード例 #7
0
ファイル: test_utils.py プロジェクト: ttsvetanov/polyaxon
 def test_update_job_containers_with_no_container_statuses(self):
     update_job_containers(event=status_experiment_job_event['object'],
                           status=JobLifeCycle.BUILDING,
                           job_container_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)
     assert len(RedisJobContainers.get_containers()) == 0  # pylint:disable=len-as-condition