Esempio n. 1
0
    def test_set_latest_job_resources(self):
        """Round-trip a job resources payload through ``RedisToStream``.

        The payload is stored under its job uuid, then read back for the job
        name ``master.0``; the value read back is expected to equal the
        stored payload plus a ``job_name`` entry.
        """
        # Two processes that differ only by pid.
        gpu_processes = [
            {
                'command': 'python',
                'gpu_memory_usage': 4000,
                'pid': pid,
                'username': '******',
            }
            for pid in (48448, 153223)
        ]
        gpu_stats = {
            'index': 0,
            'bus_id': '0000:00:1E.1',
            'memory_free': 1000,
            'memory_total': 12883853312,
            'memory_used': 8388608000,
            'memory_utilization': 0,
            'minor': 1,
            'name': 'GeForce GTX TITAN 0',
            'power_draw': 125,
            'power_limit': 250,
            'processes': gpu_processes,
            'serial': '0322917092147',
            'temperature_gpu': 80,
            'utilization_gpu': 76,
            'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107',
        }

        payload = {
            'job_uuid': uuid.uuid4().hex,
            'experiment_uuid': uuid.uuid4().hex,
            'container_id': ('3175e88873af9077688cee20eaadc0c0'
                             '7746efb84d01ae696d6d17ed9bcdfbc4'),
            'cpu_percentage': 0.6947691836734693,
            'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
            'memory_used': 84467712,
            'memory_limit': 2096160768,
            'gpu_resources': gpu_stats,
        }
        job_uuid = payload['job_uuid']

        RedisToStream.set_latest_job_resources(job_uuid, payload)

        # The value read back is expected to carry the job name as well.
        payload['job_name'] = 'master.0'
        # NOTE(review): the meaning of the third positional argument (True)
        # is not visible here -- confirm against RedisToStream.
        fetched = RedisToStream.get_latest_job_resources(
            job_uuid, 'master.0', True)
        assert payload == fetched
Esempio n. 2
0
async def experiment_job_resources(request, ws, username, project_name,
                                   experiment_id, job_id):
    """Stream the latest resources of an experiment job over a websocket.

    The job is resolved with ``validate_experiment_job``; when it cannot be
    resolved an error message is sent on ``ws`` and the handler returns.
    Otherwise an audit event is recorded, resource monitoring is enabled for
    the job if it is not already, and the socket is registered in the job's
    ``SocketManager`` (one per job uuid, stored on
    ``request.app.job_resources_ws_managers``).

    The handler then loops: it reads the latest resources from
    ``RedisToStream``, sends them on ``ws`` when non-empty and sleeps for
    ``SOCKET_SLEEP``. Once more than ``RESOURCES_CHECK`` iterations have
    accumulated, the job is refreshed from the database; the loop ends when
    the job is done, when sending raises ``ConnectionClosed`` or when the
    socket reports a lost connection.

    Cleanup (unregistering the socket and, for the last socket, stopping the
    monitor and dropping the manager) runs on every exit from the loop,
    including task cancellation and unexpected exceptions.

    Args:
        request: Request object; ``request.app.user`` and
            ``request.app.job_resources_ws_managers`` are read.
        ws: Websocket the resources are sent on.
        username: Forwarded to ``validate_experiment_job``.
        project_name: Forwarded to ``validate_experiment_job``.
        experiment_id: Forwarded to ``validate_experiment_job``.
        job_id: Forwarded to ``validate_experiment_job``.

    Returns:
        None.
    """
    job, _, message = validate_experiment_job(request=request,
                                              username=username,
                                              project_name=project_name,
                                              experiment_id=experiment_id,
                                              job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.id)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        # The message talks about the uuid, so log the uuid (the job name
        # used to be passed here, which made the log line misleading).
        logger.info('Job resources with uuid `%s` is now being monitored',
                    job_uuid)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # Reuse the manager shared by all sockets watching this job, if any.
    if job_uuid in request.app.job_resources_ws_managers:
        ws_manager = request.app.job_resources_ws_managers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_managers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        """Unregister ``ws``; stop monitoring when no socket is left."""
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_managers.pop(job_uuid, None)

        logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    try:
        while True:
            resources = RedisToStream.get_latest_job_resources(
                job=job_uuid, job_name=job_name)
            should_check += 1

            # After trying a couple of times, we must check the job status.
            if should_check > RESOURCES_CHECK:
                job.refresh_from_db()
                if job.is_done:
                    logger.info(
                        'removing all socket because the job `%s` is done',
                        job_name)
                    # Drop every socket so the cleanup below also stops the
                    # monitor and removes the manager.
                    ws_manager.ws = set()
                    return
                should_check -= CHECK_DELAY

            if resources:
                try:
                    await ws.send(resources)
                except ConnectionClosed:
                    return

            # Just to check if connection closed
            if ws._connection_lost:  # pylint:disable=protected-access
                return
            await asyncio.sleep(SOCKET_SLEEP)
    finally:
        # Runs exactly once on every exit path (job done, closed connection,
        # task cancellation, unexpected error) so the socket and the monitor
        # are never leaked.
        handle_job_disconnected_ws(ws)