Esempio n. 1
0
 def gpu_resources(cls, jobs_resources):
     """Print a table of per-GPU usage for the given jobs.

     Args:
         jobs_resources: a resources dict (or list of dicts), each loadable
             by `ContainerResourcesConfig.from_dict`.

     Exits the process with status 1 when none of the jobs carries GPU
     resources.
     """
     jobs_resources = to_list(jobs_resources)
     click.clear()
     data = [[
         'job_name', 'name', 'GPU Usage', 'GPU Mem Usage / Total',
         'GPU Temperature', 'Power Draw / Limit'
     ]]
     non_gpu_jobs = 0
     for job_resources in jobs_resources:
         job_resources = ContainerResourcesConfig.from_dict(job_resources)
         if not job_resources.gpu_resources:
             non_gpu_jobs += 1
             continue
         # One table row per GPU: the previous code extended a single
         # `line` list across all GPUs of a job, which misaligned the six
         # column headers for jobs with more than one GPU.
         for gpu_resources in job_resources.gpu_resources:
             data.append([
                 job_resources.job_name,
                 gpu_resources.name,
                 to_percentage(gpu_resources.utilization_gpu / 100),
                 '{} / {}'.format(
                     to_unit_memory(gpu_resources.memory_used),
                     to_unit_memory(gpu_resources.memory_total)),
                 gpu_resources.temperature_gpu,
                 '{} / {}'.format(gpu_resources.power_draw,
                                  gpu_resources.power_limit),
             ])
     if non_gpu_jobs == len(jobs_resources):
         Printer.print_error(
             'No GPU job was found, please run `resources` command without `-g | --gpu` option.'
         )
         # sys.exit instead of the site builtin `exit`, which may be absent.
         sys.exit(1)
     click.echo(tabulate(data, headers="firstrow"))
     sys.stdout.flush()
    def test_container_resources(self):
        """A full resources payload survives a from_dict/to_dict round trip."""
        gpu_payload = {
            'index': 0,
            'bus_id': '0000:00:1E.1',
            'memory_free': 1000,
            'memory_total': 12883853312,
            'memory_used': 8388608000,
            'memory_utilization': 0,
            'minor': 1,
            'name': 'GeForce GTX TITAN 0',
            'power_draw': 125,
            'power_limit': 250,
            'processes': [
                {
                    'command': 'python',
                    'gpu_memory_usage': 4000,
                    'pid': 48448,
                    'username': '******'
                },
                {
                    'command': 'python',
                    'gpu_memory_usage': 4000,
                    'pid': 153223,
                    'username': '******'
                },
            ],
            'serial': '0322917092147',
            'temperature_gpu': 80,
            'utilization_gpu': 76,
            'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107'
        }

        payload = {
            'job_uuid': uuid.uuid4().hex,
            'experiment_uuid': uuid.uuid4().hex,
            'container_id':
            '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
            'cpu_percentage': 0.6947691836734693,
            'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
            'memory_used': 84467712,
            'memory_limit': 2096160768,
            'gpu_resources': gpu_payload
        }

        # Serializing the parsed config must reproduce the input exactly.
        parsed = ContainerResourcesConfig.from_dict(payload)
        assert parsed.to_dict() == payload
Esempio n. 3
0
 def resources(cls, jobs_resources):
     """Print a memory/CPU usage table for the given jobs.

     Args:
         jobs_resources: a resources dict (or list of dicts), each loadable
             by `ContainerResourcesConfig.from_dict`.
     """
     jobs_resources = to_list(jobs_resources)
     click.clear()
     rows = [['Job', 'Mem Usage / Total', 'CPU% - CPUs']]
     for resources_dict in jobs_resources:
         job = ContainerResourcesConfig.from_dict(resources_dict)
         memory_cell = '{} / {}'.format(to_unit_memory(job.memory_used),
                                        to_unit_memory(job.memory_limit))
         cpu_cell = '{} - {}'.format(to_percentage(job.cpu_percentage / 100),
                                     job.n_cpus)
         rows.append([job.job_name, memory_cell, cpu_cell])
     click.echo(tabulate(rows, headers="firstrow"))
     sys.stdout.flush()
Esempio n. 4
0
def get_container_resources(node, container, gpu_resources):
    """Build a ContainerResourcesConfig snapshot for a running container.

    Args:
        node: cluster node object; `node.cpu` is the CPU count kubernetes
            reports for the node.
        container: docker container object supporting `.stats()`.
        gpu_resources: per-GPU resource entries for the node, indexable by
            GPU index (may be falsy when the node has no GPUs).

    Returns:
        A ContainerResourcesConfig, or None when the container is not
        running, is not tracked in Redis, or its stats are unavailable.
    """
    # Check if the container is running.
    # NOTE: logging uses lazy %-style arguments (consistent with the other
    # copy of this function) instead of eager str.format.
    if container.status != ContainerStatuses.RUNNING:
        logger.info("`%s` container is not running", container.name)
        RedisJobContainers.remove_container(container.id)
        return

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container.id)

    if not job_uuid:
        logger.info("`%s` container is not recognised", container.name)
        return

    logger.info(
        "Streaming resources for container %s in (job, experiment) (`%s`, `%s`) ",
        container.id, job_uuid, experiment_uuid)

    try:
        stats = container.stats(decode=True, stream=False)
    except NotFound:
        logger.info("`%s` was not found", container.name)
        RedisJobContainers.remove_container(container.id)
        return
    except requests.ReadTimeout:
        return

    precpu_stats = stats['precpu_stats']
    cpu_stats = stats['cpu_stats']

    pre_total_usage = float(precpu_stats['cpu_usage']['total_usage'])
    total_usage = float(cpu_stats['cpu_usage']['total_usage'])
    delta_total_usage = total_usage - pre_total_usage

    pre_system_cpu_usage = float(precpu_stats['system_cpu_usage'])
    system_cpu_usage = float(cpu_stats['system_cpu_usage'])
    delta_system_cpu_usage = system_cpu_usage - pre_system_cpu_usage

    percpu_usage = cpu_stats['cpu_usage']['percpu_usage']
    num_cpu_cores = len(percpu_usage)
    # Docker sometimes over-reports cores; clamp to the kubernetes value.
    if num_cpu_cores >= node.cpu * 1.5:
        logger.warning('Docker reporting num cpus `%s` and kubernetes reporting `%s`',
                       num_cpu_cores, node.cpu)
        num_cpu_cores = node.cpu
    cpu_percentage = 0.
    percpu_percentage = [0.] * num_cpu_cores
    # Only compute a percentage when both deltas are positive, otherwise
    # report zeros (e.g. first sample or counter reset).
    if delta_total_usage > 0 and delta_system_cpu_usage > 0:
        cpu_percentage = (delta_total_usage / delta_system_cpu_usage) * num_cpu_cores * 100.0
        percpu_percentage = [cpu_usage / total_usage * cpu_percentage for cpu_usage in percpu_usage]

    memory_used = int(stats['memory_stats']['usage'])
    memory_limit = int(stats['memory_stats']['limit'])

    container_gpu_resources = None
    if gpu_resources:
        gpu_indices = get_container_gpu_indices(container)
        container_gpu_resources = [gpu_resources[gpu_index] for gpu_index in gpu_indices]

    return ContainerResourcesConfig.from_dict({
        'job_uuid': job_uuid,
        'job_name': job_uuid,  # it will be updated during the streaming
        'experiment_uuid': experiment_uuid,
        'container_id': container.id,
        'cpu_percentage': cpu_percentage,
        'n_cpus': num_cpu_cores,
        'percpu_percentage': percpu_percentage,
        'memory_used': memory_used,
        'memory_limit': memory_limit,
        'gpu_resources': container_gpu_resources
    })
Esempio n. 5
0
def get_container_resources(node, container, gpu_resources):
    """Collect one resources snapshot for a container.

    Returns a ContainerResourcesConfig, or None when the container is not
    running, is not tracked in Redis, or its stats could not be fetched.
    """
    # Bail out early for containers that are no longer running.
    if container.status != ContainerStatuses.RUNNING:
        logger.info("`%s` container is not running", container.name)
        RedisJobContainers.remove_container(container.id)
        return

    job_uuid, experiment_uuid = RedisJobContainers.get_job(container.id)
    if not job_uuid:
        logger.info("`%s` container is not recognised", container.name)
        return

    logger.info(
        "Streaming resources for container %s in (job, experiment) (`%s`, `%s`) ",
        container.id, job_uuid, experiment_uuid)

    try:
        stats = container.stats(decode=True, stream=False)
    except NotFound:
        logger.info("`%s` was not found", container.name)
        RedisJobContainers.remove_container(container.id)
        return
    except requests.ReadTimeout:
        return

    previous = stats['precpu_stats']
    current = stats['cpu_stats']

    total_usage = float(current['cpu_usage']['total_usage'])
    delta_total_usage = total_usage - float(previous['cpu_usage']['total_usage'])
    delta_system_usage = (float(current['system_cpu_usage']) -
                          float(previous['system_cpu_usage']))

    percpu_usage = current['cpu_usage']['percpu_usage']
    num_cpu_cores = len(percpu_usage)
    # Docker can over-report cores; fall back to the kubernetes count.
    if num_cpu_cores >= node.cpu * 1.5:
        logger.warning('Docker reporting num cpus `%s` and kubernetes reporting `%s`',
                       num_cpu_cores, node.cpu)
        num_cpu_cores = node.cpu

    cpu_percentage = 0.
    percpu_percentage = [0.] * num_cpu_cores
    if delta_total_usage > 0 and delta_system_usage > 0:
        cpu_percentage = (delta_total_usage / delta_system_usage) * num_cpu_cores * 100.0
        percpu_percentage = [usage / total_usage * cpu_percentage
                             for usage in percpu_usage]

    container_gpu_resources = None
    if gpu_resources:
        indices = get_container_gpu_indices(container)
        container_gpu_resources = [gpu_resources[index] for index in indices]

    return ContainerResourcesConfig.from_dict({
        'job_uuid': job_uuid,
        'job_name': job_uuid,  # updated later during the streaming
        'experiment_uuid': experiment_uuid,
        'container_id': container.id,
        'cpu_percentage': cpu_percentage,
        'n_cpus': num_cpu_cores,
        'percpu_percentage': percpu_percentage,
        'memory_used': int(stats['memory_stats']['usage']),
        'memory_limit': int(stats['memory_stats']['limit']),
        'gpu_resources': container_gpu_resources
    })